In [None]:
import pandas as pd
import datetime
import os
from pathlib import Path 

In [None]:
# Build trips_2022: load every trip export in data/source, add the derived
# duration / calendar columns, drop very short trips, and stack the results.
source_dir = "data/source"

# os.listdir returns entries in arbitrary order, so sort for a reproducible row
# order. Hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and sub-directories
# are skipped because read_csv would fail on them.
files = sorted(
    name for name in os.listdir(source_dir)
    if not name.startswith(".") and os.path.isfile(os.path.join(source_dir, name))
)
if not files:
    # Fail loudly instead of leaving trips_2022 undefined (or stale from an earlier run).
    raise FileNotFoundError(f"no trip files found in {source_dir!r}")

# Collect the per-file frames and concatenate once at the end; concatenating
# inside the loop re-copies the accumulated data on every iteration.
frames = []
for file in files:
    path = os.path.join(source_dir, file)
    df = pd.read_csv(path)

    # convert times to datetime so they can be subtracted to get the trip length
    df['started_at'] = pd.to_datetime(df['started_at'])
    df['ended_at'] = pd.to_datetime(df['ended_at'])
    df['trip_length'] = df['ended_at'] - df['started_at']

    # convert trip length to minutes since tableau doesn't understand timedelta
    df['trip_length_min'] = df['trip_length'] / datetime.timedelta(minutes=1)

    # break out start/end day of week (pandas numbering: Monday=0 ... Sunday=6)
    df['start_dow'] = df['started_at'].dt.weekday
    df['end_dow'] = df['ended_at'].dt.weekday

    # break out month number of the trip start
    df['month_num'] = df['started_at'].dt.month

    # keep only trips at least 1 minute long
    df = df.loc[df['trip_length_min'] >= 1]

    frames.append(df)
    print(path)

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's original row labels.
trips_2022 = pd.concat(frames, ignore_index=True)
        

In [None]:
# Preview the first five rows of the combined trips table as a sanity check.
trips_2022.head(n=5)

In [None]:
# Write the combined trips to a zip-compressed CSV, leaving out the row index.
filepath = Path('data/trips_2022.csv.zip')
trips_2022.to_csv(
    filepath,
    compression="zip",
    index=False,
)

In [None]:
# Work on a copy of just the start-station name and coordinates.
start_station = trips_2022.loc[:, ['start_station_name', 'start_lat', 'start_lng']].copy()

# Number of trips that began at each station, one row per station name.
start_station_gb = (
    start_station
    .groupby(['start_station_name'], as_index=False)
    .size()
    .rename(columns={"size": "start_count"})
)

# Reduce to one row per station name; the first occurrence's coordinates are kept.
start_station = start_station.drop_duplicates(subset=['start_station_name'])
start_station.head()

In [None]:
# Preview the per-station start counts.
start_station_gb.head(n=5)

In [None]:
# Attach each start station's trip count to its name and coordinates.
# Only the name/coordinate columns are taken from start_station, so re-running
# this cell (when start_station already carries start_count from a previous run)
# yields the same frame instead of start_count_x / start_count_y columns.
start_station = pd.merge(
    start_station_gb,
    start_station[['start_station_name', 'start_lat', 'start_lng']],
    on="start_station_name",
)

In [None]:
# Work on a copy of just the end-station name and coordinates.
end_station = trips_2022.loc[:, ['end_station_name', 'end_lat', 'end_lng']].copy()

# Number of trips that finished at each station, one row per station name.
end_station_gb = (
    end_station
    .groupby(['end_station_name'], as_index=False)
    .size()
    .rename(columns={"size": "end_count"})
)

# Reduce to one row per station name; the first occurrence's coordinates are kept.
end_station = end_station.drop_duplicates(subset=['end_station_name'])
end_station.head()

In [None]:
# Attach each end station's trip count to its name and coordinates.
# Only the name/coordinate columns are taken from end_station, so re-running
# this cell (when end_station already carries end_count from a previous run)
# yields the same frame instead of end_count_x / end_count_y columns.
end_station = pd.merge(
    end_station_gb,
    end_station[['end_station_name', 'end_lat', 'end_lng']],
    on="end_station_name",
)

In [None]:
# Join start-side and end-side station stats on the station name. The outer join
# keeps stations that appear on only one side; their other side's columns are NaN.
station_counts = pd.merge(
    start_station,
    end_station,
    how='outer',
    left_on=["start_station_name"],
    right_on=["end_station_name"],
)

# Show the rows with any missing value (e.g. stations present on only one side
# of the join). This has to be the cell's last expression to be displayed; a bare
# expression in the middle of a cell is evaluated and then discarded.
station_counts[station_counts.isna().any(axis=1)]

In [None]:
# Collapse the outer-joined start_* / end_* columns into one row per station.
#
# A station that only ever appears as a trip start has NaN in every end_* column
# after the outer join, so its name and coordinates fall back to the start_*
# values. Taking the end_* columns alone and filling NaN with 0 would turn such a
# station into name 0 at lat/lng 0.
#
# Only the counts are filled with 0 (meaning "no trips on that side") and cast back
# to integers; coordinates missing on both sides stay empty rather than becoming
# (0, 0), which is not a real station location.
#
# NOTE: this overwrites station_counts, so re-run the outer-merge cell before
# re-running this one.
station_counts = pd.DataFrame({
    'station_name': station_counts['end_station_name'].fillna(station_counts['start_station_name']),
    'start_count': station_counts['start_count'].fillna(0).astype(int),
    'end_count': station_counts['end_count'].fillna(0).astype(int),
    'lat': station_counts['end_lat'].fillna(station_counts['start_lat']),
    'lng': station_counts['end_lng'].fillna(station_counts['start_lng']),
})
station_counts

In [None]:
# Write the per-station counts to CSV, leaving out the row index.
filepath = Path('data/stations_2022.csv')
station_counts.to_csv(
    filepath,
    index=False,
)

In [None]:
# Stations with a start_count of zero, i.e. no recorded trip starts.
no_start_mask = station_counts['start_count'].eq(0)
station_counts[no_start_mask]

In [None]:
# Trips whose rideable_type is recorded as 'docked_bike'.
docked_mask = trips_2022['rideable_type'].eq('docked_bike')
trips_2022[docked_mask]