In [1]:
import os
import pandas as pd
import numpy as np
from IPython.display import display

cwd = os.getcwd()
folder = os.path.join(cwd, "Citibike_Data")

In [2]:
for (dirpath, dirnames, filenames) in os.walk(folder):
    pass
filenames = sorted(filenames)
filenames[:5]

['201701-citibike-tripdata.csv.zip',
 '201702-citibike-tripdata.csv.zip',
 '201703-citibike-tripdata.csv.zip',
 '201704-citibike-tripdata.csv.zip',
 '201705-citibike-tripdata.csv.zip']

In [3]:
# function that is the workhouse
def Get_Data(filename=filenames[0], pd=pd):
    """
    Does all the work.  Read in DataFrames, parse them then return the dataframes
    """
    # read into dataframe
    df = pd.read_csv(os.path.join(folder, filename), compression="zip").iloc[:, 1:]

    # change to datetime both start time and stop time
    # columns change their names for some reason...
    df.iloc[:, 0] = pd.to_datetime(df.iloc[:, 0])
    df.iloc[:, 1] = pd.to_datetime(df.iloc[:, 1])

    # get the Year, Month and Day
    df["Start Year"] = df.iloc[:, 0].dt.year
    df["Start Month"] = df.iloc[:, 0].dt.month
    df["Start Day"] = df.iloc[:, 0].dt.day
    df["Start Hour"] = df.iloc[:, 0].dt.hour
    df["Duration_Seconds"] = (df.iloc[:, 1] - df.iloc[:, 0]).dt.total_seconds().astype('int64')

    # no longer need Start Time and Stop Time
    df = df.iloc[:,2:]
    
    # make sure column names are correct
    df.columns = ['Start Station ID', 'Start Station Name', 'Start Station Latitude',
                  'Start Station Longitude', 'End Station ID', 'End Station Name',
                  'End Station Latitude', 'End Station Longitude', 'Bike ID', 'User Type',
                  'Birth Year', 'Gender', 'Start Year', 'Start Month', 'Start Day', 'Start Hour',
                  'Duration_Seconds']
    
    # return the dataframe
    return df

In [4]:
# for each file name append the dataframes together
for x in filenames:
    
    # for first one just generate the dataframe
    if x == filenames[0]:
        df_total = Get_Data(filename=x)
    
    # for the following append them
    df_total = df_total.append(Get_Data(filename=x))

# reset the index
df_total.reset_index(drop=True, inplace=True)
    
# turn everything categorical    
df_total = df_total.astype({"Start Station ID":'category',
                 "Start Station Name":'category',
                 "End Station ID":'category',
                 "End Station Name":'category',
                 "Bike ID":'category',
                 "User Type":'category',
                 "Birth Year":'category',
                 "Gender":'category',
                 "Start Year":'category',
                 "Start Month":'category',
                 "Start Day":'category',
                 'Start Hour':'category'})
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55191369 entries, 0 to 55191368
Data columns (total 17 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   Start Station ID         category
 1   Start Station Name       category
 2   Start Station Latitude   float64 
 3   Start Station Longitude  float64 
 4   End Station ID           category
 5   End Station Name         category
 6   End Station Latitude     float64 
 7   End Station Longitude    float64 
 8   Bike ID                  category
 9   User Type                category
 10  Birth Year               category
 11  Gender                   category
 12  Start Year               category
 13  Start Month              category
 14  Start Day                category
 15  Start Hour               category
 16  Duration_Seconds         int64   
dtypes: category(12), float64(4), int64(1)
memory usage: 2.9 GB


In [5]:
df_total.sample(10)

Unnamed: 0,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds
40021913,524.0,W 43 St & 6 Ave,40.755273,-73.983169,490.0,8 Ave & W 33 St,40.751551,-73.993934,25687,Subscriber,1964.0,1,2019,5,7,16,327
27498349,3641.0,Broadway & W 25 St,40.742869,-73.989186,164.0,E 47 St & 2 Ave,40.753231,-73.970325,19403,Customer,1969.0,0,2018,8,15,9,1311
25837683,2012.0,E 27 St & 1 Ave,40.739445,-73.976806,167.0,E 39 St & 3 Ave,40.748901,-73.976049,16400,Subscriber,1978.0,2,2018,7,5,14,438
21692013,472.0,E 32 St & Park Ave,40.745712,-73.981948,498.0,Broadway & W 32 St,40.748549,-73.988084,20095,Subscriber,1974.0,1,2018,5,18,17,261
14104501,518.0,E 39 St & 2 Ave,40.747804,-73.973442,545.0,E 23 St & 1 Ave,40.736502,-73.978095,21024,Subscriber,1988.0,1,2017,10,19,8,464
33451132,477.0,W 41 St & 8 Ave,40.756405,-73.990026,422.0,W 59 St & 10 Ave,40.770513,-73.988038,34971,Subscriber,1998.0,2,2018,11,27,13,784
53048204,499.0,Broadway & W 60 St,40.769155,-73.981918,474.0,5 Ave & E 29 St,40.745168,-73.986831,32408,Subscriber,1991.0,2,2019,11,5,15,1377
53036167,2002.0,Wythe Ave & Metropolitan Ave,40.716887,-73.963198,3723.0,Cadman Plaza E & Johnson St,40.695317,-73.990157,39955,Subscriber,1986.0,1,2019,11,5,11,1437
14122914,482.0,W 15 St & 7 Ave,40.739355,-73.999318,3472.0,W 15 St & 10 Ave,40.742754,-74.007474,16482,Subscriber,1991.0,1,2017,10,19,12,223
34310240,497.0,E 17 St & Broadway,40.73705,-73.990093,174.0,E 25 St & 1 Ave,40.738177,-73.977387,34273,Subscriber,1995.0,1,2018,12,18,17,467


In [6]:
# generate a table of all unique Start and End Stations
df_unique_start = df_total.loc[:, ['Start Station ID', 'Start Station Name',
                                   'Start Station Latitude', 'Start Station Longitude']].copy()
df_unique_end = df_total.loc[:, ['End Station ID', 'End Station Name',
                                   'End Station Latitude', 'End Station Longitude']].copy()

# now slice out all the Station Stuff except for the ID's
df_total = df_total.loc[:, ['Start Station ID', 'End Station ID', 'Bike ID', 'User Type',
       'Birth Year', 'Gender', 'Start Year', 'Start Month', 'Start Day', 'Start Hour',
       'Duration_Seconds']]

df_unique_start.head()

Unnamed: 0,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude
0,3226.0,W 82 St & Central Park West,40.78275,-73.97137
1,3263.0,Cooper Square & E 7 St,40.729236,-73.990868
2,3143.0,5 Ave & E 78 St,40.776829,-73.963888
3,3143.0,5 Ave & E 78 St,40.776829,-73.963888
4,3143.0,5 Ave & E 78 St,40.776829,-73.963888


In [7]:
# drop all duplicates
df_unique_start = df_unique_start.drop_duplicates()
df_unique_end = df_unique_end.drop_duplicates()
df_unique_end.columns = df_unique_start.columns

df_unique_start.shape, df_unique_end.shape

((1222, 4), (1280, 4))

In [8]:
# append the 2 lists and drop al duplicates as well by ID
df_unique = df_unique_start.append(df_unique_end)
df_unique = df_unique.drop_duplicates(subset=["Start Station ID"]).dropna(subset=["Start Station ID"])
df_unique.shape

(1076, 4)

In [9]:
# write the unique values to an external lookup table
df_unique.sort_values(by=["Start Station ID"], inplace=True)
df_unique.to_csv(os.path.join(cwd, "Citibike_Clean", "Station_ID.csv"),
                 index=False)

In [10]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55191369 entries, 0 to 55191368
Data columns (total 11 columns):
 #   Column            Dtype   
---  ------            -----   
 0   Start Station ID  category
 1   End Station ID    category
 2   Bike ID           category
 3   User Type         category
 4   Birth Year        category
 5   Gender            category
 6   Start Year        category
 7   Start Month       category
 8   Start Day         category
 9   Start Hour        category
 10  Duration_Seconds  int64   
dtypes: category(10), int64(1)
memory usage: 1.1 GB


In [11]:
# write the csv
df_total.to_csv(os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC.csv.zip"),
                index=False, compression="zip")

In [12]:
# aggregate the data
df_total.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds
0,3226.0,3165.0,25542,Subscriber,1965.0,2,2017,1,1,0,680
1,3263.0,498.0,21136,Subscriber,1987.0,2,2017,1,1,0,1283
2,3143.0,3152.0,18147,Customer,,0,2017,1,1,0,649
3,3143.0,3152.0,21211,Customer,,0,2017,1,1,0,632
4,3143.0,3152.0,26819,Customer,,0,2017,1,1,0,622


In [13]:
# groupby on Start Date
df_groupby1 = df_total.groupby(by=["Start Year", "Start Month",
                                   "Start Day", "Start Hour"])["Duration_Seconds"].agg(
                                  ["sum", "count"]
                                  ).reset_index().dropna(subset=["sum"])

# write this to csv file
df_groupby1.to_csv(os.path.join(cwd, "Citibike_Clean", "GroupBy_Start_Date.csv"),
                 index=False)

df_groupby1.head()

Unnamed: 0,Start Year,Start Month,Start Day,Start Hour,sum,count
0,2017,1,1,0,654326.0,706
1,2017,1,1,1,519550.0,704
2,2017,1,1,2,543026.0,564
3,2017,1,1,3,290928.0,414
4,2017,1,1,4,567570.0,192


In [14]:
# groupby on User Type, Birth Year, Gender and Start Date
df_groupby2 = df_total.groupby(by=["User Type", "Birth Year",
                                   "Gender"])["Duration_Seconds"].agg(
                                  ["sum", "count"]
                                  ).reset_index().dropna(subset=["sum"])

# write this to csv file
df_groupby2.to_csv(os.path.join(cwd, "Citibike_Clean", "GroupBy_User_Birth_Gender.csv"),
                 index=False)

df_groupby2.head()

Unnamed: 0,User Type,Birth Year,Gender,sum,count
1,Customer,1857.0,1,7612.0,6
6,Customer,1863.0,0,3729.0,2
10,Customer,1874.0,1,8280.0,6
12,Customer,1880.0,0,702.0,1
16,Customer,1884.0,1,2809.0,2


In [15]:
# to load the dataset back into memory
# just to show you how
df_total = pd.read_csv(os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC.csv.zip"),
                       compression="zip")
df_total.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds
0,3226.0,3165.0,25542,Subscriber,1965.0,2,2017,1,1,0,680
1,3263.0,498.0,21136,Subscriber,1987.0,2,2017,1,1,0,1283
2,3143.0,3152.0,18147,Customer,,0,2017,1,1,0,649
3,3143.0,3152.0,21211,Customer,,0,2017,1,1,0,632
4,3143.0,3152.0,26819,Customer,,0,2017,1,1,0,622


In [16]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55191369 entries, 0 to 55191368
Data columns (total 11 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Start Station ID  float64
 1   End Station ID    float64
 2   Bike ID           int64  
 3   User Type         object 
 4   Birth Year        float64
 5   Gender            int64  
 6   Start Year        int64  
 7   Start Month       int64  
 8   Start Day         int64  
 9   Start Hour        int64  
 10  Duration_Seconds  int64  
dtypes: float64(3), int64(7), object(1)
memory usage: 4.5+ GB
