In [1]:
import os
import pandas as pd
import numpy as np
from IPython.display import display

cwd = os.getcwd()
folder = os.path.join(cwd, "Citibike_Data")

In [2]:
for (dirpath, dirnames, filenames) in os.walk(folder):
    pass
filenames = sorted(filenames)
filenames[:5]

['201701-citibike-tripdata.csv.zip',
 '201702-citibike-tripdata.csv.zip',
 '201703-citibike-tripdata.csv.zip',
 '201704-citibike-tripdata.csv.zip',
 '201705-citibike-tripdata.csv.zip']

In [3]:
# function that is the workhouse
def Get_Data(filename=filenames[0], pd=pd):
    """
    Does all the work.  Read in DataFrames, parse them then return the dataframes
    """
    # read into dataframe
    df = pd.read_csv(os.path.join(folder, filename), compression="zip").iloc[:, 1:]

    # change to datetime both start time and stop time
    # columns change their names for some reason...
    df.iloc[:, 0] = pd.to_datetime(df.iloc[:, 0])
    df.iloc[:, 1] = pd.to_datetime(df.iloc[:, 1])

    # get the Year, Month and Day
    df["Start Year"] = df.iloc[:, 0].dt.year
    df["Start Month"] = df.iloc[:, 0].dt.month
    df["Start Day"] = df.iloc[:, 0].dt.day
    df["Duration_Seconds"] = (df.iloc[:, 1] - df.iloc[:, 0]).dt.total_seconds().astype('int64')

    # no longer need Start Time and Stop Time
    df = df.iloc[:,2:]
    
    # make sure column names are correct
    df.columns = ['Start Station ID', 'Start Station Name', 'Start Station Latitude',
                  'Start Station Longitude', 'End Station ID', 'End Station Name',
                  'End Station Latitude', 'End Station Longitude', 'Bike ID', 'User Type',
                  'Birth Year', 'Gender', 'Start Year', 'Start Month', 'Start Day',
                  'Duration_Seconds']
    
    # return the dataframe
    return df

In [4]:
# for each file name append the dataframes together
for x in filenames:
    
    # for first one just generate the dataframe
    if x == filenames[0]:
        df_total = Get_Data(filename=x)
    
    # for the following append them
    df_total = df_total.append(Get_Data(filename=x))

# reset the index
df_total.reset_index(drop=True, inplace=True)
    
# turn everything categorical    
df_total = df_total.astype({"Start Station ID":'category',
                 "Start Station Name":'category',
                 "End Station ID":'category',
                 "End Station Name":'category',
                 "Bike ID":'category',
                 "User Type":'category',
                 "Birth Year":'category',
                 "Gender":'category',
                 "Start Year":'category',
                 "Start Month":'category',
                 "Start Day":'category'})
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55191369 entries, 0 to 55191368
Data columns (total 16 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   Start Station ID         category
 1   Start Station Name       category
 2   Start Station Latitude   float64 
 3   Start Station Longitude  float64 
 4   End Station ID           category
 5   End Station Name         category
 6   End Station Latitude     float64 
 7   End Station Longitude    float64 
 8   Bike ID                  category
 9   User Type                category
 10  Birth Year               category
 11  Gender                   category
 12  Start Year               category
 13  Start Month              category
 14  Start Day                category
 15  Duration_Seconds         int64   
dtypes: category(11), float64(4), int64(1)
memory usage: 2.9 GB


In [5]:
df_total.sample(10)

Unnamed: 0,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Duration_Seconds
5307743,232.0,Cadman Plaza E & Tillary St,40.695977,-73.990149,3347.0,Van Brunt St & Wolcott St,40.677343,-74.012751,28529,Subscriber,1984.0,1,2017,5,20,943
53990589,3344.0,Pioneer St & Van Brunt St,40.679043,-74.011169,3382.0,Carroll St & Smith St,40.680611,-73.994758,34943,Subscriber,1985.0,1,2019,11,24,496
47316400,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3314.0,W 95 St & Broadway,40.79377,-73.971888,33909,Subscriber,1991.0,1,2019,8,19,365
21630431,457.0,Broadway & W 58 St,40.766953,-73.981693,530.0,11 Ave & W 59 St,40.771522,-73.990541,24815,Subscriber,1983.0,1,2018,5,13,475
1899429,3242.0,Schermerhorn St & Court St,40.691029,-73.991834,414.0,Pearl St & Anchorage Pl,40.702819,-73.987658,26814,Customer,,0,2017,2,19,1054
30856254,280.0,E 10 St & 5 Ave,40.73332,-73.995101,466.0,W 25 St & 6 Ave,40.743954,-73.991449,20532,Subscriber,1955.0,1,2018,10,6,801
19111046,490.0,8 Ave & W 33 St,40.751551,-73.993934,494.0,W 26 St & 8 Ave,40.747348,-73.997236,17341,Subscriber,1976.0,1,2018,3,8,178
45325212,3549.0,Grand Ave & Bergen St,40.678045,-73.962408,3016.0,Kent Ave & N 7 St,40.720368,-73.961651,39405,Subscriber,1975.0,1,2019,7,24,1716
19573406,3526.0,31 St & 23 Ave,40.774788,-73.912555,3515.0,24 Ave & 26 St,40.774591,-73.918544,29775,Subscriber,1988.0,1,2018,3,15,211
37171759,3525.0,23 Ave & 27 St,40.776165,-73.915318,3129.0,Queens Plaza North & Crescent St,40.751102,-73.940737,32094,Subscriber,1962.0,1,2019,3,17,1017


In [6]:
# generate a table of all unique Start and End Stations
df_unique_start = df_total.loc[:, ['Start Station ID', 'Start Station Name',
                                   'Start Station Latitude', 'Start Station Longitude']].copy()
df_unique_end = df_total.loc[:, ['End Station ID', 'End Station Name',
                                   'End Station Latitude', 'End Station Longitude']].copy()

# now slice out all the Station Stuff except for the ID's
df_total = df_total.loc[:, ['Start Station ID', 'End Station ID', 'Bike ID', 'User Type',
       'Birth Year', 'Gender', 'Start Year', 'Start Month', 'Start Day',
       'Duration_Seconds']]

df_unique_start.head()

Unnamed: 0,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude
0,3226.0,W 82 St & Central Park West,40.78275,-73.97137
1,3263.0,Cooper Square & E 7 St,40.729236,-73.990868
2,3143.0,5 Ave & E 78 St,40.776829,-73.963888
3,3143.0,5 Ave & E 78 St,40.776829,-73.963888
4,3143.0,5 Ave & E 78 St,40.776829,-73.963888


In [7]:
# drop all duplicates
df_unique_start = df_unique_start.drop_duplicates()
df_unique_end = df_unique_end.drop_duplicates()
df_unique_end.columns = df_unique_start.columns

df_unique_start.shape, df_unique_end.shape

((1222, 4), (1280, 4))

In [8]:
# append the 2 lists and drop al duplicates as well by ID
df_unique = df_unique_start.append(df_unique_end)
df_unique = df_unique.drop_duplicates(subset=["Start Station ID"])
df_unique.shape

(1077, 4)

In [9]:
# write the unique values to an external lookup table
df_unique.sort_values(by=["Start Station ID"], inplace=True)
df_unique.to_csv(os.path.join(cwd, "Citibike_Clean", "Station_ID.csv"),
                 index=False)

In [10]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55191369 entries, 0 to 55191368
Data columns (total 10 columns):
 #   Column            Dtype   
---  ------            -----   
 0   Start Station ID  category
 1   End Station ID    category
 2   Bike ID           category
 3   User Type         category
 4   Birth Year        category
 5   Gender            category
 6   Start Year        category
 7   Start Month       category
 8   Start Day         category
 9   Duration_Seconds  int64   
dtypes: category(9), int64(1)
memory usage: 1.0 GB


In [11]:
# write the csv
df_total.to_csv(os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC.csv.zip"),
                index=False, compression="zip")

In [12]:
# aggregate the data
df_total.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Duration_Seconds
0,3226.0,3165.0,25542,Subscriber,1965.0,2,2017,1,1,680
1,3263.0,498.0,21136,Subscriber,1987.0,2,2017,1,1,1283
2,3143.0,3152.0,18147,Customer,,0,2017,1,1,649
3,3143.0,3152.0,21211,Customer,,0,2017,1,1,632
4,3143.0,3152.0,26819,Customer,,0,2017,1,1,622


In [13]:
# groupby on User Type, Birth Year, Gender and Start Date
df_groupby = df_total.groupby(by=["Start Year", "Start Month", "Start Day",
                                  "User Type", "Birth Year", "Gender"])["Duration_Seconds"].agg(
                              ["sum", "count"]
                              ).reset_index().dropna(subset=["sum"])

# write this to csv file
df_groupby.to_csv(os.path.join(cwd, "Citibike_Clean", "GroupBy_User_Birth_Gender.csv"),
                 index=False)

df_groupby.head()

Unnamed: 0,Start Year,Start Month,Start Day,User Type,Birth Year,Gender,sum,count
226,2017,1,1,Customer,1958.0,1,7184.0,2
242,2017,1,1,Customer,1963.0,2,7178.0,2
244,2017,1,1,Customer,1964.0,1,9584.0,10
292,2017,1,1,Customer,1980.0,1,35808.0,2
295,2017,1,1,Customer,1981.0,1,8890.0,6


In [15]:
# to load the dataset back into memory
# just to show you how
df_total = pd.read_csv(os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC.csv.zip"),
                       compression="zip")
df_total.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Duration_Seconds
0,3226.0,3165.0,25542,Subscriber,1965.0,2,2017,1,1,680
1,3263.0,498.0,21136,Subscriber,1987.0,2,2017,1,1,1283
2,3143.0,3152.0,18147,Customer,,0,2017,1,1,649
3,3143.0,3152.0,21211,Customer,,0,2017,1,1,632
4,3143.0,3152.0,26819,Customer,,0,2017,1,1,622
