In [1]:
import os
import pandas as pd
import numpy as np
from IPython.display import display

cwd = os.getcwd()
folder = os.path.join(cwd, "Citibike_Data")

# Citibike New York City Data

- One can find the system-data at the following [link](https://www.citibikenyc.com/system-data)
    - [S3 Amazon Bucket](https://s3.amazonaws.com/tripdata/index.html)

In [2]:
# get all filenames for the .csv.zip files
for (dirpath, dirnames, filenames) in os.walk(folder):
    pass
filenames = sorted(filenames)
filenames[:5]

['201701-citibike-tripdata.csv.zip',
 '201702-citibike-tripdata.csv.zip',
 '201703-citibike-tripdata.csv.zip',
 '201704-citibike-tripdata.csv.zip',
 '201705-citibike-tripdata.csv.zip']

# Workhorse Function to read in the 36 datafiles

- files are compressed and presented as .csv.zip

In [3]:
# function that is the workhouse to generate the entire dataset
def Get_Data(filename=filenames[0], pd=pd):
    """
    Does all the work.  Read in DataFrames, parse them then return the dataframes
    """
    # read into dataframe
    df = pd.read_csv(os.path.join(folder, filename), compression="zip").iloc[:, 1:]

    # change to datetime both start time and stop time
    # columns change their names for some reason...
    df.iloc[:, 0] = pd.to_datetime(df.iloc[:, 0])
    df.iloc[:, 1] = pd.to_datetime(df.iloc[:, 1])

    # get the Year, Month and Day
    df["Start Year"] = df.iloc[:, 0].dt.year
    df["Start Month"] = df.iloc[:, 0].dt.month
    df["Start Day"] = df.iloc[:, 0].dt.day
    df["Start Hour"] = df.iloc[:, 0].dt.hour
    df["Duration_Seconds"] = (df.iloc[:, 1] - df.iloc[:, 0]).dt.total_seconds().astype('int64')

    # no longer need Start Time and Stop Time
    df = df.iloc[:,2:]
    
    # make sure column names are correct
    df.columns = ['Start Station ID', 'Start Station Name', 'Start Station Latitude',
                  'Start Station Longitude', 'End Station ID', 'End Station Name',
                  'End Station Latitude', 'End Station Longitude', 'Bike ID', 'User Type',
                  'Birth Year', 'Gender', 'Start Year', 'Start Month', 'Start Day', 'Start Hour',
                  'Duration_Seconds']
    
    # return the dataframe
    return df

In [4]:
# for each file name append the dataframes together
for x in filenames:
    
    # for first one just generate the dataframe
    if x == filenames[0]:
        df_total = Get_Data(filename=x)
    
    # for the following append them
    df_total = df_total.append(Get_Data(filename=x))

# reset the index
df_total.reset_index(drop=True, inplace=True)
    
# turn everything categorical    
df_total = df_total.astype({"Start Station ID":'category',
                 "Start Station Name":'category',
                 "End Station ID":'category',
                 "End Station Name":'category',
                 "Bike ID":'category',
                 "User Type":'category',
                 "Birth Year":'category',
                 "Gender":'category',
                 "Start Year":'category',
                 "Start Month":'category',
                 "Start Day":'category',
                 'Start Hour':'category'})
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55191369 entries, 0 to 55191368
Data columns (total 17 columns):
 #   Column                   Dtype   
---  ------                   -----   
 0   Start Station ID         category
 1   Start Station Name       category
 2   Start Station Latitude   float64 
 3   Start Station Longitude  float64 
 4   End Station ID           category
 5   End Station Name         category
 6   End Station Latitude     float64 
 7   End Station Longitude    float64 
 8   Bike ID                  category
 9   User Type                category
 10  Birth Year               category
 11  Gender                   category
 12  Start Year               category
 13  Start Month              category
 14  Start Day                category
 15  Start Hour               category
 16  Duration_Seconds         int64   
dtypes: category(12), float64(4), int64(1)
memory usage: 2.9 GB


In [5]:
df_total.sample(10)

Unnamed: 0,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds
15608293,466.0,W 25 St & 6 Ave,40.743954,-73.991449,457.0,Broadway & W 58 St,40.766953,-73.981693,33347,Subscriber,1977.0,1,2017,11,15,20,1081
22351503,3260.0,Mercer St & Bleecker St,40.727064,-73.996621,501.0,FDR Drive & E 35 St,40.744219,-73.971212,18314,Subscriber,1954.0,1,2018,5,11,8,1308
9999134,301.0,E 2 St & Avenue B,40.722174,-73.983688,368.0,Carmine St & 6 Ave,40.730386,-74.00215,27909,Subscriber,1979.0,1,2017,8,13,11,512
50678982,531.0,Forsyth St & Broome St,40.718939,-73.992663,517.0,Pershing Square South,40.751581,-73.97791,39563,Subscriber,1990.0,2,2019,10,1,8,1556
50546318,3182.0,Yankee Ferry Terminal,40.686931,-74.016966,3254.0,Soissons Landing,40.692317,-74.014866,28454,Subscriber,1952.0,1,2019,9,29,14,1356
6583006,151.0,Cleveland Pl & Spring St,40.722104,-73.997249,401.0,Allen St & Rivington St,40.720196,-73.989978,18924,Subscriber,1985.0,1,2017,6,14,13,267
5462644,486.0,Broadway & W 29 St,40.746201,-73.988557,483.0,E 12 St & 3 Ave,40.732233,-73.9889,15700,Customer,,0,2017,5,24,9,1117
42306791,3770.0,Central Ave & Flushing Ave,40.702831,-73.9333,3068.0,Humboldt St & Varet St,40.703172,-73.940636,28322,Subscriber,1989.0,1,2019,6,11,1,758
23985673,3078.0,Broadway & Roebling St,40.709248,-73.960631,3116.0,Huron St & Franklin St,40.73266,-73.95826,17564,Subscriber,1987.0,1,2018,6,15,6,769
19258625,2006.0,Central Park S & 6 Ave,40.765909,-73.976342,477.0,W 41 St & 8 Ave,40.756405,-73.990026,19880,Subscriber,1968.0,1,2018,3,13,17,632


# Generate a Lookup table for all the Station ID's

- store the Start and End station ID's in the table but remove the rest of the information
- save the geospatial information in a .csv file

In [6]:
# generate a table of all unique Start and End Stations
df_unique_start = df_total.loc[:, ['Start Station ID', 'Start Station Name',
                                   'Start Station Latitude', 'Start Station Longitude']].copy()
df_unique_end = df_total.loc[:, ['End Station ID', 'End Station Name',
                                   'End Station Latitude', 'End Station Longitude']].copy()

# now slice out all the Station Stuff except for the ID's
df_total = df_total.loc[:, ['Start Station ID', 'End Station ID', 'Bike ID', 'User Type',
       'Birth Year', 'Gender', 'Start Year', 'Start Month', 'Start Day', 'Start Hour',
       'Duration_Seconds']]

df_unique_start.head()

Unnamed: 0,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude
0,3226.0,W 82 St & Central Park West,40.78275,-73.97137
1,3263.0,Cooper Square & E 7 St,40.729236,-73.990868
2,3143.0,5 Ave & E 78 St,40.776829,-73.963888
3,3143.0,5 Ave & E 78 St,40.776829,-73.963888
4,3143.0,5 Ave & E 78 St,40.776829,-73.963888


In [7]:
# drop all duplicates
df_unique_start = df_unique_start.drop_duplicates()
df_unique_end = df_unique_end.drop_duplicates()
df_unique_end.columns = df_unique_start.columns

df_unique_start.shape, df_unique_end.shape

((1222, 4), (1280, 4))

In [8]:
# append the 2 lists and drop al duplicates as well by ID
df_unique = df_unique_start.append(df_unique_end)
df_unique = df_unique.drop_duplicates(subset=["Start Station ID"]).dropna(subset=["Start Station ID"])
df_unique.shape

(1076, 4)

In [9]:
# write the unique values to an external lookup table
df_unique.sort_values(by=["Start Station ID"], inplace=True)
df_unique.to_csv(os.path.join(cwd, "Citibike_Clean", "Station_ID.csv"),
                 index=False)

In [10]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55191369 entries, 0 to 55191368
Data columns (total 11 columns):
 #   Column            Dtype   
---  ------            -----   
 0   Start Station ID  category
 1   End Station ID    category
 2   Bike ID           category
 3   User Type         category
 4   Birth Year        category
 5   Gender            category
 6   Start Year        category
 7   Start Month       category
 8   Start Day         category
 9   Start Hour        category
 10  Duration_Seconds  int64   
dtypes: category(10), int64(1)
memory usage: 1.1 GB


# The main dataset for the 3 year period is written into memory

- January 2017 - December 2019 is the date range

In [11]:
# write the csv
df_total.to_csv(os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC.csv.zip"),
                index=False, compression="zip")

In [12]:
df_total.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds
0,3226.0,3165.0,25542,Subscriber,1965.0,2,2017,1,1,0,680
1,3263.0,498.0,21136,Subscriber,1987.0,2,2017,1,1,0,1283
2,3143.0,3152.0,18147,Customer,,0,2017,1,1,0,649
3,3143.0,3152.0,21211,Customer,,0,2017,1,1,0,632
4,3143.0,3152.0,26819,Customer,,0,2017,1,1,0,622


In [13]:
# to load the dataset back into memory
del df_total

# just to show you how
df_total = pd.read_csv(os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC.csv.zip"),
                       compression="zip")
df_total.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds
0,3226.0,3165.0,25542,Subscriber,1965.0,2,2017,1,1,0,680
1,3263.0,498.0,21136,Subscriber,1987.0,2,2017,1,1,0,1283
2,3143.0,3152.0,18147,Customer,,0,2017,1,1,0,649
3,3143.0,3152.0,21211,Customer,,0,2017,1,1,0,632
4,3143.0,3152.0,26819,Customer,,0,2017,1,1,0,622


In [14]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55191369 entries, 0 to 55191368
Data columns (total 11 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Start Station ID  float64
 1   End Station ID    float64
 2   Bike ID           int64  
 3   User Type         object 
 4   Birth Year        float64
 5   Gender            int64  
 6   Start Year        int64  
 7   Start Month       int64  
 8   Start Day         int64  
 9   Start Hour        int64  
 10  Duration_Seconds  int64  
dtypes: float64(3), int64(7), object(1)
memory usage: 4.5+ GB


In [15]:
del df_total