In [1]:
import os
import pandas as pd
import numpy as np
from IPython.display import display

# working directory of the kernel; the data paths in this notebook are built
# relative to it with os.path.join(cwd, "Citibike_Clean", ...)
cwd = os.getcwd()

In [2]:
# read the zipped trip records from the Citibike_Clean folder
trips_path = os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC.csv.zip")
station_id_columns = ["Start Station ID", "End Station ID"]

# rows missing either station ID are discarded, as no distance can be
# determined for them
df = pd.read_csv(trips_path, compression="zip").dropna(subset=station_id_columns)

# with the nulls gone, both station ID columns can be stored as integers
for id_column in station_id_columns:
    df[id_column] = df[id_column].astype("int64")

df.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds
0,3226,3165,25542,Subscriber,1965.0,2,2017,1,1,0,680
1,3263,498,21136,Subscriber,1987.0,2,2017,1,1,0,1283
2,3143,3152,18147,Customer,,0,2017,1,1,0,649
3,3143,3152,21211,Customer,,0,2017,1,1,0,632
4,3143,3152,26819,Customer,,0,2017,1,1,0,622


In [3]:
# report how much memory the pandas dataframe occupies (bytes / 1_000_000),
# followed by the per-column dtype summary
megabytes = df.memory_usage().sum()/1_000_000
print(f"{megabytes} Megabytes")
df.info()

5298.114432 Megabytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 55188692 entries, 0 to 55191368
Data columns (total 11 columns):
 #   Column            Dtype  
---  ------            -----  
 0   Start Station ID  int64  
 1   End Station ID    int64  
 2   Bike ID           int64  
 3   User Type         object 
 4   Birth Year        float64
 5   Gender            int64  
 6   Start Year        int64  
 7   Start Month       int64  
 8   Start Day         int64  
 9   Start Hour        int64  
 10  Duration_Seconds  int64  
dtypes: float64(1), int64(9), object(1)
memory usage: 4.9+ GB


In [4]:
# store these columns as pandas categoricals to save on RAM
# (Duration_Seconds is left as a plain integer column)
categorical_columns = [
    "Start Station ID",
    "End Station ID",
    "Bike ID",
    "User Type",
    "Birth Year",
    "Gender",
    "Start Year",
    "Start Month",
    "Start Day",
    "Start Hour",
]
for column in categorical_columns:
    df[column] = pd.Categorical(df[column])

In [5]:
# same memory report as before the conversion, now showing the reduced footprint
size_in_megabytes = df.memory_usage().sum()/1_000_000
print(f"{size_in_megabytes} Megabytes")
df.info()

1601.406172 Megabytes
<class 'pandas.core.frame.DataFrame'>
Int64Index: 55188692 entries, 0 to 55191368
Data columns (total 11 columns):
 #   Column            Dtype   
---  ------            -----   
 0   Start Station ID  category
 1   End Station ID    category
 2   Bike ID           category
 3   User Type         category
 4   Birth Year        category
 5   Gender            category
 6   Start Year        category
 7   Start Month       category
 8   Start Day         category
 9   Start Hour        category
 10  Duration_Seconds  int64   
dtypes: category(10), int64(1)
memory usage: 1.5 GB


In [6]:
# persist the intermediate (null-free) stage to disk as a zipped csv;
# the dataframe index is not written out
no_nulls_path = os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC-no_nulls.csv.zip")
df.to_csv(no_nulls_path, index=False, compression="zip")

In [7]:
# read the intermediate csv back into memory; there is no explicit `del df`
# beforehand, so the previous frame is only released once the name is rebound
no_nulls_path = os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC-no_nulls.csv.zip")
df = pd.read_csv(no_nulls_path, compression="zip")

# re-apply the categorical dtypes to the freshly read columns to save on RAM
for column in ("Start Station ID", "End Station ID", "Bike ID", "User Type",
               "Birth Year", "Gender", "Start Year", "Start Month",
               "Start Day", "Start Hour"):
    df[column] = pd.Categorical(df[column])

df.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds
0,3226,3165,25542,Subscriber,1965.0,2,2017,1,1,0,680
1,3263,498,21136,Subscriber,1987.0,2,2017,1,1,0,1283
2,3143,3152,18147,Customer,,0,2017,1,1,0,649
3,3143,3152,21211,Customer,,0,2017,1,1,0,632
4,3143,3152,26819,Customer,,0,2017,1,1,0,622


In [8]:
# load the station location lookup into memory, dropping rows with missing values
df_loc = pd.read_csv(os.path.join(cwd, "Citibike_Clean", "Station_ID.csv")).dropna()

# change datatype for Start Station ID to integer then category.
# The column is selected *before* casting: `df_loc.astype({...})` returns the
# whole DataFrame, which is not a valid value for a single-column assignment.
station_ids = df_loc["Start Station ID"].astype("int64")
df_loc["Start Station ID"] = pd.Categorical(station_ids)
del station_ids

df_loc.head()

Unnamed: 0,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude
0,72,W 52 St & 11 Ave,40.767272,-73.993929
1,79,Franklin St & W Broadway,40.719116,-74.006667
2,82,St James Pl & Pearl St,40.711174,-74.000165
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323
4,116,W 17 St & 8 Ave,40.741776,-74.001497


In [9]:
# the station lookup is joined twice: first on the start station, then on the
# end station, so that each trip carries the coordinates of both ends
df_total = df.merge(df_loc, on="Start Station ID", suffixes=("", "_"))

# the un-merged trips frame and the station name are not needed any more
del df
df_total = df_total.drop(columns=["Start Station Name"])

# second join: End Station ID of the trips against Start Station ID of the
# lookup; lookup columns that clash with existing ones get a trailing "_"
df_total = df_total.merge(df_loc, left_on="End Station ID",
                          right_on="Start Station ID", suffixes=("", "_"))

# the lookup itself, its name column and its duplicated key can go now
del df_loc
df_total = df_total.drop(columns=["Start Station Name", "Start Station ID_"])

# the suffixed coordinate columns belong to the end station
end_station_names = {"Start Station Latitude_": "End Station Latitude",
                     "Start Station Longitude_": "End Station Longitude"}
df_total = df_total.rename(columns=end_station_names)
df_total.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds,Start Station Latitude,Start Station Longitude,End Station Latitude,End Station Longitude
0,3226,3165,25542,Subscriber,1965.0,2,2017,1,1,0,680,40.78275,-73.97137,40.775794,-73.976206
1,3226,3165,24209,Customer,,0,2017,1,1,14,1756,40.78275,-73.97137,40.775794,-73.976206
2,3226,3165,15708,Customer,,0,2017,1,1,14,1750,40.78275,-73.97137,40.775794,-73.976206
3,3226,3165,18403,Customer,,0,2017,1,1,14,1735,40.78275,-73.97137,40.775794,-73.976206
4,3226,3165,16505,Customer,,0,2017,1,1,14,1691,40.78275,-73.97137,40.775794,-73.976206


In [10]:
# Python Haversine formula for numpy arrays
def np_haversine(latit_1=-90, latit_2=90, longit_1=0, longit_2=180, radius=6371000):
    """
    Calculate the great circle distance between 2 points on a sphere (Earth is default).

    Positions are in decimal degrees and may be scalars or numpy arrays; all
    operations are numpy ufuncs, so arrays are combined element-wise.
    The default positions are the two poles (latitudes -90 and 90).

    Parameters
    ----------
    latit_1, latit_2 : latitude of the first / second point, in decimal degrees.
    longit_1, longit_2 : longitude of the first / second point, in decimal degrees.
    radius : radius of the sphere; the default 6371000 gives a result in metres.

    Returns
    -------
    The distance along the sphere, in the unit of `radius` (metres by default).
    """
    # convert decimal degrees to radians
    latit_1 = np.radians(latit_1)
    latit_2 = np.radians(latit_2)
    longit_1 = np.radians(longit_1)
    longit_2 = np.radians(longit_2)

    # calculate the difference between latitudes and longitudes
    delta_latitude = latit_1 - latit_2
    delta_longitude = longit_1 - longit_2

    # haversine term of the great circle distance
    a = np.sin(delta_latitude / 2)**2 + np.cos(latit_1) * np.cos(latit_2) * np.sin(delta_longitude / 2)**2

    # floating point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt/arcsin into NaN
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    # return the distance
    return radius * c

# sanity check: pole to pole is half a circumference, so this displays pi
np_haversine()/6371000

3.141592653589793

In [11]:
# use the haversine formula to calculate the distance between stations in
# whole metres; 1 metre is added so that a zero distance cannot produce
# infinite values in the division further down.
# The numpy result is assigned directly, so it is matched to the rows by
# position. Wrapping it in a new pd.Series would give it a fresh 0..n-1 index
# and make the assignment align on index labels instead.
df_total["Station_Distance"] = np.around(
    np_haversine(latit_1=df_total["Start Station Latitude"].values,
                 latit_2=df_total["End Station Latitude"].values,
                 longit_1=df_total["Start Station Longitude"].values,
                 longit_2=df_total["End Station Longitude"].values),
    decimals=0).astype("int64") + 1

# remove the Latitudes and Longitudes as they are no longer needed
df_total = df_total.drop(columns=["Start Station Latitude",
                                  "Start Station Longitude",
                                  "End Station Latitude",
                                  "End Station Longitude"])

# calculate seconds travelled per metre, rounded to 2 decimals
df_total["Seconds_per_Metre"] = (df_total["Duration_Seconds"]/df_total["Station_Distance"]).round(decimals=2)
df_total.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds,Station_Distance,Seconds_per_Metre
0,3226,3165,25542,Subscriber,1965.0,2,2017,1,1,0,680,875,0.78
1,3226,3165,24209,Customer,,0,2017,1,1,14,1756,875,2.01
2,3226,3165,15708,Customer,,0,2017,1,1,14,1750,875,2.0
3,3226,3165,18403,Customer,,0,2017,1,1,14,1735,875,1.98
4,3226,3165,16505,Customer,,0,2017,1,1,14,1691,875,1.93


In [12]:
# persist the final dataset to disk as a zipped csv (index not written),
# then drop the in-memory frame
final_path = os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC-FINAL.csv.zip")
df_total.to_csv(final_path, index=False, compression="zip")
del df_total

In [13]:
# load the final csv from disk into a fresh dataframe
final_path = os.path.join(cwd, "Citibike_Clean", "CitiBike-NYC-FINAL.csv.zip")
df = pd.read_csv(final_path, compression="zip")

# convert the ID, user and start-time columns to categoricals to save on RAM;
# Duration_Seconds, Station_Distance and Seconds_per_Metre stay numeric
to_categorise = ["Start Station ID", "End Station ID", "Bike ID", "User Type",
                 "Birth Year", "Gender", "Start Year", "Start Month",
                 "Start Day", "Start Hour"]
for name in to_categorise:
    df[name] = pd.Categorical(df[name])

df.head()

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds,Station_Distance,Seconds_per_Metre
0,3226,3165,25542,Subscriber,1965.0,2,2017,1,1,0,680,875,0.78
1,3226,3165,24209,Customer,,0,2017,1,1,14,1756,875,2.01
2,3226,3165,15708,Customer,,0,2017,1,1,14,1750,875,2.0
3,3226,3165,18403,Customer,,0,2017,1,1,14,1735,875,1.98
4,3226,3165,16505,Customer,,0,2017,1,1,14,1691,875,1.93


In [14]:
# select the rows whose Seconds_per_Metre is not finite (infinite or NaN);
# the displayed selection is empty
not_finite = ~np.isfinite(df["Seconds_per_Metre"])
df[not_finite]

Unnamed: 0,Start Station ID,End Station ID,Bike ID,User Type,Birth Year,Gender,Start Year,Start Month,Start Day,Start Hour,Duration_Seconds,Station_Distance,Seconds_per_Metre


In [15]:
# clean up memory: remove the name `df` so the dataframe it refers to can be
# released; any later use of `df` needs the loading cell to be run again
del df