In [35]:
"""
copyright: abhirup.ghosh.184098@gmail.com
"""

# dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import optimize
from matplotlib.gridspec import GridSpec
import geopandas as gpd

import matplotlib as mpl
import seaborn as sns

import matplotlib.image as mpimg
import imageio

plt.style.use('ggplot')
mpl.rcParams['font.family'] = 'Times New Roman'
mpl.rcParams["savefig.facecolor"] = mpl.rcParams["axes.facecolor"]
import os

# Loading (trimmed) data

In [36]:
df = pd.read_csv('../data/taxi_trips_201905_trimmed.csv', parse_dates=['Trip Start Timestamp', 'Trip End Timestamp'])
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458362 entries, 0 to 1458361
Data columns (total 7 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   Taxi ID                     1458320 non-null  object        
 1   Trip Start Timestamp        1458362 non-null  datetime64[ns]
 2   Trip End Timestamp          1458316 non-null  datetime64[ns]
 3   Trip Miles                  1458298 non-null  float64       
 4   Trip Total                  1458239 non-null  float64       
 5   Pickup Centroid Location    1243339 non-null  object        
 6   Dropoff Centroid  Location  1189340 non-null  object        
dtypes: datetime64[ns](2), float64(2), object(3)
memory usage: 77.9+ MB


Unnamed: 0,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Miles,Trip Total,Pickup Centroid Location,Dropoff Centroid Location
0,52a94e237e6ca1f7e21267d6b906e3abaec95d6f057632...,2019-04-01,2019-04-01 00:15:00,11.9,40.75,,POINT (-87.6327464887 41.8809944707)
1,aa92002198c37cf17517fa6bc2d6031b35a2fd6bca4779...,2019-04-01,2019-04-01 00:15:00,8.0,25.78,,
2,7cbc46239a8278687e8c6117a85d1561b642d2a6aa2a67...,2019-04-01,2019-04-01 00:15:00,5.9,16.25,,
3,a8ce20d753f7419b47faeeca1365ddb4a9f55c7a21626c...,2019-04-01,2019-04-01 00:15:00,3.2,17.4,,POINT (-87.8773053996 41.9827750091)
4,82341078cd3736366413a939ab5ad4b58ce845308b8edd...,2019-04-01,2019-04-01 00:00:00,0.0,51.0,,


# Data Cleaning

## Renaming columns

I am not the greatest fan of "strings with space" column names, because it's difficult to prompt in a df.column_name format. I will instead convert them into one lowercase string where substrings are separated using underscores.

In [37]:
df = df.rename(columns={col: col.lower().replace(' ', '_') for col in df.columns})
df.columns

Index(['taxi_id', 'trip_start_timestamp', 'trip_end_timestamp', 'trip_miles',
       'trip_total', 'pickup_centroid_location', 'dropoff_centroid__location'],
      dtype='object')

## Taxi ID

The Taxi IDs are these long strings. According to the [data description](https://data.cityofchicago.org/Transportation/Taxi-Trips-2019/h4cq-z3dy):
> to protect privacy but allow for aggregate analyses, the Taxi ID is consistent for any given taxi medallion number but does not show the number

Since we are only going to be using this data for data analysis/visualisations, and not any complex modelling where patterns within these long strings might become relevant, we are just going to simplify our lives by replacing them with integers. We use the `OrdinalEncoder` function from `sklearn.preprocessing` to do this.  

In [44]:
print("Number of unique taxi_id numbers: ", df.taxi_id.nunique())

Number of unique taxi_id numbers:  4751


In [45]:
from sklearn.preprocessing import OrdinalEncoder


# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
df['taxi_id'] = ordinal_encoder.fit_transform(df[['taxi_id']])
print("Number of unique taxi_id numbers: ", df.taxi_id.nunique())
print("The Unique taxi_id numbers: ", df.taxi_id.unique())

Number of unique taxi_id numbers:  4751
The Unique taxi_id numbers:  [1533. 3108. 2265. ...  408. 2914. 3357.]
