In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

#load the raw taxi-trip export into a DataFrame
TAXI_TRIPS_CSV = 'Taxi_Trips.csv'
initial_data = pd.read_csv(TAXI_TRIPS_CSV)

In [57]:
#select desired columns by position (every row is kept); .copy() detaches the
#result from initial_data so later column assignments do not touch the raw frame
kept_column_positions = [2,3,4,5,10,13,16,17,18,20,21]
cab_fare_df = initial_data.iloc[:, kept_column_positions].copy()
#print(cab_fare_df.columns)
#print(cab_fare_df.dtypes)
#row count before any cleaning, used later to report how much data was dropped
initial_rows = cab_fare_df.shape[0]

In [58]:
#drop duplicates
#drop_duplicates() returns a new DataFrame and leaves the original untouched,
#so the result has to be assigned back for the duplicate rows to actually be removed
#NOTE(review): the selected columns include no trip identifier, so distinct trips that
#agree on every selected column are also collapsed here - confirm this is intended
cab_fare_df = cab_fare_df.drop_duplicates()

#check null values
print(cab_fare_df.isnull().sum())


Trip Start Timestamp              0
Trip End Timestamp                8
Trip Seconds                    169
Trip Miles                        6
Fare                           2045
Extras                         2045
Company                           0
Pickup Centroid Latitude      22966
Pickup Centroid Longitude     22966
Dropoff Centroid Latitude     79675
Dropoff Centroid Longitude    79675
dtype: int64


In [59]:
#handle blank latitudes and longitudes
#every centroid coordinate column gets its missing values replaced by that column's own mean
for coord_col in (
    'Pickup Centroid Latitude',
    'Pickup Centroid Longitude',
    'Dropoff Centroid Latitude',
    'Dropoff Centroid Longitude',
):
    col_mean = cab_fare_df[coord_col].mean()
    cab_fare_df[coord_col] = cab_fare_df[coord_col].fillna(col_mean)


In [66]:
#show how many nulls remain in each column, then drop every row that still contains any NA
remaining_nulls = cab_fare_df.isnull().sum()
print(remaining_nulls)
cab_fare_df = cab_fare_df.dropna(axis=0, how='any')

Trip Start Timestamp             0
Trip End Timestamp               8
Trip Seconds                   169
Trip Miles                       6
Fare                          2045
Extras                        2045
Company                          0
Pickup Centroid Latitude         0
Pickup Centroid Longitude        0
Dropoff Centroid Latitude        0
Dropoff Centroid Longitude       0
Trip Cost                     2045
Month                            0
Hour                             0
dtype: int64


In [61]:
#Chicago coordinates: 41.8832° N, 87.6324° W
#print summary statistics of each centroid coordinate column to sanity-check the ranges
for coord_col in [
    'Pickup Centroid Latitude',
    'Pickup Centroid Longitude',
    'Dropoff Centroid Latitude',
    'Dropoff Centroid Longitude',
]:
    print(cab_fare_df[coord_col].describe())
#min and max of all coordinates are appropriate

count    865247.000000
mean         41.902214
std           0.065541
min          41.660136
25%          41.878866
50%          41.899156
75%          41.979071
max          42.021224
Name: Pickup Centroid Latitude, dtype: float64
count    865247.000000
mean        -87.704695
std           0.113981
min         -87.913625
25%         -87.756047
50%         -87.642808
75%         -87.626211
max         -87.534903
Name: Pickup Centroid Longitude, dtype: float64
count    865247.000000
mean         41.892124
std           0.056692
min          41.650222
25%          41.878866
50%          41.892124
75%          41.914616
max          42.021224
Name: Dropoff Centroid Latitude, dtype: float64
count    865247.000000
mean        -87.659592
std           0.064780
min         -87.913625
25%         -87.663416
50%         -87.642649
75%         -87.626211
max         -87.534903
Name: Dropoff Centroid Longitude, dtype: float64


In [62]:
#filtering out problematic rows
#NOTE: both filters in this cell are commented out, so running it removes no rows

#filter seconds
#(if re-enabled: keeps only trips lasting 120 to 21600 seconds, i.e. 2 minutes to 6 hours)
#print(cab_fare_df['Trip Seconds'].describe())
#cab_fare_df = cab_fare_df[cab_fare_df['Trip Seconds'].between(120.0, 21600.0)]

#filter miles
#(if re-enabled: keeps only trips whose Trip Miles value is between 1.0 and 100.0)
#cab_fare_df = cab_fare_df[cab_fare_df['Trip Miles'].between(1.0, 100.0)]


In [71]:
#calculating % dropped
#compares the current row count with the count captured right after column selection
cleaned_rows = len(cab_fare_df)
dropped_rows = initial_rows - cleaned_rows
#scale to a percentage first and round last: rounding the fraction and then multiplying
#by 100 can reintroduce floating-point noise (e.g. 0.07*100 == 7.000000000000001);
#the guard avoids a ZeroDivisionError when the input file had no rows
percent_dropped = round(100 * dropped_rows / initial_rows, 2) if initial_rows else 0.0
print(f"Initial number of rows: {initial_rows}")
print(f"Final number of rows after cleaning: {cleaned_rows}")
print(f"Percent of dataset dropped: {percent_dropped}%")

Initial number of rows: 865247
Final number of rows after cleaning: 863104
Percent of dataset dropped: 0.25%


In [64]:
#Trip Cost is the element-wise sum of the Fare and Extras columns
fare_amounts = cab_fare_df['Fare']
extra_charges = cab_fare_df['Extras']
cab_fare_df['Trip Cost'] = fare_amounts + extra_charges

In [65]:
#convert timestamp columns
#the start timestamp is parsed with an explicit month/day/year, 12-hour clock + AM/PM format
trip_start_times = pd.to_datetime(
    cab_fare_df['Trip Start Timestamp'],
    format="%m/%d/%Y %I:%M:%S %p",
)
cab_fare_df['Trip Start Timestamp'] = trip_start_times
#cab_fare_df['Trip End Timestamp'] = pd.to_datetime(cab_fare_df['Trip End Timestamp'])

#derive calendar month and hour-of-day columns from the parsed start time
cab_fare_df['Month'] = trip_start_times.dt.month
cab_fare_df['Hour'] = trip_start_times.dt.hour
