In [1]:
import pandas as pd
import numpy as np

# Read the raw taxi trip records; the path is relative to the notebook's
# working directory.
raw_dataset_path = 'data/taxi_data.csv'
df = pd.read_csv(raw_dataset_path)

In [2]:
# Structural overview of the freshly loaded frame: its (rows, columns)
# dimensions followed by a rich preview of the leading rows.
frame_shape = df.shape
print(f"dataset shape: {frame_shape}")
print("first 5 rows:")
leading_rows = df.head()
display(leading_rows)

# Column dtypes exactly as pandas inferred them on load, i.e. before any of
# the cleaning cells below coerce or round values.
print("data types before conversion:")
inferred_dtypes = df.dtypes
print(inferred_dtypes)

dataset shape: (1000, 19)
first 5 rows:


Unnamed: 0,vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01T00:32:10.000,2023-01-01T00:40:36.000,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01T00:55:08.000,2023-01-01T01:01:27.000,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01T00:25:04.000,2023-01-01T00:37:49.000,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01T00:03:48.000,2023-01-01T00:13:25.000,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01T00:10:29.000,2023-01-01T00:21:19.000,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


data types before conversion:
vendorid                   int64
tpep_pickup_datetime      object
tpep_dropoff_datetime     object
passenger_count          float64
trip_distance            float64
ratecodeid               float64
store_and_fwd_flag        object
pulocationid               int64
dolocationid               int64
payment_type               int64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
improvement_surcharge    float64
total_amount             float64
congestion_surcharge     float64
airport_fee              float64
dtype: object


In [3]:
# Clean the surcharge / total columns:
# tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee

columns_to_clean = [
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
    'airport_fee'
]

# Each column goes through the same pipeline on a working Series and is
# written back to df once at the end of the iteration:
#   coerce to numeric -> fill gaps with 0 -> floor negatives at 0 -> round.
for col in columns_to_clean:
    # Anything that cannot be parsed as a number becomes NaN.
    cleaned = pd.to_numeric(df[col], errors='coerce')

    # Report missing / unparseable entries and replace them with 0.
    null_count = cleaned.isna().sum()
    if null_count > 0:
        print(f"{null_count} missing values in {col}")
        cleaned = cleaned.fillna(0)

    # Report negative entries and raise them to 0.
    neg_count = (cleaned < 0).sum()
    if neg_count > 0:
        print(f" {neg_count} negatives values in {col}")
        cleaned = cleaned.clip(lower=0)

    # Monetary amounts are kept at 2 decimal places.
    df[col] = cleaned.round(2)

print()
print("cleaned cols:")
cleaned_summary = df[columns_to_clean].describe()
display(cleaned_summary)

 6 negatives values in improvement_surcharge
 6 negatives values in total_amount
 5 negatives values in congestion_surcharge

cleaned cols:


Unnamed: 0,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.15381,0.9905,25.27634,2.31,0.05375
std,1.113337,0.091477,15.978075,0.662827,0.253698
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,15.48,2.5,0.0
50%,0.0,1.0,20.52,2.5,0.0
75%,0.0,1.0,29.76,2.5,0.0
max,18.3,1.0,189.98,2.5,1.25


In [4]:
# Validate improvement_surcharge against the amounts the author noted from
# the NYC TLC data dictionary and documentation:
#   $0.30  surcharge from 2015 - 2018
#   $0.50  prior to February 2022
#   $1.00  after February 2022
#   $0.00  for some exempt trips
improvement = df['improvement_surcharge']
print("improvement_surcharge value counts:")
display(improvement.value_counts().sort_index())

# Flag every row whose surcharge is not one of the documented amounts.
expected_improvement_values = [0.0, 0.3, 0.5, 1.0]
is_expected = improvement.isin(expected_improvement_values)
mask = ~is_expected
unexpected_count = mask.sum()
if unexpected_count > 0:
    print(f"{unexpected_count} records with unexpected values:")
    display(improvement[mask].value_counts())

    # Flagged rows are overwritten with the mode of the valid rows (the most
    # frequent expected amount), not with the numerically nearest amount.
    most_common_value = improvement[is_expected].mode()[0]
    df.loc[mask, 'improvement_surcharge'] = most_common_value
    print(f"fixed unexpected values by setting them to {most_common_value}")

improvement_surcharge value counts:


improvement_surcharge
0.0      6
0.3      5
1.0    989
Name: count, dtype: int64

In [5]:
# Validate congestion_surcharge against the amounts the author noted from
# the NYC TLC regulations:
#   $2.50  standard rides in the congestion zone
#   $2.75  shared rides
#   $0.00  exempt trips or trips outside the congestion zone
congestion = df['congestion_surcharge']
print("congestion_surcharge value counts:")
display(congestion.value_counts().sort_index())

# Flag every row whose surcharge is not one of the documented amounts.
expected_congestion_values = [0.0, 2.5, 2.75]
is_expected = congestion.isin(expected_congestion_values)
mask = ~is_expected
unexpected_count = mask.sum()
if unexpected_count > 0:
    print(f"{unexpected_count} records with unexpected vlaues:")
    display(congestion[mask].value_counts())

    # Flagged rows are overwritten with the mode of the valid rows.
    most_common_value = congestion[is_expected].mode()[0]
    df.loc[mask, 'congestion_surcharge'] = most_common_value
    print(f"fixed unexpected values by setting them to {most_common_value}")

congestion_surcharge value counts:


congestion_surcharge
0.0     76
2.5    924
Name: count, dtype: int64

In [6]:
# Validate airport_fee against the amounts the author noted from the
# NYC TLC regulations:
#   $1.25  pickups at JFK or LaGuardia airports
#   $0.00  all other trips
airport = df['airport_fee']
print("airport_fee value counts:")
display(airport.value_counts().sort_index())

# Flag every row whose fee is not one of the documented amounts.
expected_airport_values = [0.0, 1.25]
is_expected = airport.isin(expected_airport_values)
mask = ~is_expected
unexpected_count = mask.sum()
if unexpected_count > 0:
    print(f"{unexpected_count} records with unexpected airport_fee values:")
    display(airport[mask].value_counts())

    # Flagged rows are overwritten with the mode of the valid rows.
    most_common_value = airport[is_expected].mode()[0]
    df.loc[mask, 'airport_fee'] = most_common_value
    print(f"Fixed unexpected values by setting them to {most_common_value}")

airport_fee value counts:


airport_fee
0.00    957
1.25     43
Name: count, dtype: int64

In [7]:
# Reconcile total_amount against the sum of its itemised components.
#
# Any row whose recorded total differs from the component sum by more than
# one cent has its total_amount overwritten with the computed sum.
#
# NOTE(review): the sampled mismatches in this cell's recorded output are
# exactly 1.25 and 2.5, i.e. the airport_fee / congestion_surcharge amounts.
# That may mean the source total already accounts for those charges some
# other way (for example inside `extra`) - confirm against the TLC data
# dictionary before trusting the overwrite.
# NOTE(review): fare_amount, extra, mta_tax and tip_amount are never floored
# at 0 in this notebook, so an overwritten total can be negative even though
# total_amount itself was clipped to >= 0 earlier - confirm this is intended.

component_columns = [
    'fare_amount', 'extra', 'mta_tax', 'tip_amount',
    'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee'
]

# Normalise every component: unparseable -> NaN -> 0, then 2 decimal places.
for col in component_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).round(2)

# Intermediate results live in standalone Series rather than scratch columns
# on df, so an interrupted or failing run can never leave helper columns
# behind in the frame that is later exported to CSV.
calculated_total = df[component_columns].sum(axis=1).round(2)
difference = (df['total_amount'] - calculated_total).abs().round(2)

# Rows off by more than one cent; computed once and reused below.
mismatch = difference > 0.01

# Snapshot of the mismatching rows (taken before the fix) together with the
# computed total and the absolute difference, for reporting.
discrepancies = df.loc[mismatch].assign(
    calculated_total=calculated_total[mismatch],
    difference=difference[mismatch],
)

print(f"{len(discrepancies)} records where total_amount doesn't match the sum of components")
if len(discrepancies) > 0:
    print("sample:")
    display(discrepancies[['total_amount', 'calculated_total', 'difference']].head())

    df.loc[mismatch, 'total_amount'] = calculated_total[mismatch]
    print("total amount fixed using calculated total amount")

293 records where total_amount doesn't match the sum of components
sample:


Unnamed: 0,total_amount,calculated_total,difference
3,20.85,22.1,1.25
11,46.55,49.05,2.5
17,29.5,32.0,2.5
18,13.6,16.1,2.5
19,20.6,23.1,2.5


total amount fixed using calculated total amount


In [9]:
# Final preview of the cleaned frame before it is written out.
print("cleaned data sample:")
cleaned_preview = df.head()
display(cleaned_preview)

monetary_columns = [
    'fare_amount', 'extra', 'mta_tax', 'tip_amount',
    'tolls_amount', 'improvement_surcharge', 'total_amount',
    'congestion_surcharge', 'airport_fee'
]

# Enforce 2-decimal precision on every monetary column present in df.
present_monetary_columns = [name for name in monetary_columns if name in df.columns]
for name in present_monetary_columns:
    df[name] = df[name].round(2)

print("\ndecimal precision in columns:")
monetary_preview = df[monetary_columns].head()
display(monetary_preview)

# Persist the cleaned frame next to the raw file; the index is not written.
cleaned_dataset_path = 'data/cleaned_taxi_data.csv'
df.to_csv(cleaned_dataset_path, index=False)
print("cleaned data saved " + cleaned_dataset_path)

cleaned data sample:


Unnamed: 0,vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01T00:32:10.000,2023-01-01T00:40:36.000,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01T00:55:08.000,2023-01-01T01:01:27.000,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01T00:25:04.000,2023-01-01T00:37:49.000,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01T00:03:48.000,2023-01-01T00:13:25.000,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,22.1,0.0,1.25
4,2,2023-01-01T00:10:29.000,2023-01-01T00:21:19.000,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0



decimal precision in columns:


Unnamed: 0,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,12.1,7.25,0.5,0.0,0.0,1.0,22.1,0.0,1.25
4,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


cleaned data saved data/cleaned_taxi_data.csv
