In [1]:
## Display in Notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None)  # Show full column width

import numpy as np
import os
from collections import Counter
from sqlalchemy import create_engine
import re
from sklearn.feature_extraction.text import CountVectorizer

# first 5 cols

In [2]:
df = pd.read_csv('/Users/leisha/Documents/DAEN 328/final project/nyc_taxi.csv')
print("Data loaded successfully!")
df.info()  # Display DataFrame information

Data loaded successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             80000 non-null  int64  
 1   vendorid               80000 non-null  int64  
 2   tpep_pickup_datetime   80000 non-null  object 
 3   tpep_dropoff_datetime  80000 non-null  object 
 4   passenger_count        80000 non-null  int64  
 5   trip_distance          80000 non-null  float64
 6   ratecodeid             80000 non-null  int64  
 7   store_and_fwd_flag     80000 non-null  object 
 8   pulocationid           80000 non-null  int64  
 9   dolocationid           80000 non-null  int64  
 10  payment_type           80000 non-null  int64  
 11  fare_amount            80000 non-null  float64
 12  extra                  80000 non-null  float64
 13  mta_tax                80000 non-null  float64
 14  tip_amount             80000

In [3]:
#drop duplicates- if any exist
df = df.drop_duplicates()
print(f'Number unique rows: {len(df)}')

Number unique rows: 80000


### missing values

In [4]:
# Step 2: Identify and Standardize Missing Values

# Replace all NaN values with Pandas' NA representation
df = df.replace({np.nan: pd.NA})

# Count the number of missing values in each column
missing_values = df.isna().sum()

# Count the number of rows with at least one missing value
num_rows_with_missing = df.isnull().any(axis=1).sum()

# Display results
print(f" Number of Rows with Missing Values: {num_rows_with_missing}\n")

print(" Missing Values Count Per Column (Before Cleaning):")
print(missing_values)

 Number of Rows with Missing Values: 0

 Missing Values Count Per Column (Before Cleaning):
Unnamed: 0               0
vendorid                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
ratecodeid               0
store_and_fwd_flag       0
pulocationid             0
dolocationid             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64


### pickup_datetime 

In [5]:
print(df[['tpep_pickup_datetime']].head(20))

       tpep_pickup_datetime
0   2023-01-01T00:32:10.000
1   2023-01-01T00:55:08.000
2   2023-01-01T00:25:04.000
3   2023-01-01T00:03:48.000
4   2023-01-01T00:10:29.000
5   2023-01-01T00:50:34.000
6   2023-01-01T00:09:22.000
7   2023-01-01T00:27:12.000
8   2023-01-01T00:21:44.000
9   2023-01-01T00:39:42.000
10  2023-01-01T00:53:01.000
11  2023-01-01T00:43:37.000
12  2023-01-01T00:34:44.000
13  2023-01-01T00:09:29.000
14  2023-01-01T00:33:53.000
15  2023-01-01T00:13:04.000
16  2023-01-01T00:45:11.000
17  2023-01-01T00:04:33.000
18  2023-01-01T00:03:36.000
19  2023-01-01T00:15:23.000


In [6]:
import pandas as pd

# Your function
def standardize_birthdate(date):
    try:
        if pd.isna(date) or date in ["None", "nan", ""]:  # Handle missing values
            return "01/01/2000"
        
        date = str(date).strip()
        if "." in date:  # If the format is YYYY.MM.DD
            return pd.to_datetime(date, format="%Y.%m.%d").strftime("%m/%d/%Y")
        else:  # Assume the format is already a valid date
            return pd.to_datetime(date).strftime("%m/%d/%Y")
    except:
        return "01/01/2000"  # Replace invalid dates with default

# Convert datetime column to datetime dtype if it's not already
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], errors='coerce')

# Split into date and time
df['pickup_date'] = df['tpep_pickup_datetime'].dt.date.astype(str)
df['pickup_time'] = df['tpep_pickup_datetime'].dt.time.astype(str)

# Standardize the date format
df['pickup_date'] = df['pickup_date'].apply(standardize_birthdate)

# Final output
print(df[['tpep_pickup_datetime', 'pickup_date', 'pickup_time']].head(10))


  tpep_pickup_datetime pickup_date pickup_time
0  2023-01-01 00:32:10  01/01/2023    00:32:10
1  2023-01-01 00:55:08  01/01/2023    00:55:08
2  2023-01-01 00:25:04  01/01/2023    00:25:04
3  2023-01-01 00:03:48  01/01/2023    00:03:48
4  2023-01-01 00:10:29  01/01/2023    00:10:29
5  2023-01-01 00:50:34  01/01/2023    00:50:34
6  2023-01-01 00:09:22  01/01/2023    00:09:22
7  2023-01-01 00:27:12  01/01/2023    00:27:12
8  2023-01-01 00:21:44  01/01/2023    00:21:44
9  2023-01-01 00:39:42  01/01/2023    00:39:42


### dropoff_datetime

In [7]:
# Convert datetime column to datetime dtype if it's not already
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'], errors='coerce')

# Split into date and time
df['dropoff_date'] = df['tpep_dropoff_datetime'].dt.date.astype(str)
df['dropoff_time'] = df['tpep_dropoff_datetime'].dt.time.astype(str)

# Standardize the date format
df['dropoff_date'] = df['dropoff_date'].apply(standardize_birthdate)

# Final output
print(df[['tpep_dropoff_datetime', 'dropoff_date', 'dropoff_time']].head(10))

  tpep_dropoff_datetime dropoff_date dropoff_time
0   2023-01-01 00:40:36   01/01/2023     00:40:36
1   2023-01-01 01:01:27   01/01/2023     01:01:27
2   2023-01-01 00:37:49   01/01/2023     00:37:49
3   2023-01-01 00:13:25   01/01/2023     00:13:25
4   2023-01-01 00:21:19   01/01/2023     00:21:19
5   2023-01-01 01:02:52   01/01/2023     01:02:52
6   2023-01-01 00:19:49   01/01/2023     00:19:49
7   2023-01-01 00:49:56   01/01/2023     00:49:56
8   2023-01-01 00:36:40   01/01/2023     00:36:40
9   2023-01-01 00:50:36   01/01/2023     00:50:36


### vendor id

In [8]:
print(df[['vendorid']].head(10))
# Check for vendorid values not equal to 1 or 2
invalid_vendor_ids = df[~df['vendorid'].isin([1, 2])]

# Display them (if any)
print(invalid_vendor_ids[['vendorid']])

#vendorid is already clean

   vendorid
0         2
1         2
2         2
3         1
4         2
5         2
6         2
7         2
8         2
9         2
Empty DataFrame
Columns: [vendorid]
Index: []


### passenger count

In [9]:
print(df[['passenger_count']].head(10))
# Find rows where passenger_count is not between 1 and 6
invalid_passenger_counts = df[~df['passenger_count'].between(1, 6)]
print("Number of invalid passenger counts:", len(invalid_passenger_counts))

   passenger_count
0                1
1                1
2                1
3                0
4                1
5                1
6                1
7                1
8                1
9                1
Number of invalid passenger counts: 1432


In [10]:
# Remove rows where passenger_count is 0
df = df[df['passenger_count'] != 0]

# Optional: Reset the index if you want a clean index after removal
df.reset_index(drop=True, inplace=True)

In [11]:
#count invalid passenger_counts again to check if dataset is clean
invalid_passenger_counts = df[~df['passenger_count'].between(1, 6)]
print("Number of invalid passenger counts:", len(invalid_passenger_counts))

Number of invalid passenger counts: 0


### trip distance

In [12]:
print(df[['trip_distance']].head(10))

   trip_distance
0           0.97
1           1.10
2           2.51
3           1.43
4           1.84
5           1.66
6          11.70
7           2.95
8           3.01
9           1.80


In [13]:
# Count rows where trip_distance is 0
zero_distance_count = (df['trip_distance'] == 0).sum()
print("Number of rows with trip_distance = 0:", zero_distance_count)


Number of rows with trip_distance = 0: 1441


In [14]:
# Remove rows where trip_distance is 0
df = df[df['trip_distance'] != 0]

# Optional: Reset the index if you want a clean index after removal
df.reset_index(drop=True, inplace=True)

zero_distance_count = (df['trip_distance'] == 0).sum()
print("Number of rows with trip_distance = 0:", zero_distance_count)

Number of rows with trip_distance = 0: 0


### remove rows where dropoff < pickup

In [15]:
# Ensure the datetime columns are in proper datetime format
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], errors='coerce')
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'], errors='coerce')

# Find rows where dropoff is not after pickup
invalid_datetime_rows = df[df['tpep_dropoff_datetime'] <= df['tpep_pickup_datetime']]

# Display the rows
print(invalid_datetime_rows[['tpep_pickup_datetime', 'tpep_dropoff_datetime']])

# Optional: Count how many
print("Number of rows where dropoff is not after pickup:", len(invalid_datetime_rows))


      tpep_pickup_datetime tpep_dropoff_datetime
44855  2023-01-01 15:51:30   2023-01-01 15:51:30
Number of rows where dropoff is not after pickup: 1


In [16]:
# Remove rows where dropoff is not after pickup
df = df[df['tpep_dropoff_datetime'] > df['tpep_pickup_datetime']]

# Optional: Reset index
df.reset_index(drop=True, inplace=True)


# next 4 cols 

In [17]:
# Preview the columns you're cleaning
df[["ratecodeid", "store_and_fwd_flag", "pulocationid", "dolocationid"]].head()


Unnamed: 0,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid
0,1,N,161,141
1,1,N,43,237
2,1,N,48,238
3,1,N,107,79
4,1,N,161,137


In [18]:
# Define the mapping
ratecode_map = {
    1: "Standard rate",
    2: "JFK",
    3: "Newark",
    4: "Nassau/Westchester",
    5: "Negotiated fare",
    6: "Group ride",
    99: "Unknown"
}

# Convert to numeric first (if not already)
df['ratecodeid'] = pd.to_numeric(df['ratecodeid'], errors='coerce')

# Map the values to descriptions
df['rate_description'] = df['ratecodeid'].map(ratecode_map)

# Drop the original column if no longer needed
df.drop(columns=['ratecodeid', 'tpep_dropoff_datetime', 'tpep_pickup_datetime'], inplace=True)


# next 5 cols

In [19]:
## cleans payment_type by creaing a map of the existing numeric values to the actual payment type
def clean_payment_type(df):
    payment_type_map = {
        0: "Flex Fare trip",
        1: "Credit Card",
        2: "Cash",
        3: "No charge",
        4: "Dispute",
        5: "Unknown",
        6: "Voided trip"
    }
    df['payment_type'] = pd.to_numeric(df['payment_type'], errors='coerce')
    df['payment_type'] = df['payment_type'].map(payment_type_map)
    
    return df

## clean fare_amount, extra, mta_tax, tip_amount
def clean_fare_and_related_columns(df):
    numeric_cols = ['fare_amount', 'extra', 'mta_tax', 'tip_amount']
    
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].round(2)
        df = df[df[col] >= 0] 
    
    return df    

In [20]:
df = clean_payment_type(df)
df = clean_fare_and_related_columns(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].round(2)


In [21]:
df.drop(columns=['store_and_fwd_flag'], inplace=True)


# last 5 cols

In [22]:
# cleaning columns: tolls_amount, improvement_surcharge, total_amount, congestion_surcharge, airport_fee

columns_to_clean = [
    'tolls_amount', 
    'improvement_surcharge', 
    'total_amount', 
    'congestion_surcharge', 
    'airport_fee'
]

# converting all cols to numeric and handle missing vals
for col in columns_to_clean:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # NaN values
    null_count = df[col].isna().sum()
    if null_count > 0:
        print(f"{null_count} missing values in {col}")
        # filled with 0
        df[col] = df[col].fillna(0)
    
    # making sure all values are non negative
    neg_count = (df[col] < 0).sum()
    if neg_count > 0:
        print(f" {neg_count} negatives values in {col}")
        df[col] = df[col].clip(lower=0)
    
    # round to 2 decimal places
    df[col] = df[col].round(2)

print()
print("cleaned cols:")
display(df[columns_to_clean].describe())

 2 negatives values in improvement_surcharge
 2 negatives values in total_amount

cleaned cols:


Unnamed: 0,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
count,76401.0,76401.0,76401.0,76401.0,76401.0
mean,0.784439,0.997993,31.317227,2.226116,0.154972
std,2.511378,0.037789,25.332725,0.780836,0.411947
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,15.48,2.5,0.0
50%,0.0,1.0,21.38,2.5,0.0
75%,0.0,1.0,35.28,2.5,0.0
max,49.85,1.0,472.25,2.5,1.25


In [23]:
# checking vals for improvement_surcharge
# from NYC TLC data dictionary and documentation:
# $0.30 surcharge from 2015 - 2018
# $0.50 prior to February 2022
# $1.00 after February 2022
# $0.00 for some exempt trips
print("improvement_surcharge value counts:")
display(df['improvement_surcharge'].value_counts().sort_index())

# checking for unusual values outside the expected ranges
expected_improvement_values = [0.0, 0.3, 0.5, 1.0]
mask = ~df['improvement_surcharge'].isin(expected_improvement_values)
if mask.sum() > 0:
    print(f"{mask.sum()} records with unexpected values:")
    display(df.loc[mask, 'improvement_surcharge'].value_counts())
    
    # replacing to the nearest expected value
    most_common_value = df.loc[~mask, 'improvement_surcharge'].mode()[0]
    df.loc[mask, 'improvement_surcharge'] = most_common_value
    print(f"fixed unexpected values by setting them to {most_common_value}")

improvement_surcharge value counts:


improvement_surcharge
0.0        7
0.3      209
1.0    76185
Name: count, dtype: int64

In [24]:
# congestion_surcharge
# from NYC TLC regulations:
# $2.50 for standard rides in congestion zone
# $2.75 for shared rides
# $0.00 for exempt trips or trips outside the congestion zone
print("congestion_surcharge value counts:")
display(df['congestion_surcharge'].value_counts().sort_index())

# checking for unexpected congestion surcharge vals
expected_congestion_values = [0.0, 2.5, 2.75]
mask = ~df['congestion_surcharge'].isin(expected_congestion_values)
if mask.sum() > 0:
    print(f"{mask.sum()} records with unexpected vlaues:")
    display(df.loc[mask, 'congestion_surcharge'].value_counts())
    
    most_common_value = df.loc[~mask, 'congestion_surcharge'].mode()[0]
    df.loc[mask, 'congestion_surcharge'] = most_common_value
    print(f"fixed unexpected values by setting them to {most_common_value}")

congestion_surcharge value counts:


congestion_surcharge
0.0     8370
2.5    68031
Name: count, dtype: int64

In [25]:
# values for airport_fee
# from NYC TLC regulations:
# $1.25 for pickups at JFK or LaGuardia airports
# $0.00 for all other trips
print("airport_fee value counts:")
display(df['airport_fee'].value_counts().sort_index())

# checking for unexpected airport fee vals
expected_airport_values = [0.0, 1.25]
mask = ~df['airport_fee'].isin(expected_airport_values)
if mask.sum() > 0:
    print(f"{mask.sum()} records with unexpected airport_fee values:")
    display(df.loc[mask, 'airport_fee'].value_counts())
    
    most_common_value = df.loc[~mask, 'airport_fee'].mode()[0]
    df.loc[mask, 'airport_fee'] = most_common_value
    print(f"Fixed unexpected values by setting them to {most_common_value}")

airport_fee value counts:


airport_fee
0.00    66929
1.25     9472
Name: count, dtype: int64

In [26]:
# correcting total_amount by checking against the sum of components

component_columns = [
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 
    'tolls_amount', 'improvement_surcharge', 'congestion_surcharge', 'airport_fee'
]

for col in component_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).round(2)

# calculating the expected total
df['calculated_total'] = df[component_columns].sum(axis=1).round(2)

# comparing with the actual total_amount
df['difference'] = (df['total_amount'] - df['calculated_total']).abs().round(2)

# records with significant differences
discrepancies = df[df['difference'] > 0.01]

print(f"{len(discrepancies)} records where total_amount doesn't match the sum of components")
if len(discrepancies) > 0:
    print("sample:")
    display(discrepancies[['total_amount', 'calculated_total', 'difference']].head())
    
    df.loc[df['difference'] > 0.01, 'total_amount'] = df.loc[df['difference'] > 0.01, 'calculated_total']
    print("total amount fixed using calculated total amount")

df = df.drop(['calculated_total', 'difference'], axis=1)

17308 records where total_amount doesn't match the sum of components
sample:


Unnamed: 0,total_amount,calculated_total,difference
10,46.55,49.05,2.5
16,29.5,32.0,2.5
17,13.6,16.1,2.5
18,20.6,23.1,2.5
19,17.15,19.65,2.5


total amount fixed using calculated total amount


In [27]:
print("cleaned data sample:")
display(df.head())

monetary_columns = [
    'fare_amount', 'extra', 'mta_tax', 'tip_amount', 
    'tolls_amount', 'improvement_surcharge', 'total_amount', 
    'congestion_surcharge', 'airport_fee'
]

# rounding to have 2 decimal places
for col in monetary_columns:
    if col in df.columns:
        df[col] = df[col].round(2)

print("\ndecimal precision in columns:")
display(df[monetary_columns].head())


cleaned data sample:


Unnamed: 0.1,Unnamed: 0,vendorid,passenger_count,trip_distance,pulocationid,dolocationid,payment_type,fare_amount,extra,mta_tax,...,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_date,pickup_time,dropoff_date,dropoff_time,rate_description
0,0,2,1,0.97,161,141,Cash,9.3,1.0,0.5,...,0.0,1.0,14.3,2.5,0.0,01/01/2023,00:32:10,01/01/2023,00:40:36,Standard rate
1,1,2,1,1.1,43,237,Credit Card,7.9,1.0,0.5,...,0.0,1.0,16.9,2.5,0.0,01/01/2023,00:55:08,01/01/2023,01:01:27,Standard rate
2,2,2,1,2.51,48,238,Credit Card,14.9,1.0,0.5,...,0.0,1.0,34.9,2.5,0.0,01/01/2023,00:25:04,01/01/2023,00:37:49,Standard rate
3,4,2,1,1.43,107,79,Credit Card,11.4,1.0,0.5,...,0.0,1.0,19.68,2.5,0.0,01/01/2023,00:10:29,01/01/2023,00:21:19,Standard rate
4,5,2,1,1.84,161,137,Credit Card,12.8,1.0,0.5,...,0.0,1.0,27.8,2.5,0.0,01/01/2023,00:50:34,01/01/2023,01:02:52,Standard rate



decimal precision in columns:


Unnamed: 0,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0
4,12.8,1.0,0.5,10.0,0.0,1.0,27.8,2.5,0.0


In [28]:
df.to_csv('nyc_taxi_cleaned_data.csv', index=False)