In [1]:
## Display in Notebook
# With ast_node_interactivity set to "all", IPython displays the value of every
# expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
# NOTE(review): with both limits removed, displaying a whole large frame prints
# every row in full; use .head() / .sample() when inspecting big frames.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_colwidth', None)  # Show full column width

import numpy as np
import os
from collections import Counter
from sqlalchemy import create_engine
import re
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Location of the raw trip CSV. Set the NYC_TAXI_CSV environment variable to
# point at another copy of the file; when it is unset, the original local path
# is used, so existing runs behave exactly as before.
csv_path = os.environ.get(
    'NYC_TAXI_CSV',
    '/Users/leisha/Documents/DAEN 328/final project/nyc_taxi.csv',
)
df = pd.read_csv(csv_path)
print("Data loaded successfully!")
df.info()  # Display DataFrame information

Data loaded successfully!
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             80000 non-null  int64  
 1   vendorid               80000 non-null  int64  
 2   tpep_pickup_datetime   80000 non-null  object 
 3   tpep_dropoff_datetime  80000 non-null  object 
 4   passenger_count        80000 non-null  int64  
 5   trip_distance          80000 non-null  float64
 6   ratecodeid             80000 non-null  int64  
 7   store_and_fwd_flag     80000 non-null  object 
 8   pulocationid           80000 non-null  int64  
 9   dolocationid           80000 non-null  int64  
 10  payment_type           80000 non-null  int64  
 11  fare_amount            80000 non-null  float64
 12  extra                  80000 non-null  float64
 13  mta_tax                80000 non-null  float64
 14  tip_amount             80000

In [5]:
# Drop duplicate trips, if any exist.
# 'Unnamed: 0' looks like a row index that was written out with the CSV
# (assumption - confirm against how the file was produced). If it is unique per
# row, a whole-row comparison can never find a duplicate, so that column is left
# out of the comparison. The first occurrence of each trip is kept.
dedup_columns = [col for col in df.columns if col != 'Unnamed: 0']
df = df.drop_duplicates(subset=dedup_columns)
print(f'Number unique rows: {len(df)}')

Number unique rows: 80000


In [6]:
# Step 2: Identify Missing Values
#
# isna() already treats NaN, None, NaT and pd.NA as missing, so nothing has to
# be rewritten before counting. Replacing NaN with pd.NA is deliberately not
# done: on a numeric column that contains a gap it converts the whole column to
# object dtype, which breaks later numeric comparisons.
missing_mask = df.isna()

# Count the number of missing values in each column
missing_values = missing_mask.sum()

# Count the number of rows with at least one missing value
num_rows_with_missing = missing_mask.any(axis=1).sum()

# Display results
print(f" Number of Rows with Missing Values: {num_rows_with_missing}\n")

print(" Missing Values Count Per Column (Before Cleaning):")
print(missing_values)

 Number of Rows with Missing Values: 0

 Missing Values Count Per Column (Before Cleaning):
Unnamed: 0               0
vendorid                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
ratecodeid               0
store_and_fwd_flag       0
pulocationid             0
dolocationid             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64


In [7]:
# Preview the first 20 raw pickup timestamps before they are parsed.
pickup_preview = df[['tpep_pickup_datetime']].head(20)
print(pickup_preview)

       tpep_pickup_datetime
0   2023-01-01T00:32:10.000
1   2023-01-01T00:55:08.000
2   2023-01-01T00:25:04.000
3   2023-01-01T00:03:48.000
4   2023-01-01T00:10:29.000
5   2023-01-01T00:50:34.000
6   2023-01-01T00:09:22.000
7   2023-01-01T00:27:12.000
8   2023-01-01T00:21:44.000
9   2023-01-01T00:39:42.000
10  2023-01-01T00:53:01.000
11  2023-01-01T00:43:37.000
12  2023-01-01T00:34:44.000
13  2023-01-01T00:09:29.000
14  2023-01-01T00:33:53.000
15  2023-01-01T00:13:04.000
16  2023-01-01T00:45:11.000
17  2023-01-01T00:04:33.000
18  2023-01-01T00:03:36.000
19  2023-01-01T00:15:23.000


In [12]:
import pandas as pd

# Date normaliser used when splitting the pickup / dropoff timestamps
def standardize_birthdate(date):
    """Normalise one date-like value to an ``MM/DD/YYYY`` string.

    Despite the name, nothing here is specific to birthdates.

    Args:
        date: A scalar date-like value (string, date, Timestamp, ...), or a
            missing value.

    Returns:
        str: The date formatted as ``MM/DD/YYYY``. ``"01/01/2000"`` is returned
        for missing values, for the strings ``"None"``, ``"nan"`` and ``""``,
        and for anything that cannot be parsed as a date.
    """
    default = "01/01/2000"
    try:
        if pd.isna(date) or date in ["None", "nan", ""]:  # Handle missing values
            return default

        date = str(date).strip()

        parsed = None
        if "." in date:
            # Try the dotted YYYY.MM.DD layout first. A "." also appears in
            # timestamps with fractional seconds, so a mismatch here falls
            # through to general parsing instead of becoming the default date.
            try:
                parsed = pd.to_datetime(date, format="%Y.%m.%d")
            except ValueError:
                parsed = None
        if parsed is None:
            parsed = pd.to_datetime(date)

        # Strings such as "NaT" parse to NaT, which cannot be formatted.
        if pd.isna(parsed):
            return default
        return parsed.strftime("%m/%d/%Y")
    except Exception:
        # Best effort by design: unparseable input maps to the default date.
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return default

# Parse the pickup timestamps; values that cannot be parsed become NaT
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'], errors='coerce')

# Split into date and time.
# The date is formatted as MM/DD/YYYY in one vectorised call rather than by
# running a Python function on every row. NaT rows get the "01/01/2000"
# placeholder, the same value standardize_birthdate returns for them.
pickup_ts = df['tpep_pickup_datetime']
df['pickup_date'] = pickup_ts.dt.strftime('%m/%d/%Y').fillna('01/01/2000')
df['pickup_time'] = pickup_ts.dt.time.astype(str)

# Final output
print(df[['tpep_pickup_datetime', 'pickup_date', 'pickup_time']].head(10))


  tpep_pickup_datetime pickup_date pickup_time
0  2023-01-01 00:32:10  01/01/2023    00:32:10
1  2023-01-01 00:55:08  01/01/2023    00:55:08
2  2023-01-01 00:25:04  01/01/2023    00:25:04
3  2023-01-01 00:03:48  01/01/2023    00:03:48
4  2023-01-01 00:10:29  01/01/2023    00:10:29
5  2023-01-01 00:50:34  01/01/2023    00:50:34
6  2023-01-01 00:09:22  01/01/2023    00:09:22
7  2023-01-01 00:27:12  01/01/2023    00:27:12
8  2023-01-01 00:21:44  01/01/2023    00:21:44
9  2023-01-01 00:39:42  01/01/2023    00:39:42


In [13]:
# Parse the dropoff timestamps; values that cannot be parsed become NaT
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'], errors='coerce')

# Split into date and time.
# The date is formatted as MM/DD/YYYY in one vectorised call rather than by
# running a Python function on every row. NaT rows get the "01/01/2000"
# placeholder, the same value standardize_birthdate returns for them.
dropoff_ts = df['tpep_dropoff_datetime']
df['dropoff_date'] = dropoff_ts.dt.strftime('%m/%d/%Y').fillna('01/01/2000')
df['dropoff_time'] = dropoff_ts.dt.time.astype(str)

# Final output
print(df[['tpep_dropoff_datetime', 'dropoff_date', 'dropoff_time']].head(10))

  tpep_dropoff_datetime dropoff_date dropoff_time
0   2023-01-01 00:40:36   01/01/2023     00:40:36
1   2023-01-01 01:01:27   01/01/2023     01:01:27
2   2023-01-01 00:37:49   01/01/2023     00:37:49
3   2023-01-01 00:13:25   01/01/2023     00:13:25
4   2023-01-01 00:21:19   01/01/2023     00:21:19
5   2023-01-01 01:02:52   01/01/2023     01:02:52
6   2023-01-01 00:19:49   01/01/2023     00:19:49
7   2023-01-01 00:49:56   01/01/2023     00:49:56
8   2023-01-01 00:36:40   01/01/2023     00:36:40
9   2023-01-01 00:50:36   01/01/2023     00:50:36


In [16]:
# Peek at the first few vendor codes
print(df[['vendorid']].head(10))

# Rows whose vendorid is anything other than 1 or 2 are treated as invalid
allowed_vendor_ids = [1, 2]
is_known_vendor = df['vendorid'].isin(allowed_vendor_ids)
invalid_vendor_ids = df[~is_known_vendor]

# Show the offending rows (an empty frame means there are none)
print(invalid_vendor_ids[['vendorid']])

# Result: vendorid needs no cleaning

   vendorid
0         2
1         2
2         2
3         1
4         2
5         2
6         2
7         2
8         2
9         2
Empty DataFrame
Columns: [vendorid]
Index: []


In [25]:
print(df[['passenger_count']].head(10))
# Flag rows whose passenger_count lies outside the inclusive range 1..6
passenger_in_range = df['passenger_count'].between(1, 6)
invalid_passenger_counts = df[~passenger_in_range]
print("Number of invalid passenger counts:", len(invalid_passenger_counts))

   passenger_count
0                1
1                1
2                1
3                1
4                1
5                1
6                1
7                1
8                1
9                1
Number of invalid passenger counts: 0


In [26]:
# Keep only rows whose passenger_count is in the inclusive range 1..6.
# This is the same validity rule the surrounding check cells count against;
# filtering on "!= 0" alone would leave other out-of-range values in place.
# The index is renumbered in the same expression, so no filtered frame is
# modified in place.
df = df[df['passenger_count'].between(1, 6)].reset_index(drop=True)

In [27]:
# Re-count the out-of-range passenger counts to confirm the column is clean
still_out_of_range = ~df['passenger_count'].between(1, 6)
invalid_passenger_counts = df[still_out_of_range]
print("Number of invalid passenger counts:", len(invalid_passenger_counts))

Number of invalid passenger counts: 0


In [28]:
# Preview the first 10 trip distances
trip_distance_preview = df[['trip_distance']].head(10)
print(trip_distance_preview)

   trip_distance
0           0.97
1           1.10
2           2.51
3           1.43
4           1.84
5           1.66
6          11.70
7           2.95
8           3.01
9           1.80


In [29]:
# How many trips report a distance of exactly 0?
is_zero_distance = df['trip_distance'].eq(0)
zero_distance_count = is_zero_distance.sum()
print("Number of rows with trip_distance = 0:", zero_distance_count)


Number of rows with trip_distance = 0: 1441


In [31]:
# Remove rows where trip_distance is 0 and renumber the index in the same
# expression. Chaining reset_index returns a fresh frame instead of modifying
# the filtered result in place, so re-running the cell is safe and later
# column assignments operate on an independent frame.
df = df[df['trip_distance'] != 0].reset_index(drop=True)

# Confirm that no zero-distance trips remain
zero_distance_count = (df['trip_distance'] == 0).sum()
print("Number of rows with trip_distance = 0:", zero_distance_count)

Number of rows with trip_distance = 0: 0


In [32]:
# Make sure both timestamp columns hold datetimes (unparseable values -> NaT)
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce')

# A trip is invalid when its dropoff is at or before its pickup
dropoff_not_after_pickup = df['tpep_dropoff_datetime'] <= df['tpep_pickup_datetime']
invalid_datetime_rows = df[dropoff_not_after_pickup]

# Show the offending rows
print(invalid_datetime_rows[['tpep_pickup_datetime', 'tpep_dropoff_datetime']])

# ...and how many there are
print("Number of rows where dropoff is not after pickup:", len(invalid_datetime_rows))


      tpep_pickup_datetime tpep_dropoff_datetime
44855  2023-01-01 15:51:30   2023-01-01 15:51:30
Number of rows where dropoff is not after pickup: 1


In [33]:
# Keep only trips whose dropoff is strictly after the pickup, and renumber the
# index in the same expression. Chaining reset_index returns a fresh frame
# instead of modifying the filtered result in place.
df = df[df['tpep_dropoff_datetime'] > df['tpep_pickup_datetime']].reset_index(drop=True)
