In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
file_path = "nyc_taxi_trip_duration.csv"
data = pd.read_csv(file_path)

# Display dataset information
print(data.info())
print(data.describe())

# Check for missing values
missing = data.isnull().sum()
print("Missing Values:\n", missing)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729322 entries, 0 to 729321
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  729322 non-null  object 
 1   vendor_id           729322 non-null  int64  
 2   pickup_datetime     729322 non-null  object 
 3   dropoff_datetime    729322 non-null  object 
 4   passenger_count     729322 non-null  int64  
 5   pickup_longitude    729322 non-null  float64
 6   pickup_latitude     729322 non-null  float64
 7   dropoff_longitude   729322 non-null  float64
 8   dropoff_latitude    729322 non-null  float64
 9   store_and_fwd_flag  729322 non-null  object 
 10  trip_duration       729322 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 61.2+ MB
None
           vendor_id  passenger_count  pickup_longitude  pickup_latitude  \
count  729322.000000    729322.000000     729322.000000    729322.000000   
mean        1.53540

In [3]:
# Handle missing values (drop or impute)
data = data.dropna()

# Remove outliers in trip duration
q1 = data['trip_duration'].quantile(0.25)
q3 = data['trip_duration'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
data = data[(data['trip_duration'] >= lower_bound) & (data['trip_duration'] <= upper_bound)]

# Check for duplicates
data = data.drop_duplicates()

print("Cleaned data shape:", data.shape)

Cleaned data shape: (692359, 11)


In [5]:
from geopy.distance import geodesic

# Calculate distance between pickup and dropoff points
def haversine_distance(row):
    start = (row['pickup_latitude'], row['pickup_longitude'])
    end = (row['dropoff_latitude'], row['dropoff_longitude'])
    return geodesic(start, end).km

data['distance'] = data.apply(haversine_distance, axis=1)

# Extract temporal features
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
data['hour'] = data['pickup_datetime'].dt.hour
data['day'] = data['pickup_datetime'].dt.day
data['month'] = data['pickup_datetime'].dt.month

# One-hot encoding for categorical variables
data = pd.get_dummies(data, columns=['vendor_id', 'store_and_fwd_flag'], drop_first=True)

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Remove non-feature columns (like IDs or timestamps)
X = data.drop(columns=['trip_duration', 'id', 'pickup_datetime'], errors='ignore')

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Frequency encoding for high-cardinality categorical variables
for col in categorical_cols:
    freq_map = X[col].value_counts(normalize=True).to_dict()  # Map frequency of each category
    X[col] = X[col].map(freq_map)

# Replace NaN values (if any) generated due to mapping
X = X.fillna(0)

# Features and target variable
y = data['trip_duration']

# Ensure all columns are numeric
X = X.select_dtypes(include=[float, int])  # Retain numeric columns

# Scaling numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (553887, 10)
Testing set shape: (138472, 10)
