In [None]:
# Install necessary libraries
# !pip install pandas scikit-learn

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Step 1: Download data
url_jan = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
url_feb = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

df_jan = pd.read_parquet(url_jan)
df_feb = pd.read_parquet(url_feb)

In [4]:
df_jan.head(5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [7]:
df_jan.shape

(3066766, 20)

In [3]:
# Q1: Number of columns
print("Q1: Number of columns =", df_jan.shape[1])

Q1: Number of columns = 19


In [5]:
# Step 2: Add duration column (in minutes)
for df in [df_jan, df_feb]:
  df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60

In [6]:
# Q2: Standard deviation of trip duration in January
print("Q2: Std deviation of trip duration (Jan) =", round(df_jan['duration'].std(), 2))

Q2: Std deviation of trip duration (Jan) = 42.59


In [8]:
# Q3: Drop outliers (1 <= duration <= 60)
df_jan_filtered = df_jan[(df_jan.duration >= 1) & (df_jan.duration <= 60)]
fraction_left = len(df_jan_filtered) / len(df_jan)
print(f"Q3: Fraction of records after filtering outliers: {fraction_left:.3f}")

Q3: Fraction of records after filtering outliers: 0.981


In [9]:
# Q4: One-hot encoding pickup and dropoff location IDs
categorical = ['PULocationID', 'DOLocationID']

# Recast to string
df_jan_filtered[categorical] = df_jan_filtered[categorical].astype(str)

# Convert to list of dictionaries
train_dicts = df_jan_filtered[categorical].to_dict(orient='records')

# Vectorization
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

print("Q4: Dimensionality of feature matrix:", X_train.shape[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_jan_filtered[categorical] = df_jan_filtered[categorical].astype(str)


Q4: Dimensionality of feature matrix: 515


In [10]:
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6018346 stored elements and shape (3009173, 515)>

In [11]:
# Prepare target variable
y_train = df_jan_filtered['duration'].values

# Q5: Train linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
print("Q5: RMSE on train =", round(rmse_train, 2))

Q5: RMSE on train = 7.65


In [12]:
# Q6: Validation on February 2023
df_feb_filtered = df_feb[(df_feb.duration >= 1) & (df_feb.duration <= 60)]
df_feb_filtered[categorical] = df_feb_filtered[categorical].astype(str)
val_dicts = df_feb_filtered[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_feb_filtered['duration'].values
y_pred_val = lr.predict(X_val)
rmse_val = sqrt(mean_squared_error(y_val, y_pred_val))
print("Q6: RMSE on validation =", round(rmse_val, 2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feb_filtered[categorical] = df_feb_filtered[categorical].astype(str)


Q6: RMSE on validation = 7.81
