In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [8]:
# Q1. Downloading the data
# Read the January 2023 yellow-taxi trip records into a DataFrame and
# report how many columns the raw table has.
df = pd.read_parquet('dataset/yellow_tripdata_2023-01.parquet')
num_columns = len(df.columns)
print(f"Q1. After reading the data for January, no. of columns are: {num_columns}")

Q1. After reading the data for January, no. of columns are: 19


In [9]:
# Q2. Computing duration & STD
# Trip duration in minutes: drop-off minus pick-up gives a timedelta column,
# which the vectorized `.dt.total_seconds()` accessor converts to float
# seconds in one pass (no per-row Python lambda needed).
trip_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df['duration'] = trip_length.dt.total_seconds() / 60

# Standard deviation of the raw durations, computed before any outlier filtering.
std_duration = df['duration'].std()
print(f"Q2. The standard deviation of the trips duration in January is: {std_duration:.2f}")

Q2. The standard deviation of the trips duration in January is: 42.59


In [10]:
# Q3. Dropping outliers
# Keep only trips whose duration is between 1 and 60 minutes (both inclusive).
# NOTE: this cell rebinds `df`, so re-running it without re-running the cells
# above compares the already-filtered frame with itself and reports 100%.
records_before = len(df)

# `.copy()` detaches the filtered rows from the original frame, so later
# column assignments on `df` do not raise pandas' SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
records_after = len(df)

# Share of rows that survived the filter, expressed as a percentage.
records_remaining = (records_after / records_before) * 100

print(f"Q3. Fraction of the records left after dropping the outliers: {records_remaining:.2f}%")

Q3. Fraction of the records left after dropping the outliers: 98.12%


In [11]:
# Q4. One-hot encoding
# Pickup and drop-off zone IDs are treated as categorical features.
categorical = ['PULocationID', 'DOLocationID']

# Cast the ID columns to strings so DictVectorizer one-hot encodes them
# instead of passing them through as numeric values. `astype` with a mapping
# returns a new frame, which avoids assigning into a filtered slice of the
# original data (the pattern that triggers SettingWithCopyWarning).
df = df.astype({column: str for column in categorical})

# One dict per trip: {'PULocationID': '...', 'DOLocationID': '...'}.
train_dicts = df[categorical].to_dict(orient='records')

# Fit the vectorizer on the training dicts; `dv` is reused on other data later.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Width of the encoded feature matrix.
num_columns = X_train.shape[1]
print(f"Q4. Number of columns in the matrix after One-Hot encoding: {num_columns}")


Q4. Number of columns in the matrix after One-Hot encoding: 515


In [12]:
# Q5. Training a model & RMSE on train
target = 'duration'
y_train = df[target].values

# Fit a plain linear regression on the encoded training matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the same data the model was fitted on (training error).
y_pred = lr.predict(X_train)

# RMSE is the square root of the MSE. The root is taken explicitly rather than
# via `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6;
# this form works on every scikit-learn version.
rmse_train = mean_squared_error(y_train, y_pred) ** 0.5
mse_train = rmse_train  # original variable name, kept for any code that still reads it
print(f"Q5. The RMSE on train: {rmse_train:.2f}")

Q5. The RMSE on train: 7.65


In [13]:
# Q6. Evaluating the model - apply this model to the validation dataset (February 2023) to get RMSE on Validation
df_val = pd.read_parquet('dataset/yellow_tripdata_2023-02.parquet')

# Same preprocessing as the training data: duration in minutes via the
# vectorized timedelta accessor, then keep trips of 1 to 60 minutes inclusive.
val_trip_length = df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
df_val['duration'] = val_trip_length.dt.total_seconds() / 60
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]

# Cast the categorical IDs to strings. `astype` with a mapping returns a new
# frame, so nothing is assigned into the filtered slice above (which would
# trigger SettingWithCopyWarning).
df_val = df_val.astype({column: str for column in categorical})
val_dicts = df_val[categorical].to_dict(orient='records')

# `transform` only (no fit): reuse the encoding fitted on the training data.
X_val = dv.transform(val_dicts)
y_val = df_val[target].values
y_val_pred = lr.predict(X_val)

# RMSE is the square root of the MSE. The root is taken explicitly rather than
# via `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
rmse_val = mean_squared_error(y_val, y_val_pred) ** 0.5
mse_val = rmse_val  # original variable name, kept for any code that still reads it
print(f"Q6. The RMSE on validation: {rmse_val:.2f}")

Q6. The RMSE on validation: 7.81
