In [1]:
!python -V

Python 3.9.12


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [6]:
! ls /home/ubuntu/data

fhv_tripdata_2021-01.parquet  green_tripdata_2021-01.parquet
fhv_tripdata_2021-02.parquet  green_tripdata_2021-02.parquet


In [7]:
def read_dataframe(filename):
    """Load one month of trip records and prepare it for modelling.

    As a side effect the function prints the record count, the mean trip
    duration, the number of records dropped by the duration filter and the
    share of rows whose pickup location is missing.

    Parameters
    ----------
    filename : str or path-like
        Path to a ``.parquet`` file providing the columns
        ``pickup_datetime``, ``dropOff_datetime``, ``PUlocationID`` and
        ``DOlocationID``.

    Returns
    -------
    pandas.DataFrame
        Trips lasting between 1 and 60 minutes (inclusive) with an added
        ``duration`` column expressed in minutes. Missing location IDs are
        replaced by -1 and both location columns are cast to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` does not end with ``.parquet``.
    """
    # str() lets callers pass pathlib.Path objects as well as plain strings.
    if not str(filename).endswith('.parquet'):
        raise ValueError("Unknown file format.")
    df = pd.read_parquet(filename)

    print("Q1 Number of records: ", len(df))

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_length = df.dropOff_datetime - df.pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60
    print("Q2 Average duration: ", round(df.duration.mean(), 2))

    # Explicit copy: the columns assigned below must land in an independent
    # frame, not in a slice of the raw data (avoids SettingWithCopyWarning).
    filtered_df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
    print("Dropped records: ", len(df) - len(filtered_df))
    df = filtered_df

    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].fillna(-1)

    # Missing pickup locations are exactly the rows that were just filled
    # with -1. Guard the ratio so an empty result does not divide by zero.
    n_missing = int((df.PUlocationID == -1).sum())
    missing_values = n_missing / len(df) * 100 if len(df) else 0.0
    print("Q3 Missing values: ", round(missing_values, 2), "%")

    # String-typed IDs are treated as categories rather than numbers by
    # dictionary-based vectorisers.
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
df_jan = read_dataframe('/home/ubuntu/data/fhv_tripdata_2021-01.parquet')

Q1 Number of records:  1154112
Q2 Average duration:  19.17
Dropped records:  44286
Q3 Missing values:  83.53 %


In [9]:
# Build the training feature matrix from the pickup/dropoff location IDs.
categorical = ['PUlocationID', 'DOlocationID']
dv = DictVectorizer()

# One dict per trip; the vectoriser is fitted on January and kept in `dv`
# so the same feature mapping can be applied to other months.
train_dicts = df_jan.loc[:, categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

print("Q4 The dimensionality of the matrix: ", X_train.shape[1])

Q4 The dimensionality of the matrix:  525


In [10]:
# Fit a plain linear regression on the January trips and report the
# in-sample error.
target = 'duration'
y_train = df_jan[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# `squared=False` is deprecated in scikit-learn 1.4 and dropped from later
# releases; taking the square root of the MSE yields the same RMSE on every
# version without a new import.
rmse_train = mean_squared_error(y_train, y_pred) ** 0.5

print("Q5 RMSE on train: ", round(rmse_train, 2))

Q5 RMSE on train:  10.53


In [12]:
# Evaluate the January-fitted model on the February trips.
df_feb = read_dataframe('/home/ubuntu/data/fhv_tripdata_2021-02.parquet')

# transform (not fit_transform): reuse the feature mapping already fitted
# in `dv` so the validation matrix has the same columns the model expects.
validation_dicts = df_feb[categorical].to_dict(orient='records')
X_val = dv.transform(validation_dicts)
y_val = df_feb[target].values

y_pred = lr.predict(X_val)

# `squared=False` is deprecated in scikit-learn 1.4 and dropped from later
# releases; the square root of the MSE is the same RMSE on every version.
rmse_val = mean_squared_error(y_val, y_pred) ** 0.5
print("Q6 RMSE on validation: ", round(rmse_val, 2))

Q1 Number of records:  1037692
Q2 Average duration:  20.71
Dropped records:  47579
Q3 Missing values:  85.71 %
Q6 RMSE on validation:  11.01
