In [3]:
import os

import numpy as np
import pandas as pd

## Q1. Downloading the data

In [9]:
# Load the January 2022 yellow-taxi trips.
# The data directory can be overridden with the TAXI_DATA_DIR environment
# variable; the fallback is the original local download folder, so the
# notebook behaves as before when the variable is unset.
jan = pd.read_parquet(
    os.path.join(
        os.environ.get("TAXI_DATA_DIR", "C:/Users/boldy/Downloads"),
        "yellow_tripdata_2022-01.parquet",
    )
)

In [14]:
# Inspect the column names available in the January data.
jan.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [12]:
# (rows, columns) of the raw January frame.
jan.shape

(2463931, 19)

## Q2. Computing duration

In [27]:
# Trip duration as a timedelta (dropoff minus pickup), then expressed in minutes.
# Dividing by a one-minute Timedelta is a vectorised operation; the previous
# per-row `.apply(lambda ...)` executed a Python call for every trip.
jan['duration'] = pd.to_datetime(jan['tpep_dropoff_datetime']) - pd.to_datetime(jan['tpep_pickup_datetime'])
jan['duration_min'] = jan['duration'] / pd.Timedelta(minutes=1)

In [28]:
# Standard deviation of the trip duration (in minutes) over all January trips.
jan['duration_min'].std()

46.44530513776802

## Q3. Dropping outliers

In [30]:
# Share of trips whose duration lies in [1, 60] minutes (both ends inclusive).
jan['duration_min'].between(1, 60).mean()

0.9827547930522406

In [31]:
# Keep only trips lasting between 1 and 60 minutes, inclusive.
jan_no_out = jan.loc[jan['duration_min'].between(1, 60)]

## Q4. One-hot encoding

In [84]:
# One record (dict) per trip with the pickup/dropoff location IDs as strings,
# so that DictVectorizer treats each location as a category.
train_dict = (
    jan_no_out.loc[:, ["PULocationID", "DOLocationID"]]
    .astype(str)
    .to_dict(orient="records")
)

In [41]:
from sklearn.feature_extraction import DictVectorizer

In [97]:
# Fit the vectorizer on the training records and encode them in a single pass.
# `dv` is kept so the same feature mapping can be reused on other data.
dv = DictVectorizer()
X = dv.fit_transform(train_dict)

In [86]:
# Show the summary of the encoded training matrix (shape, dtype, stored elements).
X

<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
	with 4842880 stored elements in Compressed Sparse Row format>

## Q5. Training a model

In [67]:
# Regression target: trip duration in minutes for the filtered January trips.
y = jan_no_out["duration_min"].to_numpy()

In [56]:
from sklearn.linear_model import LinearRegression

In [87]:
# Ordinary least squares on the one-hot location features; `fit` returns the
# estimator itself, which is displayed as the cell output.
lm = LinearRegression().fit(X, y)
lm

In [88]:
# In-sample predictions: the model is evaluated on the same matrix it was fitted on.
pred = lm.predict(X)
# Bare expression so the notebook displays a preview of the predictions.
pred

array([ 9.45450433, 16.13442829, 13.81654061, ..., 10.61549065,
       10.29014437, 26.61826112])

In [61]:
from sklearn.metrics import mean_squared_error

In [89]:
# Training RMSE in minutes.
# The `squared=False` argument is deprecated in scikit-learn 1.4 and removed in
# 1.6; taking the square root of the MSE gives the same value on every version.
np.sqrt(mean_squared_error(y, pred))

6.986190837370544

## Q6. Evaluating the model

In [90]:
# Load the February 2022 yellow-taxi trips used for validation.
# The data directory can be overridden with the TAXI_DATA_DIR environment
# variable; the fallback is the original local download folder, so the
# notebook behaves as before when the variable is unset.
feb = pd.read_parquet(
    os.path.join(
        os.environ.get("TAXI_DATA_DIR", "C:/Users/boldy/Downloads"),
        "yellow_tripdata_2022-02.parquet",
    )
)

In [92]:
# Prepare February exactly like January: duration in minutes, then keep only
# trips lasting between 1 and 60 minutes (inclusive).
# Dividing by a one-minute Timedelta is vectorised; the previous per-row
# `.apply(lambda ...)` executed a Python call for every trip.
feb['duration'] = pd.to_datetime(feb['tpep_dropoff_datetime']) - pd.to_datetime(feb['tpep_pickup_datetime'])
feb['duration_min'] = feb['duration'] / pd.Timedelta(minutes=1)
feb_no_out = feb[(feb['duration_min'] >= 1) & (feb['duration_min'] <= 60)]

In [93]:
# One record (dict) per February trip with the location IDs cast to strings,
# matching the representation used for the training records.
feb_dict = (
    feb_no_out.loc[:, ["PULocationID", "DOLocationID"]]
    .astype(str)
    .to_dict(orient="records")
)

In [98]:
# Encode February with the vectorizer fitted on January (transform only, no
# refit), so the validation matrix has the same columns as the training matrix.
X_val = dv.transform(feb_dict)

In [99]:
# Validation target: trip duration in minutes for the filtered February trips.
y_val = feb_no_out['duration_min'].to_numpy()

In [100]:
# Predict February durations with the model fitted on the January data.
pred_val = lm.predict(X_val)

In [101]:
# Validation RMSE in minutes (February data, model fitted on January).
# The `squared=False` argument is deprecated in scikit-learn 1.4 and removed in
# 1.6; taking the square root of the MSE gives the same value on every version.
np.sqrt(mean_squared_error(y_val, pred_val))

7.786409085078911