In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

In [36]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

# Q1. Downloading the data
Data fetched for yellow taxis for January and February 2023.

In [3]:
# Load the January 2023 yellow-taxi trips and report how many columns the file has.
df = pd.read_parquet('../data/yellow_tripdata_2023-01.parquet')

df.shape[1]

19

Read the data for January. How many columns are there?
> 19

# Q2. Computing duration

In [4]:
# Trip duration in minutes. The vectorized .dt accessor converts the whole
# timedelta column at once instead of calling a Python lambda for every row.
trip_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df['duration'] = trip_length.dt.total_seconds() / 60

In [7]:
# Summary statistics of the trip duration (minutes); `std` answers Q2.
df.duration.describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

What's the standard deviation of the trip durations in January?
> std      42.59

# Q3. Dropping outliers

In [8]:
# Number of trips before the outlier filter is applied.
df.shape[0]

3066766

In [9]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# The explicit .copy() makes the result an independent frame, so assigning
# columns to it later does not raise SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

len(df)

3009173

In [11]:
# Share of trips kept by the filter: row count after / row count before,
# using the two counts displayed by the cells above.
round(3_009_173 / 3_066_766, 3)

0.981

What fraction of the records is left after dropping the outliers?
> 0.981

# Q4. One-hot encoding

In [12]:
# Pickup / drop-off zone IDs are labels, not quantities: store them as
# strings so they are treated as categories by the vectorizer below.
categorical = ['PULocationID', 'DOLocationID']

location_ids_as_text = df[categorical].astype(str)
df[categorical] = location_ids_as_text

In [24]:
# One dict per trip ({'PULocationID': ..., 'DOLocationID': ...}), then fit the
# vectorizer on the training records and encode them as a sparse matrix.
train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer(sparse=True)
x_train = dv.fit_transform(train_dicts)


In [25]:
# Number of features learned by the vectorizer.
len(dv.get_feature_names_out())

515

In [26]:
# Container type and (rows, columns) of the encoded training matrix.
(type(x_train), x_train.shape)

(scipy.sparse._csr.csr_matrix, (3009173, 515))

What's the dimensionality of this matrix (number of columns)?
> 515

In [27]:
# Regression target: trip duration in minutes, as a NumPy array.
target = 'duration'
y_train = df[target].to_numpy()

In [29]:
# Fit a plain linear regression on the encoded pickup / drop-off zones.
# fit() returns the estimator itself, so it can be chained and then displayed.
lr = LinearRegression().fit(x_train, y_train)
lr

In [39]:
# RMSE of the model on the data it was trained on.
y_pred = lr.predict(x_train)
train_rmse = root_mean_squared_error(y_train, y_pred)

round(train_rmse, 4)

7.6493

What's the RMSE on train?
> 7.64 (computed value: 7.6493)

In [31]:
def read_dataframe(filename):
    """Load yellow-taxi trip records and prepare them for modelling.

    Reads a ``.csv`` or ``.parquet`` file, adds a ``duration`` column (trip
    length in minutes), keeps only trips lasting between 1 and 60 minutes
    (inclusive), and casts the pickup / drop-off location IDs to strings.

    Parameters
    ----------
    filename : str
        Path to the trip file; its extension selects the reader.

    Returns
    -------
    pandas.DataFrame
        The filtered records, including the added ``duration`` column.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV stores timestamps as text. Parse them back into the *same*
        # tpep_* columns that the duration computation below reads; item
        # assignment is required, since setting an attribute does not
        # create or replace a column.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Trip duration in minutes, computed for the whole column at once.
    trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the column
    # assignment below does not raise SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Location IDs are labels, not quantities: keep them as strings.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# February 2023 trips, prepared by read_dataframe, serve as the validation set.
df_val = read_dataframe('../data/yellow_tripdata_2023-02.parquet')

In [33]:
# Spot-check three random validation rows (no seed is set, so the rows differ between runs).
df_val.sample(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
1074902,2,2023-02-11 14:35:28,2023-02-11 14:45:04,1.0,1.34,1.0,N,230,43,1,10.0,0.0,0.5,1.0,0.0,1.0,15.0,2.5,0.0,9.6
2908954,2,2023-02-27 07:20:24,2023-02-27 07:30:49,,3.31,,,141,137,0,16.39,0.0,0.5,3.0,0.0,1.0,23.39,,,10.416667
928289,1,2023-02-10 09:49:13,2023-02-10 10:04:43,1.0,1.5,1.0,N,140,236,1,12.1,2.5,0.5,3.2,0.0,1.0,19.3,2.5,0.0,15.5


In [34]:
# Encode the validation trips with the vectorizer fitted on the training data
# (transform only, no re-fit) and pull out the matching target values.
y_val = df_val[target].to_numpy()

val_dicts = df_val[categorical].to_dict(orient='records')
x_val = dv.transform(val_dicts)

In [40]:
# RMSE of the trained model on the February validation set.
y_val_pred = lr.predict(x_val)
val_rmse = root_mean_squared_error(y_val, y_val_pred)

round(val_rmse, 3)

7.812

What's the RMSE on validation?
> 7.81