In [None]:
!python -V

### Setup env

In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

### First exploration

In [9]:
df = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [4]:
df.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
count,3066766.0,3066766,3066766,2995023.0,3066766.0,2995023.0,3066766.0,3066766.0,3066766.0,3066766.0,3066766.0,3066766.0,3066766.0,3066766.0,3066766.0,3066766.0,2995023.0,2995023.0
mean,1.730215,2023-01-17 00:22:26.288164,2023-01-17 00:38:06.427874,1.362532,3.847342,1.49744,166.398,164.3926,1.194483,18.36707,1.537842,0.48829,3.367941,0.5184907,0.9820847,27.02038,2.274231,0.1074086
min,1.0,2008-12-31 23:01:42,2009-01-01 14:29:11,0.0,0.0,1.0,1.0,1.0,0.0,-900.0,-7.5,-0.5,-96.22,-65.0,-1.0,-751.0,-2.5,-1.25
25%,1.0,2023-01-09 16:21:57.250000,2023-01-09 16:37:06,1.0,1.06,1.0,132.0,114.0,1.0,8.6,0.0,0.5,1.0,0.0,1.0,15.4,2.5,0.0
50%,2.0,2023-01-17 08:42:29.500000,2023-01-17 08:58:30.500000,1.0,1.8,1.0,162.0,162.0,1.0,12.8,1.0,0.5,2.72,0.0,1.0,20.16,2.5,0.0
75%,2.0,2023-01-24 16:26:27,2023-01-24 16:42:49,1.0,3.33,1.0,234.0,234.0,1.0,20.5,2.5,0.5,4.2,0.0,1.0,28.7,2.5,0.0
max,2.0,2023-02-01 00:56:53,2023-02-02 09:28:47,9.0,258928.1,99.0,265.0,265.0,4.0,1160.1,12.5,53.16,380.8,196.99,1.0,1169.4,2.5,1.25
std,0.443848,,,0.89612,249.5838,6.474767,64.24413,69.94368,0.5294762,17.80782,1.789592,0.1034641,3.826759,2.017579,0.1833529,22.16359,0.7718454,0.3556511


In [12]:
df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

df['duration'].describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [14]:
# Check the first few rows of the DataFrame
df_short = df[(df.duration >= 1) & (df.duration <= 60)]

categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

df_short[categorical] = df_short[categorical].astype(str)
df_short['duration'].count()
df_short['duration'].describe()
df_short['duration'].count()/df['duration'].count()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short[categorical] = df_short[categorical].astype(str)


np.float64(0.9812202822125979)

In [None]:
train_dicts = df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

root_mean_squared_error(y_train, y_pred)

In [None]:
sns.histplot(y_pred, label='prediction')
sns.histplot(y_train, label='actual')

plt.legend()

## Main

Creating the import function and cleaning.

In [2]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

## Importing Data

In [3]:
df_train = read_dataframe('./data/yellow_tripdata_2023-01.parquet')

In [4]:
df_val = read_dataframe('./data/yellow_tripdata_2023-02.parquet')

Doublecheck sizes

In [5]:
len(df_train), len(df_val)

(3009173, 2855951)

In [6]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [7]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

: 

In [None]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

root_mean_squared_error(y_val, y_pred)

In [None]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [None]:
lr = Lasso(0.01)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

root_mean_squared_error(y_val, y_pred)