In [33]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Functions

In [47]:
def read_data(filename):
    df = pd.read_parquet(filename)
    df['duration']=(df['dropOff_datetime']-df['pickup_datetime']).apply(lambda x : x.total_seconds()/60)
    df=df[(df['duration']>1) & (df['duration']<60)]    
   
    categorical = ['PUlocationID','DOlocationID']
    for category in categorical:
        df[category]=df[category].fillna(-1).astype(str)
    return df

### Reading data

In [48]:
df_train=read_data("../data/fhv_tripdata_2021-01.parquet")
df_val=read_data("../data/fhv_tripdata_2021-02.parquet")

### EDA (Raw data)

In [50]:
df = pd.read_parquet("../data/fhv_tripdata_2021-01.parquet")

In [51]:
df.shape

(1154112, 7)

There is 1154112 rows

In [4]:
df.head(5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [5]:
df.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
dtype: object

In [53]:
df['duration']=(df['dropOff_datetime']-df['pickup_datetime']).apply(lambda x : x.total_seconds()/60)

In [54]:
df['duration']

0           17.000000
1           17.000000
2          110.000000
3            8.283333
4           15.216667
              ...    
1154107      8.750000
1154108     57.600000
1154109     16.200000
1154110     19.433333
1154111     36.000000
Name: duration, Length: 1154112, dtype: float64

In [8]:
df['duration'].mean()

19.167224093791006

The mean duration is about 19.17 minutes

In [9]:
#Takes too long to compute
#sns.displot(df,x='duration')

In [10]:
df['duration'].describe(percentiles=[0.95,0.98,0.99])

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
50%      1.340000e+01
95%      4.725000e+01
98%      6.613333e+01
99%      9.030000e+01
max      4.233710e+05
Name: duration, dtype: float64

In [11]:
df=df[(df['duration']>1) & (df['duration']<60)]

In [12]:
print(f'we started with 1154112 rows and now we have {df.shape[0]}')
print(f'{1154112-df.shape[0]} were dropped')

we started with 1154112 rows and now we have 1106890
47222 were dropped


In [13]:
df_pickup=df['PUlocationID']

In [14]:
df_pickup.isna().sum()/len(df_pickup)

0.8363125513827029

83% of the variable is missing data. Let's replace those missing values by -1

In [15]:
#categorical = ['PUlocationID','DOlocationID']
#for category in categorical:
#    df[category]=df[category].fillna(-1).astype(str)

In [56]:
categorical = ['PUlocationID','DOlocationID']
train_dict=df_train[categorical].to_dict(orient='records')

In [57]:
train_dict

[{'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '71.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '91.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '37.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '89.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '177.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '225.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '63.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '67.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '22.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '14.0'},
 {'PUlocationID': '-1.0', 'DO

In [58]:
dv=DictVectorizer()
X_train=dv.fit_transform(train_dict)

X has 525 features

### Train and eval on same set (train)

In [73]:
y_train = df_train['duration'].values

In [76]:
lr = LinearRegression()
lr.fit(X_train,y_train)

In [77]:
y_pred_train=lr.predict(X_train)

In [78]:
mean_squared_error(y_pred_train,y_train,squared=False)

10.387902992843289

In [79]:
print(f'RMSE Score : {mean_squared_error(y_pred,y_train,squared=False)}')

RMSE Score 10.387902992843289


### Eval on validation set

In [69]:
val_dict=df_val[categorical].to_dict(orient='records')
X_val=dv.transform(val_dict)
y_pred_val=lr.predict(X_val)

In [70]:
y_val = df_val['duration'].values
mean_squared_error(y_pred_val,y_val,squared=False)

10.89704217961414

In [80]:
print(f'RMSE Score : {mean_squared_error(y_pred_val,y_val,squared=False)}')

RMSE Score : 10.89704217961414
