In [44]:
import pandas as pd 
import numpy as np 

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [62]:
df = pd.read_parquet('data/fhv_tripdata_2021-01.parquet')
df['duration'] = df.dropOff_datetime - df.pickup_datetime
len(df)

1154112

In [63]:
df['duration'] = df['duration'].apply(lambda td:td.total_seconds()/60)
df.describe()

Unnamed: 0,PUlocationID,DOlocationID,duration
count,195845.0,991892.0,1154112.0
mean,139.85969,135.89803,19.16722
std,74.991382,80.474902,398.6922
min,1.0,1.0,0.01666667
25%,75.0,67.0,7.766667
50%,143.0,132.0,13.4
75%,206.0,213.0,22.28333
max,265.0,265.0,423371.0


In [64]:
df.duration.mean()

19.1672240937939

In [65]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

missing_value_df

Unnamed: 0,column_name,percent_missing
dispatching_base_num,dispatching_base_num,0.0
pickup_datetime,pickup_datetime,0.0
dropOff_datetime,dropOff_datetime,0.0
PUlocationID,PUlocationID,83.030676
DOlocationID,DOlocationID,14.055828
SR_Flag,SR_Flag,100.0
Affiliated_base_number,Affiliated_base_number,0.076682
duration,duration,0.0


In [66]:
df = df[(df.duration >= 1) & (df.duration <= 60)]

categorical = ['PUlocationID', 'DOlocationID']
numerical = ['trip_distance']



In [67]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,'percent_missing': percent_missing})
missing_value_df

Unnamed: 0,column_name,percent_missing
dispatching_base_num,dispatching_base_num,0.0
pickup_datetime,pickup_datetime,0.0
dropOff_datetime,dropOff_datetime,0.0
PUlocationID,PUlocationID,83.527328
DOlocationID,DOlocationID,13.327044
SR_Flag,SR_Flag,100.0
Affiliated_base_number,Affiliated_base_number,0.069651
duration,duration,0.0


In [68]:
df["DOlocationID"] = df.DOlocationID.replace(np.nan,-1)

In [69]:
df["PUlocationID"] = df.PUlocationID.replace(np.nan,-1)

In [70]:
len(df[df["DOlocationID"]==-1])

147907

In [71]:
len(df[df["PUlocationID"]==-1])

927008

In [72]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,'percent_missing': percent_missing})
missing_value_df

Unnamed: 0,column_name,percent_missing
dispatching_base_num,dispatching_base_num,0.0
pickup_datetime,pickup_datetime,0.0
dropOff_datetime,dropOff_datetime,0.0
PUlocationID,PUlocationID,0.0
DOlocationID,DOlocationID,0.0
SR_Flag,SR_Flag,100.0
Affiliated_base_number,Affiliated_base_number,0.069651
duration,duration,0.0


In [73]:
df.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number',
       'duration'],
      dtype='object')

In [74]:
df.PUlocationID

0           -1.0
1           -1.0
3           -1.0
4           -1.0
5           -1.0
           ...  
1154107      7.0
1154108     44.0
1154109    171.0
1154110     15.0
1154111     -1.0
Name: PUlocationID, Length: 1109826, dtype: float64

In [85]:
train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

11.415432830521663