In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [2]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        df.dropOff_datetime = pd.to_datetime(df.dropOff_datetime)
        df.pickup_datetime = pd.to_datetime(df.pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    
    print("#Answer for 1st question --", df.shape)

    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    print("# Answer for 2nd question - ", df.duration.mean())
    
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    print("# Answer for 3rd question - ", 1-(df.shape[0] - df[df.PUlocationID.isna()].shape[0])/df.shape[0])
    
    df['PUlocationID'][df.PUlocationID.isna()] = -1
    df['DOlocationID'][df.DOlocationID.isna()] = -1
    
    return df

In [3]:
df_jan = read_dataframe('./data/fhv_tripdata_2021-01.parquet') #Training Dataset
df_feb = read_dataframe('./data/fhv_tripdata_2021-02.parquet') #Validation Dataset

#Answer for 1st question -- (1154112, 7)
# Answer for 2nd question -  19.1672240937939
# Answer for 3rd question -  0.8352732770722617
#Answer for 1st question -- (1037692, 7)
# Answer for 2nd question -  20.70698622520125
# Answer for 3rd question -  0.8571354986754037


In [4]:
df_jan.head(20)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,-1.0,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,-1.0,-1.0,,B00009,17.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,-1.0,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,-1.0,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,-1.0,71.0,,B00037,9.05
6,B00037,2021-01-01 00:18:12,2021-01-01 00:30:04,-1.0,91.0,,B00037,11.866667
7,B00037,2021-01-01 00:36:15,2021-01-01 00:45:08,-1.0,39.0,,B00037,8.883333
8,B00037,2021-01-01 00:55:04,2021-01-01 01:13:02,-1.0,37.0,,B00037,17.966667
9,B00037,2021-01-01 00:48:40,2021-01-01 01:12:02,-1.0,39.0,,B00037,23.366667
10,B00037,2021-01-01 00:27:23,2021-01-01 00:29:25,-1.0,72.0,,B00037,2.033333


In [5]:
categorical = ['PUlocationID', 'DOlocationID']
numerical = []
df_jan[categorical] = df_jan[categorical].astype(str)

dv = DictVectorizer()

train_dicts = df_jan[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
print("# Answer for 4th question - ", X_train.shape)

val_dicts = df_feb[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Answer for 4th question -  (1109826, 525)


In [6]:
target = 'duration'
y_train = df_jan[target].values
y_val = df_feb[target].values

In [7]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False) # Answer for 5th Question

10.528519107206316

In [8]:
y_val_pred = lr.predict(X_val)

mean_squared_error(y_val, y_val_pred, squared=False) # Answer for 6th Question

12.855087041305907