In [1]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read Jan and Feb as train/val

In [2]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    return df

In [3]:
df_train = read_dataframe('./data/fhv_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/fhv_tripdata_2021-02.parquet')

# Q1

In [4]:
df_train.shape

(1154112, 8)

# Q2

In [5]:
df_train['duration'].mean()

19.167224093791006

# Data Prep

In [6]:
df_train_filtered = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)]
df_val_filtered = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]

In [7]:
print(f"dropped {df_train.shape[0] - df_train_filtered.shape[0]} records limiting to 1-60s")

dropped 44286 records limiting to 1-60s


# Q3

In [8]:
categorical = ['PUlocationID','DOlocationID']

df_train_filled = df_train_filtered.copy()
df_train_filled[categorical] = df_train_filled[categorical].fillna(-1)

df_val_filled = df_val_filtered.copy()
df_val_filled[categorical] = df_val_filled[categorical].fillna(-1)


In [9]:
df_train_filled['PUlocationID'].value_counts(normalize=True).round(4).loc[-1]

0.8353

# Q4

In [10]:
df_train_filled[categorical] = df_train_filled[categorical].astype(str)
df_val_filled[categorical] = df_val_filled[categorical].astype(str)

In [11]:
dv = DictVectorizer(sparse=False)

train_dicts = df_train_filled[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val_filled[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [12]:
len(dv.get_feature_names())



525

# Q5

In [22]:
target = 'duration'
y_train = df_train_filled[target].values
y_val = df_val_filled[target].values

In [14]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [15]:
pred_train = lr.predict(X_train)
mean_squared_error(y_train, pred_train, squared=False)

10.528562839714741

# Q6

In [16]:

pred_val = lr.predict(X_val)
mean_squared_error(y_val, pred_val, squared=False)

14108252.251720905

#### hmm something has gone wrong here 

In [17]:
pred_val[pred_val < 0]

array([-8.10503640e+09, -8.10503639e+09, -8.10503639e+09])

### it's predicted a duration less than 0 (and a lot less) for a few predictions, use 0 for these values to get stats

In [23]:
pred_val[pred_val < 0] = 0
mean_squared_error(y_val, pred_val, squared=False)

11.014329017434141