# Downloading the Data

In [89]:
import pandas as pd
import re
import requests

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from typing import List

In [90]:
dataset_uris = ["https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet",
                "https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet"]

In [91]:
filenames = []
def download_files(file_list: List) -> None:
    for url in file_list:
        filename = "data/" + url.split("/")[-1]
        filenames.append(filename)
        body = requests.get(url)
        with open(filename, 'wb') as file:
            file.write(body.content)

download_files(dataset_uris)

# Read Datasets

In [92]:
pd.set_option("display.precision", 2)

In [93]:
jan = pd.read_parquet(filenames[0], columns=["pickup_datetime", "dropOff_datetime", "PUlocationID", "DOlocationID"])
feb = pd.read_parquet(filenames[-1], columns=["pickup_datetime", "dropOff_datetime", "PUlocationID", "DOlocationID"])

# QUESTION 1

In [94]:
print(f"There are {jan.shape[0]} records on January for this dataset")

There are 1154112 records on January for this dataset


# QUESTION 2

In [95]:
jan["duration"] = jan["dropOff_datetime"] - jan["pickup_datetime"]

In [96]:
feb["duration"] = feb["dropOff_datetime"] - feb["pickup_datetime"]

In [97]:
print(f'The mean duration for January is {jan["duration"].mean().total_seconds()/60}')

The mean duration for January is 19.167224083333334


In [98]:
jan["duration"] = jan["duration"].apply(lambda x: x.total_seconds()/60)

In [99]:
feb["duration"] = feb["duration"].apply(lambda x: x.total_seconds()/60)

In [100]:
jan.drop(columns=["pickup_datetime", "dropOff_datetime"], inplace=True)
feb.drop(columns=["pickup_datetime", "dropOff_datetime"], inplace=True)

# QUESTION 3

In [101]:
query_jan = (jan["duration"] >= 1) & (jan["duration"] <= 60)
query_feb = (feb["duration"] >= 1) & (feb["duration"] <= 60)

In [102]:
jan = jan[query_jan]
feb = feb[query_feb]

In [103]:
jan.shape

(1109826, 3)

In [104]:
print(f"I drop {1154112-jan.shape[0]} records")

I drop 44286 records


In [105]:
jan.columns

Index(['PUlocationID', 'DOlocationID', 'duration'], dtype='object')

In [106]:
jan.isnull().sum()
feb.isnull().sum()

PUlocationID    848661
DOlocationID    134760
duration             0
dtype: int64

In [107]:
print(f'{(jan["PUlocationID"].isnull().sum() / jan.shape[0])*100:.2f}% of missing values')

83.53% of missing values


In [108]:
jan.fillna("-1", inplace=True)
feb.fillna("-1", inplace=True)

In [109]:
print(f'{(jan[jan["PUlocationID"] == "-1"]["PUlocationID"].count() / jan.shape[0])*100:.2f}% of missing values')

83.53% of missing values


# QUESTION 4 - OHE

In [110]:
categorical = ["PUlocationID", "DOlocationID"]

In [111]:
jan[categorical] = jan[categorical].astype("str")

In [112]:
jan.dtypes

PUlocationID     object
DOlocationID     object
duration        float64
dtype: object

In [113]:
train_dicts = jan[categorical].to_dict(orient="records")

In [114]:
dv = DictVectorizer()

In [115]:
# fit_transform to train data
X_train = dv.fit_transform(train_dicts)

In [116]:
print(f"There are {len(dv.get_feature_names())} feature matrix")

There are 525 feature matrix


# QUESTION 5

In [117]:
y_train = jan["duration"]

In [118]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

10.5285191072072

# QUESTION 6

In [119]:
feb[categorical] = feb[categorical].astype("str")
val_dicts = feb[categorical].to_dict(orient='records')
# transform to validate data
X_val = dv.transform(val_dicts)

target = 'duration'
y_val = feb[target].values

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

11.014283163400654