In [1]:
!pip install pyarrow
!pip install --upgrade scikit-learn



In [64]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# q1: 1154112
# q2: 19:10
# q3: 83%
# q4: 525
# q5: 10.53 (train RMSE)

In [65]:
# Locations of the monthly FHV trip files, keyed by a short month name.
filepaths = dict(
    jan="./data/fhv_tripdata_2021-01.parquet",
    feb="./data/fhv_tripdata_2021-02.parquet",
)

In [66]:
# Read the January trip file into a DataFrame and report its (rows, columns) shape.
jan_data = pd.read_parquet(filepaths["jan"])
print(jan_data.shape)

(1154112, 7)


In [67]:
# List the column names of the January frame.
jan_data.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number'],
      dtype='object')

In [68]:
# Trip duration in minutes: drop-off minus pickup, converted from seconds.
jan_data["duration"] = (
    jan_data["dropOff_datetime"] - jan_data["pickup_datetime"]
).dt.total_seconds() / 60

In [69]:
# Mean trip duration in minutes over all January trips (no filtering applied yet).
jan_data["duration"].mean()

19.1672240937939

In [70]:
# Share of January trips that have no pickup location recorded.
pu_count_na = jan_data["PUlocationID"].isna().sum()
print("pu na fraction = ", pu_count_na / len(jan_data))

pu na fraction =  0.8303067639882438


In [71]:
def prepare_data(data_df: pd.DataFrame, min_duration: float = 1, max_duration: float = 60):
    """Turn a trips frame into model inputs.

    Keeps only trips whose ``duration`` lies in ``[min_duration, max_duration]``
    (both ends inclusive), replaces missing pickup/drop-off location IDs with
    ``-1`` and renders every ID as an integer string so it can be treated as a
    categorical value.

    Args:
        data_df: Frame with ``duration``, ``PUlocationID`` and ``DOlocationID``
            columns. It is not modified.
        min_duration: Smallest duration to keep (default 1).
        max_duration: Largest duration to keep (default 60).

    Returns:
        ``(features, y_data)``: a list with one
        ``{"PUlocationID": str, "DOlocationID": str}`` dict per kept trip, and a
        NumPy array holding the matching ``duration`` values in the same order.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    location_cols = ["PUlocationID", "DOlocationID"]
    in_range = data_df["duration"].between(min_duration, max_duration)
    trips = data_df.loc[in_range, location_cols + ["duration"]].copy()
    for col in location_cols:
        # Fill with a numeric sentinel so the column stays numeric until the
        # final cast; IDs then render as "72" rather than "72.0".
        trips[col] = trips[col].fillna(-1).astype(int).astype(str)
    features = trips[location_cols].to_dict(orient="records")
    y_data = trips["duration"].to_numpy()
    return features, y_data

In [72]:
# Build the training records and target durations from the January trips.
features, y_data = prepare_data(jan_data)

In [73]:
# Encode the location-ID strings as a sparse matrix; the vectorizer is fitted
# here on the January records and reused for February further down.
vectorizer = DictVectorizer(sparse=True)
x_data = vectorizer.fit_transform(features)

In [74]:
# Sanity check: matrix/target shapes plus the first ten records and durations.
print(x_data.shape)
print(y_data.shape)
print(features[:10])
print(y_data[:10])

(1109826, 525)
(1109826,)
[{'PUlocationID': '-1', 'DOlocationID': '-1'}, {'PUlocationID': '-1', 'DOlocationID': '-1'}, {'PUlocationID': '-1', 'DOlocationID': '72'}, {'PUlocationID': '-1', 'DOlocationID': '61'}, {'PUlocationID': '-1', 'DOlocationID': '71'}, {'PUlocationID': '-1', 'DOlocationID': '91'}, {'PUlocationID': '-1', 'DOlocationID': '39'}, {'PUlocationID': '-1', 'DOlocationID': '37'}, {'PUlocationID': '-1', 'DOlocationID': '39'}, {'PUlocationID': '-1', 'DOlocationID': '72'}]
[17.         17.          8.28333333 15.21666667  9.05       11.86666667
  8.88333333 17.96666667 23.36666667  2.03333333]


In [75]:
# Fit a linear regression on the encoded January data.
reg = LinearRegression().fit(x_data, y_data)

In [77]:
# Training error: predict on the same January data the model was fitted on.
y_train_pred = reg.predict(x_data)
# Take the square root of the MSE explicitly: the `squared=False` argument is
# deprecated since scikit-learn 1.4 and removed in 1.6, and the first cell
# upgrades scikit-learn to the latest release.
print("RMSE train =", mean_squared_error(y_data, y_train_pred) ** 0.5)

RMSE train = 10.52851910722048


In [78]:
# February trips, with the duration-in-minutes column derived from the
# drop-off and pickup timestamps.
feb_data = pd.read_parquet(filepaths["feb"])
feb_data["duration"] = (
    feb_data["dropOff_datetime"] - feb_data["pickup_datetime"]
).dt.total_seconds() / 60

In [79]:
# Build the validation records and target durations from the February trips.
val_features, y_val_data = prepare_data(feb_data)

In [80]:
# Encode with the already-fitted vectorizer (transform only, no refit).
x_val_data = vectorizer.transform(val_features)

In [81]:
# Sanity check: shapes of the validation matrix and target.
print(x_val_data.shape)
print(y_val_data.shape)

(990113, 525)
(990113,)


In [82]:
# Validation error: score the January-fitted model on the February data.
y_val_pred = reg.predict(x_val_data)
# Labelled "val" because this is the February score, not the training one.
# The root is taken explicitly: the `squared=False` argument is deprecated since
# scikit-learn 1.4 and removed in 1.6.
print("RMSE val =", mean_squared_error(y_val_data, y_val_pred) ** 0.5)

RMSE train = 11.01428322486305
