In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
jan_df = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")

In [3]:
# Number of columns in january data
jan_df.shape[1]

19

In [4]:
jan_df["duration"] = (jan_df["tpep_dropoff_datetime"] - jan_df["tpep_pickup_datetime"]).dt.total_seconds()/60

In [5]:
# Standard deviation of duration 
jan_df["duration"].std()

np.float64(42.59435124195458)

In [6]:
# Keeping only the records between 1 and 60 duration minutes (inclusive)
jan_df2 = jan_df[(jan_df["duration"] >= 1) & (jan_df["duration"] <= 60)]

In [7]:
# Fraction of records left
len(jan_df2) / len(jan_df)

0.9812202822125979

In [8]:
df = jan_df2[["PULocationID", "DOLocationID", "duration"]].sample(100000)

In [10]:
X = df.drop("duration", axis = 1)

In [11]:
X = X.astype(str)

In [12]:
X_dict = X.to_dict(orient = 'records')

In [13]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse = False, )

In [14]:
dv.fit(X_dict)

In [15]:
X_dict_transformed = dv.transform(X_dict)

In [16]:
X_dict_transformed

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(100000, 461))

In [17]:
# Number of columns will be 515 in the transformed dataset
X_dict_transformed.shape

(100000, 461)

In [18]:
y = df["duration"].values

In [19]:
from sklearn.linear_model import LinearRegression

In [20]:
lr = LinearRegression()

In [21]:
lr.fit(X_dict_transformed, y)

In [22]:
from sklearn.metrics import mean_squared_error
import math
math.sqrt(mean_squared_error(y, lr.predict(X_dict_transformed)))

7.592056182801464

In [23]:
feb_path = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

In [24]:
def process_data(path):
    df = pd.read_parquet("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet")
    df["duration"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds()/60

    df2 = df[(df["duration"] >= 1) & (df["duration"] <= 60)].reset_index(drop = True)

    return df2

In [29]:
def get_features(df):
    df2 = df[["PULocationID", "DOLocationID", "duration"]].sample(100000)

    X = df2.drop("duration", axis = 1)

    X = X.astype(str)

    X_dict = X.to_dict(orient = 'records')
    X_dict_transformed = dv.transform(X_dict)

    return X_dict_transformed, df2["duration"].values

In [26]:
feb_data = process_data(feb_path)

In [27]:
feb_data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,8.433333
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,6.316667
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,12.75
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,9.616667
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,10.833333


In [30]:
X_oot, y_oot = get_features(feb_data)

In [31]:
math.sqrt(mean_squared_error(y_oot, lr.predict(X_oot)))

7.744447429301015