In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_parquet("yellow_tripdata_2023-01.parquet", engine="pyarrow")
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [3]:
# checking the data types of the column
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [4]:
# checking the shape of the data
df.shape

(3066766, 19)

In [5]:
# computing the duration of the ride in minutes
df["duration"] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime

In [6]:
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [7]:
# Checking the simple statistics for the duration column
df.duration.describe()

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64

In [8]:
# Keeping the rides that has duration between 1 and 60 minutes
data = df[(df.duration >= 1) & (df.duration <= 60)]

In [9]:
# About 98% of the rides falls between 1 and 60 minutes
len(data)/len(df) * 100

98.1220282212598

In [10]:
# picking the columns to use for modelling
columns = ["PULocationID", "DOLocationID"]
data[columns] = data[columns].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[columns] = data[columns].astype(str)


In [11]:
data.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                     object
DOLocationID                     object
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
duration                        float64
dtype: object

In [12]:
df_train = data[columns]

In [13]:
dv = DictVectorizer()
train_dicts = df_train.to_dict(orient="records")

In [14]:
X_train = dv.fit_transform(train_dicts)

In [15]:
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

In [16]:
target = "duration"
y_train = data[target].values

In [17]:
y_train

array([ 8.43333333,  6.31666667, 12.75      , ..., 24.51666667,
       13.        , 14.4       ])

In [18]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [19]:
y_pred = lr.predict(X_train)

In [20]:
y_pred

array([11.52727631, 10.89774082, 11.32573217, ..., 11.73764853,
       12.70527512, 11.54225863])

In [21]:
mean_squared_error(y_train, y_pred, squared=False)

7.649261027792376

In [22]:
# Loading validation data
val_data = pd.read_parquet("yellow_tripdata_2023-02.parquet")
val_data["duration"] = val_data.tpep_dropoff_datetime - val_data.tpep_pickup_datetime
val_data.duration = val_data.duration.apply(lambda td: td.total_seconds() / 60)
val_data= val_data[(val_data.duration >= 1) & (val_data.duration <= 60)]

In [23]:
# picking the columns to use for modelling
columns = ["PULocationID", "DOLocationID"]
val_data[columns] = val_data[columns].astype(str)

In [24]:
val_data[columns]

Unnamed: 0,PULocationID,DOLocationID
0,142,163
3,132,26
4,161,145
5,148,236
6,137,244
...,...,...
2913950,249,140
2913951,186,79
2913952,158,143
2913953,79,162


In [25]:
val_dicts = val_data[columns].to_dict(orient="records")

In [29]:
X_val = dv.transform(val_dicts)

In [31]:
X_val

<2855951x514 sparse matrix of type '<class 'numpy.float64'>'
	with 5711902 stored elements in Compressed Sparse Row format>

In [27]:
y_val = val_data[target].values

In [30]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)
mean_squared_error(y_val, y_pred, squared=False)

ValueError: X has 514 features, but LinearRegression is expecting 515 features as input.

In [5]:
# Modularizing the codes
def read_dataframe(filename):
    df = pd.read_parquet(filename, engine="pyarrow")
    
#     df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
#     df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)

    # computing the duration of the ride in minutes
    df["duration"] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    categorical = ["PULocationID", "DOLocationID"]
    
    
    df[categorical] = df[categorical].astype(str)
    return df

In [6]:
df_train = read_dataframe("yellow_tripdata_2023-01.parquet")
df_val = read_dataframe("yellow_tripdata_2023-02.parquet")

In [7]:
len(df_train), len(df_val)

(3009173, 2855951)

In [8]:
df_val

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.30,1.0,N,142,163,2,4.40,3.50,0.5,0.00,0.0,1.0,9.40,2.5,0.00,1.683333
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.80,1.0,N,132,26,1,70.90,2.25,0.5,0.00,0.0,1.0,74.65,0.0,1.25,32.083333
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.00,1.00,0.5,3.30,0.0,1.0,25.30,2.5,0.00,13.300000
5,1,2023-02-01 00:52:40,2023-02-01 01:07:18,1.0,5.10,1.0,N,148,236,1,21.90,3.50,0.5,5.35,0.0,1.0,32.25,2.5,0.00,14.633333
6,1,2023-02-01 00:12:39,2023-02-01 00:40:36,1.0,8.90,1.0,N,137,244,1,41.50,3.50,0.5,3.50,0.0,1.0,50.00,2.5,0.00,27.950000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2913950,2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,20.22,0.00,0.5,4.84,0.0,1.0,29.06,,,19.000000
2913951,2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,13.66,0.00,0.5,2.65,0.0,1.0,20.31,,,11.133333
2913952,2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,17.64,0.00,0.5,0.00,0.0,1.0,21.64,,,14.000000
2913953,2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,13.56,0.00,0.5,2.63,0.0,1.0,20.19,,,7.000000


In [10]:
# picking the columns to use for modelling
columns = ["PULocationID", "DOLocationID"]

In [11]:
df_train_1 = df_train[columns]
df_val_1 = df_val[columns]

In [12]:
dv = DictVectorizer()
train_dicts = df_train_1.to_dict(orient="records")
val_dicts = df_val_1.to_dict(orient="records")

In [13]:
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [14]:
target = "duration"
y_train = df_train[target].values
y_val = df_val[target].values

In [17]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

7.811832836304415