In [1]:
import pandas as pd

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer

In [4]:
# Show the installed pandas version so the run can be reproduced later.
pd.__version__

'1.4.2'

In [5]:
!pip install pyarrow



In [6]:
# Yellow-taxi trip records for 2023-01; this frame is used to fit the model.
df = pd.read_parquet(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
)

In [7]:
# Peek at the last five rows of the raw frame.
df.tail(5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
3066761,2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,15.8,0.0,0.5,3.96,0.0,1.0,23.76,,
3066762,2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.8,,,112,75,0,22.43,0.0,0.5,2.64,0.0,1.0,29.07,,
3066763,2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,17.61,0.0,0.5,5.32,0.0,1.0,26.93,,
3066764,2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,18.15,0.0,0.5,4.43,0.0,1.0,26.58,,
3066765,2,2023-01-31 23:07:32,2023-01-31 23:21:56,,2.85,,,262,143,0,15.97,0.0,0.5,2.0,0.0,1.0,21.97,,


How many columns are there?

In [8]:
# Second entry of .shape is the number of columns.
print(df.shape[1])

19


Standard deviation of the trip duration in January

In [9]:
# Parse both timestamp columns; the resulting Series keep their notebook-level names.
tpep_pickup_datetime, tpep_dropoff_datetime = (
    pd.to_datetime(df[column])
    for column in ('tpep_pickup_datetime', 'tpep_dropoff_datetime')
)

# Trip duration as a timedelta: drop-off minus pick-up.
df['duration'] = tpep_dropoff_datetime.sub(tpep_pickup_datetime)

In [10]:
# Transform duration to minutes.
# The .dt accessor converts the whole column at once instead of calling a
# Python lambda per row. The dtype guard makes the cell safe to re-run: once
# the column already holds float minutes there is nothing left to convert.
if pd.api.types.is_timedelta64_dtype(df['duration']):
    df['duration'] = df['duration'].dt.total_seconds() / 60

# Standard deviation of the trip duration, in minutes.
df['duration'].std()

42.594351241920904

Drop the outliers: keep only trips whose duration is between 1 and 60 minutes (inclusive).

In [11]:
print(f"df len before: {len(df)}")
# Series.between is inclusive on both ends by default, i.e. >= 1 and <= 60.
# .copy() detaches the kept rows from the original frame so that later column
# assignments act on an independent DataFrame rather than on a slice.
df = df[df.duration.between(1, 60)].copy()
print(f"df len after: {len(df)}")

df len before: 3066766
df len after: 3009173


In [12]:
# Fraction of trips kept by the duration filter (row counts taken from the output above).
3_009_173 / 3_066_766

0.9812202822125979

One-hot encoding

In [13]:
# Feature columns: location IDs are one-hot encoded, trip distance stays numeric.
categorical = ["PULocationID", "DOLocationID"]
numerical = ["trip_distance"]

In [14]:
# Cast the ID columns to str so DictVectorizer one-hot encodes them
# instead of treating them as numeric features.
df = df.assign(**{column: df[column].astype(str) for column in categorical})

In [15]:
# Check the column dtypes after casting the categorical columns to str.
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                     object
DOLocationID                     object
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
duration                        float64
dtype: object

In [16]:
# Vectorizer that turns feature dicts into a matrix; it is fitted on the training rows below.
dv = DictVectorizer()

In [17]:
# One dict per row for DictVectorizer.
# 'records' is the documented orient. The short form 'record' emits a
# FutureWarning in this pandas version and is rejected by pandas >= 2.0.
train_dicts = df[categorical + numerical].to_dict(orient='records')

  train_dicts = df[categorical + numerical].to_dict(orient='record')


In [18]:
# Fit the vectorizer on the training dicts and encode them in one pass.
X_train = dv.fit_transform(train_dicts)

In [19]:
# Display the matrix summary (shape, dtype, number of stored elements).
X_train

<3009173x516 sparse matrix of type '<class 'numpy.float64'>'
	with 9027519 stored elements in Compressed Sparse Row format>

In [20]:
# Regression target: trip duration in minutes, as a NumPy array.
target = "duration"
y_train = df[target].to_numpy()

Training a model

In [21]:
from sklearn.linear_model import LinearRegression

In [22]:
# Linear regression fitted on the encoded training features.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [23]:
# Predictions on the training rows, used for the training RMSE.
y_pred = lr.predict(X_train)

In [24]:
# Disabled: overlay of the predicted and actual duration distributions on the training data.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot / kdeplot are the replacements); confirm before re-enabling.
# sns.distplot(y_pred, label='prediction')
# sns.distplot(y_train, label='actual')
# plt.legend()

In [25]:
from sklearn.metrics import mean_squared_error

# Training RMSE. Taking the square root of the MSE explicitly avoids the
# `squared=False` flag, which newer scikit-learn releases deprecated and
# later removed, so the cell works on both old and new versions.
rms = mean_squared_error(y_train, y_pred) ** 0.5

In [26]:
# RMSE on the training data; same unit as `duration` (minutes).
print(rms)

7.649140464167203


Evaluating the model

In [27]:
def load_trips(url, min_minutes=1, max_minutes=60):
    """Read one month of trip records and prepare it for the model.

    Applies the same steps used for the training frame: adds a ``duration``
    column (drop-off minus pick-up, in minutes), keeps only trips whose
    duration lies in ``[min_minutes, max_minutes]`` (inclusive) and casts
    the columns listed in the notebook-level ``categorical`` list to ``str``.

    Parameters
    ----------
    url : str
        Location of the parquet file, passed to ``pd.read_parquet``.
    min_minutes, max_minutes : int or float
        Inclusive bounds of the durations to keep.

    Returns
    -------
    pandas.DataFrame
        An independent (copied) frame with the filtered, prepared rows.
    """
    trips = pd.read_parquet(url)

    # Local names, so the training-time timestamp Series are not overwritten.
    pickup = pd.to_datetime(trips.tpep_pickup_datetime)
    dropoff = pd.to_datetime(trips.tpep_dropoff_datetime)

    # Vectorised conversion to minutes instead of a per-row Python lambda.
    trips['duration'] = (dropoff - pickup).dt.total_seconds() / 60

    # .copy() so the assignment below writes to an independent frame, not a slice.
    trips = trips[trips.duration.between(min_minutes, max_minutes)].copy()

    trips[categorical] = trips[categorical].astype(str)
    return trips


df_val = load_trips('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

# 'records' is the documented orient. The short form 'record' emits a
# FutureWarning in this pandas version and is rejected by pandas >= 2.0.
train_dicts_val = df_val[categorical + numerical].to_dict(orient='records')

  train_dicts_val = df_val[categorical + numerical].to_dict(orient='record')


In [28]:
 # Preview the validation feature columns that are fed to the vectorizer.
 df_val.loc[:, categorical + numerical]

Unnamed: 0,PULocationID,DOLocationID,trip_distance
0,142,163,0.30
3,132,26,18.80
4,161,145,3.22
5,148,236,5.10
6,137,244,8.90
...,...,...,...
2913950,249,140,4.65
2913951,186,79,2.47
2913952,158,143,3.49
2913953,79,162,2.13


In [30]:
# Encode the validation rows with the already-fitted vectorizer (transform
# only, no refit) so the feature columns match the ones `lr` was trained on.
X_val = dv.transform(train_dicts_val)
y_val = df_val[target].values

y_pred_val = lr.predict(X_val)

# Disabled: overlay of the predicted and actual duration distributions on the validation data.
# sns.distplot(y_pred_val, label='prediction')
# sns.distplot(y_val, label='actual')
# plt.legend()

# Validation RMSE. Taking the square root of the MSE explicitly avoids the
# `squared=False` flag, which newer scikit-learn releases deprecated and
# later removed.
rms_val = mean_squared_error(y_val, y_pred_val) ** 0.5
print(rms_val)

7.811462911207304
