In [None]:
import pandas as pd
import numpy as np

In [None]:
import seaborn as sns

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Read the 2023-01 yellow trip data file into a DataFrame.
df_jan = pd.read_parquet(path="data/yellow_tripdata_2023-01.parquet")

In [None]:
# Show the column labels of the January frame.
df_jan.columns

In [None]:
# Count the columns of the January frame and print the count under its label.
num_columns = df_jan.shape[1]
print("Number of columns: ", num_columns, sep="\n")

### Number of columns: 19

In [None]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp.
# The `.dt` accessor converts the whole timedelta column in one vectorized
# call instead of invoking a Python lambda once per row.
df_jan['duration'] = (
    df_jan['tpep_dropoff_datetime'] - df_jan['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the trip duration in minutes.
df_jan['duration'].describe()

### Standard deviation of trip duration (minutes): 42.59

In [None]:
# Box plot of the duration column, to eyeball outliers before filtering.
# NOTE(review): the Series is passed positionally; seaborn appears to have changed
# what the first positional argument means across releases (x vs. data) — consider
# an explicit keyword (e.g. x=...) after confirming against the installed version.
sns.boxplot(df_jan['duration'])

In [None]:
# Percentage of trips whose duration lies within 1 to 60 minutes (both bounds included).
no_outlier = df_jan[df_jan['duration'].between(1, 60)]
len_no_outlier = no_outlier.shape[0]
len_with_outlier = df_jan.shape[0]
frac_left = len_no_outlier / len_with_outlier * 100.0
print(frac_left)

### Percentage of trips left after removing outliers: 98.12%

In [None]:
# Keep only trips lasting 1 to 60 minutes (both bounds included).
# The explicit .copy() makes the result an independent frame, so assigning
# columns to it afterwards does not raise pandas' SettingWithCopyWarning.
df_jan = df_jan[(df_jan['duration']>=1) & (df_jan['duration']<=60)].copy()

In [None]:
# Location ID columns that are used as categorical model features.
categorical = [
    'PULocationID',
    'DOLocationID',
]

In [None]:
# Cast the location IDs to strings so that DictVectorizer treats them as
# categories (one indicator column per distinct value) rather than as numbers.
df_jan[categorical] = df_jan[categorical].astype(str)

In [None]:
# One {column: value} dict per trip, restricted to the categorical columns.
jan_dicts = df_jan.loc[:, categorical].to_dict('records')

In [None]:
# Vectorizer that turns the per-trip dicts into a feature matrix; sparse=True requests sparse output.
dv = DictVectorizer(sparse=True)

In [None]:
# Learn the feature-name -> column mapping from the January records only.
dv.fit(jan_dicts)

In [None]:
# Encode the January records with the vectorizer fitted on them.
X_jan = dv.transform(jan_dicts)

In [None]:
# (rows, columns) of the January feature matrix.
X_jan.shape

### Number of columns of the feature matrix: 515

In [None]:
# Regression target: January trip durations as a standalone NumPy array.
Y_jan = df_jan['duration'].to_numpy(copy=True)

In [None]:
# Fit a linear regression of trip duration on the vectorized January features.
linear = LinearRegression()
linear.fit(X=X_jan, y=Y_jan)

In [None]:
# Root-mean-squared error on the data the model was fitted to.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(Y_jan, linear.predict(X_jan)))
print(rmse)

### RMSE on the training dataset (January): 7.64

In [None]:
def preprocess(df):
    """Build model-ready trip records from a raw trip DataFrame.

    Computes each trip's duration in minutes, keeps only trips lasting
    between 1 and 60 minutes (both bounds included) and casts the location
    ID columns to strings.

    The input frame is not modified; all changes are made on a filtered copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the datetime columns ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` plus ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    dicts : list of dict
        One ``{'PULocationID': str, 'DOLocationID': str}`` dict per kept trip.
    df : pandas.DataFrame
        The filtered copy, with a ``duration`` column (minutes) and
        string-typed location ID columns.
    """
    categorical = ['PULocationID', 'DOLocationID']

    # Vectorized timedelta -> minutes conversion (no per-row Python lambda).
    duration = (
        df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    ).dt.total_seconds() / 60
    keep = duration.between(1, 60)

    # Work on an independent copy: the caller's frame stays untouched and the
    # column assignments below do not trigger SettingWithCopyWarning.
    df = df[keep].copy()
    # Positional assignment; the values are already in the filtered row order.
    df['duration'] = duration[keep].to_numpy()
    df[categorical] = df[categorical].astype(str)

    dicts = df[categorical].to_dict(orient='records')
    return dicts, df

In [None]:
# Read the 2023-02 yellow trip data file into a DataFrame.
df_feb = pd.read_parquet(path="data/yellow_tripdata_2023-02.parquet")

In [None]:
# Run the duration / filtering / string-cast steps on February; df_feb is rebound to the filtered frame.
feb_dicts, df_feb = preprocess(df_feb)

In [None]:
# Transform only: reuses the vectorizer fitted on the January records (dv is not
# refit), so February is encoded with the same feature columns as January.
X_feb = dv.transform(feb_dicts)

In [None]:
# Validation target: February trip durations as a standalone NumPy array.
Y_feb = df_feb['duration'].to_numpy(copy=True)

In [None]:
# Root-mean-squared error of the January-fitted model on the February data.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(Y_feb, linear.predict(X_feb)))
print(rmse)

### RMSE on the validation dataset (February): 7.81