In [None]:
# import libraries
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Load the January and February 2022 yellow-taxi trip files from ./data/.
df_jan = pd.read_parquet("./data/yellow_tripdata_2022-01.parquet")
df_feb = pd.read_parquet("./data/yellow_tripdata_2022-02.parquet")

# Stack both months into one frame; ignore_index gives the result a fresh
# 0..n-1 index instead of repeating each month's own row labels.
dfs = [df_jan, df_feb]
df = pd.concat(dfs, ignore_index=True)

# How many columns are there?
print("Number of columns:", df.shape[1])

In [None]:
# What's the standard deviation of the trips duration in January?
def add_duration(df):
    """Add a ``duration`` column holding each trip's length in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime`` converted
    to fractional minutes. The column is written onto ``df`` itself (the input
    is modified in place) and the same object is returned so the call can be
    used in an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with a float ``duration`` column.
    """
    elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    # Vectorized conversion: avoids a Python-level call per row and always
    # produces floats, even when the frame has no rows.
    df["duration"] = elapsed.dt.total_seconds() / 60
    return df

# Attach the duration column to the January trips, then report its spread.
df_jan = add_duration(df_jan)
jan_duration_std = df_jan["duration"].std()
print("Standard deviation of the trips duration in January:", jan_duration_std)


In [None]:
def filter_duration(df, min_minutes=1, max_minutes=60):
    """Keep only the rows whose ``duration`` lies within the given bounds.

    Both bounds are inclusive. Rows with a missing duration are dropped,
    because a missing value is never inside the range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``duration`` column (minutes).
    min_minutes : float, default 1
        Shortest trip duration to keep.
    max_minutes : float, default 60
        Longest trip duration to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, with their original index labels.
    """
    in_range = df["duration"].between(min_minutes, max_minutes)
    return df[in_range]

# Drop the January trips whose duration falls outside the accepted range.
filtered_jan_df = filter_duration(df_jan)

# What share of the records is left after dropping the outliers?
# (Reported as a percentage, rounded to two decimals.)
kept_pct = round(len(filtered_jan_df) / len(df_jan) * 100, 2)
print("Fraction of the records left after you dropped the outliers:", kept_pct, "%")


In [None]:
# Restrict the filtered January trips to the pickup and dropoff location IDs,
# cast to strings before they are handed to the vectorizer.
location_cols = ["PULocationID", "DOLocationID"]
train_locations = filtered_jan_df[location_cols].astype(str)

# One dict per trip: {"PULocationID": "...", "DOLocationID": "..."}.
train_dicts = train_locations.to_dict(orient="records")

# Fit the vectorizer on the training dicts and one-hot encode them into a
# dense array (sparse=False); `vec` is reused later for the validation data.
vec = DictVectorizer(sparse=False)
train_df = vec.fit_transform(train_dicts)

# How many columns are there now?
first_row = train_df[0]
print("Number of columns after one-hot encoding:", len(first_row))

In [None]:
# Wrap the encoded matrix in a DataFrame labelled with the vectorizer's
# feature names.
train_df = pd.DataFrame(train_df, columns=vec.feature_names_)

# Target: trip duration in minutes, as a plain array.
y_train = filtered_jan_df["duration"].values

# Fit a plain linear regression with default parameters.
model = LinearRegression()
model.fit(train_df, y_train)

# What's the RMSE of the model on the training data?
y_pred = model.predict(train_df)
squared_errors = (y_pred - y_train) ** 2
rmse = squared_errors.mean() ** .5

print("RMSE of the model on the training data:", rmse)

In [None]:
# Evaluating the model
# Apply the model trained on January to the validation dataset df_feb,
# preparing it with the same steps: duration column, outlier filter,
# string-typed location IDs, list of dicts.
df_feb = add_duration(df_feb)
filtered_df_feb = filter_duration(df_feb)
val_locations = filtered_df_feb[["PULocationID", "DOLocationID"]].astype(str)
val_dicts = val_locations.to_dict(orient="records")

# Use transform (not fit_transform) so the validation matrix has exactly the
# columns learned from the training data.
test_df = vec.transform(val_dicts)
test_df = pd.DataFrame(test_df, columns=vec.feature_names_)

# What's the RMSE of the model on the validation data?
y_pred = model.predict(test_df)

# Take the target as a plain array, as the training cell does: the filtered
# frame keeps its original (non-contiguous) row labels, and the comparison
# with the positional prediction array should not depend on them.
y_val = filtered_df_feb["duration"].values
rmse = ((y_pred - y_val) ** 2).mean() ** .5
print("RMSE of the model on the validation data:", rmse)