In [14]:
import fastparquet
import pyarrow
import pandas as pd
import numpy as np

In [16]:
# Load the trip records into a DataFrame with the pyarrow Parquet engine.
# NOTE(review): the recorded run of this cell failed with ArrowInvalid
# ("Parquet magic bytes not found"), so confirm the file on disk really is a
# Parquet file before relying on the cells below.
trip_data_path = "yellow_trip_data.parquet"
df = pd.read_parquet(trip_data_path, engine="pyarrow")

ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Count missing values per column.
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Print column dtypes, non-null counts and memory usage.
df.info(verbose=None)

In [None]:
# Important Feature Information

# Target: Duration
# Model: Regression Model
# Independent features: trip_distance, total_amount, PULocationID, DOLocationID, pickup_time -> (which has to be converted into time_sin and time_cos)

In [None]:
# Trip length = drop-off timestamp minus pick-up timestamp, stored as text
# so the string-based minute parser in the following cells can consume it.
df["duration"] = (
    df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
).astype("str")

# Show the first value to check the string layout.
df.loc[0, "duration"]

In [None]:
def getMins(x):
    """Convert a stringified timedelta into whole minutes.

    Parameters
    ----------
    x : str
        Text of the form ``"<days> days [+]HH:MM:SS[.ffffff]"``, for example
        ``"0 days 00:03:56"`` or ``"-1 days +23:59:00"``.

    Returns
    -------
    int
        The duration in minutes, rounded down. The day count is included, so
        multi-day and negative durations are handled; for ordinary same-day
        values the result equals ``hours * 60 + minutes``.

    Raises
    ------
    ValueError
        If ``x`` does not consist of three whitespace-separated parts with an
        ``HH:MM:SS`` clock field, or a field is not numeric.
    """
    parts = x.split()
    if len(parts) != 3:
        raise ValueError(f"unrecognised duration string: {x!r}")

    days = int(parts[0])
    # int() accepts the leading "+" that appears on the clock field of
    # negative durations (e.g. "+23:59:00").
    hours, minutes, seconds = parts[2].split(":")
    # The seconds field is always below 60, so it never changes the floored
    # minute count; parse it only to reject malformed input (float() also
    # tolerates fractional seconds such as "56.500000").
    float(seconds)

    return days * 1440 + int(hours) * 60 + int(minutes)

In [None]:
# Replace each duration string with its length in minutes.
duration_text = df["duration"]
df["duration"] = duration_text.apply(getMins)

In [None]:
# Split the pick-up timestamp into hour and minute, then combine them into a
# single fractional hour-of-day value (e.g. 13:30 -> 13.5).
pickup_ts = df["lpep_pickup_datetime"]
df["hour"] = pickup_ts.dt.hour
df["minute"] = pickup_ts.dt.minute
df["time_numeric"] = df["hour"] + df["minute"] / 60

In [None]:
# Map the hour of day onto a circle (24 h = one full turn) so that times just
# before and just after midnight end up close together.
day_angle = 2 * np.pi * df["time_numeric"] / 24
df["time_sin"] = np.sin(day_angle)
df["time_cos"] = np.cos(day_angle)

In [None]:
# Feature groups: location IDs are treated as categories, the rest as numbers.
numerical_cols = ["time_sin", "time_cos", "trip_distance", "total_amount"]
nominal_cols = ["PULocationID", "DOLocationID"]

# Keep only the model inputs plus the target, in that column order.
df = df[[*numerical_cols, *nominal_cols, "duration"]]

In [None]:
# Preview the reduced frame (features + target).
df.head(n=5)

In [None]:
# Separate the regression target from the feature matrix.
y = df["duration"]
X = df.drop(columns="duration")

In [None]:
# Candidate models: XGBoost, random forest, linear regression, k-nearest neighbours (KNN)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

## LinearRegression Model

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One-hot encode the location IDs (categories unseen during fit are ignored
# at transform time) and standardise the numeric features.
categorical_step = (
    "cat",
    OneHotEncoder(sparse_output=True, handle_unknown="ignore"),
    nominal_cols,
)
numeric_step = ("num", StandardScaler(), numerical_cols)

preprocess = ColumnTransformer([categorical_step, numeric_step])

In [None]:
# Chain preprocessing and the linear regressor so both are fitted together.
lr_steps = [
    ("preprocess", preprocess),
    ("model", LinearRegression()),
]
LR_model = Pipeline(lr_steps)

In [None]:
# Fit the preprocessing steps and the regressor on the training split.
LR_model.fit(X=X_train, y=y_train)

In [None]:
# Report the pipeline's score (R^2 for regressors) on both splits.
for label, features, target in (
    ("Training Score:", X_train, y_train),
    ("Testing Score:", X_test, y_test),
):
    print(label, LR_model.score(features, target))