In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

In [None]:
# Training split: the two principal components are the predictors and
# taxi_time is the regression target.
train = pd.read_csv("PCA_train_reduced_gX.csv")
X_train, y_train = train[["PC1", "PC2"]], train["taxi_time"]

In [None]:
# Linear regression of taxi_time on PC1/PC2. The fit call stays as the
# cell's final expression so the fitted estimator is still displayed.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Validation split, read from the PCA-reduced validation CSV.
val = pd.read_csv(filepath_or_buffer="PCA_validation_reduced_gX.csv")

In [None]:
# Same predictor/target layout as the training split.
X_val, y_val = val[["PC1", "PC2"]], val["taxi_time"]

In [None]:
# Predictions of the fitted PC1/PC2 regressor on the validation predictors.
y_val_pred = model.predict(X=X_val)

In [None]:
# One-variable regression of taxi_time on shortest_path, fitted on every row
# of the raw training file.
df = pd.read_csv("train_raw_gX.csv")
X = df[["shortest_path"]]
y = df["taxi_time"]
model_sp = LinearRegression().fit(X, y)
y_pred = model_sp.predict(X)

# Only rows with taxi_time > 1 are drawn; the fit above used all rows.
mask = y > 1
X_plot, y_plot, y_pred_plot = X[mask], y[mask], y_pred[mask]

# Order the points by x so the fitted values join into a single clean line.
x_flat = X_plot.values.flatten()
sorted_idx = np.argsort(x_flat)

fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(X_plot, y_plot, label="Actual", color="blue")
ax.scatter(X_plot, y_pred_plot, label="Predicted", color="orange")
ax.plot(
    x_flat[sorted_idx],
    y_pred_plot[sorted_idx],
    color="red",
    label="Regression Line",
    linewidth=2,
)
ax.set_xlabel("Shortest Path")
ax.set_ylabel("Taxi Time")
ax.set_title("Regression Analysis: shortest path")
ax.legend()
ax.set_ylim(bottom=0)

fig.savefig("Figure_4_Regression_shortest_path.png", dpi=300, bbox_inches="tight")

plt.show()

In [None]:
# One-variable regression of taxi_time on the `type_binary` column of the raw
# training frame `df` (loaded in an earlier cell), plotted with its fit.
X = df[["type_binary"]]
y = df["taxi_time"]
model_type = LinearRegression().fit(X, y)
y_pred = model_type.predict(X)

fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(X, y, label="Actual")
ax.plot(X, y_pred, color="red", label="Regression Line")
ax.set_ylim(bottom=0)
ax.set_xlabel("type")
ax.set_ylabel("Taxi Time")
ax.set_title("Regression Analysis: type")
ax.legend()
fig.savefig("Figure_5_Regression_type.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Predict on the PCA-reduced test split with the regressor bound to `model`
# and plot predictions against the actual taxi times.
test = pd.read_csv("PCA_test_reduced_gX.csv")
X_test, y_test = test[["PC1", "PC2"]], test["taxi_time"]
y_test_pred = model.predict(X_test)

# Common range covering both actual and predicted values, used as the
# endpoints of the identity ("perfect prediction") line.
min_val = min(y_test.min(), y_test_pred.min())
max_val = max(y_test.max(), y_test_pred.max())

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_test_pred, alpha=0.7, label="Predicted vs Actual")
ax.plot([min_val, max_val], [min_val, max_val], "r--", label="Perfect Prediction")
ax.set_xlabel("Actual Taxi Time")
ax.set_ylabel("Predicted Taxi Time")
ax.set_title("Actual vs Predicted Taxi Times")
ax.legend()
ax.grid(False)
fig.savefig(
    "Figure_8_LR_PCA_Test_Actual_vs_Predicted.png", dpi=300, bbox_inches="tight"
)
plt.show()

In [None]:
# Actual and predicted test taxi times drawn against sample position, so the
# two point clouds can be compared row by row.
y_test_pred = model.predict(X_test)

fig, ax = plt.subplots(figsize=(10, 6))
series_to_draw = (
    (y_test, "blue", "Actual Taxi Time"),
    (y_test_pred, "orange", "Predicted Taxi Time"),
)
for values, colour, legend_text in series_to_draw:
    # x is simply the 0-based position of each sample within its own series.
    ax.scatter(range(len(values)), values, color=colour, label=legend_text, alpha=0.7)
ax.set_xlabel("Index of Test Samples")
ax.set_ylabel("Taxi Time")
ax.set_title("Actual vs Predicted Taxi Time")
ax.legend()
ax.grid(False)
fig.savefig(
    "Figure_9_LR_PCA_Test_Actual_vs_Predicted_Index.png",
    dpi=300,
    bbox_inches="tight",
)
plt.show()

In [None]:
# Rank every numeric column of the raw training file by the R^2 of a
# one-variable linear regression against taxi_time. Each score is computed on
# the same rows the regression was fitted on.
df = pd.read_csv("train_raw_gX.csv")
df.columns = df.columns.str.strip()
if "type" in df.columns:
    # Normalise the labels, then encode them as numbers so the column survives
    # the numeric-dtype selection below. Any label other than
    # "arrival"/"departure" maps to NaN.
    df["type"] = df["type"].astype(str).str.strip().str.lower()
    df["type"] = df["type"].map({
        "arrival": 0,
        "departure": 1
    })
df_numeric = df.select_dtypes(include=[np.number])
y = df_numeric["taxi_time"]
feature_cols = df_numeric.columns.drop("taxi_time")
r2_scores = {}
for col in feature_cols:
    X = df_numeric[[col]]
    # Deliberately NOT named `model`: that global holds the PC1/PC2 regressor
    # used for the test-set predictions. Rebinding it here would silently make
    # any later (or re-run) `model.predict(X_test)` use a one-feature model.
    feature_model = LinearRegression()
    feature_model.fit(X, y)
    r2_scores[col] = feature_model.score(X, y)
r2_sorted = pd.Series(r2_scores).sort_values(ascending=False)
print(r2_sorted)

In [None]:
# Error metrics for the test-set predictions held in `y_test_pred`.
mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
rmse = np.sqrt(mse)  # root of the MSE, in the same units as taxi_time
