In [None]:
# ================================================
# MODELING: Polynomial, Ridge, Lasso, KNN, RandomForest
# ================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Load the dataset ---
# NOTE: replace the file name below with the path to your own dataset.
df = pd.read_csv("clean_data_scaled_new.csv")

# Predictors are every column except the target; rename the target here if
# your dataset uses a different column name.
X = df.drop(columns="DEP_DELAY")
y = df["DEP_DELAY"]

# --- Hold out 20% of the rows for testing (fixed seed for reproducibility) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Scale features ---
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics leak into the fitted parameters.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Shared evaluation helper ---
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and report its test-split metrics.

    The model is fitted in place, predictions are made on ``X_test``, and the
    MAE, MSE, RMSE and R² scores are printed under a header showing ``name``.

    Args:
        name: Label used in the printed header and in the returned record.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets the predictions are scored against.

    Returns:
        A dict with the keys ``"Model"``, ``"MAE"``, ``"MSE"``, ``"RMSE"``
        and ``"R2"``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    abs_error = mean_absolute_error(y_test, predictions)
    sq_error = mean_squared_error(y_test, predictions)
    scores = {
        "Model": name,
        "MAE": abs_error,
        "MSE": sq_error,
        "RMSE": np.sqrt(sq_error),  # RMSE is derived from the MSE
        "R2": r2_score(y_test, predictions),
    }

    report_lines = (
        f"\n===== {name} =====",
        f"MAE : {scores['MAE']:.3f}",
        f"MSE : {scores['MSE']:.3f}",
        f"RMSE: {scores['RMSE']:.3f}",
        f"R²  : {scores['R2']:.3f}",
    )
    for line in report_lines:
        print(line)

    return scores

# --- Model list ---
# Each evaluated model appends one metrics record (a dict) to this list.
results = []

# 1️⃣ Polynomial Regression
# The degree is kept in one place so the label reported in the results always
# matches the polynomial expansion that was actually fitted.
poly_degree = 3
poly = PolynomialFeatures(degree=poly_degree)
# Fit the expansion on the training split only, then reuse it on the test split.
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)
poly_reg = LinearRegression()
results.append(
    evaluate_model(
        f"Polynomial Regression (deg={poly_degree})",
        poly_reg,
        X_train_poly,
        X_test_poly,
        y_train,
        y_test,
    )
)

# The models below are disabled; uncomment a section to include it in the run.

# # 2️⃣ Ridge Regression
# ridge = Ridge(alpha=1.0)
# results.append(evaluate_model("Ridge Regression", ridge, X_train_scaled, X_test_scaled, y_train, y_test))

# # 3️⃣ Lasso Regression
# lasso = Lasso(alpha=0.001, max_iter=10000)
# results.append(evaluate_model("Lasso Regression", lasso, X_train_scaled, X_test_scaled, y_train, y_test))

# # 4️⃣ K-Nearest Neighbors
# knn = KNeighborsRegressor(n_neighbors=5)
# results.append(evaluate_model("KNN Regression", knn, X_train_scaled, X_test_scaled, y_train, y_test))

# # 5️⃣ Random Forest Regressor
# rf = RandomForestRegressor(n_estimators=200, random_state=42)
# results.append(evaluate_model("Random Forest Regression", rf, X_train, X_test, y_train, y_test))  # no scaling needed

# --- Summarize results ---
# One row per evaluated model, best R² first.
results_df = pd.DataFrame(results)
print("\n\n=== SUMMARY RESULTS ===")
print(results_df.sort_values(by="R2", ascending=False))
