In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [2]:
from sklearn.model_selection import train_test_split

# Problem selector: change this one number to point the notebook at another dataset.
PROBLEM_NUM = 36

# Input files for the selected problem: features, targets, and evaluation features.
X_path = f"./data_31_40/problem_{PROBLEM_NUM}/dataset_{PROBLEM_NUM}.csv"
y_path = f"./data_31_40/problem_{PROBLEM_NUM}/target_{PROBLEM_NUM}.csv"
Xeval_path = f"./data_31_40/problem_{PROBLEM_NUM}/EVAL_{PROBLEM_NUM}.csv"

# Read the three CSVs in the same order as the paths above.
X, y, X_eval = (pd.read_csv(csv_path) for csv_path in (X_path, y_path, Xeval_path))

# Only the "target01" column of the target file is modelled in this notebook.
y1 = y["target01"]

print(f"Problem {PROBLEM_NUM}")
print(f"X: {X.shape}, y1: {y1.shape}, X_eval: {X_eval.shape}")
# The fitted pipeline is later applied to X_eval, so both frames must share
# the same columns in the same order.
assert X.columns.tolist() == X_eval.columns.tolist(), "Train/EVAL column mismatch!"

# Hold out 20% of the labelled rows; model selection and tuning only ever see
# the remaining 80%, so the hold-out can expose overfitting.
split_options = {"test_size": 0.2, "random_state": 42, "shuffle": True}
X_train, X_val, y_train, y_val = train_test_split(X, y1, **split_options)

print("\nTrain/Val Split:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")


Problem 36
X: (10000, 273), y1: (10000,), X_eval: (10000, 273)

Train/Val Split:
X_train: (8000, 273), y_train: (8000,)
X_val: (2000, 273), y_val: (2000,)


In [3]:
# Every column of X is routed through the same branch below.
# NOTE(review): this treats all columns as numeric - confirm against the dataset.
num_cols = X.columns.tolist()

# Numeric branch: fill missing values with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Single-branch transformer; any column not listed in num_cols would be dropped.
preprocess = ColumnTransformer(
    transformers=[("num", numeric_pipeline, num_cols)],
    remainder="drop",
)


In [4]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# GridSearchCV and cross-validation helpers maximise the score, so RMSE is
# registered as a loss: greater_is_better=False makes scikit-learn negate it.
# Callers therefore receive -RMSE and flip the sign back when reporting.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Shared 5-fold splitter; the fixed seed keeps folds identical across the
# initial scan and the grid search so their scores are comparable.
cv = KFold(n_splits=5, shuffle=True, random_state=42)


In [5]:
# Candidate regressors keyed by display name. The same keys are used to look up
# each model's search space in `param_grids`.
models = {}

# Disabled candidates - uncomment a line to include that model in the run.
# models["LinearRegression"] = LinearRegression()
# models["Ridge"] = Ridge(random_state=42)
# models["Lasso"] = Lasso(random_state=42)
# models["ElasticNet"] = ElasticNet(random_state=42)
# models["SVR"] = SVR()
# models["RandomForest"] = RandomForestRegressor(random_state=42, n_jobs=-1)
# models["HistGBR"] = HistGradientBoostingRegressor(random_state=42)
# models["XGBoost"] = XGBRegressor(random_state=42, n_jobs=-1, objective="reg:squarederror")

# Currently the only active candidate.
models["LightGBM"] = LGBMRegressor(n_jobs=-1, random_state=42)

# Search spaces for GridSearchCV, keyed by the same names used in `models`.
# Every key carries the "model__" prefix so it addresses the pipeline step
# named "model". Grids are kept deliberately small for faster runtime.

# Linear family: only the regularisation strength / L1-L2 mix is searched.
linear_grids = {
    "LinearRegression": {},
    "Ridge": {"model__alpha": [0.1, 1.0, 10.0, 100.0]},
    "Lasso": {"model__alpha": [1e-4, 1e-3, 1e-2, 1e-1]},
    "ElasticNet": {
        "model__alpha": [1e-4, 1e-3, 1e-2],
        "model__l1_ratio": [0.2, 0.5, 0.8],
    },
}

# SVR grid is currently disabled (no "SVR" key exists in param_grids):
# "SVR": {
#     "model__C": [0.1, 1, 10],
#     "model__epsilon": [0.01, 0.1, 0.2],
#     "model__gamma": ["scale"]
# },

# Tree ensembles and boosted models.
ensemble_grids = {
    "RandomForest": {
        "model__n_estimators": [300],
        "model__max_depth": [None, 20],
        "model__min_samples_leaf": [1, 10],
        "model__max_features": ["sqrt"],
    },
    "HistGBR": {
        "model__max_depth": [None, 6],
        "model__learning_rate": [0.05, 0.1],
        "model__max_iter": [500, 1000],
        "model__min_samples_leaf": [20],
    },
    "XGBoost": {
        "model__n_estimators": [500, 1000],
        "model__max_depth": [4, 6],
        "model__learning_rate": [0.05, 0.1],
        "model__subsample": [0.8],
        "model__colsample_bytree": [0.8],
    },
    "LightGBM": {
        "model__n_estimators": [800, 1500, 2000],
        "model__learning_rate": [0.01, 0.1],
        "model__max_depth": [-1, 10],
        # Further LightGBM knobs, currently not searched:
        # "model__num_leaves": [31, 63],
        # "model__subsample": [0.8],
        # "model__colsample_bytree": [0.8],
        # "model__min_child_samples": [20],
    },
}

# Merge in the original key order: linear models first, then ensembles.
param_grids = {**linear_grids, **ensemble_grids}


In [6]:
from sklearn.model_selection import cross_val_score

scan_results = []

print("Initial Model Scan (using training data only)")
print("="*90)

# Both metrics are collected in ONE cross-validation pass, so every fold is
# fitted once rather than once per metric, and RMSE and R² are guaranteed to
# come from the same fitted models.
scan_scoring = {"rmse": rmse_scorer, "r2": "r2"}

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])

    # CV on training data only (no data leakage): X_val is never touched here.
    cv_out = cross_validate(
        pipe, X_train, y_train, cv=cv, scoring=scan_scoring, n_jobs=-1
    )
    rmse_scores = -cv_out["test_rmse"]  # scorer yields -RMSE; flip the sign back
    r2_scores = cv_out["test_r2"]

    # Tuple layout: (name, rmse_mean, rmse_std, r2_mean, r2_std)
    scan_results.append((name, rmse_scores.mean(), rmse_scores.std(),
                        r2_scores.mean(), r2_scores.std()))

scan_results = sorted(scan_results, key=lambda x: x[1])  # lower RMSE better
print(f"{'Model':<16} {'RMSE':<20} {'R²':<20}")
print("="*90)
for r in scan_results:
    print(f"{r[0]:<16} {r[1]:.4f} ± {r[2]:.4f}      {r[3]:.4f} ± {r[4]:.4f}")
print("="*90)


Initial Model Scan (using training data only)
Model            RMSE                 R²                  
LightGBM         0.1109 ± 0.0057      0.7633 ± 0.0235


In [7]:
all_results = []   # (name, best CV RMSE, best params) per successfully tuned model
best_models = {}   # name -> best pipeline, refitted on X_train by GridSearchCV

print("\nHyperparameter Tuning (using training data only)")
print("="*80)

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocess", preprocess), ("model", model)])

    # A model enabled in `models` may have no entry in `param_grids` (the SVR
    # grid, for instance, is commented out). Indexing directly would raise
    # KeyError before the try block below could report it, so fall back to an
    # empty grid, which evaluates the model once with its current parameters.
    grid = param_grids.get(name)
    if grid is None:
        print(f"No parameter grid defined for {name}; tuning with default parameters only.")
        grid = {}

    # GridSearchCV on training data only (no data leakage)
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring=rmse_scorer,
        cv=cv,
        n_jobs=-1,
        verbose=1,
        refit=True
    )

    try:
        print(f"\nTraining {name}...")
        gs.fit(X_train, y_train)

        # best_score_ is -RMSE (the scorer is negated), so flip the sign.
        all_results.append((name, -gs.best_score_, gs.best_params_))
        best_models[name] = gs.best_estimator_
        print(f"{name} completed: CV RMSE={-gs.best_score_:.4f}")
    except Exception as e:
        # Deliberate best-effort: report the failure and move on so one broken
        # candidate does not abort the whole comparison.
        print(f"ERROR with {name}: {e}")

all_results = sorted(all_results, key=lambda x: x[1])  # lower RMSE better
print("\n" + "="*80)
print("FINAL RESULTS (CV on Training Data):")
print("="*80)
for name, score, params in all_results:
    print(f"{name:<16} RMSE={score:.4f}  params={params}")



Hyperparameter Tuning (using training data only)

Training LightGBM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64779
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 273
[LightGBM] [Info] Start training from score -0.124016
LightGBM completed: CV RMSE=0.1008

FINAL RESULTS (CV on Training Data):
LightGBM         RMSE=0.1008  params={'model__learning_rate': 0.01, 'model__max_depth': 10, 'model__n_estimators': 2000}


In [8]:
from sklearn.metrics import r2_score

# The tuning loop swallows per-model failures, so it can finish with nothing
# fitted. Fail here with a clear message instead of an IndexError further down.
if not best_models:
    raise RuntimeError(
        "No tuned models available for validation: every candidate failed "
        "during hyperparameter tuning."
    )

print("\n" + "="*80)
print("VALIDATION SET EVALUATION - Checking for Overfitting")
print("="*80)

# A model is flagged when validation RMSE exceeds training RMSE by more than this.
RMSE_GAP_WARN = 0.05

validation_results = []

for name, model_estimator in best_models.items():
    # Train predictions (estimator was already refitted on X_train by GridSearchCV)
    y_train_pred = model_estimator.predict(X_train)
    train_rmse = rmse(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)

    # Validation predictions (unseen data - detects overfitting)
    y_val_pred = model_estimator.predict(X_val)
    val_rmse = rmse(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    # Overfitting gaps: both are positive when validation is worse than train.
    rmse_gap = val_rmse - train_rmse
    r2_gap = train_r2 - val_r2

    # Tuple layout: (name, train_rmse, val_rmse, train_r2, val_r2, rmse_gap, r2_gap)
    validation_results.append((name, train_rmse, val_rmse, train_r2, val_r2, rmse_gap, r2_gap))

# Sort by validation RMSE (best generalization)
validation_results = sorted(validation_results, key=lambda x: x[2])

print(f"{'Model':<16} {'Train RMSE':<12} {'Val RMSE':<12} {'Train R²':<10} {'Val R²':<10} {'RMSE Gap':<10}")
print("="*80)
# r2_gap is kept in each result tuple but is not shown as a table column.
for row_name, row_train_rmse, row_val_rmse, row_train_r2, row_val_r2, row_rmse_gap, _row_r2_gap in validation_results:
    gap_indicator = "⚠️" if row_rmse_gap > RMSE_GAP_WARN else ""  # Flag if validation is much worse
    print(f"{row_name:<16} {row_train_rmse:<12.4f} {row_val_rmse:<12.4f} {row_train_r2:<10.4f} {row_val_r2:<10.4f} {row_rmse_gap:<10.4f} {gap_indicator}")
print("="*80)
print("Note: Large RMSE Gap or R² Gap indicates overfitting")

# First entry after sorting has the lowest validation RMSE.
best_name_val = validation_results[0][0]
print(f"\nBest model by validation RMSE: {best_name_val}")



VALIDATION SET EVALUATION - Checking for Overfitting
Model            Train RMSE   Val RMSE     Train R²   Val R²     RMSE Gap  
LightGBM         0.0259       0.0877       0.9871     0.8535     0.0617     ⚠️
Note: Large RMSE Gap or R² Gap indicates overfitting

Best model by validation RMSE: LightGBM




In [9]:
# Retrain best model on FULL training data for final predictions
print("\n" + "="*80)
print("FINAL MODEL TRAINING ON FULL DATA")
print("="*80)

# Use best model from validation results. Note this is the estimator INSTANCE
# registered in `models`, not a class.
best_model_class = models[best_name_val]
best_params_dict = next(params for name, _, params in all_results if name == best_name_val)

# Create a fresh pipeline with the best hyperparameters. Both steps are cloned:
# Pipeline does not copy its steps, so without clone() the set_params/fit calls
# below would mutate the shared `preprocess` and `models[...]` objects that the
# earlier scan and tuning cells build their pipelines from.
final_pipe = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("model", clone(best_model_class)),
])
final_pipe.set_params(**best_params_dict)

# Train on ALL labelled data (X, y1), i.e. train + validation rows together.
print(f"Retraining {best_name_val} on full training data...")
final_pipe.fit(X, y1)

# Metrics on the full data. These are in-sample (the model was fitted on X),
# so they are optimistic; the validation RMSE from the previous cell is the
# honest estimate of generalisation error.
y_full_pred = final_pipe.predict(X)
full_rmse = rmse(y1, y_full_pred)
full_r2 = r2_score(y1, y_full_pred)

print(f"\nFinal model: {best_name_val}")
print(f"Full data RMSE: {full_rmse:.4f}")
print(f"Full data R²: {full_r2:.4f}")
print(f"Best params: {best_params_dict}")

# Generate predictions for EVAL set
eval_pred = final_pipe.predict(X_eval)

# Save predictions as a single-column CSV, one row per EVAL row, no index.
output_filename = f"EVAL_target01_{PROBLEM_NUM}.csv"
submission = pd.DataFrame({"target01": eval_pred})
submission.to_csv(output_filename, index=False)

print(f"\nSaved: {output_filename}")
print(f"Predictions shape: {eval_pred.shape}")
print(f"Predictions range: [{eval_pred.min():.4f}, {eval_pred.max():.4f}]")



FINAL MODEL TRAINING ON FULL DATA
Retraining LightGBM on full training data...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64779
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 273
[LightGBM] [Info] Start training from score -0.123096





Final model: LightGBM
Full data RMSE: 0.0300
Full data R²: 0.9828
Best params: {'model__learning_rate': 0.01, 'model__max_depth': 10, 'model__n_estimators': 2000}

Saved: EVAL_target01_36.csv
Predictions shape: (10000,)
Predictions range: [-0.4805, 0.3249]
