In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# ----------------------------
# 1. LOAD DATA AND PREPROCESSING
# ----------------------------
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_ids = test["id"]

for df in [train, test]:
    for col in ["id", "string_id"]:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)

for col in ["humidity", "wind_speed", "pressure"]:
    train[col] = pd.to_numeric(train[col], errors="coerce")
    test[col]  = pd.to_numeric(test[col], errors="coerce")

numeric_cols = ["humidity", "wind_speed", "pressure"]
for col in numeric_cols:
    median_val = train[col].median()
    train[col] = train[col].fillna(median_val)
    test[col]  = test[col].fillna(median_val)

categorical_cols = ["installation_type", "error_code"]
for col in categorical_cols:
    train[col] = train[col].astype(str)
    test[col]  = test[col].astype(str)

train = pd.get_dummies(train, columns=categorical_cols)
test = pd.get_dummies(test, columns=categorical_cols)

train, test = train.align(test, join="left", axis=1, fill_value=0)

X = train.drop(columns=["efficiency"])
y = train["efficiency"]

X = X.fillna(0)
test = test.fillna(0)

# ----------------------------
# 2. FEATURE SCALING
# ----------------------------
test_features = test.drop(columns=["efficiency"], errors="ignore")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_features)

# ----------------------------
# 3. TRAIN–VALIDATION SPLIT (90:10)
# ----------------------------
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.10, random_state=42)

# ----------------------------
# 4. TRAIN BASE MODELS
# ----------------------------
# XGBoost
xgb_model = xgb.XGBRegressor(
    n_estimators=1000, learning_rate=0.03, max_depth=6,
    subsample=0.80, colsample_bytree=0.80,
    objective="reg:squarederror", eval_metric="rmse",
    verbosity=1, random_state=42, n_jobs=-1
)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# LightGBM
lgb_model = lgb.LGBMRegressor(
    n_estimators=1000, learning_rate=0.03, max_depth=6,
    subsample=0.80, colsample_bytree=0.80,
    objective="regression", metric="rmse",
    random_state=42, n_jobs=-1
)
lgb_callbacks = [lgb.early_stopping(stopping_rounds=50, verbose=True)]
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=lgb_callbacks)

# CatBoost
cat_model = CatBoostRegressor(
    iterations=1000, learning_rate=0.03, depth=6,
    loss_function='RMSE', random_seed=42,
    verbose=False
)
cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

# RandomForest
rf_model = RandomForestRegressor(
    n_estimators=300, max_depth=10,
    random_state=42, n_jobs=-1
)
rf_model.fit(X_train, y_train)

# AdaBoostRegressor
ada_model = AdaBoostRegressor(
    n_estimators=300, learning_rate=0.03, random_state=42
)
ada_model.fit(X_train, y_train)

# ----------------------------
# 5. BUILD STACKED FEATURES
# ----------------------------
xgb_val_preds = xgb_model.predict(X_val)
lgb_val_preds = lgb_model.predict(X_val)
cat_val_preds = cat_model.predict(X_val)
rf_val_preds = rf_model.predict(X_val)
ada_val_preds = ada_model.predict(X_val)

stacked_val = np.column_stack((xgb_val_preds, lgb_val_preds, cat_val_preds, rf_val_preds, ada_val_preds))

xgb_test_preds = xgb_model.predict(test_scaled)
lgb_test_preds = lgb_model.predict(test_scaled)
cat_test_preds = cat_model.predict(test_scaled)
rf_test_preds = rf_model.predict(test_scaled)
ada_test_preds = ada_model.predict(test_scaled)
stacked_test = np.column_stack((xgb_test_preds, lgb_test_preds, cat_test_preds, rf_test_preds, ada_test_preds))

# ----------------------------
# 6. TRAIN META MODEL (RIDGE) WITH GRID SEARCH
# ----------------------------
param_grid = {"alpha": [0.01, 0.1, 0.3, 1, 3, 10]}
ridge = Ridge(random_state=42)
grid_search = GridSearchCV(ridge, param_grid, scoring="neg_mean_squared_error", cv=5)
grid_search.fit(stacked_val, y_val)
meta_model = grid_search.best_estimator_
print("Best Ridge alpha:", grid_search.best_params_["alpha"])

val_preds = meta_model.predict(stacked_val)
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
custom_score = 100 * (1 - val_rmse)
print(f"✅ Validation RMSE: {val_rmse:.4f}")
print(f"✅ Custom Score: {custom_score:.2f}/100")

# ----------------------------
# 7. FINAL TEST PREDICTIONS AND SUBMISSION
# ----------------------------
final_test_preds = meta_model.predict(stacked_test)
submission = pd.DataFrame({"id": test_ids, "efficiency": final_test_preds})
submission.to_csv("submission.csv", index=False)
print("✅ 'submission.csv' created successfully.")

[0]	validation_0-rmse:0.14264
[1]	validation_0-rmse:0.14114
[2]	validation_0-rmse:0.14054
[3]	validation_0-rmse:0.13910
[4]	validation_0-rmse:0.13776
[5]	validation_0-rmse:0.13649
[6]	validation_0-rmse:0.13618
[7]	validation_0-rmse:0.13500
[8]	validation_0-rmse:0.13387
[9]	validation_0-rmse:0.13348
[10]	validation_0-rmse:0.13237
[11]	validation_0-rmse:0.13139
[12]	validation_0-rmse:0.13053
[13]	validation_0-rmse:0.12957
[14]	validation_0-rmse:0.12870
[15]	validation_0-rmse:0.12784
[16]	validation_0-rmse:0.12706
[17]	validation_0-rmse:0.12630
[18]	validation_0-rmse:0.12558
[19]	validation_0-rmse:0.12532
[20]	validation_0-rmse:0.12462
[21]	validation_0-rmse:0.12396
[22]	validation_0-rmse:0.12344
[23]	validation_0-rmse:0.12289
[24]	validation_0-rmse:0.12234
[25]	validation_0-rmse:0.12181
[26]	validation_0-rmse:0.12131
[27]	validation_0-rmse:0.12088
[28]	validation_0-rmse:0.12053
[29]	validation_0-rmse:0.12009
[30]	validation_0-rmse:0.11977
[31]	validation_0-rmse:0.11935
[32]	validation_0-



Early stopping, best iteration is:
[222]	valid_0's rmse: 0.110711




Best Ridge alpha: 0.1
✅ Validation RMSE: 0.1102
✅ Custom Score: 88.98/100
✅ 'submission.csv' created successfully.
