In [1]:
!pip install -q hillclimbers

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for hillclimbers (setup.py) ... [?25l[?25hdone


In [2]:
import os
import warnings
import numpy as np
import pandas as pd

from hillclimbers import climb_hill, partial
from sklearn.metrics import mean_squared_error

# Suppress every warning for the rest of the session. This is global, so
# deprecation and dtype warnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

# Column that holds the label in train.csv and the prediction in every
# per-model OOF / submission file read below.
TARGET = "exam_score"
# Directory that is listed for per-model "*_oof.csv" / "*_sub.csv" files.
BASE_PATH = "/kaggle/input/s6e1-models"

In [3]:
# Read the three competition files in one pass: the labelled training set,
# the unlabelled test set, and the submission template that is filled in at
# the end of the notebook.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s6e1/train.csv",
        "/kaggle/input/playground-series-s6e1/test.csv",
        "/kaggle/input/playground-series-s6e1/sample_submission.csv",
    )
)


In [4]:
# Quick inventory of the shared model directory: how many files it holds and
# how many of them follow the "<model>_oof.csv" naming convention.
files = os.listdir(BASE_PATH)
oof_files = list(filter(lambda name: name.endswith("_oof.csv"), files))

print("Total files:", len(files))
print("OOF files found:", len(oof_files))
# Peek at the first few names (directory order, not sorted).
oof_files[:10]

Total files: 142
OOF files found: 64


['xgb_ridge_oof.csv',
 'extra_tree_8.59813_oof.csv',
 'xgboost_8.60034_oof.csv',
 'xlearn-ffm_oof.csv',
 'rohan_oof.csv',
 '8.7910_catboost_oof.csv',
 'akira_lgb_oof.csv',
 'xgb_fe_ridge_oof.csv',
 'godara_xgb_oof.csv',
 'xgb-ridge_oof.csv']

In [5]:
# Re-list the model directory and bucket file names by a case-insensitive
# substring match, sorted so that later iteration order is deterministic.
# NOTE(review): matching on the substring "oof" is looser than the
# "_oof.csv" suffix that the loading loop uses to pair files, and
# `test_files` is not referenced by any cell visible here -- confirm whether
# suffix matching was intended.
files = os.listdir(BASE_PATH)

oof_files = sorted(name for name in files if "oof" in name.lower())
test_files = sorted(
    name
    for name in files
    if any(tag in name.lower() for tag in ("test", "sub"))
)

print("OOF files:", len(oof_files))
print("Test files:", len(test_files))


OOF files: 65
Test files: 77


In [6]:
# Load each model's out-of-fold (OOF) and test predictions and score the OOF
# predictions against the training target.
#
# Files are paired by name: "<model>_oof.csv" goes with "<model>_sub.csv".
# Anything that cannot be loaded cleanly is skipped, and the reason is kept in
# `skipped` so that dropped models are visible instead of vanishing silently.
#
# NOTE(review): only the TARGET column is read, so rows are assumed to be in
# the same order as train_df / test_df -- confirm for externally produced files.
oof_preds = {}   # model name -> OOF predictions (one value per train row)
test_preds = {}  # model name -> test predictions (one value per test row)
scores = {}      # model name -> OOF RMSE
skipped = {}     # OOF file name -> reason it was left out

for oof_file in oof_files:
    # Only an exact "_oof.csv" suffix takes part in the pairing. Without this
    # guard, a name that merely contains "oof" is left unchanged by the suffix
    # swap, so the file would be paired with itself and its OOF values would
    # be used as test predictions.
    if not oof_file.endswith("_oof.csv"):
        skipped[oof_file] = "name does not end with '_oof.csv'"
        continue

    # Strip the suffix by slicing so only the trailing occurrence is removed.
    model_name = oof_file[: -len("_oof.csv")]
    sub_file = model_name + "_sub.csv"
    sub_path = os.path.join(BASE_PATH, sub_file)

    if not os.path.exists(sub_path):
        skipped[oof_file] = f"no matching submission file ({sub_file})"
        continue

    try:
        oof = pd.read_csv(os.path.join(BASE_PATH, oof_file))[TARGET].values
        test = pd.read_csv(sub_path)[TARGET].values

        if len(oof) != len(train_df):
            skipped[oof_file] = (
                f"OOF has {len(oof)} rows, expected {len(train_df)}"
            )
            continue

        # A submission of the wrong length cannot be blended with the others
        # or written into the submission template later on.
        if len(test) != len(test_df):
            skipped[oof_file] = (
                f"submission has {len(test)} rows, expected {len(test_df)}"
            )
            continue

        rmse = np.sqrt(mean_squared_error(train_df[TARGET], oof))

    except Exception as exc:
        # Loading stays best-effort (one bad file must not abort the run),
        # but the failure is recorded rather than swallowed.
        skipped[oof_file] = f"{type(exc).__name__}: {exc}"
        continue

    oof_preds[model_name] = oof
    test_preds[model_name] = test
    scores[model_name] = rmse

print(f"Loaded {len(scores)} models, skipped {len(skipped)} files")
for skipped_file, reason in skipped.items():
    print(f"  {skipped_file}: {reason}")

In [7]:
# Leaderboard of the loaded models: one row per model name, a single "rmse"
# column, ordered from lowest (best) to highest OOF RMSE.
score_df = pd.DataFrame.from_dict(scores, orient="index", columns=["rmse"])
score_df = score_df.sort_values(by="rmse")

# Show the top of the leaderboard.
score_df.head(30)


Unnamed: 0,rmse
sung,8.592464
sunghur,8.592464
sunghur_ensemble,8.593624
catboost_8.59505,8.595047
histgb_8.59509,8.595093
xgb_limitdepth_8.59673,8.596731
extra_tree_8.59813,8.598129
xgb_best,8.598726
xgb_fe,8.599125
omid_ensemble,8.599365


In [8]:
# Shortlist the models whose OOF RMSE is at most 0.15 worse than the leader.
# score_df is sorted ascending by "rmse", so its first row is the best model.
best_rmse = score_df["rmse"].iloc[0]
rmse_cutoff = best_rmse + 0.15

within_cutoff = score_df["rmse"] <= rmse_cutoff
selected_models = score_df.loc[within_cutoff].index.tolist()

print("Selected models:", len(selected_models))

Selected models: 57


In [9]:
# Assemble the shortlisted predictions into two frames with one column per
# model, in leaderboard order: OOF predictions and test predictions.
oof_df = pd.DataFrame(
    {model_name: oof_preds[model_name] for model_name in selected_models}
)
test_df_pred = pd.DataFrame(
    {model_name: test_preds[model_name] for model_name in selected_models}
)

In [10]:
# NOTE: correlation pruning is currently disabled. The block below is kept
# commented out for reference only and has no effect on the ensemble.
# NOTE(review): if it is re-enabled, ties need handling. When
# scores[i] == scores[j], the (i, j) visit marks j as the worse model and the
# (j, i) visit marks i, so both models of a highly correlated, equally scored
# pair end up in `to_drop`.
# corr = oof_df.corr().abs()

# to_drop = set()
# for i in corr.columns:
#     for j in corr.columns:
#         if i != j and corr.loc[i, j] > 0.995:
#             worse = i if scores[i] > scores[j] else j
#             to_drop.add(worse)

# oof_df = oof_df.drop(columns=to_drop)
# test_df_pred = test_df_pred.drop(columns=to_drop)

# print("After correlation pruning:", oof_df.shape[1])

In [13]:
# Blend the shortlisted models with hillclimbers. Because
# return_oof_preds=True, the call is unpacked into the blended test
# predictions and the blended OOF predictions.
hc_test, hc_oof = climb_hill(
    # Data: labelled training frame plus the per-model prediction frames.
    train=train_df,
    target=TARGET,
    oof_pred_df=oof_df,
    test_pred_df=test_df_pred,
    # Objective: minimise MSE, wrapped with the `partial` imported from
    # hillclimbers alongside `climb_hill`.
    eval_metric=partial(mean_squared_error),
    objective="minimize",
    # Search settings.
    # NOTE(review): presumably the weight step size and whether negative
    # blend weights are allowed -- confirm against the hillclimbers docs.
    precision=0.001,
    negative_weights=True,
    # Output: draw the hill plot, skip the histogram, return OOF predictions.
    plot_hill=True,
    plot_hist=False,
    return_oof_preds=True,
)

[1m[34m   /\  
  /__\  hillclimbers[0m[1m 
 /    \
/______\ 
[0m
[1m[33mModels to be ensembled | (57 total):[0m 

[1m[32msung:                   73.83043 (best solo model)[0m
[1msunghur:                73.83043[0m
[1msunghur_ensemble:       73.85037[0m
[1mcatboost_8.59505:       73.87484[0m
[1mhistgb_8.59509:         73.87562[0m
[1mxgb_limitdepth_8.59673: 73.90378[0m
[1mextra_tree_8.59813:     73.92783[0m
[1mxgb_best:               73.93808[0m
[1mxgb_fe:                 73.94496[0m
[1momid_ensemble:          73.94908[0m
[1mxgboost_8.60034:        73.96584[0m
[1mnpo:                    74.02002[0m
[1mtabm:                   74.05530[0m
[1mutaazu_tabm:            74.07248[0m
[1mrf_8.60663:             74.07401[0m
[1mknight_ensemble:        74.08308[0m
[1mspiritmilk:             74.08437[0m
[1mtmm:                    74.10966[0m
[1mhaha750:                74.11517[0m
[1makira_v2:               74.11779[0m
[1makira_xgb:              74.1180

In [14]:
# Report the blended OOF score as RMSE. The hill climb above optimises MSE,
# so the square root is taken here to match the per-model leaderboard.
final_oof_mse = mean_squared_error(train_df[TARGET], hc_oof)
final_oof_rmse = np.sqrt(final_oof_mse)
print("Final Hill-Climb OOF RMSE:", final_oof_rmse)

Final Hill-Climb OOF RMSE: 8.583047898103532


In [15]:
# Persist the blended OOF predictions keyed by the training ids.
# NOTE(review): these values are written as returned by the hill climb,
# whereas the test predictions are clipped to [0, 100] before submission --
# confirm the difference is intended if this file is reused for stacking.
hc_oof_frame = pd.DataFrame({"id": train_df["id"], TARGET: hc_oof})
hc_oof_frame.to_csv("hillclimb_oof_02.csv", index=False)

In [16]:
# Fill the submission template with the blended test predictions, limited to
# the interval [0, 100], and write the submission file.
hc_test_clipped = np.clip(hc_test, 0, 100)
submission_df[TARGET] = hc_test_clipped
submission_df.to_csv("hillclimb_submission_02.csv", index=False)

# Preview the first rows of the submission.
submission_df.head()

Unnamed: 0,id,exam_score
0,630000,69.425454
1,630001,67.488439
2,630002,90.372688
3,630003,56.409821
4,630004,46.682466
