In [1]:
# ===================================================================
#  Library
# ===================================================================
import polars as pl
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Notebook-wide configuration constants (used as a plain namespace, never instantiated)."""
    # Not referenced by any cell visible in this notebook.
    seed = 42
    # Directory the OOF / test prediction CSVs are read from and the blended
    # submission is written to. NOTE(review): absolute, machine-specific path.
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"
    # Directory containing train.csv.
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    # Stem of the output submission file (written as f"{filename}.csv" under save_dir).
    filename = "exp073"
    # Not referenced by any visible cell; presumably left over from an
    # Optuna-based experiment — TODO confirm before removing.
    n_trials = 2000
    n_seeds = 3

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """Return the MAPE of ``y_pred`` against ``y_true``, expressed in percent.

    Thin wrapper around sklearn's ``mean_absolute_percentage_error`` that
    rescales the fractional error (e.g. 0.43) to a percentage (e.g. 43.0).
    Lower is better.
    """
    return mean_absolute_percentage_error(y_true, y_pred) * 100

In [4]:
# ===================================================================
#  DataLoading
# ===================================================================
train = pl.read_csv(CFG.data_dir + "train.csv", columns=["id", "price"])

# (OOF file under CFG.save_dir, name of its prediction column).
# The position in this list becomes the suffix of the unified column name,
# i.e. entry 0 -> pred_0, entry 1 -> pred_1, ...
oof_sources = [
    ("oof_df_exp051.csv", "pred"),
    ("oof_df_exp052.csv", "pred"),
    ("oof_df_exp053.csv", "pred"),
    ("oof_df_exp054.csv", "pred"),
    ("kun_exp00055_oof_pred.csv", "optimized_pred"),
    ("kun_exp00056_oof_pred.csv", "oof_pred"),
]

# The first source is the base frame; every later one is left-joined onto it by id.
df = None
for idx, (oof_file, pred_col) in enumerate(oof_sources):
    oof = pl.read_csv(CFG.save_dir + oof_file).rename({pred_col: f"pred_{idx}"})
    df = oof if df is None else df.join(oof, on="id", how="left")

# Attach the ground-truth target used for scoring.
df = df.join(train, on="id", how="left")

df.head()

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,price
i64,f64,f64,f64,f64,f64,f64,i64
0,8759.290471,8929.546032,8916.322165,8977.757097,8805.219623,7895.416,27587
1,3859.910547,3825.508199,3822.104706,3876.047282,3799.844508,4177.358,4724
2,3027.760649,3049.493176,2982.516292,3056.648628,2959.861854,2818.1372,10931
3,7990.517289,8005.600906,7993.295606,7894.338559,7934.6545,8179.846,16553
4,4524.287997,4479.006617,4569.630683,4570.899182,4461.927715,4321.6885,5158


In [5]:
# ===================================================================
#  Hill Climbing
# ===================================================================
# https://www.kaggle.com/competitions/feedback-prize-english-language-learning/discussion/369609


def evaluate(current_best_preds, k: int, best_score: float):
    """
    Grid-search the blend weight for one candidate model.

    For every w in np.arange(-0.5, 0.5, 0.01) the blend
        preds = current_best_preds*(1-w) + df[models[k]]*w
    is scored against df["price"] with get_score (lower is better).
    Negative weights are part of the grid.

    Reads the module-level ``df`` and ``models``.

    Returns
    -------
    (score, weight)
        The lowest score found and the w that produced it when some w is
        strictly better than ``best_score``; otherwise the ``best_score``
        that was passed in together with weight 0.
    """
    # Both lookups are loop-invariant, so fetch them once.
    candidate = df[models[k]]
    target = df["price"]

    chosen_weight = 0
    for w in np.arange(-0.5, 0.5, 0.01):
        blended = current_best_preds*(1-w) + candidate*w
        blended_score = get_score(y_true=target, y_pred=blended)
        # Strict improvement only: ties keep the earlier (or zero) weight.
        if blended_score < best_score:
            best_score, chosen_weight = blended_score, w
    return best_score, chosen_weight



stores = {}  # model name -> blend weight chosen by hill climbing
orders = []  # model names in the order hill climbing selected them
scores = {}  # model name -> score for the current round

# Every prediction column is a candidate model.
models = [name for name in df.columns if name not in ["id", "price"]]

# Seed the blend with the single model that has the best CV score.
for name in models:
    scores[name] = get_score(y_true=df["price"], y_pred=df[name])
selected_model = min(scores, key=scores.get)
current_best_preds = df[selected_model]
orders.append(selected_model)  # remember the selection order
stores[selected_model] = 1     # the first model enters with full weight

# Start Hill Climbing
i = 0
print(f"[{i}] baseline {selected_model} {min(scores.values())}")

while True:
    # Best score and winning model of the previous round.
    best_score = min(scores.values())
    selected_model = min(scores, key=scores.get)

    # Remove the winner so it cannot be selected a second time.
    models.remove(selected_model)

    # Stop once every model has been blended in.
    if not models:
        break

    # Find, for each remaining model, its best blend weight against the
    # current blend (weight 0 and the unchanged score if nothing improves).
    scores, weights = {}, {}
    for k, name in enumerate(models):
        scores[name], weights[name] = evaluate(current_best_preds, k, best_score)

    i += 1
    selected_model = min(scores, key=scores.get)
    print(f"[{i}] add {selected_model}: {min(scores.values())} {weights[selected_model]}")
    best_weight = weights[selected_model]
    stores[selected_model] = best_weight
    orders.append(selected_model)
    # Fold this round's winner into the running blend.
    current_best_preds = current_best_preds*(1-best_weight) + df[selected_model]*best_weight

[0] baseline pred_0 43.56925636218776
[1] add pred_5: 43.50093990230066 0.45000000000000084
[2] add pred_4: 43.500358657136026 0.09000000000000052
[3] add pred_2: 43.49842202750322 -0.17999999999999972
[4] add pred_3: 43.498085037353654 0.0700000000000005
[5] add pred_1: 43.49803242660311 -0.029999999999999583


In [6]:
# ===================================================================
#  Check
# ===================================================================
def get_preds(df: pl.DataFrame):
    """Replay the hill-climbing blend on ``df`` and return the blended predictions.

    Walks the module-level ``orders`` (selection order) and applies each
    model's weight from ``stores`` with the same update the search used:
        blend = blend*(1-w) + df[model]*w
    The first model was stored with weight 1, so the initial 0 is fully
    replaced by that model's predictions.
    """
    blended = 0
    for model_name in orders:
        weight = stores[model_name]
        blended = blended * (1 - weight) + df[model_name] * weight
    return blended

# Sanity check: replaying the stored weights on the OOF frame should
# reproduce the last score printed by the hill-climbing loop.
get_score(y_true=df["price"], y_pred=get_preds(df))

43.49803242660311

In [7]:
# ===================================================================
#  Test predictions / submission
# ===================================================================
# The per-model test prediction files are read without a header, so polars
# names their two columns column_1 / column_2.
raw_columns = {"column_1": "id", "column_2": "pred"}

# Base frame: exp051 becomes pred_0, sorted by id.
test = pl.read_csv(CFG.save_dir + "exp051.csv", has_header=False)
test = test.rename(raw_columns).sort("id").rename({"pred": "pred_0"})

# Remaining models, in the same order used for the OOF frame so that each
# pred_{i} column refers to the same model the blend weights were fitted on.
files = ["exp052.csv", "exp053.csv", "exp054.csv", "kun_exp00055.csv", "kun_exp00056.csv"]
for i, f in enumerate(files, start=1):
    model_preds = pl.read_csv(CFG.save_dir + f, has_header=False)
    model_preds = model_preds.rename(raw_columns).rename({"pred": f"pred_{i}"})
    test = test.join(model_preds, on="id", how="left")

# Apply the weights found by hill climbing to the test predictions.
blended = pl.Series(get_preds(test)).alias("pred")
test = test.with_columns(blended)
display(test.head())


# Write the submission as a header-less (id, pred) CSV.
submission = test.select(["id", "pred"])
submission.write_csv(CFG.save_dir + f"{CFG.filename}.csv", has_header=False)

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred
i64,f64,f64,f64,f64,f64,f64,f64
27532,9681.016732,9631.179775,9535.601879,9589.502294,9347.528622,8806.765109,9262.398861
27533,5589.905122,5625.972274,5579.913722,5696.370941,5473.58948,5556.561678,5570.955783
27534,5588.496733,5559.707831,5556.921123,5567.494508,5542.547482,5788.564407,5681.220795
27535,18539.01419,18585.315662,18690.825713,18540.038374,17815.695204,19133.043389,18712.89727
27536,4461.198006,4460.135571,4474.330698,4508.779351,4446.383701,4014.969884,4254.344876
