In [1]:
# ===================================================================
#  Library
# ===================================================================
import polars as pl
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Notebook-wide configuration: I/O locations, experiment name and run constants."""

    # --- locations (string prefixes; file names are appended with "+") ---
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"

    # --- experiment identity: used to name the files this notebook writes ---
    filename = "exp084"

    # --- run constants ---
    # NOTE(review): none of these three are referenced in the visible cells;
    # presumably carried over from an Optuna-based variant — confirm before removing.
    seed = 42
    n_seeds = 3
    n_trials = 2000

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """Return the MAPE of ``y_pred`` against ``y_true``, multiplied by 100.

    The value is ``mean_absolute_percentage_error(y_true, y_pred) * 100``;
    the rest of the notebook treats a lower score as better.
    """
    return mean_absolute_percentage_error(y_true, y_pred) * 100

In [4]:
# ===================================================================
#  DataLoading
# ===================================================================
# Ground truth: only the key and the target are needed here.
train = pl.read_csv(CFG.data_dir+"train.csv", columns=["id", "price"])

# Collect each model's out-of-fold predictions as columns pred_0, pred_1, ...
# all left-joined on "id" onto the exp072 frame (pred_0).
df = pl.read_csv(CFG.save_dir+"oof_df_exp072.csv", columns=["id", "pred"]).rename({"pred":f"pred_{0}"})
files = ["exp073", "exp074", "exp075", "exp076", ]
for i, f in enumerate(files, start=1):
    df = df.join(
        pl.read_csv(CFG.save_dir+f"oof_df_{f}.csv", columns=["id", "pred"]).rename({"pred":f"pred_{i}"}),
        on="id", how="left"
    )

# The kun_* files store their prediction under "oof_pred" and are read whole
# (no column filter). Numbering continues after the exp* files, so the two
# files become pred_{len(files)+1} and pred_{len(files)+2}.
kun_files = ["kun_exp00061", "kun_exp00062"]
for i, f in enumerate(kun_files, start=len(files)+1):
    df = df.join(
        pl.read_csv(CFG.save_dir+f"{f}_oof_pred.csv").rename({"oof_pred":f"pred_{i}"}),
        on="id", how="left",
    )

# Attach the target last so every prediction column sits next to "price".
df = df.join(
    train, on="id", how="left"
)

df.head()







id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,price
i64,f64,f64,f64,f64,f64,f64,f64,i64
0,8349.927994,8347.671992,8374.812648,8384.157015,8368.7128,8365.577188,8227.171,27587
1,3985.761719,4009.450083,3988.691655,3988.850911,3991.109782,4005.751997,3704.7322,4724
2,2918.185395,2933.057596,2921.001787,2921.707668,2917.474388,2941.156625,2834.8005,10931
3,8048.014373,8064.602372,8060.942061,8061.291081,8059.011798,8067.291115,8023.76,16553
4,4406.907983,4421.068587,4408.700347,4412.51081,4412.382296,4383.119601,4103.224,5158


In [5]:
# ===================================================================
#  Hill Climbing
# ===================================================================
# https://www.kaggle.com/competitions/feedback-prize-english-language-learning/discussion/369609


def evaluate(current_best_preds, k: int, best_score: float):
    """
    Find the blend weight for candidate ``models[k]`` that most improves the score.

    For every ``w`` in ``np.arange(-0.5, 0.5, 0.01)`` the blend
        preds = current_best_preds*(1-w) + df[models[k]]*w
    is scored against ``df["price"]`` with ``get_score``.

    Reads the notebook-level ``df`` and ``models``.

    Returns
    -------
    (score, weight)
        The lowest score found and the weight that produced it, if some weight
        scores strictly below ``best_score``; otherwise the incoming
        ``best_score`` together with weight 0.
    """
    # Both columns are the same for every weight, so look them up once.
    candidate = df[models[k]]
    target = df["price"]

    chosen_weight = 0
    for weight in np.arange(-0.5, 0.5, 0.01):
        blended = current_best_preds*(1-weight) + candidate*weight
        blended_score = get_score(y_true=target, y_pred=blended)
        # Strict comparison: a tie keeps the earlier (or zero) weight.
        if blended_score < best_score:
            best_score, chosen_weight = blended_score, weight
    return best_score, chosen_weight



stores = dict() # blend weight chosen for each model, keyed by column name
orders = []     # models in the order hill climbing blended them
scores = dict() # score of every candidate in the current round

# Candidates are the per-model prediction columns only. Selecting them by their
# "pred_" prefix (instead of "everything except id/price") keeps this cell
# re-runnable after the Check cell has added the blended "pred" column to df;
# otherwise the blend itself would be picked up as a candidate model.
models = [col for col in df.columns if col.startswith("pred_")]

# Start from the single model with the best (lowest) CV score
for col in models:
    scores[col] = get_score(y_true=df["price"], y_pred=df[col])
selected_model = min(scores, key=scores.get)
current_best_preds = df[selected_model]
orders.append(selected_model) # record the order
stores[selected_model] = 1    # the baseline enters the blend with weight 1

# Start Hill Climbing
i = 0
print(f"[{i}] baseline {selected_model} {min(scores.values())}")

while True:

    # Best score reached in the previous round
    best_score = min(scores.values())

    # Model that was chosen in the previous round
    selected_model = min(scores, key=scores.get)

    # Remove it from the candidates so it is not blended twice.
    # evaluate() indexes into this same list, so it must be updated before the scan below.
    models.remove(selected_model)

    # Stop once no candidates remain
    if len(models) == 0:
        break

    # For every remaining candidate, find its best blend weight against the current blend
    scores, weights = dict(), dict()
    for k, model in enumerate(models):
        scores[model], weights[model] = evaluate(current_best_preds, k, best_score)

    i += 1
    selected_model = min(scores, key=scores.get)
    print(f"[{i}] add {selected_model}: {min(scores.values())} {weights[selected_model]}")
    best_weight = weights[selected_model]
    stores[selected_model] = best_weight
    orders.append(selected_model)
    # Fold the chosen candidate into the running blend; a weight of 0 leaves it unchanged
    current_best_preds = current_best_preds*(1-best_weight) + df[selected_model]*best_weight

[0] baseline pred_5 43.496076310831896
[1] add pred_6: 43.47783056328021 0.36000000000000076
[2] add pred_2: 43.477814076406915 0.03000000000000047
[3] add pred_0: 43.47779857218461 -0.029999999999999583
[4] add pred_4: 43.477784298044014 0.03000000000000047
[5] add pred_1: 43.47778313918253 -0.009999999999999565
[6] add pred_3: 43.47778313918253 0


In [6]:
# ===================================================================
#  Check
# ===================================================================
def get_preds(df: pl.DataFrame):
    """Rebuild the hill-climbing blend from the prediction columns of ``df``.

    Walks the notebook-level ``orders`` list and, for each model name, folds
    ``df[name]`` into the running blend with the weight stored in ``stores``:
        blend = blend * (1 - weight) + df[name] * weight
    The running blend starts at 0.
    """
    blended = 0
    for name in orders:
        weight = stores[name]
        blended = blended * (1-weight) + df[name] * weight
    return blended
    
# Rebuild the blend on the OOF frame once and reuse it for both the score and the saved column.
oof_blend = get_preds(df)
print(get_score(y_true=df["price"], y_pred=oof_blend))

# Store the blended OOF prediction as "pred" and save the whole frame.
df = df.with_columns(
    pl.Series(oof_blend).alias("pred")
)
df.write_csv(CFG.save_dir+f"oof_df_{CFG.filename}.csv", has_header=True)
display(df.head())

43.47778313918253


id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,price,pred
i64,f64,f64,f64,f64,f64,f64,f64,i64,f64
0,8349.927994,8347.671992,8374.812648,8384.157015,8368.7128,8365.577188,8227.171,27587,8317.819955
1,3985.761719,4009.450083,3988.691655,3988.850911,3991.109782,4005.751997,3704.7322,4724,3899.270705
2,2918.185395,2933.057596,2921.001787,2921.707668,2917.474388,2941.156625,2834.8005,10931,2903.107855
3,8048.014373,8064.602372,8060.942061,8061.291081,8059.011798,8067.291115,8023.76,16553,8052.10224
4,4406.907983,4421.068587,4408.700347,4412.51081,4412.382296,4383.119601,4103.224,5158,4285.073911


In [7]:
# ===================================================================
#  test
# ===================================================================
# Header-less prediction files: column_1 is the id, column_2 the prediction.
# exp072 becomes pred_0 and is the frame the other files are left-joined onto.
test = (
    pl.read_csv(CFG.save_dir+"exp072.csv", has_header=False)
    .rename({"column_1":"id", "column_2":f"pred_{0}"})
    .sort("id")
)
files = ["exp073.csv", "exp074.csv", "exp075.csv", "exp076.csv", "kun_exp00061.csv", "kun_exp00062.csv"]
for i, f in enumerate(files, start=1):
    # The file at position i becomes pred_{i}, the column name get_preds() looks up.
    model_pred = pl.read_csv(CFG.save_dir+f, has_header=False).rename({"column_1":"id", "column_2":f"pred_{i}"})
    test = test.join(model_pred, on="id", how="left")
display(test.head())

# Apply the weights found on the OOF predictions to the test predictions.
test = test.with_columns(
    pl.Series(get_preds(test)).alias("pred")
)
display(test.head())


# Written without a header, in the same two-column layout as the files read above.
test[["id", "pred"]].write_csv(CFG.save_dir+f"{CFG.filename}.csv", has_header=False)

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6
i64,f64,f64,f64,f64,f64,f64,f64
27532,9219.200678,9262.398861,9209.845453,9211.657631,9201.403646,9222.758846,8606.683365
27533,5555.610927,5570.955783,5546.121886,5550.776992,5544.537385,5569.564552,5412.844072
27534,5653.424701,5681.220795,5667.925276,5667.435425,5669.104669,5680.045525,5585.359034
27535,18752.937246,18712.89727,18594.4331,18620.599651,18594.588899,18570.819391,19174.202965
27536,4234.809532,4254.344876,4254.846798,4256.454343,4253.998513,4246.374616,4395.118442


id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred
i64,f64,f64,f64,f64,f64,f64,f64,f64
27532,9219.200678,9262.398861,9209.845453,9211.657631,9201.403646,9222.758846,8606.683365,9004.339701
27533,5555.610927,5570.955783,5546.121886,5550.776992,5544.537385,5569.564552,5412.844072,5513.268441
27534,5653.424701,5681.220795,5667.925276,5667.435425,5669.104669,5680.045525,5585.359034,5646.752652
27535,18752.937246,18712.89727,18594.4331,18620.599651,18594.588899,18570.819391,19174.202965,18778.098086
27536,4234.809532,4254.344876,4254.846798,4256.454343,4253.998513,4246.374616,4395.118442,4299.535846
