In [1]:
# ===================================================================
#  Library
# ===================================================================
import polars as pl
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Settings shared by the cells of this blending experiment."""

    # Experiment id; used to name the csv files this notebook writes.
    filename: str = "exp076"

    # Directory train.csv is read from.
    data_dir: str = "G:/マイドライブ/signate_StudentCup2023/data/"
    # Directory the per-experiment prediction csvs are read from and the
    # blended outputs are written to.
    save_dir: str = "G:/マイドライブ/signate_StudentCup2023/exp/"

    # Not referenced by the cells visible in this notebook.
    seed: int = 42
    n_seeds: int = 3
    n_trials: int = 2000

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``, times 100."""
    return mean_absolute_percentage_error(y_true, y_pred) * 100

In [4]:
# ===================================================================
#  DataLoading
# ===================================================================
# Target values, keyed by id.
train = pl.read_csv(CFG.data_dir + "train.csv", columns=["id", "price"])

# exp051's out-of-fold predictions form the base frame (column pred_0);
# every other model is left-joined onto it by id.
df = pl.read_csv(CFG.save_dir + "oof_df_exp051.csv").rename({"pred": "pred_0"})

files = ["exp052", "exp053", "exp054"]
# (csv name, name of the prediction column inside that csv), listed in the
# order in which they become pred_1, pred_2, ...
oof_sources = [(f"oof_df_{name}.csv", "pred") for name in files] + [
    ("kun_exp00055_oof_pred.csv", "optimized_pred"),
    ("kun_exp00056_oof_pred.csv", "oof_pred"),
]
for idx, (csv_name, pred_col) in enumerate(oof_sources, start=1):
    oof = pl.read_csv(CFG.save_dir + csv_name).rename({pred_col: f"pred_{idx}"})
    df = df.join(oof, on="id", how="left")

# Attach the target as the last column.
df = df.join(train, on="id", how="left")

df.head()

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,price
i64,f64,f64,f64,f64,f64,f64,i64
0,8759.290471,8929.546032,8916.322165,8977.757097,8805.219623,7895.416,27587
1,3859.910547,3825.508199,3822.104706,3876.047282,3799.844508,4177.358,4724
2,3027.760649,3049.493176,2982.516292,3056.648628,2959.861854,2818.1372,10931
3,7990.517289,8005.600906,7993.295606,7894.338559,7934.6545,8179.846,16553
4,4524.287997,4479.006617,4569.630683,4570.899182,4461.927715,4321.6885,5158


In [5]:
# ===================================================================
#  simple greedy forward selection
# ===================================================================
# Score each single model on the OOF predictions; the best one starts the blend.
# Candidates are chosen by their "pred_" prefix (instead of "everything except
# id/price") so that re-running this cell after a blended "pred" column has been
# added to `df` does not treat that blend as another base model.
model_cols = [col for col in df.columns if col.startswith("pred_")]
y_true = df["price"]
scores = {col: get_score(y_true=y_true, y_pred=df[col]) for col in model_cols}

# The starting model, its score and the weight grid are the same for every seed.
selected_model = min(scores, key=scores.get)
base_score = scores[selected_model]
weight_grid = np.arange(-0.5, 0.5, 0.01)  # candidate blend weights in [-0.5, 0.5)

BEST_SCORE = np.inf
# Reset so a result from a previous run of this cell cannot silently survive
# if no seed produces a score below inf.
BEST_STORE = None
BEST_ORDER = None
for seed in range(40, 60):
    best_preds = df[selected_model]
    best_score = base_score

    stores = {selected_model: 1}    # blend weight stored per model
    orders = [selected_model]       # order in which the models were blended

    # The greedy result depends on the order the models are tried in,
    # so the remaining models are shuffled with a different seed each run.
    filenames = [col for col in model_cols if col != selected_model]
    filenames = np.random.RandomState(seed).permutation(filenames)

    for exp in filenames:
        # Weight stays 0 (model left out) unless some grid weight strictly
        # improves the current best score.
        best_weight = 0
        for w in weight_grid:
            preds = best_preds * (1-w) + df[exp] * w
            score = get_score(y_true=y_true, y_pred=preds)
            if best_score > score:
                best_score = score
                best_weight = w
        stores[exp] = best_weight
        orders.append(exp)
        best_preds = best_preds * (1-best_weight) + df[exp] * best_weight
    print(seed, best_score)

    # Keep the weights and blending order of the best-scoring seed.
    if BEST_SCORE > best_score:
        BEST_SCORE = best_score
        BEST_STORE = stores.copy()
        BEST_ORDER = orders.copy()

40 43.49973597001526
41 43.499244631866475
42 43.50017787619886
43 43.498096596705466
44 43.49855093766878
45 43.50041718403783
46 43.49790678952483
47 43.498553070579
48 43.498654292859825
49 43.49882250483757
50 43.49799259278867
51 43.49813107062635
52 43.49855093766878
53 43.49817015447887
54 43.499059993641715
55 43.4993704630094
56 43.49863700852603
57 43.49867278983334
58 43.499716190736
59 43.498301543846


In [6]:
# Show the lowest blended score found across the seeded greedy runs above.
BEST_SCORE

43.49790678952483

In [7]:
# ===================================================================
#  Check
# ===================================================================
def get_preds(df: "pl.DataFrame", order=None, store=None):
    """Rebuild the blended prediction by replaying the greedy blending steps.

    Starting from 0, each model in ``order`` is folded in as
    ``blend = blend * (1 - w) + df[model] * w`` with ``w = store[model]``.
    The greedy search stores weight 1 for its first model, so after the first
    step the blend equals that model's predictions.

    Args:
        df: Frame with one prediction column per model name in ``order``.
        order: Model column names in the order they were blended. Defaults to
            the module-level ``BEST_ORDER``.
        store: Mapping from model column name to its blend weight. Defaults to
            the module-level ``BEST_STORE``.

    Returns:
        The blended predictions, or ``0`` when ``order`` is empty.
    """
    # Fall back to the search result held in the notebook's globals so that
    # existing ``get_preds(df)`` calls keep working unchanged.
    if order is None:
        order = BEST_ORDER
    if store is None:
        store = BEST_STORE

    best_preds = 0
    for exp in order:
        w = store[exp]
        best_preds = best_preds * (1 - w) + df[exp] * w
    return best_preds
    
# Replay the stored blend on the OOF frame once and reuse it below.
blended = get_preds(df)
print(get_score(y_true=df["price"], y_pred=blended))

# Attach the blend as "pred" and write the OOF frame for this experiment.
df = df.with_columns(pl.Series(blended).alias("pred"))
df.write_csv(CFG.save_dir + f"oof_df_{CFG.filename}.csv", has_header=True)

43.49790678952483


In [8]:
# ===================================================================
#  test
# ===================================================================
# Base frame: exp051's test predictions (headerless csv: id, prediction), sorted by id.
test = (
    pl.read_csv(CFG.save_dir + "exp051.csv", has_header=False)
    .rename({"column_1": "id", "column_2": "pred_0"})
    .sort("id")
)

# Remaining models, listed in the same order as the OOF columns pred_1 ... pred_5.
files = ["exp052.csv", "exp053.csv", "exp054.csv", "kun_exp00055.csv", "kun_exp00056.csv"]
for i, f in enumerate(files, start=1):
    other = pl.read_csv(CFG.save_dir + f, has_header=False).rename(
        {"column_1": "id", "column_2": f"pred_{i}"}
    )
    test = test.join(other, on="id", how="left")

# Apply the blending order and weights found on the OOF predictions to the test predictions.
test = test.with_columns(pl.Series(get_preds(test)).alias("pred"))
display(test.head())


# Write id and blended prediction without a header row.
test[["id", "pred"]].write_csv(CFG.save_dir + f"{CFG.filename}.csv", has_header=False)

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred
i64,f64,f64,f64,f64,f64,f64,f64
27532,9681.016732,9631.179775,9535.601879,9589.502294,9347.528622,8806.765109,9201.403646
27533,5589.905122,5625.972274,5579.913722,5696.370941,5473.58948,5556.561678,5544.537385
27534,5588.496733,5559.707831,5556.921123,5567.494508,5542.547482,5788.564407,5669.104669
27535,18539.01419,18585.315662,18690.825713,18540.038374,17815.695204,19133.043389,18594.588899
27536,4461.198006,4460.135571,4474.330698,4508.779351,4446.383701,4014.969884,4253.998513
