In [1]:
# ===================================================================
#  Library
# ===================================================================
import polars as pl
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    seed = 42
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    filename = "exp072"
    n_trials = 2000
    n_seeds = 3

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100

In [4]:
# ===================================================================
#  DataLoading
# ===================================================================
train = pl.read_csv(CFG.data_dir+"train.csv", columns=["id", "price"])

df = pl.read_csv(CFG.save_dir+"oof_df_exp051.csv").rename({"pred":f"pred_{0}"})
files = ["exp052", "exp053", "exp054"]
for i, f in enumerate(files, start=1):
    df = df.join(
        pl.read_csv(CFG.save_dir+f"oof_df_{f}.csv").rename({"pred":f"pred_{i}"}),
        on="id", how="left"
    )
    
df = df.join(
    pl.read_csv(CFG.save_dir+"kun_exp00055_oof_pred.csv").rename({'optimized_pred':f'pred_{len(files)+1}'}),
    on="id", how="left",
)

df = df.join(
    pl.read_csv(CFG.save_dir+"kun_exp00056_oof_pred.csv").rename({"oof_pred": f'pred_{len(files)+2}'}),
    on="id", how="left",
)

df = df.join(
    train, on="id", how="left"
)

df.head()

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,price
i64,f64,f64,f64,f64,f64,f64,i64
0,8759.290471,8929.546032,8916.322165,8977.757097,8805.219623,7895.416,27587
1,3859.910547,3825.508199,3822.104706,3876.047282,3799.844508,4177.358,4724
2,3027.760649,3049.493176,2982.516292,3056.648628,2959.861854,2818.1372,10931
3,7990.517289,8005.600906,7993.295606,7894.338559,7934.6545,8179.846,16553
4,4524.287997,4479.006617,4569.630683,4570.899182,4461.927715,4321.6885,5158


In [5]:
# ===================================================================
#  optuna
# ===================================================================

def objective(trial):
    a = trial.suggest_float("a", 1e-8, 1, log=True)
    b = trial.suggest_float("b", 1e-8, 1, log=True)
    c = trial.suggest_float("c", 1e-8, 1, log=True)
    d = trial.suggest_float("d", 1e-8, 1, log=True)
    e = trial.suggest_float("e", 1e-8, 1, log=True)
    
    prediction = df["pred_1"] * a +\
                 df["pred_2"] * b +\
                 df["pred_3"] * c +\
                 df["pred_4"] * d +\
                 df["pred_5"] * e
                      
                      
    score = get_score(y_true=df["price"], y_pred = prediction)
    return score
    
optuna.logging.set_verbosity(optuna.logging.WARNING)

# シードのリストを定義
seeds = [seed for seed in range(CFG.seed, CFG.seed+CFG.n_seeds)]

best_values = []
best_params_list = []

for seed in seeds:    
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=seed)
    )
    study.optimize(objective, 
                   n_trials=CFG.n_trials, 
                   n_jobs = -1,
                   show_progress_bar=True)
    
    best_value = study.best_value
    best_params = study.best_params
    
    best_values.append(best_value)
    best_params_list.append(best_params)
    
    print(f"Seed: {seed}, Best Value: {best_value}, Best Params: {best_params}")

  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 42, Best Value: 43.50338414202693, Best Params: {'a': 2.6232632697666626e-06, 'b': 1.1307496065858793e-08, 'c': 0.5322687710563295, 'd': 3.1460733405567695e-05, 'e': 0.4601138079833096}


  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 43, Best Value: 43.5130476309392, Best Params: {'a': 0.36017774883382503, 'b': 7.585869331114354e-07, 'c': 1.2918768218298221e-07, 'd': 0.006442472449752265, 'e': 0.6303799178933577}


  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 44, Best Value: 43.52500048336675, Best Params: {'a': 0.6978286559066503, 'b': 0.07217307163957125, 'c': 0.00015151108575444153, 'd': 0.00036326219769887, 'e': 0.23465710995989286}


In [6]:
# 最も小さい best_value を持つ Study を探索
best_index = np.argmin(best_values)
best_params_final = best_params_list[best_index]
best_value_final = best_values[best_index]

print("Final Best Value:", best_value_final)
print("Final Best Params:", best_params_final)

Final Best Value: 43.50338414202693
Final Best Params: {'a': 2.6232632697666626e-06, 'b': 1.1307496065858793e-08, 'c': 0.5322687710563295, 'd': 3.1460733405567695e-05, 'e': 0.4601138079833096}


In [7]:
# ===================================================================
#  test
# ===================================================================
test = pl.read_csv(CFG.save_dir+f"exp051.csv", has_header=False).rename({"column_1":"id", "column_2":"pred"}).sort("id").rename({"pred":f"pred_{0}"})
files = ["exp052.csv", "exp053.csv", "exp054.csv", "kun_exp00055.csv", "kun_exp00056.csv"]
for i, f in enumerate(files, start=1):
    test = test.join(
        pl.read_csv(CFG.save_dir+f, has_header=False).rename({"column_1":"id", "column_2":"pred"}).rename({"pred":f"pred_{i}"}),
        on="id", how="left"
    )
display(test.head())

prediction = test["pred_1"] * best_params_final["a"] +\
                 test["pred_2"] * best_params_final["b"] +\
                 test["pred_3"] * best_params_final["c"] +\
                 test["pred_4"] * best_params_final["d"] +\
                 test["pred_5"] * best_params_final["e"]
                 
test = test.with_columns(
    prediction.alias("pred")
)
                     
test[["id", "pred"]].write_csv(CFG.save_dir+f"{CFG.filename}.csv", has_header=False)
test[["id", "pred"]].head(2)

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5
i64,f64,f64,f64,f64,f64,f64
27532,9681.016732,9631.179775,9535.601879,9589.502294,9347.528622,8806.765109
27533,5589.905122,5625.972274,5579.913722,5696.370941,5473.58948,5556.561678
27534,5588.496733,5559.707831,5556.921123,5567.494508,5542.547482,5788.564407
27535,18539.01419,18585.315662,18690.825713,18540.038374,17815.695204,19133.043389
27536,4461.198006,4460.135571,4474.330698,4508.779351,4446.383701,4014.969884


id,pred
i64,f64
27532,9156.626285
27533,5588.838138
