In [1]:
# ===================================================================
#  Library
# ===================================================================
import polars as pl
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    seed = 42
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    filename = "exp083"
    n_trials = 2000
    n_seeds = 3

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100

In [4]:
# ===================================================================
#  DataLoading
# ===================================================================
train = pl.read_csv(CFG.data_dir+"train.csv", columns=["id", "price"])

df = pl.read_csv(CFG.save_dir+"oof_df_exp072.csv", columns=["id", "pred"]).rename({"pred":f"pred_{0}"})
files = ["exp073", "exp074", "exp075", "exp076", ]
for i, f in enumerate(files, start=1):
    print()
    df = df.join(
        pl.read_csv(CFG.save_dir+f"oof_df_{f}.csv", columns=["id", "pred"]).rename({"pred":f"pred_{i}"}),
        on="id", how="left"
    )
    
df = df.join(
    pl.read_csv(CFG.save_dir+"kun_exp00061_oof_pred.csv").rename({'oof_pred':f'pred_{len(files)+1}'}),
    on="id", how="left",
)
df = df.join(
    pl.read_csv(CFG.save_dir+"kun_exp00062_oof_pred.csv").rename({'oof_pred':f'pred_{len(files)+2}'}),
    on="id", how="left",
)

df = df.join(
    train, on="id", how="left"
)

df.head()







id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,price
i64,f64,f64,f64,f64,f64,f64,f64,i64
0,8349.927994,8347.671992,8374.812648,8384.157015,8368.7128,8365.577188,8227.171,27587
1,3985.761719,4009.450083,3988.691655,3988.850911,3991.109782,4005.751997,3704.7322,4724
2,2918.185395,2933.057596,2921.001787,2921.707668,2917.474388,2941.156625,2834.8005,10931
3,8048.014373,8064.602372,8060.942061,8061.291081,8059.011798,8067.291115,8023.76,16553
4,4406.907983,4421.068587,4408.700347,4412.51081,4412.382296,4383.119601,4103.224,5158


In [5]:
# ===================================================================
#  optuna
# ===================================================================

def objective(trial):
    a = trial.suggest_float("a", 1e-8, 1, log=True)
    b = trial.suggest_float("b", 1e-8, 1, log=True)
    c = trial.suggest_float("c", 1e-8, 1, log=True)
    d = trial.suggest_float("d", 1e-8, 1, log=True)
    e = trial.suggest_float("e", 1e-8, 1, log=True)
    f = trial.suggest_float("f", 1e-8, 1, log=True)
    g = trial.suggest_float("g", 1e-8, 1, log=True)
    
    
    prediction = df["pred_0"] * a +\
                 df["pred_1"] * b +\
                 df["pred_2"] * c +\
                 df["pred_3"] * d +\
                 df["pred_4"] * e +\
                 df["pred_5"] * f +\
                 df["pred_6"] * g
                 
    score = get_score(y_true=df["price"], y_pred = prediction)
    return score
    
optuna.logging.set_verbosity(optuna.logging.WARNING)

# シードのリストを定義
seeds = [seed for seed in range(CFG.seed, CFG.seed+CFG.n_seeds)]

best_values = []
best_params_list = []

for seed in seeds:    
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=seed)
    )
    study.optimize(objective, 
                   n_trials=CFG.n_trials, 
                   n_jobs = -1,
                   show_progress_bar=True)
    
    best_value = study.best_value
    best_params = study.best_params
    
    best_values.append(best_value)
    best_params_list.append(best_params)
    
    print(f"Seed: {seed}, Best Value: {best_value}, Best Params: {best_params}")

  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 42, Best Value: 43.478900422112574, Best Params: {'a': 1.9815385598537837e-05, 'b': 0.00016814417224199918, 'c': 0.00017051349872371503, 'd': 0.6991764685212293, 'e': 3.0122240712575534e-05, 'f': 1.2696213634667646e-05, 'g': 0.3019175632077489}


  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 43, Best Value: 43.47844292303129, Best Params: {'a': 2.189497664579112e-05, 'b': 0.008027233954657872, 'c': 0.6985160054090418, 'd': 1.77739840521753e-05, 'e': 5.7256427706773924e-05, 'f': 0.0001247399351806612, 'g': 0.2932941262509666}


  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 44, Best Value: 43.49546063371665, Best Params: {'a': 0.00010228621727726122, 'b': 0.06603857759829963, 'c': 0.8639204998134645, 'd': 6.353410800293052e-07, 'e': 2.6196529391739106e-05, 'f': 3.8418362629853626e-06, 'g': 0.06213669925164217}


In [6]:
# 最も小さい best_value を持つ Study を探索
best_index = np.argmin(best_values)
best_params_final = best_params_list[best_index]
best_value_final = best_values[best_index]

print("Final Best Value:", best_value_final)
print("Final Best Params:", best_params_final)

Final Best Value: 43.47844292303129
Final Best Params: {'a': 2.189497664579112e-05, 'b': 0.008027233954657872, 'c': 0.6985160054090418, 'd': 1.77739840521753e-05, 'e': 5.7256427706773924e-05, 'f': 0.0001247399351806612, 'g': 0.2932941262509666}


In [7]:
# ===================================================================
#  oof_df
# ===================================================================
prediction = df["pred_0"] * best_params_final["a"] +\
             df["pred_1"] * best_params_final["b"] +\
             df["pred_2"] * best_params_final["c"] +\
             df["pred_3"] * best_params_final["d"] +\
             df["pred_4"] * best_params_final["e"] +\
             df["pred_5"] * best_params_final["f"]

df = df.with_columns(
    pl.Series(prediction).alias("pred")
)
df[["id", "pred"]].write_csv(CFG.save_dir+f"oof_df_{CFG.filename}.csv", has_header=True)
display(df.head())

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,price,pred
i64,f64,f64,f64,f64,f64,f64,f64,i64,f64
0,8349.927994,8347.671992,8374.812648,8384.157015,8368.7128,8365.577188,8227.171,27587,5918.803918
1,3985.761719,4009.450083,3988.691655,3988.850911,3991.109782,4005.751997,3704.7322,4724,2819.236115
2,2918.185395,2933.057596,2921.001787,2921.707668,2917.474388,2941.156625,2834.8005,10931,2064.560587
3,8048.014373,8064.602372,8060.942061,8061.291081,8059.011798,8067.291115,8023.76,16553,5697.220734
4,4406.907983,4421.068587,4408.700347,4412.51081,4412.382296,4383.119601,4103.224,5158,3116.011012


In [8]:
# ===================================================================
#  test
# ===================================================================
test = pl.read_csv(CFG.save_dir+f"exp072.csv", has_header=False).rename({"column_1":"id", "column_2":"pred"}).sort("id").rename({"pred":f"pred_{0}"})
files = ["exp073.csv", "exp074.csv", "exp075.csv", "exp076.csv", "kun_exp00061.csv", "kun_exp00062.csv"]
for i, f in enumerate(files, start=1):
    test = test.join(
        pl.read_csv(CFG.save_dir+f, has_header=False).rename({"column_1":"id", "column_2":"pred"}).rename({"pred":f"pred_{i}"}),
        on="id", how="left"
    )
display(test.head())

prediction = test["pred_0"] * best_params_final["a"] +\
                 test["pred_1"] * best_params_final["b"] +\
                 test["pred_2"] * best_params_final["c"] +\
                 test["pred_3"] * best_params_final["d"] +\
                 test["pred_4"] * best_params_final["e"] +\
                 test["pred_5"] * best_params_final["f"]
                 
test = test.with_columns(
    prediction.alias("pred")
)
                     
test[["id", "pred"]].write_csv(CFG.save_dir+f"{CFG.filename}.csv", has_header=False)
test[["id", "pred"]].head(2)

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6
i64,f64,f64,f64,f64,f64,f64,f64
27532,9219.200678,9262.398861,9209.845453,9211.657631,9201.403646,9222.758846,8606.683365
27533,5555.610927,5570.955783,5546.121886,5550.776992,5544.537385,5569.564552,5412.844072
27534,5653.424701,5681.220795,5667.925276,5667.435425,5669.104669,5680.045525,5585.359034
27535,18752.937246,18712.89727,18594.4331,18620.599651,18594.588899,18570.819391,19174.202965
27536,4234.809532,4254.344876,4254.846798,4256.454343,4253.998513,4246.374616,4395.118442


id,pred
i64,f64
27532,6509.618767
27533,3920.006778
