In [1]:
# ===================================================================
#  Library
# ===================================================================
import polars as pl
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    seed = 42
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    filename = "exp082"
    n_trials = 2000
    n_seeds = 3

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """get MAPE score"""
    score = mean_absolute_percentage_error(y_true, y_pred)
    return score * 100

In [4]:
# ===================================================================
#  DataLoading
# ===================================================================
train = pl.read_csv(CFG.data_dir+"train.csv", columns=["id", "price"])

df = pl.read_csv(CFG.save_dir+"oof_df_exp066.csv").rename({"pred":f"pred_{0}"})
files = ["exp067", "exp068", "exp069", "exp070", "exp071", "exp077"]
for i, f in enumerate(files, start=1):
    df = df.join(
        pl.read_csv(CFG.save_dir+f"oof_df_{f}.csv").rename({"pred":f"pred_{i}"}),
        on="id", how="left"
    )
    
df = df.join(
    pl.read_csv(CFG.save_dir+"kun_exp00060_oof_pred.csv").rename({'oof_pred':f'pred_{len(files)+1}'}),
    on="id", how="left",
)

df = df.join(
    train, on="id", how="left"
)

df.head()

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,price
i64,f64,f64,f64,f64,f64,f64,f64,f64,i64
0,8854.693376,9152.341955,8780.223598,8955.701853,8946.926107,9196.765168,7579.777693,9322.824825,27587
1,3869.563539,3864.146063,3878.933184,3892.975519,3870.646555,3885.20529,4001.311742,3891.99762,4724
2,3020.466736,2996.637993,3037.993605,3035.039709,3021.979636,3060.125866,2899.577998,2971.556923,10931
3,8014.619227,8068.601962,8042.945716,7945.924737,7996.75402,8007.789593,8312.877683,7959.507481,16553
4,4522.686418,4480.841914,4437.256123,4493.480177,4469.191551,4537.125697,4522.498509,4375.457013,5158


In [5]:
# ===================================================================
#  optuna
# ===================================================================

def objective(trial):
    a = trial.suggest_float("a", 1e-8, 1, log=True)
    b = trial.suggest_float("b", 1e-8, 1, log=True)
    c = trial.suggest_float("c", 1e-8, 1, log=True)
    d = trial.suggest_float("d", 1e-8, 1, log=True)
    e = trial.suggest_float("e", 1e-8, 1, log=True)
    f = trial.suggest_float("f", 1e-8, 1, log=True)
    g = trial.suggest_float("g", 1e-8, 1, log=True)
    
    
    prediction = df["pred_1"] * a +\
                 df["pred_2"] * b +\
                 df["pred_3"] * c +\
                 df["pred_4"] * d +\
                 df["pred_5"] * e +\
                 df["pred_6"] * f +\
                 df["pred_7"] * g
                 
    score = get_score(y_true=df["price"], y_pred = prediction)
    return score
    
optuna.logging.set_verbosity(optuna.logging.WARNING)

# シードのリストを定義
seeds = [seed for seed in range(CFG.seed, CFG.seed+CFG.n_seeds)]

best_values = []
best_params_list = []

for seed in seeds:    
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=seed)
    )
    study.optimize(objective, 
                   n_trials=CFG.n_trials, 
                   n_jobs = -1,
                   show_progress_bar=True)
    
    best_value = study.best_value
    best_params = study.best_params
    
    best_values.append(best_value)
    best_params_list.append(best_params)
    
    print(f"Seed: {seed}, Best Value: {best_value}, Best Params: {best_params}")

  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 42, Best Value: 43.525503530105766, Best Params: {'a': 1.1491278929972336e-06, 'b': 0.013838507482314568, 'c': 2.0637491975880085e-06, 'd': 0.003622514686679603, 'e': 0.11763720782459204, 'f': 0.3239552696152092, 'g': 0.5307991225206081}


  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 43, Best Value: 43.52905646321788, Best Params: {'a': 0.0007273900719321455, 'b': 3.073541550310323e-06, 'c': 0.18621387488202734, 'd': 0.30468431213892283, 'e': 4.243778574391451e-05, 'f': 0.5063815406770792, 'g': 1.934561379033801e-08}


  0%|          | 0/2000 [00:00<?, ?it/s]

Seed: 44, Best Value: 43.56560117302641, Best Params: {'a': 0.005806324693532473, 'b': 0.6913803397038273, 'c': 0.058723829628181505, 'd': 0.0007486468844864078, 'e': 0.1876085628120509, 'f': 0.05236276016706091, 'g': 4.6471261193156724e-07}


In [6]:
# 最も小さい best_value を持つ Study を探索
best_index = np.argmin(best_values)
best_params_final = best_params_list[best_index]
best_value_final = best_values[best_index]

print("Final Best Value:", best_value_final)
print("Final Best Params:", best_params_final)

Final Best Value: 43.525503530105766
Final Best Params: {'a': 1.1491278929972336e-06, 'b': 0.013838507482314568, 'c': 2.0637491975880085e-06, 'd': 0.003622514686679603, 'e': 0.11763720782459204, 'f': 0.3239552696152092, 'g': 0.5307991225206081}


In [None]:
# ===================================================================
#  test
# ===================================================================
prediction = df["pred_1"] * best_params_final["a"] +\
                df["pred_2"] * best_params_final["b"] +\
                df["pred_3"] * best_params_final["c"] +\
                df["pred_4"] * best_params_final["d"] +\
                df["pred_5"] * best_params_final["e"] +\
                df["pred_6"] * best_params_final["f"] +\
                df["pred_7"] * best_params_final["g"]

In [7]:
# ===================================================================
#  test
# ===================================================================
test = pl.read_csv(CFG.save_dir+f"exp066.csv", has_header=False).rename({"column_1":"id", "column_2":"pred"}).sort("id").rename({"pred":f"pred_{0}"})
files = ["exp067.csv", "exp068.csv", "exp069.csv", "exp070.csv", "exp071.csv", "exp077.csv", "kun_exp00060.csv"]
for i, f in enumerate(files, start=1):
    test = test.join(
        pl.read_csv(CFG.save_dir+f, has_header=False).rename({"column_1":"id", "column_2":"pred"}).rename({"pred":f"pred_{i}"}),
        on="id", how="left"
    )
display(test.head())

prediction = test["pred_1"] * best_params_final["a"] +\
                 test["pred_2"] * best_params_final["b"] +\
                 test["pred_3"] * best_params_final["c"] +\
                 test["pred_4"] * best_params_final["d"] +\
                 test["pred_5"] * best_params_final["e"] +\
                 test["pred_6"] * best_params_final["f"] +\
                 test["pred_7"] * best_params_final["g"]                     
                 
test = test.with_columns(
    prediction.alias("pred")
)
                     
test[["id", "pred"]].write_csv(CFG.save_dir+f"{CFG.filename}.csv", has_header=False)
test[["id", "pred"]].head(2)

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7
i64,f64,f64,f64,f64,f64,f64,f64,f64
27532,9673.031004,9645.690291,9773.740233,9543.305631,9574.334533,9774.45317,9017.561685,9456.567691
27533,5585.545628,5611.390293,5625.333842,5745.546668,5722.954729,5762.007883,5339.376994,5475.010419
27534,5595.254657,5581.507724,5572.766942,5592.961109,5618.968955,5606.688175,5958.715261,5480.674549
27535,18194.779659,18336.045587,18254.450168,18366.815171,18448.417408,18256.398339,18936.869485,18054.427058
27536,4416.241293,4401.738676,4296.607046,4481.441494,4280.70203,4366.343344,4107.333654,4255.354923


id,pred
i64,f64
27532,9260.631762
27533,5412.272576
