In [1]:
# ===================================================================
#  Library
# ===================================================================
import polars as pl
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Experiment-wide settings collected in a single namespace.

    The two directory strings are used by plain string concatenation in the
    cells below, which is why they end with a trailing slash.
    """

    # Identifier of this experiment; embedded in the names of the written CSVs.
    filename: str = "exp085"

    # Location of train.csv.
    data_dir: str = "G:/マイドライブ/signate_StudentCup2023/data/"
    # Location of the per-model OOF / submission CSVs, and of this notebook's outputs.
    save_dir: str = "G:/マイドライブ/signate_StudentCup2023/exp/"

    # NOTE(review): seed, n_trials and n_seeds are not referenced by any cell
    # visible in this notebook -- presumably left over from an optuna-based
    # variant; confirm before removing.
    seed: int = 42
    n_trials: int = 2000
    n_seeds: int = 3

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """Return the mean absolute percentage error, scaled to percent.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned element-wise with ``y_true``.

    Returns:
        ``mean_absolute_percentage_error(y_true, y_pred)`` multiplied by 100.
    """
    return 100 * mean_absolute_percentage_error(y_true, y_pred)

In [4]:
# ===================================================================
#  DataLoading
# ===================================================================
# Ground-truth target, keyed by id.
train = pl.read_csv(CFG.data_dir+"train.csv", columns=["id", "price"])

# Build one wide frame with a "pred_<n>" column per base model.
# The column index n defines the model order and must match the order used
# when the test-time predictions are assembled further down.
df = pl.read_csv(CFG.save_dir+"oof_df_exp072.csv", columns=["id", "pred"]).rename({"pred":f"pred_{0}"})
files = ["exp073", "exp074", "exp075", "exp076", ]
for i, f in enumerate(files, start=1):
    df = df.join(
        pl.read_csv(CFG.save_dir+f"oof_df_{f}.csv", columns=["id", "pred"]).rename({"pred":f"pred_{i}"}),
        on="id", how="left"
    )

# The "kun_*" OOF files use a different file-name pattern and store the
# prediction in an "oof_pred" column; their indices continue after `files`.
kun_files = ["kun_exp00061", "kun_exp00062"]
for i, f in enumerate(kun_files, start=len(files)+1):
    df = df.join(
        pl.read_csv(CFG.save_dir+f"{f}_oof_pred.csv", columns=["id", "oof_pred"]).rename({"oof_pred":f"pred_{i}"}),
        on="id", how="left",
    )

# Attach the target last so the frame reads: id, pred_0..pred_k, price.
df = df.join(
    train, on="id", how="left"
)

df.head()







id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,price
i64,f64,f64,f64,f64,f64,f64,f64,i64
0,8349.927994,8347.671992,8374.812648,8384.157015,8368.7128,8365.577188,8227.171,27587
1,3985.761719,4009.450083,3988.691655,3988.850911,3991.109782,4005.751997,3704.7322,4724
2,2918.185395,2933.057596,2921.001787,2921.707668,2917.474388,2941.156625,2834.8005,10931
3,8048.014373,8064.602372,8060.942061,8061.291081,8059.011798,8067.291115,8023.76,16553
4,4406.907983,4421.068587,4408.700347,4412.51081,4412.382296,4383.119601,4103.224,5158


In [5]:
# ===================================================================
#  simple greedy forward selection
# ===================================================================
# Start from the single model with the best CV score, then greedily blend
# the remaining models in, one at a time, in a seed-dependent random order.
#
# Only the per-model "pred_<n>" columns are candidates. Selecting "everything
# except id/price" would also pick up any extra column added to `df` later
# (e.g. a blended "pred" column) when this cell is re-run.
model_cols = [col for col in df.columns if col.startswith("pred_")]
y_true = df["price"]

scores = {col: get_score(y_true=y_true, y_pred=df[col]) for col in model_cols}

# The starting model and the weight grid do not depend on the seed.
selected_model = min(scores, key=scores.get)
remaining_models = [col for col in model_cols if col != selected_model]
N_ORDERINGS = 20                             # number of random blend orders tried
WEIGHT_GRID = np.arange(-0.5, 0.5, 0.01)     # candidate weights; 0.5 itself is excluded

BEST_SCORE = np.inf
for seed in range(N_ORDERINGS):
    best_preds = df[selected_model]
    best_score = scores[selected_model]

    stores = dict()
    orders = [selected_model]
    stores[selected_model] = 1    # keep the weight; 1 means "take this model as-is"

    # .tolist() turns numpy string scalars back into plain str so that the
    # keys of BEST_STORE / entries of BEST_ORDER are ordinary column names.
    filenames = np.random.RandomState(seed).permutation(remaining_models).tolist()

    for exp in filenames:
        # A weight of 0 leaves the running blend unchanged, i.e. the model is
        # skipped unless some weight strictly improves the score.
        best_weight = 0
        for w in WEIGHT_GRID:
            preds = best_preds * (1-w) + df[exp] * w
            score = get_score(y_true=y_true, y_pred=preds)
            if best_score > score:
                best_score = score
                best_weight = w
        stores[exp] = best_weight
        orders.append(exp)
        best_preds = best_preds * (1-best_weight) + df[exp] * best_weight
    print(seed, best_score)

    # Remember the ordering and weights of the best run for later replay.
    if BEST_SCORE > best_score:
        BEST_SCORE = best_score
        BEST_STORE = stores.copy()
        BEST_ORDER = orders.copy()

0 43.477784380190556
1 43.47779475700654
2 43.477813219957824
3 43.477774917616244
4 43.47776380559784
5 43.47778789717585
6 43.47777659224575
7 43.47779281747571
8 43.47780666668102
9 43.477779900688866
10 43.47779542567535
11 43.47777844087348
12 43.47780895850274
13 43.47777570652932
14 43.47778606501944
15 43.47779022278763
16 43.47779331458548
17 43.47778560827878
18 43.47778747685847
19 43.47781422894833


In [6]:
# ===================================================================
#  Check
# ===================================================================
def get_preds(df: "pl.DataFrame", order=None, store=None):
    """Replay a sequential blend on ``df`` and return the blended predictions.

    Models are folded in one after another with
    ``blend = blend * (1 - w) + df[model] * w``.
    The running blend starts at 0, so the first model is expected to carry a
    weight of 1 (as the greedy search stores it); the initial 0 is then
    discarded.

    Args:
        df: Frame (or any mapping) with one prediction column per model name.
        order: Model names in the order they are blended. Defaults to the
            global ``BEST_ORDER`` produced by the greedy search.
        store: Mapping from model name to blend weight. Defaults to the
            global ``BEST_STORE`` produced by the greedy search.

    Returns:
        The blended predictions; the integer 0 if ``order`` is empty.

    Raises:
        NameError: If a default is needed but the greedy-search cell has not
            been run yet.
        KeyError: If ``store`` has no weight for a name in ``order``.
    """
    # Resolve the defaults at call time so the latest search result is used.
    if order is None:
        order = BEST_ORDER
    if store is None:
        store = BEST_STORE

    best_preds = 0
    for exp in order:
        w = store[exp]
        best_preds = best_preds * (1 - w) + df[exp] * w
    return best_preds
    
# Replay the best blend once and reuse the result for both the score and the export.
oof_pred = get_preds(df)
print(get_score(y_true=df["price"], y_pred=oof_pred))

# Write the blend into a new frame instead of overwriting `df`: the selection
# cell reads its candidate models from `df`, so `df` must keep its original
# columns for that cell to stay re-runnable.
oof_df = df.with_columns(
    oof_pred.alias("pred")
)
display(oof_df.head())
oof_df.write_csv(CFG.save_dir+f"oof_df_{CFG.filename}.csv", has_header=True)

43.47776380559784


id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,price,pred
i64,f64,f64,f64,f64,f64,f64,f64,i64,f64
0,8349.927994,8347.671992,8374.812648,8384.157015,8368.7128,8365.577188,8227.171,27587,8318.123566
1,3985.761719,4009.450083,3988.691655,3988.850911,3991.109782,4005.751997,3704.7322,4724,3900.518082
2,2918.185395,2933.057596,2921.001787,2921.707668,2917.474388,2941.156625,2834.8005,10931,2903.188316
3,8048.014373,8064.602372,8060.942061,8061.291081,8059.011798,8067.291115,8023.76,16553,8052.270313
4,4406.907983,4421.068587,4408.700347,4412.51081,4412.382296,4383.119601,4103.224,5158,4287.762823


In [7]:
# ===================================================================
#  test
# ===================================================================
def _load_test_preds(csv_name, model_idx):
    """Read a header-less (id, prediction) CSV from CFG.save_dir as columns id / pred_<model_idx>."""
    raw = pl.read_csv(CFG.save_dir+csv_name, has_header=False)
    return raw.rename({"column_1":"id", "column_2":f"pred_{model_idx}"})


# The first model provides the id column (sorted); the rest are left-joined onto it.
# The file order here must mirror the pred_<n> numbering used for the OOF frame.
test = _load_test_preds("exp072.csv", 0).sort("id")
files = ["exp073.csv", "exp074.csv", "exp075.csv", "exp076.csv", "kun_exp00061.csv", "kun_exp00062.csv"]
for model_idx, csv_name in enumerate(files, start=1):
    test = test.join(_load_test_preds(csv_name, model_idx), on="id", how="left")
display(test.head())

# Apply the blend weights found on the OOF predictions to the test predictions.
blended = pl.Series(get_preds(test)).alias("pred")
test = test.with_columns(blended)
display(test.head())


# Submission file: id and blended prediction, written without a header row.
submission = test[["id", "pred"]]
submission.write_csv(CFG.save_dir+f"{CFG.filename}.csv", has_header=False)

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6
i64,f64,f64,f64,f64,f64,f64,f64
27532,9219.200678,9262.398861,9209.845453,9211.657631,9201.403646,9222.758846,8606.683365
27533,5555.610927,5570.955783,5546.121886,5550.776992,5544.537385,5569.564552,5412.844072
27534,5653.424701,5681.220795,5667.925276,5667.435425,5669.104669,5680.045525,5585.359034
27535,18752.937246,18712.89727,18594.4331,18620.599651,18594.588899,18570.819391,19174.202965
27536,4234.809532,4254.344876,4254.846798,4256.454343,4253.998513,4246.374616,4395.118442


id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred
i64,f64,f64,f64,f64,f64,f64,f64,f64
27532,9219.200678,9262.398861,9209.845453,9211.657631,9201.403646,9222.758846,8606.683365,9008.092612
27533,5555.610927,5570.955783,5546.121886,5550.776992,5544.537385,5569.564552,5412.844072,5513.912384
27534,5653.424701,5681.220795,5667.925276,5667.435425,5669.104669,5680.045525,5585.359034,5647.242869
27535,18752.937246,18712.89727,18594.4331,18620.599651,18594.588899,18570.819391,19174.202965,18780.93823
27536,4234.809532,4254.344876,4254.846798,4256.454343,4253.998513,4246.374616,4395.118442,4299.55308
