In [1]:
# ===================================================================
#  Library
# ===================================================================
import polars as pl
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.simplefilter("ignore")

import optuna
import numpy as np

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Notebook-wide settings: experiment name, I/O locations and tuning knobs."""

    # --- experiment identity -------------------------------------------
    # Used to name the files this notebook writes
    # (oof_df_<filename>.csv and <filename>.csv under save_dir).
    filename = "exp078"

    # --- I/O locations (string prefixes; paths are built with `+`) -----
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"

    # --- knobs not referenced by the cells visible in this notebook ----
    seed = 42
    n_seeds = 3
    n_trials = 2000

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def get_score(y_true, y_pred):
    """Return the MAPE of `y_pred` against `y_true`, expressed in percent.

    The fraction returned by sklearn's `mean_absolute_percentage_error`
    is multiplied by 100.
    """
    mape_fraction = mean_absolute_percentage_error(y_true, y_pred)
    return mape_fraction * 100

In [4]:
# ===================================================================
#  DataLoading
# ===================================================================
# Ground truth: only the id and the target price are read from train.csv.
train = pl.read_csv(CFG.data_dir+"train.csv", columns=["id", "price"])

# Out-of-fold predictions, one column per model:
#   pred_0            <- exp066 (base frame that everything is left-joined onto)
#   pred_1 .. pred_6  <- the experiments listed in `files`, in list order
#   pred_7            <- kun_exp00060 (its prediction column is named `oof_pred`)
files = ["exp067", "exp068", "exp069", "exp070", "exp071", "exp077"]
df = pl.read_csv(CFG.save_dir+"oof_df_exp066.csv").rename({"pred":f"pred_{0}"})

for i, f in enumerate(files, start=1):
    oof_i = pl.read_csv(CFG.save_dir+f"oof_df_{f}.csv").rename({"pred":f"pred_{i}"})
    df = df.join(oof_i, on="id", how="left")

# Attach the remaining model's OOF column, then the target, both keyed on id.
df = (
    df
    .join(
        pl.read_csv(CFG.save_dir+"kun_exp00060_oof_pred.csv").rename({'oof_pred':f'pred_{len(files)+1}'}),
        on="id", how="left",
    )
    .join(train, on="id", how="left")
)

df.head()

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,price
i64,f64,f64,f64,f64,f64,f64,f64,f64,i64
0,8854.693376,9152.341955,8780.223598,8955.701853,8946.926107,9196.765168,7579.777693,9322.824825,27587
1,3869.563539,3864.146063,3878.933184,3892.975519,3870.646555,3885.20529,4001.311742,3891.99762,4724
2,3020.466736,2996.637993,3037.993605,3035.039709,3021.979636,3060.125866,2899.577998,2971.556923,10931
3,8014.619227,8068.601962,8042.945716,7945.924737,7996.75402,8007.789593,8312.877683,7959.507481,16553
4,4522.686418,4480.841914,4437.256123,4493.480177,4469.191551,4537.125697,4522.498509,4375.457013,5158


In [5]:
# ===================================================================
#  simple greedy forward selection
# ===================================================================
# Score every prediction column on its own; the single model with the best
# (lowest) CV score is the starting point of every search run below.
scores = dict()
for col in df.columns:
    if col not in ["id", "price"]:
        scores[col] = get_score(y_true=df["price"], y_pred=df[col])    
        
# Best blended score over all runs. BEST_STORE / BEST_ORDER (the weights and the
# visiting order that produced it) are only bound once a run beats this value.
BEST_SCORE = np.inf
# Each seed yields a different random visiting order of the remaining models;
# the result of the sequential search depends on that order.
for seed in range(20):    
    # Start from the best single model and its stand-alone score.
    selected_model = min(scores, key=scores.get)
    best_preds = df[selected_model]
    best_score = min(scores.values())
    
    stores = dict()              # model column -> blending weight applied at its step
    orders = [selected_model]    # models in the order their weights must be applied
    stores[selected_model] = 1    # store the weight
    

    # Remaining prediction columns, shuffled deterministically by `seed`.
    filenames = [col for col in df.columns if col not in ["id", "price", selected_model]]
    filenames = np.random.RandomState(seed).permutation(filenames)

    for exp in filenames:
        # Line search over the weight of `exp`: the running blend gets (1 - w)
        # and `exp` gets w, so the two coefficients always sum to 1.
        # Weight 0 (blend left unchanged) is kept if no candidate improves on
        # the running best score.
        best_weight = 0
        for w in np.arange(-0.5, 0.5, 0.01):  # candidates start at -0.5 (negative weights allowed); np.arange excludes the stop value
            preds = best_preds * (1-w) + df[exp] * w
            score = get_score(y_true=df["price"], y_pred=preds)
            if best_score > score:
                best_score = score
                best_weight = w
        stores[exp] = best_weight
        orders.append(exp)
        # Commit the chosen weight before moving on to the next model.
        best_preds = best_preds * (1-best_weight) + df[exp] * best_weight
    print(seed, best_score)
    
    # Keep the weights and order of the best run; get_preds replays them later.
    if BEST_SCORE > best_score:
        BEST_SCORE = best_score
        BEST_STORE = stores.copy()
        BEST_ORDER = orders.copy()

0 43.5129951467611
1 43.51283284004883
2 43.515790787005436
3 43.513397322384556
4 43.51380087355155
5 43.512878829864
6 43.51473832238349
7 43.51457656656957
8 43.5144506431161
9 43.51453462214112
10 43.51581047368647
11 43.51550374393488
12 43.512634988062615
13 43.514022347633684
14 43.51309589526648
15 43.51541725985467
16 43.51389552117487
17 43.51425081675565
18 43.513768436040074
19 43.51340867332813


In [6]:
# ===================================================================
#  Check
# ===================================================================
def get_preds(df: "pl.DataFrame", order=None, store=None):
    """Rebuild a blended prediction by replaying sequential blending weights.

    Starting from 0, each column named in `order` is folded in as
    ``blend = blend * (1 - w) + df[col] * w`` with ``w = store[col]``.
    With a weight of 1 for the first column the blend starts as exactly that
    column, which is how the weight search records its starting model.

    Args:
        df: frame (or any mapping) indexable by column name whose values
            support ``*`` and ``+`` with numbers.
        order: column names in the order their weights must be applied.
            Defaults to the notebook-level ``BEST_ORDER``.
        store: mapping of column name to weight. Defaults to the
            notebook-level ``BEST_STORE``.

    Returns:
        The blended prediction (same type as ``df[col] * w``), or 0 when
        `order` is empty.
    """
    # Resolve the defaults at call time so that re-running the search cell
    # is picked up without redefining this function.
    if order is None:
        order = BEST_ORDER
    if store is None:
        store = BEST_STORE

    best_preds = 0
    for exp in order:
        w = store[exp]
        best_preds = best_preds * (1-w) + df[exp] * w
    return best_preds
    
# Re-score the blend rebuilt by get_preds on the OOF frame; it is expected to
# equal BEST_SCORE, since get_preds replays the weights found by the search.
get_score(
    y_true=df["price"],
    y_pred=get_preds(df),
)

43.512634988062615

In [7]:
# ===================================================================
#  oof_df
# ===================================================================
# Build the blended OOF frame once and reuse it for both the CSV and the
# preview, instead of replaying the blend twice.
oof_df = df.with_columns(
    pl.Series(get_preds(df)).alias("pred")
)[["id", "pred"]]

# Saved with a header, in the same `id,pred` layout as the oof_df_*.csv files
# read in the data loading cell.
oof_df.write_csv(CFG.save_dir+f"oof_df_{CFG.filename}.csv", has_header=True)

display(oof_df.head())

id,pred
i64,f64
0,8411.919101
1,3934.698797
2,2948.348619
3,8136.409007
4,4478.863139


In [8]:
# ===================================================================
#  test (submission)
# ===================================================================
# Per-model test predictions are headerless two-column files (id, prediction).
# The pred_<i> numbering must mirror the OOF columns built in the data loading
# cell (pred_0 = exp066, pred_1..pred_6 = exp067..exp077, pred_7 = kun_exp00060),
# because get_preds looks the blending weights up by column name.
test = (
    pl.read_csv(CFG.save_dir+"exp066.csv", has_header=False)
    .rename({"column_1":"id", "column_2":"pred_0"})
    .sort("id")
)
files = ["exp067.csv", "exp068.csv", "exp069.csv", "exp070.csv", "exp071.csv", "exp077.csv", "kun_exp00060.csv"]
for i, f in enumerate(files, start=1):
    test = test.join(
        pl.read_csv(CFG.save_dir+f, has_header=False).rename({"column_1":"id", "column_2":f"pred_{i}"}),
        on="id", how="left"
    )

# A left join leaves nulls for ids missing from a model's file, and a null in
# any column propagates through the blend into the submission; fail fast.
cols_with_nulls = [c for c in test.columns if test[c].null_count() > 0]
assert not cols_with_nulls, f"null values after joining test predictions: {cols_with_nulls}"

test = test.with_columns(
    pl.Series(get_preds(test)).alias("pred")
)
display(test.head())


# Submission file: headerless `id,pred`.
test[["id", "pred"]].write_csv(CFG.save_dir+f"{CFG.filename}.csv", has_header=False)

id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
27532,9673.031004,9645.690291,9773.740233,9543.305631,9574.334533,9774.45317,9017.561685,9456.567691,9306.069297
27533,5585.545628,5611.390293,5625.333842,5745.546668,5722.954729,5762.007883,5339.376994,5475.010419,5430.160645
27534,5595.254657,5581.507724,5572.766942,5592.961109,5618.968955,5606.688175,5958.715261,5480.674549,5729.268412
27535,18194.779659,18336.045587,18254.450168,18366.815171,18448.417408,18256.398339,18936.869485,18054.427058,18485.669751
27536,4416.241293,4401.738676,4296.607046,4481.441494,4280.70203,4366.343344,4107.333654,4255.354923,4206.281597
