In [1]:
# ===================================================================
#  Library
# ===================================================================
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import math
import time

from math import comb
from itertools import combinations


from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_percentage_error
from tqdm.auto import tqdm

import warnings
warnings.simplefilter("ignore")

import unicodedata
import lightgbm as lgb
import optuna
from optuna.visualization import plot_parallel_coordinate, plot_param_importances

In [2]:
# ===================================================================
#  CFG
# ===================================================================
class CFG:
    """Experiment-wide settings read by the cells below.

    A ``use_features`` attribute is attached to this class later in the
    notebook, right before training.
    """
    # --- reproducibility / cross-validation ---
    seed = 42
    n_splits = 8

    # --- LightGBM training control ---
    stopping_rounds = 100
    num_cores = 4  # kept the same as on the Kaggle side

    # --- absolute local directories for input data and experiment artifacts ---
    data_dir = "G:/マイドライブ/signate_StudentCup2023/data/"
    save_dir = "G:/マイドライブ/signate_StudentCup2023/exp/"

    # --- tag embedded in the names of the files this notebook writes ---
    filename = "exp077"

In [3]:
# ===================================================================
#  Utils
# ===================================================================
def seed_everything(seed):
    """Seed Python's ``random`` and NumPy's global RNG, and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value handed to ``random.seed`` and ``np.random.seed`` and stored
        (as a string) in the ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
    # here presumably only influences processes launched afterwards -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
# Apply the configured seed once, as soon as the helper is defined.
seed_everything(seed=CFG.seed)
    

def get_score(y_true, y_pred):
    """Return the mean absolute percentage error scaled to percent.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.

    Returns
    -------
    float
        ``mean_absolute_percentage_error(y_true, y_pred)`` multiplied by 100.
    """
    return 100 * mean_absolute_percentage_error(y_true, y_pred)

In [4]:
# ===================================================================
#  Data Loading
# ===================================================================
# ---- train side --------------------------------------------------------
# Predictions stored by exp065 (presumably out-of-fold, judging by the file
# name -- confirm), the target column, and the second model's predictions.
df = pd.read_csv(CFG.save_dir + "oof_df_exp065.csv").sort_values("id")
train = pd.read_csv(CFG.data_dir + "train.csv", usecols=["id", "price"])
kun_exp_df = pd.read_csv(CFG.save_dir + "kun_exp00052_oof_pred.csv")

# Left-join everything onto the exp065 frame, keyed by id.
train = pd.merge(df, train, on="id", how="left")
train = pd.merge(train, kun_exp_df, on="id", how="left")

# Quick look at the joined train frame.
display(train.head())


# ---- test side ---------------------------------------------------------
test = pd.read_csv(CFG.save_dir + "exp065.csv").sort_values("id")
kun_exp_df = pd.read_csv(CFG.save_dir + "kun_exp00052.csv")

test = test.merge(kun_exp_df, on="id", how="left")
display(test.head())

Unnamed: 0,id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,...,kun_pred_0,kun_pred_1,kun_pred_2,kun_pred_3,kun_pred_4,kun_pred_5,kun_pred_6,kun_pred_7,kun_pred_8,kun_pred_9
0,0,6881.892301,6517.414858,7476.567746,7052.61059,6809.002174,6465.089852,7714.185588,7010.85559,7184.306903,...,7118.9443,7627.1987,7084.0015,7043.095,7122.753,7335.231,7350.006,9836.875,8781.688,8747.975
1,1,3740.634027,3496.012582,3789.907986,3322.844722,3603.935425,3485.623398,3507.497452,3714.057283,3488.353022,...,3660.8042,3526.6816,3732.6855,3593.2502,3596.9211,3562.2925,3922.8992,3761.7544,3852.5068,3476.0593
2,2,2954.247573,2735.110086,3288.647709,2893.84021,3253.350282,2885.690295,3202.10377,2799.365349,2989.931642,...,2883.739,3149.7769,2995.776,3038.0789,2876.831,2982.5576,2787.5325,2911.18,3172.8408,3092.2769
3,3,8430.949224,8337.416095,9033.353098,8081.40405,8187.453186,8477.54243,8546.771462,8388.923127,8119.18331,...,8650.627,8943.903,9243.175,8973.261,9158.718,8505.161,8713.231,8287.674,8649.64,8874.921
4,4,3972.418866,4254.790314,4089.549517,4272.046434,4446.949951,4022.828501,4374.413436,4369.679027,4150.7095,...,4031.1462,4406.548,4107.919,4059.3975,4229.2666,4429.893,4209.0796,4026.7249,4228.1797,4164.242


Unnamed: 0,id,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,...,kun_pred_0,kun_pred_1,kun_pred_2,kun_pred_3,kun_pred_4,kun_pred_5,kun_pred_6,kun_pred_7,kun_pred_8,kun_pred_9
0,27532,9665.240676,9060.741324,8931.822504,9993.239922,9387.607212,9908.462741,10128.928844,9738.111509,10579.697128,...,9462.117521,9616.369529,9455.621674,9411.32132,9439.296013,9339.256741,9496.255924,9169.724825,9408.6979,9737.733929
1,27533,5140.878681,4974.611255,4485.057773,5533.795617,5044.527731,4849.379307,5257.737182,5354.979862,5545.143501,...,5431.933504,5579.297671,5502.604659,5421.178,5426.123259,5696.500918,5520.123463,5516.627145,5408.202246,5147.619477
2,27534,5628.236365,5804.522418,5796.571207,5726.217723,5822.816944,5759.30897,5798.455075,5419.099745,5595.549648,...,5563.460291,5543.298528,5644.606929,5568.560278,5470.429184,5637.070499,5705.703291,5364.698823,5388.325417,5441.72168
3,27535,18624.817802,20021.162475,18835.327938,22007.95277,18960.034958,19900.729462,19313.421537,18475.89215,19846.987451,...,19615.535011,19333.748566,19484.368965,19209.276232,19294.428362,19318.374103,19171.163948,16937.493371,17264.432164,17340.319266
4,27536,3625.637392,4375.456011,4220.231775,3687.370209,4082.474807,4105.429409,3789.946321,3915.428485,4189.230892,...,4165.521012,4132.487296,4262.280629,4212.731834,4118.16039,4143.661658,4292.907859,4212.163549,4187.348948,4000.892623


In [5]:
# ===================================================================
#  feature_engineering
# ===================================================================
def preprocessing(all_data: pd.DataFrame):
    """Add row-wise aggregates of the first-level prediction columns.

    Prediction columns are the ones whose name contains ``"pred_"``. Selecting
    them by name (instead of "everything except id/price") keeps the function
    idempotent: columns added later, including ``predict_mean`` and
    ``predict_max`` themselves, are never folded into the aggregates when the
    cell is run again.

    Parameters
    ----------
    all_data : pd.DataFrame
        Frame holding the prediction columns. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``all_data`` object with two added columns:
        ``predict_mean`` and ``predict_max``.
    """
    pred_columns = [col for col in all_data.columns if "pred_" in col]
    print(pred_columns)
    all_data["predict_mean"] = all_data[pred_columns].mean(axis=1)
    all_data["predict_max"] = all_data[pred_columns].max(axis=1)
    return all_data

# Add the aggregate prediction features to both frames (train first, then test).
train, test = preprocessing(train), preprocessing(test)

['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10', 'pred_11', 'pred_12', 'pred_13', 'pred_14', 'pred_15', 'pred_16', 'pred_17', 'pred_18', 'pred_19', 'pred_20', 'pred_21', 'pred_22', 'pred_23', 'pred_24', 'pred_25', 'pred_26', 'pred_27', 'pred_28', 'pred_29', 'pred_30', 'pred_31', 'pred_32', 'pred_33', 'pred_34', 'pred_35', 'pred_36', 'pred_37', 'pred_38', 'pred_39', 'kun_pred_0', 'kun_pred_1', 'kun_pred_2', 'kun_pred_3', 'kun_pred_4', 'kun_pred_5', 'kun_pred_6', 'kun_pred_7', 'kun_pred_8', 'kun_pred_9']
['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10', 'pred_11', 'pred_12', 'pred_13', 'pred_14', 'pred_15', 'pred_16', 'pred_17', 'pred_18', 'pred_19', 'pred_20', 'pred_21', 'pred_22', 'pred_23', 'pred_24', 'pred_25', 'pred_26', 'pred_27', 'pred_28', 'pred_29', 'pred_30', 'pred_31', 'pred_32', 'pred_33', 'pred_34', 'pred_35', 'pred_36', 'pred_37', 'pred_38', 'pred_39'

In [6]:
# ===================================================================
#  Cross Validation
# ===================================================================
# Sort by id before the price sort so the price sort starts from a fixed row order.
train = train.sort_values(by="id", ignore_index=True)

# Deal rows out to the folds round-robin in ascending order of price:
# the row at price-rank k goes to fold k % n_splits.
train = train.sort_values(by="price", ignore_index=True)
train["fold"] = [rank % CFG.n_splits for rank in range(len(train))]

# Restore id order for the rest of the notebook.
train = train.sort_values(by="id", ignore_index=True)

# The reported value is the standard deviation (not the variance) of the per-fold mean price.
print("The standard deviation of the mean of the folds: ", train.groupby("fold")["price"].mean().std())

The variance of the mean of the folds:  7.232164844021037


In [7]:
def preprocessing_per_fold(CFG, train:pd.DataFrame, test: pd.DataFrame, fold: int = 0):
    """
    Per-fold preprocessing: done fold by fold to prevent leakage.

    Rows whose ``fold`` column equals ``fold`` become the validation set and
    all remaining rows the training set; both get a fresh 0..n-1 index.
    ``test`` is returned as a copy. ``CFG`` is accepted but not used here.
    """
    in_valid = train["fold"] == fold
    X_valid = train.loc[in_valid].reset_index(drop=True)
    X_train = train.loc[~in_valid].reset_index(drop=True)
    return X_train, X_valid, test.copy()

In [8]:
# ===================================================================
#  Training (per-fold LightGBM)
# ===================================================================
def train_lgb(CFG, lgb_param):
    """Train one LightGBM model per fold and collect validation / test predictions.

    Reads the module-level ``train`` and ``test`` DataFrames. ``train`` must
    contain ``fold`` and ``price`` columns plus every column listed in
    ``CFG.use_features``; ``test`` must contain the feature columns.

    Parameters
    ----------
    CFG :
        Config object; ``n_splits``, ``use_features`` and ``stopping_rounds``
        are read from it.
    lgb_param : dict
        Parameters passed to ``lgb.train``.

    Returns
    -------
    score : float
        ``get_score`` over the validation rows of all folds combined.
    oof_df : pd.DataFrame
        Validation rows of every fold, concatenated in fold order, with the
        validation prediction in a ``pred`` column.
    test : pd.DataFrame
        The module-level ``test`` frame itself (modified in place) with
        ``pred`` set to the mean of the per-fold test predictions.
    """
    # Does not depend on the fold, so it is computed once.
    categorical_features = [col for col in CFG.use_features if "_category" in col]

    oof_parts = []  # one validation frame per fold, concatenated once at the end
    preds = []      # one test-prediction array per fold
    for fold in range(CFG.n_splits):
        X_train, X_valid, test_df = preprocessing_per_fold(CFG, train, test, fold)

        # train
        lgb_train = lgb.Dataset(
            X_train[CFG.use_features], X_train["price"],
            categorical_feature=categorical_features,
        )
        lgb_valid = lgb.Dataset(
            X_valid[CFG.use_features], X_valid["price"],
            categorical_feature=categorical_features,
        )
        # categorical_feature is already attached to both Datasets above, so it
        # is not repeated as an argument of lgb.train.
        model = lgb.train(
            lgb_param,
            lgb_train,
            valid_sets=[lgb_valid],
            callbacks=[lgb.early_stopping(stopping_rounds=CFG.stopping_rounds, verbose=False)],
        )

        # Predict with the iteration selected by early stopping.
        X_valid["pred"] = model.predict(X_valid[CFG.use_features], num_iteration=model.best_iteration)
        print(f"fold{fold}:", get_score(y_true=X_valid["price"], y_pred=X_valid["pred"]))
        oof_parts.append(X_valid)
        preds.append(model.predict(test_df[CFG.use_features], num_iteration=model.best_iteration))

    oof_df = pd.concat(oof_parts, ignore_index=True)
    test["pred"] = np.mean(preds, axis=0)
    score = get_score(oof_df["price"], oof_df["pred"])
    return score, oof_df, test


In [9]:
# ===================================================================
#  evaluate
# ===================================================================
# Model inputs: every column whose name contains "pred_", followed by the two
# row-wise aggregate columns.
CFG.use_features = [
    *(col for col in train.columns if "pred_" in col),
    "predict_mean",
    "predict_max",
]
print(CFG.use_features)


# NOTE(review): these values look like the result of a hyper-parameter search
# (optuna is imported above) -- confirm their provenance.
lgb_param = dict(
    # task / objective
    task="train",
    objective="mape",
    boosting="gbdt",
    # boosting schedule
    n_estimators=7772,
    learning_rate=0.018055517654000923,
    # tree shape
    max_depth=15,
    num_leaves=15582,
    min_data_in_leaf=257,
    max_bin=107,
    # row / column subsampling
    subsample=0.409805093503704,
    subsample_freq=6,
    feature_fraction=0.12888597010335143,
    # regularisation
    reg_lambda=6.868581856096029,
    reg_alpha=2.6710544693327396e-06,
    # NOTE(review): scale_pos_weight is documented for binary classification;
    # presumably it has no effect with the "mape" objective -- confirm.
    scale_pos_weight=17.724717705709008,
    # runtime
    num_threads=CFG.num_cores,
    metric='mape',
    seed=CFG.seed,
    verbosity=-1,
)


best_score, oof_df, test_df = train_lgb(CFG, lgb_param)
# Both lines are wrapped in the ANSI escape codes for green text.
print('\033[32m====== CV score ======\033[0m')
print(f'\033[32m{best_score}\033[0m')

['pred_0', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10', 'pred_11', 'pred_12', 'pred_13', 'pred_14', 'pred_15', 'pred_16', 'pred_17', 'pred_18', 'pred_19', 'pred_20', 'pred_21', 'pred_22', 'pred_23', 'pred_24', 'pred_25', 'pred_26', 'pred_27', 'pred_28', 'pred_29', 'pred_30', 'pred_31', 'pred_32', 'pred_33', 'pred_34', 'pred_35', 'pred_36', 'pred_37', 'pred_38', 'pred_39', 'kun_pred_0', 'kun_pred_1', 'kun_pred_2', 'kun_pred_3', 'kun_pred_4', 'kun_pred_5', 'kun_pred_6', 'kun_pred_7', 'kun_pred_8', 'kun_pred_9', 'predict_mean', 'predict_max']
fold0: 43.060716450234466
fold1: 44.22799829657833
fold2: 43.28463298503994
fold3: 44.61944749521788
fold4: 42.80158969740142
fold5: 42.821593337063256
fold6: 44.40896892362935
fold7: 43.65836609641618
[32m43.61044144258124[0m


In [10]:
# ===================================================================
#  Save OOF and test predictions
# ===================================================================
# The validation-prediction file keeps its header row; the test-prediction
# file is written without header and without index.
oof_df.loc[:, ["id", "pred"]].to_csv(CFG.save_dir + f"oof_df_{CFG.filename}.csv", header=True, index=False)
test_df.loc[:, ["id", "pred"]].to_csv(CFG.save_dir + f"{CFG.filename}.csv", header=False, index=False)

# Show what was written for the test set.
test_df.loc[:, ["id", "pred"]]

Unnamed: 0,id,pred
0,27532,9017.561685
1,27533,5339.376994
2,27534,5958.715261
3,27535,18936.869485
4,27536,4107.333654
...,...,...
27532,55064,14520.403540
27533,55065,8472.134068
27534,55066,5887.029019
27535,55067,4972.316537
