In [1]:
import pandas as pd
import numpy as np

In [2]:
# データフレームを水平に表示する。
def show_many_dfs(*dfs, n=10):
    class HorizontalDisplay:
        def _repr_html_(self):
            template = '<div style="float: left; padding: 5px;">{}</div>'
            return  ''.join(template.format(df.head(n)._repr_html_()) for df in dfs)
    return HorizontalDisplay()

In [3]:
# 自作関数の呼び出し
import sys
sys.path.append("../../")    # <- 親フォルダから呼び出すためにpathを追加する。
from my_package.excute_notebook import execute_notebook

In [4]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

### <font color="orange">make_data.ipynbの実行</font>

In [5]:
import os

file_path = "../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv"

if os.path.exists(file_path):
    print("既にデータが保存されています。")
else:
    print("make_data.ipynbを実行します。")
    notebook_path = "../../1_datasets/1_make_dataset/make_data.ipynb"
    execute_notebook(notebook_path=notebook_path)
    print("実行が完了しました。")

既にデータが保存されています。


### <font color="orange">データの読み込み</font>

In [6]:
data = pd.read_csv("../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv")
data.head()

Unnamed: 0,review_score,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,4.0,delivered,3,1,0,1,0,1,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40,268,4,500,1976,8,-8
1,4.0,delivered,1,0,1,0,0,0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29,178,1,400,4693,12,-6
2,5.0,delivered,1,1,0,0,0,3,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46,232,1,420,9576,9,-18
3,5.0,delivered,1,1,0,0,0,1,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59,468,3,450,6000,13,-13
4,5.0,delivered,1,1,0,0,0,1,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38,316,4,250,11475,2,-10


### <font color="orange">モデルの実装</font>

In [7]:
# random_stateナンバー
num_seed = 11

In [8]:
# lightgbmの関数
import lightgbm as lgb

def lightgbm_class(X_train, X_test, y_train, y_test, num_seed=num_seed):

    # LightGBMのパラメータ
    params = {
        "boosting" : "gbdt",
        'objective': 'multiclass',  # 回帰タスク
        "num_class" : 5,
        'metric': 'multi_logloss',
        "min_data_in_leaf" : 30,
        'num_leaves': 16,            # 木の最大の葉の数
        "max_depth" : -1,
        'learning_rate': 0.05,       # 学習率
        'is_unbalance': True,        # 不均衡なクラスの扱い
        "verbose" : -1,
        "random_seed" : num_seed
    }
    
    y_train = y_train-1
    y_test = y_test-1

    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model_lgb_class = lgb.train(params=params,
                          num_boost_round=5000,
                          train_set=lgb_train,
                          valid_sets=[lgb_train, lgb_valid]
                         )

    # 予測
    y_pred = model_lgb_class.predict(X_test, num_iteration=model_lgb_class.best_iteration)
    y_pred_lgb_class = np.argmax(y_pred, axis=1)
    
    y_pred_lgb_class += 1
    y_test += 1
    
    return y_pred_lgb_class

In [9]:
from sklearn.ensemble import RandomForestClassifier

def rf_class(X_train, X_test, y_train, y_test, num_seed):
    # ランダムフォレストの作成と学習
    model_rf_class = RandomForestClassifier(n_estimators=50, random_state=num_seed)
    model_rf_class.fit(X_train, y_train)
    # 予測
    y_pred_rf_class = model_rf_class.predict(X_test)
    
    return y_pred_rf_class

### <font color="orange">oputunaを取り入れた交差検証</font>

- lightgbmとランダムフォレストの予測値から、最良のスコアを算出するためにoptunaで探索する。
- lightgbmとランダムフォレストの予測値が同じなら、最終予測はその値とする。
- それぞれの予測が異なれば、最終予測値をどうするかを、optunaで探索して決定する。

In [10]:
import optuna
from sklearn.metrics import cohen_kappa_score

def objective(trial, y_pred_lgb_class, y_pred_rf_class, y_test):
    
    # optunaの関数に渡すために、予測値をデータフレームに変更。
    df_pred = pd.DataFrame({"lgb_class" : y_pred_lgb_class,
                            "rf_class" : y_pred_rf_class})
    
    for pred_lgb_class in [1,2,3,4,5]:
        for pred_rf_class in [1,2,3,4,5]:
            
            # lightgbmとランダムフォレストの予測値が同じ場合は、その値を最終予測値とする。
            if pred_lgb_class == pred_rf_class:
                df_pred.loc[(df_pred["lgb_class"] == pred_lgb_class) & (df_pred["rf_class"] == pred_rf_class), "final"] = pred_lgb_class
            
            # lightgbmとランダムフォレストの予測値が異なる場合は、optunaの探索対象とし、ベストパラメーターを最終予測値とする。
            else:
                df_pred.loc[(df_pred["lgb_class"] == pred_lgb_class) & (df_pred["rf_class"] == pred_rf_class), "final"] = trial.suggest_int(f"line_{pred_lgb_class}_{pred_rf_class}", 1, 5)
            
            
    y_pred = df_pred["final"]
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    
    return kappa

In [11]:
from sklearn.metrics import cohen_kappa_score
from concurrent.futures import ThreadPoolExecutor

def final_model(X_train, X_test, y_train, y_test, num_seed):
    
    # モデルの学習、予測
    executer = ThreadPoolExecutor(max_workers=2)
    y_pred_lgb_class = executer.submit(lightgbm_class, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed).result()
    y_pred_rf_class = executer.submit(rf_class, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed).result()
    
    
    # df_predを作成。
    # optuna用に df_pred を df_pred_for_optuna と df_pred_for_test に2分割。
    # df_pred_for_optuna でoptunaによるパラメータ調整。 df_pred_for_testにoptunaで探索したパラメータを用いて最終予測値を決定する。
    df_pred = pd.DataFrame({"pred_lgb_class" : y_pred_lgb_class,
                            "pred_rf_class" : y_pred_rf_class,
                            "act" : y_test})
    
    df_pred = df_pred.astype(int)
    
    df_pred_for_optuna = df_pred.sample(frac=0.1, random_state=num_seed)
    df_pred_for_test = df_pred.drop(df_pred_for_optuna.index)
    

    # Quadratic Weighted Kappaの計算
    kappa_lgb_class = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_lgb_class"], weights='quadratic')
    kappa_rf_class = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_rf_class"], weights='quadratic')
    
    
    # optuna
    optuna.logging.disable_default_handler()    # ログを非表示にする
    study = optuna.create_study(direction='maximize',  sampler=optuna.samplers.RandomSampler(num_seed))
    study.optimize(lambda trial: objective(trial,
                                           y_pred_lgb_class=df_pred_for_optuna["pred_lgb_class"],
                                           y_pred_rf_class=df_pred_for_optuna["pred_rf_class"],
                                           y_test=df_pred_for_optuna["act"]
                                           ),
                                           n_trials=500)
    
    best_parameters = study.best_params
    best_parameters["line_1_1"] = 1
    best_parameters["line_2_2"] = 2
    best_parameters["line_3_3"] = 3
    best_parameters["line_4_4"] = 4
    best_parameters["line_5_5"] = 5
    
    df_pred_for_test["key"] = df_pred_for_test.apply(lambda row : "line_" + str(row["pred_lgb_class"]) + "_" + str(row["pred_rf_class"]), axis=1)
        
    df_pred_for_test["pred_final"] = df_pred_for_test["key"].map(best_parameters)
    
    kappa_final = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_final"], weights='quadratic')
    
    return kappa_lgb_class, kappa_rf_class, kappa_final

In [12]:
from sklearn.model_selection import StratifiedKFold

X = data.drop("review_score", axis=1)
y = data["review_score"]

scores_lgb_class = []
scores_rf_class = []
scores_final = []

# データの分割
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=num_seed)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()
    
    
    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    kappa_lgb_class, kappa_rf_class, kappa_final = final_model(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed)
    
    scores_lgb_class.append(kappa_lgb_class)
    scores_rf_class.append(kappa_rf_class)
    scores_final.append(kappa_final)
    
    print("="*20, i, "回目", "="*20)
    print(f'Quadratic Weighted Kappa (lgb_class) : {np.round(kappa_lgb_class, 4)}')
    print(f'Quadratic Weighted Kappa (rf_class)  : {np.round(kappa_rf_class, 4)}')
    print(f'Quadratic Weighted Kappa (final)     : {np.round(kappa_final, 4)}')

# 交差検証スコアの平均値と標準偏差を出力
scores_lgb_class = np.array(scores_lgb_class)
scores_rf_class = np.array(scores_rf_class)
scores_final = np.array(scores_final)
print("\n")
print(f"lgb_class_score -- mean : {np.round(scores_lgb_class.mean(), 4)}, std : {np.round(scores_lgb_class.std(), 4)}")
print(f"rf_class_score  -- mean : {np.round(scores_rf_class.mean(), 4)}, std : {np.round(scores_rf_class.std(), 4)}")
print(f"final_score     -- mean : {np.round(scores_final.mean(), 4)}, std : {np.round(scores_final.std(), 4)}")

Quadratic Weighted Kappa (lgb_class) : 0.4362
Quadratic Weighted Kappa (rf_class)  : 0.4387
Quadratic Weighted Kappa (final)     : 0.445
Quadratic Weighted Kappa (lgb_class) : 0.4296
Quadratic Weighted Kappa (rf_class)  : 0.4327
Quadratic Weighted Kappa (final)     : 0.4335
Quadratic Weighted Kappa (lgb_class) : 0.4414
Quadratic Weighted Kappa (rf_class)  : 0.4426
Quadratic Weighted Kappa (final)     : 0.4407
Quadratic Weighted Kappa (lgb_class) : 0.4405
Quadratic Weighted Kappa (rf_class)  : 0.4389
Quadratic Weighted Kappa (final)     : 0.4477
Quadratic Weighted Kappa (lgb_class) : 0.4433
Quadratic Weighted Kappa (rf_class)  : 0.4421
Quadratic Weighted Kappa (final)     : 0.454


lgb_class_score -- mean : 0.4382, std : 0.0049
rf_class_score  -- mean : 0.439, std : 0.0035
final_score     -- mean : 0.4442, std : 0.0069
