In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns

In [19]:
# データフレームを水平に表示する。
def show_many_dfs(*dfs, n=10):
    class HorizontalDisplay:
        def _repr_html_(self):
            template = '<div style="float: left; padding: 5px;">{}</div>'
            return  ''.join(template.format(df.head(n)._repr_html_()) for df in dfs)
    return HorizontalDisplay()

In [20]:
# 自作関数の呼び出し
import sys
sys.path.append("../../")    # <- 親フォルダから呼び出すためにpathを追加する。
from my_package.excute_notebook import execute_notebook

In [21]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

### <font color="orange">make_data.ipynbの実行</font>

In [22]:
import os

file_path = "../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv"

if os.path.exists(file_path):
    print("既にデータが保存されています。")
else:
    print("make_data.ipynbを実行します。")
    notebook_path = "../../1_datasets/1_make_dataset/make_data.ipynb"
    execute_notebook(notebook_path=notebook_path)
    print("実行が完了しました。")

既にデータが保存されています。


### <font color="orange">データの読み込み</font>

In [23]:
data = pd.read_csv("../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv")
data.head()

Unnamed: 0,review_score,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,4.0,delivered,3,1,0,1,0,1,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40,268,4,500,1976,8,-8
1,4.0,delivered,1,0,1,0,0,0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29,178,1,400,4693,12,-6
2,5.0,delivered,1,1,0,0,0,3,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46,232,1,420,9576,9,-18
3,5.0,delivered,1,1,0,0,0,1,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59,468,3,450,6000,13,-13
4,5.0,delivered,1,1,0,0,0,1,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38,316,4,250,11475,2,-10


### <font color="orange">モデルの実装</font>

In [24]:
# random_stateナンバー
num_seed = 11

In [25]:
import lightgbm as lgb

def lightgbm_reg(X_train, X_test, y_train, y_test, num_seed):

    # LightGBMのパラメータ
    params = {
        "boosting" : "gbdt",
        'objective': 'regression',  # 回帰タスク
        'metric': 'rmse',
        "min_data_in_leaf" : 30,
        'num_leaves': 16,            # 木の最大の葉の数
        "max_depth" : -1,
        'learning_rate': 0.05,       # 学習率
        'is_unbalance': True,        # 不均衡なクラスの扱い
        "verbose" : -1,
        "random_seed" : num_seed
    }

    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model_lgb = lgb.train(params=params,
                          num_boost_round=5000,
                          train_set=lgb_train,
                          valid_sets=[lgb_train, lgb_valid]
                         )

    # 予測
    y_pred = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)
    y_pred_lgb = np.round(y_pred).astype(int)

    y_pred_lgb[y_pred_lgb < 1] = 1
    y_pred_lgb[y_pred_lgb >5] = 5
    
    return y_pred_lgb

In [26]:
from sklearn.ensemble import RandomForestClassifier

def rf_class(X_train, X_test, y_train, y_test, num_seed):
    # ランダムフォレストの作成と学習
    model_rf = RandomForestClassifier(n_estimators=50, random_state=num_seed)
    model_rf.fit(X_train, y_train)
    # 予測
    y_pred_rf = model_rf.predict(X_test)
    
    return y_pred_rf

In [27]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from concurrent.futures import ThreadPoolExecutor

X = data.drop("review_score", axis=1)
y = data["review_score"]

scores_lgb = []
scores_rf = []

# データの分割
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=num_seed)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()
    
    
    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    executer = ThreadPoolExecutor(max_workers=2)    #  並列処理
    # lightgbmモデルの学習、予測
    y_pred_lgb = executer.submit(lightgbm_reg, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed).result()
    # ランダムフォレストモデルの学習、予測
    y_pred_rf = executer.submit(rf_class, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed).result()
    
    # Quadratic Weighted Kappaの計算
    kappa_lgb = cohen_kappa_score(y_test, y_pred_lgb, weights='quadratic')
    kappa_rf = cohen_kappa_score(y_test, y_pred_rf, weights='quadratic')
    
    scores_lgb.append(kappa_lgb)
    scores_rf.append(kappa_rf)
    
    print("="*20, i, "="*20)
    print(f'Quadratic Weighted Kappa (lgb)  : {np.round(kappa_lgb, 4)}')
    print(f'Quadratic Weighted Kappa (rf)   : {np.round(kappa_rf, 4)}')

# 交差検証スコアの平均値と標準偏差を出力
scores_lgb = np.array(scores_lgb)
scores_rf = np.array(scores_rf)
print("\n")
print(f"lgb_score -- mean : {np.round(scores_lgb.mean(), 4), }, std : {np.round(scores_lgb.std(), 4)}")
print(f"lgb_score -- mean : {np.round(scores_rf.mean(), 4), }, std : {np.round(scores_rf.std(), 4)}")

Quadratic Weighted Kappa (lgb)  : 0.4345
Quadratic Weighted Kappa (rf)   : 0.44
Quadratic Weighted Kappa (lgb)  : 0.4358
Quadratic Weighted Kappa (rf)   : 0.4336
Quadratic Weighted Kappa (lgb)  : 0.4375
Quadratic Weighted Kappa (rf)   : 0.4425
Quadratic Weighted Kappa (lgb)  : 0.4364
Quadratic Weighted Kappa (rf)   : 0.4382
Quadratic Weighted Kappa (lgb)  : 0.4291
Quadratic Weighted Kappa (rf)   : 0.4379


lgb_score -- mean : (0.4346,), std : 0.003
lgb_score -- mean : (0.4384,), std : 0.0029


- lightgbmとランダムフォレストの予測精度は近い。

### <font color="orange">lightgbmとランダムフォレストの予測分布の確認</font>

In [28]:
from sklearn.metrics import confusion_matrix

labels = [1,2,3,4,5]

cm_lgb = confusion_matrix(y_true=y_test, y_pred=y_pred_lgb, labels=labels)
cm_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf, labels=labels)

columns_labels = ["pred_"+str(label) for label in labels]
index_labels = ["act_"+str(label) for label in labels]

cm_lgb = pd.DataFrame(cm_lgb, index=index_labels, columns=columns_labels)
cm_rf = pd.DataFrame(cm_rf, index=index_labels, columns=columns_labels)

show_many_dfs(cm_lgb, cm_rf)

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,309,738,325,935,164
act_2,33,89,86,385,90
act_3,29,103,159,1097,316
act_4,17,59,230,2684,914
act_5,28,117,429,7358,3768

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1187,18,47,91,1128
act_2,137,24,9,39,474
act_3,165,7,49,130,1353
act_4,147,11,55,338,3353
act_5,286,17,120,611,10666


In [29]:
y_pred = pd.DataFrame({"lgb" : y_pred_lgb,
                       "rf" : y_pred_rf,
                       "act" : y_test})

check_df = y_pred.groupby(["act", "lgb", "rf"])[["act"]].count()

show_many_dfs(check_df, check_df.query("lgb ==2 & rf ==1"), n=10)    #? <- nを増やすことで表示する行数を変更できる。

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,act
act,lgb,rf,Unnamed: 3_level_1
1.0,1,1.0,299
1.0,1,2.0,2
1.0,1,3.0,5
1.0,1,4.0,1
1.0,1,5.0,2
1.0,2,1.0,694
1.0,2,2.0,11
1.0,2,3.0,13
1.0,2,4.0,3
1.0,2,5.0,17

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,act
act,lgb,rf,Unnamed: 3_level_1
1.0,2,1.0,694
2.0,2,1.0,79
3.0,2,1.0,93
4.0,2,1.0,50
5.0,2,1.0,104


- スコアは近いものの、予測分布は大きく違うことが分かった。

### <font color="orange">lightgbmとランダムフォレストの予測値から最終予測値を決定することで、精度が上がるか確認する</font>

In [30]:
# 条件分岐による採択する予測値を決定し、スコア確認。
y_pred = pd.DataFrame({"lgb" : y_pred_lgb, "rf" : y_pred_rf})
y_pred["final"] = 0

def final_pred(row):
    """
    "lgb"と"rf"列の予測値から、最終的な予測値を決める関数です。
    効果確認のために、とりあえずいくつかの条件分岐を作成。
    効果が確認できれば、optunaで全体探索し、最適化をする。
    """
    
    if (row["lgb"] != 4) & (row["rf"] == 4):
        return row["lgb"]
        
    elif (row["lgb"] == 4) & (row["rf"] != 4):
        return row["rf"]
        
    elif (row["lgb"] != 5) & (row["rf"] == 5):
        return row["lgb"]
        
    elif (row["lgb"] == 5) & (row["rf"] != 5):
        return row["rf"]
    
    else:
        return row["rf"]

# y_pred["final"] に関数final_predを適用
y_pred["final"] = y_pred.apply(final_pred, axis=1)

# スコアの出力
kappa_lgb = cohen_kappa_score(y_test, y_pred["lgb"], weights='quadratic')
print(f'Quadratic Weighted Kappa (lgb)  : {kappa_lgb}')
kappa_rf = cohen_kappa_score(y_test, y_pred["rf"], weights='quadratic')
print(f'Quadratic Weighted Kappa (rf)   : {kappa_rf}')
kappa_final = cohen_kappa_score(y_test, y_pred["final"], weights='quadratic')
print(f'Quadratic Weighted Kappa (final): {kappa_final}')

#! 精度が上がることが確認できた。

Quadratic Weighted Kappa (lgb)  : 0.42908429623231603
Quadratic Weighted Kappa (rf)   : 0.4379001552752467
Quadratic Weighted Kappa (final): 0.46344585106499325


In [31]:
from sklearn.metrics import confusion_matrix

labels = [1,2,3,4,5]

cm_lgb = confusion_matrix(y_true=y_test, y_pred=y_pred["lgb"], labels=labels)
cm_rf = confusion_matrix(y_true=y_test, y_pred=y_pred["rf"], labels=labels)
cm_final = confusion_matrix(y_true=y_test, y_pred=y_pred["final"], labels=labels)

columns_labels = ["pred_"+str(label) for label in labels]
index_labels = ["act_"+str(label) for label in labels]

cm_lgb = pd.DataFrame(cm_lgb, index=index_labels, columns=columns_labels)
cm_rf = pd.DataFrame(cm_rf, index=index_labels, columns=columns_labels)
cm_final = pd.DataFrame(cm_final, index=index_labels, columns=columns_labels)

show_many_dfs(cm_lgb, cm_rf, cm_final)

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,309,738,325,935,164
act_2,33,89,86,385,90
act_3,29,103,159,1097,316
act_4,17,59,230,2684,914
act_5,28,117,429,7358,3768

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1187,18,47,91,1128
act_2,137,24,9,39,474
act_3,165,7,49,130,1353
act_4,147,11,55,338,3353
act_5,286,17,120,611,10666

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1190,38,203,70,970
act_2,137,29,55,30,432
act_3,166,12,154,106,1266
act_4,148,17,214,260,3265
act_5,286,25,432,479,10478


- 精度が上がることが確認できたので、optunaで探索する。

### <font color="orange">oputunaを取り入れた交差検証</font>

- lightgbmとランダムフォレストの予測値から、最良のスコアを算出するためにoptunaで探索する。
- lightgbmとランダムフォレストの予測値が同じなら、最終予測はその値とする。
- それぞれの予測が異なれば、最終予測値をどうするかを、optunaで探索して決定する。

In [32]:
import optuna
from sklearn.metrics import cohen_kappa_score

def objective(trial, y_pred_lgb, y_pred_rf, y_test):
    
    # optunaの関数に渡すために、予測値をデータフレームに変更。
    df_pred = pd.DataFrame({"lgb" : y_pred_lgb,
                            "rf" : y_pred_rf})
    
    for pred_lgb in [1,2,3,4,5]:
        for pred_rf in [1,2,3,4,5]:
            
            # lightgbmとランダムフォレストの予測値が同じ場合は、その値を最終予測値とする。
            if pred_lgb == pred_rf:
                df_pred.loc[(df_pred["lgb"] == pred_lgb) & (df_pred["rf"] == pred_rf), "final"] = pred_lgb
            
            # lightgbmとランダムフォレストの予測値が異なる場合は、optunaの探索対象とし、ベストパラメーターを最終予測値とする。
            else:
                df_pred.loc[(df_pred["lgb"] == pred_lgb) & (df_pred["rf"] == pred_rf), "final"] = trial.suggest_int(f"line_{pred_lgb}_{pred_rf}", 1, 5)
            
            
    y_pred = df_pred["final"]
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    
    return kappa

In [33]:
from sklearn.metrics import cohen_kappa_score
from concurrent.futures import ThreadPoolExecutor

def final_model(X_train, X_test, y_train, y_test, num_seed):
    
    # モデルの学習、予測
    executer = ThreadPoolExecutor(max_workers=2)
    y_pred_lgb = executer.submit(lightgbm_reg, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed).result()
    y_pred_rf = executer.submit(rf_class, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed).result()
    
    
    # df_predを作成。
    # optuna用に df_pred を df_pred_for_optuna と df_pred_for_test に2分割。
    # df_pred_for_optuna でoptunaによるパラメータ調整。 df_pred_for_testにoptunaで探索したパラメータを用いて最終予測値を決定する。
    df_pred = pd.DataFrame({"pred_lgb" : y_pred_lgb,
                            "pred_rf" : y_pred_rf,
                            "act" : y_test})
    
    df_pred = df_pred.astype(int)
    
    df_pred_for_optuna = df_pred.sample(frac=0.1, random_state=num_seed)
    df_pred_for_test = df_pred.drop(df_pred_for_optuna.index)
    

    # Quadratic Weighted Kappaの計算
    kappa_lgb = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_lgb"], weights='quadratic')
    kappa_rf = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_rf"], weights='quadratic')
    
    
    # optuna
    optuna.logging.disable_default_handler()    # ログを非表示にする
    study = optuna.create_study(direction='maximize',  sampler=optuna.samplers.RandomSampler(num_seed))
    study.optimize(lambda trial: objective(trial,
                                           y_pred_lgb=df_pred_for_optuna["pred_lgb"],
                                           y_pred_rf=df_pred_for_optuna["pred_rf"],
                                           y_test=df_pred_for_optuna["act"]
                                           ),
                                           n_trials=500)
    
    best_parameters = study.best_params
    best_parameters["line_1_1"] = 1
    best_parameters["line_2_2"] = 2
    best_parameters["line_3_3"] = 3
    best_parameters["line_4_4"] = 4
    best_parameters["line_5_5"] = 5
    
    df_pred_for_test["key"] = df_pred_for_test.apply(lambda row : "line_" + str(row["pred_lgb"]) + "_" + str(row["pred_rf"]), axis=1)
        
    df_pred_for_test["pred_final"] = df_pred_for_test["key"].map(best_parameters)
    
    kappa_final = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_final"], weights='quadratic')
    
    return kappa_lgb, kappa_rf, kappa_final

In [34]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

X = data.drop("review_score", axis=1)
y = data["review_score"]

scores_lgb = []
scores_rf = []
scores_final = []

# データの分割
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=num_seed)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()
    
    
    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    kappa_lgb, kappa_rf, kappa_final = final_model(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed)
    
    scores_lgb.append(kappa_lgb)
    scores_rf.append(kappa_rf)
    scores_final.append(kappa_final)
    
    print("="*20, i, "回目", "="*20)
    print(f'Quadratic Weighted Kappa (lgb)   : {np.round(kappa_lgb, 4)}')
    print(f'Quadratic Weighted Kappa (rf)    : {np.round(kappa_rf, 4)}')
    print(f'Quadratic Weighted Kappa (final) : {np.round(kappa_final, 4)}')

# 交差検証スコアの平均値と標準偏差を出力
scores_lgb = np.array(scores_lgb)
scores_rf = np.array(scores_rf)
scores_final = np.array(scores_final)
print("\n")
print(f"lgb_score -- mean : {np.round(scores_lgb.mean(), 4), }, std : {np.round(scores_lgb.std(), 4)}")
print(f"lgb_score -- mean : {np.round(scores_rf.mean(), 4), }, std : {np.round(scores_rf.std(), 4)}")
print(f"lgb_score -- mean : {np.round(scores_final.mean(), 4), }, std : {np.round(scores_final.std(), 4)}")

Quadratic Weighted Kappa (lgb)   : 0.4318
Quadratic Weighted Kappa (rf)    : 0.4387
Quadratic Weighted Kappa (final) : 0.4696
Quadratic Weighted Kappa (lgb)   : 0.434
Quadratic Weighted Kappa (rf)    : 0.4327
Quadratic Weighted Kappa (final) : 0.47
Quadratic Weighted Kappa (lgb)   : 0.4375
Quadratic Weighted Kappa (rf)    : 0.4426
Quadratic Weighted Kappa (final) : 0.4752
Quadratic Weighted Kappa (lgb)   : 0.4387
Quadratic Weighted Kappa (rf)    : 0.4389
Quadratic Weighted Kappa (final) : 0.4737
Quadratic Weighted Kappa (lgb)   : 0.4348
Quadratic Weighted Kappa (rf)    : 0.4421
Quadratic Weighted Kappa (final) : 0.4751


lgb_score -- mean : (0.4354,), std : 0.0025
lgb_score -- mean : (0.439,), std : 0.0035
lgb_score -- mean : (0.4727,), std : 0.0024
