In [1]:
import pandas as pd
import numpy as np

In [2]:
# データフレームを水平に表示する。
def show_many_dfs(*dfs, n=10):
    class HorizontalDisplay:
        def _repr_html_(self):
            template = '<div style="float: left; padding: 5px;">{}</div>'
            return  ''.join(template.format(df.head(n)._repr_html_()) for df in dfs)
    return HorizontalDisplay()

In [3]:
# 自作関数の呼び出し
import sys
sys.path.append("../")    # <- 親フォルダから呼び出すためにpathを追加する。
from my_package.excute_notebook import execute_notebook

In [4]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

### <font color="orange">make_data.ipynbの実行</font>

In [5]:
import os

file_path_train_data = "../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv"
file_path_test_data = "../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/test_data.csv"

if os.path.exists(file_path_train_data) and os.path.exists(file_path_test_data):
    print("既にデータが保存されています。")
else:
    print("make_data.ipynbを実行します。")
    notebook_path = "../../1_datasets/1_make_dataset/make_data.ipynb"
    execute_notebook(notebook_path=notebook_path)
    print("実行が完了しました。")

既にデータが保存されています。


### <font color="orange">データの読み込み</font>

In [6]:
train_data = pd.read_csv("../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv")
test_data = pd.read_csv("../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/test_data.csv")
display(train_data.head())
display(test_data.head())

Unnamed: 0,review_score,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,4.0,delivered,3,1,0,1,0,1,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40,268,4,500,1976,8,-8
1,4.0,delivered,1,0,1,0,0,0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29,178,1,400,4693,12,-6
2,5.0,delivered,1,1,0,0,0,3,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46,232,1,420,9576,9,-18
3,5.0,delivered,1,1,0,0,0,1,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59,468,3,450,6000,13,-13
4,5.0,delivered,1,1,0,0,0,1,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38,316,4,250,11475,2,-10


Unnamed: 0,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,delivered,1,1,0,0,0,10,1376.45,c4af86330efa7a2620772227d2d670c9,construction_tools_construction,38,143,2,20850,125000,17,-17
1,shipped,1,0,1,0,0,0,69.12,cc419e0650a3c5ba77189a1882b7556a,perfumery,44,334,1,200,3328,18,-7
2,delivered,1,0,1,0,0,0,162.25,7a67c85e85bb2ce8582c35f2203ad736,cool_stuff,54,629,1,1050,19008,28,6
3,delivered,1,1,0,0,0,4,41.1,c42fd8e4d47dfb18ce5222f2dd7752f9,furniture_decor,64,669,4,200,4096,18,-8
4,delivered,1,0,1,0,0,0,87.58,850f4f8af5ea87287ac68de36e29107f,watches_gifts,18,796,2,750,2700,8,-20


In [7]:
train_data.shape, test_data.shape

((102314, 18), (814, 17))

In [8]:
test_data.isna().sum()

order_status                             0
count_payment_sequential                 0
payment_type_credit_card                 0
payment_type_boleto                      0
payment_type_voucher                     0
payment_type_debit_card                  0
mean_credit_card_payment_installments    0
payment_value                            0
seller_id                                0
product_category_name_english            0
product_name_lenght                      0
product_description_lenght               0
product_photos_qty                       0
product_weight_g                         0
product_volume                           0
days_approved_deliverd                   0
days_estimated_deliverd                  0
dtype: int64

### <font color="orange">モデルの実装</font>

In [9]:
# random_stateナンバー
num_seed = 11

In [10]:
# lightgbmの関数
import lightgbm as lgb

def lightgbm_reg(X_train, X_test, y_train, y_test, num_seed=num_seed):

    # LightGBMのパラメータ
    params = {
        "boosting" : "gbdt",
        'objective': 'regression',  # 回帰タスク
        'metric': 'rmse',
        "min_data_in_leaf" : 30,
        'num_leaves': 16,            # 木の最大の葉の数
        "max_depth" : -1,
        'learning_rate': 0.05,       # 学習率
        'is_unbalance': True,        # 不均衡なクラスの扱い
        "verbose" : -1,
        "random_seed" : num_seed
    }

    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model_lgb = lgb.train(params=params,
                          num_boost_round=5000,
                          train_set=lgb_train,
                          valid_sets=[lgb_train, lgb_valid]
                         )

    # 予測
    y_pred = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)
    y_pred_lgb = np.round(y_pred).astype(int)

    y_pred_lgb[y_pred_lgb < 1] = 1
    y_pred_lgb[y_pred_lgb >5] = 5
    
    return y_pred_lgb, model_lgb

In [11]:
# ランダムフォレストの関数
from sklearn.ensemble import RandomForestClassifier

def rf_class(X_train, X_test, y_train, y_test, num_seed=num_seed):
    # ランダムフォレストの作成と学習
    model_rf = RandomForestClassifier(n_estimators=50, random_state=num_seed)
    model_rf.fit(X_train, y_train)
    # 予測
    y_pred_rf = model_rf.predict(X_test)
    
    return y_pred_rf, model_rf

- lightgbmとランダムフォレストの予測値から、最良のスコアを算出するためにoptunaで探索する。
- lightgbmとランダムフォレストの予測値が同じなら、最終予測はその値とする。
- それぞれの予測が異なれば、最終予測値をどうするかを、optunaで探索して決定する。

In [12]:
# optunaの関数
import optuna
from sklearn.metrics import cohen_kappa_score

def objective(trial, y_pred_lgb, y_pred_rf, y_test):
    
    # optunaの関数に渡すために、予測値をデータフレームに変更。
    df_pred = pd.DataFrame({"lgb" : y_pred_lgb,
                            "rf" : y_pred_rf})
    
    for pred_lgb in [1,2,3,4,5]:
        for pred_rf in [1,2,3,4,5]:
            
            # lightgbmとランダムフォレストの予測値が同じ場合は、その値を最終予測値とする。
            if pred_lgb == pred_rf:
                df_pred.loc[(df_pred["lgb"] == pred_lgb) & (df_pred["rf"] == pred_rf), "final"] = pred_lgb
            
            # lightgbmとランダムフォレストの予測値が異なる場合は、optunaの探索対象とし、ベストパラメーターを最終予測値とする。
            else:
                df_pred.loc[(df_pred["lgb"] == pred_lgb) & (df_pred["rf"] == pred_rf), "final"] = trial.suggest_int(f"line_{pred_lgb}_{pred_rf}", 1, 5)
            
            
    y_pred = df_pred["final"]
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    
    return kappa

In [13]:
# final_modelの関数（lightgbm, ランダムフォレスト、optunaを実行する）
from sklearn.metrics import cohen_kappa_score
from concurrent.futures import ThreadPoolExecutor

def final_model(X_train, X_test, y_train, y_test, num_seed):
    
    # モデルの学習、予測
    executer = ThreadPoolExecutor(max_workers=2)
    y_pred_lgb, model_lgb = executer.submit(lightgbm_reg, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed).result()
    y_pred_rf, model_rf = executer.submit(rf_class, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, num_seed=num_seed).result()
    
    
    # df_predを作成。
    # optuna用に df_pred を df_pred_for_optuna と df_pred_for_test に2分割。
    # df_pred_for_optuna でoptunaによるパラメータ調整。 df_pred_for_testにoptunaで探索したパラメータを用いて最終予測値を決定する。
    df_pred = pd.DataFrame({"pred_lgb" : y_pred_lgb,
                            "pred_rf" : y_pred_rf,
                            "act" : y_test})
    
    df_pred = df_pred.astype(int)
    
    df_pred_for_optuna = df_pred.sample(frac=0.1, random_state=num_seed)
    df_pred_for_test = df_pred.drop(df_pred_for_optuna.index)
    

    # Quadratic Weighted Kappaの計算
    kappa_lgb = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_lgb"], weights='quadratic')
    kappa_rf = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_rf"], weights='quadratic')
    
    
    # optuna
    optuna.logging.disable_default_handler()    # ログを非表示にする
    study = optuna.create_study(direction='maximize',  sampler=optuna.samplers.RandomSampler(num_seed))
    study.optimize(lambda trial: objective(trial,
                                           y_pred_lgb=df_pred_for_optuna["pred_lgb"],
                                           y_pred_rf=df_pred_for_optuna["pred_rf"],
                                           y_test=df_pred_for_optuna["act"]
                                           ),
                                           n_trials=500
                                           )
    
    best_parameters = study.best_params
    best_parameters["line_1_1"] = 1
    best_parameters["line_2_2"] = 2
    best_parameters["line_3_3"] = 3
    best_parameters["line_4_4"] = 4
    best_parameters["line_5_5"] = 5
    
    df_pred_for_test["key"] = df_pred_for_test.apply(lambda row : "line_" + str(row["pred_lgb"]) + "_" + str(row["pred_rf"]), axis=1)
        
    df_pred_for_test["pred_final"] = df_pred_for_test["key"].map(best_parameters)
    
    kappa_final = cohen_kappa_score(df_pred_for_test["act"], df_pred_for_test["pred_final"], weights='quadratic')
    
    return kappa_lgb, kappa_rf, kappa_final, model_lgb, model_rf, best_parameters

In [14]:
# モデルの学習
from sklearn.model_selection import StratifiedKFold

X = train_data.drop("review_score", axis=1)
y = train_data["review_score"]

scores_lgb = []
scores_rf = []
scores_final = []
models_lgb = []
models_rf = []
best_parameters = []
te_test_datas = []

# データの分割
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=num_seed)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()
    
    te_test_data = test_data.copy()
    
    
    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = train_data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        
        te_test_data[add_column_name] = test_data[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
        del te_test_data[category_column]
    
    te_test_datas.append(te_test_data)
    
    kappa_lgb, kappa_rf, kappa_final, model_lgb, model_rf, best_parameter = final_model(X_train=X_train,
                                                                                         X_test=X_test,
                                                                                         y_train=y_train,
                                                                                         y_test=y_test,
                                                                                         num_seed=num_seed)
    
    models_lgb.append(model_lgb)
    models_rf.append(model_rf)
    best_parameters.append(best_parameter)
    
    scores_lgb.append(kappa_lgb)
    scores_rf.append(kappa_rf)
    scores_final.append(kappa_final)
    
    print("="*20, i, "回目", "="*20)
    print(f'Quadratic Weighted Kappa (lgb)   : {np.round(kappa_lgb, 4)}')
    print(f'Quadratic Weighted Kappa (rf)    : {np.round(kappa_rf, 4)}')
    print(f'Quadratic Weighted Kappa (final) : {np.round(kappa_final, 4)}')

# 交差検証スコアの平均値と標準偏差を出力
scores_lgb = np.array(scores_lgb)
scores_rf = np.array(scores_rf)
scores_final = np.array(scores_final)
print("\n")
print(f"lgb_score -- mean : {np.round(scores_lgb.mean(), 4), }, std : {np.round(scores_lgb.std(), 4)}")
print(f"lgb_score -- mean : {np.round(scores_rf.mean(), 4), }, std : {np.round(scores_rf.std(), 4)}")
print(f"lgb_score -- mean : {np.round(scores_final.mean(), 4), }, std : {np.round(scores_final.std(), 4)}")

Quadratic Weighted Kappa (lgb)   : 0.4318
Quadratic Weighted Kappa (rf)    : 0.4359
Quadratic Weighted Kappa (final) : 0.4689
Quadratic Weighted Kappa (lgb)   : 0.434
Quadratic Weighted Kappa (rf)    : 0.4339
Quadratic Weighted Kappa (final) : 0.4703
Quadratic Weighted Kappa (lgb)   : 0.4375
Quadratic Weighted Kappa (rf)    : 0.4445
Quadratic Weighted Kappa (final) : 0.4767
Quadratic Weighted Kappa (lgb)   : 0.4387
Quadratic Weighted Kappa (rf)    : 0.4426
Quadratic Weighted Kappa (final) : 0.4762
Quadratic Weighted Kappa (lgb)   : 0.4348
Quadratic Weighted Kappa (rf)    : 0.4409
Quadratic Weighted Kappa (final) : 0.4757


lgb_score -- mean : (0.4354,), std : 0.0025
lgb_score -- mean : (0.4396,), std : 0.004
lgb_score -- mean : (0.4735,), std : 0.0033


### <font color="orange">テストデータの予測</font>

In [15]:
# テストデータで予測
df_final_pred = pd.DataFrame()

for i in range(len(models_lgb)):
    # ターゲットエンコーディングの際にテストデータに欠損値が出たため、平均値で欠損値補完
    te_test_datas[i]["te_seller_id"] = te_test_datas[i]["te_seller_id"].fillna(te_test_datas[i]["te_seller_id"].mean())

    # lightgbmモデルで予測
    tmp_pred_lgb = models_lgb[i].predict(te_test_datas[i], num_iteration=models_lgb[i].best_iteration)
    tmp_pred_lgb = np.round(tmp_pred_lgb).astype(int)
    tmp_pred_lgb[tmp_pred_lgb < 1] = 1
    tmp_pred_lgb[tmp_pred_lgb >5] = 5
    
    # ランダムフォレストモデルで予測
    tmp_pred_rf = models_rf[i].predict(te_test_datas[i])
    
    # optunaパラメータを適用するためのデータフレームを作成
    df_pred_lgb_rf = pd.DataFrame({"pred_lgb" : tmp_pred_lgb,
                                   "pred_rf" : tmp_pred_rf})
    df_pred_lgb_rf["pred_lgb"] = df_pred_lgb_rf["pred_lgb"].astype(int)
    df_pred_lgb_rf["pred_rf"] = df_pred_lgb_rf["pred_rf"].astype(int)
    
    df_pred_lgb_rf["key"] = df_pred_lgb_rf.apply(lambda row : "line_" + str(row["pred_lgb"]) + "_" + str(row["pred_rf"]), axis=1)
    df_pred_lgb_rf["final_pred"] = df_pred_lgb_rf["key"].map(best_parameters[i])
    
    column_name = "pred_" + str(i+1)
    df_final_pred[column_name] = df_pred_lgb_rf["final_pred"]

In [16]:
df_mode = df_final_pred.mode(axis=1)
df_mode.rename(columns={0 : "mode_1", 1 : "mode_2"}, inplace=True)

df_final_pred = pd.concat([df_final_pred, df_mode], axis=1)

In [17]:
def get_not_mode(row):
    if pd.notna(row["mode_2"]):
        # 最頻値が２つある場合は、残りの最頻値ではない予測値を取得する。
        for column_name in ["pred_1", "pred_2", "pred_3", "pred_4", "pred_5"]:
            if (row[column_name] != row["mode_1"]) and (row[column_name] != row["mode_2"]):
                return row[column_name]

df_final_pred["not_mode"] = df_final_pred.apply(get_not_mode, axis=1)

In [18]:
def decide_final_pred(row):
    
    # 最頻値が1つの場合
    if pd.isna(row["mode_2"]):
        # mode_1をreturn
        return row["mode_1"]
    
    # 最頻値が2つの場合
    else:
        # 最頻値でない予測値との差が同じ場合
        if abs(row["mode_1"] - row["not_mode"]) == abs(row["mode_2"] - row["not_mode"]):
            # 最頻値でない予測値をreturn
            return row["not_mode"]
        else:
            # mode_*-not_modeを計算。not_modeに近い方のmodeをreturn
            if abs(row["mode_1"] - row["not_mode"]) < abs(row["mode_2"] - row["not_mode"]):
                return row["mode_1"]
            else:
                return row["mode_2"]

df_final_pred["final_pred"] = df_final_pred.apply(decide_final_pred, axis=1)

In [19]:
# 確認
df_final_pred[df_final_pred["not_mode"].notna()]

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5,mode_1,mode_2,not_mode,final_pred
48,1,4,4,3,3,3.0,4.0,1.0,3.0
51,1,3,1,3,4,1.0,3.0,4.0,3.0
112,4,1,3,1,4,1.0,4.0,3.0,4.0
283,1,3,3,5,1,1.0,3.0,5.0,3.0
607,5,1,1,2,2,1.0,2.0,5.0,2.0
638,5,3,4,4,5,4.0,5.0,3.0,4.0


In [20]:
y_test_pred = df_final_pred["final_pred"].astype(int)