In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)

In [3]:
# 自作関数の呼び出し
import sys
sys.path.append("../../")    # <- 親フォルダから呼び出すためにpathを追加する。
from my_package.excute_notebook import execute_notebook

In [4]:
# データフレームを水平に表示する。
def show_many_dfs(*dfs, n=10):
    class HorizontalDisplay:
        def _repr_html_(self):
            template = '<div style="float: left; padding: 5px;">{}</div>'
            return  ''.join(template.format(df.head(n)._repr_html_()) for df in dfs)
    return HorizontalDisplay()

### <font color="orange">make_data.ipynbの実行</font>

In [5]:
import os

file_path = "../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv"

if os.path.exists(file_path):
    print("既にデータが保存されています。")
else:
    print("make_data.ipynbを実行します。")
    notebook_path = "../../1_datasets/1_make_dataset/make_data.ipynb"
    execute_notebook(notebook_path=notebook_path)
    print("実行が完了しました。")

既にデータが保存されています。


### <font color="orange">データの読み込み</font>

In [6]:
data = pd.read_csv("../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv")
data.head()

Unnamed: 0,review_score,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,4.0,delivered,3,1,0,1,0,1,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40,268,4,500,1976,8,-8
1,4.0,delivered,1,0,1,0,0,0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29,178,1,400,4693,12,-6
2,5.0,delivered,1,1,0,0,0,3,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46,232,1,420,9576,9,-18
3,5.0,delivered,1,1,0,0,0,1,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59,468,3,450,6000,13,-13
4,5.0,delivered,1,1,0,0,0,1,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38,316,4,250,11475,2,-10


### <font color="orange">モデルの実装</font>

In [7]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

In [8]:
# 混同行列の作成
from sklearn.metrics import confusion_matrix

def make_confusion_matrix(y_test, y_pred):
    
    labels = [1,2,3,4,5]

    columns_labels = ["pred_"+str(label) for label in labels]
    index_labels = ["act_"+str(label) for label in labels]
    
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=labels)

    cm = pd.DataFrame(cm, index=index_labels, columns=columns_labels)
    
    return cm

model_1    
objective : regression    
metric : rmse

In [10]:
X = data.drop("review_score", axis=1)
y = data["review_score"]

scores = []
models = []
confusion_matrixs = []
df_feature_importances = []

# LightGBMのパラメータ
params = {
    "boosting" : "gbdt",
    'objective': 'regression',  # 回帰タスク
    'metric': 'rmse',
    "min_data_in_leaf" : 30,
    'num_leaves': 16,            # 木の最大の葉の数
    "max_depth" : -1,
    'learning_rate': 0.05,       # 学習率
    'is_unbalance': True,        # 不均衡なクラスの扱い
    "verbose" : -1
}

# データの分割
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()
    
    
    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    
    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model = lgb.train(params=params,
                      num_boost_round=5000,
                      train_set=lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      )
    models.append(model)

    # 予測
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred_rounded = np.round(y_pred).astype(int)
    
    y_pred_rounded[y_pred_rounded < 1] = 1
    y_pred_rounded[y_pred_rounded >5] = 5
    
    # 混同行列を作成し、リストに格納する。
    tmp_cm = make_confusion_matrix(y_test=y_test, y_pred=y_pred_rounded)
    confusion_matrixs.append(tmp_cm)

    # 変数重要度のデータフレームを作成し、リストに格納する。
    tmp_df_feature_imprtance = make_df_feature_importance(X_train=X_train, model=model)
    df_feature_importances.append(tmp_df_feature_imprtance)
    
    # Quadratic Weighted Kappaの計算
    kappa = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
    scores.append(kappa)
    print("="*20 + str(i) + "回目" + "="*20)
    print(f'Quadratic Weighted Kappa: {np.round(kappa, 4)}')

scores = np.array(scores)
print("\n")
print(f"Quadratic Weighted Kappa -- mean : {np.round(scores.mean(), 4)},  std : {np.round(scores.std(), 4)}")

Quadratic Weighted Kappa: 0.4345
Quadratic Weighted Kappa: 0.4358
Quadratic Weighted Kappa: 0.4375
Quadratic Weighted Kappa: 0.4364
Quadratic Weighted Kappa: 0.4291


Quadratic Weighted Kappa -- mean : 0.4346,  std : 0.003


In [11]:
# 混同行列の出力
show_many_dfs(confusion_matrixs[0],
              confusion_matrixs[1],
              confusion_matrixs[2],
              confusion_matrixs[3],
              confusion_matrixs[4])

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,325,709,325,949,164
act_2,38,110,81,393,60
act_3,34,97,153,1126,294
act_4,18,71,211,2605,999
act_5,27,101,401,7459,3713

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,344,689,339,933,167
act_2,32,89,81,396,84
act_3,29,97,145,1122,311
act_4,16,74,208,2646,960
act_5,21,107,411,7260,3902

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,343,687,336,962,144
act_2,39,103,74,409,57
act_3,30,100,155,1110,309
act_4,15,69,200,2669,951
act_5,22,115,449,7317,3798

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,341,686,330,950,164
act_2,35,105,95,377,70
act_3,38,86,165,1144,272
act_4,14,61,194,2702,933
act_5,23,91,399,7538,3650

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,309,738,325,935,164
act_2,33,89,86,385,90
act_3,29,103,159,1097,316
act_4,17,59,230,2684,914
act_5,28,117,429,7358,3768


In [12]:
# 変数重要度の出力
show_many_dfs(df_feature_importances[0],
              df_feature_importances[1],
              df_feature_importances[2],
              df_feature_importances[3],
              df_feature_importances[4]
              )

Unnamed: 0,columns,value
13,days_estimated_deliverd,248709.609083
14,te_order_status,144724.015122
15,te_seller_id,107457.650048
6,payment_value,66940.930717
12,days_approved_deliverd,48989.774797
8,product_description_lenght,39900.904446
11,product_volume,36126.997987
10,product_weight_g,36063.335099
7,product_name_lenght,25951.483025
16,te_product_category_name_english,23003.720059

Unnamed: 0,columns,value
13,days_estimated_deliverd,249463.910339
14,te_order_status,146709.494473
15,te_seller_id,104575.779977
6,payment_value,67511.741451
12,days_approved_deliverd,50964.551463
8,product_description_lenght,40713.254571
11,product_volume,37226.367571
10,product_weight_g,34256.291489
7,product_name_lenght,26196.27738
16,te_product_category_name_english,23491.797304

Unnamed: 0,columns,value
13,days_estimated_deliverd,249099.647004
14,te_order_status,144182.68151
15,te_seller_id,106790.738211
6,payment_value,66536.901608
12,days_approved_deliverd,48778.292007
8,product_description_lenght,41067.534072
11,product_volume,36387.080078
10,product_weight_g,35229.691061
7,product_name_lenght,25882.182363
16,te_product_category_name_english,23695.465351

Unnamed: 0,columns,value
13,days_estimated_deliverd,249529.618452
14,te_order_status,144378.636908
15,te_seller_id,104318.140826
6,payment_value,67672.418477
12,days_approved_deliverd,47728.089081
8,product_description_lenght,39901.966783
11,product_volume,37080.432844
10,product_weight_g,36438.722943
7,product_name_lenght,25286.40226
16,te_product_category_name_english,22749.858178

Unnamed: 0,columns,value
13,days_estimated_deliverd,245372.084736
14,te_order_status,148760.213363
15,te_seller_id,104883.249564
6,payment_value,65573.750654
12,days_approved_deliverd,49622.497904
8,product_description_lenght,39909.272585
10,product_weight_g,37285.61587
11,product_volume,35982.437093
7,product_name_lenght,24724.759725
16,te_product_category_name_english,24682.406614


model_2    
objective : multiclass    
metric : multi_logloss

In [13]:
X = data.drop("review_score", axis=1)
y = data["review_score"]

scores = []
models = []
confusion_matrixs = []
df_feature_importances = []

# LightGBMのパラメータ
params = {
    "boosting" : "gbdt",
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    "num_class" : 5,
    "min_data_in_leaf" : 30,
    'num_leaves': 16,            # 木の最大の葉の数
    "max_depth" : -1,
    'learning_rate': 0.05,       # 学習率
    'is_unbalance': True,        # 不均衡なクラスの扱い
    "verbose" : -1,
    "bagging_freq" : 1,
    "bagging_fraction" : 0.9
    
}

# データの分割
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()
    
    y_train -= 1
    y_test -= 1
    
    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    
    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model = lgb.train(params=params,
                      num_boost_round=5000,
                      train_set=lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      )
    models.append(model)

    # 予測
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred = np.argmax(y_pred, axis=1)
    
    y_pred += 1
    y_test += 1

    # 混同行列を作成し、リストに格納する。
    tmp_cm = make_confusion_matrix(y_test=y_test, y_pred=y_pred)
    confusion_matrixs.append(tmp_cm)

    # 変数重要度のデータフレームを作成し、リストに格納する。
    tmp_df_feature_imprtance = make_df_feature_importance(X_train=X_train, model=model)
    df_feature_importances.append(tmp_df_feature_imprtance)
    
    # Quadratic Weighted Kappaの計算
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    scores.append(kappa)
    print("="*20 + str(i) + "回目" + "="*20)
    print(f'Quadratic Weighted Kappa: {np.round(kappa, 4)}')

scores = np.array(scores)
print("\n")
print(f"Quadratic Weighted Kappa -- mean : {np.round(scores.mean(), 4)},  std : {np.round(scores.std(), 4)}")

Quadratic Weighted Kappa: 0.447
Quadratic Weighted Kappa: 0.436
Quadratic Weighted Kappa: 0.4401
Quadratic Weighted Kappa: 0.4433
Quadratic Weighted Kappa: 0.4431


Quadratic Weighted Kappa -- mean : 0.4419,  std : 0.0037


In [14]:
# 混同行列の出力
show_many_dfs(confusion_matrixs[0],
              confusion_matrixs[1],
              confusion_matrixs[2],
              confusion_matrixs[3],
              confusion_matrixs[4])

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1170,14,42,69,1177
act_2,166,20,14,41,441
act_3,163,7,48,85,1401
act_4,148,8,30,189,3529
act_5,234,14,78,380,10995

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1175,14,29,58,1196
act_2,136,22,3,38,483
act_3,167,4,40,77,1416
act_4,143,12,32,199,3518
act_5,251,11,83,367,10989

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1171,16,26,90,1169
act_2,154,20,11,30,467
act_3,170,5,32,78,1419
act_4,129,9,30,181,3555
act_5,247,29,68,415,10942

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1155,18,34,78,1186
act_2,153,21,12,33,463
act_3,164,9,44,79,1409
act_4,131,6,34,208,3525
act_5,236,13,51,329,11072

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1188,23,33,71,1156
act_2,143,17,9,30,484
act_3,170,6,38,80,1410
act_4,144,5,36,191,3528
act_5,260,19,80,350,10991


In [15]:
# 変数重要度の出力
show_many_dfs(df_feature_importances[0],
              df_feature_importances[1],
              df_feature_importances[2],
              df_feature_importances[3],
              df_feature_importances[4]
              )

Unnamed: 0,columns,value
13,days_estimated_deliverd,187382.330292
6,payment_value,158729.045564
15,te_seller_id,133109.366268
12,days_approved_deliverd,111355.009096
8,product_description_lenght,107216.507776
11,product_volume,97903.379004
10,product_weight_g,89901.743298
7,product_name_lenght,71592.7538
14,te_order_status,57958.105232
16,te_product_category_name_english,55455.051445

Unnamed: 0,columns,value
13,days_estimated_deliverd,188695.065265
6,payment_value,158000.362455
15,te_seller_id,132303.185571
12,days_approved_deliverd,110940.050122
8,product_description_lenght,108559.980966
11,product_volume,96389.575997
10,product_weight_g,91778.616407
7,product_name_lenght,71111.761907
14,te_order_status,59129.436597
16,te_product_category_name_english,55938.793197

Unnamed: 0,columns,value
13,days_estimated_deliverd,190463.159876
6,payment_value,157157.405456
15,te_seller_id,133619.721845
12,days_approved_deliverd,111328.541268
8,product_description_lenght,105936.536006
11,product_volume,97931.657925
10,product_weight_g,89690.193023
7,product_name_lenght,70989.237869
14,te_order_status,57809.890539
16,te_product_category_name_english,54562.538918

Unnamed: 0,columns,value
13,days_estimated_deliverd,189787.877399
6,payment_value,155960.330947
15,te_seller_id,131892.493348
12,days_approved_deliverd,111358.175228
8,product_description_lenght,107511.380145
11,product_volume,96517.102018
10,product_weight_g,90365.428476
7,product_name_lenght,71112.506803
14,te_order_status,58168.696776
16,te_product_category_name_english,54255.944249

Unnamed: 0,columns,value
13,days_estimated_deliverd,187354.784366
6,payment_value,157886.453368
15,te_seller_id,132659.189443
12,days_approved_deliverd,109952.100211
8,product_description_lenght,107677.430872
11,product_volume,96258.690793
10,product_weight_g,89949.788506
7,product_name_lenght,71550.411061
14,te_order_status,59825.877037
16,te_product_category_name_english,55269.894515
