In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)

In [3]:
# 自作関数の呼び出し
import sys
sys.path.append("../../")    # <- 親フォルダから呼び出すためにpathを追加する。
from my_package.excute_notebook import execute_notebook

In [4]:
# データフレームを水平に表示する。
def show_many_dfs(*dfs, n=10):
    class HorizontalDisplay:
        def _repr_html_(self):
            template = '<div style="float: left; padding: 5px;">{}</div>'
            return  ''.join(template.format(df.head(n)._repr_html_()) for df in dfs)
    return HorizontalDisplay()

### <font color="orange">make_data.ipynbの実行</font>

In [5]:
import os

file_path = "../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv"

if os.path.exists(file_path):
    print("既にデータが保存されています。")
else:
    print("make_data.ipynbを実行します。")
    notebook_path = "../../1_datasets/1_make_dataset/make_data.ipynb"
    execute_notebook(notebook_path=notebook_path)
    print("実行が完了しました。")

既にデータが保存されています。


### <font color="orange">データの読み込み</font>

In [6]:
data = pd.read_csv("../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv")
data.head()

Unnamed: 0,review_score,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,4.0,delivered,3,1,0,1,0,1,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40,268,4,500,1976,8,-8
1,4.0,delivered,1,0,1,0,0,0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29,178,1,400,4693,12,-6
2,5.0,delivered,1,1,0,0,0,3,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46,232,1,420,9576,9,-18
3,5.0,delivered,1,1,0,0,0,1,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59,468,3,450,6000,13,-13
4,5.0,delivered,1,1,0,0,0,1,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38,316,4,250,11475,2,-10


### <font color="orange">モデルの実装</font>

In [7]:
# 混同行列の作成
from sklearn.metrics import confusion_matrix

def make_confusion_matrix(y_test, y_pred):
    
    labels = [1,2,3,4,5]

    columns_labels = ["pred_"+str(label) for label in labels]
    index_labels = ["act_"+str(label) for label in labels]
    
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=labels)

    cm = pd.DataFrame(cm, index=index_labels, columns=columns_labels)
    
    return cm

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score


X = data.drop("review_score", axis=1)
y = data["review_score"]

scores = []
models = []
confusion_matrixs = []

# データの分割
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()


    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    # 予測
    model = RandomForestClassifier(n_estimators=50, random_state=11)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # 混同行列を作成し、リストに格納する。
    tmp_cm = make_confusion_matrix(y_test=y_test, y_pred=y_pred)
    confusion_matrixs.append(tmp_cm)

    # Quadratic Weighted Kappaの計算
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    scores.append(kappa)
    print("="*20 + str(i) + "回目" + "="*20)
    print(f'Quadratic Weighted Kappa: {np.round(kappa, 4)}')
    

scores = np.array(scores)
print("\n")
print(f"Quadratic Weighted Kappa -- mean : {np.round(scores.mean(), 4)},  std : {np.round(scores.std(), 4)}")

Quadratic Weighted Kappa: 0.44
Quadratic Weighted Kappa: 0.4336
Quadratic Weighted Kappa: 0.4425
Quadratic Weighted Kappa: 0.4382
Quadratic Weighted Kappa: 0.4379


Quadratic Weighted Kappa -- mean : 0.4384,  std : 0.0029


In [9]:
# 混同行列の出力
show_many_dfs(confusion_matrixs[0],
              confusion_matrixs[1],
              confusion_matrixs[2],
              confusion_matrixs[3],
              confusion_matrixs[4])

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1165,22,40,90,1155
act_2,163,22,15,43,439
act_3,178,7,54,117,1348
act_4,155,12,43,301,3393
act_5,253,21,124,658,10645

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1179,12,34,90,1157
act_2,135,29,7,49,462
act_3,173,6,58,120,1347
act_4,152,17,67,321,3347
act_5,279,20,122,609,10671

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1175,24,43,107,1123
act_2,157,26,12,38,449
act_3,162,10,49,139,1344
act_4,150,20,48,320,3366
act_5,268,25,118,623,10667

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1170,15,25,95,1166
act_2,159,23,6,38,456
act_3,172,7,58,101,1367
act_4,138,8,46,323,3389
act_5,249,19,122,608,10703

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,1187,18,47,91,1128
act_2,137,24,9,39,474
act_3,165,7,49,130,1353
act_4,147,11,55,338,3353
act_5,286,17,120,611,10666
