In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns

In [7]:
# 自作関数の呼び出し
import sys
sys.path.append("../../")    # <- 親フォルダから呼び出すためにpathを追加する。
from my_package.excute_notebook import execute_notebook

In [8]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)

### <font color="orange">make_data.ipynbの実行</font>

In [9]:
import os

file_path = "../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv"

if os.path.exists(file_path):
    print("既にデータが保存されています。")
else:
    print("make_data.ipynbを実行します。")
    notebook_path = "../../1_datasets/1_make_dataset/make_data.ipynb"
    execute_notebook(notebook_path=notebook_path)
    print("実行が完了しました。")

既にデータが保存されています。


### <font color="orange">データの読み込み</font>

In [10]:
data = pd.read_csv("../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv")
data.head()

Unnamed: 0,review_score,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,4.0,delivered,3,1,0,1,0,1,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40,268,4,500,1976,8,-8
1,4.0,delivered,1,0,1,0,0,0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29,178,1,400,4693,12,-6
2,5.0,delivered,1,1,0,0,0,3,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46,232,1,420,9576,9,-18
3,5.0,delivered,1,1,0,0,0,1,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59,468,3,450,6000,13,-13
4,5.0,delivered,1,1,0,0,0,1,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38,316,4,250,11475,2,-10


### <font color="orange">モデルの実装</font>

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

scores = []

X = data.drop("review_score", axis=1)
y = data["review_score"]

# データの分割
kf = StratifiedKFold(n_splits=2, shuffle=True)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()


    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    # 予測
    model = RandomForestClassifier(n_estimators=50)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Quadratic Weighted Kappaの計算
    kappa = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    print(f'Quadratic Weighted Kappa: {kappa}')
    
    scores.append(kappa)
    
    break # 後で削除

print(f"mean_soce : {sum(scores)/len(scores)}")

Quadratic Weighted Kappa: 0.4275972037719127
mean_soce : 0.4275972037719127


In [12]:
from sklearn.metrics import confusion_matrix

labels = [1,2,3,4,5]

cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=labels)

columns_labels = ["pred_"+str(label) for label in labels]
index_labels = ["act_"+str(label) for label in labels]

cm = pd.DataFrame(cm, index=index_labels, columns=columns_labels)
cm

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,2835,29,99,239,2977
act_2,394,37,35,97,1143
act_3,413,10,125,274,3438
act_4,347,19,123,740,8531
act_5,631,45,282,1549,26745


In [13]:
im = pd.DataFrame({"columns":X_train.columns, "value":model.feature_importances_})
im.sort_values(by="value", ascending=False).head(10)

Unnamed: 0,columns,value
13,days_estimated_deliverd,0.137581
6,payment_value,0.131671
12,days_approved_deliverd,0.10869
15,te_seller_id,0.091578
8,product_description_lenght,0.09001
11,product_volume,0.082232
10,product_weight_g,0.080671
7,product_name_lenght,0.071301
5,mean_credit_card_payment_installments,0.062237
16,te_product_category_name_english,0.051906
