In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns

In [2]:
# 自作関数の呼び出し
import sys
sys.path.append("../../")    # <- 親フォルダから呼び出すためにpathを追加する。
from my_package.excute_notebook import execute_notebook

In [3]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)

### <font color="orange">make_data.ipynbの実行</font>

In [4]:
import os

file_path = "../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv"

if os.path.exists(file_path):
    print("既にデータが保存されています。")
else:
    print("make_data.ipynbを実行します。")
    notebook_path = "../../1_datasets/1_make_dataset/make_data.ipynb"
    execute_notebook(notebook_path=notebook_path)
    print("実行が完了しました。")

既にデータが保存されています。


### <font color="orange">データの読み込み</font>

In [5]:
data = pd.read_csv("../../1_datasets/1_make_dataset/tmp_data_folder/3_main_data/train_data.csv")
data.head()

Unnamed: 0,review_score,order_status,count_payment_sequential,payment_type_credit_card,payment_type_boleto,payment_type_voucher,payment_type_debit_card,mean_credit_card_payment_installments,payment_value,seller_id,product_category_name_english,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_volume,days_approved_deliverd,days_estimated_deliverd
0,4.0,delivered,3,1,0,1,0,1,38.71,3504c0cb71d7fa48d967e0e4c94d59d9,housewares,40,268,4,500,1976,8,-8
1,4.0,delivered,1,0,1,0,0,0,141.46,289cdb325fb7e7f891c38608bf9e0962,perfumery,29,178,1,400,4693,12,-6
2,5.0,delivered,1,1,0,0,0,3,179.12,4869f7a5dfa277a7dca6462dcf3b52b2,auto,46,232,1,420,9576,9,-18
3,5.0,delivered,1,1,0,0,0,1,72.2,66922902710d126a0e7d26b0e3805106,pet_shop,59,468,3,450,6000,13,-13
4,5.0,delivered,1,1,0,0,0,1,28.62,2c9e548be18521d1c43cde1c582c6de8,stationery,38,316,4,250,11475,2,-10


### <font color="orange">モデルの実装</font>

In [6]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import cohen_kappa_score
import numpy as np

# Quadratic Weighted Kappaの目的関数
def qwk_loss(preds, train_data):
    labels = train_data.get_label()
    preds_rounded = np.round(preds).astype(int)
    kappa = cohen_kappa_score(labels, preds_rounded, weights='quadratic')
    return 'qwk', -kappa, True

X = data.drop("review_score", axis=1)
y = data["review_score"]

# LightGBMのパラメータ
params = {
    "boosting" : "gbdt",
    'objective': 'regression',  # 回帰タスク
    'metric': 'custom',
    "min_data_in_leaf" : 30,
    'num_leaves': 16,            # 木の最大の葉の数
    'learning_rate': 0.05,       # 学習率
    'is_unbalance': True,        # 不均衡なクラスの扱い
    "verbose" : -1,
    "max_depth" : -1,
    "bagging_fraction" : 0.9,
    "bagging_freq" : 0,
}

# データの分割
kf = StratifiedKFold(n_splits=2, shuffle=True)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()

    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model = lgb.train(params=params,
                      num_boost_round=5000,
                      train_set=lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      feval=qwk_loss,
                      )

    # 予測
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred_rounded = np.round(y_pred).astype(int)

    y_pred_rounded[y_pred_rounded < 1] = 1
    y_pred_rounded[y_pred_rounded >5] = 5

    # Quadratic Weighted Kappaの計算
    kappa = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
    print(f'Quadratic Weighted Kappa: {kappa}')
    
    break  # 後で削除


Quadratic Weighted Kappa: 0.4280219013512222


In [7]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import numpy as np

X = data.drop("review_score", axis=1)
y = data["review_score"]

# LightGBMのパラメータ
params = {
    "boosting" : "gbdt",
    'objective': 'regression',  # 回帰タスク
    'metric': 'rmse',
    "min_data_in_leaf" : 30,
    'num_leaves': 16,            # 木の最大の葉の数
    "max_depth" : -1,
    'learning_rate': 0.05,       # 学習率
    'is_unbalance': True,        # 不均衡なクラスの扱い
    "verbose" : -1
}

# データの分割
kf = StratifiedKFold(n_splits=2, shuffle=True)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()
    
    
    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    
    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model = lgb.train(params=params,
                      num_boost_round=5000,
                      train_set=lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      )

    # 予測
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred_rounded = np.round(y_pred).astype(int)
    
    y_pred_rounded[y_pred_rounded < 1] = 1
    y_pred_rounded[y_pred_rounded >5] = 5
    
    # Quadratic Weighted Kappaの計算
    kappa = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
    print(f'Quadratic Weighted Kappa: {kappa}')
    
    break  # 後で削除

Quadratic Weighted Kappa: 0.43197634300262444


In [8]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import numpy as np

X = data.drop("review_score", axis=1)
y = data["review_score"]

# LightGBMのパラメータ
params = {
    "boosting" : "gbdt",
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    "num_class" : 5,
    "min_data_in_leaf" : 30,
    'num_leaves': 16,            # 木の最大の葉の数
    "max_depth" : -1,
    'learning_rate': 0.05,       # 学習率
    'is_unbalance': True,        # 不均衡なクラスの扱い
    "verbose" : -1,
    "bagging_freq" : 1,
    "bagging_fraction" : 0.9
    
}

# データの分割
kf = StratifiedKFold(n_splits=2, shuffle=True)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):
    
    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()
    
    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()
    
    y_train -= 1
    y_test -= 1
    
    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict() 
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]
    
    
    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model = lgb.train(params=params,
                      num_boost_round=5000,
                      train_set=lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      )

    # 予測
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred_rounded = np.argmax(y_pred, axis=1)
    
    y_pred_rounded += 1
    y_test += 1

    
    # Quadratic Weighted Kappaの計算
    kappa = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
    print(f'Quadratic Weighted Kappa: {kappa}')
    
    break  # 後で削除

Quadratic Weighted Kappa: 0.43502150955912977


In [9]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import numpy as np

def quadratic_weighted_kappa(preds, true):
    preds = np.argmax(preds, axis=1) + 1
    true = true.get_label() + 1
    return 'qwk', cohen_kappa_score(true, preds, weights='quadratic'), True

X = data.drop("review_score", axis=1)
y = data["review_score"]

# LightGBMのパラメータ
params = {
    "boosting": "gbdt",
    'objective': 'multiclass',
    'metric': 'custom',
    "num_class": 5,
    "min_data_in_leaf": 30,
    'num_leaves': 16,
    "max_depth": -1,
    'learning_rate': 0.05,
    'is_unbalance': True,
    "verbose": -1,
    "bagging_freq": 1,
    "bagging_fraction": 0.9,
}

# データの分割
kf = StratifiedKFold(n_splits=2, shuffle=True)

for i, (tr_index, te_index) in enumerate(kf.split(X, y), 1):

    X_train = X.iloc[tr_index].copy()
    y_train = y.iloc[tr_index].copy()

    X_test = X.iloc[te_index].copy()
    y_test = y.iloc[te_index].copy()

    y_train -= 1
    y_test -= 1

    # ターゲットエンコーディング
    for category_column in X_train.select_dtypes(include="object"):
        tmp_te = data.groupby(category_column)["review_score"].mean().to_dict()
        add_column_name = "te_" + category_column
        X_train[add_column_name] = X_train[category_column].map(tmp_te)
        X_test[add_column_name] = X_test[category_column].map(tmp_te)
        del X_train[category_column]
        del X_test[category_column]

    # LightGBMのモデルの作成と学習
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_test, y_test, reference=lgb_train)

    model = lgb.train(params=params,
                      num_boost_round=5000,
                      train_set=lgb_train,
                      valid_sets=[lgb_train, lgb_valid],
                      feval=quadratic_weighted_kappa
                      )

    # 予測
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred_rounded = np.argmax(y_pred, axis=1)
    
    y_pred_rounded += 1
    y_test += 1
    
    # Quadratic Weighted Kappaの計算
    kappa = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
    print(f'Quadratic Weighted Kappa: {kappa}')

    break  # 後で削除


Quadratic Weighted Kappa: 0.4300953561220655


In [10]:
from sklearn.metrics import confusion_matrix

labels = [1,2,3,4,5]

cm = confusion_matrix(y_true=y_test, y_pred=y_pred_rounded, labels=labels)

columns_labels = ["pred_"+str(label) for label in labels]
index_labels = ["act_"+str(label) for label in labels]

cm = pd.DataFrame(cm, index=index_labels, columns=columns_labels)
cm

Unnamed: 0,pred_1,pred_2,pred_3,pred_4,pred_5
act_1,2823,53,106,299,2898
act_2,395,57,29,91,1134
act_3,415,23,112,302,3408
act_4,358,29,134,671,8568
act_5,650,72,297,1476,26757


In [11]:
im = pd.DataFrame({"columns":X_train.columns, "value":model.feature_importance(importance_type="gain")})
im.sort_values(by="value", ascending=False).head(10)

Unnamed: 0,columns,value
13,days_estimated_deliverd,136346.476201
6,payment_value,128479.914525
15,te_seller_id,98861.987274
8,product_description_lenght,87035.529661
12,days_approved_deliverd,84982.361387
11,product_volume,79489.16589
10,product_weight_g,74016.888311
7,product_name_lenght,57233.209969
16,te_product_category_name_english,43642.264604
5,mean_credit_card_payment_installments,38529.354799
