In [11]:
from pprint import pprint
import pandas as pd
import sklearn
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

plt.style.use("ggplot")
sns.set(font="meiryo")

In [21]:
# データセットの取得
X, y = shap.datasets.adult()

# 説明変数の表示
display(X.head(10))

Unnamed: 0,Age,Workclass,Education-Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
0,39.0,7,13.0,4,1,0,4,1,2174.0,0.0,40.0,39
1,50.0,6,13.0,2,4,4,4,1,0.0,0.0,13.0,39
2,38.0,4,9.0,0,6,0,4,1,0.0,0.0,40.0,39
3,53.0,4,7.0,2,6,4,2,1,0.0,0.0,40.0,39
4,28.0,4,13.0,2,10,5,2,0,0.0,0.0,40.0,5
5,37.0,4,14.0,2,4,5,4,0,0.0,0.0,40.0,39
6,49.0,4,5.0,3,8,0,2,0,0.0,0.0,16.0,23
7,52.0,6,9.0,2,4,4,4,1,0.0,0.0,45.0,39
8,31.0,4,14.0,4,10,0,4,0,14084.0,0.0,50.0,39
9,42.0,4,13.0,2,4,4,4,1,5178.0,0.0,40.0,39


In [22]:
# 目的変数の表示
print(y[:10])

[False False False False False False False  True  True  True]


In [14]:
# データサイズの表示
print(X.shape)

(32561, 12)


In [15]:
# 各列のデータ型の表示
print(X.dtypes)

Age               float32
Workclass            int8
Education-Num     float32
Marital Status       int8
Occupation           int8
Relationship        int32
Race                 int8
Sex                  int8
Capital Gain      float32
Capital Loss      float32
Hours per week    float32
Country              int8
dtype: object


In [16]:
# 各列の欠損数：欠損値がないことを確認
print(X.isna().sum())

Age               0
Workclass         0
Education-Num     0
Marital Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital Gain      0
Capital Loss      0
Hours per week    0
Country           0
dtype: int64


In [17]:
# カテゴリ変数の指定
for c in X.columns:
    if X[c].dtype == np.int8:
        # データ型を変更
        X[c] = X[c].astype("category")

# 各列のデータ型の表示
print(X.dtypes)

Age                float32
Workclass         category
Education-Num      float32
Marital Status    category
Occupation        category
Relationship         int32
Race              category
Sex               category
Capital Gain       float32
Capital Loss       float32
Hours per week     float32
Country           category
dtype: object


In [18]:
# 学習用とテスト用へのデータ分割
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

In [24]:
# モデルの学習
model = lgb.LGBMClassifier()
model.fit(X_train.values, y_train)

# 予測値の算出
# predict_prop の返り値は（0の確率, 1の確率）の2列からなるので、1の確率のみを取り出す
y_pred_prop = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# 予測値の評価
val = sklearn.metrics.roc_auc_score(y_test, y_pred_prop)
print(f"auc-rocスコア: {val}")

auc-rocスコア: 0.9277370504224567


In [20]:
# 予測値の混同行列
display(pd.crosstab(y_test, y_pred))

col_0,False,True
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
False,5779,368
True,694,1300
