# SHAPによる解釈性の可視化
- [機械学習モデルの説明性・解釈性について -SHAPによる実践あり](https://cpp-learning.com/interpretable-model/)

In [1]:
import pandas as pd
from matplotlib import pyplot as plt

import lightgbm as lgb

from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
import shap

ModuleNotFoundError: No module named 'shap'

### 前処理

In [None]:
# Read the three CSV files under res/ into DataFrames in one pass.
train_df, test_df, submission_df = map(
    pd.read_csv,
    ('res/train.csv', 'res/test.csv', 'res/sample_submit.csv'),
)

In [None]:
# Group both frames so each preprocessing step can loop over them.
# The list stores references, so in-place edits made while iterating
# over it modify train_df and test_df themselves.
combine = [train_df, test_df]

In [None]:
# for df in combine:
# train_df.drop(columns=['index'], inplace=True)

### 1. age -> 一定年齢ごとにグループ分けする
- ~22, ~28, ~34, ~40, ~46, ~52, ~64で分けてみる

In [None]:
# Bucket 'age' into ordinal codes 0-6:
#   <=22 -> 0, (22, 28] -> 1, (28, 34] -> 2, (34, 40] -> 3,
#   (40, 46] -> 4, (46, 52] -> 5, >52 -> 6
age_upper_edges = (22, 28, 34, 40, 46, 52)

for frame in combine:
    # Work from a snapshot so every mask is evaluated on the raw ages,
    # not on codes written by an earlier bucket.
    raw_age = frame['age'].copy()

    previous_edge = None
    for code, edge in enumerate(age_upper_edges):
        in_bucket = raw_age <= edge
        if previous_edge is not None:
            in_bucket = in_bucket & (previous_edge < raw_age)
        frame.loc[in_bucket, 'age'] = code
        previous_edge = edge

    # Everything above the last edge falls into the final bucket.
    frame.loc[raw_age > age_upper_edges[-1], 'age'] = len(age_upper_edges)

train_df['age']

### 2. workclass -> シンプルにラベルを振る

In [None]:
# Label-encode 'workclass': every distinct value seen in the training
# frame gets an integer code, numbered in the order unique() returns them.
workclass_map = {
    label: code
    for code, label in enumerate(train_df['workclass'].unique())
}

# Apply the same mapping to both frames. Values absent from the mapping
# become NaN under map(), in which case the int cast raises.
for frame in combine:
    frame['workclass'] = frame['workclass'].map(workclass_map).astype(int)

train_df.head()

### 3. fnlwgt -> 確認中
- Yと関連性が薄そうなので一旦削除

In [None]:
# Show how many distinct values 'fnlwgt' takes and what dtype it has.
fnlwgt = train_df['fnlwgt']
fnlwgt.nunique(), fnlwgt.dtype

In [None]:
# Remove 'fnlwgt' from both frames in place; `columns=` already selects
# the axis, so no separate axis argument is needed.
for frame in combine:
    frame.drop(columns='fnlwgt', inplace=True)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

### 4. education
- 余力で*th系をまとめるなどの工夫をする

In [None]:
# Label-encode 'education' using the distinct values of the training frame,
# numbered in the order unique() returns them.
education_levels = train_df['education'].unique()
education_map = dict(zip(education_levels, range(len(education_levels))))

# Both frames share the mapping so the codes stay comparable.
for frame in combine:
    encoded = frame['education'].map(education_map)
    frame['education'] = encoded.astype(int)

train_df.head()

### 5. education-num
- そのままでOK
- 余力で項目数を削減する方向での分類を行う

In [None]:
# education_num_map = {}
# for index, item in enumerate(train_df['education-num'].unique()):
#     education_num_map[item] = index

# for df in combine:
#     df['education-num'] = df['education-num'].map( education_num_map ).astype(int)
    
# train_df.head()

### 6. marital-status
- シンプルにラベル付けする

In [None]:
# Label-encode 'marital-status' with codes derived from the training frame.
column_name = 'marital-status'
replace_map = {
    value: position
    for position, value in enumerate(train_df[column_name].unique())
}

for frame in combine:
    frame[column_name] = frame[column_name].map(replace_map).astype(int)

train_df.head()

### 7. occupation
- 一旦ラベル付け
- 余力でone-hotエンコーディングをしてみる

In [None]:
# Label-encode 'occupation': distinct training values are numbered in the
# order unique() returns them.
column_name = 'occupation'
distinct_values = train_df[column_name].unique()
replace_map = dict(zip(distinct_values, range(len(distinct_values))))

# Re-use the training-derived codes for both frames.
for frame in combine:
    codes = frame[column_name].map(replace_map)
    frame[column_name] = codes.astype(int)

train_df.head()

### 8. relationship
- husbandかそれ以外か、という分類でも問題なさそう
- 余力でone-hotエンコーディングをしてみる

In [None]:
# Collapse 'relationship' to a binary flag: 1 for 'Husband', 0 for
# everything else.
for df in combine:
    # The comparison yields a boolean Series; casting it stores real
    # integers in the column. (The previous astype(int) call discarded its
    # result, leaving an object column holding 0.0 / 1.0.)
    df['relationship'] = (df['relationship'] == 'Husband').astype(int)

# Sanity check: at most two distinct values should remain.
train_df['relationship'].nunique()

### 9. race
- 一旦シンプルにラベリング

In [None]:
# Label-encode 'race' with integer codes taken from the training frame.
column_name = 'race'
replace_map = {
    category: code
    for code, category in enumerate(train_df[column_name].unique())
}

for frame in combine:
    frame[column_name] = frame[column_name].map(replace_map).astype(int)

train_df.head()

### 10. native-country
- 一旦シンプルにラベリング

In [None]:
# Label-encode 'native-country' using the distinct values of the training
# frame, numbered in the order unique() returns them.
column_name = 'native-country'
countries = train_df[column_name].unique()
replace_map = dict(zip(countries, range(len(countries))))

# map() yields NaN for values missing from the mapping, so the int cast
# raises if the other frame holds a country the training frame lacks.
for frame in combine:
    mapped = frame[column_name].map(replace_map)
    frame[column_name] = mapped.astype(int)

train_df.head()

In [None]:
# Encode 'sex' as an integer flag: Female -> 0, Male -> 1.
sex_codes = {'Female': 0, 'Male': 1}

for frame in combine:
    frame['sex'] = frame['sex'].map(sex_codes).astype(int)

train_df['sex']

### index = Yに変換する

In [None]:
# set_index returns a new DataFrame and the result is not assigned, so
# this cell only displays the re-indexed view; train_df itself still has
# 'Y' as a regular column.
train_df.set_index('Y')

In [None]:
# Separate the target column from the features; the test frame is used
# as-is for X_test.
target_column = 'Y'
Y_train = train_df[target_column]
X_train = train_df.drop(columns=target_column)
X_test = test_df

X_train.shape, Y_train.shape, X_test.shape

In [None]:
# Cast every column of both feature frames to int64.
X_train, X_test = (frame.astype('int64') for frame in (X_train, X_test))

X_train.info()

In [None]:
# Hold out 30% of the labelled data for validation.
# Lower-case names are the split; upper-case X_train / Y_train / X_test
# remain the full labelled set and the unlabelled test set.
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.3)

# Report the sizes of the split that was just created (previously this
# printed the sizes of the full X_train / X_test frames).
print(len(x_train))
print(len(x_test))
# X_train.head()
X_train.info()

In [None]:
# # X, y = shap.datasets.boston()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# # print(len(X_train))
# # print(len(X_test))
# # X.head()


### lightgbmによる学習

In [None]:
# Wrap the training / validation split in LightGBM datasets. The
# validation set references the training set.
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# LightGBM parameters
# NOTE(review): the objective is plain regression scored with RMSE; if Y
# is a binary label, 'binary' may be the better fit -- confirm.
params = {
        'objective' : 'regression',
        'metric' : 'rmse',
        'num_leaves' : 31,
        'learning_rate' : 0.1,
        'feature_fraction' : 1.0,
        'bagging_fraction' : 1.0,
        'bagging_freq': 0,
        'verbose' : 0,
        'min_child_samples': 5
}

# Train for at most 200 rounds, stopping once the validation metric has
# not improved for 10 consecutive rounds. Early stopping is requested via
# the callback because the `early_stopping_rounds=` keyword of lgb.train
# was removed in LightGBM 4.0.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=200,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

In [None]:
# Predict on the validation split so the predictions line up with y_test.
# (X_test is the unlabelled test frame and has no matching targets.)
y_pred = model.predict(x_test, num_iteration=model.best_iteration)

plt.figure(figsize=(10, 4))
# Plot the raw values: y_test keeps the shuffled row labels from the
# split, which would otherwise be used as x coordinates and misalign the
# two curves.
plt.plot(y_test.values, label="y")
plt.plot(y_pred, label="y_pred")
plt.legend()
plt.show()

In [None]:
# Position of the row to explain.
ID = 5
print("===== Explanatory variable =====")
print(X_train.iloc[ID,:])
print("====== Response variable =======")
# Use the full target Series positionally so it pairs with
# X_train.iloc[ID]; the split Series y_train is a shuffled subset whose
# label lookup y_train[ID] can raise KeyError or pick an unrelated row.
print("y:", Y_train.iloc[ID])

# Explain the model's predictions using SHAP
# (same syntax works for LightGBM, CatBoost, scikit-learn and spark models)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

# Load JS visualization code to notebook
shap.initjs()

# Visualize the explanation for the row at position ID
# (use matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[ID,:], X_train.iloc[ID,:])

In [None]:
# Load JS visualization code to notebook (needed for the interactive plot)
shap.initjs()

# Visualize the explanations for every row of X_train in one interactive plot
shap.force_plot(explainer.expected_value, shap_values, X_train)

In [None]:
# Create a dependence plot to show the effect of a single feature across
# the whole dataset. The feature must be a column of X_train: "RM" came
# from the Boston housing example and does not exist here, so the bucketed
# 'age' column is used instead.
shap.dependence_plot("age", shap_values, X_train)


In [None]:
# Summarise the SHAP values over X_train as a per-feature bar chart.
shap.summary_plot(shap_values, X_train, plot_type="bar")


In [None]:
# Plot LightGBM's own feature importance for the trained model, for
# comparison with the SHAP-based ranking.
lgb.plot_importance(model)
plt.show()