# 利用SHAP解释XGBoost模型

在SHAP被广泛使用之前，我们通常用feature importance或者partial dependence plot来解释XGBoost模型。Feature importance用来衡量数据集中每个特征的重要性。  

简单来说，每个特征对于提升整个模型的预测能力的贡献程度就是特征的重要性。  

Feature importance可以直观地反映出特征的重要性，看出哪些特征对最终的模型影响较大，但是它无法说明特征与最终预测结果之间的关系（例如影响是正向还是负向）。  

下面这个例子中，我们用2018年足球球员身价数据来具体阐述。  

## 1、Feature Importance

### 导包

In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'xgboost'

### 读取数据  
目标变量y是球员的身价

In [2]:
# Load the training table; the target column `y` is used later for fitting.
data = pd.read_csv(filepath_or_buffer="database/data/train.csv")
data

Unnamed: 0,id,club,league,birth_date,height_cm,weight_kg,nationality,potential,pac,sho,...,st,lw,cf,cam,cm,cdm,cb,lb,gk,y
0,0,293,25,10/4/96,177,72,78,73,65,60,...,63.0,64.0,64.0,64.0,63.0,57.0,53.0,56.0,,70.0
1,1,258,24,9/21/84,178,70,51,62,56,39,...,52.0,60.0,57.0,59.0,61.0,64.0,61.0,64.0,,24.0
2,2,112,3,6/8/99,177,69,52,68,68,57,...,56.0,54.0,55.0,53.0,45.0,34.0,31.0,36.0,,17.0
3,3,604,9,7/25/88,181,81,54,81,76,74,...,77.0,76.0,77.0,77.0,79.0,78.0,77.0,78.0,,1750.0
4,4,80,37,8/4/80,179,75,96,72,40,62,...,62.0,66.0,65.0,68.0,71.0,70.0,66.0,64.0,,97.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10436,10436,277,9,5/24/98,176,79,1,75,74,43,...,52.0,54.0,53.0,52.0,52.0,58.0,60.0,62.0,,55.0
10437,10437,101,8,7/27/91,178,68,147,62,66,38,...,50.0,54.0,52.0,54.0,56.0,60.0,60.0,60.0,,25.0
10438,10438,626,26,2/29/92,178,72,64,77,81,77,...,76.0,72.0,73.0,70.0,62.0,46.0,41.0,46.0,,1000.0
10439,10439,147,9,11/25/89,186,79,1,76,70,74,...,74.0,69.0,72.0,69.0,64.0,52.0,49.0,50.0,,800.0


### 计算当时球员的年龄

将生日转换为标准日期

In [3]:
# Fixed reference date used later to compute each player's age.
today = pd.to_datetime('2018-01-01')
# Birth dates are month/day/two-digit-year strings (e.g. "9/21/84").
# Passing the format explicitly avoids relying on pandas' format inference,
# so day/month order and century resolution do not vary with the pandas
# version or the current year.
# NOTE(review): assumes every row uses this m/d/yy layout -- a row in another
# layout will now raise instead of being guessed.
data['birth_date'] = pd.to_datetime(data['birth_date'], format='%m/%d/%y')
data

Unnamed: 0,id,club,league,birth_date,height_cm,weight_kg,nationality,potential,pac,sho,...,st,lw,cf,cam,cm,cdm,cb,lb,gk,y
0,0,293,25,1996-10-04,177,72,78,73,65,60,...,63.0,64.0,64.0,64.0,63.0,57.0,53.0,56.0,,70.0
1,1,258,24,1984-09-21,178,70,51,62,56,39,...,52.0,60.0,57.0,59.0,61.0,64.0,61.0,64.0,,24.0
2,2,112,3,1999-06-08,177,69,52,68,68,57,...,56.0,54.0,55.0,53.0,45.0,34.0,31.0,36.0,,17.0
3,3,604,9,1988-07-25,181,81,54,81,76,74,...,77.0,76.0,77.0,77.0,79.0,78.0,77.0,78.0,,1750.0
4,4,80,37,1980-08-04,179,75,96,72,40,62,...,62.0,66.0,65.0,68.0,71.0,70.0,66.0,64.0,,97.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10436,10436,277,9,1998-05-24,176,79,1,75,74,43,...,52.0,54.0,53.0,52.0,52.0,58.0,60.0,62.0,,55.0
10437,10437,101,8,1991-07-27,178,68,147,62,66,38,...,50.0,54.0,52.0,54.0,56.0,60.0,60.0,60.0,,25.0
10438,10438,626,26,1992-02-29,178,72,64,77,81,77,...,76.0,72.0,73.0,70.0,62.0,46.0,41.0,46.0,,1000.0
10439,10439,147,9,1989-11-25,186,79,1,76,70,74,...,74.0,69.0,72.0,69.0,64.0,52.0,49.0,50.0,,800.0


以2018-01-01为基准日期，计算当时球员的年龄（单位：年，保留一位小数）

In [4]:
# Age in years as of `today`: elapsed days divided by a 365-day year (leap
# days are not accounted for), rounded to one decimal place.
# `.dt.days` reads the day count of every timedelta in one vectorized step
# instead of calling a Python lambda per row.
data['age'] = np.round((today - data['birth_date']).dt.days / 365., 1)
data

Unnamed: 0,id,club,league,birth_date,height_cm,weight_kg,nationality,potential,pac,sho,...,lw,cf,cam,cm,cdm,cb,lb,gk,y,age
0,0,293,25,1996-10-04,177,72,78,73,65,60,...,64.0,64.0,64.0,63.0,57.0,53.0,56.0,,70.0,21.3
1,1,258,24,1984-09-21,178,70,51,62,56,39,...,60.0,57.0,59.0,61.0,64.0,61.0,64.0,,24.0,33.3
2,2,112,3,1999-06-08,177,69,52,68,68,57,...,54.0,55.0,53.0,45.0,34.0,31.0,36.0,,17.0,18.6
3,3,604,9,1988-07-25,181,81,54,81,76,74,...,76.0,77.0,77.0,79.0,78.0,77.0,78.0,,1750.0,29.5
4,4,80,37,1980-08-04,179,75,96,72,40,62,...,66.0,65.0,68.0,71.0,70.0,66.0,64.0,,97.5,37.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10436,10436,277,9,1998-05-24,176,79,1,75,74,43,...,54.0,53.0,52.0,52.0,58.0,60.0,62.0,,55.0,19.6
10437,10437,101,8,1991-07-27,178,68,147,62,66,38,...,54.0,52.0,54.0,56.0,60.0,60.0,60.0,,25.0,26.5
10438,10438,626,26,1992-02-29,178,72,64,77,81,77,...,72.0,73.0,70.0,62.0,46.0,41.0,46.0,,1000.0,25.9
10439,10439,147,9,1989-11-25,186,79,1,76,70,74,...,69.0,72.0,69.0,64.0,52.0,49.0,50.0,,800.0,28.1


### 选择特征

In [5]:
# Feature columns passed to the model, in the order used for training and for
# labelling the importance plot. 'age' is the derived column added to `data`
# earlier in the notebook.
cols = [
    'height_cm',
    'potential',
    'pac',
    'sho',
    'pas',
    'dri',
    'def',
    'phy',
    'international_reputation',
    'age',
]

### 训练Xgboost回归模型

In [6]:
# Gradient-boosted regressor: 150 trees of depth at most 4, learning rate 0.05.
model = xgb.XGBRegressor(
    n_estimators=150,
    learning_rate=0.05,
    max_depth=4,
)
# Fit on the selected feature columns against the target column `y`.
model.fit(X=data[cols], y=data['y'].values)

NameError: name 'xgb' is not defined

### 获取feature_importances_（特征重要性）

In [None]:
# Bar chart of the fitted model's importance score for each feature: one bar
# per entry of `cols`, labelled with the feature name.
tick_positions = range(len(cols))
plt.figure(figsize=(15, 5))
plt.bar(tick_positions, model.feature_importances_)
plt.title('Feature importance', fontsize=14)
# Rotate the labels so the longer feature names do not overlap.
plt.xticks(tick_positions, cols, rotation=-45, fontsize=14)
plt.show()