# 机器学习与社会科学应用

# 第五章 集成算法

<font face="宋体" >郭峰    
    教授、博士生导师  
上海财经大学公共经济与管理学院  
邮箱：guofengsfi@163.com</font> 

<font face="宋体" >本章目录：  
第二节  随机森林算法  
第三节  梯度提升树算法  
第四节  XGBoost算法</font> 

## 第一节 随机森林算法 

### 1.1 导入第三方模块，调用数据集train，并查看数据集结构及前10行信息 

In [None]:
# 导入相关第三方库
import pandas as pd
import time
from collections import defaultdict
# from sklearn.externals import joblib
import joblib
import datetime
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier  
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
# from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from numpy import *
from sklearn.model_selection import train_test_split
starttime = datetime.datetime.now()

#导入数据
path = "D:/python/机器学习与社会科学应用/演示数据/05集成算法/name_and_gender/"
f = open(path+'train.txt',encoding='utf-8')
data = pd.read_csv(f,header=0,sep=',') #一个竞赛网站12万样本
data['name'] = data['name'].astype(str)
data['gender'] = data['gender'].astype(int)
print(data.shape)
data.head(10)

### 1.2 将data前1000行作为示例数据，并对其划分训练集和测试集 

In [None]:
#样本量太大的情况下，运行效率会比较低
data = data[0:1000]
# 将数据分出一部分，作为测试集，剩下的用于建模
data_train, data_test = train_test_split(data, test_size=0.3, random_state=666)  
print("随机挑选一部分进行建模：", data_train.shape)

In [None]:
data_train.head()

###  1.3 提取数据中关于name的相关信息

In [None]:
# 特征x是姓名用字，需要将x转换为一个数字化的向量
# 所有姓名合并在一起，去重，构造一个姓名用字池向量
name_vec_total = list(data_train['name'])  
name_vec_total = list(''.join(name_vec_total))
#print(name_vec_total[0:20])
print("语料库原始总字数：", len(name_vec_total))
print("不重复字样本量：", len(set(name_vec_total)))
freq = defaultdict(int)
for w in name_vec_total:
    freq[w] += 1
name_vec_total = [w  for w in name_vec_total if freq[w]>5]
name_vec_total = list(set(name_vec_total)) #去重后再转换成列表
print("剔除稀缺字后不重复字样本量：",len(set(name_vec_total)))
print("不重复姓名用字举例:",name_vec_total[0:20])

f = open(path+'name_vec_total_rf.txt','w',encoding='utf8')
f.write(';'.join(name_vec_total))
f.close()

###  1.4 把具体某个姓名用字用上述姓名用字池向量来表示

In [None]:
# 把具体某个姓名(如“建国”)的用字用上述姓名用字池向量来表示
def words2vec(inputSet): #inputSet是待定义姓名,这个函数基于上文得到的name_total
    returnVec = [0] * len(name_vec_total)    #获得所有单词等长的0列表
    for word in inputSet:
        if word in name_vec_total:
            returnVec[name_vec_total.index(word)] += 1   #对应单词位置加1
    return returnVec

# 这个方式是在dataframe中计算
# data_train['name_vec']=data_train['name'].apply(words2vec)
# print(data_train['name'][11],data_train['name_vec'][11])

# 也可以先转换成list后再计算
name = list(data_train['name'])
print("姓名举例:",name[0:20])
name_vec = [words2vec(n) for n in name]   #特征x是用向量表示的姓名，这是一个嵌套列表，会占用内存超级多
# print(name_vec[0:2])

# print(name_vec[0:5])
# 相应y为gender，
gender_vec = list(data_train['gender'])
# print(gender_vec[0:5])

### 1.5 对参数进行网格搜索和调参，并检验训练集预测准确率 

In [None]:
# 对n_estimators进行网格搜索
param_test1 = {'n_estimators':list(range(3,50,2))}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(oob_score=True, random_state=33), 
                       param_grid = param_test1, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch1.fit(name_vec,gender_vec)
print(gsearch1.best_params_)

#接着我们对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索
param_test2 = {'max_depth':list(range(1,14,2)), 'min_samples_split':list(range(5,201,20))}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=27,oob_score=True, random_state=33),
   param_grid = param_test2, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch2.fit(name_vec,gender_vec)
print(gsearch2.best_params_)

#对于内部节点再划分所需最小样本数min_samples_split，我们暂时不能一起定下来，因为这个还和决策树其他的参数存在关联。
#下面我们再对内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参。
#最优min_samples_split为10，最优min_samples_split为140
param_test3 = {'min_samples_split':list(range(80,150,20)), 'min_samples_leaf':list(range(10,60,10))}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=13, max_depth=9,
                                  max_features='sqrt', oob_score=True, random_state=10),
                                  param_grid = param_test3, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch3.fit(name_vec,gender_vec)
print(gsearch3.best_params_)

#最后我们再对最大特征数max_features做调参: 基本上也是越大越好，但差别不大，取11
param_test4 = {'max_features':list(range(3,20,2))}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=20, max_depth=13, min_samples_split=140,
                                      min_samples_leaf=10 ,oob_score=True, random_state=10),param_grid = param_test4, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch4.fit(name_vec,gender_vec)
print(gsearch4.best_params_)

rf_clf = RandomForestClassifier(n_estimators=40, max_depth=13, min_samples_split=140,
                                  min_samples_leaf=10,max_features=11,oob_score=True, random_state=10)
rf_clf.fit(name_vec,gender_vec)
print("验证集预测准确率:",rf_clf.oob_score_)

joblib.dump(rf_clf,path+'random_forest'+'.model')  #模型的保存

### 1.6 在测试集中进行测试 

In [None]:
#测试集测试
data_test, data_test2 = train_test_split(data_test, test_size=0.95) #测试集可能会太大了
name_new = list(data_test['name']) 
x_test = [words2vec(n) for n in name_new]   
y_test = list(data_test['gender'])
y_pred_new = rf_clf.predict(x_test)
print("随机森林测试集正确率 {:05.2f}%" .format(100*(1-(sum(array(y_pred_new)!=array(y_test))/len(y_test)))))

endtime = datetime.datetime.now()
print("运行时间:",(endtime - starttime).seconds)

### 1.7 网格搜索调参

In [None]:
#导入相关第三方库
import pandas as pd
import time
from collections import defaultdict
#from sklearn.externals import joblib
import joblib
import datetime
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier  
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
#from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from numpy import *
from sklearn.model_selection import train_test_split
starttime = datetime.datetime.now()

#导入数据
path = "D:/python/机器学习与社会科学应用/演示数据/05集成算法/name_and_gender/"
f = open(path+'train.txt',encoding='utf-8')
data = pd.read_csv(f,header=0,sep=',') #一个竞赛网站12万样本
data['name'] = data['name'].astype(str)
data['gender'] = data['gender'].astype(int)
print(data.shape)
data.head(10)

###通过这里调节样本数量，测试运行效率
data = data[0:1000]
#将数据分出一部分，作为测试集，剩下的用于建模
data_train,data_test = train_test_split(data,test_size=0.3,random_state=666)  
print("随机挑选一部分进行建模：",data_train.shape)

#特征x是姓名用字，需要将x转换为一个数字化的向量
#所有姓名合并在一起，去重，构造一个姓名用字池向量
name_vec_total = list(data_train['name'])  
name_vec_total = list(''.join(name_vec_total))
#print(name_vec_total[0:20])
print("语料库原始总字数：",len(name_vec_total))
print("不重复字样本量：",len(set(name_vec_total)))
freq = defaultdict(int)
for w in name_vec_total:
    freq[w] += 1
name_vec_total = [w  for w in name_vec_total if freq[w]>5]
name_vec_total = list(set(name_vec_total)) #去重后再转换成列表
print("剔除稀缺字后不重复字样本量：",len(set(name_vec_total)))
print("不重复姓名用字举例:",name_vec_total[0:20])

f = open(path+'name_vec_total_rf.txt','w',encoding='utf8')
f.write(';'.join(name_vec_total))
f.close()

#把具体某个姓名(如“建国”)的用字用上述姓名用字池向量来表示
def words2vec(inputSet): #inputSet是待定义姓名,这个函数基于上文得到的name_total
    returnVec = [0] * len(name_vec_total)    #获得所有单词等长的0列表
    for word in inputSet:
        if word in name_vec_total:
            returnVec[name_vec_total.index(word)] += 1   #对应单词位置加1
    return returnVec

#这个方式是在dataframe中计算
#data_train['name_vec']=data_train['name'].apply(words2vec)
#print(data_train['name'][11],data_train['name_vec'][11])

#也可以先转换成list后再计算
name = list(data_train['name'])
print("姓名举例:",name[0:20])
name_vec = [words2vec(n) for n in name]   #特征x是用向量表示的姓名，这是一个嵌套列表，会占用内存超级多
#print(name_vec[0:2])

#print(name_vec[0:5])
#相应y为gender，
gender_vec = list(data_train['gender'])
#print(gender_vec[0:5])

In [None]:
#网格搜索，时间消耗的太久了
param_test6 = {'n_estimators':list(range(3,50,2)),'max_depth':list(range(1,14,2)), 'min_samples_split':list(range(5,201,10)),
              'min_samples_leaf':list(range(10,60,10)),'max_features':list(range(3,20,2))}
gsearch6 = GridSearchCV(estimator = RandomForestClassifier(oob_score=True, random_state=33), 
                       param_grid = param_test6, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch6.fit(name_vec,gender_vec)
print(gsearch6.best_params_)

## 第二节 梯度提升树 

### 2.1 导入第三方模块和数据 

In [None]:
#数据说明:
#新能源汽车充电桩的故障检测问题，提供85500条训练数据（标签：0代表充电桩正常，1代表充电桩有故障）

In [None]:
#读入数据
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

path = "D:/python/机器学习与社会科学应用/演示数据/05集成算法/"
charging_pile = pd.read_csv(path+"charging_pile.csv",encoding='utf-8')
#charging_pile = pd.read_csv(f,header=0,sep=',')
print(charging_pile.shape)
charging_pile.head()
#s1-s6的含义参阅：https://blog.csdn.net/gb4215287/article/details/105184238/

### 2.2 定义特征变量和响应变量，划分测试集和训练集，并进行训练 

In [None]:
#区分x和y
x_columns = []
for x in charging_pile.columns:
    if x not in ['id', 'label']:
        x_columns.append(x)
X = charging_pile[x_columns]
y = charging_pile['label']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
print(x_train.shape)
print(x_test.shape)

# 模型训练，使用GBDT算法
gbr = GradientBoostingClassifier(n_estimators=3000, max_depth=2, min_samples_split=2, learning_rate=0.1)
gbr.fit(x_train, y_train.ravel())

### 2.3 查看训练集和测试集的准确率 

In [None]:
#训练和验证的准确率
y_gbr = gbr.predict(x_train)
y_gbr1 = gbr.predict(x_test)
acc_train = gbr.score(x_train, y_train)
acc_test = gbr.score(x_test, y_test)
print(acc_train)
print(acc_test)

### 2.4 GBDT算法参数

In [None]:
#GBDT分类算法参数
from sklearn.ensemble import GradientBoostingClassifier
GradientBoostingClassifier()

主要的几个参数：
1.criterion参数：指特征选择的标准，我们就选择默认即可。
2.init参数：指是否用该参数提供的弱分类器来进行预测，默认为None，即使用原始样本集来进行预测。
3.learning_rate：习率，指弱分类器的系数。
4.loss：指损失函数的类型，默认为deviance，即使用对数似然函数；也可以选择exponential，即指数损失函数。
5.subsample：指采样的比例，在0-1之间，默认为1，即不采样，使用全部样本；小于1，意味着只有一部分参与了模型的拟合。
6.n_estimators：指弱分类器的个数，默认为100.

In [None]:
#GBDT回归模型的参数
from sklearn.ensemble import GradientBoostingRegressor
GradientBoostingRegressor()

In [None]:
#回归模型的参数大部分与分类相同，只有损失函数采用的不同
#回归模型的损失函数主要有4种，默认为ls（标准差函数）。还有三种分别是lad（绝对损失函数）、huber和quantile（分位损失函数）。

## 第三节 XGBoost 

### 3.1 导入第三方模块，并且导入数据 

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os

import xgboost as xgb #导入成功则说明安装正确
xgb.__version__

from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.model_selection import train_test_split

path = "E:\machine learning\data\\05集成算法\house.csv"
data = pd.read_csv(path,index_col=0)
data.head()

Unnamed: 0,Id,住宅类型,住宅区域,街道接触面积(英尺),住宅面积,街道路面状况,巷子路面状况,住宅形状(大概),住宅现状,水电气,...,泳池面积,泳池质量,篱笆质量,其他配置,其他配置的价值,销售月份,销售年份,销售类型,销售状态,SalePrice
0,0.0,5.0,3.0,36.0,327.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,8.0,4.0,208500
1,1.0,0.0,3.0,51.0,498.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,1.0,8.0,4.0,181500
2,2.0,5.0,3.0,39.0,702.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2.0,8.0,4.0,223500
3,3.0,6.0,3.0,31.0,489.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,0.0,140000
4,4.0,5.0,3.0,55.0,925.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,2.0,8.0,4.0,250000


### 3.2 指定特征变量与响应变量，并设置训练集和测试集 

In [2]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=14)

### 3.3 使用XGBoost算法和XGBoost交叉验证算法 

In [22]:
# XGBoost算法：回归
xgb = XGBRegressor(random_state=12,tree_method="hist", device="cuda")
xgb.fit(x_train,y_train)
xgb.score(x_test,y_test) #默认指标R2

# XGBoost交叉验证算法
cv = KFold(n_splits=5,shuffle=True,random_state=14)

cv_xgb = cross_validate(xgb,X,y,cv=cv,scoring="neg_root_mean_squared_error",return_train_score=True,verbose=True,n_jobs=-1)

cv_xgb

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished


{'fit_time': array([0.24908304, 0.26532269, 0.25048804, 0.2342484 , 0.2342484 ]),
 'score_time': array([0.01623964, 0.        , 0.01627326, 0.01623964, 0.01623964]),
 'test_score': array([-27877.76476213, -25645.87889404, -26089.15161616, -34443.64621457,
        -27709.85433389]),
 'train_score': array([ -803.52839876, -1002.82833324,  -623.6970028 ,  -882.645679  ,
         -888.72025226])}

### 3.4  由于XGBoost算法不稳定、过拟合严重等问题，通过限制Max_depth进行缓解

In [23]:
def RMSE(result,name):
    return abs(result[name].mean())

# 训练集上RMSE
RMSE(cv_xgb,"train_score")

# 测试集上RMSE
RMSE(cv_xgb,"test_score")

xgb_depth = XGBRegressor(max_depth=5,random_state=14,tree_method="hist", device="cuda") #实例化
cv_xgb_depth = cross_validate(xgb_depth,X,y,cv=cv
                               ,scoring="neg_root_mean_squared_error" #负根均方误差
                               ,return_train_score=True
                               ,verbose=True
                               ,n_jobs=-1)

RMSE(cv_xgb_depth,"train_score")

RMSE(cv_xgb_depth,"test_score")

xgb_depth = XGBRegressor(max_depth=5,random_state=14,tree_method="hist", device="cuda").fit(X,y)
#查看特征重要性
xgb_depth.feature_importances_

#获取每一个参数的取值
xgb_depth.get_params()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': 'cuda',
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 5,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 14,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': 'hist',
 'validate_parameters': None,
 'verbosity': None}

### 3.5 二分类问题：乳腺癌数据

In [None]:
from sklearn.metrics import accuracy_score as ACC
from sklearn.metrics import log_loss as logloss

In [None]:
import xgboost as xgb
from sklearn.datasets import load_breast_cancer, load_digits
# 二分类问题
X = load_breast_cancer().data
y = load_breast_cancer().target
data_binary = xgb.DMatrix(X,y)

In [None]:
# 二分类参数用logloss交叉熵损失
params1 = {"seed":14, "objective":"binary:logistic", "eval_metric":"logloss"}
clf_binary = xgb.train(params1, data_binary, num_boost_round=100)

y_pred_binary = clf_binary.predict(data_binary)

# 二分类返回概率，可以转换成0、1变量
y_pred_binary[:20] 

In [None]:
(y_pred_binary > 0.5).astype("int")

# 数据较简单，分类准确率100%
ACC(y,(y_pred_binary > 0.5).astype(int))

### 3.6 多分类问题：手写数字识别

In [None]:
# 多分类问题
X = load_digits().data
y = load_digits().target
data_multi = xgb.DMatrix(X, y)

# 多分类参数用mlogloss交叉熵损失，num_class是分类类别
params2 = {"seed":1412, "objective":"multi:softmax", "eval_metric":"mlogloss" ,"num_class":10}
clf_multi = xgb.train(params2, data_multi, num_boost_round=100)

y_pred_multi = clf_multi.predict(data_multi)

In [None]:
y_pred_multi

In [None]:
# 数据较简单，分类准确率100%
ACC(y, y_pred_multi)

In [None]:
# 本章结束