In [43]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler, normalize
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import KFold

In [44]:
# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Let matplotlib render Chinese text: use the SimHei font and disable the
# Unicode minus sign so negative tick labels still draw with that font.
# `mpl` is imported explicitly instead of via `from pylab import *`, which
# dumps a large set of pylab/numpy names into the notebook namespace.
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

In [144]:
# Every CSV in this cell is decoded as GBK.
gbk = {'encoding': 'gbk'}

# Labelled training rows (contain the target column '血糖').
train_data = pd.read_csv('data_train_B.csv', **gbk)
# Test rows whose answers are loaded into y_test below.
test_data = pd.read_csv('new_data_test_nodate.csv', **gbk)
# Second test set; used later to write the final prediction file.
Test_data = pd.read_csv('data_test_B.csv', **gbk)
# Answers for test_data; the file is read without a header row, so the
# single column is named '血糖' here.
y_test = pd.read_csv('d_answer_a_20180128.csv', names = ['血糖'], **gbk)

In [145]:
# Preview the first rows of both frames. Both tables are shown because
# InteractiveShell.ast_node_interactivity is set to "all" in an earlier cell.
train_data.head()
test_data.head()

Unnamed: 0,性别,年龄,*天门冬氨酸氨基转换酶,*丙氨酸氨基转换酶,*碱性磷酸酶,*r-谷氨酰基转换酶,*总蛋白,白蛋白,*球蛋白,白球比例,...,血小板计数,血小板平均体积,血小板体积分布宽度,血小板比积,中性粒细胞%,淋巴细胞%,单核细胞%,嗜酸细胞%,嗜碱细胞%,血糖
0,1,41,24.96,23.1,99.59,20.23,76.88,49.6,27.28,1.82,...,166.0,9.9,17.4,0.164,54.1,34.2,6.5,4.7,0.6,6.06
1,1,41,24.57,36.25,67.21,79.0,79.43,47.76,31.67,1.51,...,277.0,9.2,10.3,0.26,52.0,36.7,5.8,4.7,0.8,5.39
2,1,46,20.82,15.23,63.69,38.17,86.23,48.0,38.23,1.26,...,241.0,8.3,16.6,0.199,48.1,40.3,7.7,3.2,0.8,5.59
3,0,22,14.99,10.59,74.08,20.22,70.98,44.02,26.96,1.63,...,252.0,10.3,10.8,0.26,41.7,46.5,6.7,4.6,0.5,4.3
4,0,48,20.07,14.78,75.79,22.72,78.05,41.83,36.22,1.15,...,316.0,11.1,14.0,0.35,56.6,33.1,9.1,0.6,0.6,5.42


Unnamed: 0,性别,年龄,*天门冬氨酸氨基转换酶,*丙氨酸氨基转换酶,*碱性磷酸酶,*r-谷氨酰基转换酶,*总蛋白,白蛋白,*球蛋白,白球比例,...,红细胞体积分布宽度,血小板计数,血小板平均体积,血小板体积分布宽度,血小板比积,中性粒细胞%,淋巴细胞%,单核细胞%,嗜酸细胞%,嗜碱细胞%
0,1,54,23.85,26.69,116.08,34.36,82.75,46.03,36.72,1.25,...,12.3,241,10.8,12.8,0.26,58.4,33.2,7.5,0.6,0.3
1,1,50,29.75,34.98,90.07,111.43,71.9,44.09,27.81,1.59,...,12.0,242,11.5,14.6,0.28,59.3,29.3,7.7,3.2,0.5
2,1,27,22.823494,25.860009,84.736496,28.027265,78.79106,45.911661,32.159592,1.325147,...,12.1,398,8.9,9.9,0.35,50.2,40.1,7.9,1.2,0.6
3,0,53,17.98,16.63,95.95,23.41,78.16,45.44,32.72,1.39,...,12.6,247,11.6,13.9,0.29,53.7,38.0,7.2,0.7,0.4
4,0,43,19.12,19.8,76.97,15.7,80.76,46.9,33.86,1.39,...,12.2,335,10.4,11.9,0.35,52.0,39.4,8.0,0.3,0.3


In [146]:
# Split the training frame into the target column '血糖' (kept as a
# one-column DataFrame) and the remaining predictor columns.
is_target = train_data.columns == '血糖'
Y_train = train_data.loc[:, is_target]
X_train = train_data.loc[:, ~is_target]

# Stack the training rows with the labelled test rows into plain arrays.
# NOTE(review): X / y therefore contain test_data and its answers y_test;
# a model trained on X / y must not be scored against y_test.
X = np.array(pd.concat([X_train, test_data]))
y = np.array(pd.concat([Y_train, y_test]))

In [300]:
def model(X, y,X_test, folds = 10, seed = 12306):
    """Fit an XGBoost regressor on each K-fold split and predict ``X_test`` each time.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features; converted with ``np.array``.
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Training target. A single-column 2-D input is flattened to 1-D before
        fitting.
    X_test : array-like of shape (n_test, n_features)
        Rows that every fold's model predicts.
    folds : int, default 10
        Number of K-fold splits, i.e. number of models trained.
    seed : int, default 12306
        Seed for the shuffled K-fold split.

    Returns
    -------
    numpy.ndarray of shape (n_test, folds)
        Column ``k`` holds the ``X_test`` predictions of the model fitted on
        the training part of split ``k``.

    Notes
    -----
    For every fold the function prints half the mean squared error on the
    fold's training part and on its validation part.
    """
    X = np.array(X)
    y = np.array(y)
    if y.ndim == 2 and y.shape[1] == 1:
        # A one-column DataFrame arrives as an (n, 1) array; regressors and
        # metrics expect a flat target vector.
        y = y.ravel()
    X_test = np.array(X_test)

    test_pred = np.zeros((X_test.shape[0], folds))

    # Seed the splitter directly. np.random.seed() returns None, so the old
    # `random_state = np.random.seed(...)` passed None to KFold and reseeded
    # NumPy's global RNG as a hidden side effect.
    kfold = KFold(n_splits = folds, shuffle = True, random_state = seed)

    xgb_model = xgb.XGBRegressor(n_estimators = 2500,
                                 max_depth = 8,
                                 learning_rate = 0.05,
                                 gamma = 30,
                                 subsample = 0.9,
                                 min_child_weight = 1,
                                 colsample_bytree = 0.9)

    for k, (train_index, valid_index) in enumerate(kfold.split(X, y)):

        print("**************************************************************")
        print('第 {} 次训练...............'.format(k + 1))

        xgb_model.fit(X[train_index], y[train_index])
        y_tr_pred = xgb_model.predict(X[train_index])
        y_de_pred = xgb_model.predict(X[valid_index])

        # Reported loss is MSE / 2; arguments are given as (y_true, y_pred).
        loss1 = mean_squared_error(y[train_index], y_tr_pred) * 0.5
        print("训练集误差：" + str(loss1))
        loss2 = mean_squared_error(y[valid_index], y_de_pred) * 0.5
        print("验证集误差：" + str(loss2))
        print("**************************************************************")

        test_pred[:, k] = xgb_model.predict(X_test)
    return test_pred

In [301]:
# Train one model per fold on the training frame and collect, per fold,
# the predictions for test_data (one column per fold).
folds = 5
test_pred = model(X_train, Y_train, test_data, folds = folds)

**************************************************************
第 1 次训练...............
训练集误差：0.346508986838
验证集误差：0.840582684562
**************************************************************
**************************************************************
第 2 次训练...............
训练集误差：0.334592136637
验证集误差：1.1290620475
**************************************************************
**************************************************************
第 3 次训练...............
训练集误差：0.350548837441
验证集误差：0.782583892862
**************************************************************
**************************************************************
第 4 次训练...............
训练集误差：0.347490891695
验证集误差：0.908484700832
**************************************************************
**************************************************************
第 5 次训练...............
训练集误差：0.352452872486
验证集误差：0.860511019496
**************************************************************


In [302]:
# Work on an array view of the fold predictions and never rebind
# `test_pred`: the original turned it into a DataFrame mid-cell, so running
# the cell a second time failed on `test_pred[:, i]`.
fold_preds = np.asarray(test_pred)

# Half-MSE of each fold model on the labelled test set. The number of folds
# is read from the prediction matrix rather than from the global `folds`.
for i in range(fold_preds.shape[1]):
    loss = mean_squared_error(y_test, fold_preds[:, i]) * 0.5
    print(loss)

# Ensemble score: per-row median across the fold models.
loss2 = mean_squared_error(y_test, np.median(fold_preds, axis = 1)) * 0.5
loss2

0.890009099992
0.889205208574
0.853174094901
0.886083778288
0.877005967793


0.8488747647917444

In [289]:
# Final predictions for Test_data: train one model per fold, take the
# per-row median over the fold models and write it to a text file with
# three decimals in a 10-character field.
folds = 5
Test_pred_last = pd.DataFrame(model(X_train, Y_train, Test_data, folds = folds))
row_median = Test_pred_last.median(axis = 1)
Y_last = np.array(row_median)
np.savetxt('BLast_result.csv', Y_last, fmt = '%10.3f')

**************************************************************
第 1 次训练...............
训练集误差：0.352575678019
验证集误差：0.839501443879
**************************************************************
**************************************************************
第 2 次训练...............
训练集误差：0.34409853346
验证集误差：1.12744785647
**************************************************************
**************************************************************
第 3 次训练...............
训练集误差：0.358101794236
验证集误差：0.790667053579
**************************************************************
**************************************************************
第 4 次训练...............
训练集误差：0.352579199467
验证集误差：0.893576502098
**************************************************************
**************************************************************
第 5 次训练...............
训练集误差：0.355022425026
验证集误差：0.84284275217
**************************************************************


In [204]:
# 将原来的训练集分为训练集和交叉验证集
from sklearn.preprocessing import PolynomialFeatures
def data_scale_PFS(X, y, X_test, scale = 'NoScale', poly_features = False):
    """Optionally add pairwise interaction features, then optionally scale.

    Both steps are fitted on ``X`` only and then applied to ``X_test``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like
        Training target; returned unchanged.
    X_test : array-like of shape (n_test, n_features)
        Test features, transformed with the objects fitted on ``X``.
    scale : {'NoScale', 'Standard', 'Robust'}, default 'NoScale'
        'Standard' applies ``StandardScaler``, 'Robust' applies
        ``RobustScaler``, 'NoScale' leaves the values as they are.
    poly_features : bool, default False
        If true, expand the features with
        ``PolynomialFeatures(2, interaction_only=True)`` before scaling.

    Returns
    -------
    tuple
        ``(X, y, X_test)`` after the requested transformations.

    Raises
    ------
    ValueError
        If ``scale`` is not one of the supported names. The original silently
        skipped scaling in that case, which hid typos such as ``'robust'``.

    Notes
    -----
    The function no longer calls ``np.random.seed``; that call was a leftover
    of a removed train/validation split and reseeded the global RNG as a
    hidden side effect.
    """
    if scale not in ('NoScale', 'Standard', 'Robust'):
        raise ValueError(
            "scale must be 'NoScale', 'Standard' or 'Robust', got {!r}".format(scale))

    if poly_features:
        # Generate the interaction features first, then decide on scaling.
        poly = PolynomialFeatures(2, interaction_only = True)
        X = poly.fit_transform(X)
        # Reuse the expansion fitted on X instead of refitting on the test set.
        X_test = poly.transform(X_test)

    if scale == 'Standard':
        scaler = StandardScaler().fit(X)
    elif scale == 'Robust':
        scaler = RobustScaler().fit(X)
    else:
        scaler = None

    if scaler is not None:
        X = scaler.transform(X)
        X_test = scaler.transform(X_test)

    return X, y, X_test

对原始数据不进行归一化，然后用多项式特征生成新的特征

In [206]:
# Baseline run: no scaling, no polynomial features.
# NOTE(review): the markdown above this cell says polynomial features are
# generated, but poly_features is False here — confirm which is intended.
X_s, y_s, X_test = data_scale_PFS(X, y, test_data, scale = 'NoScale', poly_features = False)

# Train on the arrays returned by the preprocessing step. The original passed
# the raw X, y, which only matched because 'NoScale' changes nothing; any
# other setting would have been silently ignored.
test_pred1 = model(X_s, y_s, X_test, folds = 5)

**************************************************************
第 1 次训练...............
训练集误差：0.344878695879
验证集误差：0.764711461854
**************************************************************
**************************************************************
第 2 次训练...............
训练集误差：0.364918220857
验证集误差：0.679847777879
**************************************************************
**************************************************************
第 3 次训练...............
训练集误差：0.337402066025
验证集误差：0.9843818196
**************************************************************
**************************************************************
第 4 次训练...............
训练集误差：0.337060138443
验证集误差：1.09460692261
**************************************************************
**************************************************************
第 5 次训练...............
训练集误差：0.335992204786
验证集误差：0.946966788577
**************************************************************


In [159]:
# Half-MSE of every fold model on the labelled test set. The loop covers all
# columns of the prediction matrix; the original `range(4)` skipped the last
# of the five folds.
# NOTE(review): if test_pred1 came from model(X, y, ...) with X / y built
# from train + test_data, these scores are measured on rows the models were
# trained on.
fold_preds1 = np.asarray(test_pred1)
for i in range(fold_preds1.shape[1]):
    loss = mean_squared_error(y_test, fold_preds1[:, i]) * 0.5
    print(loss)

# Ensemble score: per-row mean across the fold models.
loss2 = mean_squared_error(y_test, fold_preds1.mean(axis = 1)) * 0.5
loss2

0.856395653796
0.877352509519
0.85896700236
0.884904489029


0.84658194174912837

In [41]:
# Same experiment with RobustScaler applied to the features.
X_r, y_r, X_test = data_scale_PFS(X, y, test_data, scale = 'Robust', poly_features = False)

# Request five folds explicitly: model() defaults to folds=10, while the
# recorded output and the evaluation cell below assume five fold models.
test_pred = model(X_r, y_r, X_test, folds = 5)

**************************************************************
第 1 次训练...............
训练集误差：0.401663248228
验证集误差：0.762833382959
**************************************************************
**************************************************************
第 2 次训练...............
训练集误差：0.436564930708
验证集误差：0.666825981459
**************************************************************
**************************************************************
第 3 次训练...............
训练集误差：0.403503942023
验证集误差：0.985086950535
**************************************************************
**************************************************************
第 4 次训练...............
训练集误差：0.404019323392
验证集误差：1.08891766084
**************************************************************
**************************************************************
第 5 次训练...............
训练集误差：0.402103337341
验证集误差：0.950420713567
**************************************************************


In [42]:
# Half-MSE of every fold model on the labelled test set. The fold count is
# taken from the prediction matrix instead of a hard-coded 5, so the cell
# stays correct whatever `folds` value produced test_pred.
# NOTE(review): test_pred here comes from model(X_r, y_r, ...), and X / y
# were built from train + test_data with the answers y_test. If that is still
# the case, these scores are computed on training rows (leakage) and are not
# comparable with the runs trained on X_train / Y_train only.
robust_fold_preds = np.asarray(test_pred)
for i in range(robust_fold_preds.shape[1]):
    loss = mean_squared_error(y_test, robust_fold_preds[:, i]) * 0.5
    print(loss)

# Ensemble score: per-row mean across the fold models.
loss2 = mean_squared_error(y_test, robust_fold_preds.mean(axis = 1)) * 0.5
loss2

0.509595763525
0.467128945734
0.470741127875
0.558596177135
0.492564954007


0.47009830412817444

## XGBoost调参：
XGBoost中通常有两种方法控制过拟合：
- 直接控制模型的复杂度：包括max_depth, min_child_weight, gamma
- 增加随机性，使训练的模型更鲁棒：包括subsample, colsample, colsample_bytree，也可以减少步长eta，如果减少eta，请记得将参数num_round的值调大
对于不均衡数据集：
有两种方法可以提升模型效果：
- 如果仅仅是用AUC评价模型预测结果：通过scale_pos_weight可以平衡正负样本权重，用AUC作为评价指标
- 如果关注预测的正确率：这种情况下，不能再均衡（re-balance）数据集，通过设置参数max_delta_step为一个较小的数可以帮助模型收敛

## 用XGBoost训练原始数据

In [None]:
# Load the raw (not yet imputed) training and test CSVs, decoded as GBK.
raw_train = pd.read_csv('d_train_20180102.csv', encoding = 'gbk')
raw_test = pd.read_csv('d_test_A_20180102.csv', encoding = 'gbk')

In [None]:
# Impute missing numeric values with the TRAINING-set column medians.
# - numeric_only=True skips the text columns still present at this point
#   (gender and date), which make a plain median() fail on recent pandas.
# - The test set is filled with the training medians rather than its own, so
#   no test-set statistics enter the preprocessing.
# - Frames are reassigned instead of modified with inplace=True.
# Medians for columns that exist only in raw_train are ignored by fillna
# when filling raw_test.
train_medians = raw_train.median(numeric_only = True)
raw_train = raw_train.fillna(train_medians)
raw_test = raw_test.fillna(train_medians)

In [None]:
# Per-column flag for remaining missing values, for BOTH frames
# (the original checked raw_train twice and never looked at raw_test).
raw_train.isnull().any()
raw_test.isnull().any()

In [None]:
# Row 572 of the training frame is forced to '女' before encoding.
# NOTE(review): presumably that row holds an invalid gender value — confirm
# against the raw CSV.
raw_train.loc[572, '性别'] = '女'

# Encode gender as integers: '男' -> 1, '女' -> 0.
# map() + astype(int) yields a real integer column (the original .loc
# assignments left an object column) and raises if any value is neither
# '男' nor '女' instead of letting it through unnoticed.
gender_codes = {'男': 1, '女': 0}
raw_train['性别'] = raw_train['性别'].map(gender_codes).astype(int)
raw_test['性别'] = raw_test['性别'].map(gender_codes).astype(int)

# Drop the identifier and the examination date; they are not used as features.
raw_train = raw_train.drop(columns = ['id','体检日期'])
raw_test = raw_test.drop(columns = ['id','体检日期'])

In [None]:
# Target and predictors from the raw training frame.
Y = raw_train['血糖']
X = raw_train.loc[:, raw_train.columns != '血糖']
X_te = raw_test

# Expand with pairwise interaction features. The expansion is fitted on the
# training features and only applied to the test features (the original
# refitted it on the test set with fit_transform).
poly = PolynomialFeatures(2, interaction_only=True)
X = poly.fit_transform(X)
X_te = poly.transform(X_te)

# Hold out 20% of the training rows for validation, with a fixed seed.
X_t, X_d, Y_t, Y_d = train_test_split(X, Y, test_size = 0.2, random_state = 123456)

In [None]:
# XGBoost on the raw (median-imputed) data with interaction features.
xgb_model = xgb.XGBRegressor(n_estimators = 1000,
                             max_depth = 6,
                             learning_rate = 0.01,
                             subsample = 0.8,
                             colsample_bytree = 0.7)
xgb_model.fit(X_t, Y_t.values.ravel())
y_tr = xgb_model.predict(X_t)
y_de = xgb_model.predict(X_d)

# Report MSE / 2, the same loss used by every other experiment in this
# notebook (the original printed plain MSE here, so the numbers could not be
# compared). Arguments are given as (y_true, y_pred).
loss1 = mean_squared_error(Y_t, y_tr) * 0.5
loss1
loss2 = mean_squared_error(Y_d, y_de) * 0.5
loss2

In [None]:
# Load the pre-imputed training and test CSVs, decoded as GBK.
new_train = pd.read_csv('imputed_data_train.csv', encoding = 'gbk')
new_test = pd.read_csv('imputed_data_test.csv', encoding = 'gbk')

In [None]:
# Preview the first rows of the imputed training frame.
new_train.head()

In [None]:
# Target and predictors from the imputed training frame.
Y_ = new_train['血糖']
X_ = new_train.loc[:, new_train.columns != '血糖']
X_nte = new_test

# Seed for the split. The original used `state = np.random.seed(123)`, which
# evaluates to None: the split was then driven by the global RNG and
# `random_state` itself carried no seed.
state = 123

# Pairwise interaction features, fitted on the training features and only
# applied to the test features.
poly = PolynomialFeatures(2, interaction_only=True)
X_ = poly.fit_transform(X_)
X_nte = poly.transform(X_nte)

# Hold out 20% of the training rows for validation.
X_nt, X_nd, Y_nt, Y_nd = train_test_split(X_, Y_, test_size = 0.2, random_state = state)

In [None]:
# XGBoost on the pre-imputed data with interaction features.
xgb_model = xgb.XGBRegressor(n_estimators = 1000,
                             max_depth = 6,
                             learning_rate = 0.01,
                             subsample = 0.8,
                             colsample_bytree = 0.7)
xgb_model.fit(X_nt, Y_nt.values.ravel())
y_ntr = xgb_model.predict(X_nt)
y_nde = xgb_model.predict(X_nd)

# Report MSE / 2, the same loss used by the K-fold experiments in this
# notebook (the original printed plain MSE here). Arguments are given as
# (y_true, y_pred).
loss1 = mean_squared_error(Y_nt, y_ntr) * 0.5
loss1
loss2 = mean_squared_error(Y_nd, y_nde) * 0.5
loss2