In [1]:
import numpy as np 
import pandas as pd 
import xgboost as xgb 
from sklearn.model_selection import train_test_split 
# Start the wall-clock timer; the elapsed run time is reported from this value.
import time
start_time = time.time()

# Read the training and test CSV files from the working directory.
TRAIN_CSV, TEST_CSV = "train.csv", "test.csv"
train = pd.read_csv(TRAIN_CSV)
tests = pd.read_csv(TEST_CSV)

In [2]:
# Split the training frame 70/30 into a fitting part and a validation part
# using sklearn.model_selection; adjust test_size to change the ratio.
train_xy, val = train_test_split(train, test_size=0.3, random_state=1)

# Separate the 'label' target column from the remaining feature columns.
X, y = train_xy.drop('label', axis=1), train_xy['label']
val_X, val_y = val.drop('label', axis=1), val['label']

# Wrap each data set in an XGBoost DMatrix.
xgb_train = xgb.DMatrix(X, label=y)        # training set
xgb_val = xgb.DMatrix(val_X, label=val_y)  # validation set
xgb_test = xgb.DMatrix(tests)              # test set, built without labels


In [3]:
# Booster configuration.
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # multi-class classification
    'num_class': 10,               # number of classes; used together with multi:softmax
    'gamma': 0.1,                  # controls post-pruning; larger is more conservative (0.1-0.2 is typical)
    'max_depth': 12,               # tree depth; deeper trees overfit more easily
    'lambda': 2,                   # L2 regularisation on weights; larger values resist overfitting
    'subsample': 0.7,              # row subsampling of the training samples
    'colsample_bytree': 0.7,       # column subsampling when building each tree
    # Minimum sum of h (second-order gradient) required in a leaf; the default is 1.
    # For imbalanced 0-1 classification with h around 0.01, a value of 1 means a
    # leaf needs at least about 100 samples. This parameter strongly affects the
    # result: the smaller it is, the easier it is to overfit.
    'min_child_weight': 3,
    'silent': 0,                   # 1 suppresses run-time messages; 0 is recommended
    'eta': 0.007,                  # acts like a learning rate
    'seed': 1000,
    # 'nthread': 7,                # number of CPU threads
    'gpu_id': 0,
    'max_bin': 128,
    'tree_method': 'gpu_hist',
    # 'eval_metric': 'auc'
}

# Parameters are handed to xgb.train as a list of (name, value) pairs.
plst = list(params.items())
num_rounds = 200  # number of boosting iterations
# Data sets evaluated after every round, each paired with its display name.
watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

# Train the model, then save it.
# early_stopping_rounds: when the iteration count is large, training stops
# once the metric has not improved for this many rounds.
model = xgb.train(
    params=plst,
    dtrain=xgb_train,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=100,
)
model.save_model('xgbdigit.model')  # persist the trained model to disk
print("best best_ntree_limit", model.best_ntree_limit)


[0]	train-merror:0.088231	val-merror:0.127778
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 100 rounds.
[1]	train-merror:0.066701	val-merror:0.096508
[2]	train-merror:0.057993	val-merror:0.088333
[3]	train-merror:0.052483	val-merror:0.081587
[4]	train-merror:0.05017	val-merror:0.079365
[5]	train-merror:0.048095	val-merror:0.076111
[6]	train-merror:0.046871	val-merror:0.075317
[7]	train-merror:0.046224	val-merror:0.073333
[8]	train-merror:0.04415	val-merror:0.072302
[9]	train-merror:0.043197	val-merror:0.070317
[10]	train-merror:0.042891	val-merror:0.07
[11]	train-merror:0.042041	val-merror:0.069683
[12]	train-merror:0.041429	val-merror:0.069206
[13]	train-merror:0.03966	val-merror:0.069048
[14]	train-merror:0.039626	val-merror:0.068492
[15]	train-merror:0.039456	val-merror:0.068492
[16]	train-merror:0.038503	val-merror:0.067222
[17]	train-merror:0.038401	val-merror:0.066825
[18]	train-merror:0.03792

[171]	train-merror:0.01898	val-merror:0.050556
[172]	train-merror:0.018946	val-merror:0.050397
[173]	train-merror:0.018912	val-merror:0.050159
[174]	train-merror:0.019014	val-merror:0.050159
[175]	train-merror:0.018707	val-merror:0.050079
[176]	train-merror:0.018741	val-merror:0.05
[177]	train-merror:0.018707	val-merror:0.049921
[178]	train-merror:0.018707	val-merror:0.049841
[179]	train-merror:0.018707	val-merror:0.049921
[180]	train-merror:0.018571	val-merror:0.049841
[181]	train-merror:0.018503	val-merror:0.049762
[182]	train-merror:0.018401	val-merror:0.049683
[183]	train-merror:0.018367	val-merror:0.049762
[184]	train-merror:0.018401	val-merror:0.049524
[185]	train-merror:0.018299	val-merror:0.049603
[186]	train-merror:0.018265	val-merror:0.049603
[187]	train-merror:0.018231	val-merror:0.049603
[188]	train-merror:0.018231	val-merror:0.049524
[189]	train-merror:0.018095	val-merror:0.049524
[190]	train-merror:0.018027	val-merror:0.049444
[191]	train-merror:0.017959	val-merror:0.0492

In [4]:
# Predict on the test matrix, limiting the ensemble to model.best_ntree_limit trees.
preds = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)

# Write the submission file: a 1-based ImageId column next to the predicted Label.
image_ids = np.arange(1, len(tests) + 1)
np.savetxt(
    'xgb_submission.csv',
    np.column_stack((image_ids, preds)),
    fmt='%d',
    delimiter=',',
    header='ImageId,Label',
    comments='',
)

# Report the elapsed wall-clock time since start_time was recorded.
cost_time = time.time() - start_time
print("xgboost success!", '\n', "cost time:", cost_time, "(s)......")


xgboost success! 
 cost time: 685.0299446582794 (s)......
