# 一、 Xgboost普通训练预测方法

# 1.1 数据可视化

In [37]:
import pandas as pd
import numpy as np

#Load train data
feature_file = pd.read_csv("./DataHousePricePrediction/train.csv")
#visual data
feature_file.head()


Unnamed: 0,saleTime,price,rooms,baths,size,parking,floor,score,buildingSize,basement,year,repairYear,latitude,longitude
0,20150302,545000,3,2.25,1670,6240,1.0,8,1240,430,1974,0,47.6413,-122.113
1,20150211,785000,4,2.5,3300,10514,2.0,10,3300,0,1984,0,47.6323,-122.036
2,20150107,765000,3,3.25,3190,5283,2.0,9,3190,0,2007,0,47.5534,-122.002
3,20141103,720000,5,2.5,2900,9525,2.0,9,2900,0,1989,0,47.5442,-122.138
4,20140603,449500,5,2.75,2040,7488,1.0,7,1200,840,1969,0,47.7289,-122.172


In [56]:
####################################################
####Data processing
####################################################
from sklearn.model_selection import train_test_split

x = []# feature data
y = []# label
for index in feature_file.index.values:
    #print('index', index)
    #print(feature_file.values[0])
    #print(feature_file.ix[index].values) 
    x.append(feature_file.values[index][2: -1]) # get feature data from original csv
    y.append(feature_file.values[index][1])     # get label from original csv
    
x, y = np.array(x), np.array(y)
print('='*60)
print('输入数据的shape为: ', x.shape)
print('输出Label的shape为:', y.shape)
print('样本数为：', len(feature_file.index.values))
print('='*60)
# 划分训练集和验证集
X_train,X_valid,y_train,y_valid = train_test_split(x,y,test_size=0.2,random_state=12345)
print('训练集和对应Label的shape为: ', X_train.shape, y_train.shape)
print('验证集和对应Label的shape为: ', X_valid.shape, y_valid.shape)

输入数据的shape为:  (1816, 11)
输出Label的shape为: (1816,)
样本数为： 1816
训练集和对应Label的shape为:  (1452, 11) (1452,)
验证集和对应Label的shape为:  (364, 11) (364,)


# 1.2 构建XGBoost模型

In [57]:
from xgboost import XGBRegressor

#定义xgboost
xgb = XGBRegressor(learning_rate =0.1,
                   n_estimators=150, 
                   max_depth=5,
                   min_child_weight=1,
                   gamma=0,
                   subsample=0.8,
                   colsample_bytree=0.8,
                   objective= 'reg:squarederror',
                   nthread=4,
                   scale_pos_weight=1,
                   seed=27)


## 1.3 XGBoost模型训练保存及模型评估

In [58]:
xgb.fit(X_train,y_train) # training model
xgb.save_model(f'./XGB_train.xgb')    # save model file

score = xgb.score(X_valid,y_valid)
print(score)

0.7093157893300268


## 1.4 使用XGBoost模型进行预测

In [None]:
xgb = XGBRegressor()    # define model
xgb.load_model(f'./XGB_train.xgb')      #load model file
df_result = xgb.predict(X_valid)
df_result = pd.DataFrame(df_result)        #numpy to pandas
df_result.to_csv("./precit_normal.csv", index=False)

# 二、使用F-fold方法训练和预测XGBoost模型

## 2.1 使用F-fold方法训练XGBoost模型

In [45]:
import pandas as pd
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import KFold
#set the number of KFold
n_splits = 5
kf = KFold(n_splits)
fold = 0
for train_index, test_index in kf.split(x):
    # Kf.split returns the index position corresponding to the divided training and valide data
    train_X = x[train_index]
    train_y = y[train_index]
    valid_X = x[test_index]
    valid_y = y[test_index]
    #print(train_X.shape)
    #print(train_y.shape)
    #define xgboost
    xgb = XGBRegressor(learning_rate =0.1,
                       n_estimators=150, 
                       max_depth=5,
                       min_child_weight=1,
                       gamma=0,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       objective= 'reg:squarederror',
                       nthread=4,
                       scale_pos_weight=1,
                       seed=27)
    
    xgb.fit(train_X, train_y)
    xgb.save_model(f'./XgbModels/XGB_fold{fold}.xgb')
    fold = fold + 1
    score = xgb.score(X_valid,y_valid)
    print("第",fold,"次模型的的准确率为：", score)


第 1 次模型的的准确率为： 0.8668489120138088
第 2 次模型的的准确率为： 0.9257405632715963
第 3 次模型的的准确率为： 0.89672406342582
第 4 次模型的的准确率为： 0.9509216841043323
第 5 次模型的的准确率为： 0.9036554008041089


## 2.2 使用K-Fold产生的XGBoost模型进行综合预测

In [63]:
#Comprehensive prediction using the model generated by K-FOld

for f in range(0,n_splits):
    xgb = XGBRegressor()
    xgb.load_model(f'XgbModels/XGB_fold{f}.xgb')
    if f == 0:
        df_result = xgb.predict(X_valid)
    else:
        df_result = df_result + xgb.predict(X_valid)

df_result /= n_splits
df_result = pd.DataFrame(df_result)        #numpy to pandas
df_result.to_csv("./precit_kfold.csv", index=False)