# 使用quandl库中的数据，利用Adaboost、GBDT、XGBoost、LightGBM四种算法进行预测

In [9]:
import quandl
import math
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
#获取数据集
df = quandl.get('WIKI/GOOGL')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Ex-Dividend,Split Ratio,Adj. Open,Adj. High,Adj. Low,Adj. Close,Adj. Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-08-19,100.01,104.06,95.96,100.335,44659000.0,0.0,1.0,50.159839,52.191109,48.128568,50.322842,44659000.0
2004-08-20,101.01,109.08,100.5,108.31,22834300.0,0.0,1.0,50.661387,54.708881,50.405597,54.322689,22834300.0
2004-08-23,110.76,113.48,109.05,109.4,18256100.0,0.0,1.0,55.551482,56.915693,54.693835,54.869377,18256100.0
2004-08-24,111.24,111.6,103.57,104.87,15247300.0,0.0,1.0,55.792225,55.972783,51.94535,52.597363,15247300.0
2004-08-25,104.76,108.0,103.88,106.0,9188600.0,0.0,1.0,52.542193,54.167209,52.10083,53.164113,9188600.0


In [3]:
#数据处理
from sklearn import preprocessing
forecast_col = 'Adj. Close'  # 预测列变量
forecast_out = int(math.ceil(0.01 * len(df)))  # 定于预测天数，数据长度是0.01
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
# HL_PCT为股票最高价与最低价的变化百分比
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
# PCT_change为股票收盘价与开盘价的变化百分比
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
# 为真正用到的特征字段
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
df.fillna(-99999, inplace=True)  # scikit-learn不处理空数据，故要把空数据设置为难出现的值
df['label'] = df[forecast_col].shift(-forecast_out)

# 模型最后使用的数据是X,y，以及预测的数据X_lately
X = np.array(df.drop(['label'], axis = 1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out]
df.dropna(inplace=True)
y = np.array(df['label'])

print(X)
print(y)

[[-1.39187095  2.33516806  0.23254018  4.46712802]
 [-1.37635178 -0.31135438  4.81828847  1.8207495 ]
 [-1.37423066  2.35004252 -0.79917922  1.26561472]
 ...
 [ 2.99737935 -0.50090092  0.33301614 -0.61156746]
 [ 2.7553098   0.01505419 -0.46778619 -0.24489949]
 [ 2.55960556  2.87565464 -1.90605746 -0.49425191]]
[  69.0782379    67.83941377   68.91272699 ... 1026.55       1054.09
 1006.94      ]


In [4]:
#分割数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

## Adaboost

In [11]:
from sklearn.ensemble import AdaBoostRegressor
# 搭建AdaBoost模型
start_time = time.time()
regressor=AdaBoostRegressor(n_estimators=500, learning_rate=0.05, loss='square')
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
# 用测试数据评估准确性
accuracy = regressor.score(X_test, y_test)
print("accuracy=",accuracy)
mse = mean_squared_error(y_test, y_pred)
print('均方误差: ', mse)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

accuracy= 0.977580240007228
均方误差:  1521.6039007847505
代码执行时间: 1.432054042816162 s


## GBDT

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
start_time = time.time()
# 搭建GBDT模型
regressor=GradientBoostingRegressor(n_estimators=500,max_depth=3,min_samples_split=5,
                                    learning_rate=0.05,loss='squared_error')
#n_estimators:弱分类器的个数;max depth:弱分类器 (CART回归树)的最大深度;min samples split:分内部节点所需的最小样本数
# learning rate：学习率;loss:损失函数_均方误差损失函数
#
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
# 用测试数据评估准确性
accuracy = regressor.score(X_test, y_test)
print("accuracy=",accuracy)
mse = mean_squared_error(y_test, y_pred)
print('均方误差: ', mse)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

accuracy= 0.9839798443571751
均方误差:  1087.2699495962472
代码执行时间: 1.3443052768707275 s


## XGBosot

In [13]:
import xgboost as xg
start_time = time.time()
#搭建XGBoost模型
regressor=xg.XGBRegressor(n_estimators=500,max_depth=3,min_samples_split=5,
                                    learning_rate=0.05,loss='reg:linear')
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
# 用测试数据评估准确性
accuracy = regressor.score(X_test, y_test)
print("accuracy=",accuracy)
mse = mean_squared_error(y_test, y_pred)
print('均方误差: ', mse)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

accuracy= 0.9834564532061252
均方误差:  1122.7919184902296
代码执行时间: 0.13999342918395996 s


Parameters: { "loss", "min_samples_split" } are not used.



## LightGBM

In [None]:
import lightgbm as lgb
start_time = time.time()
#构建LightGBM模型
regressor=lgb.LGBMRegressor(boosting_type= 'gbdt',n_estimators=500,max_depth=3,min_samples_split=5,
                                    learning_rate=0.05,loss='regression')
regressor.fit(X_train,y_train)
y_pred = regressor.predict(X_test)
# 用测试数据评估准确性
accuracy = regressor.score(X_test, y_test)
print("accuracy=",accuracy)
print("accuracy=",accuracy)
mse = mean_squared_error(y_test, y_pred)
print('均方误差: ', mse)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

In [14]:
import lightgbm as lgb
start_time = time.time()

#搭建XGBoost模型
train_data = lgb.Dataset(X_train, y_train)
validation_data = lgb.Dataset(X_test, y_test)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'lambdal1': 0.1,
    'lambdal2':0.2,
    'max_depth':3,
    'learning_rate': 0.05,
}

gbm = lgb.train(params,train_data,valid_sets=[validation_data])
# 使用训练好的模型进行预测
y_test = gbm.predict(X_test, num_iteration=gbm.best_iteration)
accuracy = regressor.score(X_test, y_test)
print("accuracy=",accuracy)
mse = mean_squared_error(y_test, y_pred)
print('均方误差: ', mse)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 2711, number of used features: 4
[LightGBM] [Info] Start training from score 410.282820
accuracy= 0.998703499126434
均方误差:  85.97176309600258
代码执行时间: 0.02599954605102539 s
