# 使用红酒数据集，利用Adaboost、GBDT、XGBoost、LightGBM四种算法进行

In [63]:
import math
import numpy as np
import time
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [64]:
#获取数据
wine = load_wine()
X = wine.data
y = wine.target
X

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [13]:
df_wine = load_wine()
df = pd.DataFrame(df_wine.data, columns=df_wine.feature_names)
df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0


In [65]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

## AdaBoost

In [53]:
from sklearn.ensemble import AdaBoostClassifier

# 搭建AdaBoost模型
start_time = time.time()
classifier = AdaBoostClassifier(n_estimators=500, learning_rate=0.05)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# 用测试数据评估准确性
accuracy = accuracy_score(y_test,y_pred)
print("accuracy=", accuracy)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

accuracy= 0.8518518518518519
代码执行时间: 0.6258752346038818 s


## GBDT

In [54]:
from sklearn.ensemble import GradientBoostingClassifier

# 搭建GBDT模型
start_time = time.time()
classifier = GradientBoostingClassifier(n_estimators=500, learning_rate=0.1,max_depth=3,loss='deviance')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# 用测试数据评估准确性
accuracy = accuracy_score(y_test,y_pred)
print("accuracy=", accuracy)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

accuracy= 0.9259259259259259
代码执行时间: 0.6534223556518555 s


## XGBoost

In [55]:
import xgboost as xg
start_time = time.time()
#搭建XGBoost模型
classifier=xg.XGBClassifier(n_estimators=500,max_depth=3,min_samples_split=5,
                                    learning_rate=0.05,objective ='multi:softmax')
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
# 用测试数据评估准确性
accuracy = accuracy_score(y_test,y_pred)
print("accuracy=",accuracy)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

Parameters: { "min_samples_split" } are not used.



accuracy= 0.9444444444444444
代码执行时间: 0.2376692295074463 s


## LightGBM

In [67]:
import lightgbm as lgb
start_time = time.time()
#搭建LightGBM模型
classifier=lgb.LGBMClassifier(n_estimators=500,learning_rate=0.05,objective= 'multiclass', num_class= 3, metric='multi_logloss')
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
# 用测试数据评估准确性
accuracy = accuracy_score(y_test,y_pred)
print("accuracy=",accuracy)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 124, number of used features: 13
[LightGBM] [Info] Start training from score -1.131402
[LightGBM] [Info] Start training from score -0.908259
[LightGBM] [Info] Start training from score -1.293921
accuracy= 0.9814814814814815
代码执行时间: 0.09840750694274902 s


In [58]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
start_time = time.time()

train_data = lgb.Dataset(X_train, y_train)
validation_data = lgb.Dataset(X_test, y_test)
params = {
    'objective': 'multiclass',
    'lambdal1': 0.1,
    'lambdal2':0.2,
    'max_depth':3,
    'learning_rate': 0.05,
    'num_class': 3,
    'metric': 'multi_logloss'
}

gbm = lgb.train(params,train_data,valid_sets=[validation_data])

# 使用训练好的模型进行预测
y_test = gbm.predict(X_test, num_iteration=gbm.best_iteration)
accuracy = accuracy_score(y_test,y_pred)
print("accuracy=",accuracy)
end_time = time.time()
execution_time = end_time - start_time
print(f"代码执行时间: {execution_time} s")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 453
[LightGBM] [Info] Number of data points in the train set: 124, number of used features: 13


TypeError: Wrong type(ndarray) for label.
It should be list, numpy 1-D array or pandas Series