In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report as report
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier,RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
import matplotlib.pyplot as plt

In [2]:
import numpy as np
import pandas as pd
df = pd.read_csv('data/final_train_data.csv', header = None)
df = pd.read_csv('cross/train0.csv', header = None)
data=df.values[1:]
data=data.astype(float)
X_train=data[:,0:22]
y_train=data[:,22]
y_train=y_train.astype(int)

df = pd.read_csv('data/final_test_data.csv', header = None)
data=df.values[1:]
data=data.astype(float)
X_test=data[:,0:22]
y_test=data[:,22]
y_test=y_test.astype(int)


# y_train = tf.one_hot(y_train,10)
# y_test = tf.one_hot(y_test,10)
# sess1=tf.Session()
# y_train, y_test = sess1.run([y_train,y_test])

In [3]:
lr_clf=LogisticRegression(max_iter=3000)
svm_clf=SVC()
rf_clf=RandomForestClassifier(
    n_estimators=21,
    criterion="gini",
    max_depth=4,
    random_state=0)

gbr = GradientBoostingClassifier(n_estimators=30000, max_depth=2, min_samples_split=2, learning_rate=0.01)
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=None, min_samples_split=20, min_samples_leaf=21),
                         algorithm="SAMME",
                         n_estimators=200, learning_rate=0.8)
xgb = XGBClassifier(learning_rate=0.01,max_depth=10)

In [4]:
lr_clf.fit(X_train,y_train)
svm_clf.fit(X_train,y_train)
rf_clf.fit(X_train,y_train)
gbr.fit(X_train, y_train.ravel())
bdt.fit(X_train,y_train)
eval_set = [(X_test, y_test)]
xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="mlogloss", eval_set=eval_set, verbose=False)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [5]:
lr_pred=lr_clf.predict(X_test)
svm_pred=svm_clf.predict(X_test)
rf_pred=rf_clf.predict(X_test)
gbr_pred = gbr.predict(X_test)
bdt_pred = bdt.predict(X_test)
xgb_pred = xgb.predict(X_test)

In [6]:
print('lr:\n',report(y_test,lr_pred))
print('svm:\n',report(y_test,svm_pred))
print('rf:\n',report(y_test, rf_pred))
print('GBDT:\n',report(y_test, gbr_pred))
print('AdaBoost:\n',report(y_test, bdt_pred))
print('XgBoost:\n',report(y_test, xgb_pred))

lr:
               precision    recall  f1-score   support

           0       0.72      0.56      0.63        32
           1       0.73      0.59      0.66        32
           2       0.94      0.97      0.95        32
           3       0.86      0.97      0.91        32
           4       0.57      0.53      0.55        32
           5       0.50      0.44      0.47        32
           6       0.70      1.00      0.82        32
           7       0.79      0.94      0.86        32
           8       0.80      0.25      0.38        32
           9       0.60      0.91      0.72        32

    accuracy                           0.72       320
   macro avg       0.72      0.72      0.70       320
weighted avg       0.72      0.72      0.70       320

svm:
               precision    recall  f1-score   support

           0       0.82      0.28      0.42        32
           1       0.86      1.00      0.93        32
           2       0.97      0.97      0.97        32
           3 

In [11]:
esb_clf=VotingClassifier(estimators=[
    ('rf_clf',rf_clf),
    ('svm_clf',SVC(probability=True)),
    ('lr_clf',lr_clf),
    ('GBDT',gbr),
    ('AdaBoost',bdt),
    ('Xgboost',xgb)
    ],
    weights=[0.6,0.6,0.6,1,2,0.9],
    voting='soft')
esb_clf2=VotingClassifier(estimators=[
    ('rf_clf',rf_clf),
    ('svm_clf',SVC(probability=True)),
    ('lr_clf',lr_clf),
    ('GBDT',gbr),
    ('AdaBoost',bdt),
    ('Xgboost',xgb)],
    weights=[0.6,0.6,0.6,1,2,0.9],
    voting='hard')

In [12]:
esb_clf.fit(X_train,y_train)
esb_pred=esb_clf.predict(X_test)
esb_clf2.fit(X_train,y_train)
esb_pred2=esb_clf2.predict(X_test)



In [13]:
print('ensemble-soft:\n',report(y_test,esb_pred))
print('ensemble-hard:\n',report(y_test,esb_pred2))

ensemble-soft:
               precision    recall  f1-score   support

           0       1.00      0.91      0.95        32
           1       0.97      0.97      0.97        32
           2       1.00      0.97      0.98        32
           3       0.94      0.97      0.95        32
           4       0.97      0.91      0.94        32
           5       0.88      0.91      0.89        32
           6       0.89      1.00      0.94        32
           7       0.93      0.88      0.90        32
           8       0.89      1.00      0.94        32
           9       1.00      0.94      0.97        32

    accuracy                           0.94       320
   macro avg       0.95      0.94      0.94       320
weighted avg       0.95      0.94      0.94       320

ensemble-hard:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        32
           1       0.97      0.97      0.97        32
           2       1.00      1.00      1.00   

In [14]:
from vecstack import stacking
from sklearn import metrics
# 模型融合
model_list = [rf_clf,bdt,xgb]
rel_n_folds = 0
rel_pre = -1
x = []
y = []
rel_pred = []

for n_fold in range(2,101, 1):
    S_train, S_test = stacking(model_list, X_train, y_train,X_test, regression=False, n_folds=n_fold)
    # # Initialize 2-nd level model
    model = GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 100, max_depth = 1)
    # # Fit 2-nd level model
    model_s = model.fit(S_train, y_train)
    # # Predict
    y_pred = model_s.predict(S_test)
    # Final prediction score
    f1_score = metrics.f1_score(y_pred, y_test,average='micro')
    x.append(n_fold)
    y.append(f1_score)
    print('n_folds =',n_fold,'f1_score=', f1_score)
    if f1_score >= rel_pre:
        rel_pre = f1_score
        rel_n_folds = n_fold
        rel_pred = y_pred

n_folds = 2 f1_score= 0.371875
n_folds = 3 f1_score= 0.203125
n_folds = 4 f1_score= 0.175
n_folds = 5 f1_score= 0.009375
n_folds = 6 f1_score= 0.4625
n_folds = 7 f1_score= 0.490625
n_folds = 8 f1_score= 0.603125
n_folds = 9 f1_score= 0.69375
n_folds = 10 f1_score= 0.015625
n_folds = 11 f1_score= 0.984375
n_folds = 12 f1_score= 0.975
n_folds = 13 f1_score= 0.984375


KeyboardInterrupt: 

In [15]:
print('rel_pre：',rel_pre)

rel_pre： 0.984375


In [17]:
# 模型融合
model_list = [rf_clf,bdt,xgb]
S_train, S_test = stacking(model_list, X_train, y_train,X_test, regression=False, n_folds=11)

# # Initialize 2-nd level model
model = GradientBoostingClassifier(learning_rate = 0.1, n_estimators = 100, max_depth = 3)

# # Fit 2-nd level model
model_s = model.fit(S_train, y_train)

# # Predict
y_pred = model_s.predict(S_test)


NameError: name 'test_labels' is not defined

In [21]:
# Final prediction score
# print('Final prediction score: [%.8f]' % metrics.accuracy_score(y_test, y_pred))
acc_score_test = metrics.accuracy_score(y_pred, y_test)
precision_score_test = metrics.precision_score(y_pred, y_test,average='micro')
recall_score_test = metrics.recall_score(y_pred, y_test,average='micro')
f1_score_test = metrics.f1_score(y_pred, y_test,average='micro')


print('Final 测试集准确率：{}\n'.format(acc_score_test))
print('Final 测试集精确率：{}\n'.format(precision_score_test))
print('Final 测试集召回率：{}\n'.format(recall_score_test))
print('Final 测试集f1评分：{}\n'.format(f1_score_test))

Final 测试集准确率：0.96875

Final 测试集精确率：0.96875

Final 测试集召回率：0.96875

Final 测试集f1评分：0.96875

