In [1]:
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report as report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance

In [2]:
import numpy as np
import pandas as pd
N_FEATURES = 3  # columns 0..2 are the features; the next column is the class label


def load_split(csv_path, n_features=N_FEATURES):
    """Read one CSV split and return ``(X, y)`` as NumPy arrays.

    The first line of the file is consumed as a header row (the previous
    version of this cell parsed it as data and then discarded row 0, so the
    same rows end up in the arrays).

    Parameters
    ----------
    csv_path : str
        Path to the CSV file.
    n_features : int
        Number of leading columns used as features; the column right after
        them is used as the label.

    Returns
    -------
    X : numpy.ndarray of float
        The first ``n_features`` columns, one row per sample.
    y : numpy.ndarray of int
        The label column, cast float -> int exactly as before.

    Raises
    ------
    ValueError
        If the file has fewer than ``n_features + 1`` columns.
    """
    frame = pd.read_csv(csv_path, header=0)
    if frame.shape[1] <= n_features:
        raise ValueError(
            "%s: expected at least %d columns, found %d"
            % (csv_path, n_features + 1, frame.shape[1]))
    X = frame.iloc[:, :n_features].values.astype(float)
    # Go through float first so labels stored as e.g. "1.0" still become ints.
    y = frame.iloc[:, n_features].values.astype(float).astype(int)
    return X, y


X_train, y_train = load_split('data/hiber_train.csv')
X_test, y_test = load_split('data/hiber_test.csv')


# y_train = tf.one_hot(y_train,10)
# y_test = tf.one_hot(y_test,10)
# sess1=tf.Session()
# y_train, y_test = sess1.run([y_train,y_test])

In [3]:
# Base learners compared individually below and later combined by voting.
# Every estimator that accepts a seed is pinned to random_state=0 (the value
# rf_clf already used) so a fresh "Restart & Run All" reproduces the reports.
lr_clf = LogisticRegression(max_iter=3000)
svm_clf = SVC()
rf_clf = RandomForestClassifier(
    n_estimators=21,
    criterion="gini",
    max_depth=4,
    random_state=0)

# 30000 shallow stages at learning_rate=0.01: this is by far the slowest fit
# in the notebook.
gbr = GradientBoostingClassifier(n_estimators=30000, max_depth=2, min_samples_split=2,
                                 learning_rate=0.01, random_state=0)
# The tree is passed positionally as the base learner. AdaBoost's random_state
# is documented to control the seed handed to each boosted tree, so the tree
# itself is left unseeded.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=None, min_samples_split=20, min_samples_leaf=21),
                         algorithm="SAMME",
                         n_estimators=200, learning_rate=0.8, random_state=0)
xgb = XGBClassifier(learning_rate=0.01, max_depth=10, random_state=0)

In [5]:
# Fit every base learner on the full training split.
lr_clf.fit(X_train, y_train)
svm_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
gbr.fit(X_train, y_train.ravel())
bdt.fit(X_train, y_train)

# Early stopping needs a monitoring set. Using (X_test, y_test) here, as this
# cell did before, lets the test data choose the number of boosting rounds and
# inflates the XGBoost test report. Hold out a stratified 20% slice of the
# TRAINING data instead; the test split stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=0)
eval_set = [(X_val, y_val)]
# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the XGBClassifier constructor rather than on fit() -- confirm
# against the installed version before upgrading.
# Kept as the last statement so the cell still echoes the fitted estimator.
xgb.fit(X_fit, y_fit, early_stopping_rounds=10, eval_metric="logloss", eval_set=eval_set, verbose=False)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [6]:
# Test-set predictions, one array per fitted base learner. The targets on the
# left line up position-by-position with the estimators on the right.
lr_pred, svm_pred, rf_pred, gbr_pred, bdt_pred, xgb_pred = (
    fitted.predict(X_test)
    for fitted in (lr_clf, svm_clf, rf_clf, gbr, bdt, xgb)
)

In [7]:
# Print one classification report per base learner, in the same order and
# under the same headings as before.
for heading, predicted in (
    ('lr:\n', lr_pred),
    ('svm:\n', svm_pred),
    ('rf:\n', rf_pred),
    ('GBDT:\n', gbr_pred),
    ('AdaBoost:\n', bdt_pred),
    ('XgBoost:\n', xgb_pred),
):
    print(heading, report(y_test, predicted))

lr:
               precision    recall  f1-score   support

           1       0.77      0.98      0.86        45
           2       0.75      0.19      0.30        16

    accuracy                           0.77        61
   macro avg       0.76      0.58      0.58        61
weighted avg       0.77      0.77      0.72        61

svm:
               precision    recall  f1-score   support

           1       0.74      1.00      0.85        45
           2       0.00      0.00      0.00        16

    accuracy                           0.74        61
   macro avg       0.37      0.50      0.42        61
weighted avg       0.54      0.74      0.63        61

rf:
               precision    recall  f1-score   support

           1       0.77      0.98      0.86        45
           2       0.75      0.19      0.30        16

    accuracy                           0.77        61
   macro avg       0.76      0.58      0.58        61
weighted avg       0.77      0.77      0.72        61

GBD

  'precision', 'predicted', average, warn_for)


In [11]:
def _voting_members():
    """Return the ``(name, estimator)`` list shared by both voting ensembles.

    A fresh SVC is created on every call, so the two ensembles do not share
    an SVC instance. ``probability=True`` is required for soft voting, and is
    kept for the hard-voting ensemble too so its member is configured exactly
    as before. ``random_state=0`` pins the SVC's probability calibration,
    which was previously unseeded, so the soft vote is reproducible.
    """
    return [
        ('rf_clf', rf_clf),
        ('svm_clf', SVC(probability=True, random_state=0)),
        ('lr_clf', lr_clf),
        ('GBDT', gbr),
        ('AdaBoost', bdt),
        ('Xgboost', xgb),
    ]


# Weights are positional: they follow the member order returned above.
# Soft voting: weighted average of the members' predicted class probabilities.
esb_clf = VotingClassifier(
    estimators=_voting_members(),
    weights=[1, 0.7, 1, 0.7, 0.7, 0.7],
    voting='soft')
# Hard voting: weighted majority vote over the members' predicted labels.
esb_clf2 = VotingClassifier(
    estimators=_voting_members(),
    weights=[1, 0.6, 0.7, 0.7, 0.9, 0.9],
    voting='hard')

In [12]:
# fit() hands back the fitted ensemble, so training and test-set prediction
# can be chained. Soft-voting ensemble first, then hard-voting, as before.
esb_pred = esb_clf.fit(X_train, y_train).predict(X_test)
esb_pred2 = esb_clf2.fit(X_train, y_train).predict(X_test)



In [13]:
# Classification reports for the two voting ensembles on the test split.
for heading, predicted in (
    ('ensemble-soft:\n', esb_pred),
    ('ensemble-hard:\n', esb_pred2),
):
    print(heading, report(y_test, predicted))

ensemble-soft:
               precision    recall  f1-score   support

           1       0.76      0.98      0.85        45
           2       0.67      0.12      0.21        16

    accuracy                           0.75        61
   macro avg       0.71      0.55      0.53        61
weighted avg       0.73      0.75      0.69        61

ensemble-hard:
               precision    recall  f1-score   support

           1       0.77      0.96      0.85        45
           2       0.60      0.19      0.29        16

    accuracy                           0.75        61
   macro avg       0.68      0.57      0.57        61
weighted avg       0.72      0.75      0.70        61

