# Model : classification using XGboost

In [82]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import os
import seaborn as sns

In [83]:
pd.set_option('display.max_rows',100)

In [84]:
df = pd.read_csv('imputed_final_deleted2.csv') # deleted2 파일이 corr 보고 추가적으로 지워준 파일
print(df.shape)

(6835, 31)


## Measures for Highly Imbalanced Dataset
1. Oversample the minority class (SMOTE etc.)
2. Undersample the majority class
3. Use class_weight kind of parameters provided within several ML libraries

In [85]:
y= df['class']
X= df[df.columns[df.columns!='class']]

In [86]:
from sklearn.preprocessing import MinMaxScaler

In [87]:
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit_transform(X)
X= scaler.fit_transform(X)

col = list(df)
col.remove('class')

X = pd.DataFrame(X)
X.columns = col

In [88]:
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

In [89]:
X_resampled, y_resampled = SMOTE(random_state=5).fit_sample(X, y)

print('After OverSampling, the shape of train_X: {}'.format(X_resampled.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(X_resampled.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_resampled==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_resampled==0)))

After OverSampling, the shape of train_X: (12948, 30)
After OverSampling, the shape of train_y: (12948, 30) 

After OverSampling, counts of label '1': 6474
After OverSampling, counts of label '0': 6474


In [90]:
X_up = X_resampled
y_up = pd.DataFrame(y_resampled)

In [91]:
from sklearn.model_selection import train_test_split

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X_up, y_up, test_size= 0.2, random_state= 5)

## Classification Models
- CART, Bagging NN, AdaBoost, Gradient Boost, Random Forest


## Classification Metrics
- confusion_matrix, accuracy_score , recall_score , precision_score, f1_score, roc_curve,roc_auc_score

In [93]:
seed = 5
np.random.seed(seed)

## Extra Gradient Boosting (XG Boost)

In [94]:
from xgboost import XGBClassifier

In [95]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [96]:
kfold = StratifiedKFold(n_splits=7, random_state=5)



## learning rate

In [97]:
xgb_model1 = XGBClassifier(learning_rate=0.3)

results = cross_val_score(xgb_model1, X_train, y_train, cv=kfold)

print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.15% (0.44%)


In [98]:
xgb_model2 = XGBClassifier(learning_rate=0.05)

results = cross_val_score(xgb_model2, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 93.18% (0.72%)


In [99]:
xgb_model3 = XGBClassifier(learning_rate=0.2)

results = cross_val_score(xgb_model3, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.04% (0.45%)


## max depth

In [100]:
xgb_model4 = XGBClassifier(max_depth=3)

results = cross_val_score(xgb_model4, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 93.09% (0.65%)


In [101]:
xgb_model5 = XGBClassifier(max_depth=6)

results = cross_val_score(xgb_model5, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.15% (0.44%)


In [102]:
xgb_model6 = XGBClassifier(max_depth=10)

results = cross_val_score(xgb_model6, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.44% (0.39%)


## min child weight

In [103]:
xgb_model7 = XGBClassifier(min_child_weight=0.1)

results = cross_val_score(xgb_model7, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.42% (0.27%)


In [104]:
xgb_model8= XGBClassifier(min_child_weight=1)

results = cross_val_score(xgb_model8, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.15% (0.44%)


In [105]:
xgb_model9 = XGBClassifier(min_child_weight=3)

results = cross_val_score(xgb_model9, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 96.95% (0.59%)


## gamma

In [106]:
xgb_model10 = XGBClassifier(gamma=0)

results = cross_val_score(xgb_model10, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.15% (0.44%)


In [107]:
xgb_model11 = XGBClassifier(gamma=0.2)

results = cross_val_score(xgb_model11, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.17% (0.42%)


In [108]:
xgb_model12 = XGBClassifier(gamma=0.4)

results = cross_val_score(xgb_model12, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 96.91% (0.37%)


## colsample_bytree

In [109]:
xgb_model13 = XGBClassifier(colsample_bytree=0.3)

results = cross_val_score(xgb_model13, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 96.87% (0.47%)


In [110]:
xgb_model14 = XGBClassifier(colsample_bytree=0.5)

results = cross_val_score(xgb_model14, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 96.92% (0.33%)


In [111]:
xgb_model15 = XGBClassifier(colsample_bytree=0.7)

results = cross_val_score(xgb_model15, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.19% (0.29%)


## hyper parameter tuning
### 각 parameter 마다 가장 성능 좋은 것으로 튜닝

In [112]:
xgb_model_final = XGBClassifier(learning_rate=0.3, max_depth=10, min_child_weight=0.1, gamma=0.2, colsample_bytree=0.7)

results = cross_val_score(xgb_model_final, X_train, y_train, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 97.17% (0.39%)


In [113]:
xg_pred = xgb_model_final.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, xg_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.96      0.97      1325
         1.0       0.96      0.98      0.97      1265

    accuracy                           0.97      2590
   macro avg       0.97      0.97      0.97      2590
weighted avg       0.97      0.97      0.97      2590



In [114]:
from sklearn.externals import joblib 



In [116]:
file_name = 'xgb_01.pkl' 
joblib.dump(xgb_model_final, file_name) 

['xgb_01.pkl']