# by 18bcd7017

## Importing libary

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict,train_test_split
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix, roc_auc_score
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

## reading data

In [2]:
df = pd.read_csv("emp.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


## data preprocessing

In [3]:
df.shape

(1470, 35)

In [4]:
df.drop(['EmployeeCount','EmployeeNumber','Over18','StandardHours'],axis=1,inplace=True)
df.replace({'Attrition':{'Yes':1,'No':0}}, inplace=True)
df_dum = pd.get_dummies(df)

## print_score functionn

In [5]:
def print_score(clf, X_train, X_test, y_train, y_test, train=True):
    '''
    v0.1 Follow the scikit learn library format in terms of input
    print the accuracy score, classification report and confusion matrix of classifier
    '''
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        '''
        training performance
        '''
        res = clf.predict(X_train)
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train,res)))
        print("Classification Report: \n {}\n".format(classification_report(y_train, res)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, res)))
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_train), lb.transform(res))))

        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        '''
        test performance
        '''
        res_test = clf.predict(X_test)
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test,  res_test)))
        print("Classification Report: \n {}\n".format(classification_report(y_test,  res_test)))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test,  res_test)))   
        print("ROC AUC: {0:.4f}\n".format(roc_auc_score(lb.transform(y_test),  lb.transform(res_test))))

## spliting data

In [6]:
y=df_dum['Attrition']
x=df_dum.drop(['Attrition'], axis=1)
x_col = x.columns

In [7]:
x = StandardScaler().fit_transform(x.values)
df1 = pd.DataFrame(x, columns=x_col)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1176, 51)
(294, 51)
(1176,)
(294,)


## DecisionTreeClassifier

In [9]:
DT_clf = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=3, min_samples_leaf=5) 
DT_clf.fit(X_train, y_train)
print_score(DT_clf, X_train, X_test, y_train, y_test, train=True)
print_score(DT_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8631

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.97      0.92       982
           1       0.68      0.32      0.44       194

    accuracy                           0.86      1176
   macro avg       0.78      0.65      0.68      1176
weighted avg       0.85      0.86      0.84      1176


Confusion Matrix: 
 [[953  29]
 [132  62]]

ROC AUC: 0.6450

Average Accuracy: 	 0.8393
Accuracy SD: 		 0.0137
Test Result:

accuracy score: 0.8639

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.98      0.92       251
           1       0.60      0.21      0.31        43

    accuracy                           0.86       294
   macro avg       0.74      0.59      0.62       294
weighted avg       0.84      0.86      0.83       294


Confusion Matrix: 
 [[245   6]
 [ 34   9]]

ROC AUC: 0.5927



## **Q 1** BaggingClassifier

In [10]:
bag_clf = BaggingClassifier(base_estimator=DT_clf, n_estimators=1000, bootstrap=True, oob_score=True,n_jobs=-1, random_state=100)
bag_clf.fit(X_train, y_train)
print_score(bag_clf, X_train, X_test, y_train, y_test, train=True)
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8733

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       982
           1       0.94      0.25      0.39       194

    accuracy                           0.87      1176
   macro avg       0.91      0.62      0.66      1176
weighted avg       0.88      0.87      0.84      1176


Confusion Matrix: 
 [[979   3]
 [146  48]]

ROC AUC: 0.6222

Average Accuracy: 	 0.8537
Accuracy SD: 		 0.0167
Test Result:

accuracy score: 0.8639

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.99      0.93       251
           1       0.67      0.14      0.23        43

    accuracy                           0.86       294
   macro avg       0.77      0.56      0.58       294
weighted avg       0.84      0.86      0.82       294


Confusion Matrix: 
 [[248   3]
 [ 37   6]]

ROC AUC: 0.5638



## **Q 2** AdaBoostClassifier

In [11]:
adaboost_clf = AdaBoostClassifier(n_estimators=200,random_state = 100)
adaboost_clf.fit(X_train, y_train)
print_score(adaboost_clf, X_train, X_test, y_train, y_test, train=True)
print_score(adaboost_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.9303

Classification Report: 
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       982
           1       0.88      0.66      0.76       194

    accuracy                           0.93      1176
   macro avg       0.91      0.82      0.86      1176
weighted avg       0.93      0.93      0.93      1176


Confusion Matrix: 
 [[965  17]
 [ 65 129]]

ROC AUC: 0.8238

Average Accuracy: 	 0.8504
Accuracy SD: 		 0.0400
Test Result:

accuracy score: 0.8912

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.98      0.94       251
           1       0.74      0.40      0.52        43

    accuracy                           0.89       294
   macro avg       0.82      0.69      0.73       294
weighted avg       0.88      0.89      0.88       294


Confusion Matrix: 
 [[245   6]
 [ 26  17]]

ROC AUC: 0.6857



## **Q 3**  Ada Boost + Random Forest

In [12]:
RF_clf = RandomForestClassifier(n_estimators=200,random_state=100,max_depth=3,max_features='auto')
RF_clf.fit(X_train, y_train)
print_score(RF_clf, X_train, X_test, y_train, y_test, train=True)
print_score(RF_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8435

Classification Report: 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       982
           1       1.00      0.05      0.10       194

    accuracy                           0.84      1176
   macro avg       0.92      0.53      0.51      1176
weighted avg       0.87      0.84      0.78      1176


Confusion Matrix: 
 [[982   0]
 [184  10]]

ROC AUC: 0.5258

Average Accuracy: 	 0.8410
Accuracy SD: 		 0.0053
Test Result:

accuracy score: 0.8639

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      1.00      0.93       251
           1       1.00      0.07      0.13        43

    accuracy                           0.86       294
   macro avg       0.93      0.53      0.53       294
weighted avg       0.88      0.86      0.81       294


Confusion Matrix: 
 [[251   0]
 [ 40   3]]

ROC AUC: 0.5349



In [14]:
adaboost_clf2 = AdaBoostClassifier(base_estimator=RF_clf,n_estimators=200,random_state = 100,learning_rate=0.1)
adaboost_clf2.fit(X_train, y_train)
print_score(adaboost_clf2, X_train, X_test, y_train, y_test, train=True)
print_score(adaboost_clf2, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.9583

Classification Report: 
               precision    recall  f1-score   support

           0       0.95      1.00      0.98       982
           1       0.99      0.75      0.86       194

    accuracy                           0.96      1176
   macro avg       0.97      0.88      0.92      1176
weighted avg       0.96      0.96      0.96      1176


Confusion Matrix: 
 [[981   1]
 [ 48 146]]

ROC AUC: 0.8758

Average Accuracy: 	 0.8742
Accuracy SD: 		 0.0208
Test Result:

accuracy score: 0.8946

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.99      0.94       251
           1       0.83      0.35      0.49        43

    accuracy                           0.89       294
   macro avg       0.87      0.67      0.72       294
weighted avg       0.89      0.89      0.88       294


Confusion Matrix: 
 [[248   3]
 [ 28  15]]

ROC AUC: 0.6684



##**Q 4** GradientBoostingClassifier

In [15]:
GB_clf =  GradientBoostingClassifier(n_estimators=200,random_state = 100,max_features="auto")
GB_clf.fit(X_train, y_train)
print_score(GB_clf, X_train, X_test, y_train, y_test, train=True)
print_score(GB_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.9855

Classification Report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       982
           1       1.00      0.91      0.95       194

    accuracy                           0.99      1176
   macro avg       0.99      0.96      0.97      1176
weighted avg       0.99      0.99      0.99      1176


Confusion Matrix: 
 [[982   0]
 [ 17 177]]

ROC AUC: 0.9562

Average Accuracy: 	 0.8589
Accuracy SD: 		 0.0263
Test Result:

accuracy score: 0.8776

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.96      0.93       251
           1       0.63      0.40      0.49        43

    accuracy                           0.88       294
   macro avg       0.77      0.68      0.71       294
weighted avg       0.86      0.88      0.87       294


Confusion Matrix: 
 [[241  10]
 [ 26  17]]

ROC AUC: 0.6778



##**Q 5** XGBClassifier

In [16]:
xgb_model = xgb.XGBClassifier(learning_rate=0.1,n_estimators=200,objective='binary:logistic',nthread= 4,n_jobs= 4,base_score= 0.6)
xgb_model.fit(X_train, y_train)
print_score(xgb_model, X_train, X_test, y_train, y_test, train=True)
print_score(xgb_model, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.9736

Classification Report: 
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       982
           1       0.99      0.85      0.91       194

    accuracy                           0.97      1176
   macro avg       0.98      0.92      0.95      1176
weighted avg       0.97      0.97      0.97      1176


Confusion Matrix: 
 [[981   1]
 [ 30 164]]

ROC AUC: 0.9222

Average Accuracy: 	 0.8648
Accuracy SD: 		 0.0203
Test Result:

accuracy score: 0.8741

Classification Report: 
               precision    recall  f1-score   support

           0       0.90      0.96      0.93       251
           1       0.62      0.35      0.45        43

    accuracy                           0.87       294
   macro avg       0.76      0.66      0.69       294
weighted avg       0.86      0.87      0.86       294


Confusion Matrix: 
 [[242   9]
 [ 28  15]]

ROC AUC: 0.6565



## Stacking 

### A. Meta Classifier-you take as Logistic Regression and ensembles as "Decision Tree" and "Random Forest"

In [17]:
xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(X_train, y_train, test_size=0.5, random_state=100)

In [19]:
def train_base_learners(base_learners, inp, out, verbose=True):
    """
    Train all base learners in the library.
    """
    if verbose: print("Fitting models.")
    for i, (name, m) in enumerate(base_learners.items()):
        if verbose: print("%s..." % name, end=" ", flush=False)
        m.fit(inp, out)
        if verbose: print("done")
        

In [20]:
train_base_learners({'DT': DT_clf,'RF': RF_clf}, xtrain_base, ytrain_base)

Fitting models.
DT... done
RF... done


In [21]:
def predict_base_learners(pred_base_learners, inp, verbose=True):
    """
    Generate a prediction matrix.
    """
    P = np.zeros((inp.shape[0], len(pred_base_learners)))

    if verbose: print("Generating base learner predictions.")
    for i, (name, m) in enumerate(pred_base_learners.items()):
        if verbose: print("%s..." % name, end=" ", flush=False)
        p = m.predict_proba(inp)
        # With two classes, need only predictions for one class
        P[:, i] = p[:, 1]
        if verbose: print("done")

    return P

In [22]:
P_base = predict_base_learners({'DT': DT_clf,'RF': RF_clf}, xpred_base)

Generating base learner predictions.
DT... done
RF... done


In [23]:
logisticRegr = LogisticRegression()
logisticRegr.fit(P_base, ypred_base)
#print_score(logisticRegr, p, X_test, y_train, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """
    Generate predictions from the ensemble.
    """
    P_pred = predict_base_learners(base_learners, inp, verbose=verbose)
    return P_pred, meta_learner.predict_proba(P_pred)[:, 1]

In [25]:
P_pred, p = ensemble_predict({'DT': DT_clf,'RF': RF_clf}, logisticRegr, X_test)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(y_test, p))

Generating base learner predictions.
DT... done
RF... done

Ensemble ROC-AUC score: 0.802


###  B. Single Classifier- Random Forest --> Adaboost --> Bagging

In [None]:
bag_clf = BaggingClassifier(base_estimator=adaboost_clf2, n_estimators=1000, bootstrap=True, oob_score=True,n_jobs=-1, random_state=100)
bag_clf.fit(X_train, y_train)
print_score(bag_clf, X_train, X_test, y_train, y_test, train=True)
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)

## practice stackingclassifier 

In [30]:
from sklearn.ensemble import StackingClassifier
sclf = StackingClassifier(estimators=[('DT',DT_clf ),('RFC', RF_clf)])
sclf.fit(X_train, y_train)
print_score(bag_clf, X_train, X_test, y_train, y_test, train=True)
print_score(bag_clf, X_train, X_test, y_train, y_test, train=False)

Train Result:

accuracy score: 0.8350

Classification Report: 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       982
           1       0.00      0.00      0.00       194

    accuracy                           0.84      1176
   macro avg       0.42      0.50      0.46      1176
weighted avg       0.70      0.84      0.76      1176


Confusion Matrix: 
 [[982   0]
 [194   0]]

ROC AUC: 0.5000



  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: ignored