# Model Stacking

- Ensemble with different types of classifiers
    - Ex. Decision trees, logistic regression, are fitted on the same training data
    - Results are combined based on:
        - Majority voting (classification)
        - Average (regression)
- Ensemble with one type of classifier
    - Boostrap samples are drawn from the training data
    - Model fitted on each boostrap sample
    - All the results are combined to create an ensemble
    - Suitable for highly flexible models that are prone to overfitting / high variance

## Combining Methods

- Majority voting / average
- Method of application of meta-classifiers on outcomes
    - Binary outcome form individual classifiers
    - Meta-classifier is applied on top of these classifier outcomes
- Method of application of meta-classifiers on probabilities
    - Probabilities are obtained from individual classifiers
    - Applying meta-classifier

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [5]:
df = pd.read_csv('employee_attrition.csv')
num_col = list(df.describe().columns)
col_categorical = list(set(df.columns).difference(num_col))
remove_list = ['EmployeeCount', 'EmployeeNumber', 'StandardHours']
col_numerical = [
    e for e in num_col if e not in remove_list
]
attrition_to_num = {
    'Yes' : 0,
    'No' : 1
}
df['Attrition_num'] = df['Attrition'].map(attrition_to_num)
col_categorical.remove('Attrition')
df_cat = pd.get_dummies(df[col_categorical])
X = pd.concat([df[col_numerical], df_cat], axis=1)
y = df['Attrition_num']

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn import preprocessing

def printScore(clf, X_train, X_test, y_train, y_test, train=True):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        res = clf.predict(X_train)
        print('Train Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_train, res))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, res)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, res)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_train), lb.transform(res))))
    else:
        res_test = clf.predict(X_test)
        print('Test Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_test, res_test))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, res_test)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, res_test)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))

In [15]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train);

In [16]:
printScore(tree_clf, X_train, X_test, y_train, y_test)
printScore(tree_clf, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 1.00

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       168
           1       1.00      1.00      1.00       934

    accuracy                           1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102
 

Confusion Matrix: 
 [[168   0]
 [  0 934]] 

ROC AUC: 1.0000

Test Results:

Accuracy: 0.79

Classification Report: 
               precision    recall  f1-score   support

           0       0.42      0.39      0.41        69
           1       0.86      0.88      0.87       299

    accuracy                           0.79       368
   macro avg       0.64      0.63      0.64       368
weighted avg       0.78      0.79      0.78       368
 

Confusion Matrix: 
 [[ 27  42]
 [ 37 262]] 

ROC AUC: 0.6338



In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train);

RandomForestClassifier()

In [19]:
printScore(rf_clf, X_train, X_test, y_train, y_test)
printScore(rf_clf, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 1.00

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       168
           1       1.00      1.00      1.00       934

    accuracy                           1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102
 

Confusion Matrix: 
 [[168   0]
 [  0 934]] 

ROC AUC: 1.0000

Test Results:

Accuracy: 0.83

Classification Report: 
               precision    recall  f1-score   support

           0       0.69      0.13      0.22        69
           1       0.83      0.99      0.90       299

    accuracy                           0.83       368
   macro avg       0.76      0.56      0.56       368
weighted avg       0.80      0.83      0.77       368
 

Confusion Matrix: 
 [[  9  60]
 [  4 295]] 

ROC AUC: 0.5585



In [24]:
# probability comparison between both estimators
en_en = pd.DataFrame()
en_en['tree'] = pd.DataFrame(tree_clf.predict_proba(X_train))[1]
en_en['rf'] = pd.DataFrame(rf_clf.predict_proba(X_train))[1]
en_en = pd.concat([en_en, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)
en_en.head()

Unnamed: 0,tree,rf,Attrition_num
0,0.0,0.31,0
1,1.0,0.98,1
2,1.0,0.96,1
3,1.0,0.91,1
4,1.0,0.91,1


# Meta Classifier

In [25]:
from sklearn.linear_model import LogisticRegression

In [28]:
m_clf = LogisticRegression(fit_intercept=False, solver='lbfgs')
m_clf.fit(en_en[['tree', 'rf']], en_en['Attrition_num'])

LogisticRegression(fit_intercept=False)

In [35]:
en_test = pd.DataFrame()
en_test['tree'] = pd.DataFrame(tree_clf.predict_proba(X_test))[1]
en_test['rf'] = pd.DataFrame(rf_clf.predict_proba(X_test))[1]
en_test['combined'] = m_clf.predict(en_test[['tree', 'rf']])
en_test = pd.concat([en_test, pd.DataFrame(y_test).reset_index(drop=True)], axis=1)
en_test

Unnamed: 0,tree,rf,combined,Attrition_num
0,0.0,0.81,0,1
1,1.0,0.92,1,1
2,1.0,0.89,1,1
3,1.0,0.96,1,1
4,1.0,0.98,1,1
...,...,...,...,...
363,0.0,0.73,0,1
364,1.0,0.94,1,1
365,1.0,0.94,1,0
366,1.0,0.94,1,1


In [38]:
print(pd.crosstab(en_test['Attrition_num'], en_test['combined']))

combined        0    1
Attrition_num         
0              27   42
1              37  262
