# Bagging

In [1]:
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd 

In [2]:
df = sns.load_dataset('titanic')

In [4]:
df.shape

(891, 15)

In [6]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [8]:
df.dropna(inplace=True)
df['pclass'].unique()

array([1, 3, 2], dtype=int64)

In [10]:
df['pclass'].value_counts()

1    157
2     15
3     10
Name: pclass, dtype: int64

In [9]:
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [11]:
X = df[['pclass', 'sex', 'age']].copy()

In [13]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [14]:
X['sex'] = le.fit_transform(df['sex'])

In [15]:
X.head()

Unnamed: 0,pclass,sex,age
1,1,0,38.0
3,1,0,35.0
6,1,1,54.0
10,3,0,4.0
11,1,0,58.0


In [17]:
X.shape

(182, 3)

In [18]:
X.describe()

Unnamed: 0,pclass,sex,age
count,182.0,182.0,182.0
mean,1.192308,0.516484,35.623187
std,0.516411,0.501107,15.671615
min,1.0,0.0,0.92
25%,1.0,0.0,24.0
50%,1.0,1.0,36.0
75%,1.0,1.0,47.75
max,3.0,1.0,80.0


In [20]:
y = df['survived'].copy()

In [23]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [30]:
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix, roc_auc_score

def printScore(clf, X_train, X_test, y_train, y_test, train=True):
    lb = preprocessing.LabelBinarizer()
    lb.fit(y_train)
    if train:
        res = clf.predict(X_train)
        print('Train Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_train, res))
        print('Classification Report: \n {} \n'.format(classification_report(y_train, res)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, res)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_train), lb.transform(res))))
    else:
        res_test = clf.predict(X_test)
        print('Test Results:\n')
        print('Accuracy: %.2f\n' % accuracy_score(y_test, res_test))
        print('Classification Report: \n {} \n'.format(classification_report(y_test, res_test)))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, res_test)))
        print('ROC AUC: {0:.4f}\n'.format(roc_auc_score(lb.transform(y_test), lb.transform(res_test))))

In [31]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [34]:
printScore(clf, X_train, X_test, y_train, y_test, train=True)
printScore(clf, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 0.93

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.96      0.90        46
           1       0.98      0.92      0.95        99

    accuracy                           0.93       145
   macro avg       0.91      0.94      0.92       145
weighted avg       0.94      0.93      0.93       145
 

Confusion Matrix: 
 [[44  2]
 [ 8 91]] 

ROC AUC: 0.9379

Test Results:

Accuracy: 0.73

Classification Report: 
               precision    recall  f1-score   support

           0       0.60      0.69      0.64        13
           1       0.82      0.75      0.78        24

    accuracy                           0.73        37
   macro avg       0.71      0.72      0.71        37
weighted avg       0.74      0.73      0.73        37
 

Confusion Matrix: 
 [[ 9  4]
 [ 6 18]] 

ROC AUC: 0.7212



In [37]:
bag_clf = BaggingClassifier(base_estimator=clf, n_estimators=1000, 
                            bootstrap=True, n_jobs=-1, random_state=42
                            )
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                  n_estimators=1000, n_jobs=-1, random_state=42)

In [38]:
printScore(clf, X_train, X_test, y_train, y_test, train=True)
printScore(clf, X_train, X_test, y_train, y_test, train=False)

Train Results:

Accuracy: 0.93

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.96      0.90        46
           1       0.98      0.92      0.95        99

    accuracy                           0.93       145
   macro avg       0.91      0.94      0.92       145
weighted avg       0.94      0.93      0.93       145
 

Confusion Matrix: 
 [[44  2]
 [ 8 91]] 

ROC AUC: 0.9379

Test Results:

Accuracy: 0.73

Classification Report: 
               precision    recall  f1-score   support

           0       0.60      0.69      0.64        13
           1       0.82      0.75      0.78        24

    accuracy                           0.73        37
   macro avg       0.71      0.72      0.71        37
weighted avg       0.74      0.73      0.73        37
 

Confusion Matrix: 
 [[ 9  4]
 [ 6 18]] 

ROC AUC: 0.7212

