# Bagging for Classification

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import os

In [8]:
# Build the path with os.path.join instead of hard-coded "\\" separators so
# the cell is not tied to Windows. The file is still expected at
# Data/iris.csv under the directory the kernel was started in.
iris = pd.read_csv(os.path.join(os.getcwd(), "Data", "iris.csv"))

FileNotFoundError: [Errno 2] File C:\Users\aksha\OneDrive\Desktop\Data Science Material\Course Assignments\Algo From Scratch\Data\iris.csv does not exist: 'C:\\Users\\aksha\\OneDrive\\Desktop\\Data Science Material\\Course Assignments\\Algo From Scratch\\Data\\iris.csv'

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index that was saved into
# the CSV -- confirm against the file). errors='ignore' makes the cell
# idempotent: re-running it once the column is gone is a no-op, not a KeyError.
iris = iris.drop(columns='Unnamed: 0', errors='ignore')

In [9]:
# Predictors are the first four columns of the frame; the label is 'Species'.
X = iris.loc[:, list(iris.columns[:4])]
y = iris.loc[:, 'Species']

NameError: name 'iris' is not defined

In [5]:
# Hold out 30% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
    stratify=y,
)

## Function to create samples

In [26]:
def create_boot_samples(X_train, no_of_samples, random_state=None):
    """Draw bootstrap samples of ``X_train`` and record which rows each left out.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training predictors. Rows are identified by their index labels, so
        the index should be unique.
    no_of_samples : int
        Number of bootstrap samples to draw.
    random_state : int or None, default None
        Seed for reproducible sampling. ``None`` keeps the previous behaviour
        of drawing from NumPy's global random state.

    Returns
    -------
    dict_for_not_selected : dict[str, list[bool]]
        Maps ``'Boot1'``, ``'Boot2'``, ... to a mask with one entry per row of
        ``X_train`` (in row order). ``True`` marks a row that was NOT drawn,
        i.e. a row that is out-of-bag for that sample.
    samples : list[list]
        For each bootstrap sample, the index labels that were drawn
        (repeats included).
    """
    # One generator shared by all draws, so a fixed seed still yields a
    # different sample on every iteration.
    rng = None if random_state is None else np.random.RandomState(random_state)

    dict_for_not_selected = dict()
    samples = list()

    for i in range(no_of_samples):
        # Same size as the training set, drawn with replacement.
        boot_sample = X_train.sample(frac=1, replace=True, random_state=rng)
        drawn_labels = boot_sample.index.tolist()
        samples.append(drawn_labels)

        # Membership is tested on index labels rather than on positions
        # 0..n-1, so the mask is also correct for a non-default index.
        not_selected = ~X_train.index.isin(drawn_labels)

        dict_for_not_selected['Boot' + str(i + 1)] = not_selected.tolist()
    return dict_for_not_selected, samples

## Function to build models

In [27]:
def build_models(X_train, y_train, samples):
    """Fit one decision tree per bootstrap sample.

    ``samples`` is a sequence of index-label lists. Each list selects, by
    label and with repeats, the rows of ``X_train`` and ``y_train`` used to
    train one tree. The fitted trees are returned in the order of ``samples``.
    """
    fitted_trees = []
    for row_labels in samples:
        # Label-based lookup: a label that appears twice yields the row twice.
        boot_predictors = X_train.loc[row_labels]
        boot_response = y_train.loc[row_labels]

        tree = DecisionTreeClassifier()
        tree.fit(boot_predictors, boot_response)
        fitted_trees.append(tree)
    return fitted_trees

## Function to validate using out-of-bag (OOB) samples

In [28]:
def oob_stats(X_train, y_train, boot_unselected_list, samples, models):
    """Print out-of-bag (OOB) validation results for a bagged ensemble.

    Each model predicts only the training rows that were left out of its own
    bootstrap sample. The OOB prediction for a row is the majority vote over
    the models that did not train on it; ties go to the class that received
    its first vote from the earliest model. Rows that were never out-of-bag
    have no OOB prediction and are left out of the report.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training predictors.
    y_train : pandas.Series or single-column pandas.DataFrame
        True labels, row-aligned with ``X_train``.
    boot_unselected_list : dict[str, list[bool]]
        One mask per model, in the same order as ``models``. ``True`` marks a
        row (by position) that is out-of-bag for that model.
    samples : list
        Not used by the computation (the masks already carry the needed
        information); kept so existing callers keep working.
    models : list
        Fitted estimators exposing ``predict``.

    Returns
    -------
    None
        The confusion matrix and classification report are printed.
    """
    true_labels = np.asarray(y_train).ravel()
    n_rows = X_train.shape[0]

    # votes[r] maps a predicted class to the number of OOB votes for row r.
    votes = [dict() for _ in range(n_rows)]
    for mask, model in zip(boot_unselected_list.values(), models):
        oob_positions = np.flatnonzero(np.asarray(mask, dtype=bool))
        if oob_positions.size == 0:
            # This model trained on every row, so it has nothing to vote on.
            continue
        oob_pred = model.predict(X_train.iloc[oob_positions])
        for position, label in zip(oob_positions, oob_pred):
            row_votes = votes[position]
            row_votes[label] = row_votes.get(label, 0) + 1

    # Only rows with at least one OOB vote can be scored.
    scored_rows = [r for r in range(n_rows) if votes[r]]
    if not scored_rows:
        print('No row was out-of-bag for any model; OOB report skipped.')
        return

    n_unscored = n_rows - len(scored_rows)
    if n_unscored:
        print(str(n_unscored) + ' row(s) were never out-of-bag and are excluded from the OOB report.')

    # max() returns the first key reaching the top count, i.e. ties resolve
    # in favour of the class voted for first.
    final_prediction = [max(votes[r], key=votes[r].get) for r in scored_rows]
    scored_truth = true_labels[scored_rows]

    # Labels come from the data instead of a hard-coded iris list, so the
    # function works for any classification target.
    print(confusion_matrix(scored_truth, final_prediction, labels=np.unique(true_labels)))
    print(classification_report(scored_truth, final_prediction))

## Function to build bagging model

In [29]:
def bagging_build(X_train, y_train, no_of_samples, OOB_test = False):
    """Train a bagged ensemble of decision trees.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training predictors.
    y_train : pandas.Series or pandas.DataFrame
        Training labels, row-aligned with ``X_train``.
    no_of_samples : int
        Number of bootstrap samples, and therefore of trees, to build.
    OOB_test : bool, default False
        When true, the out-of-bag confusion matrix and classification report
        are printed after training.

    Returns
    -------
    list
        The fitted models, one per bootstrap sample.
    """
    # The helpers address rows by position 0..n-1, so both inputs get a fresh
    # default index. drop=True discards the old index directly, which also
    # works when the index is named or a column called 'index' already exists.
    # The caller's objects are not modified.
    X_train = X_train.reset_index(drop=True)
    if isinstance(y_train, pd.Series):
        # Labels are handed on as a single-column frame, as before.
        y_train = y_train.to_frame()
    y_train = y_train.reset_index(drop=True)

    boot_selection_list, samples = create_boot_samples(X_train, no_of_samples)
    models = build_models(X_train, y_train, samples)
    if OOB_test:
        oob_stats(X_train, y_train, boot_selection_list, samples, models)
    return models

In [10]:
# Train 100 bagged trees on the iris training split and print the OOB report.
models = bagging_build(X_train, y_train, no_of_samples=100, OOB_test=True)

[[35  0  0]
 [ 0 33  2]
 [ 0  2 33]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       0.94      0.94      0.94        35
   virginica       0.94      0.94      0.94        35

    accuracy                           0.96       105
   macro avg       0.96      0.96      0.96       105
weighted avg       0.96      0.96      0.96       105



## Function to predict using bagging model

In [30]:
def bagging_predict(models, X_test):
    """Predict by majority vote over a list of fitted models.

    Parameters
    ----------
    models : list
        Fitted estimators exposing ``predict``.
    X_test : pandas.DataFrame
        Rows to predict.

    Returns
    -------
    list
        One predicted label per row of ``X_test``. On a tie the label that
        was voted for first (in model order) wins.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    if len(models) == 0:
        raise ValueError('bagging_predict needs at least one fitted model')

    # Collect every model's predictions first and build the table once,
    # instead of rebuilding the DataFrame on each loop iteration.
    prediction_list = {
        'model_' + str(i + 1): model.predict(X_test)
        for i, model in enumerate(models)
    }
    vote_table = pd.DataFrame(prediction_list).to_numpy()

    final_prediction = list()
    for row in vote_table:
        count_dictionary = dict()
        for label in row:
            count_dictionary[label] = count_dictionary.get(label, 0) + 1
        # max() returns the first label that reaches the top count.
        final_prediction.append(max(count_dictionary, key=count_dictionary.get))
    return final_prediction

In [12]:
# Majority-vote predictions of the bagged trees on the held-out iris rows.
pred = bagging_predict(models, X_test)

In [13]:
# Test-set confusion matrix and per-class metrics for the bagged model.
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[15  0  0]
 [ 0 15  0]
 [ 0  3 12]]
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.83      1.00      0.91        15
   virginica       1.00      0.80      0.89        15

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.94      0.93      0.93        45



In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [31]:
# The CSV is read with explicit column names supplied through `names`.
# NOTE(review): the path below is absolute and machine-specific, so this cell
# only runs on the original author's machine; consider a path relative to
# the notebook.
names = [
    'preg', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
df = pd.read_csv(
    'C:/Users/aksha/OneDrive/Desktop/Data Science Material/Course Assignments/ML Article - Bagging on Linear Regression/pima_indians_diabetes.data.csv',
    names=names,
)

In [32]:
# Preview the first rows to check that the columns were parsed as intended.
df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [33]:
# Summary statistics for every column.
# NOTE(review): in the recorded output plas, pres, skin, test and mass all
# have a minimum of 0 -- possibly placeholder values for missing data; confirm.
df.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [34]:
# The first eight columns are the predictors; the ninth ('class') is the label.
# Note: this rebinds X, which held the iris features earlier in the notebook.
X, Y = df.iloc[:, :8], df.iloc[:, 8]

In [35]:
# 70/30 train/test split with a fixed seed (not stratified).
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=100,
)

In [36]:
# Train 50 bagged trees on the diabetes training split. The OOB report is off
# by default. Despite its name, `model` holds the whole list of fitted trees.
model = bagging_build(X1_train, y1_train, no_of_samples=50)

In [37]:
# Majority-vote predictions on the same rows the ensemble was trained on.
train_predict = bagging_predict(model, X1_train)

In [38]:
# Training-set metrics for the bagged model; the test-set report further down
# is the one that measures generalisation.
print(classification_report(y1_train, train_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       350
           1       0.99      1.00      1.00       187

    accuracy                           1.00       537
   macro avg       1.00      1.00      1.00       537
weighted avg       1.00      1.00      1.00       537



In [39]:
# Majority-vote predictions on the held-out diabetes rows.
test_predict = bagging_predict(model, X1_test)

In [40]:
# Test-set metrics for the bagged model.
print(classification_report(y1_test, test_predict))

              precision    recall  f1-score   support

           0       0.78      0.81      0.79       150
           1       0.62      0.58      0.60        81

    accuracy                           0.73       231
   macro avg       0.70      0.69      0.70       231
weighted avg       0.72      0.73      0.73       231



In [42]:
# Baseline for comparison: one decision tree with default settings, fitted on
# the same training split as the bagged ensemble.
decision = DecisionTreeClassifier()
decision.fit(X1_train, y1_train)

DecisionTreeClassifier()

In [43]:
# Training-set metrics for the single tree.
dc_training_pred = decision.predict(X1_train)
print(classification_report(y1_train, dc_training_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       350
           1       1.00      1.00      1.00       187

    accuracy                           1.00       537
   macro avg       1.00      1.00      1.00       537
weighted avg       1.00      1.00      1.00       537



In [45]:
# Test-set metrics for the single tree, to compare with the bagged model's
# test report above (recorded accuracy: 0.67 here versus 0.73 for bagging).
dc_test_pred = decision.predict(X1_test)
print(classification_report(y1_test, dc_test_pred))

              precision    recall  f1-score   support

           0       0.74      0.74      0.74       150
           1       0.52      0.53      0.53        81

    accuracy                           0.67       231
   macro avg       0.63      0.64      0.64       231
weighted avg       0.67      0.67      0.67       231

