In [39]:
from sklearn.datasets import fetch_openml
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
_id=1510
X_breast,y_breast= fetch_openml(data_id=_id,return_X_y=True,parser='auto')


In [3]:
_id=182
X_sat,y_sat= fetch_openml(data_id=_id,return_X_y=True,parser='auto')


# Bagging


In [4]:
# set the number of base classifiers
n_classifiers = 20
k=1

# create a list to store the base classifiers
def Bagging_DT(X_train, X_test, y_train, y_test,n_classifiers):
    classifiers = []
    # train n_classifiers decision trees with depth 1 on different subsets of the training data
    np.random.seed(0)
    for i in range(n_classifiers):
        
        # randomly sample the training data with replacement to create a new training set
        sample_indices = np.random.choice(len(X_train), len(X_train), replace=True)
        X_train_sample = X_train.iloc[sample_indices]
        y_train_sample = y_train.iloc[sample_indices]

        # fit a decision tree with depth 1 on the sample training data
        clf = DecisionTreeClassifier(max_depth=1)
        clf.fit(X_train_sample, y_train_sample)

        # store the fitted classifier in the list
        classifiers.append(clf)

    # use the base classifiers to make predictions on the test data
    predictions = np.ones((len(X_test), n_classifiers))
    for i, clf in enumerate(classifiers):
        predictions[:, i] = clf.predict(X_test)

    # take the majority vote to make the final prediction for each test data point
    final_predictions = np.round(np.mean(predictions, axis=1))
    # calculate the accuracy of the bagging algorithm
    y_test = y_test.astype(int)
    accuracy = np.mean(final_predictions == y_test)
    #print('Accuracy:', accuracy)
    return final_predictions,accuracy


In [5]:
def Bagging_KNN(X_train, X_test, y_train, y_test, n_classifiers, k):
    classifiers = []
    # train n_classifiers KNN classifiers on different subsets of the training data
    np.random.seed(0)
    for i in range(n_classifiers):
        
        # randomly sample the training data with replacement to create a new training set
        sample_indices = np.random.choice(len(X_train), len(X_train), replace=True)
        X_train_sample = X_train.iloc[sample_indices]
        y_train_sample = y_train.iloc[sample_indices]

        # fit a KNN classifier on the sample training data
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X_train_sample, y_train_sample)

        # store the fitted classifier in the list
        classifiers.append(clf)

    # use the base classifiers to make predictions on the test data
    predictions = np.ones((len(X_test), n_classifiers))
    for i, clf in enumerate(classifiers):
        predictions[:, i] = clf.predict(X_test)

    # take the majority vote to make the final prediction for each test data point
    final_predictions = np.round(np.mean(predictions, axis=1))
    # calculate the accuracy of the bagging algorithm
    y_test = y_test.astype(int)
    accuracy = np.mean(final_predictions == y_test)
    #print('Accuracy:', accuracy)
    return final_predictions, accuracy

In [6]:
def DT_Cross_Val(X,y):
    n_classifiers_range =(20,30,50,100,200,250)
    accuracies = []

    # initialize the stratified k-fold cross-validation object
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    # loop over the range of classifiers
    for n_classifiers in n_classifiers_range:
        accuracy = []

        # loop over the folds
        for train_index, test_index in kfold.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]
            pred,acc=Bagging_DT(X_train,X_test,y_train,y_test,n_classifiers)
            accuracy.append(acc)
        #print(accuracy,'\n')
        accuracies.append(np.mean(accuracy))   
    optimal_n_classifiers = n_classifiers_range[np.argmax(accuracies)]
    print('mean of accuracies: ',accuracies)
    print('Optimal number of classifiers:', optimal_n_classifiers)
    print('Optimal accuracy:', accuracies[np.argmax(accuracies)])

In [7]:
def KNN_Cross_Val(X,y):
    n_classifiers_range =(20,30,50,100,200,250)
    accuracies = []

    # initialize the stratified k-fold cross-validation object
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

    # loop over the range of classifiers
    for n_classifiers in n_classifiers_range:
        accuracy = []

        # loop over the folds
        for train_index, test_index in kfold.split(X, y):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]
            pred,acc=Bagging_KNN(X_train,X_test,y_train,y_test,n_classifiers,k=7)
            accuracy.append(acc)
        #print(accuracy,'\n')
        accuracies.append(np.mean(accuracy))   
    optimal_n_classifiers = n_classifiers_range[np.argmax(accuracies)]
    print('mean of accuracies: ',accuracies)
    print('Optimal number of classifiers:', optimal_n_classifiers)
    print('Optimal accuracy:', accuracies[np.argmax(accuracies)])

# breast dataset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast, test_size=0.2, random_state=0)
breast_pred,acc=Bagging_DT(X_train, X_test, y_train, y_test,n_classifiers)
print('Accuracy:', acc)

Accuracy: 0.9385964912280702


In [9]:
breast_pred,acc=Bagging_KNN(X_train, X_test, y_train, y_test,n_classifiers,k)
print('Accuracy:', acc)

Accuracy: 0.9035087719298246


In [10]:
DT_Cross_Val(X_breast,y_breast)

mean of accuracies:  [0.9121409718987735, 0.915649743828598, 0.9156342182890856, 0.9191274646793977, 0.9261605340785592, 0.9208818506443098]
Optimal number of classifiers: 200
Optimal accuracy: 0.9261605340785592


In [11]:
KNN_Cross_Val(X_breast,y_breast)

mean of accuracies:  [0.9332401800962582, 0.9349945660611706, 0.9332401800962582, 0.9332401800962582, 0.9349945660611706, 0.9385033379909953]
Optimal number of classifiers: 250
Optimal accuracy: 0.9385033379909953


# satimage dataset

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_sat, y_sat, test_size=0.2, random_state=0)
y_test=y_test.astype(float)
y_test=y_test.astype(int)
y_train=y_train.astype(float)
y_train=y_train.astype(int)
sat_pred,acc=Bagging_DT(X_train, X_test, y_train, y_test,n_classifiers)
print('Accuracy:', acc)

Accuracy: 0.307153965785381


In [13]:
breast_pred,acc=Bagging_KNN(X_train, X_test, y_train, y_test,n_classifiers,k)
print('Accuracy:', acc)

Accuracy: 0.8600311041990669


In [14]:
y_sat=y_sat.astype(float)
DT_Cross_Val(X_sat,y_sat)

mean of accuracies:  [0.35303265940902023, 0.35303265940902023, 0.3052877138413686, 0.302954898911353, 0.30451010886469676, 0.30451010886469676]
Optimal number of classifiers: 20
Optimal accuracy: 0.35303265940902023


In [15]:
KNN_Cross_Val(X_sat,y_sat)

mean of accuracies:  [0.8598755832037325, 0.8618973561430792, 0.864230171073095, 0.862208398133748, 0.8626749611197513, 0.8626749611197513]
Optimal number of classifiers: 50
Optimal accuracy: 0.864230171073095


# OOB Breast

In [18]:
def Bagging_DT_oob(X_train, X_test, y_train, y_test, n_classifiers):
    classifiers = []
    # train n_classifiers decision trees with depth 1 on different subsets of the training data
    np.random.seed(0)
    n_samples = len(X_train)
    oob_accuracies=[]
    y_train=y_train.astype(float)
    for i in range(n_classifiers):
        
        # randomly sample the training data with replacement to create a new training set
        sample_indices = np.random.choice(n_samples, n_samples, replace=True)
        X_train_sample = X_train.iloc[sample_indices]
        y_train_sample = y_train.iloc[sample_indices]

        # fit a decision tree with depth 1 on the sample training data
        clf = DecisionTreeClassifier(max_depth=1)
        clf.fit(X_train_sample, y_train_sample)

        # store the fitted classifier in the list
        classifiers.append(clf)

        # use the current classifier to make predictions on the samples not included in the current training set
        not_included_indices = np.array([i for i in range(n_samples) if i not in sample_indices])
        not_inc_pred= clf.predict(X_train.iloc[not_included_indices]).astype(int)
        oob_accuracies.append(np.mean(not_inc_pred==y_train_sample.iloc[not_included_indices]))


    # average the OOB predictions over all classifiers to get the final OOB prediction

    oob_error = 1- np.mean(oob_accuracies)
    #oob_accuracy = np.mean(oob_predictions == y_train)
    #print('OOB accuracy:', oob_accuracy)
    
    # use the base classifiers to make predictions on the test data
    predictions = np.ones((len(X_test), n_classifiers))
    for i, clf in enumerate(classifiers):
        predictions[:, i] = clf.predict(X_test)

    # take the majority vote to make the final prediction for each test data point
    final_predictions = np.round(np.mean(predictions, axis=1))
    # calculate the accuracy of the bagging algorithm
    y_test = y_test.astype(int)
    accuracy = np.mean(final_predictions == y_test)
    #print('Accuracy:', accuracy)
    #print(oob_accuracies)
    return final_predictions, accuracy, oob_error


In [19]:
def Bagging_KNN_OOB(X_train, X_test, y_train, y_test, n_classifiers, k):
    classifiers = []
    n_samples = len(X_train)
    oob_accuracies=[]
    y_train=y_train.astype(float)
    np.random.seed(0)
    for i in range(n_classifiers):
        
        # randomly sample the training data with replacement to create a new training set
        sample_indices = np.random.choice(len(X_train), len(X_train), replace=True)
        X_train_sample = X_train.iloc[sample_indices]
        y_train_sample = y_train.iloc[sample_indices]

        # fit a KNN classifier on the sample training data
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X_train_sample, y_train_sample)

        # store the fitted classifier in the list
        classifiers.append(clf)
        not_included_indices = np.array([i for i in range(n_samples) if i not in sample_indices])
        not_inc_pred= clf.predict(X_train.iloc[not_included_indices]).astype(int)
        oob_accuracies.append(np.mean(not_inc_pred==y_train_sample.iloc[not_included_indices]))

    # use the base classifiers to make predictions on the test data
    predictions = np.ones((len(X_test), n_classifiers))
    for i, clf in enumerate(classifiers):
        predictions[:, i] = clf.predict(X_test)

    # take the majority vote to make the final prediction for each test data point
    final_predictions = np.round(np.mean(predictions, axis=1))
    # calculate the accuracy of the bagging algorithm
    y_test = y_test.astype(int)
    accuracy = np.mean(final_predictions == y_test)

    # use the samples that are not included in the training of each base classifier to make OOB predictions
    oob_error = 1- np.mean(oob_accuracies)
    #print('OOB accuracy:', oob_accuracy)
   
    #print('Accuracy:', accuracy)
    #print('OOB Accuracy:', oob_accuracy)
    return final_predictions, accuracy, oob_error

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast, test_size=0.2, random_state=0)
breast_pred,acc,oob=Bagging_DT_oob(X_train, X_test, y_train, y_test,200)
print('Accuracy:', acc)
print('oob error:', oob)

Accuracy: 0.956140350877193
oob error: 0.46153887606313115


In [21]:
breast_pred,acc,oob=Bagging_KNN_OOB(X_train, X_test, y_train, y_test,250,k)
print('Accuracy:', acc)
print('oob error:', oob)

Accuracy: 0.9122807017543859
oob error: 0.4584218169909724


# OOB satimage

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_sat, y_sat, test_size=0.2, random_state=0)
breast_pred,acc,oob=Bagging_DT_oob(X_train, X_test, y_train, y_test,20)
print('Accuracy:', acc)
print('oob error :', oob)

Accuracy: 0.307153965785381
oob error : 0.7630252061693837


In [23]:
breast_pred,acc,oob=Bagging_KNN_OOB(X_train, X_test, y_train, y_test,50,k)
print('Accuracy:', acc)
print('oob error:', oob)

Accuracy: 0.8600311041990669
oob error: 0.8123842358062976


# Boosting

In [50]:
def AdaBoost(X_train, X_test, y_train, y_test, T,base):
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    w = np.ones(n_train) / n_train
    classifiers = []
    alphas = []
    
    for t in range(T):
        # Fit a decision stump on the training data weighted by w
        if (base=='DT'):
            clf = DecisionTreeClassifier(max_depth=1)
            clf.fit(X_train, y_train, sample_weight=w)
            classifiers.append(clf)
        else:
            clf = BaggingClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=5)
            clf.fit(X_train, y_train, sample_weight=w)
            classifiers.append(clf)

        # Predict on the training data to calculate weighted error
        y_pred_train = clf.predict(X_train)
        weighted_error = np.sum(w[y_pred_train != y_train])

        # Calculate alpha
        epsilon = 1e-10  # small constant
        alpha = 0.5 * np.log((1 - weighted_error + epsilon) / (weighted_error + epsilon))

        alphas.append(alpha)

        # Update the weights for the next iteration
        w *= np.exp(-alpha * y_train * y_pred_train)
        w /= np.sum(w)

    # Make predictions on the test data
    predictions = np.zeros(n_test)
    for t, clf in enumerate(classifiers):
        y_pred_test = clf.predict(X_test)
        predictions += alphas[t] * y_pred_test

    final_predictions = np.sign(predictions)
    accuracy = np.mean(final_predictions == y_test)
    return final_predictions, accuracy

In [51]:
y_breast=y_breast.astype(int)
y_breast1=y_breast-1
X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast1, test_size=0.2, random_state=0)

iterations=[1,10,20,30,50,70,100]
for i in iterations:
    final,accuracy=AdaBoost(X_train, X_test, y_train, y_test, i,'DT')
    print("Accuracy on WDBC dataset for adaboost for",i,"iterations with decision Tree:", accuracy)

Accuracy on WDBC dataset for adaboost for 1 iterations with decision Tree: 0.9035087719298246
Accuracy on WDBC dataset for adaboost for 10 iterations with decision Tree: 0.9649122807017544
Accuracy on WDBC dataset for adaboost for 20 iterations with decision Tree: 0.9649122807017544
Accuracy on WDBC dataset for adaboost for 30 iterations with decision Tree: 0.9649122807017544
Accuracy on WDBC dataset for adaboost for 50 iterations with decision Tree: 0.9649122807017544
Accuracy on WDBC dataset for adaboost for 70 iterations with decision Tree: 0.9649122807017544
Accuracy on WDBC dataset for adaboost for 100 iterations with decision Tree: 0.9649122807017544


In [52]:
iterations=[1,10,20,30,50,70,100]
for i in iterations:
    final,accuracy=AdaBoost(X_train, X_test, y_train, y_test, i,'RF')
    print("Accuracy on WDBC dataset for adaboost for",i,"iterations with bag of 5 decision trees:", accuracy)

Accuracy on WDBC dataset for adaboost for 1 iterations with bag of 5 decision trees: 0.956140350877193
Accuracy on WDBC dataset for adaboost for 10 iterations with bag of 5 decision trees: 0.9385964912280702
Accuracy on WDBC dataset for adaboost for 20 iterations with bag of 5 decision trees: 0.9385964912280702
Accuracy on WDBC dataset for adaboost for 30 iterations with bag of 5 decision trees: 0.956140350877193
Accuracy on WDBC dataset for adaboost for 50 iterations with bag of 5 decision trees: 0.9649122807017544
Accuracy on WDBC dataset for adaboost for 70 iterations with bag of 5 decision trees: 0.9122807017543859
Accuracy on WDBC dataset for adaboost for 100 iterations with bag of 5 decision trees: 0.9824561403508771


Based on the results obtained, it appears that the AdaBoost algorithm performs well on the WDBC dataset, regardless of whether a single decision tree or a bag of decision trees is used as the base classifier.

For the case of a single decision tree, the accuracy of the AdaBoost algorithm appears to stabilize at around 0.965 after approximately 10 iterations, and remains relatively consistent with further iterations. This suggests that adding more iterations beyond 10 does not necessarily improve the performance of the algorithm significantly.

On the other hand, for the case of a bag of 5 decision trees, the accuracy of the AdaBoost algorithm appears to vary more with the number of iterations. The highest accuracy of 0.982 was achieved with 100 iterations, while the lowest accuracy of 0.912 was achieved with 70 iterations. This suggests that with a bag of decision trees, increasing the number of iterations can have a larger impact on the performance of the algorithm.

Overall, it can be concluded that the AdaBoost algorithm is a powerful technique for classification, and that it can achieve high accuracy on the WDBC dataset with both single decision trees and bags of decision trees as base classifiers. However, the number of iterations used in the algorithm can have a significant impact on its performance, especially when using a bag of decision trees.

# RSM

In [56]:
def RSM(X_train, X_test, y_train, y_test, n_classifiers, subspace_size):
    classifiers = []
    feature_indices = []
    np.random.seed(0)
    
    # Select subspace_size random features for each classifier
    for i in range(n_classifiers):
        indices = np.random.choice(X_train.shape[1], subspace_size, replace=False)
        feature_indices.append(indices)
        X_train_subspace = X_train.iloc[:, indices]
        y_train_subspace = y_train

        # fit a decision tree classifier on the subspace of training data
        clf = DecisionTreeClassifier(max_depth=1)
        clf.fit(X_train_subspace, y_train_subspace)

        # store the fitted classifier in the list
        classifiers.append(clf)

    # use the base classifiers to make predictions on the test data
    predictions = np.ones((len(X_test), n_classifiers))
    for i, clf in enumerate(classifiers):
        indices = feature_indices[i]
        X_test_subspace = X_test.iloc[:, indices]
        predictions[:, i] = clf.predict(X_test_subspace)

    # take the majority vote to make the final prediction for each test data point
    final_predictions = np.round(np.mean(predictions, axis=1))
    
    # calculate the accuracy of the RSM algorithm
    y_test = y_test.astype(int)
    accuracy = np.mean(final_predictions == y_test)
    
    return final_predictions, accuracy

# wdbc dataset

In [57]:
y_breast1=y_breast-1
X_train, X_test, y_train, y_test = train_test_split(X_breast, y_breast1, test_size=0.2, random_state=0)
n_classifiers=50
subspace_size=5
final,accuracy=RSM(X_train,X_test, y_train, y_test, n_classifiers, subspace_size)
print('accuracy of rsm on  the wdbc dataset : ', accuracy)

accuracy of rsm on  the wdbc dataset :  0.9385964912280702


# satimage dataset

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_sat, y_sat, test_size=0.2, random_state=0)
n_classifiers=50
subspace_size=5
final,accuracy=RSM(X_train,X_test, y_train, y_test, n_classifiers, subspace_size)
print('accuracy of rsm on  the wdbc dataset : ', accuracy)

accuracy of rsm on  the wdbc dataset :  0.4059097978227061


# 5CV for both datasets

In [86]:
def RSM_CV(X, y, n_splits, n_classifiers, subspace_size):
    scores = []
    skf = StratifiedKFold(n_splits=n_splits)
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        _, score = RSM(X_train, X_test, y_train, y_test, n_classifiers, subspace_size)
        scores.append(score)
    return np.mean(scores)
n_splits = 5
n_classifiers = 20
subspace_sizes = [5, 10, 15, 20, 25,30]
results = []
for subspace_size in subspace_sizes:
    score = RSM_CV(X_breast, y_breast1, n_splits, n_classifiers, subspace_size)
    results.append(score)
    
# print the results
for i, subspace_size in enumerate(subspace_sizes):
    print(f"Accuracy on wdbc for subspace size {subspace_size}: {results[i]}")

Accuracy on wdbc for subspace size 5: 0.915680794907623
Accuracy on wdbc for subspace size 10: 0.9209750038813848
Accuracy on wdbc for subspace size 15: 0.9086166744294365
Accuracy on wdbc for subspace size 20: 0.9015991305697874
Accuracy on wdbc for subspace size 25: 0.8998447446048751
Accuracy on wdbc for subspace size 30: 0.9015991305697874


In [87]:
n_splits = 5
n_classifiers = 20
subspace_sizes = [5, 10, 15, 20, 25,30]
results = []
for subspace_size in subspace_sizes:
    score = RSM_CV(X_sat, y_sat, n_splits, n_classifiers, subspace_size)
    results.append(score)
    
# print the results
for i, subspace_size in enumerate(subspace_sizes):
    print(f"Accuracy on satimage for subspace size {subspace_size}: {results[i]}")

Accuracy on satimage for subspace size 5: 0.37558320373250387
Accuracy on satimage for subspace size 10: 0.4040435458786936
Accuracy on satimage for subspace size 15: 0.41772939346811827
Accuracy on satimage for subspace size 20: 0.4209953343701399
Accuracy on satimage for subspace size 25: 0.41555209953343697
Accuracy on satimage for subspace size 30: 0.4270606531881803


# ECOC FOR satimage

In [193]:
import itertools

def ecoc(num_classes, X_train, y_train, X_test):
    # Create binary classifiers for all possible pairs of classes
    binary_classifiers = []
    for pair in itertools.combinations(range(num_classes), 2):
        # Filter examples that belong to the current pair of classes
        X_pair, y_pair = filtering(X_train, y_train, pair)
        # Check if there are any examples in this pair
        if X_pair.shape[0] > 0:
            # Train a binary classifier on the current pair
            clf = DecisionTreeClassifier(max_depth=1)
            clf.fit(X_pair, y_pair)
            binary_classifiers.append((pair, clf))

    # Make predictions using the binary classifiers
    binary_predictions = []
    for pair, clf in binary_classifiers:
        y_pred = clf.predict(X_test)
        binary_predictions.append((pair, y_pred))

    # Combine the binary predictions to get multi-class predictions
    multi_class_predictions = []
    for i in range(len(X_test)):
        # Count the number of votes for each class
        votes = [0] * num_classes
        for pair, y_pred in binary_predictions:
            if y_pred[i] == 0:
                # Add a vote for the first class in the current pair
                votes[pair[0]] += 1
            else:
                # Add a vote for the second class in the current pair
                votes[pair[1]] += 1
        # Choose the class with the most votes
        multi_class_predictions.append(votes.index(max(votes)))

    return multi_class_predictions

def filtering(X, y, pair):
    # Filter examples that belong to the current pair of classes
    mask = (y == pair[0]) | (y == pair[1])
    X_filtered = X[mask]
    y_filtered = y[mask]
    # Convert class labels to binary labels
    y_filtered[y_filtered == pair[0]] = 0
    y_filtered[y_filtered == pair[1]] = 1
    return X_filtered, y_filtered


In [194]:

X_train, X_test, y_train, y_test = train_test_split(X_sat, y_sat, test_size=0.2, random_state=42)
y_pred = ecoc(num_classes=10, X_train=X_train, y_train=y_train, X_test=X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8133748055987559
