## Random Patches definition

Complete the code as required in the assignment

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.tree import DecisionTreeClassifier
import pandas as pd 
import random


class RandomPatches(ClassifierMixin, BaseEstimator):
    """Random Patches ensemble of decision trees.

    Every base learner is a ``DecisionTreeClassifier`` trained on a "patch" of
    the training data: a bootstrap sample of the rows (drawn with replacement,
    same size as the training set) restricted to ``max_features`` columns drawn
    without replacement. The rows a learner never saw (its out-of-bag set) are
    used to estimate that learner's accuracy.

    Parameters
    ----------
    n_estimators : int, default=10
        Number of base learners in the ensemble.
    max_features : int, default=4
        Number of feature columns given to each base learner.
    custom_voting : str, default="majority"
        How the base learners' predictions are combined:
        ``"weighted_majority"`` weights each vote by the learner's OOB
        accuracy, ``"probabilistic"`` copies the prediction of one learner
        picked at random with probability proportional to its OOB accuracy,
        and any other value falls back to plain majority voting.
    random_state : int or None, default=42
        Seed for the patch sampling, the base trees and the probabilistic vote.

    Attributes
    ----------
    learners : list of DecisionTreeClassifier
        Fitted base learners (empty until ``fit`` is called).
    subspaces : list of ndarray
        Feature indices used by each base learner.
    oob_scores : list of float
        OOB accuracy of each base learner, as a percentage (0-100).
    oob_sets_size : list of int
        Number of OOB rows of each base learner.
    """

    # The mixin is listed before BaseEstimator, as scikit-learn's estimator
    # guidelines require, so the mixin's attributes win in the MRO.

    def __init__(self, n_estimators=10, max_features=4, custom_voting="majority", random_state=42):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.custom_voting = custom_voting  # "weighted_majority" / "probabilistic" / default: "majority"
        self.random_state = random_state
        # Fitted state. Initialised here so the attributes exist before fit();
        # fit() replaces them wholesale on every call.
        self.learners = []
        self.subspaces = []
        self.oob_scores = []
        self.oob_sets_size = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own random patch of ``(X, y)``.

        Any state left by a previous call to ``fit`` is discarded.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
        y : array-like or Series of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``max_features`` is not between 1 and the number of columns of ``X``.
        """
        X = X.values if isinstance(X, pd.DataFrame) else X
        y = y.values if isinstance(y, pd.Series) else y

        # Ensure numpy arrays for consistency
        X = np.asarray(X)
        y = np.asarray(y)

        # total number of instances and total number of features
        n_samples, n_features = X.shape

        if not 0 < self.max_features <= n_features:
            raise ValueError(
                f"max_features must be between 1 and the number of features "
                f"({n_features}); got {self.max_features}."
            )

        # A private generator produces the same draws as seeding the global
        # numpy RNG with the same seed, without clobbering global random state.
        rng = np.random.RandomState(self.random_state)

        # Build into locals and publish at the end: refitting must not
        # accumulate learners, and a failed fit must not leave partial state.
        learners, subspaces, oob_scores, oob_sets_size = [], [], [], []

        for _ in range(self.n_estimators):
            # Sample the patch: bootstrap rows, feature subset without repeats.
            instance_indices = rng.choice(n_samples, size=n_samples, replace=True)
            feature_indices = rng.choice(n_features, size=self.max_features, replace=False)

            # Train the learner on the patch.
            learner = DecisionTreeClassifier(random_state=self.random_state)
            learner.fit(X[np.ix_(instance_indices, feature_indices)], y[instance_indices])

            # Out-of-bag rows are those the bootstrap never selected.
            in_bag = np.zeros(n_samples, dtype=bool)
            in_bag[instance_indices] = True
            oob_indices = np.flatnonzero(~in_bag)

            oob_size = len(oob_indices)
            oob_score = 0.0  # stays 0 when the bootstrap happened to cover every row
            if oob_size > 0:
                preds_oob = learner.predict(X[np.ix_(oob_indices, feature_indices)])
                oob_score = np.mean(preds_oob == y[oob_indices]) * 100  # percentage

            learners.append(learner)
            subspaces.append(feature_indices)  # the features in that subspace
            oob_scores.append(oob_score)  # the accuracy on the oob set
            oob_sets_size.append(oob_size)  # number of elements in the oob

        self.learners = learners
        self.subspaces = subspaces
        self.oob_scores = oob_scores
        self.oob_sets_size = oob_sets_size
        return self

    def predict(self, X):
        """Predict a label for every row of ``X`` using the configured voting scheme.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
            Must have the same column layout as the data passed to ``fit``.

        Returns
        -------
        list
            One predicted label per row of ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        if not self.learners:
            # Local import: only needed on this error path.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This RandomPatches instance is not fitted yet; call 'fit' before 'predict'."
            )

        X = X.values if isinstance(X, pd.DataFrame) else X
        X = np.asarray(X)

        predictions = np.array([
            learner.predict(X[:, subspace])
            for learner, subspace in zip(self.learners, self.subspaces)
        ])
        pred_T = predictions.T  # one row per sample, one column per learner

        # Weighted majority: each learner's vote counts as much as its OOB accuracy.
        if self.custom_voting == "weighted_majority":
            return self._tally_votes(pred_T, self.oob_scores)

        # Probabilistic: for each sample, copy the prediction of one learner
        # drawn with probability proportional to its OOB accuracy.
        if self.custom_voting == "probabilistic":
            total = sum(self.oob_scores)
            if total > 0:
                probabilities = [score / total for score in self.oob_scores]
            else:
                # Every OOB score is 0: no information to prefer any learner,
                # so draw uniformly instead of dividing by zero.
                probabilities = [1.0 / len(self.learners)] * len(self.learners)
            rng = np.random.RandomState(self.random_state)
            return [rng.choice(sample_preds, p=probabilities) for sample_preds in pred_T]

        # Majority voting (default): every learner's vote has weight 1.
        return self._tally_votes(pred_T, [1.0] * len(self.learners))

    @staticmethod
    def _tally_votes(pred_T, weights):
        """Return, per sample, the label with the largest total vote weight.

        ``pred_T`` has one row per sample and one column per learner;
        ``weights[i]`` is the weight of learner ``i``'s vote. Ties go to the
        smallest label, and only labels that at least one learner predicted
        for a sample can win for that sample.
        """
        n_samples, n_learners = pred_T.shape
        if n_samples == 0:
            return []

        # labels is sorted; codes maps every prediction to its index in labels.
        labels, codes = np.unique(pred_T, return_inverse=True)
        codes = codes.reshape(pred_T.shape)  # inverse shape differs across numpy versions

        rows = np.arange(n_samples)
        totals = np.zeros((n_samples, labels.size))
        voted = np.zeros((n_samples, labels.size), dtype=bool)
        # Accumulate learner by learner so the floating-point summation order
        # is the same as summing the learners' weights in ensemble order.
        for learner_idx in range(n_learners):
            cols = codes[:, learner_idx]
            totals[rows, cols] += weights[learner_idx]
            voted[rows, cols] = True

        # A label nobody predicted for a sample must never win, even when all
        # the predicted labels carry zero weight.
        totals[~voted] = -np.inf
        # argmax returns the first maximum, i.e. the smallest label on ties.
        return list(labels[totals.argmax(axis=1)])

## Data reading and evaluation

There is no need to update the following cell; it just declares functions to read the dataset, evaluate classifiers, and run the experiments.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def load_dataset(dataset_path='electricity2.csv'):
    """Read a CSV file and split it into features and class label.

    The class label must be the last column of the file; every other column
    is treated as a feature.

    Returns a ``(X, y)`` tuple: ``X`` is a DataFrame with all columns but the
    last, ``y`` is the last column as a Series.
    """
    frame = pd.read_csv(dataset_path)
    label_position = frame.shape[1] - 1  # index of the last column
    features = frame.iloc[:, :label_position]
    target = frame.iloc[:, label_position]
    return (features, target)

def evaluate_classifier(classifier, X, y, test_size=0.3):
    """Fit ``classifier`` on a train split and score it on the held-out split.

    The data is split with ``train_test_split`` using a fixed
    ``random_state=42``; ``test_size`` is forwarded to it. The classifier is
    fitted in place on the training part.

    Returns ``(accuracy, X_train, X_test, y_train, y_test)``, where
    ``accuracy`` is ``accuracy_score`` on the test part.
    """
    split = train_test_split(X, y, test_size=test_size, random_state=42)
    X_train, X_test, y_train, y_test = split

    classifier.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, classifier.predict(X_test))

    return test_accuracy, X_train, X_test, y_train, y_test


def run_experiments(classifiers, show_oob=True):
    """Evaluate several classifiers on the Electricity dataset.

    ``classifiers`` maps a display name to an estimator. Each estimator is
    passed to ``evaluate_classifier`` and its name is printed as it runs.
    When ``show_oob`` is true, the subspace, OOB set size and OOB accuracy of
    every base learner are also printed for ``RandomPatches`` instances.

    Returns a DataFrame with one row per (dataset, classifier) and the columns
    ``Dataset``, ``Classifier`` and ``Accuracy``.
    """
    features, labels = load_dataset(dataset_path='electricity2.csv')
    datasets = {'Electricity': (features, labels)}

    rows = []
    for dataset_name, (X, y) in datasets.items():
        for clf_name, clf in classifiers.items():
            print(f"running {clf_name}")
            accuracy = evaluate_classifier(clf, X, y)[0]  # only the score is needed here
            rows.append({
                'Dataset': dataset_name,
                'Classifier': clf_name,
                'Accuracy': accuracy
            })

            if not (show_oob and isinstance(clf, RandomPatches)):
                continue

            per_learner = zip(clf.subspaces, clf.oob_sets_size, clf.oob_scores)
            for number, (subspace, oob_set_size, oob_accuracy) in enumerate(per_learner, start=1):
                print(f"Base Learner {number} Subspace (features): {subspace} OOB Instances: {oob_set_size} OOB Accuracy: {oob_accuracy:.4f}")

    return pd.DataFrame(rows)

## Experiments

Modify this part of the code to add more experiments as needed

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Baseline models plus the custom RandomPatches ensemble, evaluated in this order.
classifiers = dict([
    ('DecisionTree', DecisionTreeClassifier()),
    ('Bagging', BaggingClassifier(n_estimators=20)),
    ('RandomForest', RandomForestClassifier()),
    ('AdaBoost', AdaBoostClassifier(algorithm="SAMME")),
    ('XGBoost', XGBClassifier(eval_metric='logloss')),
    ('RandomPatches(4,10)', RandomPatches(max_features=4, n_estimators=10)),
])

# Run every classifier and show the accuracy table.
results_df = run_experiments(classifiers, show_oob=True)
display(results_df)

### Impact of Ensemble Size and Feature Subset Size

In [None]:
# Sweep 1: ensemble size. max_features is not passed, so its default (4) applies.
ensemble_sizes_list = [5, 10, 50]
ensemble_sizes = dict(
    (f'RandomPatches(4,{n_trees})', RandomPatches(n_estimators=n_trees))
    for n_trees in ensemble_sizes_list
)

# Sweep 2: feature subset size. n_estimators is not passed, so its default (10) applies.
feature_subset_sizes_list = [2, 4, 6, 8, 11]
feature_subset_sizes = dict(
    (f'RandomPatches({n_columns},10)', RandomPatches(max_features=n_columns))
    for n_columns in feature_subset_sizes_list
)

results_ensemble = run_experiments(ensemble_sizes, show_oob=True)
results_features = run_experiments(feature_subset_sizes, show_oob=True)

# One line plot per sweep, side by side.
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
ensemble_accuracies = results_ensemble['Accuracy'].values
feature_accuracies = results_features['Accuracy'].values

ax[0].plot(ensemble_sizes_list, ensemble_accuracies, marker='o', linewidth=2, markersize=8)
ax[0].set(
    xlabel='Number of Estimators',
    ylabel='Accuracy',
    title='Impact of Ensemble Size on RandomPatches Performance\n(max_features=4)',
    xticks=ensemble_sizes_list,
)
ax[0].grid(True, alpha=0.3)

ax[1].plot(feature_subset_sizes_list, feature_accuracies, marker='s', linewidth=2, markersize=8, color='orange')
ax[1].set(
    xlabel='Number of Features',
    ylabel='Accuracy',
    title='Impact of Feature Subset Size on RandomPatches Performance\n(n_estimators=10)',
    xticks=feature_subset_sizes_list,
)
ax[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Result tables for both sweeps.
print("\nEnsemble Size Experiment Results")
display(results_ensemble)
print("\nFeature Subset Size Experiment Results")
display(results_features)


### Custom Voting Scheme (Weighted Majority) vs. Majority Voting Scheme
(+ Comparison of OOB Accuracies vs. Ensemble Accuracy)

In [None]:
# Plain majority voting versus OOB-weighted majority voting; both ensembles
# use the default max_features (4) and n_estimators (10).
voting_schemes_list = ['majority', 'weighted_majority']
voting_schemes = dict(
    (f'RandomPatches(4,10,{scheme})', RandomPatches(custom_voting=scheme))
    for scheme in voting_schemes_list
)

results_voting = run_experiments(voting_schemes, show_oob=True)
display(results_voting)

### Other Improvements: Probabilistic Voting Scheme

In [84]:
# All three voting schemes side by side; the ensembles use the default
# max_features (4) and n_estimators (10).
voting_schemes_list = ['majority', 'weighted_majority', 'probabilistic']
voting_schemes = dict(
    (f'RandomPatches(4,10,{scheme})', RandomPatches(custom_voting=scheme))
    for scheme in voting_schemes_list
)

results_voting = run_experiments(voting_schemes, show_oob=True)
display(results_voting)

running RandomPatches(4,10,majority)
Base Learner 1 Subspace (features): [10  8  4  2] OOB Instances: 11684 OOB Accuracy: 73.6392
Base Learner 2 Subspace (features): [6 9 2 5] OOB Instances: 11685 OOB Accuracy: 70.0471
Base Learner 3 Subspace (features): [ 9  7 10  5] OOB Instances: 11645 OOB Accuracy: 52.7351
Base Learner 4 Subspace (features): [8 1 9 3] OOB Instances: 11626 OOB Accuracy: 53.9136
Base Learner 5 Subspace (features): [ 2 10  9  6] OOB Instances: 11698 OOB Accuracy: 70.7813
Base Learner 6 Subspace (features): [9 8 7 0] OOB Instances: 11643 OOB Accuracy: 58.6361
Base Learner 7 Subspace (features): [9 5 3 7] OOB Instances: 11664 OOB Accuracy: 51.4832
Base Learner 8 Subspace (features): [5 7 1 6] OOB Instances: 11652 OOB Accuracy: 58.9513
Base Learner 9 Subspace (features): [6 2 8 7] OOB Instances: 11692 OOB Accuracy: 71.2196
Base Learner 10 Subspace (features): [ 0  2 10  5] OOB Instances: 11586 OOB Accuracy: 72.6049
running RandomPatches(4,10,weighted_majority)
Base Learn

Unnamed: 0,Dataset,Classifier,Accuracy
0,Electricity,"RandomPatches(4,10,majority)",0.745476
1,Electricity,"RandomPatches(4,10,weighted_majority)",0.764014
2,Electricity,"RandomPatches(4,10,probabilistic)",0.643225
