In [None]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# Three heterogeneous base learners, all left at their default settings.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

# Hard voting: the ensemble predicts the class that receives the most votes
# from the individual classifiers.
voting_clf = VotingClassifier(
    estimators=[
        ("lr", log_clf),
        ("rf", rnd_clf),
        ("svc", svm_clf),
    ],
    voting="hard",
)

# Bagging
1. Bagging is short for **b**ootstrap **agg**regat**ing** (bootstrapped aggregation).

## Bootstrapping

1. First, we create random samples of the training dataset with replacement (we *aren't creating new samples*; rather, we are bucketing samples from the main training dataset into smaller subsets).
    
    1. the likelihood of the models in the ensemble being exposed to truly distinct sets depends on sampling with replacement or without replacement
    2. Sampling with replacement ([`bootstrap=True`](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)) is typically used when the dataset is small enough that not every base estimator could otherwise get at least a K-sized subset.
    3. without replacement shall be typically used when the dataset size is large enough s.t. all base estimators will get at least a K-sized subset.
    4. when **subsets** are **created without replacement**, the process is called **subsetting (also known as *pasting*) instead of bootstrapping**.

2. Then, we build a model (classifier or Decision tree) for each sample. 

## Aggregation

1. **Aggregation**: Finally, results of these multiple models are combined using average or majority voting.

## How Bagging = Bootstrapping + Aggregation?

1. As each model is exposed to a **different subset of data**
    1. this results in diverse models being created.
    2. diversity results from them being trained on slightly different subsets of the original dataset. 
    3. If you were to train all the base models on the exact same dataset, they’ll make the same mistakes, and their errors will be perfectly correlated. 
    4. Bagging works because each model is forced to overfit to different pieces of the data, and therefore *makes different mistakes*.
2. Diversity is what sets bagging apart from an individual classical ML model
    1. Bagging prefers usage of low bias high variance models as its base estimators, like decision trees.
    2. if say a single DT was used for modelling the problem, it will fundamentally overfit.
    3. In bagging rather, the aggregation step is what leads to an overall lower variance
        1. since each base DT is trained with a different subset of samples, each will be a low bias high variance model.
        2. however, for a given test sample, the noise picked by each of the base DTs will be uncorrelated as each of these DTs has *theoretically learnt different patterns*.
    1. Thus, Bagging helps us to reduce the variance error.
3. Combining multiple models decreases variance, especially in the case of unstable models, and may produce a more reliable prediction than a single model.


# Bagging as seen in sklearn

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Ensemble of 500 decision trees, each trained on 100 randomly drawn
# training instances.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=100,
    # True  -> instances are drawn with replacement (bagging)
    # False -> instances are drawn without replacement (pasting)
    bootstrap=True,
    random_state=42,
    n_jobs=2,
)

`BaggingClassifier` automatically performs **soft voting** if the classifier can calculate the probabilities for its predictions(`predict_proba()` method)

Bagging often results in better models than Pasting, and it additionally allows out-of-bag evaluation:

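* When performing Bagging on a training set (with bootstrap samples as large as the training set), each base estimator sees only about 63% of the distinct training instances on average; the remaining ~37% are *out-of-bag* (oob) instances that this particular estimator has never seen (it is a different 37% for each estimator).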
    * These can be used for evaluation just like Cross-Validation.
    
    * To use this functionality, simply pass `oob_score=True` to the `BaggingClassifier`; after fitting, the resulting estimate is available in its `oob_score_` attribute.

# Check if Bagging actually works in practice
- Check whether the claim behind bootstrapping, namely that the underlying base estimators produce uncorrelated errors, holds in practice:
- The correlation matrix will give you the correlation between the residuals of each pair of base models. 
- If bagging is working as expected, these values should be low, ideally close to 0.
- The MSE of each base model will show whether they're performing similarly or whether their individual biases vary.
    - If the errors are highly correlated, their MSEs should be roughly the same; conversely, widely differing MSEs indicate that the models are making different mistakes.

In [None]:
# Diagnose whether the base estimators of a fitted bagging ensemble make
# (un)correlated errors on the test set.
#
# Expects `bagging_model` (a fitted bagging ensemble exposing `estimators_`),
# `X_test` and `y_test` to be defined earlier in the notebook.
# NOTE(review): calling `model.predict(X_test)` on each base estimator assumes
# every estimator was trained on all features; with feature sub-sampling the
# columns listed in `bagging_model.estimators_features_` would have to be
# selected first -- confirm against how `bagging_model` was built.

# Imported here so the cell runs on its own; harmless if already imported.
import numpy as np
from sklearn.metrics import mean_squared_error

# Get predictions from each base model -> shape (n_estimators, n_samples)
base_model_preds = np.array(
    [model.predict(X_test) for model in bagging_model.estimators_]
)

# Flatten the targets to shape (n_samples,) so they broadcast across every
# estimator's row. (A column vector of shape (n_samples, 1) would be lined up
# against the estimator axis instead and either fail or compare the wrong
# elements.) `np.asarray` also accepts a pandas Series.
y_true = np.asarray(y_test).ravel()

###################### Regression task ######################
# Compute residuals (errors): one row of residuals per base model
residuals = base_model_preds - y_true

# Calculate the correlation matrix of the residuals (rows are the variables,
# so entry [i, j] is the correlation between base models i and j)
correlation_matrix = np.corrcoef(residuals)

# Print the correlation matrix (it's the key here)
print("Correlation matrix of residuals between base models:\n", correlation_matrix)

# Optionally, you can also calculate the mean squared error for each base model to see performance differences
errors = np.array([mean_squared_error(y_true, preds) for preds in base_model_preds])
print("Mean Squared Errors for each base model: ", errors)

###################### Classification task ######################
# Compute misclassification residuals (1 if wrong, 0 if correct)
# NOTE(review): base estimators inside a BaggingClassifier may predict
# label-encoded classes (indices into `classes_`); if the labels are not
# already 0..K-1, map the predictions through `bagging_model.classes_`
# before comparing -- confirm.
misclassifications = (base_model_preds != y_true).astype(int)

# Calculate the correlation matrix of the misclassifications
correlation_matrix = np.corrcoef(misclassifications)

#################  AUTOMATED EVALUATION OF CORRELATION MATRIX ############################
#################  WHEN NO. OF ESTIMATORS IS TOO HIGH ####################################
# Find the pairwise correlations that are too high (above some threshold, e.g., 0.9).
# Only the strict upper triangle is inspected so each pair (i < j) is reported
# once and the trivial diagonal (correlation of a model with itself) is skipped.
# A base model with constant errors yields NaN correlations; NaN never exceeds
# the threshold, so such pairs are not reported.
high_correlation_mask = np.triu(
    np.atleast_2d(np.abs(correlation_matrix) > 0.9), k=1
)
high_correlation_pairs = [
    (int(i), int(j)) for i, j in np.argwhere(high_correlation_mask)
]

# Output the problematic pairs of base models
if high_correlation_pairs:
    print("Found high correlation between the following base model pairs (indices):")
    for pair in high_correlation_pairs:
        print(pair)
else:
    print("No significant correlation detected between the errors of the base models.")

# Random patches and random subspaces

1. All ensemble techniques up until now sampled only the training instances, but kept all the features(`bootstrap_features = False`).

2. Random Patches samples both training instances and features (out of d features, k are chosen at random, just as the training instances were chosen at random).

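3. Random Subspaces keeps all the instances (`bootstrap=False` and `max_samples=1.0`) but samples features (`bootstrap_features=True` and/or `max_features` smaller than 1.0).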

In [None]:
# Random Patches: sample BOTH the training instances and the features.
patchedBagClf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=0.6,  # each tree is trained on 60% of the training instances
    bootstrap=True,  # instances drawn with replacement; set to False for pasting
    n_jobs=2,
    random_state=42,
    bootstrap_features=True,  # features are drawn with replacement
)

# Random Subspaces: keep ALL training instances and sample only the features.
# Instance sampling is therefore switched off (every tree sees the full
# training set); diversity comes solely from the feature subsets.
subspaceBagClf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=1.0,  # use every training instance ...
    bootstrap=False,  # ... exactly once (no resampling of instances)
    n_jobs=2,
    random_state=42,
    bootstrap_features=True,  # features are drawn with replacement
    max_features=0.6,  # each tree draws 60% of the feature count
)