## Fetch Data


In [51]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# fetch dataset 
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_original.data.features 
y = breast_cancer_wisconsin_original.data.targets 

## Drop Missing Value

In [52]:
# drop missing values
X = X.replace("?", pd.NA).dropna()

# Now, drop corresponding rows in y
y = y.loc[X.index]

# Reorder the index ( this might affect the data accuracy if the original index order is meant to be fixed)
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [56]:
# Recursive function to find the best three features combination
def find_best_feature_combination(X, y, X_val, y_val, best_accuracy=0, best_feature_combination=None, final_model=None):
    # Initialize variables if not provided
    if best_feature_combination is None:
        best_feature_combination = []
    
    # Base case: If we have selected three features, return the best combination and accuracy
    if len(best_feature_combination) >= 3:
        return best_feature_combination, best_accuracy, final_model
    best_accuracy = 0
    
    # Try adding one random feature at each recursion
    remaining_features = list(set(X.columns) - set(best_feature_combination))  
  
    best_feature = [];
    # Iteratively find the best feature and add it to the best_feature_combination
    for next_feature in remaining_features:
        
        # Select the current feature along with previously selected features
        current_features = best_feature_combination + [next_feature]
        
        # Select features from the dataset
        X_selected = X[current_features]
        X_selected_val = X_val[current_features]
      
        # Train the Naïve Bayes classifier
        nb_classifier = GaussianNB()
        nb_classifier.fit(X_selected, y)
        
        # Predict the labels for the testing set
        y_pred = nb_classifier.predict(X_selected_val)
        
        # Calculate accuracy for the current feature combination
        accuracy = accuracy_score(y_val, y_pred)
        
        # Update the best accuracy and feature combination if the current accuracy is higher
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_feature = next_feature
            final_model = nb_classifier

    best_feature_combination += [best_feature]
    
    # print("best feature collection:", best_feature_combination)
        
    # Recur with the next feature added
    best_feature_combination, best_accuracy, final_model = find_best_feature_combination(X, y, X_val, y_val, best_accuracy, best_feature_combination, final_model)
    
    return best_feature_combination, best_accuracy, final_model

## Split the dataset to train, validation, and test

In [57]:
X_train, X_, y_train, y_ = train_test_split(X, np.ravel(y), test_size=0.4, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_, y_, test_size=0.5, stratify=y_)
del X_, y_

## Perform the Wrapper Method
Run the `find_best_feature_combination` ten times, and find the average validation accuracy

In [60]:
accuracies = []

for _ in range(10):
    # Call the recursive function to find the best feature combination
    best_feature_combination, best_accuracy, final_model = find_best_feature_combination(X_train, y_train, X_val, y_val)
    
    y_pred = final_model.predict(X_test[best_feature_combination])
    test_accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(test_accuracy)
    
    print("Best feature combination:", best_feature_combination)
    print("Val accuracy:", best_accuracy)
    print("Test accuracy:", test_accuracy)
    print("---------------------------")

average_accuracy = np.mean(accuracies)
print("Average accuracy:", average_accuracy)

Best feature combination: ['Uniformity_of_cell_size', 'Bland_chromatin', 'Clump_thickness']
Val accuracy: 0.9781021897810219
Test accuracy: 0.9416058394160584
---------------------------
Best feature combination: ['Uniformity_of_cell_size', 'Bland_chromatin', 'Clump_thickness']
Val accuracy: 0.9781021897810219
Test accuracy: 0.9416058394160584
---------------------------
Best feature combination: ['Uniformity_of_cell_size', 'Bland_chromatin', 'Clump_thickness']
Val accuracy: 0.9781021897810219
Test accuracy: 0.9416058394160584
---------------------------
Best feature combination: ['Uniformity_of_cell_size', 'Bland_chromatin', 'Clump_thickness']
Val accuracy: 0.9781021897810219
Test accuracy: 0.9416058394160584
---------------------------
Best feature combination: ['Uniformity_of_cell_size', 'Bland_chromatin', 'Clump_thickness']
Val accuracy: 0.9781021897810219
Test accuracy: 0.9416058394160584
---------------------------
Best feature combination: ['Uniformity_of_cell_size', 'Bland_chro

### Compare the model of Wrapper Method to the model of Full-Features

In [59]:
# Train Naïve Bayes classifier with full set of features

nb_classifier_full = GaussianNB()
nb_classifier_full.fit(X_train, y_train)

y_pred = nb_classifier_full.predict(X_test)
full_feature_accuracy = accuracy_score(y_test, y_pred)

print("Accuracy with three features:", average_accuracy)
print("Accuracy with full set of features:", full_feature_accuracy)

Accuracy with three features: 0.9416058394160582
Accuracy with full set of features: 0.9708029197080292
