## Test Dataset 15
#### Breast Cancer

In [1]:
#imports
import openml
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.ensemble import AdaBoostClassifier
from AdaBoostWorkyWorky import AdaBoost

import scipy.stats as ss
import matplotlib.pyplot as plt

In [2]:
# load dataset
# Fetch OpenML task 15, then the dataset that task refers to.
# Note: `ds` holds the task object returned by `get_task`, not the dataset itself.
ds = openml.tasks.get_task(15)
dataset = openml.datasets.get_dataset(ds.dataset_id)
# Pull the features `X` and the default target attribute `y`; the call also
# returns `categorical_indicator` and the feature names (`attribute_names`).
# NOTE(review): newer openml-python releases appear to deprecate
# dataset_format="array" in favour of "dataframe" — confirm against the
# installed version before upgrading.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="array", target=dataset.default_target_attribute
)

In [3]:
# build a labelled DataFrame, drop incomplete rows, and preview it
df = pd.DataFrame(X, columns=attribute_names)

# Encode the labels as -1 / +1 (the raw labels are assumed to be 0 / 1, as the
# -1 / 1 targets in the preview indicate). Thresholding gives the same result
# as `2*y - 1` on 0/1 input but keeps this cell idempotent: `y` is rebound
# below, so re-running the cell with `2*y - 1` would turn the already
# converted -1/+1 labels into -3/+1.
df['target'] = np.where(np.asarray(y) > 0, 1, -1)

# Drop every row that contains at least one missing value.
df = df.dropna(how='any', axis=0)

# Rebind X / y to the cleaned features and the -1/+1 labels used downstream.
# (`columns=` alone is sufficient; no `axis` argument is needed with it.)
X = df.drop(columns=['target'])
y = df.target

df

Unnamed: 0,Clump_Thickness,Cell_Size_Uniformity,Cell_Shape_Uniformity,Marginal_Adhesion,Single_Epi_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,target
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,-1
1,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,-1
2,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,-1
3,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,-1
4,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,-1
...,...,...,...,...,...,...,...,...,...,...
694,3.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,-1
695,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,-1
696,5.0,10.0,10.0,3.0,7.0,3.0,8.0,10.0,2.0,1
697,4.0,8.0,6.0,4.0,3.0,4.0,10.0,6.0,1.0,1


In [4]:
# holdout estimation function

def holdout_estimation(model, alpha, n_classifiers, x, y, test_size_value=0.3, seed=1111):
    """Score `model` with a single hold-out split.

    The data is split once with `train_test_split`, the model is fitted on
    the training part, and its predictions on the held-out part are scored
    with `accuracy_score`.

    Parameters
    ----------
    model : object exposing `fit(x, y, alpha, n_classifiers)` and `predict(x)`.
    alpha : forwarded unchanged to `model.fit`.
    n_classifiers : forwarded unchanged to `model.fit`.
    x, y : features and labels to split.
    test_size_value : passed as `test_size` to `train_test_split`.
    seed : passed as `random_state` to `train_test_split`.

    Returns
    -------
    The result of `accuracy_score(held-out labels, predictions)`.
    """
    split = train_test_split(x, y, test_size=test_size_value, random_state=seed)
    train_features, test_features, train_labels, test_labels = split

    model.fit(train_features, train_labels, alpha, n_classifiers)
    return accuracy_score(test_labels, model.predict(test_features))

In [5]:
# For each alpha calculation type (0, 1, 2), try AdaBoost ensemble sizes
# 50, 60, ..., 150 and keep the size with the highest hold-out accuracy.

best_n_interators = {}
for alpha in range(3):
    print("Calculating the best n for alpha =", alpha)
    # (0, 0) is the starting value; it survives only if no run scores above 0.
    best_n, best_accuracy = 0, 0
    for n in range(50, 151, 10):
        ab = AdaBoost()  # new model instance for every candidate size
        accuracy = holdout_estimation(ab, alpha, n, X, y)
        # Strict comparison: on ties the earlier (smaller) n is kept.
        if accuracy > best_accuracy:
            best_n, best_accuracy = n, accuracy
    best_n_interators[alpha] = (best_n, best_accuracy)

print(best_n_interators)

# Result recorded from a previous run:
# {0: (50, 0.9512195121951219), 1: (70, 0.9609756097560975), 2: (50, 0.9560975609756097)}

Calculating the best n for alpha = 0
Calculating the best n for alpha = 1
Calculating the best n for alpha = 2
{0: (50, 0.9512195121951219), 1: (70, 0.9609756097560975), 2: (50, 0.9560975609756097)}
