In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load datasets
transformedAbalone = pd.read_csv('transformed_ABALONE.csv')
untransformedAbalone = pd.read_csv('untransformed_ABALONE.csv')
transformedAdult = pd.read_csv('transformed_ADULT.csv')
untransformedAdult = pd.read_csv('untransformed_ADULT.csv')
transformedBankMarketing = pd.read_csv('transformed_BANKMARKETING.csv')
untransformedBankMarketing = pd.read_csv('untransformed_BANKMARKETING.csv')

In [3]:
transformed_datasets = {
    'transformedAbalone': transformedAbalone,
    'transformedAdult': transformedAdult,
    'transformedBankMarketing': transformedBankMarketing
}

untransformed_datasets = {
    'untransformedAbalone': untransformedAbalone,
    'untransformedAdult': untransformedAdult,
    'untransformedBankMarketing': untransformedBankMarketing
}

In [None]:
# SVM

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [9]:
random_seeds = [42, 100, 2024]

In [10]:
SVM_results = {}

In [11]:
# hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

In [12]:
for dataset_name, dataset in transformed_datasets.items():
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    dataset_results = {}

    for training_size in (0.2, 0.5, 0.8):
        partition_results = {'test_accuracies': [], 'best_params': []}

        for seed in random_seeds:
            Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=training_size, stratify=y, random_state=seed)

            cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
            grid_search = GridSearchCV(SVC(), param_grid, cv=cv, scoring='accuracy', n_jobs=-1)

            grid_search.fit(Xtrain, ytrain)
            best_params = grid_search.best_params_

            best_SVM = SVC(**best_params)
            best_SVM.fit(Xtrain, ytrain)
            y_pred = best_SVM.predict(Xtest)
            test_accuracy = accuracy_score(ytest, y_pred)

            partition_results['test_accuracies'].append(test_accuracy)
            partition_results['best_params'].append(best_params)

            print(f"{dataset_name}, training size {training_size}, test accuracy {test_accuracy}")
        
        avg_accuracy = np.mean(partition_results['test_accuracies'])
        partition_results['avg_accuracy'] = avg_accuracy 

        print(partition_results)
        dataset_results[training_size] = partition_results

    SVM_results[dataset_name] = dataset_results

transformedAbalone, training size 0.2, test accuracy 0.7860562537402753
transformedAbalone, training size 0.2, test accuracy 0.7905445840813884
transformedAbalone, training size 0.2, test accuracy 0.7845601436265709
{'test_accuracies': [0.7860562537402753, 0.7905445840813884, 0.7845601436265709], 'best_params': [{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}, {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}, {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}], 'avg_accuracy': np.float64(0.7870536604827448)}
transformedAbalone, training size 0.5, test accuracy 0.7951172809956917
transformedAbalone, training size 0.5, test accuracy 0.7932024892292963
transformedAbalone, training size 0.5, test accuracy 0.803733843944471
{'test_accuracies': [0.7951172809956917, 0.7932024892292963, 0.803733843944471], 'best_params': [{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}, {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}, {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}], 'avg_accuracy': np.float64(0.797351204723153

In [16]:
SVM_results

{'transformedAbalone': {0.2: {'test_accuracies': [0.7860562537402753,
    0.7905445840813884,
    0.7845601436265709],
   'best_params': [{'C': 10, 'gamma': 'scale', 'kernel': 'linear'},
    {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'},
    {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}],
   'avg_accuracy': np.float64(0.7870536604827448)},
  0.5: {'test_accuracies': [0.7951172809956917,
    0.7932024892292963,
    0.803733843944471],
   'best_params': [{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'},
    {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'},
    {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}],
   'avg_accuracy': np.float64(0.797351204723153)},
  0.8: {'test_accuracies': [0.7978468899521531,
    0.8086124401913876,
    0.7942583732057417],
   'best_params': [{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'},
    {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'},
    {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}],
   'avg_accuracy': np.float64(0.8002392344497608)}},
 'transformedAdult': {0.2: {'test

In [None]:
for dataset_name, results in SVM_results.items():
    for training_size in (0.2, 0.5, 0.8):
        avg_accuracy = results[training_size]['avg_accuracy']
        print(f"{dataset_name} with training size {training_size}; average accuracy: {avg_accuracy}")

transformedAbalone with training size 0.2; average accuracy: 0.7870536604827448
transformedAbalone with training size 0.5; average accuracy: 0.797351204723153
transformedAbalone with training size 0.8; average accuracy: 0.8002392344497608
transformedAdult with training size 0.2; average accuracy: 0.6688932111719644
transformedAdult with training size 0.5; average accuracy: 0.6717715627260691
transformedAdult with training size 0.8; average accuracy: 0.6700446992186168
transformedBankMarketing with training size 0.2; average accuracy: 0.8953984535559917
transformedBankMarketing with training size 0.5; average accuracy: 0.8989206405379103
transformedBankMarketing with training size 0.8; average accuracy: 0.9029083268826716


In [None]:
# ANN

In [83]:
from sklearn.neural_network import MLPClassifier

In [84]:
ANN_results = {}

In [None]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)],
}

In [86]:
for dataset_name, dataset in transformed_datasets.items():
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    dataset_results = {}

    for training_size in (0.2, 0.5, 0.8):
        partition_results = {'test_accuracies': [], 'best_params': []}

        for seed in random_seeds:
            Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=training_size, stratify=y, random_state=seed)

            cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
            grid_search = GridSearchCV(MLPClassifier(random_state=seed, max_iter=1000), param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
            
            grid_search.fit(Xtrain, ytrain)
            best_params = grid_search.best_params_

            best_ANN = MLPClassifier(**best_params, random_state=seed, max_iter=1000)
            best_ANN.fit(Xtrain, ytrain)

            y_pred = best_ANN.predict(Xtest)
            test_accuracy = accuracy_score(ytest, y_pred)

            partition_results['test_accuracies'].append(test_accuracy)
            partition_results['best_params'].append(best_params)

            print(f"{dataset_name}, training size {training_size}, test accuracy {test_accuracy}")
        
        avg_accuracy = np.mean(partition_results['test_accuracies'])
        partition_results['avg_accuracy'] = avg_accuracy 

        print(partition_results)
        dataset_results[training_size] = partition_results

    ANN_results[dataset_name] = dataset_results

transformedAbalone, training size 0.2, test accuracy 0.7776780371035308
transformedAbalone, training size 0.2, test accuracy 0.7860562537402753
transformedAbalone, training size 0.2, test accuracy 0.7758827049670856
{'test_accuracies': [0.7776780371035308, 0.7860562537402753, 0.7758827049670856], 'best_params': [{'hidden_layer_sizes': (100, 50)}, {'hidden_layer_sizes': (100, 50)}, {'hidden_layer_sizes': (100, 50)}], 'avg_accuracy': np.float64(0.7798723319369639)}
transformedAbalone, training size 0.5, test accuracy 0.7917663954044998
transformedAbalone, training size 0.5, test accuracy 0.7802776448061274
transformedAbalone, training size 0.5, test accuracy 0.7989468645284825
{'test_accuracies': [0.7917663954044998, 0.7802776448061274, 0.7989468645284825], 'best_params': [{'hidden_layer_sizes': (100, 50)}, {'hidden_layer_sizes': (100, 50)}, {'hidden_layer_sizes': (100, 50)}], 'avg_accuracy': np.float64(0.7903303015797033)}
transformedAbalone, training size 0.8, test accuracy 0.785885167

In [87]:
for dataset_name, results in ANN_results.items():
    for training_size in (0.2, 0.5, 0.8):
        avg_accuracy = results[training_size]['avg_accuracy']
        print(f"{dataset_name} with training size {training_size}; average accuracy: {avg_accuracy}")

transformedAbalone with training size 0.2; average accuracy: 0.7798723319369639
transformedAbalone with training size 0.5; average accuracy: 0.7903303015797033
transformedAbalone with training size 0.8; average accuracy: 0.7922647527910686
transformedAdult with training size 0.2; average accuracy: 0.5968930746788145
transformedAdult with training size 0.5; average accuracy: 0.6136521845952254
transformedAdult with training size 0.8; average accuracy: 0.605998566895281
transformedBankMarketing with training size 0.2; average accuracy: 0.8790953579031768
transformedBankMarketing with training size 0.5; average accuracy: 0.879088147689404
transformedBankMarketing with training size 0.8; average accuracy: 0.8826348188285599


In [None]:
# LogReg

In [93]:
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [94]:
LR_results = {}

In [95]:
param_grid = {
    'C': [0.1, 10],
    'penalty': ['l1', 'l2']
}

In [96]:
for dataset_name, dataset in transformed_datasets.items():
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values

    dataset_results = {}

    for training_size in (0.2, 0.5, 0.8):
        partition_results = {'test_accuracies': [], 'best_params': []}

        for seed in random_seeds:
            Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=training_size, stratify=y, random_state=seed)

            cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
            grid_search = GridSearchCV(LogisticRegression(random_state=seed, max_iter=1000), param_grid=param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
            
            grid_search.fit(Xtrain, ytrain)
            best_params = grid_search.best_params_

            best_LR = LogisticRegression(**best_params, random_state=seed, max_iter=1000)
            best_LR.fit(Xtrain, ytrain)

            y_pred = best_LR.predict(Xtest)
            test_accuracy = accuracy_score(ytest, y_pred)

            partition_results['test_accuracies'].append(test_accuracy)
            partition_results['best_params'].append(best_params)

            print(f"{dataset_name}, training size {training_size}, test accuracy {test_accuracy}")
        
        avg_accuracy = np.mean(partition_results['test_accuracies'])
        partition_results['avg_accuracy'] = avg_accuracy 

        print(partition_results)
        dataset_results[training_size] = partition_results

    LR_results[dataset_name] = dataset_results

transformedAbalone, training size 0.2, test accuracy 0.781867145421903
transformedAbalone, training size 0.2, test accuracy 0.7935368043087971
transformedAbalone, training size 0.2, test accuracy 0.7803710353081987
{'test_accuracies': [0.781867145421903, 0.7935368043087971, 0.7803710353081987], 'best_params': [{'C': 10, 'penalty': 'l2'}, {'C': 10, 'penalty': 'l2'}, {'C': 10, 'penalty': 'l2'}], 'avg_accuracy': np.float64(0.7852583283462996)}
transformedAbalone, training size 0.5, test accuracy 0.7879368118717089
transformedAbalone, training size 0.5, test accuracy 0.7898516036381044
transformedAbalone, training size 0.5, test accuracy 0.781235040689325
{'test_accuracies': [0.7879368118717089, 0.7898516036381044, 0.781235040689325], 'best_params': [{'C': 10, 'penalty': 'l2'}, {'C': 10, 'penalty': 'l2'}, {'C': 10, 'penalty': 'l2'}], 'avg_accuracy': np.float64(0.7863411520663793)}
transformedAbalone, training size 0.8, test accuracy 0.7811004784688995
transformedAbalone, training size 0.8,

In [97]:
for dataset_name, results in LR_results.items():
    for training_size in (0.2, 0.5, 0.8):
        avg_accuracy = results[training_size]['avg_accuracy']
        print(f"{dataset_name} with training size {training_size}; average accuracy: {avg_accuracy}")

transformedAbalone with training size 0.2; average accuracy: 0.7852583283462996
transformedAbalone with training size 0.5; average accuracy: 0.7863411520663793
transformedAbalone with training size 0.8; average accuracy: 0.779505582137161
transformedAdult with training size 0.2; average accuracy: 0.6540154578492091
transformedAdult with training size 0.5; average accuracy: 0.6563886272743403
transformedAdult with training size 0.8; average accuracy: 0.6534616303272256
transformedBankMarketing with training size 0.2; average accuracy: 0.8924861990470659
transformedBankMarketing with training size 0.5; average accuracy: 0.8932141909227639
transformedBankMarketing with training size 0.8; average accuracy: 0.8926241291606768


In [None]:
# Figure 1: Average Classification Accuracy Per Model

In [98]:
results_dict = {
    'SVM': SVM_results,
    'ANN': ANN_results,
    'LogReg': LR_results
}

In [110]:
def extract_accuracies(results):
    accuracies = []
    for dataset, dataset_results in results.items():
        for training_size, partition_results in dataset_results.items():
            print(partition_results)
            accuracies.append(partition_results['avg_accuracy'])
    return accuracies

In [None]:
overall_accuracies = {model: np.mean(extract_accuracies(results)) for model, results in results_dict.items()}

In [113]:
overall_accuracy_table = pd.DataFrame.from_dict(overall_accuracies, orient='index', columns=['Average Accuracy'])

In [114]:
print(overall_accuracy_table)

        Average Accuracy
SVM             0.788065
ANN             0.757759
LogReg          0.777033


In [115]:
# Figure 2: Average Classification Error Per Model Per Dataset

In [116]:
dataset_model_accuracies = {}

In [117]:
for model, results in results_dict.items():
    dataset_model_accuracies[model] = {}
    for dataset, dataset_results in results.items():
        all_dataset_accuracies = []
        for training_size, partition_results in dataset_results.items():
            all_dataset_accuracies.append(partition_results['avg_accuracy'])
        dataset_model_accuracies[model][dataset] = np.mean(all_dataset_accuracies)

In [118]:
dataset_model_accuracy_table = pd.DataFrame(dataset_model_accuracies).T

In [119]:
print(dataset_model_accuracy_table)

        transformedAbalone  transformedAdult  transformedBankMarketing
SVM               0.794881          0.670236                  0.899076
ANN               0.787489          0.605515                  0.880273
LogReg            0.783702          0.654622                  0.892775


In [120]:
# Figure 3: Average Classification Error Per Model Per Dataset Per Partition

In [121]:
dataset_model_partition_accuracies = {}

In [123]:
for model, results in results_dict.items():
    dataset_model_partition_accuracies[model] = {}
    for dataset, dataset_results in results.items():
        dataset_model_partition_accuracies[model][dataset] = {}
        for training_size, partition_results in dataset_results.items():
            dataset_model_partition_accuracies[model][dataset][training_size] = partition_results['avg_accuracy']

In [124]:
dataset_model_partition_accuracy_table = pd.DataFrame(dataset_model_partition_accuracies).T

In [125]:
print(dataset_model_partition_accuracy_table)

                                       transformedAbalone  \
SVM     {0.2: 0.7870536604827448, 0.5: 0.7973512047231...   
ANN     {0.2: 0.7798723319369639, 0.5: 0.7903303015797...   
LogReg  {0.2: 0.7852583283462996, 0.5: 0.7863411520663...   

                                         transformedAdult  \
SVM     {0.2: 0.6688932111719644, 0.5: 0.6717715627260...   
ANN     {0.2: 0.5968930746788145, 0.5: 0.6136521845952...   
LogReg  {0.2: 0.6540154578492091, 0.5: 0.6563886272743...   

                                 transformedBankMarketing  
SVM     {0.2: 0.8953984535559917, 0.5: 0.8989206405379...  
ANN     {0.2: 0.8790953579031768, 0.5: 0.8790881476894...  
LogReg  {0.2: 0.8924861990470659, 0.5: 0.8932141909227...  
