### Library used and deeclare data needed

In [59]:
import numpy as np
import csv
from numpy.typing import NDArray
from scipy.stats import zscore
from pytictoc import TicToc
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.metrics import accuracy_score

# from matilda.data.model import AlgorithmSummary
# from matilda.data.option import Opts

In [61]:
kernel_fcn = 'rbf'
opts_csv_fold = 5
with open('./data/algolabels.csv', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            algo_labels = row

# prepare for y, transpose it
# y = pd.read_csv('./data/ybin.csv')
# y = y.values.tolist()

y = np.loadtxt('./data/ybin.csv', delimiter=',', skiprows=1)
z = pd.read_csv('./data/z.csv')

ninst, nalgos = y.shape
w = np.ones((ninst, nalgos))
# prepare for z, normalise it

z_norm = zscore(z, axis = 0, ddof = 1)

cvcmat = np.zeros((nalgos, 4))


### Training

In [91]:
def fit_libsvm(z, y, kkv, kernel_given):
    accuracy= dict()
    for k, v in kkv.items():
        train_index, test_index = v[0], v[1]
        # prepare training data
        x_train = [z[i] for i in train_index]
        y_train = [y[i] for i in train_index]
        # prepare test data
        x_test = [z[i] for i in test_index]
        y_test = [y[i] for i in test_index]
        svm = SVC(kernel=kernel_given, C=1.0, random_state = 0)
        svm.fit(x_train, y_train)
        y_pred = svm.predict(x_test)
        # calculate accuracy
        accuracy[k] = accuracy_score(y_test, y_pred)
        
    return accuracy

In [73]:
def fit_matsvm(z, y, w, skf, kernel_given, params):
    # TODO Set up parallel workers in pool
    

    # Scikit-learn lib need to ensuring data contiguity
    z = np.ascontiguousarray(z)
    y = np.ascontiguousarray(y)
    w = np.ascontiguousarray(w)
    
    # Check if hyperparameter is given by user
    if(np.isnan(params)):
        # Initialize a random number generator
        np.random.seed(0)

        # Retrieve default hyperparameters for fitcsvm and sets the range for the box constraint (C) and kernel scale
        # Define the range for C and gamma in a logarithmic scale
        param_grid = {
        # Generates 15 numbers between 2^-10 and 2^4
        'C': np.logspace(-10, 4, base=2, num=15),
        'gamma': np.logspace(-10, 4, base=2, num=15)
        }

        # z is normalised without modifying the scale, since in the original settings, 
        # the 'Standardize'is false
        # MinMaxScaler  --- slight improve!
        # scaler = MinMaxScaler()
        # scaler.fit(z)
        # z = scaler.transform(z)

        # By default, the class_weight=None represent equal weight OR
        # Let SVC balance the weight with class_weight='balance'    ---------?
        svm_model = SVC(kernel=kernel_given, class_weight=None, random_state=0, probability=True)

        # scores = cross_val_score(model, z, y, scoring='accuracy', cv=skf)
        # # for score in scores:
        # #     print("Accuracy for this al is: ", accuracy)
        # print("Mean Accuracy for this al is: ", np.mean(scores))


        # Used for exhaustive search over specified parameter values for the SVM. The param_grid defines 
        # the range over which C and gamma will be tuned.
        # GridSearchCV for optimizing the hyperparameters
        grid_search = GridSearchCV(
            estimator=svm_model, 
            param_grid=param_grid, 
            # 'roc_auc' measures the area under the receiver operating characteristic curve, which is a 
            # good choice for binary classification problems, especially with imbalanced classes.
            scoring='accuracy', # 'roc_auc'
            cv=skf, 
            verbose=0
            #, n_jobs=nworkers if nworkers != 0 else None,
            )

        # OPT1: 
        # Fit a probability calibration model with trained SVM
   
        grid_search.fit(z, y, sample_weight=w)   # Fit GridSearchCV
        best_svm = grid_search.best_estimator_
        # With cv='prefit' and default method is method='sigmoid'
        # calibrator = CalibratedClassifierCV(best_svm, cv='prefit', method='sigmoid')
        # calibrator.fit(z, y, sample_weight=w)

        # OPT2: Use it to train
        # calibrator = CalibratedClassifierCV(best_svm, cv=skf, method='sigmoid')
        # calibrator.fit(z, y, sample_weight=w)

        # Retrieve the best model and hyperparameters
        
        best_C = grid_search.best_params_['C']
        best_g = grid_search.best_params_['gamma']

        # y_sub = best_svm.predict(z)
        y_sub = best_svm.predict(z)
        p_sub = best_svm.predict_proba(z)

        # Making predictions on the same data to simulate resubstitution prediction
        y_hat = y_sub
        p_hat = p_sub
        
        print("Best C:", best_C)
        print("Best gamma:", best_g)


    return best_svm, y_sub, p_sub, y_hat, p_hat, best_C, best_g

### MATSVM


In [74]:
t = TicToc()
t.tic()

for i in range(nalgos):
    t_inner = TicToc()
    t_inner.tic()

    state = np.random.get_state()
    np.random.seed(0)  # equivalent to MATLAB's rng('default') ?

    # y_b = [row[i] for row in y]
    y_b = y[:, i]

    # Split data into train and test to verify accuracy after fitting the model
    # Note: z is used here, not z_norm. If normalise and scale ahead, some test data might leak into
    # the training process;
    # x_train, x_test, y_train, y_test = train_test_split(z, y_b, 
    #                                                     test_size=0.25,
    #                                                     random_state=42) # =0?
    
    # REQUIRE: Test case for validation the result --better!
    skf = StratifiedKFold(n_splits = opts_csv_fold, shuffle = True, random_state = 0)
    # skf = KFold(n_splits = opts_csv_fold, shuffle = True, random_state = 0)
    
    # Test k-fold cross validation
    # data_splits = skf.split(z_norm, y_b)
    # test_split = next(data_splits)
    # print("Train indices: ", test_split[0][0:100])
    # print("Test indices: ", test_split[1][0:100])

    fit_matsvm(z_norm, y_b, w[:, i], skf, kernel_fcn, np.nan)

    best_svm, y_sub, p_sub, y_hat, p_hat, best_C, best_g = fit_matsvm(z_norm, y_b, w[:, i], skf, kernel_fcn, np.nan)
    aux = confusion_matrix(y_b, y_sub)
    print("------------aux-----------")
    print(aux)
    # 66    17   16   113

    # np.prod(aux.shape) != 4 is False
    cvcmat[i, :] = aux.flatten()
    models_left = nalgos - (i + 1)
    print(f"    -> PYTHIA has trained a model for {algo_labels[i]}, there are {models_left} models left to train.")

    print(f"      -> Elapsed time: {t_inner.tocvalue():.2f}s")

    

Best C: 0.5
Best gamma: 0.125
Best C: 0.5
Best gamma: 0.125
------------aux-----------
[[ 65  17]
 [ 16 113]]
    -> PYTHIA has trained a model for NB, there are 9 models left to train.
      -> Elapsed time: 23.34s
Best C: 4.0
Best gamma: 2.0
Best C: 4.0
Best gamma: 2.0
------------aux-----------
[[ 63  20]
 [  7 121]]
    -> PYTHIA has trained a model for LDA, there are 8 models left to train.
      -> Elapsed time: 23.88s
Best C: 8.0
Best gamma: 0.125
Best C: 8.0
Best gamma: 0.125
------------aux-----------
[[114  22]
 [ 20  55]]
    -> PYTHIA has trained a model for QDA, there are 7 models left to train.
      -> Elapsed time: 23.70s
Best C: 1.0
Best gamma: 0.5
Best C: 1.0
Best gamma: 0.5
------------aux-----------
[[ 54   6]
 [ 10 141]]
    -> PYTHIA has trained a model for CART, there are 6 models left to train.
      -> Elapsed time: 18.86s
Best C: 16.0
Best gamma: 0.0625
Best C: 16.0
Best gamma: 0.0625
------------aux-----------
[[ 49   8]
 [ 10 144]]
    -> PYTHIA has trained 

### Estimation

In [75]:
  
tn, fp, fn, tp = cvcmat[:, 0], cvcmat[:, 1], cvcmat[:, 2], cvcmat[:, 3]
print(tn.dtype)  
print(tn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / ninst

print(precision)
# print(recall)
# print(accuracy)

float64
[ 65.  63. 114.  54.  49.  49.  54.  85.  59.  57.]
[0.86923077 0.85815603 0.71428571 0.95918367 0.94736842 0.94736842
 0.95973154 0.88135593 0.98648649 0.93877551]


### LIBSVM

In [92]:
y = pd.read_csv('./data/ybin.csv')
y = y.values.tolist()

z = pd.read_csv('./data/z.csv').values.tolist()
z_norm = zscore(z, axis = 0, ddof = 1)

for i in range(nalgos):
    t_inner = TicToc()
    t_inner.tic()

    state = np.random.get_state()
    np.random.seed(0)  # equivalent to MATLAB's rng('default') ?

    # REQUIRE: Test case for validation the result
    y_b = [row[i] for row in y]
    # y_b = y[:, i]
    skf = StratifiedKFold(n_splits = opts_csv_fold, shuffle = True, random_state = 0)
    
    kkv= dict()
    for i, (train_index, test_index) in enumerate(skf.split(np.zeros(len(y_b)), y_b)):
        kkv[i] = [train_index.tolist(), test_index.tolist()]
    
    # start training using svm
    svm_res = fit_libsvm(z_norm, y_b, kkv, kernel_fcn)

    # visualise accuracy score
for k, v in svm_res.items():
    print(f'{k} fold: accuracy score = {v}')

0 fold: accuracy score = 0.8837209302325582
1 fold: accuracy score = 0.9047619047619048
2 fold: accuracy score = 0.8333333333333334
3 fold: accuracy score = 0.8333333333333334
4 fold: accuracy score = 0.8571428571428571
