### Library used and deeclare data needed

In [19]:
import numpy as np
import csv
import pandas as pd
from numpy.typing import NDArray
from scipy.stats import zscore
from pytictoc import TicToc
from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold, cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
from sklearn.metrics import accuracy_score

In [60]:
kernel_fcn = 'rbf'
opts_csv_fold = 5
with open('./data/algolabels.csv', newline='') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            algo_labels = row

y = np.loadtxt('./data/ybin.csv', delimiter=',', skiprows=1)
z = pd.read_csv('./data/z.csv', header=None, dtype=np.float64)

ninst, nalgos = y.shape
w = np.ones((ninst, nalgos))

""" Problem 1 - Normalization generate different result with MATLAB """
z_norm = (z-np.mean(z, axis=0))/np.std(z, ddof=1, axis=0)
pd.DataFrame(z_norm).to_csv('z_f.csv', header=None, index=None)
# scaler = StandardScaler().fit(z)

cvcmat = np.zeros((nalgos, 4))


### Check Consistency

In [65]:
z_norm_M = pd.read_csv('./data/z_norm.csv', header=None, dtype=np.float64)
z_norm_P = pd.read_csv('z_f.csv', header=None, dtype=np.float64)

tolerance = 1e-10
are_close = np.isclose(z_norm_M.values, z_norm_P.values, atol=tolerance)
results = are_close.all()

print("Are all elements close within the tolerance level: ", results)
if not results:
    mismatches = np.where(~are_close)
    print("Mismatch found at positions:", mismatches)

Are all elements close within the tolerance level:  True


### Training

In [91]:
def fit_libsvm(z, y, kkv, kernel_given):
    accuracy= dict()
    for k, v in kkv.items():
        train_index, test_index = v[0], v[1]
        # prepare training data
        x_train = [z[i] for i in train_index]
        y_train = [y[i] for i in train_index]
        # prepare test data
        x_test = [z[i] for i in test_index]
        y_test = [y[i] for i in test_index]
        svm = SVC(kernel=kernel_given, C=1.0, random_state = 0)
        svm.fit(x_train, y_train)
        y_pred = svm.predict(x_test)
        # calculate accuracy
        accuracy[k] = accuracy_score(y_test, y_pred)
        
    return accuracy

In [25]:
def fit_matsvm(z, y, w, skf, kernel_given, params):
    # TODO Set up parallel workers in pool
    

    # Scikit-learn lib need to ensuring data contiguity
    z = np.ascontiguousarray(z)
    y = np.ascontiguousarray(y)
    w = np.ascontiguousarray(w)
    
    # Check if hyperparameter is given by user
    if(np.isnan(params)):

        # Initialize a random number generator
        np.random.seed(0)

        # Retrieve default hyperparameters for fitcsvm and sets the range for the box constraint (C) and kernel scale
        # Define the range for C and gamma in a logarithmic scale
        param_grid = {
        'C': np.logspace(-10, 4, base=2, num=15),
        'gamma': np.logspace(-10, 4, base=2, num=15)
        }

        # By default, the class_weight=None represent equal weight
        svm_model = SVC(kernel=kernel_given, class_weight=None, random_state=0)

        # steps = list()
        # steps.append(('scaler', StandardScaler()))
        # steps.append(('model', svm_model))
        # pipeline = Pipeline(steps=steps)

        # Used for exhaustive search over specified parameter values for the SVM. The param_grid defines 
        # the range over which C and gamma will be tuned.
        # GridSearchCV for optimizing the hyperparameters
        grid_search = GridSearchCV(
            svm_model, param_grid, 
            scoring='accuracy', # 'roc_auc'
            cv=skf, 
            verbose=0
            #, n_jobs=nworkers if nworkers != 0 else None,
            )

        # OPT1: 
        # Fit a probability calibration model with trained SVM
        # print(z.shape, y.shape, w.shape)
        grid_search.fit(z, y, sample_weight=w)   # Fit GridSearchCV
        best_svm = grid_search.best_estimator_
       
        calibrator = CalibratedClassifierCV(best_svm, cv='prefit', method='sigmoid')
        calibrator.fit(z, y, sample_weight=w)

        # OPT2: retrain
        # calibrator = CalibratedClassifierCV(best_svm, cv=skf, method='sigmoid')
        # calibrator.fit(z_norm, y, sample_weight=w)

        best_C = grid_search.best_params_['C']
        best_g = grid_search.best_params_['gamma']

        y_sub = calibrator.predict(z)
        p_sub = calibrator.predict_proba(z)

        # Making predictions on the same data to simulate resubstitution prediction
        y_hat = y_sub
        p_hat = p_sub
        
        print("Best C:", best_C)
        print("Best gamma:", best_g)


    return best_svm, y_sub, p_sub, y_hat, p_hat, best_C, best_g

### MATSVM


In [26]:
t = TicToc()
t.tic()

for i in range(nalgos):
    t_inner = TicToc()
    t_inner.tic()

    state = np.random.get_state()
    np.random.seed(0)  # equivalent to MATLAB's rng('default') ?

    # y_b = [row[i] for row in y]
    y_b = y[:, i]

    
    # REQUIRE: Test case for validation the result
    # shuffle = True means that it partitions the data as it is presented. If the data is already in a random 
    # order, this isn't typically an issue, but if the data has some sort of order (e.g., all instances of one 
    # class followed by all instances of another), stratification alone won't randomize the order of the instances within each fold.

    # skf = StratifiedKFold(n_splits = opts_csv_fold, shuffle = True, random_state = 0)
    skf = KFold(n_splits = opts_csv_fold, shuffle = True, random_state = 0)
    
    # Test k-fold cross validation
    # data_splits = skf.split(z_norm, y_b)
    # test_split = next(data_splits)
    # print("Train indices: ", test_split[0][0:100])
    # print("Test indices: ", test_split[1][0:100])

    fit_matsvm(z, y_b, w[:, i], skf, kernel_fcn, np.nan)

    best_svm, y_sub, p_sub, y_hat, p_hat, best_C, best_g = fit_matsvm(z, y_b, w[:, i], skf, kernel_fcn, np.nan)
    aux = confusion_matrix(y_b, y_sub)

    # np.prod(aux.shape) != 4 is False
    cvcmat[i, :] = aux.flatten()
    models_left = nalgos - (i + 1)
    print(f"    -> PYTHIA has trained a model for {algo_labels[i]}, there are {models_left} models left to train.")

    print(f"      -> Elapsed time: {t_inner.tocvalue():.2f}s")

    

Best C: 0.125
Best gamma: 0.25
Best C: 0.125
Best gamma: 0.25
------------aux-----------
[[ 64  18]
 [ 16 113]]
    -> PYTHIA has trained a model for NB, there are 9 models left to train.
      -> Elapsed time: 7.75s
Best C: 16.0
Best gamma: 0.125
Best C: 16.0
Best gamma: 0.125
------------aux-----------
[[ 61  22]
 [  9 119]]
    -> PYTHIA has trained a model for LDA, there are 8 models left to train.
      -> Elapsed time: 7.88s
Best C: 16.0
Best gamma: 0.015625
Best C: 16.0
Best gamma: 0.015625
------------aux-----------
[[118  18]
 [ 23  52]]
    -> PYTHIA has trained a model for QDA, there are 7 models left to train.
      -> Elapsed time: 7.69s
Best C: 16.0
Best gamma: 0.03125
Best C: 16.0
Best gamma: 0.03125
------------aux-----------
[[ 53   7]
 [  8 143]]
    -> PYTHIA has trained a model for CART, there are 6 models left to train.
      -> Elapsed time: 6.83s
Best C: 4.0
Best gamma: 0.125
Best C: 4.0
Best gamma: 0.125
------------aux-----------
[[ 47  10]
 [  9 145]]
    -> P

### Estimation

In [115]:
  
tn, fp, fn, tp = cvcmat[:, 0], cvcmat[:, 1], cvcmat[:, 2], cvcmat[:, 3]
print(tn.dtype)  
print(tn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / ninst

print(precision)
# print(recall)
# print(accuracy)

float64
[ 61.  70. 122.  21.  54.  57.  58.  86.  12.  53.]
[0.8359375  0.875      0.75       0.77456647 0.97727273 1.
 0.98425197 0.88073394 0.72928177 0.8976378 ]


### LIBSVM

In [92]:
y = pd.read_csv('./data/ybin.csv')
y = y.values.tolist()

z = pd.read_csv('./data/z.csv').values.tolist()
z_norm = zscore(z, axis = 0, ddof = 1)

for i in range(nalgos):
    t_inner = TicToc()
    t_inner.tic()

    state = np.random.get_state()
    np.random.seed(0)  # equivalent to MATLAB's rng('default') ?

    # REQUIRE: Test case for validation the result
    y_b = [row[i] for row in y]
    # y_b = y[:, i]
    skf = StratifiedKFold(n_splits = opts_csv_fold, shuffle = True, random_state = 0)
    
    kkv= dict()
    for i, (train_index, test_index) in enumerate(skf.split(np.zeros(len(y_b)), y_b)):
        kkv[i] = [train_index.tolist(), test_index.tolist()]
    
    # start training using svm
    svm_res = fit_libsvm(z_norm, y_b, kkv, kernel_fcn)

    # visualise accuracy score
for k, v in svm_res.items():
    print(f'{k} fold: accuracy score = {v}')

0 fold: accuracy score = 0.8837209302325582
1 fold: accuracy score = 0.9047619047619048
2 fold: accuracy score = 0.8333333333333334
3 fold: accuracy score = 0.8333333333333334
4 fold: accuracy score = 0.8571428571428571
