## Library used and deeclare data needed

In [2]:
import numpy as np
from numpy.typing import NDArray
from scipy.stats import zscore
from pytictoc import TicToc
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
import pandas as pd
from sklearn.metrics import accuracy_score

# from matilda.data.model import AlgorithmSummary
# from matilda.data.option import Opts

In [26]:
kernel_fcn = 'rbf'
opts_csv_fold = 5
nalgos = 10

# prepare for y, transpose it
# y = pd.read_csv('./data/ybin.csv')
# y = y.values.tolist()

y = np.loadtxt('./data/ybin.csv', delimiter=',', skiprows=1)

# prepare for z, normalise it
z = pd.read_csv('./data/z.csv')
z_norm = zscore(z, axis = 0, ddof = 1)

ninst, nalgos = y.shape
w = np.ones((ninst, nalgos))

### Training

In [7]:
def fit_libsvm(z, y, kkv, kernel_given):
    accuracy= dict()
    for k, v in kkv.items():
        train_index, test_index = v[0], v[1]
        # prepare training data
        x_train = [z[i] for i in train_index]
        y_train = [y[i] for i in train_index]
        # prepare test data
        x_test = [z[i] for i in test_index]
        y_test = [y[i] for i in test_index]
        svm = SVC(kernel=kernel_given, C=1.0, random_state = 0)
        svm.fit(x_train, y_train)
        y_pred = svm.predict(x_test)
        # calculate accuracy
        accuracy[k] = accuracy_score(y_test, y_pred)
        
    return accuracy

In [30]:
def fit_matsvm(z, y, w, skf, kernel_given, params):
    # Set up parallel workers in pool
    
    # Check if hyperparameter is given by user
    if(np.isnan(params)):
        # Initialize a random number generator
        np.random.seed(0)

        # Scikit-learn lib need to ensuring data contiguity
        z = np.ascontiguousarray(z)
        y = np.ascontiguousarray(y)
        w = np.ascontiguousarray(w)

        # Debug shape and data type
        # print(w.flags)

        # Retrieve default hyperparameters for fitcsvm and sets the range for the box constraint (C) and kernel scale
        # Define the range for C and gamma in a logarithmic scale
        param_grid = {
        'C': np.logspace(-10, 4, base=2, num=15),
        'gamma': np.logspace(-10, 4, base=2, num=15)  
        }

        # cache_size: maximal, ?????class_weight='balanced'
        svm_model = SVC(kernel=kernel_given, cache_size=2000, class_weight='balanced', probability=True)

        # Used for exhaustive search over specified parameter values for the SVM. The param_grid defines 
        # the range over which C and gamma will be tuned.
        # GridSearchCV for optimizing the hyperparameters
        grid_search = GridSearchCV(
            estimator=svm_model, 
            param_grid=param_grid, 
            scoring='roc_auc', 
            cv=skf, 
            verbose=0
            #, n_jobs=nworkers if nworkers != 0 else None,
            )

        # Fit GridSearchCV
        grid_search.fit(z, y, sample_weight=w)

        # Retrieve the best model and hyperparameters
        best_svm = grid_search.best_estimator_
        best_C = grid_search.best_params_['C']
        best_g = grid_search.best_params_['gamma']

        # Calibrate the probability model
        calibrated_svm = CalibratedClassifierCV(best_svm, cv='prefit')
        calibrated_svm.fit(z, y, sample_weight=w)
        
        # Making predictions on the training data
        y_sub = calibrated_svm.predict(z)
        p_sub = calibrated_svm.predict_proba(z)[:, 1]

        t_y_sub = best_svm.predict(z)
        t_p_sub = best_svm.predict_proba(z)[:, 1]

        # Making predictions on the same data to simulate resubstitution prediction
        y_hat = y_sub
        p_hat = p_sub

        print("Resubstitution Predictions:", y_sub)
        print("Resubstitution Probabilities:", p_sub)
        print("T sub:", t_y_sub)
        print("T on p sub:", t_p_sub)
        print("Best C:", best_C)
        print("Best gamma:", best_g)


    return calibrated_svm #, y_sub, p_sub, y_hat, p_hat, best_C, best_g

In [31]:
t = TicToc()
t.tic()

for i in range(nalgos):
    t_inner = TicToc()
    t_inner.tic()

    state = np.random.get_state()
    np.random.seed(0)  # equivalent to MATLAB's rng('default') ?

    # REQUIRE: Test case for validation the result
    # y_b = [row[i] for row in y]
    y_b = y[:, i]
    skf = StratifiedKFold(n_splits = opts_csv_fold, shuffle = True, random_state = 0)
    
    kkv= dict()
    for i, (train_index, test_index) in enumerate(skf.split(np.zeros(len(y_b)), y_b)):
        kkv[i] = [train_index.tolist(), test_index.tolist()]
    # start training using svm
    # svm_res = fit_libsvm(z_norm, y_b, kkv, kernel_fcn)
    svm_res = fit_matsvm(z_norm, y_b, w[:, i], skf, kernel_fcn, np.nan)
    
# visualise accuracy score
# for k, v in svm_res.items():
#     print(f'{k} fold: accuracy score = {v}')

    

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False



ValueError: ndarray is not C-contiguous