# TP3 - Ejercicio 2
### Enunciado
a)  Dividir aleatoriamente el conjunto de datos en dos conjuntos, uno de entrenamiento y uno de prueba.

b)  Clasificar la variable categórica _sigdz_ que indica si el paciente posee o no una enfermedad coronaria, utilizando el método SVM. Calcular la matriz de confusión.

c)  Proponer (y también realizar) una secuencia de pasos que permita encontrar una clasificación mejor que la anterior, evaluando diferentes valores de C y diferentes núcleos.

In [33]:
from IPython.display import display
import datetime
import pandas as pd
import numpy as np
import os
from sklearn import svm
from ml_tps.utils.evaluation_utils import getConfusionMatrix, computeAccuracy
from ml_tps.utils.dataframe_utils import divide_in_training_test_datasets, scale_dataset, seperateDatasetObjectiveData

print("Module and package import success")

Module and package import success


In [34]:
# Classification target and the fractions of the data set used for training and cross-validation
DEFAULT_OBJECTIVE = "sigdz"
DEFAULT_TRAIN_PCTG, DEFAULT_CV_PCTG = 0.6, 0.2

# "e2.ipynb" is a relative name, so realpath resolves it against the current working directory;
# the data file is then located relative to that directory.
dir_path = os.path.dirname(os.path.realpath("e2.ipynb"))
DEFAULT_FILEPATH = dir_path + "/../tp3/data/acath.xls"

print("Variable setting and data import success")

Variable setting and data import success


## Data set info

In [35]:
# a)  Divide dataset randomly into training and evaluation set
# Load the spreadsheet, discard rows with missing values and drop the "tvdlm" column.
dataset = (
    pd.read_excel(DEFAULT_FILEPATH)
    .dropna()                   # TODO maybe deal with NaN otherwise?
    .drop(columns="tvdlm")      # Drop the tvdlm column, which does not add information
)
# NOTE(review): scaling happens before the split, so the min-max scaling presumably also sees
# the CV and test rows -- confirm whether it should be fitted on the training set only.
dataset_scaled = scale_dataset(dataset, objective=DEFAULT_OBJECTIVE, scaling_type="minmax")

# Two-stage split: training set vs. the rest, then the rest into CV and test sets.
# The second fraction is relative to the remainder, hence the division by (1 - train fraction).
train, testing_sets = divide_in_training_test_datasets(dataset_scaled, train_pctg=DEFAULT_TRAIN_PCTG)
cv_set, test = divide_in_training_test_datasets(testing_sets, train_pctg=DEFAULT_CV_PCTG/(1-DEFAULT_TRAIN_PCTG))

# Separate each subset into feature data and the objective column
X_train, y_train = seperateDatasetObjectiveData(train, DEFAULT_OBJECTIVE)
X_cv_set, y_cv_set = seperateDatasetObjectiveData(cv_set, DEFAULT_OBJECTIVE)
X_test, y_test = seperateDatasetObjectiveData(test, DEFAULT_OBJECTIVE)

# Percentages use the rounding ".0%" format: int(x * 100) truncates, so a float product
# such as 0.57 * 100 == 56.99999999999999 would be reported as 56%.
data_info = pd.Series({
    "Data set dimensions (ignoring NaN)": dataset.shape,
    "Training set dimensions": train.shape,
    "CV set dimensions": cv_set.shape,
    "Test set dimensions": test.shape,
    "% of data set used for training": f"{DEFAULT_TRAIN_PCTG:.0%}",
    "% of data set used for cross-validation": f"{DEFAULT_CV_PCTG:.0%}",
    "% of data set used for testing": f"{1 - DEFAULT_TRAIN_PCTG - DEFAULT_CV_PCTG:.0%}",
    "Classification objective": DEFAULT_OBJECTIVE,
})

pd.DataFrame(data_info, columns=["acath.xls"])

*** No CODEPAGE record, no encoding_override: will use 'ascii'


Unnamed: 0,acath.xls
Data set dimensions (ignoring NaN),"(2258, 5)"
Training set dimensions,"(1354, 5)"
CV set dimensions,"(452, 5)"
Test set dimensions,"(452, 5)"
% of data set used for training,60%
% of data set used for cross-validation,20%
% of data set used for testing,20%
Classification objective,sigdz


## Default SVM

In [36]:
# b)  Classify categorical variable "sigdz" using default SVC SVM
words_then = datetime.datetime.now()    # start of the timed section

# Hyperparameters are written out explicitly for illustrative purposes
kernel1, c_value1 = "rbf", 1
clf1 = svm.SVC(C=c_value1, kernel=kernel1, gamma='scale')
clf1.fit(X_train, y_train)

# Predict on the training and cross-validation sets, then evaluate both
predictions_train1 = pd.Series(clf1.predict(X_train))
predictions_cv1 = pd.Series(clf1.predict(X_cv_set))
confusion_matrix = getConfusionMatrix(predictions_cv1, y_cv_set)
accuracy_train1 = computeAccuracy(predictions_train1, y_train)
accuracy_cv1 = computeAccuracy(predictions_cv1, y_cv_set)

# One-row summary table for this configuration
summary_columns = ["Kernel", "C value", "Training set accuracy", "CV set accuracy"]
data_default_svm = pd.DataFrame(columns=summary_columns)
data_default_svm.loc[0] = [kernel1, c_value1, accuracy_train1, accuracy_cv1]

# Report the elapsed time as (minutes, seconds)
words_now = datetime.datetime.now()
elapsed_seconds = (words_now - words_then).total_seconds()
print("Runtime Default SVM fitting and testing: ", divmod(elapsed_seconds, 60), "\n")
print(clf1, "\n\n")

data_default_svm

Runtime Default SVM fitting and testing:  (0.0, 0.16478) 

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False) 




Unnamed: 0,Kernel,C value,Training set accuracy,CV set accuracy
0,rbf,1,0.759232,0.723451


__Confusion Matrix:__

In [37]:
# Display the confusion matrix computed from the default SVM's predictions on the cross-validation set
confusion_matrix


Actual,0.0,1.0
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,75,50
1.0,75,252


## Parameter and kernel tuning

In [38]:
# c)  Evaluate different values for C and different kernels to find better performing classifiers
# Take a start timestamp in this cell so the reported runtime covers only the grid evaluation,
# not the time elapsed since an earlier cell was executed.
tuning_start = datetime.datetime.now()

kernels = ["rbf", "poly", "linear", "sigmoid"]
c_values = np.logspace(-3, 2, 6)    # six values from 1e-3 to 1e2, one per decade

# Collect one record per (kernel, C) combination and build the table once at the end
tuning_records = []
for kernel in kernels:
    for c_value in c_values:
        clf = svm.SVC(kernel=kernel, C=c_value, gamma="scale", cache_size=500)
        clf.fit(X_train, y_train)
        predictions_train = pd.Series(clf.predict(X_train))
        predictions_cv = pd.Series(clf.predict(X_cv_set))
        accuracy_train = computeAccuracy(predictions_train, y_train)
        accuracy_cv = computeAccuracy(predictions_cv, y_cv_set)

        tuning_records.append({
            "Kernel": kernel,
            "C value": c_value,
            "Training set accuracy": accuracy_train,
            "CV set accuracy": accuracy_cv,
        })

svm_values = pd.DataFrame(
    tuning_records,
    columns=["Kernel", "C value", "Training set accuracy", "CV set accuracy"],
)

# Report the elapsed time as (minutes, seconds)
time_now = datetime.datetime.now()
print("\n\nRuntime parameter and kernel testing: ", divmod((time_now - tuning_start).total_seconds(), 60), "\n")

svm_values



Runtime parameter and kernel testing:  (0.0, 12.233084) 



Unnamed: 0,Kernel,C value,Training set accuracy,CV set accuracy
0,rbf,0.001,0.65805,0.668142
1,rbf,0.01,0.735598,0.719027
2,rbf,0.1,0.737075,0.719027
3,rbf,1.0,0.759232,0.723451
4,rbf,10.0,0.774003,0.734513
5,rbf,100.0,0.771049,0.743363
6,poly,0.001,0.735598,0.719027
7,poly,0.01,0.735598,0.719027
8,poly,0.1,0.752585,0.719027
9,poly,1.0,0.757755,0.727876


#### Best performing SVM configuration:
(on cross-validation set)

In [39]:
# Rank every evaluated configuration by cross-validation accuracy (best first)
# and keep only the top row as a one-row DataFrame.
winner = (
    svm_values
    .sort_values("CV set accuracy", ascending=False)
    .iloc[:1]
)
winner

Unnamed: 0,Kernel,C value,Training set accuracy,CV set accuracy
5,rbf,100.0,0.771049,0.743363


__Performance on test set:__

In [40]:
# Fit a fresh SVM with the winning configuration on the training set and score it on the test set.
# Look the hyperparameters up by column name rather than by position so the cell does not
# silently break if the column order of the results table changes.
best_kernel = winner["Kernel"].iat[0]
best_c_value = winner["C value"].iat[0]

winner_svm = svm.SVC(kernel=best_kernel, C=best_c_value, gamma="scale", cache_size=500)
winner_svm.fit(X_train, y_train)
winner_test_predictions = pd.Series(winner_svm.predict(X_test))
winner_test_accuracy = computeAccuracy(winner_test_predictions, y_test)

# .assign returns a new frame instead of writing a column into the one-row slice produced
# by the selection cell, which avoids chained-assignment warnings and keeps re-runs clean.
winner = winner.assign(**{"Test set accuracy": winner_test_accuracy})

winner

Unnamed: 0,Kernel,C value,Training set accuracy,CV set accuracy,Test set accuracy
5,rbf,100.0,0.771049,0.743363,0.730088
