# TP3 - Ejercicio 2
### Enunciado
a)  Dividir aleatoriamente el conjunto de datos en dos conjuntos, uno de entrenamiento y uno de prueba.

b)  Clasificar la variable categórica _sigdz_ que indica si el paciente posee o no una enfermedad coronaria, utilizando el método SVM. Calcular la matriz de confusión.

c)  Proponer (y también realizar) una secuencia de pasos que permita encontrar una clasificación mejor que la anterior, evaluando diferentes valores de C y diferentes núcleos.

In [29]:
from IPython.display import display
import datetime
import pandas as pd
import numpy as np
import os
from sklearn import svm
from ml_tps.utils.evaluation_utils import getConfusionMatrix, computeAccuracy
from ml_tps.utils.dataframe_utils import divide_in_training_test_datasets, scale_dataset, seperateDatasetObjectiveData

# Sanity message: reaching this line means every import above resolved without raising
print("Module and package import success")

Module and package import success


In [30]:
# --- Notebook configuration ---------------------------------------------------
# Directory part of the resolved path of "e2.ipynb" (resolved against the current
# working directory); the data file is located relative to it.
notebook_path = os.path.realpath("e2.ipynb")
dir_path = os.path.split(notebook_path)[0]

DEFAULT_FILEPATH = dir_path + "/../tp3/data/acath.xls"   # Excel workbook read in step a)
DEFAULT_OBJECTIVE = "sigdz"                              # column used as classification target
DEFAULT_TRAIN_PCTG = 0.6                                 # fraction of rows passed as train_pctg

print("Variable setting and data import success")

Variable setting and data import success


## Data set info

In [34]:
# a)  Divide dataset randomly into training and evaluation set

# Seed NumPy's global RNG so the random split can be reproduced after a kernel restart.
# NOTE(review): this only has an effect if divide_in_training_test_datasets draws from
# NumPy's global generator (directly or through pandas) -- confirm in ml_tps.utils.dataframe_utils.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Keep the raw frame under its own name; `dataset` is the NaN-free version used from here on.
dataset_raw = pd.read_excel(DEFAULT_FILEPATH)
dataset = dataset_raw.dropna()      # TODO maybe deal with NaN otherwise?

# NOTE(review): scaling happens on the full data set before the split, so the evaluation rows
# presumably influence the min-max range seen by the training rows -- verify scale_dataset and
# consider fitting the scaler on the training rows only.
dataset_scaled = scale_dataset(dataset, objective=DEFAULT_OBJECTIVE, ignore_objective=True, scaling_type="minmax")

train, test = divide_in_training_test_datasets(dataset_scaled, train_pctg=DEFAULT_TRAIN_PCTG)

# Separate the objective column from the remaining columns for both subsets
X_train, y_train = seperateDatasetObjectiveData(train, DEFAULT_OBJECTIVE)
X_test, y_test = seperateDatasetObjectiveData(test, DEFAULT_OBJECTIVE)

# Summary of the split. round() before int() so that float error in the product
# (e.g. 0.57 * 100 == 56.99999999999999) cannot truncate the reported percentage.
data_info = pd.Series({
    "Data set dimensions (ignoring NaN): ": dataset.shape,
    "Training set dimensions: ": train.shape,
    "Evaluation set dimensions: ": test.shape,
    "Percentage of data set used for training: ": int(round(DEFAULT_TRAIN_PCTG * 100)),
    "Classification objective: ": DEFAULT_OBJECTIVE,
})

display(data_info)

*** No CODEPAGE record, no encoding_override: will use 'ascii'


Data set dimensions (ignoring NaN):           (2258, 6)
Training set dimensions:                      (1354, 6)
Evaluation set dimensions:                     (904, 6)
Percentage of data set used for training:            60
Classification objective:                         sigdz
dtype: object

## Default SVM
Using RBF (radial basis function) kernel

In [32]:
# b)  Baseline: classify the categorical variable "sigdz" with an RBF-kernel SVC
words_then = datetime.datetime.now()      # start of the timed section

# Results table with one row per evaluated (kernel, C) pair; row 0 holds this baseline
svm_values = pd.DataFrame(columns=["Kernel", "C value", "Accuracy"])

# Parameters are written down explicitly for illustrative purposes
kernel1, c_value1 = "rbf", 1
clf1 = svm.SVC(C=c_value1, kernel=kernel1, gamma='scale')
clf1.fit(X_train, y_train)

# Score the evaluation set; predict() already returns a 1-D array, so no transpose is needed
predictions_test = pd.Series(clf1.predict(X_test))
confusion_matrix = getConfusionMatrix(predictions_test, y_test)
accuracy1 = computeAccuracy(predictions_test, y_test)

svm_values.loc[0] = [kernel1, c_value1, accuracy1]

words_now = datetime.datetime.now()       # end of the timed section
elapsed_seconds = (words_now - words_then).total_seconds()
print("Training runtime: ", divmod(elapsed_seconds, 60))   # (minutes, seconds)
print(clf1)
print("\nAccuracy:", accuracy1, "\n")
print("Confusion Matrix:")
confusion_matrix


Training runtime:  (0.0, 0.103722)
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Accuracy: 0.7876106194690266 

Confusion Matrix:


Actual,0.0,1.0
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,203,68
1.0,124,509


## Parameter and kernel tuning

In [33]:
# c)  Evaluate different values for C and different kernels to find better performing classifiers

# Take a start stamp inside this cell: timing from the previous cell's end stamp would also
# count any pause between running the two cells.
tuning_start = datetime.datetime.now()

# Row 0 of svm_values is the baseline written by the default-SVM cell. The table is rebuilt
# from that row plus the rows collected below, so re-running this cell does not keep
# appending duplicates of the same grid.
baseline_row = svm_values.iloc[0].tolist()

tuning_rows = []
for kernel in ["rbf", "poly", "linear", "sigmoid"]:
    for c_value in np.logspace(-3, 2, 6):   # C = 0.001, 0.01, 0.1, 1, 10, 100
        # TODO SVM with C=0.001 seem to predict only 1's -> error in computeAccuracy
        clf = svm.SVC(kernel=kernel, C=c_value, gamma="scale", cache_size=500)
        clf.fit(X_train, y_train)
        predictions = pd.Series(clf.predict(X_test).T)
        accuracy = computeAccuracy(predictions, y_test)

        tuning_rows.append([kernel, c_value, accuracy])

# Build the frame once from all rows instead of enlarging it one row at a time
svm_values = pd.DataFrame([baseline_row] + tuning_rows, columns=svm_values.columns)

time_now = datetime.datetime.now()
print("\n\nRuntime parameter and kernel testing: ", divmod((time_now - tuning_start).total_seconds(), 60), "\n")

svm_values



Runtime C Parameter testing:  (0.0, 2.105568) 



Unnamed: 0,Kernel,C value,Accuracy
0,rbf,1.0,0.787611
1,rbf,0.001,0.638274
2,rbf,0.01,0.77323
3,rbf,0.1,0.775442
4,rbf,1.0,0.787611
5,rbf,10.0,0.792035
6,rbf,100.0,0.790929
7,poly,0.001,0.769912
8,poly,0.01,0.775442
9,poly,0.1,0.775442
