In [18]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_predict, cross_validate, cross_val_score
from sklearn.model_selection import KFold
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from pprint import pprint
from sklearn.svm import SVC

In [46]:
# Load the Iris dataset bundled with scikit-learn and show the shape of its feature matrix.
datos = datasets.load_iris()
print(datos.data.shape)

(150, 4)


In [47]:
# Pull the feature matrix "X" and the target vector "y" out of the loaded dataset.
X, y = datos.data, datos.target

In [48]:
# Outer partition: hold out 20% of the samples as the test split (fixed seed so the
# split is reproducible); the remaining 80% is the training split.
X_train, X_test, y_train, y_test = train_test_split(
    datos.data,
    datos.target,
    test_size=0.2,
    random_state=42,
)
print('Train dimensions: ', X_train.shape)
print('Test dimensions:  ', X_test.shape)

Train dimensions:  (120, 4)
Test dimensions:   (30, 4)


In [53]:
# Standardize the features: fit the scaler on the training split only, then apply
# that same fitted scaler to both the training and the test split.
standardizer = preprocessing.StandardScaler()
standardizer.fit(X_train)
X_train = standardizer.transform(X_train)
X_test = standardizer.transform(X_test)

In [61]:
# Define the classifier that will be cross-validated:
# an SVM with a Gaussian (RBF) kernel, C=10 and gamma='auto'.
alg = SVC(C=10, gamma='auto', kernel='rbf')

In [62]:
# Inner cross-validation on the training split.
# "cross_val_predict" gives, for every training sample, the prediction obtained while
# that sample belonged to one of the 5 validation folds.
# NOTE(review): X_train was standardized earlier with a scaler fitted on the whole
# training split, so the validation folds are not completely unseen — consider
# wrapping scaler + classifier in a Pipeline.
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(alg, X_train, y_train, cv=inner_cv)
print(y_pred)
print(y_train)

[0 0 1 0 0 2 1 0 0 0 2 1 1 0 0 1 1 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1
 2 0 1 2 0 2 2 1 1 2 1 0 1 2 0 0 1 2 0 2 0 0 2 1 2 2 2 2 1 0 0 2 2 0 0 0 2
 2 0 2 2 0 1 1 2 1 2 0 2 1 2 1 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1
 1 2 2 0 1 1 0 1 2]
[0 0 1 0 0 2 1 0 0 0 2 1 1 0 0 1 2 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1
 2 0 1 2 0 2 2 1 1 2 1 0 1 2 0 0 1 1 0 2 0 0 1 1 2 1 2 2 1 0 0 2 2 0 0 0 1
 2 0 2 2 0 1 1 2 1 2 0 2 1 2 1 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1
 1 2 2 0 1 2 0 1 2]


In [57]:
# Report the cross-validated scores; precision, recall and F1 are micro-averaged.
micro = {"average": "micro"}
cv_accuracy = metrics.accuracy_score(y_train, y_pred)
cv_precision = metrics.precision_score(y_train, y_pred, **micro)
cv_recall = metrics.recall_score(y_train, y_pred, **micro)
cv_f1 = metrics.f1_score(y_train, y_pred, **micro)

print("Exactitud: %.3f" % cv_accuracy)  # accuracy
print("Precisión: %.3f" % cv_precision)  # precision
print("Sensibilidad: %.3f" % cv_recall)  # recall (sensitivity)
print("F1-score: %.3f" % cv_f1)  # F-score

Exactitud: 0.950
Precisión: 0.950
Sensibilidad: 0.950
F1-score: 0.950


In [13]:
# Confusion matrix of the cross-validated predictions (true labels first, predictions second).
cv_confusion = metrics.confusion_matrix(y_train, y_pred)
print("Matriz de confusión:\n", cv_confusion)

Matriz de confusión:
 [[40  0  0]
 [ 0 37  4]
 [ 0  2 37]]


In [14]:
# Per-class precision / recall / F1 table for the cross-validated predictions.
cv_report = metrics.classification_report(y_train, y_pred)
print("Tabla de métricas:\n", cv_report)

Tabla de métricas:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
           1       0.95      0.90      0.92        41
           2       0.90      0.95      0.92        39

    accuracy                           0.95       120
   macro avg       0.95      0.95      0.95       120
weighted avg       0.95      0.95      0.95       120



In [36]:
# Final model: fit an SVC on the whole training split (i.e. every sample used for
# training and validation during cross-validation) and score it on the test split.
# NOTE(review): these hyperparameters (C=1, gamma='scale') differ from the ones that
# were cross-validated above (C=10, gamma='auto') — confirm which set is intended.
model = SVC(C=1, gamma='scale', kernel='rbf').fit(X_train, y_train)

# Evaluate on the held-out test split with the estimator's "score()" method.
test_results = model.score(X_test, y_test)
print('Exactitud en test: ', np.round(100 * test_results, 4), '%')

Exactitud en test:  100.0 %


In [37]:
# obtenemos las predicciones del test para extraer el resto de métricas de interés
y_pred_test = model.predict(X_test) # Extraer predicciones
print(metrics.classification_report(y_test, y_pred_test)) # Aplicar el método de classification_report()
print(metrics.confusion_matrix(y_test, y_pred_test)) # Extraer la matriz de confusión

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


### Otro ejercicio

In [65]:
# Second exercise: load a tabular (column-based) dataset from a CSV file.
csv = pd.read_csv('./res/Dataset_spine_2C.csv', sep=',')
print(csv.head())

# The file parses with an extra trailing column ("Unnamed: 13") after the label column,
# so slicing by position would leave the label inside X and use that extra column as y.
# Select the columns by name instead: every column before the label is a feature.
TARGET_COL = 'Class_att'
feature_cols = list(csv.columns[:csv.columns.get_loc(TARGET_COL)])

data = np.array(csv)  # raw array kept for any later cell that still reads it
X = csv[feature_cols].to_numpy(dtype=float)
y = csv[TARGET_COL].to_numpy()


        Col1       Col2       Col3       Col4        Col5       Col6  \
0  63.027817  22.552586  39.609117  40.475232   98.672917  -0.254400   
1  39.056951  10.060991  25.015378  28.995960  114.405425   4.564259   
2  68.832021  22.218482  50.092194  46.613539  105.985135  -3.530317   
3  69.297008  24.652878  44.311238  44.644130  101.868495  11.211523   
4  49.712859   9.652075  28.317406  40.060784  108.168725   7.918501   

       Col7     Col8     Col9     Col10      Col11    Col12 Class_att  \
0  0.744503  12.5661  14.5386  15.30468 -28.658501  43.5123  Abnormal   
1  0.415186  12.8874  17.5323  16.78486 -25.530607  16.1102  Abnormal   
2  0.474889  26.8343  17.4861  16.65897 -29.031888  19.2221  Abnormal   
3  0.369345  23.5603  12.7074  11.42447 -30.470246  18.8329  Abnormal   
4  0.543360  35.4940  15.9546   8.87237 -16.378376  24.9171  Abnormal   

                                         Unnamed: 13  
0                                                NaN  
1                 

In [None]:
# Outer partition for this exercise: hold out 20% of the samples as the test split.
# Split the X / y loaded from the CSV above — the original cell was copied from the
# first exercise and still split the Iris data (datos.data / datos.target).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Train dimensions: ', np.shape(X_train))
print('Test dimensions:  ', np.shape(X_test))