Comparaison de combinaisons d'hyperparamètres

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
import scipy.io as sp
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, f1_score, accuracy_score, recall_score

# Compare several LocalOutlierFactor hyperparameter combinations on the breastw
# dataset: for every combination, one F1 score is computed per stratified fold
# and stored as a column of `df` (rows = folds, columns = combinations).

dataset = sp.loadmat('breastw.mat')  # Load the dataset from its MATLAB file

X = dataset['X']  # Feature matrix of the samples
y = np.reshape(dataset['y'], -1)  # Labels flattened to 1-D without hard-coding the sample count

# Hyperparameter combinations; position j of each array describes combination j.
# K is used as a fraction of the fold size to derive n_neighbors.
K = np.array([0.5, 0.514, 0.543, 0.571, 0.6, 0.629, 0.12, 0.25])
Distance = np.array(['cityblock', 'manhattan', 'l1', 'euclidean', 'manhattan', 'euclidean', 'l2', 'l2'])
Contamination = np.array([0.357, 0.357, 0.357, 0.357, 0.357, 0.347, 0.22, 0.43])

# 10 stratified folds, matching the 10 rows of scores reported for this analysis.
skf = StratifiedKFold(n_splits=10, shuffle=False, random_state=None)

labels = []         # One column label per combination
score_columns = []  # One list of per-fold F1 scores per combination

for k_ratio, metric, contamination in zip(K, Distance, Contamination):
    # The label format is relied upon by the statistical-test cell; keep it unchanged.
    labels.append('[{},{},{}]'.format(k_ratio, metric, round(contamination, 3)))

    fold_scores = []
    # Only the held-out indices are used: LOF is unsupervised here and is
    # fitted and scored on the same fold.
    for _, test_index in skf.split(X, y):
        X_test, y_test = X[test_index], y[test_index]
        LOF = LocalOutlierFactor(
            n_neighbors=int(round(len(X_test) * k_ratio)),
            algorithm='auto',
            contamination=contamination,
            metric=metric,
        )
        y_pred = LOF.fit_predict(X_test)  # +1 for inliers, -1 for outliers
        # Re-encode predictions as 0 = inlier, 1 = outlier before scoring against y_test.
        y_pred[y_pred > 0] = 0
        y_pred[y_pred < 0] = 1
        fold_scores.append(round(f1_score(y_test, y_pred), 3))

    score_columns.append(fold_scores)

# Build the result table once instead of concatenating inside the loop.
df = pd.DataFrame(np.array(score_columns).T, columns=labels)
display(df)  # Show the F1 scores of the different combinations

Unnamed: 0,"[0.5,cityblock,0.357]","[0.514,manhattan,0.357]","[0.543,l1,0.357]","[0.571,euclidean,0.357]","[0.6,manhattan,0.357]","[0.629,euclidean,0.347]","[0.12,l2,0.22]","[0.25,l2,0.43]"
0,0.9,0.9,0.9,0.85,0.9,0.85,0.364,0.578
1,1.0,1.0,1.0,0.95,1.0,0.95,0.364,0.667
2,0.95,0.95,0.95,0.9,0.9,0.9,0.364,0.711
3,0.85,0.85,0.9,0.9,0.9,0.9,0.125,0.222
4,0.85,0.85,0.85,0.95,0.95,0.95,0.0,0.222
5,1.0,1.0,1.0,1.0,1.0,1.0,0.061,0.444
6,0.95,0.95,0.95,0.95,0.95,0.95,0.182,0.711
7,0.95,0.95,0.95,0.95,0.95,0.95,0.25,0.4
8,1.0,1.0,1.0,1.0,1.0,1.0,0.061,0.273
9,0.95,0.95,0.95,0.95,0.95,0.95,0.242,0.4


Test statistique

In [8]:
from scipy import stats
from scipy.stats import f_oneway
# Aliased as `sph` rather than `sp`: the first cell binds `sp` to scipy.io, and
# rebinding it here would make `sp.loadmat` fail whenever that cell is re-run.
import scikit_posthocs as sph

# One sample of per-fold F1 scores per hyperparameter combination (one per
# column of df), taken in column order instead of repeating every label by hand.
samples = [df[column] for column in df.columns]

# Kruskal-Wallis H-test across all combinations.
t, p = stats.kruskal(*samples)

print("p-value = {}".format(round(p,3)))

# Conover post-hoc test: pairwise p-values between combinations. The result is
# the cell's last expression so it is rendered as the cell output.
sph.posthoc_conover(samples)

p-value = 0.0


Unnamed: 0,1,2,3,4,5,6,7,8
1,1.0,1.0,0.9198105,0.8143079,0.8404425,0.8143079,3.045405e-10,7.290455e-08
2,1.0,1.0,0.9198105,0.8143079,0.8404425,0.8143079,3.045405e-10,7.290455e-08
3,0.9198105,0.9198105,1.0,0.7372654,0.9198105,0.7372654,1.912798e-10,4.699776e-08
4,0.8143079,0.8143079,0.7372654,1.0,0.6628198,1.0,8.958136e-10,2.007586e-07
5,0.8404425,0.8404425,0.9198105,0.6628198,1.0,0.6628198,1.199661e-10,3.021291e-08
6,0.8143079,0.8143079,0.7372654,1.0,0.6628198,1.0,8.958136e-10,2.007586e-07
7,3.045405e-10,3.045405e-10,1.912798e-10,8.958136e-10,1.199661e-10,8.958136e-10,1.0,0.2246852
8,7.290455e-08,7.290455e-08,4.699776e-08,2.007586e-07,3.021291e-08,2.007586e-07,0.2246852,1.0
