Comparaison des modèles non-supervisés optimisés

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
import scipy.io as sp
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, f1_score, accuracy_score, recall_score
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

# --- Configuration -----------------------------------------------------------
SEED = 42      # Shared seed for the stochastic estimators (IsolationForest, KMeans)
N_SPLITS = 10  # Number of stratified folds

dataset = sp.loadmat('breastw.mat')  # Load the dataset

X = dataset['X']  # Features of every point in the dataset
y = dataset['y']  # Labels of every point (presumably 1 = anomaly, 0 = normal -- TODO confirm)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=False)  # Split the data into N_SPLITS stratified folds

# Per-fold F1 scores, one list per algorithm. Collecting into lists and building
# the DataFrame once avoids growing arrays with np.append inside the loop.
scores = {"LOF": [], "Isolation Forest": [], "Kmeans": []}

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]  # Training / test features for this fold
    y_train, y_test = y[train_index], y[test_index]
    y_true = y_test.ravel()  # Flatten in case the labels are stored as a column vector

    # NOTE(review): LOF and KMeans are fitted on the test fold itself, whereas
    # IsolationForest is fitted on the training fold and only predicts on the
    # test fold. Confirm this asymmetry is intended for the comparison.

    # Tuned Local Outlier Factor. fit_predict returns -1 for outliers and +1 for
    # inliers; remap to 1 = outlier, 0 = inlier to match the label encoding.
    LOF = LocalOutlierFactor(n_neighbors=round(len(X_test)*0.6), algorithm='auto', contamination=0.357, metric="manhattan")
    y_pred_LOF = np.where(LOF.fit_predict(X_test) < 0, 1, 0)

    # Tuned Isolation Forest, seeded so that the results are reproducible.
    model = IsolationForest(contamination=0.4, n_estimators=30, max_samples=100, max_features=7, random_state=SEED)
    model.fit(X_train)
    y_pred_iso = np.where(model.predict(X_test) < 0, 1, 0)  # Same -1/+1 -> 1/0 remapping

    # Tuned KMeans, seeded so that the results are reproducible.
    # NOTE(review): recent scikit-learn releases renamed algorithm='full' to
    # 'lloyd'; switch the value if the installed version rejects 'full'.
    kmeans = KMeans(n_clusters=2, init='k-means++', n_init=11, max_iter=360, algorithm='full', random_state=SEED).fit(X_test)
    y_pred_kmeans = kmeans.labels_

    F1_LOF = round(f1_score(y_true, y_pred_LOF), 3)  # F1 score of each detector on this fold
    F1_iso = round(f1_score(y_true, y_pred_iso), 3)

    # KMeans cluster ids are arbitrary: cluster 0 may just as well be the anomaly
    # cluster. F1 is NOT complementary under label inversion (1 - F1 is not the
    # F1 of the flipped labels), so score both assignments explicitly and keep
    # the better one. This uses the ground truth to pick the mapping, so the
    # KMeans score is an optimistic (best-case) estimate.
    f1_direct = f1_score(y_true, y_pred_kmeans)
    f1_flipped = f1_score(y_true, 1 - y_pred_kmeans)
    if f1_flipped > f1_direct:
        y_pred_kmeans = 1 - y_pred_kmeans
    F1_kmeans = round(max(f1_direct, f1_flipped), 3)

    scores["LOF"].append(F1_LOF)
    scores["Isolation Forest"].append(F1_iso)
    scores["Kmeans"].append(F1_kmeans)

# One row per fold, one column per algorithm
Comparaison = pd.DataFrame(scores)
display(Comparaison)  # Show the comparison table

Unnamed: 0,LOF,Isolation Forest,Kmeans
0,0.939,0.842,0.917
1,0.98,0.906,0.8
2,0.939,0.941,1.0
3,0.894,0.939,0.942
4,0.958,0.906,1.0
5,0.958,0.906,0.971
6,0.958,0.96,0.957
7,1.0,0.96,0.979
8,0.958,0.958,0.979
9,1.0,0.96,1.0


Test statistique

In [6]:
import scikit_posthocs as sp
from scipy import stats

# Omnibus test: Kruskal-Wallis H-test on the per-fold F1 scores of the three
# algorithms. It is a rank-based test of whether the samples come from the same
# distribution; the value it returns is the H statistic (not a t statistic).
# NOTE(review): the three columns are measured on the same folds, i.e. they are
# paired samples, while Kruskal-Wallis treats the groups as independent. A
# repeated-measures test (e.g. Friedman) may be more appropriate -- confirm.
# NOTE: in this cell `sp` is scikit_posthocs; it rebinds the `sp` alias that the
# first cell uses for scipy.io.
algo_names = ['LOF', 'Isolation Forest', 'Kmeans']
f1_samples = [Comparaison[name] for name in algo_names]

h_stat, p = stats.kruskal(*f1_samples)
print('H statistic: %.3f' % h_stat)
print('p value: %.20f' % p)

# Pairwise post-hoc comparison (Conover test) between the algorithms. With a
# list input the groups come back numbered 1..k in input order, so relabel the
# rows and columns with the algorithm names to make the table self-explanatory.
pairwise_p = sp.posthoc_conover(f1_samples)
pairwise_p.index = algo_names
pairwise_p.columns = algo_names
pairwise_p

t statistic: 3.716
p value: 0.15597852830033173688


Unnamed: 0,1,2,3
1,1.0,0.173074,0.601758
2,0.173074,1.0,0.0645
3,0.601758,0.0645,1.0
