Import des bibliotheques

In [60]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.cluster import normalized_mutual_info_score, adjusted_rand_score
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.cluster import KMeans

'''
Variables:
---------

corpus : list of documents
embeddings : documents embeddings of size NxM (N : number of documents, M : embedding dimension)
red_emd : reduced embeddings matrix using dimentionality reduction
k : number of clusters
labels : documents labels
pred : list of clustering predicted clusters

''';

Définition des fonctions de réduction de dimension et de clustering :

In [68]:
def dim_red(mat, p):
  '''
    Perform dimensionality reduction

    Input:
    -----
        mat : NxM list
        p : number of dimensions to keep
    Output:
    ------
        red_mat : NxP list such that p<<m
  '''
  reducer = UMAP(n_components=p, random_state=42)
  red_emb = reducer.fit_transform(mat)

  red_mat = red_emb[:, :p]
  return red_mat



def clust(mat, k):
    '''
    Perform clustering

    Input:
    -----
        mat : input list
        k : number of cluster
    Output:
    ------
        pred : list of predicted labels
    '''
    # Create a KMeans instance with k clusters: model
    model = KMeans(n_clusters=k)

    # Fit model to samples
    result = model.fit(mat)
    pred = result.labels_

    return pred


Import des données et préparation :


In [69]:
ng20 = fetch_20newsgroups(subset='test')
corpus = ng20.data[:2000]
labels = ng20.target[:2000]
k = len(set(labels))


In [70]:
print(k)

20


Embeddings avec Sentence Transformers :

In [56]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(corpus)


In [71]:
# perform dimentionality reduction
red_emb = dim_red(embeddings, 20)

# perform clustering
pred = clust(red_emb, k)

# evaluate clustering results
nmi_score = normalized_mutual_info_score(pred,labels)
ari_score = adjusted_rand_score(pred,labels)

print(f'NMI: {nmi_score:.2f} \nARI: {ari_score:.2f}')

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


NMI: 0.48 
ARI: 0.29


In [78]:

def stat_model(n_expr, mat, k, labels):
  means_nmi_score=0.
  varience_nmi_score=0.
  means_ari_score=0.
  varience_ari_score=0.

  results_nmi_score=[]
  results_ari_score=[]
  for i in range(n_expr):
    pred = clust(mat, k)
    results_nmi_score = normalized_mutual_info_score(pred,labels)
    results_ari_score = adjusted_rand_score(pred,labels)
  means_nmi_score = np.mean(results_nmi_score)
  means_ari_score = np.mean(results_ari_score)
  varience_nmi_score = np.std(results_nmi_score)
  varience_ari_score = np.std(results_ari_score)

  return means_nmi_score, varience_nmi_score, means_ari_score, varience_ari_score

In [81]:
n_expr=30
resultats = stat_model(n_expr, mat = red_emb, k = 20, labels = labels)



In [82]:
# Calculer la moyenne et l'écart type pour NMI
nmi_mean = resultats[0]
nmi_std = resultats[1]

# Calculer la moyenne et l'écart type pour ARI
ari_mean = resultats[2]
ari_std = resultats[3]

# Afficher les résultats
print(f'\nMoyenne NMI: {nmi_mean:.2f}, Écart type NMI +-: {nmi_std:.2f}')
print(f'Moyenne ARI: {ari_mean:.2f}, Écart type ARI +-: {ari_std:.2f}')


Moyenne NMI: 0.48, Écart type NMI +-: 0.00
Moyenne ARI: 0.29, Écart type ARI +-: 0.00
