In [8]:
import re
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import altair as alt

from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import AgglomerativeClustering

import gensim.downloader as api
loaded_glove_model = api.load("glove-wiki-gigaword-300")




In [16]:
# Load the preprocessed corpus, then build `df_temp` without the excluded documents.
data_proprocessed = "../Data_csv/data_preprocessed.csv"
data_df = pd.read_csv(data_proprocessed)

# Titles of the documents that are removed before clustering.
excluded_titles = [
    'AI Ethics Resources',
    'Recommendation of the Council on Artificial Intelligence',
]

# Show the rows that are about to be dropped.
for excluded_title in excluded_titles:
    print(data_df[data_df['Name of the document'] == excluded_title])

# Keep every row whose title is not excluded and renumber the index from 0.
keep_mask = ~data_df['Name of the document'].isin(excluded_titles)
df_temp = data_df[keep_mask].reset_index(drop=True)

# Sanity check: both lookups should now print empty frames.
for excluded_title in excluded_titles:
    print(df_temp[df_temp['Name of the document'] == excluded_title])
# print(df_temp[df_temp['Name of the document'] == 'COMEST Report on Robotics'])

     doc_id Name of the document Institution  \
217     308  AI Ethics Resources      Fastai   

                                                   URL Authors Affiliates  \
217  https://www.fast.ai/2018/09/24/ai-ethics-resou...     NaN        NaN   

    Sector Country Date Keywords  ...  all sources Checked by Unnamed: 27  \
217    NaN     USA  NaN      NaN  ...          1.0        Mel         NaN   

     Unnamed: 28  Unnamed: 29                      text  langue  \
217          NaN          NaN  \n\nRedirect\n\n\n\n\n\n      en   

     text_processed                    tfidf  \
217        Redirect  [0. 0. 0. ... 0. 0. 0.]   

                             categorie Institution  
217  Entreprises technologiques et multinationales  

[1 rows x 36 columns]
     doc_id                               Name of the document  \
345     518  Recommendation of the Council on Artificial In...   

                                           Institution  \
345  Organisation for Economic Co-operati

In [19]:
# Embedding functions
def tfidf(df):
    """Return a dense TF-IDF matrix for the ``text_processed`` column of ``df``.

    A new ``TfidfVectorizer`` with default settings is fitted on the column
    each time the function is called; the sparse result is converted to a
    dense array with ``toarray()``.
    """
    vectorizer = TfidfVectorizer()
    sparse_matrix = vectorizer.fit_transform(df['text_processed'])
    return sparse_matrix.toarray()

def glove_embeddings(df):
    """Embed each document as the mean of the GloVe vectors of its known words.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text_processed`` column. Each cell is split on
        whitespace; words missing from ``loaded_glove_model`` are ignored.

    Returns
    -------
    numpy.ndarray
        One row per document and ``loaded_glove_model.vector_size`` columns.
        A document with no known word (or a non-string cell such as NaN)
        gets an all-zero row. An empty frame yields an array of shape
        ``(0, vector_size)``.
    """
    vector_size = loaded_glove_model.vector_size
    all_embeddings = []
    for text in df['text_processed']:
        # Missing text is not a string; treat it like a document with no
        # known words instead of crashing on ``.split()``.
        tokens = text.split() if isinstance(text, str) else []
        word_vectors = [
            loaded_glove_model[word]
            for word in tokens
            if word in loaded_glove_model
        ]
        if word_vectors:
            all_embeddings.append(np.mean(word_vectors, axis=0))
        else:
            all_embeddings.append(np.zeros(vector_size))

    # Build the matrix once, after the loop, rather than on every iteration.
    if not all_embeddings:
        return np.empty((0, vector_size))
    return np.array(all_embeddings)

# Dimension reduction functions
def tsne(embeddings):
    """Project ``embeddings`` onto two dimensions with t-SNE.

    The reducer uses ``learning_rate='auto'`` and PCA initialisation; no
    random seed is set here.
    """
    reducer = TSNE(n_components=2, learning_rate='auto', init='pca')
    return reducer.fit_transform(embeddings)

def pca(embeddings):
    """Reduce ``embeddings`` to two components with ``TruncatedSVD``.

    NOTE(review): despite the name, this uses ``TruncatedSVD`` rather than
    scikit-learn's ``PCA`` class - confirm that this is intended.
    """
    return TruncatedSVD(n_components=2).fit_transform(embeddings)

# Clustering functions
def Kmeans_fct(n, embeddings):
    """Cluster ``embeddings`` into ``n`` groups with K-Means.

    The estimator is seeded with ``random_state=0``. Returns the fitted
    estimator's ``labels_`` attribute.
    """
    model = KMeans(n_clusters=n, random_state=0)
    model.fit(embeddings)
    return model.labels_


def correspondence_analysis(embeddings, n_components=2):
    """Reduce ``embeddings`` to ``n_components`` dimensions with ``TruncatedSVD``.

    Returns the transformed array produced by ``fit_transform``.
    """
    reducer = TruncatedSVD(n_components=n_components)
    return reducer.fit_transform(embeddings)

def hierarchical_clustering(n_clusters, embeddings):
    """Cluster ``embeddings`` into ``n_clusters`` groups by agglomerative clustering.

    Uses Ward linkage with the Euclidean metric and returns the labels from
    ``fit_predict``.
    """
    model = AgglomerativeClustering(
        n_clusters=n_clusters,
        metric='euclidean',
        linkage='ward',
    )
    return model.fit_predict(embeddings)

# Validation function
def score_function(embeddings, labels):
    """Score a clustering of ``embeddings`` described by ``labels``.

    Returns a 3-tuple, in this order:
    ``(silhouette, davies_bouldin, calinski_harabasz)``.
    """
    return (
        silhouette_score(embeddings, labels),
        davies_bouldin_score(embeddings, labels),
        calinski_harabasz_score(embeddings, labels),
    )

def display_ca(embeddings, df, labels):
    """Plot the documents on the 2-D output of ``correspondence_analysis``.

    Points are coloured by ``labels``; the tooltip shows the institution
    category and the document title taken from ``df``. The chart is written
    to ``chart.html`` and then shown.
    """
    coords = correspondence_analysis(embeddings)
    plot_frame = pd.DataFrame({
        'x': coords[:, 0],
        'y': coords[:, 1],
        'institution': df['categorie Institution'],
        'title': df["Name of the document"],
        'labels': labels,
    })

    # Lift Altair's row limit before building the chart.
    alt.data_transformers.disable_max_rows()

    cluster_colour = alt.Color('labels:N', scale=alt.Scale(scheme='category20'))
    scatter = alt.Chart(plot_frame).mark_circle(size=200)
    scatter = scatter.encode(
        x="x",
        y="y",
        color=cluster_colour,
        tooltip=['institution', "title"],
    )
    scatter = scatter.interactive().properties(width=500, height=500)

    scatter.save('chart.html')
    scatter.show()
    

    

def display_tsne(embeddings, df, labels, perplexity=50.0):
    """Plot the documents on a 2-D t-SNE projection of ``embeddings``.

    Parameters
    ----------
    embeddings : array-like
        One row per document.
    df : pandas.DataFrame
        Supplies the ``categorie Institution`` and ``Name of the document``
        columns used in the tooltip.
    labels : array-like
        Cluster label of each document; used for the point colour.
    perplexity : float, default 50.0
        Requested t-SNE perplexity. scikit-learn requires it to be smaller
        than the number of samples, so it is lowered to ``n_samples - 1``
        for small corpora instead of letting the call fail.

    The shape of the projection is printed, and the chart is written to
    ``chart.html`` and then shown.
    """
    n_samples = np.shape(embeddings)[0]
    effective_perplexity = perplexity
    if 1 < n_samples <= perplexity:
        # Keep the projection usable for corpora of `perplexity` docs or fewer.
        effective_perplexity = float(n_samples - 1)

    docs_tsne_th = TSNE(n_components=2, learning_rate='auto',
                        init='random', metric='cosine',
                        perplexity=effective_perplexity).fit_transform(embeddings)
    print(docs_tsne_th.shape)

    data_th = pd.DataFrame({'x': docs_tsne_th[:, 0],
                            'y': docs_tsne_th[:, 1],
                            'institution': df['categorie Institution'],
                            'title': df["Name of the document"],
                            'labels': labels
                            })
    # Lift Altair's row limit before building the chart.
    alt.data_transformers.disable_max_rows()
    chart = alt.Chart(data_th).mark_circle(size=200).encode(
        x="x", y="y", color=alt.Color('labels:N',
                                      scale=alt.Scale(scheme='category20')),
        tooltip=['institution', "title"]
        ).interactive().properties(
        width=500,
        height=500
    )
    chart.save('chart.html')
    chart.show()


# Clustering pipeline
def pipeline(dataframe, embedding_method, clustering_method, taille_cluster, reduction_method=display_tsne):
    """Embed the documents, cluster them for each candidate size, score and plot.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Passed to ``embedding_method`` and to ``reduction_method``.
    embedding_method : callable
        ``embedding_method(dataframe)`` returns the embedding matrix.
    clustering_method : callable
        ``clustering_method(n_clusters, embeddings)`` returns the labels.
    taille_cluster : sequence of two ints
        ``[start, stop]``; cluster counts ``start .. stop - 1`` are tried.
    reduction_method : callable, default ``display_tsne``
        ``reduction_method(embeddings, dataframe, labels)`` draws the chart;
        it is called once, with the labels of the last cluster count.

    Returns
    -------
    tuple
        ``(silhouette, davies_bouldin, calinski_harabasz)`` for the last
        cluster count tried.

    Raises
    ------
    ValueError
        If ``taille_cluster`` describes an empty range.
    """
    first_k, stop_k = taille_cluster[0], taille_cluster[1]
    if first_k >= stop_k:
        # Fail before the costly embedding step rather than on an unbound
        # `labels` variable afterwards.
        raise ValueError(
            f"taille_cluster must describe a non-empty range, got {taille_cluster!r}"
        )

    print("start embedding")
    embeddings = embedding_method(dataframe)

    labels = None
    scores = None
    for n_clusters in range(first_k, stop_k):
        print("clustering")
        print(f"n_clusters: {n_clusters}")
        labels = clustering_method(n_clusters, embeddings)
        # Score every candidate so that no clustering is computed and then
        # silently thrown away.
        print("scoring")
        scores = score_function(embeddings, labels)
        print(f"silhouette_score: {scores[0]}, davies_bouldin_score: {scores[1]}, calinski_harabasz_score: {scores[2]}")

    reduction_method(embeddings, dataframe, labels)
    return scores

# Run the pipeline on the filtered corpus: GloVe embeddings, K-Means, t-SNE chart.
# taille_cluster=[10, 11] covers range(10, 11), i.e. only 10 clusters.
pipeline(
    dataframe=df_temp,
    embedding_method=glove_embeddings,
    clustering_method=Kmeans_fct,
    taille_cluster=[10, 11],
    reduction_method=display_tsne,
)


start embedding
clustering
scoring
silhouette_score: 0.11423221975564957, davies_bouldin_score: 1.323591261558748, calinski_harabasz_score: 420.1674924216384
(472, 2)


(0.11423222, 1.323591261558748, 420.1674924216384)