# Clustering de documents

## Imports

In [None]:
import collections
import os
import string
import sys

import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
from pprint import pprint
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine

In [None]:
import nltk

# Resources needed further down: the 'punkt' tokenizer models for
# word_tokenize and the 'stopwords' corpus for stopwords.words('french').
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Directory from which the documents are listed and read
data_path = "../data/txt_clean/"

## Charger tous les fichiers des années 1948, 1949 et 1950 et en créer des listes de textes

In [None]:
# List the directory once (sorted for a stable order), then keep the files of
# each year based on the "_<year>" marker in their name.
all_files = sorted(os.listdir(data_path))
files1948 = [f for f in all_files if "_1948" in f]
files1949 = [f for f in all_files if "_1949" in f]
files1950 = [f for f in all_files if "_1950" in f]

In [None]:
# Sample of file names: the first five of each year
print(files1948[:5])
print(files1949[:5])
print(files1950[:5])

In [None]:
def _read_texts(filenames):
    """Return the UTF-8 content of each named file under data_path, in order."""
    contents = []
    for name in filenames:
        # The with-block closes each file as soon as it has been read,
        # instead of leaving the handles open until garbage collection.
        with open(os.path.join(data_path, name), "r", encoding="utf-8") as handle:
            contents.append(handle.read())
    return contents


texts1948 = _read_texts(files1948)
texts1949 = _read_texts(files1949)
texts1950 = _read_texts(files1950)

In [None]:
# Sample text: first 400 characters of the first 1948 document
texts1948[0][:400]

## Vectoriser les documents à l'aide de TF-IDF

In [None]:
# Pre-processing function, passed to TfidfVectorizer as its tokenizer.
# The translation table is built once; it turns every character of
# string.punctuation into a space.
_PUNCTUATION_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)


def preprocessing(text, stem=True):
    """Strip ASCII punctuation from *text* and split it into word tokens.

    Punctuation marks are replaced by spaces rather than deleted, so that
    words joined by an apostrophe or a hyphen are split instead of fused.
    Only the characters of ``string.punctuation`` are handled; other
    punctuation is left to ``word_tokenize``.

    Args:
        text: The document to tokenize.
        stem: Unused; kept so existing callers keep working.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    # str.translate needs a table from str.maketrans: passing
    # string.punctuation directly leaves punctuation in place.
    text = text.translate(_PUNCTUATION_TO_SPACE)
    return word_tokenize(text)

### Instancier le modèle TF-IDF avec ses arguments

In [None]:
# TF-IDF configuration: custom tokenizer, French stop words, lowercased text.
# max_df / min_df drop terms found in more than 50% or fewer than 10% of the
# documents.
french_stop_words = stopwords.words('french')
vectorizer = TfidfVectorizer(
    lowercase=True,
    tokenizer=preprocessing,
    stop_words=french_stop_words,
    min_df=0.1,
    max_df=0.5,
)

### Construire la matrice de vecteurs à l'aide de la fonction `fit_transform`

In [None]:
# Each fit_transform call refits the same vectorizer on one year's texts, in
# the order 1948, 1949, 1950, so the vectorizer ends up fitted on 1950.
tfidf_vectors_1948, tfidf_vectors_1949, tfidf_vectors_1950 = (
    vectorizer.fit_transform(year_texts)
    for year_texts in (texts1948, texts1949, texts1950)
)

## Appliquer un algorithme de clustering sur les vecteurs TF-IDF des documents

Pour en savoir plus sur le KMeans clustering :
- https://medium.com/dataseries/k-means-clustering-explained-visually-in-5-minutes-b900cc69d175

### Définir un nombre de clusters

In [None]:
# Number of clusters requested from K-Means
N_CLUSTERS = 3

### Instancier le modèle K-Means et ses arguments

In [None]:
# Fixed seed: K-Means starts from randomly chosen centroids, so without it the
# cluster labels (and the documents grouped under each label) can change from
# one run to the next.
km_model = KMeans(n_clusters=N_CLUSTERS, random_state=42)

### Appliquer le clustering à l'aide de la fonction `fit_predict`

In [None]:
# Fit K-Means on the 1950 TF-IDF matrix; `clusters` holds one label per document
clusters = km_model.fit_predict(tfidf_vectors_1950)

In [None]:
# Group the 1950 file names under the cluster label assigned to each document
# (labels come back in the same order as the files were vectorized).
clustering = collections.defaultdict(list)

for filename, label in zip(files1950, clusters):
    clustering[label].append(filename)

In [None]:
# Show the cluster label -> file names mapping as a plain dict
pprint(dict(clustering))

## Visualiser les clusters

### Réduire les vecteurs à 2 dimensions à l'aide de l'algorithme PCA
Cette étape est nécessaire afin de visualiser les documents dans un espace 2D

https://fr.wikipedia.org/wiki/Analyse_en_composantes_principales

In [None]:
# Densify the TF-IDF matrix, then project it onto two principal components
# so the documents can be drawn on a plane.
dense_vectors_1950 = tfidf_vectors_1950.toarray()
pca = PCA(n_components=2)
reduced_vectors_1950 = pca.fit_transform(dense_vectors_1950)

In [None]:
# First ten rows of the reduced vectors
reduced_vectors_1950[:10]

### Générer le plot

In [None]:
x_axis = reduced_vectors_1950[:, 0]
y_axis = reduced_vectors_1950[:, 1]

plt.figure(figsize=(10, 10))
# One point per document, coloured by its cluster label
scatter = plt.scatter(x_axis, y_axis, s=100, c=clusters)

# Add the centroids, projected with the same fitted PCA as the documents
centroids = pca.transform(km_model.cluster_centers_)
plt.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=100, linewidths=2, color='black')

# Add the legend. The handles from legend_elements() follow the sorted distinct
# label values, so the labels are taken from np.unique (also sorted) rather
# than from a set, whose iteration order is not guaranteed.
legend_handles = scatter.legend_elements()[0]
legend_labels = [str(label) for label in np.unique(clusters)]
plt.legend(handles=legend_handles, labels=legend_labels, title="Clusters")

## Analyse de sentiments sur les clusters

In [None]:
import shutil
import os

# Define the source and destination folders
source_folder = "../data/txt_clean/"
destination_folder = "../data/clustered_documents_1950/"

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Remove the cluster folders left by a previous run: a document may have moved
# to another cluster since then, and its old copy would otherwise stay behind
# and be counted in the wrong folder.
for entry in os.listdir(destination_folder):
    stale_folder = os.path.join(destination_folder, entry)
    if entry.startswith("Cluster_") and os.path.isdir(stale_folder):
        shutil.rmtree(stale_folder)

# Iterate over the clusters
for label, files in clustering.items():
    # Create a folder for each cluster
    cluster_folder = os.path.join(destination_folder, f"Cluster_{label}")
    os.makedirs(cluster_folder, exist_ok=True)

    # Copy the documents of the cluster to the cluster folder
    for file in files:
        source_file = os.path.join(source_folder, file)
        destination_file = os.path.join(cluster_folder, file)
        shutil.copyfile(source_file, destination_file)


In [None]:
import os
import pandas as pd
from textblob import TextBlob

def calculate_polarity_subjectivity(text):
    """Return the (polarity, subjectivity) pair TextBlob reports for *text*."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity

base_directory = "../data/clustered_documents_1950/"
# NOTE: this rebinds `clusters` (the K-Means labels used by the plotting cell)
# to a list of folder names; re-run the clustering cell before plotting again.
# Sorted so the clusters are always reported in the same order.
clusters = sorted(
    d for d in os.listdir(base_directory)
    if os.path.isdir(os.path.join(base_directory, d))
)

# NOTE(review): TextBlob is called with its default analyzer, while the TF-IDF
# step uses French stop words — confirm the analyzer suits the corpus language.
for cluster in clusters:
    # Score every .txt document of the cluster folder
    results = []
    directory = os.path.join(base_directory, cluster)
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding="utf-8") as file:
                text = file.read()
            polarity, subjectivity = calculate_polarity_subjectivity(text)
            results.append([filename, polarity, subjectivity])

    df = pd.DataFrame(results, columns=['Document', 'Polarity', 'Subjectivity'])
    print(f"\nCluster: {cluster}")
    total_docs = len(df)
    if total_docs == 0:
        # Nothing to report, and the percentages below would divide by zero.
        print("No .txt documents found in this cluster.")
        continue

    pol = df['Polarity']
    subj = df['Subjectivity']
    # (description, boolean mask) for each polarity/subjectivity combination,
    # in the order in which the percentages are printed.
    breakdown = [
        ("0 polarity and 0 subjectivity", (pol == 0) & (subj == 0)),
        ("0 subjectivity and negative polarity", (pol < 0) & (subj == 0)),
        ("0 subjectivity and positive polarity", (pol > 0) & (subj == 0)),
        ("positive subjectivity and 0 polarity", (pol == 0) & (subj > 0)),
        ("positive subjectivity and negative polarity", (pol < 0) & (subj > 0)),
        ("positive subjectivity and positive polarity", (pol > 0) & (subj > 0)),
    ]
    for description, mask in breakdown:
        # int() keeps the arithmetic in plain Python numbers.
        share = int(mask.sum()) / total_docs * 100
        print(f"The percentage of texts with {description} is: {share}%")
    