## Chargement des données

In [None]:
import os
from getpass import getpass

# Ask once for the cache directory and point both the datasets cache and the
# hub cache at it, then read the access token without echoing it.
cache_dir = input("Indicate path to all Hugging Face caches:")
os.environ.update({"HF_DATASETS_CACHE": cache_dir, "HF_HUB_CACHE": cache_dir})
os.environ["HF_TOKEN"] = getpass("Enter your HuggingFace token:")

In [None]:
from rank_comparia.utils import load_comparia

# Load the votes dataset identified by the repository id below.
# NOTE(review): load_comparia is a project helper whose return type is not visible
# here; later cells call polars-style methods (with_columns / unique) on `votes`.
votes = load_comparia("ministere-culture/comparia-votes")

## Import de librairies

In [None]:
import polars as pl

# import torch
from pathlib import Path

import umap
import hdbscan
import nltk
import pickle

from bertopic import BERTopic
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from matplotlib import pyplot as plt

# Fetch the NLTK stop-word corpus so stopwords.words("french") can be used below.
nltk.download("stopwords")

In [None]:
# Restrict CUDA-based libraries in this process to GPU 0.
# NOTE(review): this is set after the deep-learning imports above; it presumably
# still takes effect as long as CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# torch.cuda.is_available()
# https://maartengr.github.io/BERTopic/getting_started/best_practices/best_practices.html#training

## Utils

In [None]:
def get_topic_model(docs, model_name, min_cluster_size):
    """Fit a BERTopic model on ``docs`` using precomputed sentence embeddings.

    Args:
        docs: Documents to cluster; passed to ``SentenceTransformer.encode`` and
            to ``BERTopic.fit_transform``.
        model_name: Name or path of the sentence-transformers model to load.
        min_cluster_size: Value forwarded to HDBSCAN's ``min_cluster_size``.

    Returns:
        A ``(topic_model, embeddings, topics, probs)`` tuple: the fitted BERTopic
        instance, the document embeddings, and the two values returned by
        ``fit_transform``.
    """
    sentence_model = SentenceTransformer(model_name)
    # Embed once up front so the same vectors are used for fitting and can be
    # returned to the caller.
    embeddings = sentence_model.encode(docs, show_progress_bar=False)
    vectorizer_model = CountVectorizer(stop_words=get_stop_words(), min_df=2, ngram_range=(1, 3))

    # Reduce the embedding dimension before clustering.
    umap_model = umap.UMAP(n_neighbors=20, n_components=10, min_dist=0.0, metric="cosine", low_memory=False)

    # Cluster the reduced embeddings.
    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=1,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
    )

    topic_model = BERTopic(
        # Pipeline models
        embedding_model=sentence_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        # Hyperparameters
        top_n_words=10,
        verbose=True,
    )

    # Train the model on the precomputed embeddings.
    topics, probs = topic_model.fit_transform(docs, embeddings)

    return topic_model, embeddings, topics, probs

In [None]:
def clean_df(df, var):
    """Return ``df`` with one row per distinct value of column ``var``.

    A ``foo_lower`` column holding the lowercased ``var`` text is added first;
    duplicates are then dropped on the original ``var`` column, keeping the
    first occurrence.

    NOTE(review): ``foo_lower`` is not used for the de-duplication, so values
    differing only by case are kept as separate rows — confirm this is intended.
    """
    lowered = pl.col(var).str.to_lowercase()
    return df.with_columns(foo_lower=lowered).unique(subset=[var], keep="first")

In [None]:
def drop_words(phrase):
    """Remove stop words from ``phrase``.

    The phrase is split on whitespace and each token is compared as-is (no
    lowercasing, no punctuation stripping) against NLTK's French stop-word
    list extended with the extra words below.

    Args:
        phrase: Text to filter.

    Returns:
        The remaining tokens joined with single spaces.
    """
    # "de" and "est" are two separate entries: without the comma between them
    # Python concatenates the adjacent literals into the single word "deest".
    extra_stop_words = {
        "ton",
        "de",
        "les",
        "et",
        "le",
        "la",
        "que",
        "un",
        "je",
        "c'est",
        "quand",
        "cest",
        "si",
        "comme",
        "bonjour",
        "il",
        "to",
        "the",
        "of",
        "for",
        "une",
        "est",
    }

    # A set keeps the per-token membership test constant-time.
    disallowed_words = extra_stop_words.union(stopwords.words("french"))

    return " ".join(word for word in phrase.split() if word not in disallowed_words)

In [None]:
def get_stop_words():
    """Return NLTK's French stop words followed by a few extra filler words.

    Returns:
        A list of strings, suitable for ``CountVectorizer(stop_words=...)``.
    """
    # "de" and "est" are two separate entries: without the comma between them
    # Python concatenates the adjacent literals into the single word "deest".
    extra_stop_words = [
        "ton",
        "de",
        "les",
        "et",
        "le",
        "la",
        "que",
        "un",
        "je",
        "c'est",
        "quand",
        "cest",
        "si",
        "comme",
        "bonjour",
        "il",
        "to",
        "the",
        "of",
        "for",
        "une",
        "est",
        "ne",
    ]

    return stopwords.words("french") + extra_stop_words

In [None]:
def plot_categories(df, topic_model, model_name, figure_name, save):
    """Draw a horizontal bar chart of document counts per topic.

    Args:
        df: pandas DataFrame with a ``"Topic"`` column (presumably one row per
            document, as returned by ``topic_model.get_document_info`` — confirm).
        topic_model: Object exposing ``get_topic_info()``; the result must have
            ``"Topic"`` and ``"Representation"`` columns.
        model_name: Shown in the printed summary and used as the plot title.
        figure_name: Path given to ``plt.savefig`` when ``save`` is truthy.
        save: Whether to write the figure to ``figure_name``.
    """
    topics = topic_model.get_topic_info()

    print("there are ", len(topics), " topics, when I use the model ", model_name)
    # Only topic ids 0..99 get a label; negative ids (e.g. -1) are excluded.
    topics = topics[topics["Topic"].isin(range(0, 100))]
    N = len(topics)

    # Label for row i: "Topic i: " followed by up to nine representation terms.
    tick_labels = ["Topic " + str(i) + ": " + ", ".join(topics["Representation"].iloc[i][:9]) for i in range(N)]

    var1 = "Topic"
    # NOTE(review): counting the grouping column itself with as_index=False is
    # kept exactly as written; its result shape depends on the pandas version.
    df_pl = df.groupby([var1], as_index=False)[var1].count().fillna(0)
    df_pl["count"] = df_pl.sum(axis=1)
    df_pl = df_pl.sort_values(by="count", ascending=False).drop(columns=["count"])

    # DataFrame.plot opens its own figure when no axes are passed, so no
    # separate plt.figure() call is made (it would only leave an empty figure).
    df_pl.plot(kind="barh", stacked=True, figsize=(12, N / 4), colormap=plt.cm.tab20)
    plt.title(model_name)
    plt.yticks(range(N), tick_labels)
    plt.legend()
    plt.margins(0.04, 0)
    if save:
        plt.savefig(figure_name, bbox_inches="tight", dpi=600)

## Process data

In [None]:
# Keep one row per distinct opening message and report how many remain.
df = clean_df(votes, "opening_msg")
print("Number of unique observations / documents :", len(df))

In [None]:
# The documents to cluster: the de-duplicated opening messages as a Python list.
docs = df["opening_msg"].to_list()
# Optional pre-filtering of stop words (disabled):
# docs_stopwords = [drop_words(x) for x in docs]

## Step 1: Get topics with BERTopic

In [None]:
# Embedding model(s) to try and the minimum cluster size shared by all of them.
list_models = ["sentence-transformers/roberta-base-nli-stsb-mean-tokens"]
min_cluster_size = 50
# NOTE: every iteration rebinds the same four names, so only the results for the
# last entry of list_models remain available to the following cells.
for model in list_models:
    topic_model, embeddings, topics, probs = get_topic_model(
        docs,
        model,
        min_cluster_size,
    )

In [None]:
# topic_model = BERTopic.load(path=f"../data/bertopic_july_20")

In [None]:
# Per-document topic assignments, ordered by topic id.
df1 = topic_model.get_document_info(docs).sort_values(by="Topic")

In [None]:
# Number of documents assigned to each topic.
df1["Topic"].value_counts()

In [None]:
# Plot document counts for topics 0-99 only and save the chart to ../figures.
plot_categories(df1[df1["Topic"].isin(range(0, 100))], topic_model, "BertTopic", "../figures/topics_bert.jpg", True)

In [None]:
# Compute the topic hierarchy from the fitted model and the documents.
hierarchical_topics = topic_model.hierarchical_topics(docs)

In [None]:
# Build the hierarchy (dendrogram) figure from the hierarchy computed above.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Bare expression: shows the figure as the cell output.
fig

In [None]:
# Export the hierarchy figure as a standalone HTML file.
fig.write_html("../figures/dendogram.html")

In [None]:
# `fig` is a Plotly figure (it is exported with write_html), so it has to be
# saved through Plotly itself: plt.savefig would write pyplot's current
# Matplotlib figure, which does not contain the dendrogram.
# NOTE: static image export requires Plotly's image backend (kaleido) to be installed.
fig.write_image("../figures/dendogram.jpg")

## Step 2: Sampling of the most relevant documents

#### Sampling: this part was copied from the Arena Explorer notebook

In [None]:
doc = docs
sampled_prompts = defaultdict(list)
topic_info = topic_model.get_topic_info()
doc_info = topic_model.get_document_info(doc)

# Loop-invariant quantities: the global 80th-percentile probability and the
# whitespace-token count of every document.
prob_threshold = doc_info["Probability"].quantile(0.8)
word_counts = doc_info["Document"].str.split().str.len()
sample_size = 20

# NOTE(review): [1:] skips the first row of topic_info, presumably the outlier
# topic -1 — confirm that row is always present.
for topic_id in topic_info["Topic"][1:]:
    # Candidates: documents of this topic with a high assignment probability
    # and at least five words.
    selected = (doc_info["Topic"] == topic_id) & (doc_info["Probability"] >= prob_threshold) & (word_counts >= 5)
    filtered_docs = doc_info[selected]
    filtered_counts = word_counts[selected]

    res = filtered_docs
    if len(filtered_docs) >= sample_size:
        # Prefer short prompts: start with a 100-word cap and relax it by 50
        # words at a time until enough candidates remain. The cap was never
        # applied before because the loop condition was false on entry.
        cap = 100
        res = filtered_docs[filtered_counts <= cap]
        while len(res) < sample_size:
            cap += 50
            res = filtered_docs[filtered_counts <= cap]

    sampled_docs = res.sample(n=min(sample_size, len(res)), random_state=42, replace=False)

    sampled_prompts[topic_id] = sampled_docs["Document"].tolist()

In [None]:
save_path = "../data"

# Persist the sampled prompts (topic id -> list of documents) as a pickle file.
with (Path(save_path) / "example_prompts.pkl").open("wb") as f:
    pickle.dump(sampled_prompts, f)

In [None]:
# Reassign outlier documents in two passes: first with the "c-tf-idf" strategy
# (threshold 0.1), then with the "distributions" strategy applied to that result.
# `doc` is the alias of `docs` created in the sampling cell.
new_topics = topic_model.reduce_outliers(list(doc), topics, strategy="c-tf-idf", threshold=0.1)
new_topics = topic_model.reduce_outliers(list(doc), new_topics, strategy="distributions")
# Push the reassigned labels back into the model.
topic_model.update_topics(doc, topics=new_topics)

In [None]:
# Refresh the topic and document summaries after the update above, then show
# the rows for topic ids 0-29.
topic_info = topic_model.get_topic_info()
doc_info = topic_model.get_document_info(docs)
topic_info[topic_info["Topic"].isin(range(0, 30))]

## Step 3: summarize each topic using an LLM