# Pipeline de topic modeling

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from code.text_cleaning import text_process_nltk
from code.data_preparation import get_cleaned_doc_from_question, prep_answer_df
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Variable settings

In [None]:
# Folder (relative path) from which the CSV exports are read below.
DATA_FOLDER = "../Data/"
# Column names used to address the loaded tables.
QUESTION_COL = "Questions → Title"
RESPONSE_COL = "Response Text"
# NOTE(review): TOKEN_COL is not referenced by any visible cell — confirm it is still needed.
TOKEN_COL = "tokens"

In [None]:
# Read the two CSV exports stored in DATA_FOLDER: the open-question answers
# (`df`) and the same answers crossed with demographic data (`df_demo`).
filename = "liste_des_reponses_aux_questions_ouvertes_par_consultation_2023-11-02T10_21_31.386925747Z.csv"
filename_with_demo = "reponses_aux_questions_ouvertes_croisees_avec_les_donnees_demo_2023-11-02T13_09_36.480435917Z.csv"
df = pd.read_csv(DATA_FOLDER + filename)
df_demo = pd.read_csv(DATA_FOLDER + filename_with_demo)

In [None]:
# Load a custom answers file and prepare it with prep_answer_df. This rebinds
# `df`, replacing the frame read from DATA_FOLDER in the previous cell.
# NOTE(review): this path starts with "../data/" while DATA_FOLDER is "../Data/" —
# confirm the folder casing on case-sensitive filesystems.
filepath = "../data/custom_analysis/agora_repIA_27122023.csv"
# Short identifier reused below to build the output folder paths.
question_short = "intelligence_artificielle"
df = pd.read_csv(filepath)
df_preped = prep_answer_df(df, RESPONSE_COL)
df_preped

In [None]:
# filepath = "../data/francetransfert-2628211345/MDPH-verbatims_MDU_negatifs.csv"
# df_nico = pd.read_csv(filepath, sep=";")
# df_nico = df_nico.dropna(axis=0)
# question_short = "MDPH_MDU_negatif"
# df_preped = prep_answer_df(df_nico, "Satis_moins")
# df_preped

In [None]:
# Explore the existing questions (distinct values of the question column).
# NOTE(review): `df` was rebound to the custom analysis file above — confirm that
# file also has a QUESTION_COL column.
df[QUESTION_COL].unique()

In [None]:
# #question = 'Avez-vous des propositions pour financer la transition écologique ? C’est la dernière question !'
# question = 'Quelle est pour vous la mesure la plus importante pour réussir la transition écologique ? C’est la dernière question, partagez-nous toutes vos idées !'
# question_short = "new_mesure_transition_ecologique"
# # question = 'Quelles sont vos autres propositions pour lutter contre les violences faites aux enfants ?'
# # question_short = "solutions_violence_enfants"
# df_preped = get_cleaned_doc_from_question(df, QUESTION_COL, RESPONSE_COL, question)
# df_preped

In [None]:
def count_fracking(df: pd.DataFrame, size_before_fracking: int):
    """Return ``size_before_fracking`` minus the rows whose fracking count is 1.

    Args:
        df: table with a "fracking_count" column; it is left untouched.
        size_before_fracking: reference size the count is subtracted from.

    Returns:
        ``size_before_fracking - (number of rows with fracking_count == 1)``.
    """
    untouched_rows = (df["fracking_count"] == 1).sum()
    return size_before_fracking - untouched_rows

In [None]:
#X = df_preped[df_preped["response_size"] >= 3]["fracked_text"]
# X: the full set of responses, i.e. the "fracked_text" column of df_preped.
X = df_preped["fracked_text"]


In [None]:
# Keep only the two fracking bookkeeping columns; they are joined back onto the
# topic table further down.
fracking_merge = df_preped[["old_index", "fracking_count"]].copy()
fracking_merge

### Topic Modeling

In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from sentence_transformers import SentenceTransformer

from code.topic_dataviz import create_wordcloud_from_topic
from nltk.corpus import stopwords

In [None]:
# TODO : Histogramme, des mots par topic
# TODO: Score d'incertitude
# Essayer sans passer par résumé de chaque échantillon
# Créer 5 titres et agréger pour réduire le bruit
# Check repetition_penalty param

In [None]:
def get_custom_bertopic_model(X: pd.Series, sub_topics: bool=False)-> tuple[BERTopic, list[int]]:
    """Fit a French BERTopic model on ``X`` and return it with the topic ids.

    Args:
        X: the documents to cluster.
        sub_topics: when False the model is reduced to 10 topics; when True
            (re-clustering the documents of a single topic) BERTopic chooses
            the number of topics itself (``nr_topics="auto"``).

    Returns:
        ``(topic_model, topics)`` where ``topics`` is the first value returned
        by ``BERTopic.fit_transform`` (one topic id per document).
    """
    # French stop words are dropped from the topic representations.
    # NOTE(review): strip_accents="ascii" is applied to the tokens but not to the
    # stop-word list, so accented stop words may slip through — confirm.
    vectorizer_model = CountVectorizer(stop_words=stopwords.words("french"), strip_accents="ascii")

    # `not` rather than `~`: on a bool, `~` yields -1 / -2, which are both truthy,
    # so the "auto" setting could never be selected.
    nr_topics = 10 if not sub_topics else "auto"
    # TODO(review): min_topic_size is left at BERTopic's default; a per-call value
    # (e.g. a share of X.size for sub-topics) was sketched but never wired in.
    topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=nr_topics, language="french")

    # Alternative kept for reference: reduce frequent word importance
    #ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    #topic_model = BERTopic(ctfidf_model=ctfidf_model, language="french")

    topics, _probs = topic_model.fit_transform(X)
    #topics = topic_model.reduce_outliers(X, topics)
    return topic_model, topics
    

In [None]:
# Fit the topic model on all responses, then save it under the
# per-question output folder.
custom_bert, custom_topics = get_custom_bertopic_model(X)
custom_bert.save("../data/topic_modeling/" + question_short + "/bertopic_model/", serialization="safetensors", save_ctfidf=True)

In [None]:
# Display one of BERTopic's built-in visualisations; the others are kept
# commented out for quick switching.
#custom_bert.visualize_heatmap()
custom_bert.visualize_topics()
#custom_bert.visualize_term_rank()
#custom_bert.hierarchical_topics(X)

In [None]:
# Display the topic hierarchy of the fitted model.
custom_bert.visualize_hierarchy()

In [None]:
# Per-document table returned by the fitted model for the documents in X.
doc_infos = custom_bert.get_document_info(X)
#docs_with_topics[docs_with_topics["Topic"] == 2]
# Show the "Representative_Docs" value of the row labelled 0.
doc_infos.loc[0]["Representative_Docs"]

In [None]:
# List the distinct topic ids present in the document table.
doc_infos.Topic.unique()

In [None]:
def get_topic_distribution(doc_infos: pd.DataFrame):
    """Count the documents of each topic and express the counts as percentages.

    Args:
        doc_infos: table with "Topic" and "Document" columns.

    Returns:
        A frame with one row per topic and the columns "Topic", "answers"
        (number of non-missing documents) and "percentage" (share of the total
        of "answers", on a 0-100 scale).
    """
    per_topic = doc_infos.groupby("Topic")["Document"].count().reset_index(name="answers")
    total_answers = per_topic["answers"].values.sum()
    per_topic["percentage"] = per_topic["answers"] / total_answers * 100
    return per_topic

In [None]:
# Size and percentage of each topic found by the model.
answers_per_topic = get_topic_distribution(doc_infos)
answers_per_topic

In [None]:
def get_docs_from_topic(doc_infos: pd.DataFrame, topic: int):
    """Return a copy of the rows of ``doc_infos`` assigned to ``topic``.

    The number of selected rows is printed as a side effect.

    Args:
        doc_infos: table with a "Topic" column.
        topic: topic id to select.

    Returns:
        An independent copy of the matching rows, original index preserved.
    """
    belongs_to_topic = doc_infos["Topic"] == topic
    selection = doc_infos.loc[belongs_to_topic].copy()
    print("Taille du topic : ", len(selection.index), " documents")
    return selection

In [None]:
def save_doc_info(doc_infos: pd.DataFrame, question_short: str):
    """Write ``doc_infos`` to ``../data/topic_modeling/<question_short>/doc_infos.csv``.

    The destination folder is created when it does not exist yet, so the
    function no longer depends on an earlier step having created it.

    Args:
        doc_infos: table to save (written with its index).
        question_short: short identifier of the question, used as folder name.
    """
    import os

    save_folder = "../data/topic_modeling/" + question_short + "/"
    # to_csv refuses to write into a missing directory.
    os.makedirs(save_folder, exist_ok=True)
    save_path = save_folder + "doc_infos.csv"
    doc_infos.to_csv(save_path)
    return

In [None]:
# Sub-topics
def get_sub_topics_new(doc_infos: pd.DataFrame, topic_range: int):
    """Fit a nested topic model inside each of the first ``topic_range`` topics.

    For every topic id in ``range(topic_range)`` the documents assigned to that
    topic are re-clustered with ``get_custom_bertopic_model(..., True)`` and the
    resulting sub-topic id and name are attached to the matching documents.

    Args:
        doc_infos: document table with at least "Topic" and "Document" columns;
            its index identifies each document. It is not modified.
        topic_range: number of leading topic ids (0, 1, ...) to split.

    Returns:
        A new frame: ``doc_infos`` plus an "id" column (the original index) and
        "sub_topic" / "sub_name" columns. Documents that were not re-clustered
        get ``sub_topic == -2`` and a missing ``sub_name``. The result carries
        the fresh 0..n-1 index produced by ``DataFrame.merge``.
    """
    merge_columns = ["id", "sub_topic", "sub_name"]
    sub_frames = []
    for topic in range(topic_range):
        print(f"Sub_topics for topic {topic}")
        topic_documents = get_docs_from_topic(doc_infos, topic)
        if topic_documents.empty:
            # Nothing to cluster for this topic id.
            continue
        documents = topic_documents["Document"]
        sub_model, _ = get_custom_bertopic_model(documents, True)
        topic_infos = sub_model.get_document_info(documents)
        # Remember which original document each row describes (positional match).
        topic_infos["id"] = topic_documents.index
        print(get_topic_distribution(topic_infos))
        topic_infos = topic_infos.rename(columns={"Topic": "sub_topic", "Name": "sub_name"})
        sub_frames.append(topic_infos[merge_columns])

    if sub_frames:
        to_merge = pd.concat(sub_frames)
    else:
        # No topic was split: merge against an empty, correctly typed table so
        # that every document falls back to the -2 marker below.
        to_merge = pd.DataFrame(
            {
                "id": pd.Series(dtype=doc_infos.index.dtype),
                "sub_topic": pd.Series(dtype="float64"),
                "sub_name": pd.Series(dtype="object"),
            }
        )

    # Work on a copy so the caller's frame does not gain an "id" column.
    with_ids = doc_infos.copy()
    with_ids["id"] = with_ids.index
    doc_infos_w_subtopics = with_ids.merge(to_merge, on="id", how="left")
    doc_infos_w_subtopics["sub_topic"] = doc_infos_w_subtopics["sub_topic"].fillna(-2)
    return doc_infos_w_subtopics

In [None]:
# Number of topics that each hold more than 5 % of the answers; computed from the
# boolean mask instead of `.count()[0]`, which relied on deprecated positional
# access on a label-indexed Series.
# NOTE(review): this count includes the outlier topic (-1) when it exceeds 5 %,
# while the topics processed are 0..n-1 — confirm this is intended.
sub_topic_range = int((answers_per_topic["percentage"] > 5).sum())
doc_infos_w_subtopics = get_sub_topics_new(doc_infos, sub_topic_range)
# Attach the fracking bookkeeping columns by index.
doc_infos_w_subtopics_fracking = doc_infos_w_subtopics.join(fracking_merge)

## Sauvegarde des données analysées par cluster dans un fichier

In [None]:
# Persist the document table (topics, sub-topics and fracking columns) as CSV.
save_doc_info(doc_infos_w_subtopics_fracking, question_short)

In [None]:
# Spot-check one row; the label 2535 is hard-coded and raises KeyError when absent.
doc_infos_w_subtopics_fracking.loc[2535]

In [None]:
def split_doc_infos(doc_infos: pd.DataFrame):
    """Print the first "Name" found for every (Topic, sub_topic) pair.

    Args:
        doc_infos: table with "Topic", "sub_topic" and "Name" columns.

    Returns:
        None; the grouped table is only printed.
    """
    by_topic_pair = doc_infos.groupby(["Topic", "sub_topic"])
    topics = by_topic_pair.agg(name=("Name", "first"))
    print(topics)
    return None

def save_to_sql(doc_infos: pd.DataFrame, conn):
    """Write ``doc_infos`` to the "responses" table of ``conn``.

    Args:
        doc_infos: table to store; it is written as-is, index included.
        conn: database connection passed to ``DataFrame.to_sql``.
    """
    # Column set sketched for the table but not applied yet:
    # "text", "topic_id", "topic_probability", "sentiment", "sentiment_score",
    # "fracking_count".
    # TODO(review): map the doc_infos columns onto that schema before writing.
    # Write the argument itself, not the notebook-level `df`.
    doc_infos.to_sql("responses", con=conn)
    return

In [None]:
# sub_topic_range = answers_per_topic[answers_per_topic["percentage"] > 5].count()[0]
# get_sub_topics(doc_infos, sub_topic_range, save_folder)
# stat_dict["sub_topics"] = sub_topic_range

In [None]:
# Remove the "date" entry before the statistics are dumped to JSON below.
# NOTE(review): `stat_dict` is not defined in any visible cell — confirm where it
# comes from.
del stat_dict["date"]

In [None]:
import json
# Dump the statistics dictionary as UTF-8 JSON, keeping accented characters.
# NOTE(review): `save_folder` only exists as a local of save_doc_info in the
# visible cells — confirm it is defined at notebook level before running this.
stat_dict_path = save_folder + "stat_dict.json"
with open(stat_dict_path, 'w', encoding='utf-8') as f:
    json.dump(stat_dict, f, ensure_ascii=False, indent=4)

### Summarization 

In [None]:
import torch
from transformers import TFT5ForConditionalGeneration, T5Tokenizer, pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from random import sample


def get_tokenizer(t5=True):
    """Load the tokenizer that goes with the selected summarization model.

    Args:
        t5: truthy to load the "t5-base" T5 tokenizer, falsy to load the
            tokenizer of "csebuetnlp/mT5_multilingual_XLSum".

    Returns:
        The loaded tokenizer.
    """
    if not t5:
        # Alternative checkpoints kept for reference:
        #   "moussaKam/barthez", "moussaKam/barthez-orangesum-abstract"
        return AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
    # Alternative checkpoint kept for reference (with padding_side='left'):
    #   "mrm8488/camembert2camembert_shared-finetuned-french-summarization"
    return T5Tokenizer.from_pretrained("t5-base")


def get_summarizer_pipeline(tokenizer, t5=True):
    """Build the "summarization" pipeline for the selected model.

    Args:
        tokenizer: tokenizer handed to the pipeline.
        t5: truthy to wrap the TensorFlow "t5-base" model, falsy to use
            "csebuetnlp/mT5_multilingual_XLSum".

    Returns:
        The summarization pipeline.
    """
    if not t5:
        # Alternative checkpoints kept for reference:
        #   "moussaKam/barthez", "moussaKam/barthez-orangesum-abstract"
        return pipeline("summarization", model="csebuetnlp/mT5_multilingual_XLSum", tokenizer=tokenizer)
    # Alternative checkpoint kept for reference:
    #   "mrm8488/camembert2camembert_shared-finetuned-french-summarization"
    t5_model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
    return pipeline("summarization", model=t5_model, tokenizer=tokenizer, framework="tf")


def get_headline_tokenizer(t5):
    """Load the T5 tokenizer of "Michau/t5-base-en-generate-headline".

    Args:
        t5: accepted but not used; the same checkpoint is loaded either way.

    Returns:
        The loaded tokenizer.
    """
    return T5Tokenizer.from_pretrained("Michau/t5-base-en-generate-headline")


def get_headline_generator(t5=True, model_name=""):
    """Load the TensorFlow T5 model of "Michau/t5-base-en-generate-headline".

    Args:
        t5: accepted but not used.
        model_name: accepted but not used.

    Returns:
        The loaded headline-generation model.
    """
    # The checkpoint is fixed, matching get_headline_tokenizer. Alternatives kept
    # for reference (they would be loaded through AutoModelForSeq2SeqLM and need
    # their own tokenizer):
    #   "csebuetnlp/mT5_multilingual_XLSum", "moussaKam/barthez-orangesum-title"
    return TFT5ForConditionalGeneration.from_pretrained("Michau/t5-base-en-generate-headline")

In [None]:
def get_summary_list_from_answers(answers: list[str], summarizer, tokenizer):
    """Summarize answers one by one, in random order, within a token budget.

    Each answer is summarized separately. Summaries are collected until the
    running total of their token counts reaches 500; the summary that reaches
    the budget is discarded and the loop stops.

    Args:
        answers: the answers to summarize.
        summarizer: callable returning ``[{"summary_text": ...}]``.
        tokenizer: object whose ``encode`` result length gives a token count.

    Returns:
        The list of collected summaries.
    """
    token_budget = 500  # previously 512
    spent_tokens = 0
    collected = []
    shuffled_answers = sample(answers, k=len(answers))
    for text in shuffled_answers:
        # Cap the summary length by the answer length, between 10 and 150 tokens.
        answer_tokens = len(tokenizer.encode(text))
        length_cap = min(150, max(10, answer_tokens))
        result = summarizer(text, min_length=10, max_length=length_cap)
        summary = result[0]["summary_text"]
        spent_tokens += len(tokenizer.encode(summary))
        if spent_tokens >= token_budget:
            break
        collected.append(summary)
    return collected


def get_summary_list_from_answers_with_answer_sum(answers: list[str], summarizer, tokenizer):
    """Summarize randomly ordered answers in batches, within a token budget.

    Answers are appended to a buffer (each followed by ". ") until the buffer
    exceeds 100 tokens; the buffer is then summarized and emptied. Summaries
    are collected until the running total of their token counts reaches 500;
    the summary that reaches the budget is discarded. Answers still in the
    buffer when the loop ends are not summarized. Progress is printed.

    Args:
        answers: the answers to summarize.
        summarizer: callable returning ``[{"summary_text": ...}]``.
        tokenizer: object whose ``encode`` result length gives a token count.

    Returns:
        The list of collected batch summaries.
    """
    token_budget = 500  # previously 512
    batch_threshold = 100
    collected = []
    spent_tokens = 0
    buffer = ""
    for answer in sample(answers, k=len(answers)):
        buffer += answer + ". "
        buffer_tokens = len(tokenizer.encode(buffer))
        print(f"{buffer_tokens} {buffer}")
        if buffer_tokens > batch_threshold:
            length_cap = min(150, max(10, buffer_tokens // 2))
            # Answers already ending with "." produce ".." once joined.
            buffer = buffer.replace("..", ",")
            summary = summarizer(buffer, min_length=10, max_length=length_cap)[0]["summary_text"]
            print("to_summarize: ", buffer)
            print("summary: ", summary)
            buffer = ""
            spent_tokens += len(tokenizer.encode(summary))
            if spent_tokens >= token_budget:
                break
            collected.append(summary)
    return collected


def get_summary_of_samples(answers: list[str], summarizer, tokenizer):
    """Summarize randomly ordered answers in "summarize: "-prefixed batches.

    Answers are appended to a buffer that starts with "summarize: " (each
    answer followed by ". ") until the buffer exceeds 200 tokens; the buffer is
    then summarized with 5 beams and reset. Summaries are collected until the
    running total of their token counts reaches 500; the summary that reaches
    the budget is discarded. Answers still in the buffer when the loop ends are
    not summarized. Each batch and its summary are printed.

    Args:
        answers: the answers to summarize.
        summarizer: callable returning ``[{"summary_text": ...}]``.
        tokenizer: object whose ``encode`` result length gives a token count.

    Returns:
        The list of collected batch summaries.
    """
    prompt_prefix = "summarize: "
    token_budget = 500  # previously 512
    batch_threshold = 200
    collected = []
    spent_tokens = 0
    buffer = prompt_prefix
    for answer in sample(answers, k=len(answers)):
        buffer += answer + ". "
        buffer_tokens = len(tokenizer.encode(buffer))
        if buffer_tokens > batch_threshold:
            length_cap = min(150, max(10, buffer_tokens // 2))
            # Answers already ending with "." produce ".." once joined.
            buffer = buffer.replace("..", ".")
            output = summarizer(buffer, min_length=10, max_length=length_cap, num_beams=5)
            summary = output[0]["summary_text"]
            print(buffer)
            print("summary: ", summary)
            buffer = prompt_prefix
            spent_tokens += len(tokenizer.encode(summary))
            if spent_tokens >= token_budget:
                break
            collected.append(summary)
    return collected

In [None]:
import re

def generate_topic_label(answers: list[str], summarizer, tokenizer, headline_generator, headline_tokenizer, verbose=False) -> str:
    """Generate one headline-style label for a set of answers.

    The answers are first condensed with ``get_summary_of_samples``; the joined
    summaries, prefixed with "titre : ", are then fed to the headline model.

    Args:
        answers: the documents of one topic.
        summarizer: summarization callable used for the intermediate summaries.
        tokenizer: tokenizer matching ``summarizer``.
        headline_generator: model exposing ``generate``.
        headline_tokenizer: tokenizer exposing ``encode`` / ``decode``.
        verbose: accepted but not used.

    Returns:
        The decoded headline, without the first and last generated tokens.
    """
    summaries = get_summary_of_samples(answers, summarizer, tokenizer)
    print(summaries)
    prompt = "titre : " + " ".join(summaries)
    # NOTE(review): tensors are requested as "pt" — confirm this matches the
    # framework of the headline model passed in.
    input_ids = headline_tokenizer.encode(prompt, return_tensors="pt")
    generated = headline_generator.generate(
        input_ids,
        max_length=64,
        num_beams=5,
        no_repeat_ngram_size=2,
        repetition_penalty=2.0,
    )
    # Drop the first and last token ids of the best sequence before decoding.
    return headline_tokenizer.decode(generated[0][1:-1])


def get_labels_from_topics(doc_infos, i_range, verbose=False, t5=True, label_per_topic: int=5):
    """Generate several candidate labels for each of the first ``i_range`` topics.

    Args:
        doc_infos: document table with "Topic" and "Document" columns.
        i_range: number of leading topic ids (0, 1, ...) to label.
        verbose: forwarded to ``generate_topic_label``.
        t5: selects the summarization tokenizer/model pair.
        label_per_topic: number of candidate labels generated per topic.

    Returns:
        One list of ``label_per_topic`` labels per topic, in topic order.
    """
    tokenizer = get_tokenizer(t5)
    summarizer = get_summarizer_pipeline(tokenizer, t5)
    headline_generator = get_headline_generator(t5)
    headline_tokenizer = get_headline_tokenizer(t5)

    labels_by_topic = []
    for topic_id in range(i_range):
        documents = get_docs_from_topic(doc_infos, topic_id)["Document"].values.tolist()
        # Every call draws a new random batch of documents, hence several candidates.
        candidates = [
            generate_topic_label(documents, summarizer, tokenizer, headline_generator, headline_tokenizer, verbose)
            for _ in range(label_per_topic)
        ]
        print(f"Topic {topic_id} : " + ", ".join(candidates))
        labels_by_topic.append(candidates)
    return labels_by_topic

In [None]:
def format_summary(question, summary_list):
    """Build a "Question : ... Réponse : ..." prompt from partial summaries.

    Args:
        question: the question text.
        summary_list: summaries, joined with single spaces.

    Returns:
        The formatted string.
    """
    joined_summaries = " ".join(summary_list)
    return "Question : " + question + ". Réponse : " + joined_summaries


def generate_topic_summary(answers: list[str], summarizer, tokenizer, question):
    """Summarize a topic in two passes: batch summaries, then a summary of those.

    Args:
        answers: the documents of one topic.
        summarizer: summarization callable returning ``[{"summary_text": ...}]``.
        tokenizer: tokenizer matching ``summarizer``.
        question: accepted but currently unused (the ``format_summary`` variant
            is disabled).

    Returns:
        The text of the final summary.
    """
    batch_summaries = get_summary_list_from_answers_with_answer_sum(answers, summarizer, tokenizer)
    # Disabled variant: format_summary(question, batch_summaries)
    merged_text = " ".join(batch_summaries)
    print(merged_text)
    final_output = summarizer(merged_text, max_length=150)
    return final_output[0]["summary_text"]


def get_summary_from_topics(doc_infos, i_range, question, t5):
    """Produce one "Résumé <i> : ..." summary for each of the first ``i_range`` topics.

    Args:
        doc_infos: document table with "Topic" and "Document" columns.
        i_range: number of leading topic ids (0, 1, ...) to summarize.
        question: forwarded to ``generate_topic_summary``.
        t5: selects the summarization tokenizer/model pair.

    Returns:
        The list of labelled summaries, in topic order.
    """
    tokenizer = get_tokenizer(t5)
    summarizer = get_summarizer_pipeline(tokenizer, t5)

    labelled_summaries = []
    for topic_id in range(i_range):
        documents = get_docs_from_topic(doc_infos, topic_id)["Document"].values.tolist()
        topic_summary = generate_topic_summary(documents, summarizer, tokenizer, question)
        print(topic_summary)
        labelled = f"Résumé {topic_id} : {topic_summary}"
        print(labelled)
        labelled_summaries.append(labelled)
    return labelled_summaries

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def measure_similarity_of_topic(topic_labels: list[str], topic_model):
    """Compare the candidate labels of one topic through their embeddings.

    Args:
        topic_labels: candidate labels generated for a single topic.
        topic_model: model whose ``embedding_model.embed`` encodes the labels.

    Returns:
        ``(similarity_matrix, top_label, score)`` where ``similarity_matrix``
        holds the pairwise cosine similarities, ``top_label`` is the label with
        the largest total similarity to all labels, and ``score`` is the mean
        similarity over all distinct label pairs (NaN when there is only one
        label, hence no pair).
    """
    embedding = topic_model.embedding_model.embed(topic_labels)
    similarity_matrix = cosine_similarity(embedding)

    top_label = topic_labels[np.argmax(np.sum(similarity_matrix, axis=1))]

    # Average over the strict upper triangle, selected by position so that
    # pairs whose similarity is exactly 0 still count.
    pair_positions = np.triu_indices_from(similarity_matrix, k=1)
    pair_scores = similarity_matrix[pair_positions]
    score = np.mean(pair_scores) if pair_scores.size else np.nan

    return similarity_matrix, top_label, score

In [None]:
# Number of topics to label: topics holding more than 2 % of the answers, minus one.
# NOTE(review): the "- 1" presumably discounts BERTopic's outlier topic (-1) — confirm.
i_range = sum(answers_per_topic["percentage"] > 2) -1
i_range

In [None]:
def filter_doc_infos_before_labeling(doc_infos: pd.DataFrame):
    """Keep confidently assigned, sufficiently long documents for labeling.

    Rows are kept when "Probability" is above 0.70 and the document has at
    least 10 whitespace-separated words.

    Args:
        doc_infos: table with "Probability" and "Document" columns; it is not
            modified.

    Returns:
        A new frame with the retained rows and an added "response_size"
        column (word count of "Document").
    """
    confident = doc_infos.loc[doc_infos["Probability"] > 0.70].copy()
    confident["response_size"] = confident["Document"].str.split().apply(len)
    too_short_labels = confident.index[confident["response_size"] < 10]
    return confident.drop(index=too_short_labels)

In [None]:
# Restrict the document table to the rows retained for label generation.
filtered_doc_infos = filter_doc_infos_before_labeling(doc_infos)
filtered_doc_infos

In [None]:
# Generate `label_per_topic` candidate labels for each of the first `i_range`
# topics, then print each topic's candidate list.
t5 = True
label_per_topic = 10
topic_labels = get_labels_from_topics(filtered_doc_infos, i_range=i_range, verbose=True, t5=t5, label_per_topic=label_per_topic)
for topic in topic_labels:
    print(topic)

In [None]:
# Display the raw candidate labels.
topic_labels

In [None]:
import json

def write_json_file(data, filename: str):
    """Serialize ``data`` as JSON into ``filename`` (UTF-8, accents kept as-is).

    Args:
        data: JSON-serializable object.
        filename: path of the file to create or overwrite.
    """
    # Explicit UTF-8 and ensure_ascii=False keep accented text readable in the
    # file and independent of the platform's default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)


def clean_labels(labels_list: list[list[str]]):
    """Strip the "<pad>" and "</s>" markers from every generated label.

    Args:
        labels_list: one list of labels per topic.

    Returns:
        A new nested list with the same shape; surrounding whitespace is kept.
    """
    special_tokens = re.compile("(<pad>)*(</s>)*")
    return [
        [special_tokens.sub("", label) for label in labels]
        for labels in labels_list
    ]

# Strip the special tokens from the generated labels and save them as JSON.
# NOTE(review): this writes under DATA_FOLDER ("../Data/") while the other outputs
# go to "../data/topic_modeling/" — confirm the intended folder.
cleaned_labels = clean_labels(topic_labels)
write_json_file(cleaned_labels, DATA_FOLDER + "topic_modeling/" + question_short + "/cleaned_labels.json")
                              

In [None]:
# Compare the candidate labels of the topic at position 1 (raw, uncleaned labels).
measure_similarity_of_topic(topic_labels[1], custom_bert)

In [None]:
# Summarize each of the first `i_range` topics.
# NOTE(review): `question` is only assigned in a commented-out cell above — confirm
# it is defined before running this.
topic_summaries = get_summary_from_topics(doc_infos, i_range=i_range, question=question, t5=t5)

In [None]:
# Display the generated summaries.
topic_summaries

In [None]:
# NOTE(review): `topic_n` is assigned in the next cell; this cell only works after
# that one has been run.
topic_n

In [None]:
# Documents of topic 4 (among the filtered rows) whose text contains `looking_for`.
topic_n = get_docs_from_topic(filtered_doc_infos, 4)
# An empty pattern matches every document.
looking_for = ""
doc_n = topic_n[topic_n['Document'].str.contains(looking_for)]["Document"].values
doc_n

In [None]:
# generate_topic_label needs the summarizer and headline components in addition to
# the documents, so they are built here the same way get_labels_from_topics does.
tokenizer = get_tokenizer(t5)
summarizer = get_summarizer_pipeline(tokenizer, t5)
headline_generator = get_headline_generator(t5)
headline_tokenizer = get_headline_tokenizer(t5)
topic_4_label = generate_topic_label(doc_n.tolist(), summarizer, tokenizer, headline_generator, headline_tokenizer)
print(f"Topic 4 label: {topic_4_label}")

## Using KeyBert

In [None]:
from keybert import KeyBERT
# Prepare documents: the 20 newsgroups texts without headers, footers and quotes.
docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

# Extract keywords
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(docs)

# Create our vocabulary: the keyword strings of every document, de-duplicated.
vocabulary = [k[0] for keyword in keywords for k in keyword]
vocabulary = list(set(vocabulary))
# Then, we pass our vocabulary to BERTopic and train the model:

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# This rebinds the notebook-level `topic_model`, `topics` and `probs` names.
vectorizer_model= CountVectorizer(vocabulary=vocabulary)
topic_model = BERTopic(vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Ask the KeyBERT-vocabulary model for labels built from a single word.
topic_model.generate_topic_labels(nr_words=1)

In [None]:
import collections
def get_most_present_words_g(df: pd.DataFrame, col: str, ngram: int):
    """Return the 5 n-grams that appear in the largest number of documents.

    Each document is split on single spaces (after stripping trailing
    whitespace). An n-gram is counted at most once per document. No stop-word
    filtering is applied.

    Args:
        df: table holding the documents.
        col: name of the text column.
        ngram: number of consecutive words per n-gram (2 for bigrams).

    Returns:
        Up to 5 ``(ngram_tuple, document_count)`` pairs, most frequent first.

    Raises:
        ValueError: if ``ngram`` is smaller than 1.
    """
    if ngram < 1:
        raise ValueError("ngram must be a positive integer")
    document_counts = collections.Counter()
    for text in df[col]:
        words = text.rstrip().split(" ")
        # Zipping the word list with its shifted copies yields consecutive n-grams.
        shifted = (words[offset:] for offset in range(ngram))
        document_counts.update(set(zip(*shifted)))
    return document_counts.most_common(5)

In [None]:
# get_most_present_words_g already returns a list of (n-gram, count) pairs sorted
# by frequency, so the top 3 are taken by slicing.
c = get_most_present_words_g(doc_infos, "Document", 2)
c[:3]

In [None]:
# Inspect the French stop-word list provided by NLTK.
sw = stopwords.words("french")
sw