In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from text_cleaning import text_process_nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [None]:
DATA_FOLDER = "../Data/"

In [None]:
# Raw CSV exports read from DATA_FOLDER. Judging by the file names, the first
# holds the open-ended answers per consultation and the second the same answers
# crossed with demographic data — TODO confirm against the export source.
filename = "liste_des_reponses_aux_questions_ouvertes_par_consultation_2023-11-02T10_21_31.386925747Z.csv"
filename_with_demo = "reponses_aux_questions_ouvertes_croisees_avec_les_donnees_demo_2023-11-02T13_09_36.480435917Z.csv"
df = pd.read_csv(DATA_FOLDER + filename)
# NOTE(review): df_demo is not referenced again in the visible part of the notebook.
df_demo = pd.read_csv(DATA_FOLDER + filename_with_demo)

In [None]:
# Column names shared by the cells below.
# Column compared against the selected question to filter the answers.
QUESTION_COL = "Questions → Title"
# Column holding the raw free-text answer.
RESPONSE_COL = "Response Text"
# Column added later to store the output of text_process_nltk for each answer.
TOKEN_COL = "tokens"

In [None]:
# Existing questions: distinct values of the question column.
df[QUESTION_COL].unique()

In [None]:
import re
# Remove useless (too short) answers
def clean_df(df: pd.DataFrame, size: int=10, response_col: str=None):
    """Drop answers that are too short and add a whitespace-normalised text column.

    Args:
        df: Frame holding one answer per row. It is not modified.
        size: Minimum number of whitespace-separated words an answer must
            have to be kept.
        response_col: Name of the column holding the raw answer text.
            Defaults to the notebook-level ``RESPONSE_COL``.

    Returns:
        A new DataFrame with the rows whose answer has at least ``size`` words,
        plus two columns: ``response_size`` (word count) and ``cleaned_text``
        (answer stripped, with every run of whitespace collapsed to one space).
    """
    if response_col is None:
        response_col = RESPONSE_COL
    # Missing answers count as empty text instead of crashing on len(NaN).
    texts = ["" if pd.isna(value) else str(value) for value in df[response_col]]
    word_counts = [len(text.split()) for text in texts]
    # Positional boolean mask: unlike drop(index=...), it stays correct when
    # the frame has duplicated index labels.
    keep = [count >= size for count in word_counts]
    cleaned_df = df.loc[keep].copy()
    cleaned_df["response_size"] = [count for count, kept in zip(word_counts, keep) if kept]
    # "\s+" already matches newlines, so a single substitution is enough.
    cleaned_df["cleaned_text"] = [
        re.sub(r"\s+", " ", text.strip()) for text, kept in zip(texts, keep) if kept
    ]
    return cleaned_df

In [None]:
#df_filtered = df[df[QUESTION_COL] == "Quelles sont vos autres propositions pour lutter contre les violences faites aux enfants ?"].copy()
# Question analysed in the rest of the notebook.
question = 'Avez-vous des propositions pour financer la transition écologique ? C’est la dernière question !'
# Keep only this question's answers, then let clean_df drop the short ones.
df_filtered = clean_df(df.loc[df[QUESTION_COL].eq(question)].copy())
df_filtered


In [None]:
df

In [None]:
#X = df_filtered[RESPONSE_COL]
X = df_filtered["cleaned_text"]

### Data Processing

In [None]:
# Fit a TF-IDF vectorizer on the cleaned answers, with text_process_nltk as the analyzer.
# NOTE(review): scikit-learn documents ngram_range as applying only when `analyzer`
# is not a callable, so (1, 3) presumably has no effect here — confirm.
# NOTE(review): `tfid` is not used again in the visible part of the notebook.
tfid = TfidfVectorizer(analyzer=text_process_nltk, ngram_range=(1,3)).fit(X)

In [None]:
df_filtered[TOKEN_COL] = df_filtered[RESPONSE_COL].apply(text_process_nltk)

In [None]:
text_process_nltk("Légiférer au sujet de l'aliénation parentale", True)

In [None]:
import nltk
from nltk.tokenize import word_tokenize

def get_token_test(doc: "str | list[str]"):
    """Print each text followed by its French NLTK word tokenisation.

    Args:
        doc: A single text, or an iterable of texts. A lone string is treated
            as one document; iterating it directly would tokenise it one
            character at a time.

    Returns:
        None. The function only prints.
    """
    documents = [doc] if isinstance(doc, str) else doc
    for text in documents:
        print(text)
        print(word_tokenize(text, language='french'))


TODO : Enlever les "l'" des mots avant d'enlever le '

In [None]:
#df_filtered

In [None]:
X_topic = df_filtered[TOKEN_COL]

### Topic Modeling

In [None]:
from gensim import corpora
import gensim
import pickle


# Build the gensim vocabulary from the tokenised answers, then encode every
# answer as a bag of words with it.
dictionary = corpora.Dictionary(X_topic)
corpus = list(map(dictionary.doc2bow, X_topic))

#pickle.dump(corpus, open('corpus.pkl', 'wb'))
#dictionary.save('dictionary.gensim')

In [None]:

NUM_TOPICS = 4
# Fixed seed so the topics are identical after Restart Kernel -> Run All.
LDA_RANDOM_STATE = 42
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_RANDOM_STATE,
)
#ldamodel.save('model5.gensim')
# Show the 4 highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

In [None]:
# Visualise the LDA model fitted above, using the in-memory `ldamodel`,
# `corpus` and `dictionary`. The statements that would write
# 'dictionary.gensim', 'corpus.pkl' and 'model5.gensim' are commented out,
# so loading those files fails on a fresh kernel (and unpickling a file of
# unknown origin is best avoided anyway).
try:
    import pyLDAvis.gensim_models as gensimvis  # module name in pyLDAvis >= 3
except ImportError:
    import pyLDAvis.gensim as gensimvis  # module name in older pyLDAvis releases
import pyLDAvis

lda = ldamodel
lda_display = gensimvis.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

We assume that the answers grouped under the same topic contain roughly the same information.

### Summarization 

In [None]:
# TODO: histogram of the words per topic
# Try without going through a summary of each sample
# Create 5 titles and aggregate them to reduce the noise
# Check the repetition_penalty param

In [None]:
#X_topic.values

In [None]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from sentence_transformers import SentenceTransformer


# download dataset of 20,000 news articles
#docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
# Plain list of strings (BERTopic documents its input as a list of str).
docs = df_filtered[RESPONSE_COL].tolist()

sentence_model = SentenceTransformer("dangvantuan/sentence-camembert-base")
embeddings = sentence_model.encode(docs)

# Train BERTopic with a custom CountVectorizer
# NOTE(review): vectorizer_model is only used by the commented-out variant below.
vectorizer_model = CountVectorizer(strip_accents="ascii")
#topic_model = BERTopic(vectorizer_model=vectorizer_model, embedding_model=sentence_model, nr_topics=6, language="french")

min_topic_size = 10 # Default=10

# Reuse the French sentence model and the embeddings computed above instead of
# letting BERTopic embed the answers a second time with its default model.
topic_model = BERTopic(embedding_model=sentence_model, min_topic_size=min_topic_size, nr_topics=8, language="french")
# fit_transform returns a (topics, probabilities) pair; unpack it so that
# `topics` really is the per-document topic assignment.
topics, probs = topic_model.fit_transform(docs, embeddings)

In [None]:
from nltk.corpus import stopwords
from bertopic.representation import MaximalMarginalRelevance



def get_custom_bertopic_model(X, nr_topics=10):
    """Fit a BERTopic model whose topic words exclude French stop words.

    Args:
        X: Iterable of documents (strings), e.g. a list or a pandas Series.
        nr_topics: Value forwarded to ``BERTopic(nr_topics=...)``; 10 by default.

    Returns:
        ``(topic_model, topics)`` where ``topic_model`` is the fitted BERTopic
        instance and ``topics`` is the value returned by
        ``BERTopic.fit_transform`` (per the BERTopic docs, a
        ``(topic ids, probabilities)`` pair).
    """
    import unicodedata

    # CountVectorizer strips accents from the text *before* filtering stop
    # words, so the stop words must be stripped the same way; otherwise
    # accented entries of the NLTK list can never match.
    french_stop_words = sorted({
        unicodedata.normalize("NFKD", word).encode("ascii", "ignore").decode("ascii")
        for word in stopwords.words("french")
    } - {""})

    # Remove stopwords
    #representation_model = MaximalMarginalRelevance(diversity=0.2)
    vectorizer_model = CountVectorizer(stop_words=french_stop_words, strip_accents="ascii")
    topic_model = BERTopic(vectorizer_model=vectorizer_model, nr_topics=nr_topics, language="french")

    # Reduce frequent word importance
    #ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    #topic_model = BERTopic(ctfidf_model=ctfidf_model, language="french")

    # A plain list avoids depending on the index of a filtered pandas Series.
    topics = topic_model.fit_transform(list(X))
    return topic_model, topics
    

In [None]:
custom_bert, custom_topics = get_custom_bertopic_model(X)

In [None]:
#custom_bert.visualize_heatmap()
custom_bert.visualize_barchart()
#custom_bert.visualize_topics()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_wordcloud(model, topic):
    """Display a word cloud built from the (word, value) pairs of one topic.

    Args:
        model: Object exposing ``get_topic(topic)`` that yields (word, value) pairs.
        topic: Identifier of the topic to draw.

    Returns:
        None. The figure is shown with matplotlib.
    """
    word_weights = dict(model.get_topic(topic))
    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(word_weights)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Show wordcloud
create_wordcloud(custom_bert, topic=3)

In [None]:
#topic_list = list(range(8))
#topic_model.visualize_approximate_distribution()
# TODO: Score d'incertitude

In [None]:
doc_infos = custom_bert.get_document_info(X)
#docs_with_topics[docs_with_topics["Topic"] == 2]
doc_infos

In [None]:
doc_infos.Topic.unique()

In [None]:
def get_topic_distribution(doc_infos: pd.DataFrame):
    """Count the documents of every topic and express each count as a percentage.

    Args:
        doc_infos: Frame with at least a ``Topic`` and a ``Document`` column.

    Returns:
        DataFrame with one row per topic and the columns ``Topic``,
        ``answers`` (number of non-null documents) and ``percentage``
        (share of the total, on a 0-100 scale).
    """
    counts = doc_infos.groupby("Topic")["Document"].count()
    answers_per_topic = counts.rename("answers").reset_index()
    total_answers = answers_per_topic["answers"].values.sum()
    answers_per_topic["percentage"] = answers_per_topic["answers"] / total_answers * 100
    return answers_per_topic

In [None]:
answers_per_topic = get_topic_distribution(doc_infos)
answers_per_topic

In [None]:
def get_docs_from_topic(doc_infos, topic):
    """Return an independent copy of the rows of ``doc_infos`` whose ``Topic`` equals ``topic``."""
    in_topic = doc_infos["Topic"] == topic
    return doc_infos.loc[in_topic].copy()

In [None]:
# Visualization functions
def get_topic_histogram(doc_infos, topic):
    """Placeholder for a per-topic histogram: not implemented, always returns None."""
    return None

In [None]:
import torch
from transformers import TFT5ForConditionalGeneration, T5Tokenizer, pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from random import sample


def get_tokenizer(t5=True):
    """Load the tokenizer matching the chosen model family.

    Args:
        t5: When truthy, load the "t5-base" T5 tokenizer; otherwise load the
            "moussaKam/barthez" tokenizer through AutoTokenizer.

    Returns:
        The loaded tokenizer.
    """
    if not t5:
        # Other checkpoints tried previously:
        #tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
        #tokenizer = AutoTokenizer.from_pretrained("moussaKam/barthez-orangesum-title")
        return AutoTokenizer.from_pretrained("moussaKam/barthez")
    return T5Tokenizer.from_pretrained("t5-base")


def get_summarizer_pipeline(tokenizer, t5=True):
    """Build the Hugging Face "summarization" pipeline for the chosen model family.

    Args:
        tokenizer: Tokenizer handed to the pipeline.
        t5: When truthy, wrap a TensorFlow "t5-base" model (framework="tf");
            otherwise use the "moussaKam/barthez-orangesum-abstract" checkpoint.

    Returns:
        The summarization pipeline.
    """
    if not t5:
        # Other checkpoints tried previously:
        #summarizer = pipeline("summarization", model="csebuetnlp/mT5_multilingual_XLSum", tokenizer=tokenizer)
        #summarizer = pipeline("summarization", model="moussaKam/barthez", tokenizer=tokenizer)
        return pipeline("summarization", model="moussaKam/barthez-orangesum-abstract", tokenizer=tokenizer)
    language_model = TFT5ForConditionalGeneration.from_pretrained("t5-base")
    return pipeline("summarization", model=language_model, tokenizer=tokenizer, framework="tf")


def get_headline_generator(t5=True, model_name=""):
    """Load the sequence-to-sequence model used to generate a headline.

    Args:
        t5: When truthy, load a TensorFlow T5 model; otherwise load the model
            through AutoModelForSeq2SeqLM.
        model_name: Optional checkpoint name. When empty (the default), the
            family default is used: "Michau/t5-base-en-generate-headline"
            for T5, "moussaKam/barthez-orangesum-title" otherwise.

    Returns:
        The loaded model.
    """
    if t5:
        return TFT5ForConditionalGeneration.from_pretrained(
            model_name or "Michau/t5-base-en-generate-headline"
        )
    # Other checkpoint tried previously:
    #headline_generator = AutoModelForSeq2SeqLM.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
    return AutoModelForSeq2SeqLM.from_pretrained(
        model_name or "moussaKam/barthez-orangesum-title"
    )

In [None]:
def get_summary_list_from_answers(answers: list[str], summarizer, tokenizer):
    """Summarise answers one by one, in random order, within a 500-token budget.

    Args:
        answers: Texts to summarise.
        summarizer: Callable returning ``[{"summary_text": ...}]`` for a text.
        tokenizer: Object whose ``encode(text)`` returns the token sequence.

    Returns:
        The summaries produced before the cumulated summary length reached
        the budget; the summary that reaches it is not included.
    """
    token_budget = 500 # formerly 512
    kept_summaries = []
    used_tokens = 0
    shuffled_answers = sample(answers, k=len(answers))
    for text in shuffled_answers:
        text_tokens = len(tokenizer.encode(text))
        # Cap the summary length between 10 and 150 tokens.
        length_cap = min(150, max(10, text_tokens))
        result = summarizer(text, min_length=10, max_length=length_cap)
        summary = result[0]["summary_text"]
        used_tokens += len(tokenizer.encode(summary))
        if used_tokens >= token_budget:
            return kept_summaries
        kept_summaries.append(summary)
    return kept_summaries


def get_summary_list_from_answers_with_answer_sum(answers: list[str], summarizer, tokenizer, verbose=False):
    """Summarise answers in chunks of more than 100 tokens, within a 500-token budget.

    Answers are visited in random order and concatenated (", "-separated)
    until the buffer exceeds 100 tokens; the buffer is then summarised and
    reset. Answers still in the buffer when the loop ends are summarised too,
    so a small topic no longer yields an empty list.

    Args:
        answers: Texts to summarise.
        summarizer: Callable returning ``[{"summary_text": ...}]`` for a text.
        tokenizer: Object whose ``encode(text)`` returns the token sequence.
        verbose: When true, print the buffers and the summaries (debug aid).

    Returns:
        The chunk summaries produced before the cumulated summary length
        reached the budget; the summary that reaches it is not included.
    """
    summary_list = []
    current_token_length = 0
    max_token_length = 500 # formerly 512
    min_token_length = 100
    to_summarize = ""

    def summarize_chunk(chunk, chunk_size):
        # Summary length: half the chunk, clamped between 10 and 150 tokens.
        max_length = min(150, max(10, int(chunk_size/2)))
        # An answer ending with "." followed by the ", " separator gives ".,".
        chunk = chunk.replace(".,", ",")
        summary = summarizer(chunk, min_length=10, max_length=max_length)[0]["summary_text"]
        if verbose:
            print("to_summarize: ", chunk)
            print("summary: ", summary)
        return summary

    for answer in sample(answers, k=len(answers)):
        to_summarize = to_summarize + answer + ", "
        answer_size = len(tokenizer.encode(to_summarize))
        if verbose:
            print(str(answer_size) + " " + to_summarize)
        if answer_size <= min_token_length:
            continue
        summary = summarize_chunk(to_summarize, answer_size)
        to_summarize = ""
        current_token_length += len(tokenizer.encode(summary))
        if current_token_length >= max_token_length:
            break
        summary_list.append(summary)

    # Flush what is left in the buffer. It is empty after a `break`, so this
    # only runs when every answer was visited.
    if to_summarize:
        summary = summarize_chunk(to_summarize, len(tokenizer.encode(to_summarize)))
        if current_token_length + len(tokenizer.encode(summary)) < max_token_length:
            summary_list.append(summary)
    return summary_list

In [None]:
import re

def generate_topic_label(answers: list[str], summarizer, tokenizer, headline_generator, verbose=False) -> str:
    """Generate a short label for a topic from the answers assigned to it.

    The answers are first condensed into chunk summaries, which are joined,
    prefixed with "Titre : " and passed to the headline model.

    Args:
        answers: Texts belonging to the topic.
        summarizer: Summarization callable used to condense the answers.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        headline_generator: Model exposing ``generate``.
        verbose: When true, print the intermediate summaries.

    Returns:
        The decoded headline, without special tokens.
    """
    summary_list = get_summary_list_from_answers_with_answer_sum(answers, summarizer, tokenizer)
    if verbose:
        print(summary_list)
    # A TensorFlow model needs "tf" tensors and a PyTorch one "pt"; Transformers
    # models expose this as `.framework`. Fall back to "pt" otherwise.
    tensor_type = getattr(headline_generator, "framework", "pt")
    encoding = tokenizer.encode("Titre : " + " ".join(summary_list), return_tensors=tensor_type)
    output = headline_generator.generate(encoding, max_length=64)
    # Dropping special tokens by flag rather than by position: slicing [1:-1]
    # cuts a real token when generation stops at max_length without an
    # end-of-sequence token, and leaves extra special tokens in otherwise.
    return tokenizer.decode(output[0], skip_special_tokens=True)


def get_labels_from_topics(doc_infos, i_range, verbose=False, t5=True):
    """Generate and print one label for each of the topics 0 .. i_range - 1.

    Args:
        doc_infos: Frame with a ``Topic`` and a ``Document`` column.
        i_range: Number of topics to label, starting from topic 0.
        verbose: Forwarded to ``generate_topic_label``.
        t5: Model family selector forwarded to the model loaders.

    Returns:
        List of strings of the form "Topic <i> : <label>".
    """
    tokenizer = get_tokenizer(t5)
    summarizer = get_summarizer_pipeline(tokenizer, t5)
    headline_generator = get_headline_generator(t5)

    topic_labels = []
    for topic_id in range(i_range):
        documents = get_docs_from_topic(doc_infos, topic_id)['Document'].values.tolist()
        headline = generate_topic_label(documents, summarizer, tokenizer, headline_generator, verbose)
        label = "Topic " + str(topic_id) + " : " + headline
        print(label)
        topic_labels.append(label)
    return topic_labels

In [None]:
def format_summary(question, summary_list):
    """Build the "Question : ... Réponse : ..." prompt from a question and its summaries.

    Args:
        question: Question text.
        summary_list: Summaries, joined with single spaces.

    Returns:
        The formatted string.
    """
    answer_text = " ".join(summary_list)
    return "".join(["Question : ", question, ". Réponse : ", answer_text])


def generate_topic_summary(answers: list[str], summarizer, tokenizer, question):
    """Summarise all the answers of a topic into one text.

    The answers are condensed into chunk summaries, which are joined and
    summarised once more.

    Args:
        answers: Texts belonging to the topic.
        summarizer: Summarization callable.
        tokenizer: Tokenizer used to measure token lengths.
        question: Currently unused; only the commented-out ``format_summary``
            variant needs it.

    Returns:
        The final summary, or an empty string when there is nothing to summarise.
    """
    summary_list = get_summary_list_from_answers_with_answer_sum(answers, summarizer, tokenizer)
    #formated_summary = format_summary(question, summary_list)
    formated_summary = " ".join(summary_list)
    print(formated_summary)
    # With no intermediate summary the model would be asked to summarise an
    # empty string and could only produce made-up text.
    if not formated_summary.strip():
        return ""
    return summarizer(formated_summary, max_length=150)[0]["summary_text"]


def get_summary_from_topics(doc_infos, i_range, question, t5):
    """Generate and print one summary for each of the topics 0 .. i_range - 1.

    Args:
        doc_infos: Frame with a ``Topic`` and a ``Document`` column.
        i_range: Number of topics to summarise, starting from topic 0.
        question: Forwarded to ``generate_topic_summary``.
        t5: Model family selector forwarded to the model loaders.

    Returns:
        List of strings of the form "Résumé <i> : <summary>".
    """
    tokenizer = get_tokenizer(t5)
    summarizer = get_summarizer_pipeline(tokenizer, t5)

    summaries = []
    for topic_id in range(i_range):
        topic_rows = get_docs_from_topic(doc_infos, topic_id)
        documents = topic_rows['Document'].values.tolist()
        topic_summary = generate_topic_summary(documents, summarizer, tokenizer, question)
        print(topic_summary)
        labelled_summary = "Résumé " + str(topic_id) + " : " + topic_summary
        print(labelled_summary)
        summaries.append(labelled_summary)
    return summaries

In [None]:
# Number of topics to label and summarise: topics with id >= 0 that hold more
# than 2 % of the answers. Filtering on the id excludes the -1 row explicitly,
# instead of subtracting 1 whether or not that row is present and above 2 %.
# NOTE(review): the loops below use range(i_range), which assumes these topics
# are numbered 0 .. i_range - 1 (largest topics first) — confirm.
i_range = int(((answers_per_topic["Topic"] >= 0) & (answers_per_topic["percentage"] > 2)).sum())
i_range

In [None]:
t5 = True
topic_labels = get_labels_from_topics(doc_infos, i_range=i_range, verbose=True, t5=t5)
for topic in topic_labels:
    print(topic)

In [None]:
topic_labels

In [None]:
topic_summaries = get_summary_from_topics(doc_infos, i_range=i_range, question=question, t5=t5)

In [None]:
topic_summaries

In [None]:
looking_for = "solaire"
topic_n = get_docs_from_topic(doc_infos, 3)
# Answers of topic 3 whose text matches `looking_for`.
doc_n = topic_n.loc[topic_n['Document'].str.contains(looking_for), "Document"].values
doc_n

In [None]:
# generate_topic_label needs the models as well as the answers; calling it
# with the answers alone raises a TypeError.
tokenizer = get_tokenizer(t5)
summarizer = get_summarizer_pipeline(tokenizer, t5)
headline_generator = get_headline_generator(t5)
# doc_n holds the topic-3 answers selected in the previous cell.
print(f"Topic 3 label: {generate_topic_label(doc_n.tolist(), summarizer, tokenizer, headline_generator)}")

## Using KeyBert

In [None]:
from keybert import KeyBERT
# Prepare documents 
# NOTE(review): this rebinds `docs` (previously the consultation answers) to the
# 20 Newsgroups dataset, so this section does not run on the survey data —
# presumably a tutorial snippet kept for reference; confirm.
docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']

# Extract keywords
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(docs)

# Create our vocabulary
# Flatten the per-document keyword lists, keep the first item of every entry,
# then de-duplicate through a set (the resulting order is arbitrary).
vocabulary = [k[0] for keyword in keywords for k in keyword]
vocabulary = list(set(vocabulary))
# Then, we pass our vocabulary to BERTopic and train the model:

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): rebinds vectorizer_model, topic_model and topics defined in earlier cells.
vectorizer_model= CountVectorizer(vocabulary=vocabulary)
topic_model = BERTopic(vectorizer_model=vectorizer_model)
topics, probs = topic_model.fit_transform(docs)

In [None]:
topic_model.generate_topic_labels(nr_words=1)