### Réduction des dimensions

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
%%time 
import gensim
import gensim.corpora as corpora
from gensim import models
from gensim.utils import simple_preprocess
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import CoherenceModel
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)


def compute_coherence_values(dictionary, corpus, texts, limit, mallet=False, start=2, step=3):
    """
    Train one topic model per candidate number of topics and score each with c_v coherence.

    Args:

        dictionary : Gensim dictionary, used as the id2word mapping of every model
            and by the coherence model
        corpus : Gensim corpus the models are trained on
        texts : List of input texts, used to compute the c_v coherence
        limit : Max num of topics (exclusive upper bound of the sweep)
        mallet : If truthy, train with the LdaMallet wrapper instead of LdaMulticore.
            That branch reads a notebook-level variable named ``mallet_path``,
            which must be defined before the call.
        start : First number of topics to try
        step : Increment between two successive numbers of topics

    Returns:

        model_list : List of trained topic models, one per value of
            range(start, limit, step)
        coherence_values : Coherence values corresponding to the models,
            in the same order as model_list
    """
    coherence_values = []
    model_list = []

    for num_topics in range(start, limit, step):

        if mallet:
            # Use the `dictionary` argument here: the previous code read a global
            # named `id2word`, which may be undefined or belong to another corpus.
            model = gensim.models.wrappers.LdaMallet(mallet_path=mallet_path,
                                                     corpus=corpus,
                                                     num_topics=num_topics,
                                                     id2word=dictionary,
                                                     prefix='temp_file_',
                                                     workers=7)
        else:
            model = LdaMulticore(corpus=corpus,
                                 id2word=dictionary,
                                 num_topics=num_topics,
                                 random_state=42,  # fixed seed so the sweep is repeatable
                                 chunksize=100,
                                 passes=10,
                                 workers=7)

        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

#model_list, coherence_values = compute_coherence_values(mallet=False, dictionary=id2word, corpus=corpus_tfidf, texts=texts, start=2, limit=51, step=1)

CPU times: user 564 ms, sys: 817 ms, total: 1.38 s
Wall time: 335 ms


In [4]:
def format_topics_sentences(ldamodel, corpus, texts):
    """
    Build a table giving the dominant topic of every document.

    Args:

        ldamodel : Topic model; ``ldamodel[corpus]`` must yield, per document,
            an iterable of (topic_num, proportion) pairs, and
            ``ldamodel.show_topic(topic_num)`` must return (word, weight) pairs
        corpus : Corpus passed to ``ldamodel[...]``
        texts : Original texts, appended as the last column of the output

    Returns:

        DataFrame with the columns 'Dominant_Topic', 'Perc_Contribution'
        (rounded to 4 decimals) and 'Topic_Keywords', followed by one unnamed
        column holding ``texts``. Rows are aligned on the document position;
        a document for which the model returns no topic keeps its text and has
        missing values in the three topic columns.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    records = []
    doc_ids = []
    keywords_by_topic = {}  # the keyword string only depends on the topic: compute it once

    for doc_id, row in enumerate(ldamodel[corpus]):
        row = list(row)
        if not row:
            # No topic for this document: skip it but keep its position free so
            # the texts stay aligned with the right rows in the final concat.
            continue

        # Dominant topic = pair with the highest proportion (first one on ties)
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])

        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))
        doc_ids.append(doc_id)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and growing
    # a frame row by row is quadratic in the number of documents.
    sent_topics_df = pd.DataFrame(records, columns=column_names, index=doc_ids)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


# df_topic_sents_keywords = format_topics_sentences(ldamodel=lda, corpus=corpus_tfidf, texts=html_cleaned_texts)

# # Format
# df_dominant_topic = df_topic_sents_keywords.reset_index()
# df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# # Show
# df_dominant_topic.head(10)

### Avec bigrammes lemmatisés

In [None]:
%%time
import spacy
# Load the small English spaCy pipeline with the 'parser' and 'ner' components disabled.
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])

# One entry per row of the 'tokenized_post' column, as a plain Python list.
# NOTE(review): filtered_data is not defined in the visible cells; presumably each
# entry is already a list of tokens — confirm against the cell that builds it.
data = filtered_data['tokenized_post'].values.tolist()

# Fit a gensim Phrases model on the documents (min_count=1000, threshold=100),
# then wrap it in a Phraser that is used to transform documents.
bigram = gensim.models.Phrases(data, min_count=1000, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Apply the phraser to every document.
# NOTE(review): data_bigrams is not read anywhere in the visible cells; the same
# transformation is computed again through make_bigrams(data) further down.
data_bigrams = [bigram_mod[doc] for doc in data]

def make_bigrams(texts):
    """Run every document of `texts` through the notebook-level `bigram_mod` and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """
    Lemmatize tokenized documents, keeping only tokens with an allowed part of speech.

    Each document is joined with single spaces and processed by the notebook-level
    spaCy pipeline `nlp`.

    Args:

        texts : Iterable of documents, each one an iterable of string tokens
        allowed_postags : Coarse part-of-speech tags (compared with `token.pos_`) to keep

    Returns:

        List with one list of lemmas (`token.lemma_`) per input document,
        in the same order as `texts`
    """
    # Build the lookup once instead of scanning a list for every token
    wanted_tags = frozenset(allowed_postags)
    # nlp.pipe processes the documents as a stream, in batches, instead of
    # paying the per-call overhead of nlp(...) for every document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in wanted_tags]
        for doc in nlp.pipe(joined_docs)
    ]

# Merge frequent bigrams, then keep only the lemmas of nouns, adjectives, verbs and adverbs
bigrams_data = make_bigrams(data)
lemmatized_data = lemmatization(bigrams_data, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# The bag-of-words corpus must be encoded from the same documents the dictionary
# was built on (lemmatized_data), not from an unrelated `texts` variable.
id2word = corpora.Dictionary(lemmatized_data)
bow_corpus = [id2word.doc2bow(text) for text in lemmatized_data]
corpus = bow_corpus  # same content; name kept for cells that read `corpus`

# Sweep bounds are defined once and shared by the training call and the plot
start = 2; limit = 51; step = 1
model_list, coherence_values = compute_coherence_values(mallet=False, dictionary=id2word, corpus=bow_corpus, texts=lemmatized_data, start=start, limit=limit, step=step)

x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# A list is required here: a bare string is iterated character by character
# and the curve ends up labelled "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
%%time
from pprint import pprint
# Pick the model at index 9 of the sweep. With start=2 and step=1, as passed to
# compute_coherence_values in the visible cells, that is the 11-topic model.
# NOTE(review): the index is hard-coded — confirm it still matches the best
# coherence score after re-running the sweep.
optimal_model = model_list[9]
# NOTE(review): model_topics is not read anywhere in the visible cells.
model_topics = optimal_model.show_topics(formatted=False)
# Pretty-print the topics returned by print_topics, with 10 words per topic
pprint(optimal_model.print_topics(num_words=10))

In [None]:
%%time
# Build the per-document dominant-topic table from the selected model.
# NOTE(review): html_cleaned_texts is not defined in the visible cells; presumably it
# holds one text per document, in the same order as bow_corpus — confirm.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=bow_corpus, texts=html_cleaned_texts)

# Format: reset_index() turns the row index into a leading column, which becomes
# 'Document_No'; the four columns returned by format_topics_sentences are renamed after it
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show the first 10 rows
df_dominant_topic.head(10)

In [None]:
# Text of the first document (row label 0)
print("Publication du premier document\n")
display(df_dominant_topic.at[0, 'Text'])

# Dominant topic number, then the heading of the keyword section, emitted in a
# single call: joining the pieces with "\n" yields the same output as one
# print() per piece.
print(
    "\n",
    f"Numéro du topic: {df_dominant_topic.at[0, 'Dominant_Topic']}",
    "\n",
    "Mots clés associés\n",
    sep="\n",
)

# Keywords of that dominant topic
display(df_dominant_topic.at[0, 'Keywords'])

In [None]:
# For every dominant topic, keep the single document with the highest contribution
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# Collect the best row of each group and concatenate once, instead of repeatedly
# concatenating onto an initially empty DataFrame inside the loop.
sent_topics_sorteddf = pd.concat(
    [grp.sort_values('Perc_Contribution', ascending=False).head(1)
     for _, grp in sent_topics_outdf_grpd],
    axis=0,
).reset_index(drop=True)  # fresh 0..k-1 index, one row per topic

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf

In [None]:
# Print, for each topic, its number, its keywords and its most representative text,
# followed by a separator line.
separator = "=" * 50
for topic_idx, topic_row in sent_topics_sorteddf.iterrows():
    # Joining the pieces with "\n" produces the same output as one print() per piece
    report_lines = [
        f"Numéro du topic: {topic_row['Topic_Num']}",
        "\n",
        "Mots clés associes au topic:\n",
        f"{topic_row['Keywords']}",
        "\n",
        "Publication la plus significative du topic:",
        "\n",
        f"{topic_row['Text']}",
        "\n",
        separator,
    ]
    print(*report_lines, sep="\n")

In [None]:
# Number of Documents for Each Topic (indexed by topic number)
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic (indexed by topic number)
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords (indexed by row position 0..k-1)
topic_num_keywords = sent_topics_sorteddf[["Topic_Num","Keywords"]]

# Look the counts up by topic number. A column-wise concat would align the row
# positions of topic_num_keywords with the topic-number labels of the counts,
# which mixes up rows as soon as some topic is never the dominant one.
df_dominant_topics = topic_num_keywords.copy()
df_dominant_topics['Num_Documents'] = df_dominant_topics['Topic_Num'].map(topic_counts)
df_dominant_topics['Perc_Documents'] = df_dominant_topics['Topic_Num'].map(topic_contribution)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

In [None]:
%%time

from IPython.display import HTML
css_str = '<style> \
.jp-icon-warn0 path {fill: var(--jp-warn-color0);} \
.bp3-button-text path { fill: var(--jp-inverse-layout-color3);} \
.jp-icon-brand0 path { fill: var(--jp-brand-color0);} \
text.terms { fill: #616161;} \
</style>'
display(HTML(css_str))

import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(optimal_model, bow_corpus, id2word)
pyLDAvis.save_html(vis, 'lda.html')
display(HTML('lda.html'))

## Avec données importées

In [None]:
%%time
# Load the cleaned corpus and split each text into tokens on single spaces
data = pd.read_csv("./data/cleaned_corpus.csv")
texts = data['Text'].to_list()
texts = [tokens.split(" ") for tokens in texts]

# Dictionary and bag-of-words corpus built from the same tokenized texts
id2word = corpora.Dictionary(texts)
bow_corpus = [id2word.doc2bow(text) for text in texts]
corpus = bow_corpus  # same content; name kept for cells that read `corpus`

# Sweep bounds are defined once and shared by the training call and the plot
start = 2; limit = 51; step = 1
model_list, coherence_values = compute_coherence_values(mallet=False, dictionary=id2word, corpus=bow_corpus, texts=texts, start=start, limit=limit, step=step)

x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# A list is required here: a bare string is iterated character by character
# and the curve ends up labelled "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))