In [1]:
%matplotlib inline

import gensim
import json
import logging
import math
import os
import random
import re
import spacy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis.gensim_models
import warnings

from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.test.utils import datapath

from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder
from nltk.metrics import TrigramAssocMeasures
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import normalize

from stop_words import get_stop_words
from wordcloud import WordCloud

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open(os.path.join('posts_maior_propre.json'), 'r', encoding = 'UTF-8') as fin_M2 :
    data = json.load(fin_M2)

In [3]:
index = ''
a = ''
list_clean_content = []

for j in range(len(data)) :
    index = data[j]
    a = index.replace( "\n", "")
    list_clean_content.append(a)

In [4]:
sp_stop_words = get_stop_words('spanish')
fr_stop_words = get_stop_words('french')
en_stop_words = get_stop_words('english')

In [5]:
words = [ 'plus', 'rien', 'tres', 'ca', 'fr', 'com', 'www', 'http',
          'etais', 'etait', 'etaient', 'meme', 'non', 'bien', 'oui', 'quoi',
          'chez', 'va', 'apres']

In [6]:
stop_words = sp_stop_words + fr_stop_words + en_stop_words + words 

In [7]:
# fonction qui génère les listes de mots (token) à partir des textes
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

# on construit le corpus
data_words = list(sent_to_words(list_clean_content))

In [8]:
doc_set=list_clean_content

In [9]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

# on retire les mots-outils
data_words_nostops = remove_stopwords(data_words)

In [10]:
# création du dictionnaire
dico = corpora.Dictionary(data_words_nostops)

### Filtering rare and extreme vocabulary ### 
dico.filter_extremes( no_below = 5, no_above = 0.5)

# Create Corpus
texts = data_words_nostops

# matrice Term Document Frequency
corpus = [dico.doc2bow(text) for text in texts]

In [11]:
ldamodel = LdaModel(corpus,
                    id2word = dico,
                    num_topics=40, 
                    passes=2, 
                    random_state=100, 
                    per_word_topics=True)

In [12]:
ldamodel.show_topics(num_topics=40,formatted=False)

[(0,
  [('roman', 0.011101952),
   ('dont', 0.008585464),
   ('histoire', 0.007922815),
   ('auteur', 0.005623733),
   ('œuvre', 0.0053327065),
   ('livre', 0.0052345353),
   ('litterature', 0.005192275),
   ('annees', 0.00496529),
   ('personnages', 0.0047211247),
   ('recit', 0.0040478446)]),
 (1,
  [('photo', 0.10563041),
   ('ile', 0.06708874),
   ('mer', 0.04300688),
   ('port', 0.03232534),
   ('cm', 0.021750832),
   ('avion', 0.019874325),
   ('matricule', 0.014484915),
   ('ambre', 0.011962216),
   ('ulysse', 0.011766081),
   ('quentin', 0.011090266)]),
 (2,
  [('sexe', 0.011813908),
   ('masculin', 0.010036173),
   ('animal', 0.0094039515),
   ('croire', 0.00875692),
   ('lucie', 0.0073206644),
   ('dune', 0.006791551),
   ('sonner', 0.0067498265),
   ('feminin', 0.0063092643),
   ('baleine', 0.006126382),
   ('violet', 0.005972298)]),
 (3,
  [('anna', 0.036052696),
   ('victor', 0.029417023),
   ('hugo', 0.026909456),
   ('didier', 0.016143993),
   ('robin', 0.01599371),
   (

In [13]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, gensim.matutils.corpus2csc(corpus), dictionary=ldamodel.id2word)
pyLDAvis.display(vis)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [14]:
ldc = ldamodel[corpus]

In [25]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    i=0
    for i, row in enumerate(ldamodel[corpus]):
        #print(row[0])            
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        #print(row)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel, corpus=corpus, texts=doc_set)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(50)

  sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,33,0.3164,"nuit, yeux, corps, jour, ciel, mots, temps, vo...",le premier jour du premier mois de l'année res...
1,1,0,0.025,"roman, dont, histoire, auteur, œuvre, livre, l...",
2,2,15,0.6686,"etre, dire, chose, faut, reponse, toujours, au...",âmes perduescauses perduesun constat s'impose ...
3,3,33,0.3797,"nuit, yeux, corps, jour, ciel, mots, temps, vo...",pour y croîtrej'ai enterré quelque chose de mo...
4,4,0,0.025,"roman, dont, histoire, auteur, œuvre, livre, l...",
5,5,33,0.2747,"nuit, yeux, corps, jour, ciel, mots, temps, vo...",des sentiments le poursuivent qui lui cuisent ...
6,6,39,0.4677,"monde, vie, homme, amour, mort, etre, temps, c...",y a des journées parfaites et fébriles où j'ar...
7,7,15,0.4668,"etre, dire, chose, faut, reponse, toujours, au...",je ne suis ni normand ni féministe ni chanteur...
8,8,16,0.3261,"terre, espace, lumiere, eau, feu, ciel, ombre,...",je ne sais pas ce qui me peine et je sais à p...
9,9,33,0.4009,"nuit, yeux, corps, jour, ciel, mots, temps, vo...",sans les méchants les saints ne le sont guère ...


In [26]:
sent_topics_sorteddf = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf = pd.concat([sent_topics_sorteddf, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0,0.9607,"roman, dont, histoire, auteur, œuvre, livre, l...",c'est le soir - je retire le texte écrit ce ma...
1,1,0.675,"photo, ile, mer, port, cm, avion, matricule, a...",photo F. Laborde
2,2,0.8375,"sexe, masculin, animal, croire, lucie, dune, s...","A trop grave, manquer les heurs,A trop léger, ..."
3,3,0.6966,"anna, victor, hugo, didier, robin, nicolas, mo...",02 juillet 2020. 05 juillet 2020...
4,4,0.8205,"blog, da, blogspot, blogs, jose, ana, didier, ...",Ana pourquoi tu t 'appelles Angeles Ana Angele...
5,5,0.8478,"berri, mort, uqam, tait, moscou, connaissez, e...",interruption au milieu - reste l'oiseau bec su...
6,6,0.7375,"police, https, facebook, vote, people, org, ju...","Credit: NASA, ESA, F. Paresce (INAF-IASF, Bolo..."
7,7,0.8124,"pays, france, contre, guerre, francais, politi...",J'ai été condamné à seulement 807 années de pr...
8,8,0.6796,"pen, chiffres, ira, alex, tourne, figaro, port...","un homme marche au bord du chemin blanc , s'ar..."
9,9,0.9639,"pain, table, manger, cuisine, mange, huile, so...",A cassé B cassé C cassé D cassé E cassé F cass...


In [27]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
topic_counts

33    24600
25    17002
0     15230
15    14978
32     6348
14     4681
39     3877
28     3759
7      3719
38     2769
36     2710
16     1795
26     1520
37     1353
31     1336
19     1321
23      559
18      525
6       335
9       329
30      233
34      221
4       220
1       177
24      171
5       156
17      142
2       126
27      104
22      101
13       73
11       66
3        66
29       57
20       53
35       45
21       45
8        36
10       35
12       22
Name: Dominant_Topic, dtype: int64