In [1]:
#Source du code
#https://larevueia.fr/latent-dirichlet-allocation-topic-modeling-en-python/


In [2]:
#Import de pandas
import pandas as pd

In [3]:
#Installation de nltk et de gensim
# !pip install nltk
# !pip install gensim

In [4]:
#Installation de funcy numexpr et pyLDAvis car pyLDAvis 3.4.1 nécessite les packages funcy et numexpr, qui ne sont pas installés.

# !pip install funcy numexpr
# !pip install pyLDAvis


In [5]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel

In [6]:
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
#Charger le dataset, ici children_covid_analyse.csv
def load_data(path):
    return pd.read_csv(path)
tweets_df=load_data("../../Donnees/analyse_texte/csv/children_covid_analyse.csv") 
#Pour effectuer un autre TM il faudra juste changer le nom du dataset

In [8]:
#La racinisation (stemming) consiste à transformer les mots en forme radicale c’est ce que fait la fonction SnowballStemmer de Nltk.
stemmer = SnowballStemmer('english') #ici anglais car nos tweets sont en anglais

In [9]:
tweets_df['Texte corrigé'].dtype

dtype('O')

In [10]:
#Toutes les lignes de la colonne "Texte corrigé" n'étaient pas toutes de types 'str'
tweets_df['Texte corrigé'] = tweets_df['Texte corrigé'].astype(str)

#Suppression des liens commençant par http et https  
def remove_url(text):
 return re.sub(r'https?:\S*', '', text)
tweets_df['Texte corrigé']=tweets_df['Texte corrigé'].apply(remove_url)
def remove_url1(text):
 return re.sub(r'http?:\S*', '', text)
tweets_df['Texte corrigé']=tweets_df['Texte corrigé'].apply(remove_url1)

In [11]:
#opération de mettre les mots sous leur forme canonique 
#ex: running devient run 
def lemmatize_stemming(text) :
  return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='n'))

In [12]:
#suppression des stopwords (the, or, and…) et des mots de moins de 3 lettres
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result


# Imprimez le résultat intermédiaire pour chaque document
processed_docs = [preprocess(doc) for doc in tweets_df]
for doc in processed_docs:
    # Concaténez les mots pour former une phrase
    doc_text = ' '.join(doc)
    print(preprocess(doc_text))
    



['fichier']
[]
['retweet']
['like']
['répon']
['citat']
['hashtag']
['text']
[]
['phrase']
[]
['sentiment']
['polarité']
['subjectivité']
['phrase', 'nomin']
['text', 'corrigé']
['text', 'tokénizé']


In [14]:
# On stocke les données dans un dictionnaire Gensim et on les convertit au format Bag Of Words(bow).
#On a alors un couple mot/nb d'occurence

# Utilisez la colonne "Texte corrigé" pour le prétraitement
processed_docs = [preprocess(doc) for doc in tweets_df['Texte corrigé']]

# Créez le dictionnaire et le corpus
dictionary = gensim.corpora.Dictionary(processed_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Créez et entraînez le modèle LDA
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=10, # Le nombre de topics à extraire
                                       id2word=dictionary, # Le mapping entre les identifiants et les mots
                                       passes=10000) # Le nombre d'itérations dans le corpus/ passages




# Affichez les topics
topics = []
for idx, topic in lda_model.print_topics(-1) :
    print("Topic: {} -> Words: {}".format(idx, topic))
    topics.append(topic)


Topic: 0 -> Words: 0.078*"vaccin" + 0.077*"child" + 0.072*"covid" + 0.015*"test" + 0.012*"finer" + 0.011*"modern" + 0.009*"trial" + 0.009*"begin" + 0.008*"start" + 0.006*"protect"
Topic: 1 -> Words: 0.041*"vaccin" + 0.035*"covid" + 0.033*"child" + 0.012*"health" + 0.012*"china" + 0.012*"school" + 0.010*"cuti" + 0.009*"googlenew" + 0.009*"sinovac" + 0.008*"literaci"
Topic: 2 -> Words: 0.088*"vaccin" + 0.073*"covid" + 0.069*"child" + 0.014*"help" + 0.011*"question" + 0.011*"famili" + 0.011*"lid" + 0.009*"money" + 0.008*"hopeless" + 0.008*"covid_"
Topic: 3 -> Words: 0.013*"cgtnamerica" + 0.013*"report" + 0.009*"winter" + 0.009*"refund" + 0.009*"stimulus" + 0.008*"news" + 0.006*"detail" + 0.005*"state" + 0.004*"florenc" + 0.004*"srilanka"
Topic: 4 -> Words: 0.057*"vaccin" + 0.049*"covid" + 0.047*"child" + 0.035*"finer" + 0.027*"biontech" + 0.018*"cdcgov" + 0.017*"kiss" + 0.016*"coronavirus" + 0.014*"pidsocieti" + 0.014*"realtimecovid"
Topic: 5 -> Words: 0.120*"vaccin" + 0.087*"covid" + 0.0

temps d'exécution pour 1407 lignes et 1000 passages: 10 min 30.1 s 
pour 2000 passages: 21 min 58.6 s 

In [15]:
#Mesure de la cohérence du modèle
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_docs, dictionary=dictionary)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.398691082380645


Pour 1000 passages et 10 topics on a une cohérence de 0.40353069757626814
Pour 2000 passages et 10 topics  on a une cohérence de 0.41939273995841353

On observe aucune augmentation significative. 
La cohérence du modèle LDA doit être comprise entre 0.4 et 0.7. Dans la documentation il est proposé d'augmenter le nb de passages à 10000 pour avoir un score proche de 70%


In [16]:
#On garde les résultats dans un df
all_topic_model = []
for i in range(len(topics)):
  str = topics[i].split(' + ')
  topic_model = []
  for j in range(10):
    weight = str[j][0:5]
    word = str[j][7:len(str[j])-1]
    topic_model.append((weight, word))
  all_topic_model.append(topic_model)

In [17]:
df_topic_model = pd.DataFrame(all_topic_model)
df_topic_model.rename(index = {0: "Topic 1", 1: "Topic 2", 2: "Topic 3", 3: "Topic 4", 4: "Topic 5", 5: "Topic 6", 6: "Topic 7", 7: "Topic 8", 8: "Topic 9", 9: "Topic 10"})

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Topic 1,"(0.078, vaccin)","(0.077, child)","(0.072, covid)","(0.015, test)","(0.012, finer)","(0.011, modern)","(0.009, trial)","(0.009, begin)","(0.008, start)","(0.006, protect)"
Topic 2,"(0.041, vaccin)","(0.035, covid)","(0.033, child)","(0.012, health)","(0.012, china)","(0.012, school)","(0.010, cuti)","(0.009, googlenew)","(0.009, sinovac)","(0.008, literaci)"
Topic 3,"(0.088, vaccin)","(0.073, covid)","(0.069, child)","(0.014, help)","(0.011, question)","(0.011, famili)","(0.011, lid)","(0.009, money)","(0.008, hopeless)","(0.008, covid_)"
Topic 4,"(0.013, cgtnamerica)","(0.013, report)","(0.009, winter)","(0.009, refund)","(0.009, stimulus)","(0.008, news)","(0.006, detail)","(0.005, state)","(0.004, florenc)","(0.004, srilanka)"
Topic 5,"(0.057, vaccin)","(0.049, covid)","(0.047, child)","(0.035, finer)","(0.027, biontech)","(0.018, cdcgov)","(0.017, kiss)","(0.016, coronavirus)","(0.014, pidsocieti)","(0.014, realtimecovid)"
Topic 6,"(0.120, vaccin)","(0.087, covid)","(0.074, child)","(0.036, kiss)","(0.023, health)","(0.020, parent)","(0.018, anaem)","(0.018, coronavirus)","(0.013, school)","(0.012, canada)"
Topic 7,"(0.097, vaccin)","(0.081, child)","(0.080, covid)","(0.032, finer)","(0.027, age)","(0.019, approv)","(0.018, year)","(0.013, biontech)","(0.011, coronavirus)","(0.011, author)"
Topic 8,"(0.050, vaccin)","(0.048, child)","(0.046, covid)","(0.010, health)","(0.009, want)","(0.007, govern)","(0.007, cuba)","(0.007, later)","(0.005, american)","(0.005, particip)"
Topic 9,"(0.032, vaccin)","(0.028, israel)","(0.026, child)","(0.025, covid)","(0.019, scienc)","(0.015, maga)","(0.015, albert)","(0.015, corrupt)","(0.014, murder)","(0.013, arizona)"
Topic 10,"(0.092, vaccin)","(0.081, covid)","(0.072, child)","(0.013, year)","(0.011, covaxin)","(0.011, move)","(0.008, school)","(0.008, coronavirus)","(0.008, finer)","(0.007, felt)"


In [18]:
import pyLDAvis.gensim_models

In [19]:
#Visu interractive affichant les topics, les mots etc
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)

Attribution des Topics à chaque tweets

In [None]:
# Utilisez la colonne "Texte corrigé" pour le prétraitement
processed_docs = [preprocess(doc) for doc in tweets_df['Texte corrigé']]

# Obtenez la distribution des sujets pour chaque tweet
tweet_topic_distributions = [lda_model.get_document_topics(dictionary.doc2bow(doc)) for doc in processed_docs]


In [None]:
# Choisissez le sujet dominant pour chaque tweet
dominant_topics = [max(topic_dist, key=lambda x: x[1])[0] for topic_dist in tweet_topic_distributions]

# Ajoutez la colonne 'Dominant Topic' au DataFrame
tweets_df['Dominant Topic'] = dominant_topics


In [None]:
# Créer une Mapping pour les Topics
topic_mapping = {}
for i in range(lda_model.num_topics):
    topic_mapping[i] = 'Topic {}'.format(i+1)


In [None]:
tweets_df['Topic Label'] = [topic_mapping[dominant_topic] for dominant_topic in dominant_topics]


In [None]:
# Affichez les premières lignes du DataFrame
tweets_df.head()


Unnamed: 0,Fichier,ID,Nb retweet,Nb like,Nb réponses,Nb citations,Hashtags,Texte,Mots,Phrases,Tags POS,Sentiment,Polarité,Subjectivité,Phrases nominales,Texte corrigé,Texte tokénizé,Dominant Topic,Topic Label
0,1432163370669596672.json,"1,4321633706696E+018",0,1,1,0.0,"['#VACCINE', '#COVID19', '#CHILDREN', '#MASK']",Data doesn't lie. This virus does not effect c...,"['Data', 'does', ""n't"", 'lie', 'This', 'virus'...","[Sentence(""Data doesn't lie.""), Sentence(""This...","[('Data', 'NNP'), ('does', 'VBZ'), (""n't"", 'RB...","Sentiment(polarity=0.0, subjectivity=0.0)",0.0,0.0,"['data', ""n't lie"", 'effect children', '# vacc...",Data doesn't lie. His virus does not effect ch...,"['Data', 'does', ""n't"", 'lie', '.', 'This', 'v...",6,Topic 7
1,1361269067441713153.json,"1,36126906744171E+018",0,0,0,0.0,"['#COVID19', '#CORONAVIRUS', '#HEALTH', '#HEAL...",A new trial is to test how well the Oxford-Ast...,"['A', 'new', 'trial', 'is', 'to', 'test', 'how...","[Sentence(""A new trial is to test how well the...","[('A', 'DT'), ('new', 'JJ'), ('trial', 'NN'), ...","Sentiment(polarity=0.018181818181818174, subje...",0.018182,0.427273,"['new trial', 'oxford-astrazeneca', 'coronavir...",A new trial is to test how well the Oxford-Ast...,"['A', 'new', 'trial', 'is', 'to', 'test', 'how...",7,Topic 8
2,1382650191526965249.json,"1,38265019152697E+018",0,2,0,0.0,"['#CHILDREN', '#KINDER', '#CORONAVIRUSDE', '#C...",'Kids are going to be the key' to get to herd ...,"[""'Kids"", 'are', 'going', 'to', 'be', 'the', '...","[Sentence(""'Kids are going to be the key' to g...","[(""'Kids"", 'NNS'), ('are', 'VBP'), ('going', '...","Sentiment(polarity=0.0, subjectivity=1.0)",0.0,1.0,"['herd immunity # children #', 'kinder', 'karl...",'Lids are going to be the key' to get to herd ...,"[""'Kids"", 'are', 'going', 'to', 'be', 'the', '...",0,Topic 1
3,1436717418840088582.json,"1,43671741884009E+018",0,0,0,0.0,"['#SOUTHAFRICA', '#HEALTH', '#PFIZER', '#COVID...",#SouthAfrica's #health regulator has approved ...,"['SouthAfrica', ""'s"", 'health', 'regulator', '...","[Sentence(""#SouthAfrica's #health regulator ha...","[('SouthAfrica', 'NNP'), (""'s"", 'POS'), ('heal...","Sentiment(polarity=0.18888888888888888, subjec...",0.188889,0.411111,"['southafrica', ""'s # health regulator"", 'pfiz...",#SouthAfrica's #health regulatory has approved...,"['#', 'SouthAfrica', ""'s"", '#', 'health', 'reg...",6,Topic 7
4,1456708686437621764.json,"1,45670868643762E+018",0,0,0,0.0,"['#CHICAGO', '#VACCINE', '#CPS', '#SCHOOL', '#...","""With a week’s notice, #Chicago Public Schools...","['With', 'a', 'week', '’', 's', 'notice', 'Chi...","[Sentence(""""With a week’s notice, #Chicago Pub...","[('With', 'IN'), ('a', 'DT'), ('week', 'NN'), ...","Sentiment(polarity=0.0, subjectivity=0.0666666...",0.0,0.066667,"['week ’ s notice', 'chicago', 'public schools...","""With a week’s notice, #Chicago Public Schools...","['``', 'With', 'a', 'week', '’', 's', 'notice'...",5,Topic 6
