# Topic Modelling and LDA

**Documentation :**

https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05

https://towardsdatascience.com/nlp-extracting-the-main-topics-from-your-dataset-using-lda-in-minutes-21486f5aa925

https://towardsdatascience.com/the-complete-guide-for-topics-extraction-in-python-a6aaa6cedbbc

https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21

https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/topic_modeling_Gensim.ipynb


https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb


https://pypi.org/project/spacy-lefff/


https://radimrehurek.com/gensim/models/tfidfmodel.html

https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html



## Importing the necessary libraries : 

In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import pyLDAvis
import pyLDAvis.gensim

import numpy as np
import pandas as pd

In [2]:
# Raise pandas' display limits (number of columns, number of rows, cell width).
display_settings = {
    'display.max_columns': 50,
    'display.max_rows': 100,
    'display.max_colwidth': 500,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

## Importing the dataset :

In [3]:
# Load the contributions CSV into a DataFrame.
csv_path = 'Donnees_clean/contributions_coordonnees_insee/data_ecolo_coord_insee.csv'
data_ecolo = pd.read_csv(csv_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Keep only the 'title' column; the pre-processing loop further down iterates over it.
data_ecolo_title=data_ecolo['title']

## Pre-processing :

- tokenization
- removing stopwords
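- words are lemmatized (skipped here: NLTK's WordNetLemmatizer does not handle French; a French lemmatizer such as spacy-lefff, linked above, would be needed)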
- words are stemmed

## Stopwords in French :

In [5]:
from nltk.corpus import stopwords

# NLTK stores its stop-word lists under lowercase file ids ('french'); the
# capitalised 'French' only resolves on case-insensitive filesystems.
All_Stopwords = stopwords.words('french')
print(All_Stopwords)

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aur

## Words are stemmed : 

In [6]:
# Tokenize and stem:
stemmer = SnowballStemmer("french")
# Frozen copy of the stop-word list, built once: set membership is O(1) per
# token, whereas `in All_Stopwords` scans the whole list for every token.
_STOPWORD_SET = frozenset(All_Stopwords)


def preprocess(text):
    """Tokenize ``text``, drop stop words and short tokens, and stem the rest.

    Args:
        text: Raw text of one document (``str`` or ``bytes``). Any other value,
            e.g. the float NaN pandas uses for a missing cell, is treated as
            an empty document.

    Returns:
        list: Stems (French Snowball stemmer) of the tokens that are not in
        ``All_Stopwords`` and have at least 3 characters, in document order.
    """
    # Non-text input would make simple_preprocess raise; treat it as empty.
    if not isinstance(text, (str, bytes)):
        return []
    # https://tedboy.github.io/nlps/generated/generated/gensim.utils.simple_preprocess.html
    # NOTE(review): per the gensim docs, simple_preprocess defaults to
    # max_len=15 and deacc=False, so longer words are dropped and accented /
    # unaccented spellings stay distinct -- confirm this is intended.
    return [
        stemmer.stem(token)
        for token in gensim.utils.simple_preprocess(text)
        if len(token) >= 3 and token not in _STOPWORD_SET
    ]

## Applying the pre-processing to every title :

In [7]:
import random

# Run the pre-processing on every title; each entry of text_data is the list
# of stems returned for one title.
text_data = [preprocess(title) for title in data_ecolo_title]

## Dictionary :

In [8]:
# Build the gensim Dictionary (integer id <-> token mapping) from the stemmed titles.
dictionary = gensim.corpora.Dictionary(text_data)

In [9]:
from itertools import islice

# Preview the first 11 (id, token) pairs of the dictionary.
# items() is part of the Mapping interface and exists in every gensim release,
# whereas iteritems() was a Python 2 compatibility alias dropped in gensim 4.
for token_id, token in islice(dictionary.items(), 11):
    print(token_id, token)

0 impos
1 pass
2 transit
3 voulu
4 écolog
5 air
6 plus
7 pollu
8 abon
9 absent
10 automobil


In [11]:

# Bag-of-words corpus: every tokenized title is converted with
# dictionary.doc2bow into (token_id, count) pairs and stored in 'bow_corpus'.
bow_corpus = list(map(dictionary.doc2bow, text_data))

In [12]:
# Show the bag-of-words entries of one sample pre-processed document.
document_num = 42
bow_doc_x = bow_corpus[document_num]

line_template = "Word {} (\"{}\") appears {} time."
for word_id, occurrences in bow_doc_x:
    print(line_template.format(word_id, dictionary[word_id], occurrences))

Word 121 ("animal") appears 1 time.
Word 122 ("débat") appears 1 time.
Word 123 ("expériment") appears 1 time.
Word 124 ("national") appears 1 time.


In [13]:
# Fit a 10-topic LDA model on the bag-of-words corpus with
# gensim.models.LdaMulticore and keep it in 'lda_model'.
lda_settings = dict(num_topics=10, id2word=dictionary, passes=10, workers=2)
lda_model = gensim.models.LdaMulticore(bow_corpus, **lda_settings)

In [14]:
# For every topic, print the words occurring in it with their relative weights.
all_topics = lda_model.print_topics(-1)
for topic_id, weighted_words in all_topics:
    print("Topic: {} \nWords: {}".format(topic_id, weighted_words))
    print("\n")

Topic: 0 
Words: 0.045*"pollueur" + 0.040*"plus" + 0.036*"tax" + 0.034*"moin" + 0.026*"fair" + 0.020*"pollu" + 0.019*"terr" + 0.019*"payeur" + 0.013*"entrepris" + 0.012*"pai"


Topic: 1 
Words: 0.053*"tax" + 0.046*"tout" + 0.045*"tous" + 0.025*"agir" + 0.017*"carbon" + 0.015*"interdir" + 0.015*"emballag" + 0.014*"mond" + 0.013*"pesticid" + 0.013*"plastiqu"


Topic: 2 
Words: 0.096*"transport" + 0.043*"commun" + 0.043*"chang" + 0.034*"développ" + 0.032*"agricultur" + 0.030*"stop" + 0.021*"vill" + 0.020*"biodivers" + 0.018*"vivr" + 0.017*"mieux"


Topic: 3 
Words: 0.159*"ecolog" + 0.039*"bon" + 0.037*"aven" + 0.037*"vi" + 0.034*"enfant" + 0.030*"sen" + 0.026*"proposit" + 0.022*"véhicul" + 0.014*"meilleur" + 0.012*"prendr"


Topic: 4 
Words: 0.066*"planet" + 0.042*"pollut" + 0.036*"économ" + 0.030*"urgenc" + 0.029*"vrai" + 0.027*"polit" + 0.018*"écolog" + 0.017*"mondial" + 0.016*"humain" + 0.015*"raison"


Topic: 5 
Words: 0.059*"climat" + 0.057*"consomm" + 0.050*"non" + 0.033*"voitur" + 

### Visualization :

In [15]:
# Enable notebook rendering, then prepare and display the pyLDAvis view of
# the bag-of-words LDA model.
pyLDAvis.enable_notebook()
LDAvis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(LDAvis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## TF-IDF & LDA : 

**Documentation :**

https://towardsdatascience.com/transforming-tokens-into-useful-features-bow-tf-idf-e924ed536bcf

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [45]:
"""
from sklearn.feature_extraction.text import TfidfVectorizer 
import pandas as pd 
tfidf = TfidfVectorizer(min_df=2,max_df=0.5,ngram_range=(1,2)) 
tfidf_corpus=tfidf.fit_transform(data_ecolo_title)
#pd.DataFrame(tfidf_corpus.todense(),columns=tfidf.get_feature_names())
"""

In [15]:
"""
>>> import gensim.downloader as api
>>> from gensim.models import TfidfModel
>>> from gensim.corpora import Dictionary
>>>
>>> dataset = api.load("text8")
>>> dct = Dictionary(dataset)  # fit dictionary
>>> corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
>>>
>>> model = TfidfModel(corpus)  # fit model
>>> vector = model[corpus[0]]  # apply model to the first corpus document

"""

'\nimport gensim.downloader as api\nfrom gensim.models import TfidfModel\nfrom gensim.corpora import Dictionary\n\ndataset = api.load("text8")\ndct = Dictionary(dataset)  # fit dictionary\ncorpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format\n\nmodel = TfidfModel(corpus)  # fit model\nvector = model[corpus[0]]  # apply model to the first corpus document\n\n'

In [46]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

# Fit the TF-IDF model on the bag-of-words corpus, then apply it to every
# document to obtain the re-weighted corpus.
model = TfidfModel(bow_corpus)
tfidf_corpus = [model[bow_doc] for bow_doc in bow_corpus]

In [47]:
# Inspect the TF-IDF vector of one sample document (index 105).
print(tfidf_corpus[105])

[(6, 0.10630688106798118), (10, 0.16912687859842718), (64, 0.14980552494211433), (97, 0.10277655557397265), (111, 0.17717815981646975), (120, 0.15710070254590772), (124, 0.16807640136779797), (137, 0.11590120509598058), (251, 0.2447252384137833), (252, 0.18860484270421257), (253, 0.33549200234271476), (254, 0.2254784353875924), (255, 0.30029610632445053), (256, 0.1371748165693755), (257, 0.3132858643411846), (258, 0.1735066473240022), (259, 0.3132858643411846), (260, 0.1747549877903323), (261, 0.20078758864981927), (262, 0.20952934239551896), (263, 0.2311133892938003), (264, 0.2586713864123982)]


Applying LDA to the TF-IDF corpus :

In [54]:
# Fit a 5-topic LDA model on the TF-IDF corpus with
# gensim.models.LdaMulticore and keep it in 'lda_model_tfidf'.
tfidf_lda_settings = dict(num_topics=5, id2word=dictionary, passes=10, workers=2)
lda_model_tfidf = gensim.models.LdaMulticore(tfidf_corpus, **tfidf_lda_settings)

In [55]:
# For every topic of the TF-IDF model, print the words occurring in it with
# their relative weights.
tfidf_topics = lda_model_tfidf.print_topics(-1)
for topic_id, weighted_words in tfidf_topics:
    print("Topic: {} \nWords: {}".format(topic_id, weighted_words))
    print("\n")

Topic: 0 
Words: 0.064*"transit" + 0.062*"écolog" + 0.031*"transport" + 0.020*"urgenc" + 0.016*"aven" + 0.015*"commun" + 0.014*"développ" + 0.013*"énerget" + 0.013*"proposit" + 0.012*"énerg"


Topic: 1 
Words: 0.084*"ecolog" + 0.037*"pollut" + 0.036*"environ" + 0.020*"contribu" + 0.015*"nucléair" + 0.014*"voitur" + 0.011*"air" + 0.010*"plastiqu" + 0.010*"déchet" + 0.008*"econom"


Topic: 2 
Words: 0.028*"climat" + 0.021*"chang" + 0.020*"planet" + 0.015*"priorit" + 0.013*"agir" + 0.012*"environ" + 0.012*"vi" + 0.011*"respect" + 0.011*"terr" + 0.010*"mond"


Topic: 3 
Words: 0.019*"respons" + 0.015*"bon" + 0.013*"sen" + 0.012*"punit" + 0.012*"écolog" + 0.011*"agricultur" + 0.011*"citoyen" + 0.010*"tous" + 0.009*"bio" + 0.009*"non"


Topic: 4 
Words: 0.031*"tax" + 0.023*"pollueur" + 0.014*"moin" + 0.013*"payeur" + 0.011*"mieux" + 0.011*"pollu" + 0.010*"plus" + 0.010*"carbon" + 0.010*"vivr" + 0.010*"consomm"




Visualization :

In [56]:
# Enable notebook rendering, then prepare and display the pyLDAvis view of
# the TF-IDF LDA model.
pyLDAvis.enable_notebook()
LDAvis_tfidf = pyLDAvis.gensim.prepare(lda_model_tfidf, tfidf_corpus, dictionary)
pyLDAvis.display(LDAvis_tfidf)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## LSA : 

## TF-IDF & SVD :
https://www.kaggle.com/shivam1600/simple-information-retrieval-using-tf-idf-and-lsa