# Latent Dirichlet Allocation (LDA)

LDA (short for Latent Dirichlet Allocation) is an unsupervised machine-learning model that takes documents as input and finds topics as output. The model also says in what percentage each document talks about each topic.

- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
- https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
- https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
- https://github.com/FelixChop/MediumArticles/blob/master/LDA-BBC.ipynb
- https://towardsdatascience.com/the-complete-guide-for-topics-extraction-in-python-a6aaa6cedbbc

In [9]:
import datetime
import copy as c
import random
import collections
import time



import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px 
import re
import nltk
from tqdm import tqdm_notebook
tqdm_notebook().pandas()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm_notebook().pandas()


0it [00:00, ?it/s]

In [2]:
#On récupère directement le dataset nettoyé
url_local=r"C:\Users\Antoine\Downloads\clean_ecb_speeches_dataset.csv"
df=pd.read_csv(url_local,sep = ",",encoding='utf-8')

In [3]:
help(nltk.tokenize.sent_tokenize)

Help on function sent_tokenize in module nltk.tokenize:

sent_tokenize(text, language='english')
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).
    
    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus



In [7]:
N=random.randint(0,len(df))
nltk.tokenize.sent_tokenize(df["contents"][N])

['  The economic crisis and the response of fiscal and monetary policy   Speech by Jürgen Stark, Member of the Executive Board of the ECBDelivered at the Austrian Industrial OrganisationLinz, Austria, 8 June 2009      Introduction    Following the default of Lehman Brothers in mid-September last year, the turmoil in financial markets which had started in August 2007 turned into a major financial crisis.',
 'Liquidity dried up, and credit flows to the economy slowed down.',
 'Problems in the financial system quickly spilled over to the real economy, and an adverse feedback loop between the real economy and the financial sector emerged.',
 'This has led to the most severe and synchronised global economic downturn for 80 years.',
 'The euro area has not been spared.',
 'Economic activity has declined sharply, and inflation is at its lowest level since the launch of the euro.',
 'Monetary and fiscal authorities across the globe have responded quickly and decisively to these extraordinary d

In [5]:
help(nltk.tokenize.word_tokenize)

Help on function word_tokenize in module nltk.tokenize:

word_tokenize(text, language='english', preserve_line=False)
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).
    
    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool



In [8]:
nltk.tokenize.word_tokenize(df["contents"][N])

['The',
 'economic',
 'crisis',
 'and',
 'the',
 'response',
 'of',
 'fiscal',
 'and',
 'monetary',
 'policy',
 'Speech',
 'by',
 'Jürgen',
 'Stark',
 ',',
 'Member',
 'of',
 'the',
 'Executive',
 'Board',
 'of',
 'the',
 'ECBDelivered',
 'at',
 'the',
 'Austrian',
 'Industrial',
 'OrganisationLinz',
 ',',
 'Austria',
 ',',
 '8',
 'June',
 '2009',
 'Introduction',
 'Following',
 'the',
 'default',
 'of',
 'Lehman',
 'Brothers',
 'in',
 'mid-September',
 'last',
 'year',
 ',',
 'the',
 'turmoil',
 'in',
 'financial',
 'markets',
 'which',
 'had',
 'started',
 'in',
 'August',
 '2007',
 'turned',
 'into',
 'a',
 'major',
 'financial',
 'crisis',
 '.',
 'Liquidity',
 'dried',
 'up',
 ',',
 'and',
 'credit',
 'flows',
 'to',
 'the',
 'economy',
 'slowed',
 'down',
 '.',
 'Problems',
 'in',
 'the',
 'financial',
 'system',
 'quickly',
 'spilled',
 'over',
 'to',
 'the',
 'real',
 'economy',
 ',',
 'and',
 'an',
 'adverse',
 'feedback',
 'loop',
 'between',
 'the',
 'real',
 'economy',
 'and

In [19]:
help(nltk.pos_tag)

Help on function pos_tag in module nltk.tag:

pos_tag(tokens, tagset=None, lang='eng')
    Use NLTK's currently recommended part of speech tagger to
    tag the given list of tokens.
    
        >>> from nltk.tag import pos_tag
        >>> from nltk.tokenize import word_tokenize
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."))
        [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'),
        ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')]
        >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal')
        [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'),
        ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')]
    
    NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence.
    
    :param tokens: Sequence of tokens to be tagged
    :type tokens: list(str)
    :param tagset: the tagset to be u

In [23]:
nltk.pos_tag(nltk.tokenize.word_tokenize(df["contents"][N]))

[('The', 'DT'),
 ('economic', 'JJ'),
 ('crisis', 'NN'),
 ('and', 'CC'),
 ('the', 'DT'),
 ('response', 'NN'),
 ('of', 'IN'),
 ('fiscal', 'JJ'),
 ('and', 'CC'),
 ('monetary', 'JJ'),
 ('policy', 'NN'),
 ('Speech', 'NNP'),
 ('by', 'IN'),
 ('Jürgen', 'NNP'),
 ('Stark', 'NNP'),
 (',', ','),
 ('Member', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('Executive', 'NNP'),
 ('Board', 'NNP'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('ECBDelivered', 'NNP'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('Austrian', 'JJ'),
 ('Industrial', 'NNP'),
 ('OrganisationLinz', 'NNP'),
 (',', ','),
 ('Austria', 'NNP'),
 (',', ','),
 ('8', 'CD'),
 ('June', 'NNP'),
 ('2009', 'CD'),
 ('Introduction', 'NNP'),
 ('Following', 'VBG'),
 ('the', 'DT'),
 ('default', 'NN'),
 ('of', 'IN'),
 ('Lehman', 'NNP'),
 ('Brothers', 'NNPS'),
 ('in', 'IN'),
 ('mid-September', 'NN'),
 ('last', 'JJ'),
 ('year', 'NN'),
 (',', ','),
 ('the', 'DT'),
 ('turmoil', 'NN'),
 ('in', 'IN'),
 ('financial', 'JJ'),
 ('markets', 'NNS'),
 ('which', 'WDT'),
 ('had', 'VBD'),
 