# LDA Model for abstract analysis of Brazilian Administrative Decrees

## Imports

In [None]:
# Install the exact package versions this notebook was run with (pins below).
# The Portuguese spaCy model is installed straight from its release tarball.
!pip install pyLDAvis==3.2.1
!pip install spacy==2.2.4
!pip install https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz
!pip install nltk==3.5

Collecting https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz (21.2 MB)
[K     |████████████████████████████████| 21.2 MB 5.4 MB/s 


In [None]:
import sqlite3
import pandas as pd
import numpy as np
import json

import nltk
import gensim
# Fetch the NLTK packages named below ('stopwords', 'punkt', 'rslp') when this
# cell runs; NLTK reports them as already up-to-date on repeated runs.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

import pyLDAvis.gensim
import pickle 
import pyLDAvis

import math
from matplotlib import pyplot as plt

import spacy
import io

import pt_core_news_sm
# Module-level spaCy pipeline for Portuguese; lemmatization() reads this global.
nlp = pt_core_news_sm.load()

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from pprint import pprint

import logging
import itertools
# INFO-level logging with a "LEVEL : message" layout for the whole session.
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
  from collections import Iterable
  from collections import MutableMapping, Sequence  # noqa
  from collections import MutableMapping, Sequence  # noqa
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):


In [None]:
# Print main versions used, so the recorded outputs can be matched to an
# environment.
print("pt_core_news_sm version:", pt_core_news_sm.__version__)
print("Spacy version:", spacy.__version__)
print("NLTK version:", nltk.__version__)
# sklearn is imported here as a top-level package only to read its version;
# the setup cell imports just its submodules.
import sklearn
print("Scikit version:", sklearn.__version__)

pt_core_news_sm version: 2.2.5
Spacy version: 2.2.4
NLTK version: 3.5
Scikit version: 1.0.1


## Get database

In [None]:
# This code was executed on Google Colab. Upload the file "radar_db" version 0.1 to your drive. This file is available on Kaggle.
from google.colab import drive
from google.colab import files  # NOTE(review): not used in the cells visible here

# Make Google Drive available under /content/gdrive.
drive.mount('/content/gdrive')
# Path of the SQLite database; get_decretos() and get_ementa() read this global.
database_file = '/content/gdrive/MyDrive/database - radar/radar_db' # replace with your drive link

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Preprocessing functions

In [None]:
def get_decretos():
  """Load every row of the ``Decretos`` table into a DataFrame.

  Reads from the SQLite file named by the module-level ``database_file``.

  Returns:
    pandas.DataFrame with one row per decree and the columns
    ``cod_ident_ato`` (as stored), ``full_text`` (``str(ementa)`` and
    ``str(full_text)`` joined by a single space) and ``referenda`` (list of
    the ``;``-separated entries, each stripped of surrounding whitespace;
    an empty list when the stored value is NULL).
  """
  conn = sqlite3.connect(database_file)
  try:
    rows = conn.execute(
        "SELECT cod_ident_ato, ementa, full_text, referenda FROM Decretos"
    ).fetchall()
  finally:
    # Release the connection even when the query itself fails.
    conn.close()

  records = []
  for cod_ident_ato, ementa, full_text, referenda in rows:
    if referenda is None:
      # A NULL referenda has no entries; calling .split on None would raise.
      refs = []
    else:
      refs = [ref.strip() for ref in referenda.split(";")]
    records.append((cod_ident_ato, str(ementa) + " " + str(full_text), refs))

  # Naming the columns in the constructor keeps the schema on an empty table,
  # where assigning three names to a zero-column frame would fail.
  return pd.DataFrame(records, columns=['cod_ident_ato', 'full_text', 'referenda'])

def get_ementa():
  """Fetch the ``ementa`` column of every row in the ``Decretos`` table.

  Reads from the SQLite file named by the module-level ``database_file``.

  Returns:
    list of 1-tuples ``(ementa,)``, exactly as produced by ``fetchall``.
  """
  conn = sqlite3.connect(database_file)
  try:
    return conn.execute("SELECT ementa FROM Decretos").fetchall()
  finally:
    # Runs on both the normal return and a failed query, so the
    # connection is never left open.
    conn.close()

def preprocess_ementa(decretos_df):
  """Normalize the ``ementa`` column into lowercase, space-separated tokens.

  For each text: ``/`` becomes a space and ``º`` is dropped, the text is
  tokenized with ``word_tokenize``, purely alphabetic or purely numeric
  tokens are kept (lowercased), the punctuation marks ``. ! ? ; :`` are
  rewritten as `` . ``, and numbers written with dots such as ``8.642`` are
  kept with the dots removed. Tokens that start with a digit and contain a
  comma are discarded, as is every other token.

  Args:
    decretos_df: DataFrame with an ``ementa`` column. The column is
      overwritten on this same object with the normalized text.

  Returns:
    The same ``decretos_df`` object that was passed in.
  """
  breaking_punctuation = {'.', '!', '?', ';', ':'}
  processed = [_normalize_ementa(raw, breaking_punctuation)
               for raw in decretos_df['ementa']]
  # Whole-column assignment instead of chained ``df['ementa'].iloc[j] = ...``,
  # which pandas does not guarantee to write back to the frame.
  decretos_df['ementa'] = processed
  return decretos_df


def _normalize_ementa(raw, breaking_punctuation):
  """Return the normalized form of a single ementa value."""
  # Missing values (None / NaN) carry no text to normalize.
  if not isinstance(raw, str):
    return ""

  tokens = word_tokenize(raw.replace("/", " ").replace("º", ''))
  parts = []
  for i, word in enumerate(tokens):
    # Article numbering such as "art . 5 .": blank the period that follows the
    # number so it is not emitted as a sentence break. The bounds check covers
    # texts where "art" is one of the last three tokens.
    if word.lower() == "art" and i + 3 < len(tokens):
      if tokens[i+2].isnumeric() and tokens[i+3] == ".":
        tokens[i+3] = ' '

    if word.isalpha() or word.isnumeric():
      parts.append(word.lower() + " ")
    elif word in breaking_punctuation:
      # Standardize the listed punctuation marks as a full stop.
      parts.append(" . ")
    elif word[0].isnumeric() and ("," not in word):
      # Numbers with dots inside (e.g. "8.642"). Numbers with a comma are
      # skipped; per the original note they commonly refer to currency.
      digits = word.replace(".", "")
      if digits.isnumeric():
        parts.append(digits + " ")

  text = "".join(parts)
  # The abbreviation period after "art"/"arts" is not a sentence break.
  text = text.replace("art  .", " art ")
  text = text.replace("arts  .", " arts ")
  # NOTE: no signature stripping happens here; the earlier " república"
  # lookup only computed an index and never changed the text.
  return text

def removeCommomStopWords(texts, st=None):
    """Drop stop words and the ``.`` sentence marker from each text.

    Args:
        texts: iterable of strings; each one is split on single spaces.
        st: collection of stop words. When omitted, NLTK's Portuguese list is
            loaded at call time, so defining this function does not require
            the corpus to be available yet.

    Returns:
        list of strings, one per input text, in which every kept token is
        followed by a single space. Empty tokens produced by consecutive
        spaces are kept, as before.
    """
    if st is None:
        st = stopwords.words('portuguese')
    # Build the lookup once: set membership instead of scanning a list for
    # every token of every text.
    stop_set = set(st)

    texts_r = []
    for t in texts:
        kept = [w for w in t.split(" ") if w not in stop_set and w != "."]
        texts_r.append("".join(w + " " for w in kept))
    return texts_r

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']): #'NOUN', 'ADJ', 'VERB', 'ADV'
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of documents, each a list of string tokens.
        allowed_postags: ``pos_`` tags whose tokens are kept.

    Returns:
        list of strings, one per document: the lemmas of the kept tokens
        joined by single spaces. A kept token whose lemma is ``-PRON-``
        contributes an empty string instead of its lemma.
    """
    def _lemmas_of(tokens):
        # Each document is re-joined and parsed by the module-level pipeline.
        parsed = nlp(" ".join(tokens))
        kept = []
        for tok in parsed:
            if tok.pos_ not in allowed_postags:
                continue
            lemma = tok.lemma_
            kept.append('' if lemma == '-PRON-' else lemma)
        return " ".join(kept)

    return [_lemmas_of(sent) for sent in texts]



In [None]:
# Load the decrees, then attach the raw ementa as its own column.
decretos = get_decretos()
# NOTE(review): assumes both queries return rows in the same order, since the
# two result sets are matched by position/index — confirm.
decretos['ementa'] = pd.DataFrame(get_ementa())
# Only the text columns are needed from here on.
decretos = decretos.drop(columns = ['cod_ident_ato','referenda'], axis=1)
decretos

Unnamed: 0,full_text,ementa
0,Dispõe sobre o Comitê Consultivo de Nanotecnol...,Dispõe sobre o Comitê Consultivo de Nanotecnol...
1,"Altera o Decreto nº 8.642 , de 19 de janeiro ...","Altera o Decreto nº 8.642 , de 19 de janeiro ..."
2,Promulga o Acordo entre o Governo da República...,Promulga o Acordo entre o Governo da República...
3,Promulga o Acordo entre o Governo da República...,Promulga o Acordo entre o Governo da República...
4,Dispõe sobre o Comitê Interministerial de Tecn...,Dispõe sobre o Comitê Interministerial de Tecn...
...,...,...
6836,"PROMULGA O PROTOCOLO AO TRATADO DE AMIZADE, CO...","PROMULGA O PROTOCOLO AO TRATADO DE AMIZADE, CO..."
6837,PRORROGA A VIGÊNCIA DOS RESTOS A PAGAR QUE ESP...,PRORROGA A VIGÊNCIA DOS RESTOS A PAGAR QUE ESP...
6838,"ALTERA O DECRETO N. 2.889, DE 21/12/1998, QUE ...","ALTERA O DECRETO N. 2.889, DE 21/12/1998, QUE ..."
6839,"REVOGA O ART. 20 DO DECRETO N. 2.451, DE 05/01...","REVOGA O ART. 20 DO DECRETO N. 2.451, DE 05/01..."


## Text analysis

In [None]:
# preprocess_ementa overwrites decretos['ementa'] and returns the same frame,
# so `aux` and `decretos` refer to one object. Re-running this cell therefore
# processes already-normalized text.
aux = preprocess_ementa(decretos)
ementa_preprocessed = aux['ementa']
# Plain list of normalized strings, one per decree.
ementa_preprocessed = ementa_preprocessed.values.tolist()

In [None]:
# Drop stop words (default list: NLTK Portuguese) and the "." sentence markers.
ementa_processed = removeCommomStopWords(ementa_preprocessed)

In [None]:
# lemmatization() expects each document as a list of tokens.
ementa_processed_docs = [doc.split(" ") for doc in ementa_processed]

# Keep only nouns and adjectives; the result is one lemma string per decree.
data_lemmatized = lemmatization(ementa_processed_docs, allowed_postags=['NOUN', 'ADJ'])
# Token lists for gensim (Dictionary / doc2bow).
splited_doc = [doc.split(" ") for doc in data_lemmatized]

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY

In [None]:
# Unfiltered vocabulary, built as its own object. filter_extremes prunes a
# Dictionary in place, so a plain alias of ementa_dictionary would be pruned
# together with it and the full vocabulary would be lost.
non_filtering = gensim.corpora.Dictionary(splited_doc)

# Working vocabulary: drop tokens that appear in fewer than 3 documents or in
# more than 80% of them.
ementa_dictionary = gensim.corpora.Dictionary(splited_doc)
ementa_dictionary.filter_extremes(no_below=3, no_above=0.8) #keep_n=10000)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(2762 unique tokens: ['comitê', 'consultivo', 'material', 'novo', 'tecnologia']...) from 6841 documents (total 53278 corpus positions)
INFO : discarding 1355 tokens: [('assistiva', 1), ('primário', 1), ('secundário', 1), ('esporte', 1), ('sucroalcooleiro', 2), ('participar', 1), ('eletrobras', 1), ('café', 2), ('instância', 2), ('entrar', 2)]...
INFO : keeping 1407 tokens which were in no less than 3 and no more than 5472 (=80.0%) documents
INFO : resulting dictionary: Dictionary(1407 unique tokens: ['comitê', 'consultivo', 'material', 'novo', 'tecnologia']...)


In [None]:
# Bag-of-words form of every decree, restricted to the filtered dictionary.
bow_corpus = [ementa_dictionary.doc2bow(doc) for doc in splited_doc]

## Validation

In [None]:
# data_lemmatized holds Portuguese lemmas. The token pattern must therefore
# accept accented letters (an ASCII-only class cuts "comitê" to "comit" and
# "redação" to "reda"), and the stop-word list must be Portuguese, not English.
# Tokens shorter than 3 characters are still ignored.
vectorizer = CountVectorizer(analyzer='word', min_df=3, stop_words=stopwords.words('portuguese'), lowercase=True, token_pattern=r'(?u)\b\w{3,}\b')
data_vectorized = vectorizer.fit_transform(data_lemmatized)

# Candidate topic counts and learning-rate decays for the online LDA.
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)

GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [10, 15, 20, 25, 30]})

In [None]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -74077.7990407488
Model Perplexity:  276.74601767490145


## Final model creation

In [None]:
# Final gensim LDA on the bag-of-words corpus. num_topics is hard-coded to 10,
# the n_components value the grid search above reported as best in the
# recorded run; update it by hand if that result changes.
lda_model_n = gensim.models.LdaModel(bow_corpus, num_topics=10, id2word=ementa_dictionary, passes=30, alpha='auto', random_state=42)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elo

In [None]:
# List the 10 topics as (topic id, weighted top-words string) pairs.
lda_model_n.show_topics(num_topics=10)

[(0,
  '0.056*"executivo" + 0.054*"financeiro" + 0.048*"poder" + 0.040*"exercício" + 0.038*"execução" + 0.034*"providência" + 0.033*"orçamentária" + 0.029*"programação" + 0.026*"decretar" + 0.025*"sanção"'),
 (1,
  '0.100*"decretar" + 0.059*"regulamentar" + 0.055*"lei" + 0.045*"n" + 0.030*"providência" + 0.024*"dezembro" + 0.022*"outubro" + 0.022*"julho" + 0.021*"maio" + 0.020*"janeiro"'),
 (2,
  '0.134*"redação" + 0.126*"novo" + 0.057*"decretar" + 0.043*"competência" + 0.026*"ministrar" + 0.019*"assistência" + 0.018*"gestão" + 0.018*"atos" + 0.016*"direito" + 0.014*"comissão"'),
 (3,
  '0.052*"oficiar" + 0.040*"produto" + 0.039*"promoção" + 0.036*"impor" + 0.033*"incidência" + 0.033*"tempo" + 0.031*"decretar" + 0.031*"dezembro" + 0.027*"efetivos" + 0.025*"pessoal"'),
 (4,
  '0.130*"providência" + 0.088*"social" + 0.077*"nacional" + 0.031*"brasileiro" + 0.030*"remanejamento" + 0.027*"conselho" + 0.025*"estatuto" + 0.023*"sistema" + 0.022*"assessoramento" + 0.021*"funcionamento"'),
 (5,

## Final model visualization

In [None]:
# Prepare the pyLDAvis view of the final model from the same corpus and
# dictionary it was trained with; the last line displays it in the notebook.
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_n, bow_corpus, ementa_dictionary)
LDAvis_prepared

INFO : NumExpr defaulting to 2 threads.
