# LDA model for full text analysis of Brazilian Administrative Decrees

## Imports

In [None]:
!pip install pyLDAvis==3.2.1
!pip install spacy==2.2.4
!pip install https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-2.2.5/pt_core_news_sm-2.2.5.tar.gz
!pip install nltk==3.4.5

Collecting pyLDAvis==3.2.1
  Downloading pyLDAvis-3.2.1.tar.gz (1.7 MB)
[?25l[K     |▏                               | 10 kB 12.1 MB/s eta 0:00:01[K     |▍                               | 20 kB 17.0 MB/s eta 0:00:01[K     |▋                               | 30 kB 21.5 MB/s eta 0:00:01[K     |▉                               | 40 kB 24.5 MB/s eta 0:00:01[K     |█                               | 51 kB 19.9 MB/s eta 0:00:01[K     |█▏                              | 61 kB 8.8 MB/s eta 0:00:01[K     |█▍                              | 71 kB 8.9 MB/s eta 0:00:01[K     |█▋                              | 81 kB 9.8 MB/s eta 0:00:01[K     |█▉                              | 92 kB 10.6 MB/s eta 0:00:01[K     |██                              | 102 kB 11.2 MB/s eta 0:00:01[K     |██▏                             | 112 kB 11.2 MB/s eta 0:00:01[K     |██▍                             | 122 kB 11.2 MB/s eta 0:00:01[K     |██▋                             | 133 kB 11.2 MB/s eta 0:00:

In [None]:
import sqlite3
import pandas as pd
import numpy as np
import json

import nltk
import gensim
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

import pyLDAvis.gensim
import pickle 
import pyLDAvis

import math
from matplotlib import pyplot as plt

import spacy
import io

import pt_core_news_sm
nlp = pt_core_news_sm.load()

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from pprint import pprint

import logging
import itertools
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.
  from collections import MutableMapping, Sequence  # noqa
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):


In [None]:
# Report the versions of the main libraries this run used, for reproducibility.
import sklearn

print("pt_core_news_sm version:", pt_core_news_sm.__version__)
print("Spacy version:", spacy.__version__)
print("NLTK version:", nltk.__version__)
print("Scikit version:", sklearn.__version__)

pt_core_news_sm version: 2.2.5
Spacy version: 2.2.4
NLTK version: 3.4.5
Scikit version: 1.0.1


## Get database

In [None]:
# This notebook was executed on Google Colab. Before running this cell:
#   1. Upload the file "radar_db" (version 0.1) to your Google Drive.
#   2. Have "frequency_stopwords_list.csv" on your machine; files.upload() below asks for it.
# Both files are available on Kaggle.
from google.colab import drive
from google.colab import files

# Mount Google Drive so the database file can be read from /content/gdrive.
drive.mount('/content/gdrive')
database_file = '/content/gdrive/MyDrive/database - radar/radar_db' # path of radar_db inside your Drive; replace with your own location
# Upload the stop-word CSV. The result is indexed by file name later in the notebook
# and its value is wrapped in io.BytesIO; the file is also saved to the working directory.
stop_words_upload = files.upload()

Mounted at /content/gdrive


Saving frequency_stopwords_list.csv to frequency_stopwords_list.csv


## Preprocessing Functions

In [None]:
def get_decretos(db_path=None):
    """Load every decree from the SQLite database into a DataFrame.

    Reads ``cod_ident_ato``, ``ementa``, ``full_text`` and ``referenda`` from the
    ``Decretos`` table. ``ementa`` and ``full_text`` are joined (space separated) into a
    single ``full_text`` column, and ``referenda`` is split on ";" into a list of
    stripped strings.

    Args:
        db_path: Path of the SQLite database. When omitted, the module-level
            ``database_file`` is used (the original behaviour).

    Returns:
        pandas.DataFrame with columns ``cod_ident_ato``, ``full_text`` and
        ``referenda`` (one list of strings per row). An empty table yields an empty
        DataFrame with those three columns.
    """
    if db_path is None:
        db_path = database_file

    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT cod_ident_ato, ementa, full_text, referenda FROM Decretos"
        ).fetchall()
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    records = []
    for cod_ident_ato, ementa, full_text, referenda in rows:
        # A NULL referenda has nothing to split; treat it as "no signatories".
        if referenda is None:
            referenda_list = []
        else:
            referenda_list = [ref.strip() for ref in referenda.split(";")]
        records.append((cod_ident_ato, str(ementa) + " " + str(full_text), referenda_list))

    return pd.DataFrame(records, columns=['cod_ident_ato', 'full_text', 'referenda'])

def preprocess_full(decretos_df, verbose=True):
    """Normalise the ``full_text`` column of ``decretos_df`` for topic modelling.

    Each decree is tokenized with ``word_tokenize`` and rebuilt as a space-separated
    string in which:

    * alphabetic and purely numeric tokens are kept (lower-cased);
    * the punctuation marks ``. ! ? ; :`` are all replaced by " . ";
    * numbers written with dots in the middle (e.g. "8.642") are kept with the dots
      removed, while tokens containing a comma (real numbers, usually currency
      amounts) are dropped;
    * the period following an article number is not turned into a sentence break;
    * everything after the last " república" is cut off (signature removal) when that
      occurrence lies in the second half of the text, and " república." is appended.

    Args:
        decretos_df: DataFrame with a string column ``full_text``. The column is
            overwritten with the cleaned texts.
        verbose: When True (default) print one progress line per decree.

    Returns:
        The same DataFrame object, with ``full_text`` replaced.
    """
    breaking_punctuation = {'.', '!', '?', ';', ':'}
    number_of_decrees = len(decretos_df['full_text'])
    cleaned_texts = []

    for j, decreto in enumerate(decretos_df['full_text']):
        if verbose:
            print("Preprocessing", j+1, "/", number_of_decrees)

        decreto = decreto.replace("/", " ").replace("º", '')
        tokenized_document = word_tokenize(decreto)

        pieces = []
        for i, word in enumerate(tokenized_document):
            # Article numbering pattern: "art", one token, a number, then ".".
            # Blank that trailing "." so it is not emitted as a sentence break.
            # The bounds check avoids an IndexError when "art" is near the end.
            if word.lower() == "art":
                if (i + 3 < len(tokenized_document)
                        and tokenized_document[i+2].isnumeric()
                        and tokenized_document[i+3] == "."):
                    tokenized_document[i+3] = ' '

            if word.isalpha() or word.isnumeric():
                pieces.append(word.lower() + " ")
            elif word in breaking_punctuation:
                # Standardise the selected punctuation marks into a full stop.
                pieces.append(" . ")
            elif word[0].isnumeric() and ("," not in word):
                # Numbers with dots in the middle; real numbers (with a comma,
                # commonly currency amounts) are ignored.
                word = word.replace(".", "")
                if word.isnumeric():
                    pieces.append(word + " ")

        text = "".join(pieces)
        text = text.replace("art  .", " art ")
        text = text.replace("arts  .", " arts ")

        # Signature removal: cut at the last " república" if it is in the second half
        # of the text. When the marker is missing or too early, keep the whole text;
        # this replaces the undefined/stale cut index the previous code relied on.
        signature = text.rfind(" república")
        if signature == -1 or signature < len(text)/2:
            signature = len(text)

        cleaned_texts.append(text[:signature] + " república.")

    # Single column assignment instead of chained `df[col].iloc[j] = ...` writes.
    decretos_df['full_text'] = cleaned_texts
    return decretos_df

def removeComumStopWords(stopwords_file, texts, st=None):
    """Remove stop words and "." markers from space-separated texts.

    A word is dropped when it is in ``st``, when it is listed in column ``'0'`` of the
    CSV ``stopwords_file``, or when it is the "." sentence marker.

    Args:
        stopwords_file: Path or file-like object of a comma-separated CSV whose column
            ``'0'`` holds the corpus-specific stop words.
        texts: Iterable of strings; each one is split on single spaces.
        st: Iterable of generic stop words. Defaults to NLTK's Portuguese list, now
            loaded when the function is called rather than when it is defined.

    Returns:
        List of strings, one per input text, where every kept word is followed by a
        single space (same output format as before).
    """
    if st is None:
        st = stopwords.words('portuguese')

    # dtype=str keeps numeric-looking stop words (e.g. "2019") as strings so they can
    # match the string tokens; dropna discards empty cells.
    frequency_stopwords = pd.read_csv(stopwords_file, sep=',', dtype=str)['0'].dropna()

    # One set for O(1) membership checks per word.
    blocked = set(st) | set(frequency_stopwords) | {"."}

    texts_r = []
    for t in texts:
        kept = [w for w in t.split(" ") if w not in blocked]
        texts_r.append("".join(w + " " for w in kept))
    return texts_r

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: Iterable of documents, each an iterable of token strings. The tokens are
            joined with single spaces and parsed with the module-level ``nlp`` pipeline.
        allowed_postags: Coarse POS tags (``token.pos_``) to keep.

    Returns:
        List of strings, one per document, holding the space-joined lemmas of the kept
        tokens. A lemma equal to '-PRON-' is replaced by an empty string.
    """
    lemmatized = []
    for tokens in texts:
        parsed = nlp(" ".join(tokens))
        kept = []
        for tok in parsed:
            if tok.pos_ not in allowed_postags:
                continue
            kept.append('' if tok.lemma_ == '-PRON-' else tok.lemma_)
        lemmatized.append(" ".join(kept))
    return lemmatized



In [None]:
# Load the decrees from the database and keep only the text column.
decretos = get_decretos().drop(columns=['cod_ident_ato', 'referenda'])
# Parse the uploaded stop-word CSV into a DataFrame.
stop_words = pd.read_csv(
    io.BytesIO(stop_words_upload['frequency_stopwords_list.csv']),
    sep=',',
)
decretos

Unnamed: 0,full_text
0,Dispõe sobre o Comitê Consultivo de Nanotecnol...
1,"Altera o Decreto nº 8.642 , de 19 de janeiro ..."
2,Promulga o Acordo entre o Governo da República...
3,Promulga o Acordo entre o Governo da República...
4,Dispõe sobre o Comitê Interministerial de Tecn...
...,...
6836,"PROMULGA O PROTOCOLO AO TRATADO DE AMIZADE, CO..."
6837,PRORROGA A VIGÊNCIA DOS RESTOS A PAGAR QUE ESP...
6838,"ALTERA O DECRETO N. 2.889, DE 21/12/1998, QUE ..."
6839,"REVOGA O ART. 20 DO DECRETO N. 2.451, DE 05/01..."


## Text analysis

In [None]:
# Normalise every decree (preprocess_full rewrites the 'full_text' column), then keep
# just the cleaned texts as a plain Python list of strings.
full_text_preprocessed = preprocess_full(decretos)['full_text'].tolist()

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
Preprocessing 1842 / 6841
Preprocessing 1843 / 6841
Preprocessing 1844 / 6841
Preprocessing 1845 / 6841
Preprocessing 1846 / 6841
Preprocessing 1847 / 6841
Preprocessing 1848 / 6841
Preprocessing 1849 / 6841
Preprocessing 1850 / 6841
Preprocessing 1851 / 6841
Preprocessing 1852 / 6841
Preprocessing 1853 / 6841
Preprocessing 1854 / 6841
Preprocessing 1855 / 6841
Preprocessing 1856 / 6841
Preprocessing 1857 / 6841
Preprocessing 1858 / 6841
Preprocessing 1859 / 6841
Preprocessing 1860 / 6841
Preprocessing 1861 / 6841
Preprocessing 1862 / 6841
Preprocessing 1863 / 6841
Preprocessing 1864 / 6841
Preprocessing 1865 / 6841
Preprocessing 1866 / 6841
Preprocessing 1867 / 6841
Preprocessing 1868 / 6841
Preprocessing 1869 / 6841
Preprocessing 1870 / 6841
Preprocessing 1871 / 6841
Preprocessing 1872 / 6841
Preprocessing 1873 / 6841
Preprocessing 1874 / 6841
Preprocessing 1875 / 6841
Preprocessing 1876 / 6841
Preprocessing 187

In [None]:
# Drop the stop words (generic list plus the words listed in the CSV) and the "." markers.
full_text_processed = removeComumStopWords(
    stopwords_file="frequency_stopwords_list.csv",
    texts=full_text_preprocessed,
)
# Split each cleaned text on single spaces into a token list.
full_text_processed_docs = [cleaned.split(" ") for cleaned in full_text_processed]

In [None]:
# Lemmatize, keeping only nouns and adjectives.
data_lemmatized_full = lemmatization(
    full_text_processed_docs,
    allowed_postags=['NOUN', 'ADJ'],
)
# Token lists again, one per document, for the gensim dictionary.
full_text_splited_doc = [lemmas.split(" ") for lemmas in data_lemmatized_full]

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY

In [None]:
# Vocabulary over the lemmatized documents.
dictionary = gensim.corpora.Dictionary(full_text_splited_doc)

# Independent, unfiltered vocabulary. filter_extremes() below prunes `dictionary` in
# place, so a plain `non_filtering_full = dictionary` would only alias the filtered
# object instead of preserving the full vocabulary.
non_filtering_full = gensim.corpora.Dictionary(full_text_splited_doc)

# Drop tokens present in fewer than 3 documents or in more than 80% of them.
dictionary.filter_extremes(no_below=3, no_above=0.8)

# Bag-of-words representation of every document over the filtered vocabulary.
bow_corpus_full = [dictionary.doc2bow(doc) for doc in full_text_splited_doc]

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : built Dictionary(12676 unique tokens: ['absoluto', 'alocação', 'ano', 'aperfeiçoamento', 'aprovação']...) from 6841 documents (total 1299370 corpus positions)
INFO : discarding 6212 tokens: [('atribuição', 6810), ('cbpf', 1), ('datar', 6714), ('decretar', 6830), ('nanotecnologia', 1), ('nanotecnologias', 2), ('7857', 1), ('ccl', 2), ('ccli', 2), ('ccxxv', 1)]...
INFO : keeping 6464 tokens which were in no less than 3 and no more than 5472 (=80.0%) documents
INFO : resulting dictionary: Dictionary(6464 unique tokens: ['absoluto', 'alocação', 'ano', 'aperfeiçoamento', 'aprovação']...)


## Validation

In [None]:
# Document-term matrix for the scikit-learn grid search.
# - token_pattern accepts any Unicode letter or digit: the previous ASCII-only pattern
#   '[a-zA-Z0-9]{3,}' split Portuguese words at every accented character
#   (e.g. "público" -> "blico", "órgão" -> dropped).
# - no stop-word list here: the text is Portuguese and was already stop-word filtered
#   during preprocessing, so the English list did not apply.
vectorizer = CountVectorizer(
    analyzer='word',
    min_df=3,
    stop_words=None,
    lowercase=True,
    token_pattern=r'[^\W_]{3,}',
)
data_vectorized_full = vectorizer.fit_transform(data_lemmatized_full)

# Define Search Param
search_params = {'n_components': [5,10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# Init the Model
lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized_full)

GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 10, 15, 20, 25, 30]})

In [None]:
# Best model found by the grid search above.
best_lda_model = model.best_estimator_
# Parameter combination that obtained the best score.
print("Best params: ", model.best_params_)
print("Best Number of components: ", model.best_params_['n_components'])
# Score of the best combination as reported by GridSearchCV (best_score_). No `scoring`
# was passed, so this is the estimator's own score() -- for LatentDirichletAllocation an
# approximate log-likelihood -- averaged over the cross-validation folds.
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity of the best model, computed on the same matrix the search was fitted on.
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized_full))

Best params:  {'learning_decay': 0.5, 'n_components': 25}
Best Number of components:  25
Best Log Likelihood Score:  -1868031.8851055547
Model Perplexity:  564.2535896099004


## Final model creation

In [None]:
# Final gensim LDA model over the filtered bag-of-words corpus.
lda_model_n = gensim.models.LdaModel(
    corpus=bow_corpus_full,
    num_topics=25,        # the grid search above selected 25 components
    id2word=dictionary,
    passes=30,
    alpha='auto',
    random_state=42,      # fixed seed for reproducible topics
)

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elo

In [None]:
# List the highest-weighted terms (weight*"term") of the final model's 25 topics.
lda_model_n.show_topics(num_topics=25)

[(0,
  '0.053*"avaliação" + 0.048*"servidor" + 0.028*"desempenhar" + 0.023*"cargo" + 0.019*"individual" + 0.016*"institucional" + 0.015*"exercício" + 0.014*"período" + 0.012*"resultar" + 0.012*"lei"'),
 (1,
  '0.051*"militar" + 0.029*"armar" + 0.025*"serviço" + 0.018*"oficial" + 0.011*"transportar" + 0.011*"público" + 0.010*"forçar" + 0.010*"idoso" + 0.010*"redação" + 0.009*"lei"'),
 (2,
  '0.023*"prazo" + 0.020*"lei" + 0.014*"jurídico" + 0.013*"pessoa" + 0.010*"decisão" + 0.010*"serviço" + 0.009*"entidade" + 0.009*"administrativo" + 0.009*"formar" + 0.008*"hipótese"'),
 (3,
  '0.041*"público" + 0.026*"serviço" + 0.024*"órgão" + 0.022*"entidade" + 0.020*"preço" + 0.014*"contratação" + 0.013*"contratar" + 0.011*"direta" + 0.009*"licitação" + 0.009*"proposto"'),
 (4,
  '0.232*"pontar" + 0.084*"linha" + 0.031*"seguir" + 0.028*"s" + 0.025*"indígena" + 0.024*"vinho" + 0.022*"zonar" + 0.021*"uva" + 0.018*"reta" + 0.016*"geográfico"'),
 (5,
  '0.036*"órgão" + 0.027*"atividades" + 0.021*"públi

## Final model visualization

In [None]:
# Enable inline rendering of pyLDAvis output in the notebook.
pyLDAvis.enable_notebook()
# Prepare the visualisation from the final model, its bag-of-words corpus and the filtered dictionary.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model_n, bow_corpus_full, dictionary)
# Last expression of the cell: displays the prepared visualisation.
LDAvis_prepared

INFO : NumExpr defaulting to 2 threads.
