# Análise dos Tópicos 

Esse notebook possui os códigos para a análise dos tópicos extraídos dos diários oficiais. 

## Imports Necessários

In [1]:
# Make the modules implemented in this repository importable from the notebook.
import os
import sys

path_module = os.path.abspath('../modules/')
# Register the directory only when it is not already on sys.path, so re-running
# the cell does not append the same entry again.
already_registered = path_module in sys.path
if not already_registered:
    sys.path.append(path_module)

In [2]:
# imports do projeto

from utils.process_gazette import ProcessGazette
from preprocess.pre_process_text import PreProcessText
from nlp.extract_topics import ExtractTopics

## Extração e análise dos tópicos do Diário Teste

### Ao iterar por todas as gazetas em busca dos termos Aquisições Diretas, Contratações Emergenciais e Dispensas de Licitação, obtemos a seguinte lista de páginas por gazeta.

In [19]:
# Break every .txt gazette found under gazettes/ into pages.
# os.listdir returns names in arbitrary order; sorting keeps the positional
# indices of all_pages / all_pages_name stable from one run to the next.
pp = ProcessGazette(BASE_DIR="gazettes/")
txt_files = sorted(f for f in os.listdir("gazettes/") if f.endswith('.txt'))
all_pages = []       # one break_pages() result per gazette, in txt_files order
all_pages_name = []  # file name of the gazette stored at the same index in all_pages
for txt_file in txt_files:
    # NOTE(review): the second argument looks like a regex marking page boundaries - confirm in ProcessGazette.
    pages = pp.break_pages(txt_file, "ANO [X|V|I]+ ", save_file=True)
    all_pages.append(pages)
    all_pages_name.append(txt_file)

In [31]:
# Keep, for each gazette, only the pages whose processed tokens contain at
# least one of the search terms below.
SEARCH_TERMS = ('aquisicao', 'contratacao', 'emergencial', 'dispensa', 'licitacao')

all_topics = {}  # gazette index -> {page -> page record}
topics_ = {}     # page -> latest record built for that page key (reused across gazettes)
ppt = PreProcessText("pt_core_news_lg")

for gazette in range(len(all_pages)):
    all_topics[gazette] = {}

    for page, text in all_pages[gazette].items():
        topics_[page] = {'text': text, 'topics': []}
        tokens_ = ppt.process_text(topics_[page]['text'])
        # The length and page checks do not depend on the individual token, so
        # they are evaluated once; any() stops at the first matching token
        # instead of re-assigning the same record for every later match.
        if len(tokens_) > 5 and page and any(token in SEARCH_TERMS for token in tokens_):
            topics_[page]['tokens'] = tokens_
            all_topics[gazette][page] = topics_[page]

In [32]:
# Print, for each gazette file, the pages that were kept in all_topics.
# Iterating over all_pages_name replaces the hard-coded count of 11, so the
# cell works for any number of gazette files.
for i, gazette_name in enumerate(all_pages_name):
    print(f"{gazette_name}: {list(all_topics[i].keys())}")

2927408_20200306_3.txt: [1, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23]
2927408_20200316_7.txt: []
2927408_20200318_0.txt: [1, 33, 34, 35, 36, 42, 43, 44, 46, 47, 48, 72, 73, 74, 75]
2927408_20200320_2.txt: [1, 2, 3]
2927408_20200324_1.txt: []
2927408_20200408_9.txt: [1, 4, 5, 6, 7, 8, 11, 12]
2927408_20200414_6.txt: [1, 7, 8, 9, 12, 13, 14, 18]
2927408_20200418_5.txt: [1, 4, 5, 6, 7, 8, 9, 10]
2927408_20200423_4.txt: [1, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
2927408_20200429_8.txt: [1, 7, 17, 18, 19, 20, 22, 24]
2927408_20230704_0.txt: [1, 28, 29, 32, 33, 34]


In [30]:
# Print, for each gazette file, the pages that were kept in all_topics.
# NOTE(review): this cell repeats the previous listing and its saved output
# differs, so it presumably comes from an earlier run of the filter - consider
# removing one of the two cells.
# Iterating over all_pages_name replaces the hard-coded count of 11.
for i, gazette_name in enumerate(all_pages_name):
    print(f"{gazette_name}: {list(all_topics[i].keys())}")

2927408_20200306_3.txt: [1, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23]
2927408_20200316_7.txt: [2]
2927408_20200318_0.txt: [1, 5, 33, 34, 35, 36, 42, 43, 44, 46, 47, 48, 72, 73, 74, 75, 78]
2927408_20200320_2.txt: [1, 2, 3]
2927408_20200324_1.txt: []
2927408_20200408_9.txt: [1, 3, 4, 5, 6, 7, 8, 11, 12]
2927408_20200414_6.txt: [1, 7, 8, 9, 12, 13, 14, 18]
2927408_20200418_5.txt: [1, 4, 5, 6, 7, 8, 9, 10]
2927408_20200423_4.txt: [1, 4, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
2927408_20200429_8.txt: [1, 4, 5, 6, 7, 17, 18, 19, 20, 22, 24]
2927408_20230704_0.txt: [1, 28, 29, 32, 33, 34]


### Extraindo de uma única gazeta

In [11]:
# Break one specific gazette into pages; all_pages now holds the result of a
# single break_pages() call (iterated with .items() in the next cell).
single_gazette_file = "2927408_20200318_0.txt"
pp = ProcessGazette(BASE_DIR="gazettes/")
all_pages = pp.break_pages(single_gazette_file, "ANO [X|V|I]+ ")

In [12]:
# Extract topics for every page of the gazette currently held in all_pages.
# Each page gets a record with its text and, when it has more than 5 tokens,
# its tokens, extracted topics, topic info and the find_topics('Licitação') result.
topics_ = {}
ppt = PreProcessText("pt_core_news_lg")

for page, text in all_pages.items():
    # A fresh extractor is created for each page (original note: to use gpt, install openai).
    extractor = ExtractTopics(model = 'default')
    topics_[page] = {'text': text, 'topics': []}
    tokens_ = ppt.process_text(topics_[page]['text'])
    try:
        if len(tokens_) > 5 and page:
            topics_[page]['tokens'] = tokens_
            topics_[page]['topics'], topics_[page]['topics_info'] = extractor.extract_topics(topics_[page]['tokens'])
            topics_[page]['similarity'] = extractor.find_topics('Licitação')
    except Exception as e:
        # Best effort: report the failing page and continue with the remaining ones.
        print(page)
        print(tokens_)
        print(topics_[page]['text'])
        # 'tokens' is only stored after the length check passes; .get() keeps the
        # handler from raising KeyError when the failure happened before that.
        print(topics_[page].get('tokens'))
        print(f"Error: {e}")

In [15]:
# Inspect page 30: the find_topics result and the entry stored under key 5 of its topics.
page_30 = topics_[30]
print(page_30['similarity'])
print(page_30['topics'][5])

[5, -1, 1]
[('resultado', 0.24751318496457658), ('relator', 0.24751318496457658), ('processo', 0.24751318496457658), ('solicitante', 0.24751318496457658), ('dantas', 0.24751318496457658), ('comerciar', 0.24751318496457658), ('portellar', 0.15767011966073635), ('oficial', 0.15767011966073635), ('hildebrar', 0.15767011966073635), ('funsaude', 0.15767011966073635)]


### Extraindo de múltiplas gazetas

In [3]:
# Break every .txt gazette found under gazettes/ into pages.
# os.listdir returns names in arbitrary order; sorting makes the position of
# each gazette in all_pages reproducible, since later cells pick gazettes by index.
pp = ProcessGazette(BASE_DIR="gazettes/")
txt_files = sorted(f for f in os.listdir("gazettes/") if f.endswith('.txt'))
all_pages = []  # one break_pages() result per gazette, in txt_files order
for txt_file in txt_files:
    pages = pp.break_pages(txt_file, "ANO [X|V|I]+ ", save_file=True)
    all_pages.append(pages)

In [4]:
# Extract topics for every page of every gazette in all_pages.
# all_topics maps gazette index -> {page -> record}; only pages with more than
# 5 tokens whose extraction did not raise are stored there.
all_topics = {}
topics_ = {}  # page -> latest record built for that page key (reused across gazettes)
ppt = PreProcessText("pt_core_news_lg")
for gazette in range(len(all_pages)):
    all_topics[gazette] = {}
    for page, text in all_pages[gazette].items():
        # A fresh extractor is created for each page.
        extractor = ExtractTopics(model = 'default')
        topics_[page] = {'text': text, 'topics': []}
        tokens_ = ppt.process_text(topics_[page]['text'])
        try:
            if len(tokens_) > 5 and page:
                topics_[page]['tokens'] = tokens_
                topics_[page]['topics'], topics_[page]['topics_info'] = extractor.extract_topics(topics_[page]['tokens'])
                all_topics[gazette][page] = topics_[page]
        except Exception as e:
            # Best effort: report the failing page and continue with the remaining ones.
            print(page)
            print(tokens_)
            print(topics_[page]['text'])
            # 'tokens' is only stored after the length check passes; .get() keeps the
            # handler from raising KeyError when the failure happened before that.
            print(topics_[page].get('tokens'))
            print(f"Error: {e}")


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Show the stored text of one page: gazette at index 1, page key 2.
gazeta, pg = 1, 2
selected_page = all_topics[gazeta][pg]
selected_page['text']

'| N º 7.5942  DIÁRIO OFICIAL DO  Criado pelo art. 82 da Lei nº 3.601, de 18 de fevereiro de 1986  Ouvidoria Geral do Município - Para registrar reclamações, denúncias,  sugestões ou elogios, acesse: www.ouvidoria.salvador.ba.gov.br ou ligue para  (71) 3202-5909, de segunda a sexta-feira, das 8 às 17 horas, exceto feriados.  Disque Salvador - Para solicitar serviços ou informação,  acesse: www.disquesalvador.ba.gov.br ou ligue 156,  atendimento 24h.  Diário Oficial do Município - Edições Anteriores, acesse: www.dom.salvador. ba.gov.br ou solicite através do e-mail: diario.oficial@salvador.ba.gov.br, de  segunda a sexta-feira, das 8 às 18 horas, exceto feriados.  CEP: 40.020-000 - Tel.: 3202-6261/6262  Chefe de Gabinete do Prefeito Coordenador de Tecnologia  Claudio Raphael Pereira Pinto  3  Kaio Vinicius Moraes Leal  Órgão responsável  Andrey Das Neves Santos  Gestor de Editoração  I- Academias de Ginástica;  II- Cinemas;  III- Teatros e demais Casas de Espetáculos; e  IV-  Parques Inf