# Análise dos Tópicos 

Esse notebook possui os códigos para a análise dos tópicos extraídos dos diários oficiais. 

## Imports Necessários

In [1]:
# Código para poder usar os módulos implementados nesse repositório no notebook 
import sys, os
path_module = os.path.abspath('../modules/')
if path_module not in sys.path:
    sys.path.append(path_module)

In [2]:
# imports do projeto

from utils.process_gazette import ProcessGazette
from preprocess.pre_process_text import PreProcessText
from nlp.extract_topics import ExtractTopics
from nlp.ner import Ner

## Extração e análise dos tópicos do Diário Teste

### Esta seção realizará a filtragem de páginas relacionadas a Contratos Públicos, garantindo apenas conteúdo relevante.

In [3]:
pp = ProcessGazette(BASE_DIR="gazettes/")
txt_files = [f for f in os.listdir("gazettes/") if f.endswith('.txt')]
all_pages = []
all_pages_name = []
for txt_file in txt_files:
    pages = pp.break_pages(txt_file, "ANO [X|V|I]+ ", save_file=False)
    all_pages.append(pages)
    all_pages_name.append(txt_file)

In [4]:
revistas_filtradas = {}
topics_ = {}
ppt = PreProcessText("pt_core_news_lg")

for gazette in range(len(all_pages)):
    revistas_filtradas[gazette] = {}

    for page, text in all_pages[gazette].items():
        topics_[page] = {'text': text}
        tokens_ = ppt.process_text(topics_[page]['text'])
        for token in tokens_:
            if len(tokens_) > 5 and page and token in ['aquisicao', 'contratacao', 'emergencial', 'dispensa', 'licitacao','compra','comprar']:
                revistas_filtradas[gazette][page] = topics_[page]
                break

In [5]:
for i in range(0,len(revistas_filtradas)):
    revistas_filtradas[i] = list(revistas_filtradas[i].keys())

### resultado da filtragem.

In [3]:
revistas_filtradas = {0: [3, 5],
 1: [1, 2, 4, 11, 12, 13, 14, 16, 17, 18, 19],
 2: [1, 3, 8, 10, 36, 37, 38, 39, 40, 44, 45, 47, 48, 49, 113],
 3: [1, 7, 11, 12, 13, 14, 15],
 4: [1, 2, 4, 5, 6, 7, 8, 9, 10],
 5: [1, 5, 6, 7, 8, 11, 14, 15, 16],
 6: [],
 7: [1, 8, 9, 10, 11, 12, 13],
 8: [2, 11],
 9: [1, 9, 10, 11, 13, 14, 15, 17, 19],
 10: [1, 4, 5, 6, 7, 8, 11, 12],
 11: [1, 10, 11, 12, 13, 15, 16, 17, 18, 19],
 12: [1, 3, 4, 5, 6, 7, 8, 9, 10],
 13: [],
 14: [1, 7, 8, 9, 12, 13, 14, 18],
 15: [1, 5, 6, 7, 8, 9, 12, 13, 14],
 16: [1],
 17: [1, 2, 3, 6, 7, 11, 12, 13, 14, 15, 16, 17],
 18: [1, 2],
 19: [1, 6, 7, 12, 13, 14],
 20: [1, 4, 5, 6, 7, 8, 9, 10],
 21: [2, 3],
 22: [1, 4, 5, 6, 7, 8, 9, 10, 12, 13],
 23: [1, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53],
 24: [1, 38, 39, 40, 50, 51, 55],
 25: [1, 2],
 26: [],
 27: [1, 5, 6, 8, 9, 11, 12, 13, 14, 16, 18],
 28: [1, 7, 9, 12, 13, 14, 16, 17],
 29: [1, 5, 7, 11, 17, 18, 19, 20, 22, 24],
 30: [1, 7, 8, 12, 15, 16, 18],
 31: [1, 2, 6, 7, 8, 9, 10, 11, 12, 13, 14],
 32: [],
 33: [1, 3, 4, 7, 8, 9, 10],
 34: [1, 7, 8, 9, 11, 14, 15, 17, 18, 19, 21],
 35: [1, 8, 19, 20, 23, 25, 26, 27, 28, 29, 31, 33, 34, 35, 36],
 36: [],
 37: [1, 6, 8, 10, 11, 12, 13, 14, 15, 16],
 38: [],
 39: [1, 6, 7, 8, 13, 14, 15, 17],
 40: [1, 3, 5, 8, 9, 10, 11, 12, 13, 17, 18, 19],
 41: [2],
 42: [1, 13, 16, 17, 18, 19, 20, 26, 27, 32],
 43: [1, 4, 8, 9, 10, 13, 14, 15, 16, 17],
 44: [1, 7, 37, 38, 39, 40, 41],
 45: [1, 6, 7, 8, 9, 10, 11, 12],
 46: [],
 47: [1, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21],
 48: [1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 23],
 49: [],
 50: [1, 4, 8, 9, 10, 11, 12, 13, 14],
 51: [1, 7, 8, 9, 10, 11, 12, 13, 15, 16],
 52: [],
 53: [],
 54: [1, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25],
 55: [1, 2, 3, 4, 5, 6, 7, 8, 9],
 56: [],
 57: [1, 3, 31, 32, 33],
 58: [1, 4, 5, 6, 7, 8, 9],
 59: [],
 60: [1, 13, 15, 16, 17, 18, 20, 21, 22, 23],
 61: [1, 5, 6, 7, 8, 10, 11, 12],
 62: [5],
 63: [1, 7, 12, 13, 14, 18, 19, 21, 22, 23],
 64: [1, 34, 35, 38, 40, 41],
 65: [],
 66: [1, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48],
 67: [1, 10, 12, 13, 14, 16, 17, 18],
 68: [2],
 69: [1, 5, 6, 7, 8, 10, 11],
 70: [],
 71: [1, 5, 8, 10, 11, 14, 15, 19],
 72: [],
 73: [1, 9, 11, 12, 13, 14, 16, 17, 21],
 74: [1, 3, 7, 13, 14, 16, 17, 18, 19, 20],
 75: [],
 76: [1, 7, 8, 9, 10],
 77: [],
 78: [1, 2, 3, 6, 7, 8, 9, 13],
 79: [],
 80: [1, 3, 22, 23, 24, 25, 26, 27, 28, 29],
 81: [1, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18],
 82: [1, 8, 12, 13, 15, 16, 20, 24, 25, 26],
 83: [],
 84: [1, 2],
 85: [1, 5, 6, 7, 8, 10, 11, 12, 13],
 86: [1],
 87: [1, 8, 9, 10, 12, 13, 14, 15],
 88: [],
 89: [],
 90: [1, 5, 7, 9, 10, 11],
 91: [1, 2, 3, 7, 8, 9, 12, 13],
 92: [1, 6, 7, 8, 9, 10, 11, 13, 14, 15],
 93: [],
 94: [1, 4, 5, 8, 9, 10, 11, 12, 13],
 95: [],
 96: [1, 6, 7, 8, 9, 10, 12, 14, 17, 18],
 97: [1, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17],
 98: [1, 4, 5, 6, 10, 12],
 99: [1, 2],
 100: [1, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17],
 101: [1, 11, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22, 23],
 102: [],
 103: [1, 2],
 104: [1, 5, 6, 8, 9, 10, 11, 17],
 105: [1, 2, 3, 4, 5, 8, 9, 10, 15],
 106: [],
 107: [1, 3, 9, 21, 22, 23, 24, 25, 26, 27],
 108: [1, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15],
 109: [],
 110: [1, 15, 16, 17, 18, 19],
 111: [1, 3, 5, 6, 9, 10, 13],
 112: [1, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19],
 113: [],
 114: [1, 6, 7, 9, 10, 11, 12, 13, 17],
 115: [1, 7, 9, 10, 11, 12, 17],
 116: [1, 8, 9, 10, 11, 12, 13, 14],
 117: [1, 2],
 118: [],
 119: [1, 8, 11, 12, 14, 18],
 120: [],
 121: [1, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18, 19],
 122: [1, 12, 13, 25, 26, 27, 28],
 123: [2],
 124: [1, 4, 5, 9, 10, 11, 13, 14, 15, 16, 18],
 125: [],
 126: [1, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
 127: [1, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 20],
 128: [1, 3, 5, 6, 11, 12, 13, 14, 15, 16, 17, 21],
 129: [1, 3, 4, 5, 6, 11, 15, 17, 18, 20],
 130: [1, 5, 6, 8, 9, 10, 11],
 131: [1, 3, 4, 10, 12, 13, 14, 15, 17, 18, 19, 22, 24],
 132: [1, 6, 8, 9, 10, 12, 13, 14, 15],
 133: [1, 7, 8, 9, 13, 16, 18],
 134: [1, 8, 9, 10, 13, 25],
 135: [],
 136: [],
 137: [1, 3, 4, 7, 10, 12, 13, 14, 15, 16],
 138: [1, 6, 7, 8, 9, 10, 11, 12, 13, 80, 106],
 139: [1, 2],
 140: [1, 5, 6, 7, 15, 16, 17, 18, 21, 22],
 141: [2],
 142: [],
 143: [1, 12, 13, 15, 17, 19, 20, 21, 22],
 144: [1, 2, 6, 7, 8, 11, 13, 14, 15, 16, 17, 18, 19, 22],
 145: [],
 146: [],
 147: [1, 11, 12, 13, 15, 16],
 148: [],
 149: [1, 12, 13, 14, 15, 16, 18, 19, 20, 21, 23, 28],
 150: [1, 7, 8, 9, 11, 13, 14],
 151: [],
 152: [1, 7, 8, 9, 10, 15, 16, 21, 22],
 153: [1, 7, 8, 11],
 154: [1, 7, 8, 22, 23, 25, 26, 28, 29],
 155: [1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
 156: [],
 157: [1, 7, 8, 9, 10, 14, 15, 16, 17, 18, 19, 21],
 158: [1, 6, 13, 14, 29, 30, 31, 35, 40],
 159: [1, 5, 9, 10, 11, 17, 18, 19, 21, 22],
 160: [1, 9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27],
 161: [],
 162: [1, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17],
 163: [],
 164: [1, 7, 8, 9, 12, 13, 14, 15, 16],
 165: [1, 9, 10, 14, 15, 16, 17, 21],
 166: [1, 33, 34, 38, 39, 40, 41],
 167: [1, 8, 9, 11, 21, 22, 23],
 168: [],
 169: [1, 3, 4, 7, 8, 12, 13, 14, 15],
 170: [1, 9, 10, 12, 13, 15, 16, 17],
 171: [1, 3, 4, 5, 6, 7, 12, 13, 15, 16, 17, 18, 20],
 172: [1, 6, 8, 17, 19, 20, 21, 24, 32, 37, 42, 43, 44, 51, 52, 53],
 173: [3, 4],
 174: [1, 5, 8, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24],
 175: [],
 176: [1, 10, 11, 13, 14, 16],
 177: [1, 29, 30, 31, 32, 34, 39, 40],
 178: [1, 6, 9, 10, 11, 13, 14],
 179: [1, 10, 12, 16, 17, 18, 20, 21, 22],
 180: [],
 181: [1, 47, 48, 52, 53, 54, 55, 56],
 182: [1, 3, 10, 11, 12, 14, 15, 16, 17],
 183: [1, 39, 40, 41, 42, 43, 44, 46, 47, 52],
 184: [1, 3, 5, 6, 9, 10, 12, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55],
 185: [1, 15, 16, 17, 18, 19, 20, 22],
 186: [1, 4, 11, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 28],
 187: [1, 2, 3, 4, 5],
 188: [1, 8, 9, 10, 11, 14, 15, 16, 17, 20, 22, 25, 26, 28, 29, 32, 34, 36, 39],
 189: [],
 190: [1, 7, 42, 45, 49],
 191: [],
 192: [1, 5, 6, 33, 34, 35, 38, 39, 44],
 193: [1, 4, 5, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 20],
 194: [1, 7, 8, 9, 10, 22, 23, 24, 25, 26, 29],
 195: [1, 2],
 196: [1, 5, 7, 8, 17, 24, 25, 26, 27, 28, 31],
 197: [1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 28, 29, 30, 33, 34],
 198: [1, 6, 13, 14, 15, 17, 22],
 199: [1, 6, 7, 9, 10, 11, 12, 13, 14],
 200: [3],
 201: [1, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23],
 202: [1, 8, 9, 10, 11, 12, 13, 17],
 203: [],
 204: [1, 17, 18, 19, 20, 21, 22, 23],
 205: [1, 6, 10, 11, 12, 13, 14, 15, 16, 17, 33],
 206: [2],
 207: [1, 6, 9, 10, 13, 14, 16, 17, 18, 19, 23],
 208: [1, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
 209: [1, 3, 4, 5, 6, 7, 11, 12, 14, 15, 16, 17, 18],
 210: [1, 10, 13, 14, 15, 17, 18, 19, 20, 22, 23],
 211: [1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 26, 27],
 212: [1, 5, 20, 21, 22, 25, 26, 31, 32, 33, 34, 36],
 213: [1, 6, 11, 12, 18, 19, 20, 21, 22],
 214: [1, 6, 10, 23, 24, 25, 28, 30, 31, 33],
 215: [],
 216: [],
 217: [1, 8, 9, 10, 11, 12, 13, 14, 17],
 218: [2, 3],
 219: [1, 10, 15, 16, 18, 21, 22, 25],
 220: [1, 5, 6, 13, 14, 15, 16, 17, 18, 19, 20, 21],
 221: [],
 222: [1, 7, 8, 9, 10, 12, 13, 17, 21, 22, 23, 24, 27],
 223: [1, 3, 14, 15, 16, 18, 20, 21, 22, 23, 24, 25],
 224: [1, 18, 19, 23, 25, 26, 27, 28],
 225: [1, 8, 9, 10, 11, 12, 13, 14, 15, 16],
 226: [1, 4, 8, 9, 10, 12, 13, 14, 15, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
 227: [1, 7, 11, 12, 13, 14, 16, 17, 18, 21, 22, 23, 41],
 228: [],
 229: [1, 10, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25],
 230: [],
 231: [1, 7, 9, 10, 11, 14, 15, 16, 23],
 232: [1, 11, 12, 17, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32],
 233: [1, 10, 12, 14, 15, 16, 17, 19, 20, 27, 28, 32],
 234: [1, 8, 11, 12, 13, 14, 15, 17, 18, 19],
 235: [1, 8, 12, 13, 14, 15, 19, 20, 24],
 236: [1, 11, 13, 14, 15, 16],
 237: [1, 6, 14, 15, 16, 19, 21],
 238: [1, 3, 7, 8, 9, 10, 20, 21, 25, 29, 30],
 239: [1, 24, 25, 27, 29, 30, 32],
 240: [2],
 241: [1, 8, 11, 14, 15, 16, 17, 19, 22, 23],
 242: [1, 7, 8, 14, 19, 20, 21, 23, 24, 25, 34, 36],
 243: [1, 6, 7, 10, 22, 23, 24, 25, 26, 29, 31],
 244: [],
 245: [1, 10, 14, 15, 16, 17, 18, 19, 20, 26],
 246: [1, 6, 12, 13, 15, 16, 17, 18, 19, 25, 26, 28, 30, 31],
 247: [1, 6, 8, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 25],
 248: [1, 30, 31, 32, 39, 40, 41, 42, 48],
 249: [1, 5, 11, 12, 13, 14, 16, 17, 18, 19],
 250: [],
 251: [1, 4, 5, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 30],
 252: [1, 11, 14, 16, 17, 19, 20, 21, 22, 23],
 253: [],
 254: [],
 255: [1, 6, 11, 12, 13, 14, 16, 17, 21],
 256: [],
 257: [1, 8, 12, 14, 21, 22, 23, 24, 25, 27],
 258: [1, 11, 12, 13, 18, 19, 20, 22, 23],
 259: [3],
 260: [1, 25, 26, 27, 28, 29, 31, 32, 33, 34],
 261: [],
 262: [1, 7, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21],
 263: [],
 264: [],
 265: [1, 5, 14, 15, 16, 19, 21, 22, 23, 24],
 266: [1, 9, 10, 11, 12, 13, 14, 15, 16, 17],
 267: [],
 268: [1, 3, 11, 20, 21, 22, 23, 24, 25, 26, 27],
 269: [],
 270: [1, 4, 7, 8, 14, 15, 16, 18, 19, 28],
 271: [1, 5, 6, 7, 9, 13, 14, 15, 16, 17, 18],
 272: [1, 2],
 273: [1, 6, 18, 21, 22, 23, 24, 28, 29],
 274: [1, 3, 23, 24, 25, 35],
 275: [1, 7, 8, 14, 15, 16, 17, 25, 110, 111, 112, 113, 114, 115, 116, 118],
 276: [1, 5, 86, 87, 90, 93, 101, 105, 106, 107, 109, 110, 111],
 277: [],
 278: [],
 279: [],
 280: [],
 281: [],
 282: [1, 70, 71, 81],
 283: [1, 6, 9, 10, 12, 13, 14, 19, 20],
 284: [1, 3, 8, 9, 10, 11, 15],
 285: [1, 36, 37, 40, 41, 42, 44, 48, 49, 54, 55],
 286: [],
 287: [1, 7, 9, 11],
 288: [1, 7, 9, 15, 16, 17, 19],
 289: [1, 20, 21, 22, 23, 32],
 290: [1, 4, 5, 6, 9],
 291: [1, 2, 3, 15, 16, 18, 19, 20],
 292: [1, 2, 5, 6, 7, 14, 15, 17],
 293: [],
 294: [1, 10, 11, 12],
 295: [1, 4, 7, 11, 12],
 296: [1, 8, 12, 13, 15, 16, 17, 18],
 297: [1, 3, 5, 6, 7, 8, 9, 12, 13],
 298: [1, 5, 6, 9, 10, 12, 13, 14],
 299: [1, 3, 4, 8, 10, 18, 19, 25, 31],
 300: [1, 10, 11, 12, 13, 14, 16, 22, 23, 24],
 301: [1, 7, 8, 9, 10, 11, 12, 14, 15, 21, 22],
 302: [1, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18],
 303: [1, 12, 13, 15, 18, 20, 21],
 304: [1, 10, 11, 12, 14, 22, 60, 61],
 305: [],
 306: [1, 6, 15, 16, 17, 18, 20, 21, 30],
 307: [1, 7, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30],
 308: [1, 8, 10, 12, 13, 14, 15, 16, 17, 18, 24, 25, 26, 28, 30, 31, 33, 34, 37],
 309: [1, 6, 7, 8, 9, 10, 12, 19, 21, 22, 23, 24, 26, 27],
 310: [1, 11, 12, 17, 20, 24],
 311: [1, 8, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20],
 312: [],
 313: [1, 6, 7, 8, 9, 10, 16, 17, 19, 21, 22, 23],
 314: [1, 8, 9, 12, 17, 18, 25, 26, 27, 28, 33, 34],
 315: [1, 8, 9, 15, 16, 17, 25, 26, 27],
 316: [],
 317: [1, 11, 13, 14, 15, 23, 24, 25],
 318: [1, 6, 11, 13, 14, 17, 18, 19, 20, 24, 26, 27],
 319: [1, 10, 11, 14, 15, 16, 17, 18, 21, 22],
 320: [1, 6, 10, 11, 13, 14, 16, 17, 18, 19, 20, 21, 22, 61],
 321: [],
 322: [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 29, 30, 32, 34, 35],
 323: [],
 324: [1, 4, 10, 11, 13, 14],
 325: [1, 7, 8, 9, 11, 12, 13, 14, 15, 16],
 326: [],
 327: [2],
 328: [1, 10, 11, 13, 14, 15, 16, 17, 18],
 329: [1, 9, 10, 13, 14, 16, 19],
 330: [1, 7, 8, 12, 13],
 331: [],
 332: [],
 333: [2],
 334: [1, 3, 4, 6, 8, 9, 10, 11, 12, 16, 17, 18, 19],
 335: [1],
 336: [1, 7, 8, 9, 12],
 337: [1, 14, 15, 16, 17, 18, 20],
 338: [],
 339: [1, 12, 14, 15, 16, 19, 20, 21],
 340: [1, 10, 11, 13, 14, 15, 16, 17, 20, 21, 24],
 341: [2],
 342: [1, 12],
 343: [1, 5, 6, 7, 8, 9, 10, 11, 12],
 344: [1, 3, 6, 7, 8, 13, 15, 16, 17],
 345: [1, 13, 14, 15, 16, 18, 20, 21, 24, 25],
 346: [1, 4, 7, 8, 9, 10, 15, 16, 17, 18],
 347: [1],
 348: [1, 9, 10, 16, 17, 18],
 349: [1, 2, 4, 5, 6, 11, 12],
 350: [1, 8, 9, 11, 12, 15, 16],
 351: [1, 5, 22, 23, 25, 26, 27, 30, 31, 32],
 352: [1, 23, 25, 36, 44, 45, 50],
 353: [1, 5, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27],
 354: [2],
 355: [1, 4, 8, 10, 11, 13],
 356: [],
 357: [1, 9, 14, 15, 16, 17, 19, 20, 22, 23, 24],
 358: [1, 7, 8, 9, 12, 14, 19, 20],
 359: [1],
 360: [1, 2, 3, 15, 16, 17],
 361: [1, 10, 11, 12, 14, 15, 16, 17, 18, 22, 23, 24, 25],
 362: [1, 12, 13, 14, 15, 16, 18, 19, 20, 21, 22],
 363: [1],
 364: [1, 2],
 365: [1, 3, 7, 8, 9, 10, 11, 12, 13, 14, 16],
 366: [1, 7, 8, 12, 18, 19, 20],
 367: [1],
 368: [1, 9, 14, 15, 19, 20],
 369: [],
 370: [3, 13, 14],
 371: [1, 8, 12, 14, 15, 16, 17, 18, 19, 20],
 372: [1, 5, 6, 7, 8, 10, 11, 13, 14],
 373: [5],
 374: [1, 5, 16, 17, 18, 19, 20, 21, 22, 23, 34]}

### Extraindo de multiplas gazetas

In [4]:
pp = ProcessGazette(BASE_DIR="gazettes/")
txt_files = [f for f in os.listdir("gazettes/") if f.endswith('.txt')]
all_pages = []
for txt_file in txt_files:
    pages = pp.break_pages(txt_file, "ANO [X|V|I]+ ", save_file=True)
    all_pages.append(pages)

In [5]:
cont = 2 #len(all_pages)

In [6]:
all_topics = {}
topics_ = {}
ppt = PreProcessText("pt_core_news_lg")
for gazette in range(0,cont):
    all_topics[gazette] = {}
    for page, text in all_pages[gazette].items():
        if page in revistas_filtradas[gazette]:
            extractor = ExtractTopics(model = 'default')
            extractorGpt = ExtractTopics(model = 'gpt')
            ner = Ner()
            topics_[page] = {'text': text, 'topics': []}
            tokens_ = ppt.process_text(topics_[page]['text'])
            try:
                if len(tokens_) > 5 and page:
                    topics_[page]['tokens'] = tokens_
                    topics_[page]['topics'], topics_[page]['topics_info'] = extractor.extract_topics(topics_[page]['tokens'])
                    topics_[page]['topicsGpt'], topics_[page]['topics_infoGpt'] = extractorGpt.extract_topics(topics_[page]['tokens'])
                    topics_[page]['Ner'] = ner.extract_entities(topics_[page]['tokens'])
                    topics_[page]['similarity'] = extractor.find_topics('Licitação')
                    all_topics[gazette][page] = topics_[page]
            except Exception as e:
                print(page)
                print(tokens_)
                print(topics_[page]['text'])
                print(topics_[page]['tokens'])
                print(f"Error: {e}")


  from .autonotebook import tqdm as notebook_tqdm


5
['diario', 'oficial', 'fevereiro', 'ouvidoria', 'municipio', 'registrar', 'reclamacao', 'denuncia', 'sugestao', 'elogio', 'www.ouvidoria.salvador.ba.gov.br', 'sexta-feira', 'exceto', 'feriado', 'disque', 'salvador', 'solicitar', 'servico', 'informacao', 'acesse', 'www.disquesalvador.ba.gov.br', 'atendimento', 'diario', 'oficial', 'municipio', 'edicao', 'anterior', 'www.dom.salvador', 'ba.gov.br', 'solicitar', 'e-mail', 'diario.oficial@salvador.ba.gov.br', 'sexta-feira', 'exceto', 'feriado', 'gabinete', 'prefeito', 'Coordenador', 'tecnologia', 'Claudio', 'Raphael', 'Pereira', 'Vinicius', 'Moraes', 'responsavel', 'Andrey', 'gestor', 'editoracao', 'orcamentario', 'incluir', 'abertura', 'credito', 'adicional', 'remanejamento', 'transposicao', 'transferencia', 'observar', 'legislacao', 'vigente', 'limite', 'dotacao', 'global', 'regulamentar', 'couber', 'executivo', 'entrar', 'publicacao', 'retroagir', 'efeito', 'artigo', 'decretacao', 'emergencia', 'municipio', 'Salvador', 'gabinete', 'pr

In [7]:
gazeta = 1
pg = 7
print(all_topics[gazeta][pg]['similarity'])
print(all_topics[gazeta][pg]['Ner'])
all_topics[gazeta][pg]['topics_info'][:3]

[0, 1, 2]
[(Executivo Municipal, 'LOC'), (relatorio orcamentario, 'PER')]


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,45,0_desenvolvimento_manutencao_custeio_estruturar,"[desenvolvimento, manutencao, custeio, estrutu...","[desenvolvimento, desenvolvimento, desenvolvim..."
1,1,22,1_educacao_ensino_construcao_alimentacao,"[educacao, ensino, construcao, alimentacao, se...","[educacao, educacao, educacao]"
2,2,19,2_exclusivo_segunda_montante_feira,"[exclusivo, segunda, montante, feira, executiv...","[exclusivo, exclusivo, exclusivo]"


In [None]:
all_topics[gazeta][pg]['topics_infoGpt'][:3]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,40,-1_administrativo_servico_magalhaes_medida,"[administrativo, servico, magalhaes, medida, t...","[administrativo, administrativo, administrativo]"
1,0,38,0_andrey_antonio_claudio_pereira,"[andrey, antonio, claudio, pereira, moreira, m...","[Moreira, Moraes, Pereira]"
2,1,30,1_municipal_urbanismo_registrar_hospital,"[municipal, urbanismo, registrar, hospital, , ...","[municipal, municipal, municipal]"
