In [1]:
import json
import pandas as pd
import os
from tqdm import tqdm
import pickle
from getpass import getpass

In [2]:
# When True, the index over all chunks is rebuilt even if its pickle file already exists.
REINDEXAR_INDICE_BM25_TODOS_CHUNKS = False
# Pickle file the all-chunks index is written to / restored from.
NOME_ARQUIVO_INDICE_BM25_TODOS_CHUNKS = 'outputs/indice_bm25_todos_chunks.pickle'

# When True, the article-only index is rebuilt even if its pickle file already exists.
REINDEXAR_INDICE_BM25_APENAS_ART = False
# Pickle file the article-only index is written to / restored from.
NOME_ARQUIVO_INDICE_BM25_APENAS_ART = 'outputs/indice_bm25_apenas_art.pickle'

# 1. Carrega as bases de dados

Carrega as bases de dados de questões (são as queries) e os chunks de pesquisa (são os documentos a serem pesquisados).

In [3]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (typically a trailing newline at the end of the file)
        # carry no record; skipping them avoids a spurious JSONDecodeError.
        return [json.loads(line) for line in f if line.strip()]

# Search chunks: the documents that the queries are matched against.
chunks_pesquisa = load_jsonl('inputs/chunks_pesquisa.jsonl')
# Questions: used as the queries.
questoes = load_jsonl('inputs/questoes.jsonl')

## 1.1 Gera dados derivados das bases de dados carregadas

O código que calcula as métricas espera os qrels em um DataFrame pandas. A célula abaixo gera os qrels no formato esperado pela ferramenta.

In [4]:
def monta_qrels(lista_questoes):
    """Build a qrels DataFrame from a list of question records.

    Each URN listed under a question's 'URN_FUNDAMENTACAO' key becomes one row.

    Args:
        lista_questoes: Iterable of dicts providing the 'ID_QUESTAO' and
            'URN_FUNDAMENTACAO' keys.

    Returns:
        DataFrame with the columns QUERY_KEY (question id), DOC_KEY (URN),
        SCORE (always 1) and RANK (1-based position of the URN inside the
        question's 'URN_FUNDAMENTACAO' list).

    Raises:
        KeyError: If a record lacks one of the two required keys.
    """
    registros = [
        {
            "QUERY_KEY": questao['ID_QUESTAO'],
            "DOC_KEY": urn,
            "SCORE": 1,
            "RANK": posicao,
        }
        for questao in lista_questoes
        for posicao, urn in enumerate(questao['URN_FUNDAMENTACAO'], start=1)
    ]
    # Passing `columns` keeps the expected layout even when there are no rows.
    return pd.DataFrame(registros, columns=["QUERY_KEY", "DOC_KEY", "SCORE", "RANK"])


# Qrels in the format expected by the metrics code, over the original URNs.
qrels_todos_chunks = monta_qrels(questoes)

Filtra chunks_pesquisa para isolar apenas os chunks que são artigos completos.

In [5]:
# Keep only the chunks whose 'TIPO' field is 'ART'.
chunks_pesquisa_apenas_art = [
    chunk
    for chunk in chunks_pesquisa
    if chunk['TIPO'] == 'ART'
]

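Uniformiza as questões para que a fundamentação considere apenas o artigo: cada URN é cortada logo após `!artN`, descartando o que vier depois, e as URNs repetidas resultantes são removidas.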

In [6]:
import copy
import re

# Matches the article marker inside a URN: '!art' followed by one to three digits.
# NOTE(review): an article number with four or more digits would be cut after
# its third digit — confirm the corpus has none, or widen the pattern.
padrao = re.compile(r'!art\d{1,3}')

# Work on a deep copy so the original `questoes` keep their full URNs.
questoes_fund_apenas_art = copy.deepcopy(questoes)

for questao in questoes_fund_apenas_art:
    nova_fundamentacao = []

    for texto in questao.get("URN_FUNDAMENTACAO", []):
        match = padrao.search(texto)
        # Cut exactly at the end of '!artN'; URNs without the marker stay whole.
        nova_fundamentacao.append(texto[:match.end()] if match else texto)

    # Several URNs may collapse into the same article. dict.fromkeys drops the
    # duplicates while keeping first-seen order. The previous list(set(...))
    # ordered the URNs by string hash, which varies between interpreter runs,
    # so any value derived from the list order was not reproducible.
    questao["URN_FUNDAMENTACAO"] = list(dict.fromkeys(nova_fundamentacao))


Agora cria um qrels para essa situação uniformizada por artigo.

In [7]:
# Build the qrels in the expected format from the article-level questions.
id_questao, urn_chunk, score, rank = [], [], [], []
for q in questoes_fund_apenas_art:
    total_docs = len(q['URN_FUNDAMENTACAO'])
    # One row per URN: repeat the question id, mark every URN with score 1
    # and number the URNs from 1 in list order.
    id_questao.extend([q['ID_QUESTAO']] * total_docs)
    urn_chunk.extend(q['URN_FUNDAMENTACAO'])
    score.extend([1] * total_docs)
    rank.extend(range(1, total_docs + 1))

qrels_apenas_art = pd.DataFrame(
    {
        "QUERY_KEY": id_questao,
        "DOC_KEY": urn_chunk,
        "SCORE": score,
        "RANK": rank,
    }
)

# 2. Cria índices invertidos BM25 para o campo TEXTO

In [8]:
from bm25 import IndiceInvertido, BM25, tokenizador_pt_remove_html

def _carrega_ou_cria_indice(chunks, nome_arquivo, reindexar):
    """Return an inverted index over `chunks`, reusing the pickled copy when allowed.

    Args:
        chunks: Chunk records; each must provide the 'URN' and 'TEXTO' keys.
        nome_arquivo: Pickle file used to persist / restore the index.
        reindexar: When True, rebuild the index even if `nome_arquivo` exists.

    Returns:
        The IndiceInvertido instance, either freshly built (and saved to
        `nome_arquivo`) or restored from it.
    """
    indice = IndiceInvertido(tokenizador_pt_remove_html)
    if reindexar or not os.path.exists(nome_arquivo):
        # Make sure the destination directory exists *before* the slow
        # indexing run, so the result is not lost to a failed write at the end.
        diretorio = os.path.dirname(nome_arquivo)
        if diretorio:
            os.makedirs(diretorio, exist_ok=True)
        # First-time (or forced) indexing; the original author's note says
        # this takes about 35 minutes.
        indice.adiciona_objetos(chunks, lambda obj: (obj['URN'], obj['TEXTO']))
        indice.to_pickle(nome_arquivo)
    else:
        # Restore a previously saved index.
        # NOTE(review): presumably this unpickles the file, which can execute
        # arbitrary code — only load index files produced by this notebook.
        indice.from_pickle(nome_arquivo)
    return indice


# Index over all chunks
iidx_todos_chunks = _carrega_ou_cria_indice(
    chunks_pesquisa,
    NOME_ARQUIVO_INDICE_BM25_TODOS_CHUNKS,
    REINDEXAR_INDICE_BM25_TODOS_CHUNKS,
)

# Index over the article-only chunks
iidx_apenas_art = _carrega_ou_cria_indice(
    chunks_pesquisa_apenas_art,
    NOME_ARQUIVO_INDICE_BM25_APENAS_ART,
    REINDEXAR_INDICE_BM25_APENAS_ART,
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\P_8454\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\P_8454\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\P_8454\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


# 3. Cria buscadores BM25 e mostra os resultados

In [9]:
# BM25 hyperparameters shared by both searchers. They are named once so the
# two instances cannot drift apart when the values are tuned.
# NOTE(review): the exact meaning of `bias_idf` is defined by the project's
# bm25 module, which is not visible here.
BM25_K1 = 0.82
BM25_B = 0.68
BM25_BIAS_IDF = 1

# One searcher per index.
buscador_todos_chunks = BM25(iidx_todos_chunks, k1=BM25_K1, b=BM25_B, bias_idf=BM25_BIAS_IDF)
buscador_apenas_art = BM25(iidx_apenas_art, k1=BM25_K1, b=BM25_B, bias_idf=BM25_BIAS_IDF)

In [10]:
def pesquisa_bm25(buscador, questoes_pesquisa=None, top_k=20):
    """Run every question through a searcher and collect its top hits.

    Args:
        buscador: Object exposing ``pesquisar(texto)``, which returns a ranked
            sequence whose items carry the document key at index 0.
        questoes_pesquisa: Question records providing the 'ID_QUESTAO' and
            'ENUNCIADO_COM_ALTERNATIVAS' keys. When omitted, the notebook-level
            ``questoes`` list is used, as the function always did before.
        top_k: Maximum number of hits kept per question (default 20).

    Returns:
        DataFrame with the columns QUERY_KEY (question id), DOC_KEY (key of
        the retrieved document) and RANK (1-based position within that
        question's hits).

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be >= 1")
    if questoes_pesquisa is None:
        # Backward-compatible default: the questions loaded by the notebook.
        questoes_pesquisa = questoes

    col_resultado_id_questao = []
    col_resultado_urn_chunk = []
    col_resultado_rank = []

    for q in tqdm(questoes_pesquisa):
        resultados = buscador.pesquisar(q['ENUNCIADO_COM_ALTERNATIVAS'])

        # Keep only the document key (index 0) of the first `top_k` hits.
        primeiros_urns = [tupla_key_score[0] for tupla_key_score in resultados[:top_k]]

        col_resultado_id_questao.extend([q['ID_QUESTAO']] * len(primeiros_urns))
        col_resultado_urn_chunk.extend(primeiros_urns)
        col_resultado_rank.extend(range(1, len(primeiros_urns) + 1))

    return pd.DataFrame({
        "QUERY_KEY": col_resultado_id_questao,
        "DOC_KEY": col_resultado_urn_chunk,
        "RANK": col_resultado_rank,
    })

In [11]:
# Run the questions against the all-chunks searcher...
df_resultados_pesquisa_todos_chunks = pesquisa_bm25(buscador_todos_chunks)
# ...and against the article-only searcher.
df_resultados_pesquisa_apenas_art = pesquisa_bm25(buscador_apenas_art)

100%|████████████████████████████████████████████████████████████████████████████████| 700/700 [00:26<00:00, 26.17it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 700/700 [00:19<00:00, 36.44it/s]


In [12]:
from metricas import histograma_metricas, boxplot_metricas, metricas

# Metrics for the all-chunks search, scored against the all-chunks qrels.
df_metricas_pesquisa_todos_chunks = metricas(
    df_resultados_pesquisa_todos_chunks,
    qrels_todos_chunks,
    aproximacao_trec_eval=True,
    k=[5, 10, 20],
)

# Metrics for the article-only search, scored against the article-level qrels.
df_metricas_pesquisa_apenas_art = metricas(
    df_resultados_pesquisa_apenas_art,
    qrels_apenas_art,
    aproximacao_trec_eval=True,
    k=[5, 10, 20],
)

Resultados para a pesquisa com BM25 em todos os chunks

In [13]:
# Summary statistics of the metrics computed for the all-chunks search.
display(df_metricas_pesquisa_todos_chunks.describe())
# Optional plots, currently disabled:
#histograma_metricas(df_metricas_pesquisa_todos_chunks, metrica_1='P@5', metrica_2='R@5', metrica_3='MRR@5', metrica_4='nDCG@5')
#boxplot_metricas(df_metricas_pesquisa_todos_chunks, metricas=['P@5', 'R@5', 'MRR@5'])

Unnamed: 0,P@5,P@10,P@20,R@5,R@10,R@20,MRR@5,MRR@10,MRR@20,nDCG@5,nDCG@10,nDCG@20
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.121429,0.088714,0.058429,0.349733,0.437997,0.515781,0.321357,0.33742,0.343217,0.283202,0.320505,0.349228
std,0.132968,0.093577,0.059688,0.420041,0.424924,0.422059,0.387661,0.37664,0.371907,0.343661,0.335228,0.326191
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.2,0.1,0.05,0.166667,0.25,0.5,0.2,0.2,0.2,0.146068,0.234639,0.294053
75%,0.2,0.1,0.05,1.0,1.0,1.0,0.5,0.5,0.5,0.5,0.544052,0.570642
max,0.6,0.6,0.45,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Summary statistics of the metrics computed for the article-only search.
display(df_metricas_pesquisa_apenas_art.describe())
# Optional plots, currently disabled:
#histograma_metricas(df_metricas_pesquisa_apenas_art, metrica_1='P@5', metrica_2='R@5', metrica_3='MRR@5', metrica_4='nDCG@5')
#boxplot_metricas(df_metricas_pesquisa_apenas_art, metricas=['P@5', 'R@5', 'MRR@5'])

Unnamed: 0,P@5,P@10,P@20,R@5,R@10,R@20,MRR@5,MRR@10,MRR@20,nDCG@5,nDCG@10,nDCG@20
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.202571,0.114143,0.065571,0.717337,0.77148,0.854449,0.682238,0.686473,0.691132,0.65432,0.676136,0.701098
std,0.147635,0.086994,0.046866,0.405348,0.381373,0.308293,0.409947,0.403534,0.395904,0.389864,0.375434,0.344655
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.2,0.1,0.05,0.333333,0.575,1.0,0.333333,0.333333,0.333333,0.386853,0.396306,0.430677
50%,0.2,0.1,0.05,1.0,1.0,1.0,1.0,1.0,1.0,0.787702,0.874215,0.884642
75%,0.2,0.1,0.05,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,0.8,0.5,0.25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
