In [1]:
import utils
import os
import json
import torch
import numpy as np

from pyserini.search.lucene import LuceneSearcher
from pyserini.search.faiss import FaissSearcher, DprQueryEncoder

from models import DenseDocumentRetriever, SparseDocumentRetriever
from models import DprHighlighter, CncBertHighlighter
from models import DprReader

from config import ROOT, RAW_DIR, FORMMATED_DIR, INDEX_DIR

# sparse searcher
sparse_searcher = LuceneSearcher(f"{INDEX_DIR}/multi_fields")
# dense searcher
query_encoder = DprQueryEncoder("facebook/dpr-question_encoder-multiset-base")
dense_searcher = FaissSearcher(f"{INDEX_DIR}/dpr-ctx_encoder-multiset-base", query_encoder)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.weight', 'question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def generate_statistics_summary(titles):
    # Count the number of documents retrieved
    statistics_summary = {'year': dict(), 'form': dict(), 'cik': dict(), 'part': dict(), 'item': dict()}

    for title in titles:
        title_parts = title.split('_')
        year, form, cik, part, item = title_parts[0][:4], title_parts[1], title_parts[2], title_parts[3], title_parts[4]

        for category, value in zip(['year', 'form', 'cik', 'part', 'item'], [year, form, cik, part, item]):
            statistics_summary[category].setdefault(value, 0)
            statistics_summary[category][value] += 1

    return statistics_summary

def print_hits(hits, display_top_n=10):
    for i in range(display_top_n):
        print(f'{i+1:2} {hits[i].docid:7} {hits[i].score:.5f}')
        print(utils.retrieve_paragraph_from_docid(hits[i].docid))
    print()


# Retriever-Highlighter

In [3]:
target_paragraph = utils.retrieve_paragraph_from_docid("20221025_10-Q_789019_part1_item2_para334")
target_title = utils.convert_docid_to_title("20221025_10-Q_789019_part1_item2_para334")
print(f"target title: {target_title}")
print(target_paragraph)

target title: Microsoft Corp 2022 Q4 10-Q
On January 18, 2022, we entered into a definitive agreement to acquire Activision Blizzard, Inc. Activision Blizzard for 95.00 per share in an all-cash transaction valued at 68.7 billion, inclusive of Activision Blizzard s net cash. The acquisition has been approved by Activision Blizzard s shareholders, and we expect it to close in fiscal year 2023, subject to the satisfaction of certain regulatory approvals and other customary closing conditions.


## Retriever

In [4]:
sparse_retriever = SparseDocumentRetriever(sparse_searcher)
dense_retriever = DenseDocumentRetriever(dense_searcher)

sparse_hits = sparse_retriever.search_documents(target_paragraph)
dense_hits = dense_retriever.search_documents(target_paragraph)

# titles & texts format for DPR highlighter
sparse_titles, sparse_texts = sparse_retriever.retrieve_and_process_documents(target_paragraph)
dense_titles, dense_texts = dense_retriever.retrieve_and_process_documents(target_paragraph)

print(f"\nTarget: {target_paragraph}\n")
print("Retrieval results from sparse retriever:")
print_hits(sparse_hits)
print()

print("Retrieval results from dense retriever:")
print_hits(dense_hits)
print()


Target: On January 18, 2022, we entered into a definitive agreement to acquire Activision Blizzard, Inc. Activision Blizzard for 95.00 per share in an all-cash transaction valued at 68.7 billion, inclusive of Activision Blizzard s net cash. The acquisition has been approved by Activision Blizzard s shareholders, and we expect it to close in fiscal year 2023, subject to the satisfaction of certain regulatory approvals and other customary closing conditions.

Retrieval results from sparse retriever:
 1 20221025_10-Q_789019_part1_item2_para334 51.17300
On January 18, 2022, we entered into a definitive agreement to acquire Activision Blizzard, Inc. Activision Blizzard for 95.00 per share in an all-cash transaction valued at 68.7 billion, inclusive of Activision Blizzard s net cash. The acquisition has been approved by Activision Blizzard s shareholders, and we expect it to close in fiscal year 2023, subject to the satisfaction of certain regulatory approvals and other customary closing co

## Dpr Highlighter

The current behavior of the DPR highlighter, where it conducts span prediction on the reference, is not desired.

In [5]:
# highlighter
dpr_highlighter = DprHighlighter(model_name="facebook/dpr-reader-multiset-base", tokenizer_name="facebook/dpr-reader-multiset-base")

sparse_encoded_inputs, sparse_outputs = dpr_highlighter.highlighting_outputs(target_paragraph, target_title, sparse_texts)

start_logits = sparse_outputs.start_logits
end_logits = sparse_outputs.end_logits
relevance_logits = sparse_outputs.relevance_logits

# masked the logits prior to the start of target paragraph
target_para_start_idxs = dpr_highlighter.find_target_start_position(sparse_encoded_inputs['input_ids'])
masked_start_logits = dpr_highlighter.mask_prior_logits(start_logits, target_para_start_idxs)
masked_end_logits = dpr_highlighter.mask_prior_logits(end_logits, target_para_start_idxs)

print(f"\nTarget: {target_paragraph}\n")
print("Highlighting results from sparse retriever: (desc)")
dpr_highlighter.visualize_highlight_span(sparse_encoded_inputs, sparse_titles, relevance_logits, masked_start_logits, masked_end_logits)
print()


Some weights of the model checkpoint at facebook/dpr-reader-multiset-base were not used when initializing DPRReader: ['span_predictor.encoder.bert_model.pooler.dense.weight', 'span_predictor.encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRReader from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRReader from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRReaderTokenizer'.



Target: On January 18, 2022, we entered into a definitive agreement to acquire Activision Blizzard, Inc. Activision Blizzard for 95.00 per share in an all-cash transaction valued at 68.7 billion, inclusive of Activision Blizzard s net cash. The acquisition has been approved by Activision Blizzard s shareholders, and we expect it to close in fiscal year 2023, subject to the satisfaction of certain regulatory approvals and other customary closing conditions.

Highlighting results from sparse retriever: (desc)
0.2967 reference 20210427_10-Q_789019_part1_item2_para481:
start_idx: 115, end_idx: 115, span: blizzard
0.2721 reference 20220125_10-Q_789019_part1_item1_para153:
start_idx: 141, end_idx: 141, span: blizzard
0.1928 reference 20211026_10-Q_789019_part1_item2_para340:
start_idx: 129, end_idx: 129, span: blizzard
0.1320 reference 20220125_10-Q_789019_part1_item2_para475:
start_idx: 120, end_idx: 120, span: blizzard
0.0348 reference 20220426_10-Q_789019_part1_item1_para169:
start_idx: 

In [6]:
# highlighter
dpr_highlighter = DprHighlighter(model_name="facebook/dpr-reader-multiset-base", tokenizer_name="facebook/dpr-reader-multiset-base")

dense_encoded_inputs, dense_outputs = dpr_highlighter.highlighting_outputs(target_paragraph, target_title, dense_texts)

start_logits = sparse_outputs.start_logits
end_logits = sparse_outputs.end_logits
relevance_logits = sparse_outputs.relevance_logits

# masked the logits prior to the start of target paragraph
target_para_start_idxs = dpr_highlighter.find_target_start_position(sparse_encoded_inputs['input_ids'])
masked_start_logits = dpr_highlighter.mask_prior_logits(start_logits, target_para_start_idxs)
masked_end_logits = dpr_highlighter.mask_prior_logits(end_logits, target_para_start_idxs)

print(f"\nTarget: {target_paragraph}\n")
print("Highlighting results from dense retriever: (desc)")
dpr_highlighter.visualize_highlight_span(dense_encoded_inputs, dense_titles, relevance_logits, masked_start_logits, masked_end_logits)
print()


Some weights of the model checkpoint at facebook/dpr-reader-multiset-base were not used when initializing DPRReader: ['span_predictor.encoder.bert_model.pooler.dense.weight', 'span_predictor.encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRReader from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRReader from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRReaderTokenizer'.



Target: On January 18, 2022, we entered into a definitive agreement to acquire Activision Blizzard, Inc. Activision Blizzard for 95.00 per share in an all-cash transaction valued at 68.7 billion, inclusive of Activision Blizzard s net cash. The acquisition has been approved by Activision Blizzard s shareholders, and we expect it to close in fiscal year 2023, subject to the satisfaction of certain regulatory approvals and other customary closing conditions.

Highlighting results from dense retriever: (desc)
0.2967 reference 20220125_10-Q_789019_part1_item2_para475:
start_idx: 115, end_idx: 115, span: agreement
0.2721 reference 20220426_10-Q_789019_part2_item1a_para28:
start_idx: 141, end_idx: 141, span: significant
0.1928 reference 20221025_10-Q_789019_part2_item1a_para29:
start_idx: 129, end_idx: 129, span: our
0.1320 reference 20220125_10-Q_789019_part2_item1a_para28:
start_idx: 120, end_idx: 120, span: to
0.0348 reference 20220426_10-Q_789019_part1_item1_para169:
start_idx: 135, end

## CncBertHighlighter

In [6]:
cnc_highlighter = CncBertHighlighter(model_name="DylanJHJ/bert-base-final-v0-ep2")

sparse_highlight_results = cnc_highlighter.highlighting_outputs(target_paragraph, sparse_texts)
dense_highlight_results = cnc_highlighter.highlighting_outputs(target_paragraph, dense_texts)

print(f"\nTarget: {target_paragraph}\n")
print("Highlighting results from sparse retriever:")
cnc_highlighter.visualize_top_k_highlight(sparse_highlight_results)
print()

print("Highlighting results from dense retriever:")
cnc_highlighter.visualize_top_k_highlight(dense_highlight_results)
print()


Target: On January 18, 2022, we entered into a definitive agreement to acquire Activision Blizzard, Inc. Activision Blizzard for 95.00 per share in an all-cash transaction valued at 68.7 billion, inclusive of Activision Blizzard s net cash. The acquisition has been approved by Activision Blizzard s shareholders, and we expect it to close in fiscal year 2023, subject to the satisfaction of certain regulatory approvals and other customary closing conditions.

Highlighting results from sparse retriever:
reference 1: ['acquire', 'acquisition', 'close', 'Blizzard,', 'Activision']
reference 2: ['approved', 'acquire', 'acquisition', 'Blizzard,', 'close']
reference 3: ['approved', 'acquire', 'acquisition', 'Blizzard,', 'close']
reference 4: ['95.00', '68.7', '18,', 'January', '2022,']
reference 5: ['95.00', '68.7', '18,', 'January', '2022,']
reference 6: ['95.00', '68.7', 'January', '18,', 'approved']
reference 7: ['January', '18,', '95.00', '68.7', 'Blizzard']
reference 8: ['January', '18,',

# Question Answering

In [7]:
query = "What company does microsoft acquire in 2022?"

## Retriever

In [8]:
sparse_retriever = SparseDocumentRetriever(sparse_searcher)
dense_retriever = DenseDocumentRetriever(dense_searcher)

sparse_hits = sparse_retriever.search_documents(query)
dense_hits = dense_retriever.search_documents(query)

# titles & texts format for DPR reader
sparse_titles, sparse_texts = sparse_retriever.retrieve_and_process_documents(query)
dense_titles, dense_texts = dense_retriever.retrieve_and_process_documents(query)

print(f"\Query: {query}\n")
print("Retrieval results from sparse retriever:")
print_hits(sparse_hits)
print()

print("Retrieval results from dense retriever:")
print_hits(dense_hits)
print()

\Query: What company does microsoft acquire in 2022?

Retrieval results from sparse retriever:
 1 20220429_10-Q_320193_part1_item2_para40 3.27060
In addition to its contractual cash requirements, the Company has a share repurchase program authorized by the Board of Directors (the Program ). As of March 26, 2022, the remaining availability under the Program was 17.6 billion. On April 28, 2022, the Company announced the Board of Directors increased the Program authorization by 90 billion. The Program does not obligate the Company to acquire a minimum amount of shares.
 2 20220729_10-Q_320193_part1_item2_para38 2.99690
In addition to its contractual cash requirements, the Company has a share repurchase program authorized by the Board of Directors (the Program ). The Program does not obligate the Company to acquire a minimum amount of shares. As of June 25, 2022, the Company s quarterly cash dividend was 0.23 per share. The Company intends to increase its dividend on an annual basis, subje

## Reader

In [10]:
# reader
dpr_reader = DprReader(model_name="facebook/dpr-reader-multiset-base", tokenizer_name="facebook/dpr-reader-multiset-base")

sparse_encoded_inputs, sparse_outputs = dpr_reader.generate_model_outputs(query, sparse_titles, sparse_texts)
dense_encoded_inputs, dense_outputs = dpr_reader.generate_model_outputs(query, dense_titles, dense_texts)

print(f"\nQuery: {query}\n")
print("Reader results from sparse retriever: (desc)")
dpr_reader.visualize_answer_span(sparse_encoded_inputs, sparse_titles, sparse_outputs.relevance_logits, sparse_outputs.start_logits, sparse_outputs.end_logits)
print()

print("Reader results from dense retriever: (desc)")
dpr_reader.visualize_answer_span(dense_encoded_inputs, dense_titles, dense_outputs.relevance_logits, dense_outputs.start_logits, dense_outputs.end_logits)

Some weights of the model checkpoint at facebook/dpr-reader-multiset-base were not used when initializing DPRReader: ['span_predictor.encoder.bert_model.pooler.dense.weight', 'span_predictor.encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRReader from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRReader from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRReaderTokenizer'.



Query: What company does microsoft acquire in 2022?

Reader results from sparse retriever: (desc)
0.1966 reference 20220429_10-Q_320193_part1_item2_para40:
start_idx: 57, end_idx: 42, span: 
0.1400 reference 20220729_10-Q_320193_part1_item2_para37:
start_idx: 36, end_idx: 37, span: the company
0.1321 reference 20220429_10-Q_320193_part1_item2_para39:
start_idx: 36, end_idx: 37, span: the company
0.1240 reference 20180202_10-Q_320193_part1_item2_para53:
start_idx: 38, end_idx: 38, span: 2017
0.1147 reference 20220729_10-Q_320193_part1_item2_para38:
start_idx: 57, end_idx: 42, span: 
0.0757 reference 20220128_10-Q_320193_part1_item2_para37:
start_idx: 117, end_idx: 41, span: 
0.0579 reference 20180202_10-Q_320193_part1_item1_para42:
start_idx: 47, end_idx: 47, span: directors
0.0578 reference 20220429_10-Q_320193_part2_item2_para1:
start_idx: 139, end_idx: 139, span: as
0.0509 reference 20220729_10-Q_320193_part2_item2_para1:
start_idx: 233, end_idx: 247, span: board of directors author