Use a BERT model to re-rank the passages.

In [9]:
from transformers import AutoTokenizer, BertForSequenceClassification
from datasets import load_dataset
import torch
from tqdm.auto import tqdm
import math

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [27]:
# Read the JSON-lines file as the 'test' split and keep only that split.
dataset = load_dataset('json', data_files=dict(test='dataset-test-100.jl'))['test']

Using custom data configuration default-528fc9f264cbcfca


Downloading and preparing dataset json/default to /home/i306412/.cache/huggingface/datasets/json/default-528fc9f264cbcfca/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/i306412/.cache/huggingface/datasets/json/default-528fc9f264cbcfca/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
# Show the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['question_text', 'passages_id', 'passage_text'],
    num_rows: 119944
})

In [29]:
# Distinct questions in first-seen order. dict.fromkeys keeps insertion order
# and de-duplicates in linear time, whereas `x not in list` rescans the list
# for every row. Question texts are used as dict keys further down, so they
# are already required to be hashable.
question_texts = list(dict.fromkeys(dataset['question_text']))

In [30]:
# Number of distinct questions.
len(question_texts)

1200

In [7]:
# Tokenizer comes from the "allegro/herbert-base-cased" checkpoint, while the
# classifier is loaded from a local directory.
# NOTE(review): presumably the local model shares this vocabulary - confirm.
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

In [31]:
# Load the classifier from the local checkpoint directory and place it on the
# selected device; eval() switches off dropout and, being the last expression,
# also displays the module structure.
model = BertForSequenceClassification.from_pretrained('model/morfeusz-10-1epoch').to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [34]:
# Score every (question, passage) row with the classifier and group the
# results per question.
batch_size = 20
dataset_size = len(dataset)
# {question_text: [(prob, passage_id), ...]} - probability first, so that the
# tuples sort by score.
probs = {question_text: [] for question_text in question_texts}
for start in tqdm(range(0, dataset_size, batch_size)):
    # Slice the rows once per batch. Indexing a whole column first
    # (dataset['col'][a:b]) can materialise the entire column on every
    # iteration, depending on the installed `datasets` version. Slicing clamps
    # at the end of the dataset, so no explicit min() is needed.
    batch = dataset[start:start + batch_size]
    questions = batch['question_text']
    passage_ids = batch['passages_id']
    passages = batch['passage_text']
    # NOTE(review): padding='max_length' pads every batch to the tokenizer's
    # maximum length; padding=True (longest in batch) would likely be much
    # faster. Kept as-is so the reported score stays reproducible.
    inputs = tokenizer(text=questions, text_pair=passages, return_tensors='pt', padding='max_length', truncation=True)
    inputs = inputs.to(device)
    with torch.no_grad():
        preds = model(**inputs)
    logits = preds['logits'].cpu()
    # Probability of class index 1 for every row, as plain Python floats so
    # the stored tuples do not hold tensors.
    batch_probs = torch.softmax(logits, dim=-1)[:, 1].tolist()
    for question_text, passage_id, prob in zip(questions, passage_ids, batch_probs):
        probs[question_text].append((prob, passage_id))

  0%|          | 0/5998 [00:00<?, ?it/s]

In [35]:
# Write one line per question: the ids of its k highest-scoring passages,
# tab-separated, best first.
out_path = 'out-test-morfeusz-10-1epoch.tsv'
k = 10
# The context manager closes the file even if a write or sort fails.
with open(out_path, 'w') as f_out:
    for question_text in question_texts:
        # Tuples are (prob, passage_id); descending sort puts the best first.
        topk = sorted(probs[question_text], reverse=True)[:k]
        f_out.write('\t'.join(passage_id for _, passage_id in topk) + '\n')

In [21]:
def score_passages(predicted_path, expected_path):
    """Return the mean NDCG of the predicted rankings, scaled to 0-100.

    Both files contain one line per question with whitespace-separated passage
    ids. Line i of the predicted file is compared with line i of the expected
    file; a predicted id counts as a hit when it appears on the expected line.

    Args:
        predicted_path: Path to the file with ranked predicted ids, best first.
        expected_path: Path to the file with the relevant ids per question.

    Returns:
        100 * (sum of per-line DCG/IDCG) / (number of predicted lines).
        Lines whose expected set is empty contribute 0, and an empty
        predictions file yields 0.0.
    """
    def read_lines(path):
        # Iterating the file yields every record, including a last line that
        # has no trailing newline, and the handle is closed afterwards.
        with open(path) as handle:
            return [line.rstrip('\n') for line in handle]

    predicted_lines = read_lines(predicted_path)
    expected_lines = read_lines(expected_path)
    if not predicted_lines:
        return 0.0

    score = 0
    for predicted_line, expected_line in zip(predicted_lines, expected_lines):
        predicted_ids = predicted_line.split()
        expected_ids = expected_line.split()
        relevant = set(expected_ids)

        dcg = sum(
            1 / math.log2(rank + 2)
            for rank, passage_id in enumerate(predicted_ids)
            if passage_id in relevant
        )
        idcg = sum(1 / math.log2(rank + 2) for rank in range(len(expected_ids)))

        # No relevant passages listed: the ratio is undefined, count it as 0.
        if idcg:
            score += dcg / idcg
    return 100 * score / len(predicted_lines)

In [22]:
# Score the re-ranked output file against 'expected.tsv'.
score_passages(out_path, 'expected.tsv')

24.424258761933928