Add a MiniLM cross-encoder score to every (question, passage) pair in the dataset.

In [29]:
import json
from tqdm.auto import tqdm

In [30]:
def read_rows(path):
    """Load a JSON-lines file into a list of decoded objects.

    Args:
        path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    # Explicit UTF-8 keeps non-ASCII text readable regardless of the platform's
    # locale default; the context manager guarantees the handle is closed.
    with open(path, encoding='utf-8') as f_in:
        # Whitespace-only lines (e.g. a trailing blank line) are skipped rather
        # than being passed to json.loads, which would raise on them.
        return [json.loads(line) for line in f_in if line.strip()]

def write_json_format(path_out, rows):
    """Write rows to ``path_out`` as JSON lines (one JSON document per line).

    Args:
        path_out: Destination path; an existing file is overwritten.
        rows: Iterable of JSON-serialisable objects.

    Raises:
        TypeError: If a row contains a value json.dumps cannot serialise.
        OSError: If the file cannot be opened or written.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly instead of relying on the locale default.
    # The context manager flushes and closes the handle even on error.
    with open(path_out, 'w', encoding='utf-8') as f_out:
        for row in rows:
            f_out.write(json.dumps(row, ensure_ascii=False) + '\n')

In [31]:
# JSON-lines dataset: read by read_rows() and later overwritten in place by
# write_json_format() in the last cell of this notebook.
path_dataset = 'test-B-big/allegro.jl'
# Model identifier handed to CrossEncoder when the model is loaded.
path_model = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

In [32]:
# Load every dataset record into memory as a list of dicts.
rows = read_rows(path_dataset)

In [33]:
# Pair each translated question with its translated passage, in row order,
# so the pairs can be scored in a single predict() call.
inputs = [
    (record['question_translated'], record['passage_translated'])
    for record in rows
]

In [34]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from sentence_transformers import CrossEncoder
# Load the cross-encoder named by path_model.
# NOTE(review): max_length=512 is presumably the token limit beyond which
# inputs are truncated - confirm against the sentence-transformers docs.
model = CrossEncoder(path_model, max_length=512)

In [35]:
# Score every (question, passage) pair with the cross-encoder.
scores = model.predict(inputs)

In [36]:
# Inspect the raw scores returned by model.predict().
scores

array([  0.9330688,  -5.916269 ,  -7.3078804, ...,  -9.100338 ,
       -11.066532 , -11.440943 ], dtype=float32)

In [37]:
# zip() silently stops at the shorter input, so make sure every row has a
# score before the rows are mutated in place.
if len(rows) != len(scores):
    raise ValueError(
        'rows/scores length mismatch: %d rows vs %d scores' % (len(rows), len(scores))
    )

for row, score in zip(rows, scores):
    # Cast to a built-in float: json.dumps cannot serialise numpy float32.
    row['score_miniLM'] = float(score)

In [38]:
# Sanity check: number of dataset rows (should match the number of scores).
len(rows)

326596

In [39]:
# Sanity check: number of predicted scores (should match the number of rows).
len(scores)

326596

In [40]:
# Spot-check the last ten rows after the scores have been attached.
rows[-10:]

[{'question_id': 499,
  'question_text': 'Co mogę umieścić w dodatkowych informacjach o dostawie i płatności?',
  'passage_id': '193',
  'passage_text': 'Postaraj się, aby jakość Twoich ocen na koncie, które bierze udział w promocji nie spadła poniżej 98%. Jeśli jednak tak się stanie lub jeśli przestaniesz spełniać inne warunki promocji, Twoje konto zostanie z niej wykluczone z końcem opłaconego okresu.',
  'score_bm25': 0.2670532,
  'score_bm25_not_lemmatized': 0.27342082998151523,
  'score_bm25_bigrams': 0,
  'passage_translated': 'Try to ensure that the quality of your rating in your account that participates in the promotion does not fall below 98%. However, if this happens or if you stop fulfilling other promotional conditions, your account will be excluded from it at the end of the paid period.',
  'question_translated': 'What can I put in the additional delivery and payment information?',
  'distillbert_answer': 'promotional conditions',
  'gpt3_answer': 'None.',
  'chatgpt_answ

In [41]:
# Spot-check a single row (index 2100) to confirm 'score_miniLM' is present.
rows[2100]

{'question_id': 2,
 'question_text': 'Czym się różnią zakładki Sprzedane i Zamówienia?',
 'passage_id': '741',
 'passage_text': 'Tak, możesz posiadać kilka kont, o ile zarejestrujesz je na swoje poprawne i prawdziwe dane osobowe.',
 'score_bm25': 0.93356365,
 'score_bm25_not_lemmatized': 0.9523778504243336,
 'score_bm25_bigrams': 0,
 'passage_translated': 'Yes, you may have several accounts if you register them for your correct and true personal information.',
 'question_translated': 'How do Sold Bookmarks and Orders differ?',
 'distillbert_answer': 'if you register them for your correct and true personal information',
 'gpt3_answer': 'Sold Bookmarks are items that have been purchased, while Orders are requests for items to be purchased.',
 'chatgpt_answer': 'Sold tab shows items that have already been sold, while Orders tab shows items that have been purchased but not yet shipped.',
 'score_miniLM': -11.117280960083008}

In [42]:
# Persist the rows, now carrying 'score_miniLM', back to disk.
# NOTE(review): this overwrites the input file (path_dataset) in place, so the
# pre-scoring version is lost and an interrupted write leaves a truncated
# file - consider writing to a separate output path.
write_json_format(path_dataset, rows)