In [None]:
from google.colab import drive

# Mount Google Drive and derive the project's data root from the same absolute
# mount point, so later cells do not depend on the current working directory.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT, force_remount=True)
drive_PATH = MOUNT_POINT + '/MyDrive/Colab Notebooks/l101.experiments.1'

In [None]:
!pip install jsonlines

In [None]:
from transformers import AutoTokenizer, RemBertModel
import torch

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
REMBERT_CHECKPOINT = "google/rembert"
tokenizer = AutoTokenizer.from_pretrained(REMBERT_CHECKPOINT)
model = RemBertModel.from_pretrained(REMBERT_CHECKPOINT)

In [None]:
def encode(sent):
    """Run `sent` through the module-level tokenizer and model.

    Args:
        sent: Text handed to `tokenizer`. Only batch entry 0 of the model
            output is returned, so this is presumably a single sentence.

    Returns:
        A NumPy array holding entry 0 of the model's `last_hidden_state`
        (presumably shaped (num_tokens, hidden_size) -- TODO confirm).
    """
    inputs = tokenizer(sent, return_tensors="pt")
    # Inference only: disabling gradient tracking avoids building an autograd
    # graph for every sentence, which saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state
    return last_hidden_states.detach().numpy()[0]

def encode_sentence(sent):
    """Return a single pooled vector for `sent`.

    Args:
        sent: Text forwarded unchanged to `encode`.

    Returns:
        The mean of the array returned by `encode`, taken along its first
        axis (presumably the token axis -- TODO confirm).
    """
    enc = encode(sent)
    # Vectorised mean over the first axis; replaces a Python-level sum over rows.
    pooled_enc = enc.mean(axis=0)
    return pooled_enc

def encode_candidates(cands):
    """Apply `encode_sentence` to each candidate, preserving input order.

    Args:
        cands: Iterable of candidates.

    Returns:
        A list with one `encode_sentence` result per candidate.
    """
    encoded = []
    for candidate in cands:
        encoded.append(encode_sentence(candidate))
    return encoded

In [None]:
import jsonlines
def _read_jsonl(path):
    """Return every record of the JSON Lines file at `path` as a list."""
    with jsonlines.open(path) as reader:
        return list(reader.iter())

# BLEURT
bleurt_ENDE_PATH = drive_PATH + '/data-main/bleurt-ENDE.jsonl'
bleurt_ENDE = _read_jsonl(bleurt_ENDE_PATH)
# chrF++
chrf_ENDE_PATH = drive_PATH + '/data-main/chrf-ENDE.jsonl'
chrf_ENDE = _read_jsonl(chrf_ENDE_PATH)

In [None]:
num_examples = 1000

# Both score files must contain exactly `num_examples` records ...
assert len(chrf_ENDE) == num_examples, (
    f"expected {num_examples} chrF records, found {len(chrf_ENDE)}")
assert len(bleurt_ENDE) == num_examples, (
    f"expected {num_examples} BLEURT records, found {len(bleurt_ENDE)}")

# ... and agree on source, reference and candidates, position by position.
# The messages identify the offending example and field on failure.
for i, (chrf_rec, bleurt_rec) in enumerate(zip(chrf_ENDE, bleurt_ENDE)):
  for field in ('src', 'ref', 'candidates'):
    assert chrf_rec[field] == bleurt_rec[field], (
        f"example {i}: '{field}' differs between the chrF and BLEURT files")

In [None]:
import tqdm
ENDE = []

# Fields taken from the chrF record that describe the example itself.
example_fields = ('src', 'ref', 'candidates')
# Fields copied unchanged from the BLEURT records: candidate scores, MBR
# candidate scores, MBR score matrix, MBR winner index, MBR winner.
bleurt_fields = (
    'BLEURT_cand_scores',
    'MBR_BLEURT_cand_scores',
    'MBR_BLEURT_score_matrix',
    'MBR_BLEURT_winner_index',
    'MBR_BLEURT_winner',
)
# The same five fields, copied unchanged from the chrF records.
chrf_fields = (
    'CHRF_cand_scores',
    'MBR_CHRF_cand_scores',
    'MBR_CHRF_score_matrix',
    'MBR_CHRF_winner_index',
    'MBR_CHRF_winner',
)

for idx in tqdm.tqdm(range(num_examples)):
  chrf_rec = chrf_ENDE[idx]
  bleurt_rec = bleurt_ENDE[idx]

  # Key order matches the order in which fields appear in each merged record.
  ENDE_dict = {name: chrf_rec[name] for name in example_fields}
  ENDE_dict['enc_candidates'] = encode_candidates(chrf_rec['candidates'])
  ENDE_dict.update((name, bleurt_rec[name]) for name in bleurt_fields)
  ENDE_dict.update((name, chrf_rec[name]) for name in chrf_fields)

  ENDE.append(ENDE_dict)

In [None]:
import numpy as np
# Replace each example's candidate encodings with plain nested lists of
# Python floats (via a float64 array).
for idx in tqdm.tqdm(range(num_examples)):
  example = ENDE[idx]
  as_matrix = np.array(example['enc_candidates'], dtype=np.float64)
  example['enc_candidates'] = as_matrix.tolist()

In [None]:
# Write the merged examples under the Drive data folder, one JSON object per line.
ENDE_PATH = drive_PATH + '/data-main/ENDE.jsonl'
with jsonlines.open(ENDE_PATH, mode='w') as writer:
    writer.write_all(ENDE)