In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive', force_remount=True)
# Absolute path under the mount point used above, so it does not depend on
# the kernel's current working directory.
drive_PATH = '/content/drive/MyDrive/Colab Notebooks/l101.experiments.1'

In [None]:
!pip install jsonlines

In [None]:
from transformers import AutoTokenizer, RemBertModel
import torch

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
_REMBERT_CHECKPOINT = "google/rembert"
tokenizer = AutoTokenizer.from_pretrained(_REMBERT_CHECKPOINT)
model = RemBertModel.from_pretrained(_REMBERT_CHECKPOINT)

In [None]:
def encode(sent):
    """Return the encoder's per-token hidden states for one input.

    Args:
        sent: Text passed to the module-level ``tokenizer``.

    Returns:
        A numpy array holding the first batch element of the model's
        ``last_hidden_state`` (assumed shape (num_tokens, hidden_size);
        TODO confirm).
    """
    inputs = tokenizer(sent, return_tensors="pt")
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for each encoded sentence.
    with torch.no_grad():
        outputs = model(**inputs)
    # Keep only the first batch element before converting to numpy.
    return outputs.last_hidden_state[0].numpy()

def encode_sentence(sent):
    """Return one vector for ``sent`` by averaging its token encodings.

    Args:
        sent: Text forwarded unchanged to ``encode``.

    Returns:
        The mean over axis 0 of the array returned by ``encode(sent)``.
    """
    token_encodings = encode(sent)
    # Vectorised mean over the first axis instead of a Python-level sum of
    # rows followed by a manual division.
    return token_encodings.mean(axis=0)

def encode_candidates(cands):
    """Apply ``encode_sentence`` to every candidate, preserving order.

    Returns a list with one pooled encoding per element of ``cands``.
    """
    pooled = []
    for candidate in cands:
        pooled.append(encode_sentence(candidate))
    return pooled

In [None]:
import jsonlines
def _read_jsonl(path):
    """Read every record of the JSON Lines file at ``path`` into a list."""
    with jsonlines.open(path) as reader:
        return list(reader.iter())


# BLEURT
bleurt_DEEN_PATH = drive_PATH + '/data-main/bleurt-DEEN.jsonl'
bleurt_DEEN = _read_jsonl(bleurt_DEEN_PATH)

# chrF++
chrf_DEEN_PATH = drive_PATH + '/data-main/chrf-DEEN.jsonl'
chrf_DEEN = _read_jsonl(chrf_DEEN_PATH)

In [None]:
# Both metric files are expected to hold exactly this many records.
num_examples = 1000
assert len(chrf_DEEN) == num_examples, (
    f'expected {num_examples} chrF++ records, found {len(chrf_DEEN)}')
assert len(bleurt_DEEN) == num_examples, (
    f'expected {num_examples} BLEURT records, found {len(bleurt_DEEN)}')

# The two files must agree record-by-record on the shared fields; on a
# mismatch, report which example and which field differ.
for i in range(num_examples):
  for field in ('src', 'ref', 'candidates'):
    assert chrf_DEEN[i][field] == bleurt_DEEN[i][field], (
        f"example {i}: '{field}' differs between chrF++ and BLEURT records")

In [None]:
import tqdm
# Field names copied unchanged from each metric's records, in output order.
_BLEURT_FIELDS = (
    'BLEURT_cand_scores',
    'MBR_BLEURT_cand_scores',
    'MBR_BLEURT_score_matrix',
    'MBR_BLEURT_winner_index',
    'MBR_BLEURT_winner',
)
_CHRF_FIELDS = (
    'CHRF_cand_scores',
    'MBR_CHRF_cand_scores',
    'MBR_CHRF_score_matrix',
    'MBR_CHRF_winner_index',
    'MBR_CHRF_winner',
)

DEEN = []

for i in tqdm.tqdm(range(num_examples)):
  chrf_rec = chrf_DEEN[i]
  # Shared fields are taken from the chrF++ record; the candidates are also
  # stored in encoded form.
  DEEN_dict = {name: chrf_rec[name] for name in ('src', 'ref', 'candidates')}
  DEEN_dict['enc_candidates'] = encode_candidates(chrf_rec['candidates'])
  # BLEURT fields first, then chrF++ fields, to keep the key order stable.
  bleurt_rec = bleurt_DEEN[i]
  for name in _BLEURT_FIELDS:
    DEEN_dict[name] = bleurt_rec[name]
  for name in _CHRF_FIELDS:
    DEEN_dict[name] = chrf_rec[name]
  DEEN.append(DEEN_dict)

In [None]:
import numpy as np
# Replace each example's encoded candidates with plain nested lists of
# Python floats (presumably so the records can be serialised; TODO confirm).
for i in tqdm.tqdm(range(num_examples)):
  example = DEEN[i]
  stacked = np.array(example['enc_candidates'], dtype=float)
  example['enc_candidates'] = stacked.tolist()

In [None]:
# DEEN[999].keys()

In [None]:
# Write the merged records out, one record per writer.write() call.
DEEN_PATH = f'{drive_PATH}/data-main/DEEN.jsonl'
with jsonlines.open(DEEN_PATH, mode='w') as writer:
    for record in DEEN:
        writer.write(record)