<a href="https://colab.research.google.com/github/amrtanair/master_thesis/blob/main/run_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import json
import math
import torch
import pickle
import random
import numpy as np
from tqdm.notebook import tqdm

from scipy.special import softmax
from transformers import BertTokenizer, BertForSequenceClassification, BertForMaskedLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:

'''Configuration and result-file naming for the cell that scores candidate
triples with the bert-large FINETUNED models.
'''
# Base model and data split used throughout this cell.
model_name = "bert-large-uncased"
split = "test"

# Available datasets; the active ones are picked by position just below.
finetuning_datasets = ['CoLA', "MegaAcceptability"]
evaluation_datasets = ['OIE2016', 'NYT', 'PENN', 'WEB']
f_d = finetuning_datasets[1]
e_d = evaluation_datasets[3]

def create_result_files(measures, k):
  """Return one result-file name per measure, in the order given.

  Each name has the form
  ``<split>-<e_d>-<f_d>-<model_name>-<measure>-<k>.txt`` and is built from the
  module-level configuration of this cell.
  """
  prefix = '-'.join([split, e_d, f_d, model_name])
  return [prefix + '-' + measure + '-' + str(k) + '.txt' for measure in measures]


# Measures this cell knows how to score.
_CLASSIFIER_MEASURES = ('Prob', 'MeanP', 'LogProb', 'MeanLP', 'NormLP_sub', 'NormLP_div', 'SLOR')


def _search_results_path(dataset, split_name):
  """Return the Drive path of the search-result JSON for ``dataset``."""
  root = '/content/drive/MyDrive/thesis/'
  if dataset == 'OIE2016':
    # Only OIE2016 has two files; anything other than "test" selects the train file.
    stem = 'test' if split_name == "test" else 'train'
    return root + 'OIE2016/' + stem + '_search_res.json'
  if dataset in ('WEB', 'PENN', 'NYT'):
    return root + dataset + '_search_res.json'
  raise ValueError(f'Unknown evaluation dataset: {dataset!r}')


def _load_search_results(input_file_path):
  """Read a search-result JSON file and return ``(sentences, search_results)``.

  ``sentences`` maps each key to its sentence with the ``$input_txt:$ `` marker
  removed. ``search_results`` maps the same key to a list of
  ``[triple_text, v[1]]`` entries, where ``triple_text`` joins the first span,
  the middle part of the candidate key and the second span with `` [SEP] ``.
  """
  with open(input_file_path, 'r', encoding='utf-8') as file:
    data_dict = json.load(file)

  pattern = r'\$input_txt:\$ '
  sentences = {}
  search_results = {}
  for key, value in data_dict.items():
    sentence = re.sub(pattern, '', value[0][0])
    sentences[key] = sentence
    candidates = []
    for cand_key, cand in value[0][1]["deduplicated:"].items():
      # cand[2] holds (start, end) character offsets into the sentence.
      np_pair = [sentence[start:end] for start, end in cand[2]]
      triple_text = np_pair[0] + ' [SEP] ' + cand_key.split(' [SEP] ')[1] + ' [SEP] ' + np_pair[1]
      candidates.append([triple_text, cand[1]])
    search_results[key] = candidates
  return sentences, search_results


def _unigram_logprob(tokens, unigram_freq, unigram_total, triple):
  """Sum the unigram log-probabilities of ``tokens``.

  Tokens with a zero count are skipped. A token missing from ``unigram_freq``
  is skipped as well, and ``triple`` is printed so the gap can be traced.
  """
  uni_lp = 0.0
  for w in tokens:
    try:
      count = unigram_freq[w]
    except KeyError:
      print(triple)
      continue
    if count > 0 and unigram_total > 0:
      uni_lp += math.log(float(count) / unigram_total)
  return uni_lp


def _write_ranked(handle, sentence, block_id, ranked):
  """Write the sentence line followed by one tab-separated line per triple.

  ``ranked`` is a sequence of ``(triple, value)`` pairs. Each line holds the
  id, the three quoted parts of the triple text and ``-(1 - value)``.
  """
  handle.write(sentence + "\n")
  for triple, value in ranked:
    parts = triple[0].split("[SEP] ")
    try:
      first, middle, last = parts[0], parts[1], parts[2]
    except IndexError:
      # Malformed triple text: report the id and keep going.
      print(block_id)
      continue
    handle.write('\t'.join([
      str(block_id),
      '"' + first + '"',
      '"' + middle + '"',
      '"' + last + '"',
      str(-(1 - value)),
    ]) + '\n')


def _classifier_measure_scores(prob, logprob, uni_lp, sentence_len):
  """Map each measure name to ``(ranking score, value written to file)``."""
  return {
    'Prob': (prob, prob),
    'MeanP': (prob / sentence_len, prob),
    'LogProb': (logprob, prob),
    'MeanLP': (logprob / sentence_len, prob),
    'NormLP_sub': (logprob - uni_lp, prob),
    'NormLP_div': (-(logprob / uni_lp), prob),
    'SLOR': ((logprob - uni_lp) / sentence_len, prob),
  }


def run_model(model, tokenizer, device, measures, top_k=3):
  """Score every candidate triple with the fine-tuned classifier and write the
  ``top_k`` best per sentence to one result file per measure.

  The evaluation dataset, split and unigram table are chosen from the
  module-level ``e_d``, ``split`` and ``model_name``; file names come from
  ``create_result_files``.

  Args:
    model: sequence classifier whose output exposes ``.logits``; the
      probability of class index 1 is used as the score.
    tokenizer: tokenizer matching ``model``.
    device: torch device the input ids are moved to.
    measures: measure names to write; each must be one of
      ``_CLASSIFIER_MEASURES``. One file is written per entry, in this order.
    top_k: number of best-ranked triples kept per sentence and measure.

  Raises:
    ValueError: for an unknown measure, model name or evaluation dataset, or
      when the class-1 probability is exactly 0 (``math.log``).
    ZeroDivisionError: when a candidate text is empty or its unigram
      log-probability is 0.
  """
  from contextlib import ExitStack

  unsupported = [m for m in measures if m not in _CLASSIFIER_MEASURES]
  if unsupported:
    raise ValueError(f'Unsupported measures: {unsupported}')

  unigram_files = {
    "bert-large-uncased": '/content/drive/MyDrive/thesis/bert-large-uncased-bookcorpus-wikipedia.pickle',
    "bert-large-cased": '/content/drive/MyDrive/thesis/bert-large-cased-bookcorpus-wikipedia.pickle',
  }
  if model_name not in unigram_files:
    raise ValueError(f'No unigram table for model: {model_name!r}')
  # NOTE: pickle can execute arbitrary code on load; only use trusted files here.
  with open(unigram_files[model_name], "rb") as fh:
    unigram_freq = pickle.load(fh)
  unigram_total = sum(unigram_freq.values())

  sentences, search_results = _load_search_results(_search_results_path(e_d, split))

  print("Selecting top: ", top_k)
  results_files = create_result_files(measures, top_k)

  with ExitStack() as stack:
    handles = [stack.enter_context(open(path, "w", encoding="utf-8")) for path in results_files]
    # NOTE(review): the id advances once per measure, not once per sentence, so
    # ids within a single file are not consecutive. Kept as in the original
    # output; confirm against the downstream evaluation before changing.
    block_id = 0
    for key, value in tqdm(sentences.items()):
      scored = []
      for triple in search_results[key]:
        input_text = triple[0].replace("[SEP] ", "")
        input_id = tokenizer(input_text, return_tensors="pt")["input_ids"].to(device)
        tokenize_input = tokenizer.tokenize(input_text)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
          output = model(input_id)
        prob = torch.softmax(output.logits, dim=1)[0][1].item()
        logprob = math.log(prob)
        # NOTE(review): length is the character count of the text, not the
        # number of tokens; confirm this normalisation is intended.
        sentence_len = len(input_text)

        uni_lp = _unigram_logprob(tokenize_input, unigram_freq, unigram_total, triple)
        scored.append((triple, _classifier_measure_scores(prob, logprob, uni_lp, sentence_len)))

      for measure, handle in zip(measures, handles):
        ranked = sorted(scored, key=lambda item: item[1][measure][0], reverse=True)[:top_k]
        block_id = block_id + 1
        _write_ranked(handle, value, block_id,
                      [(triple, scores[measure][1]) for triple, scores in ranked])

  return

if __name__ == "__main__":
  # Seed every RNG in use so repeated runs are comparable.
  seed = 42
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

  # Fine-tuned checkpoints on Drive, keyed by (base model, fine-tuning dataset).
  model_dirs = {
    ("bert-large-cased", "CoLA"): '/content/drive/MyDrive/thesis/models/CoLA-BERT-large-cased-model',
    ("bert-large-cased", "MegaAcceptability"): '/content/drive/MyDrive/thesis/models/MegaAcceptability-bert-large-cased',
    ("bert-large-uncased", "CoLA"): '/content/drive/MyDrive/thesis/models/CoLA-BERT-large-uncased-model',
    ("bert-large-uncased", "MegaAcceptability"): '/content/drive/MyDrive/thesis/models/MegaAcceptability-bert-large-uncased',
  }
  # Fail loudly on an unsupported pair instead of leaving model_dir undefined
  # (or, in a notebook, silently reusing the value from an earlier run).
  if (model_name, f_d) not in model_dirs:
    raise ValueError(f'No fine-tuned checkpoint for model {model_name!r} and dataset {f_d!r}')
  model_dir = model_dirs[(model_name, f_d)]

  if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(seed)
    device_name = torch.cuda.get_device_name(0)
    print('GPU:', device_name)
  else:
    print('Using CPU')
    device = torch.device("cpu")
    device_name = 'cpu'

  # Weights come from the fine-tuned checkpoint; the tokenizer from the base model.
  model = BertForSequenceClassification.from_pretrained(model_dir)
  tokenizer = BertTokenizer.from_pretrained(model_name)
  model.to(device)

  print(f'For model: {model_name}, eval dataset: {e_d}, finetuning dataset: {f_d} and split: {split}')

  measures = ['Prob', 'MeanP', 'LogProb', 'MeanLP', 'NormLP_sub', 'NormLP_div', 'SLOR']
  # Safety switch: set to True to actually score the dataset and write the files.
  run = False
  if run:
    run_model(model, tokenizer, device, measures)


GPU: NVIDIA A100-SXM4-40GB


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


For model: bert-large-uncased, eval dataset: WEB, finetuning dataset: MegaAcceptability and split: test


In [None]:

'''Configuration and result-file naming for the cell that scores candidate
triples with the pretrained bert-large models.
'''
# Base model and data split used throughout this cell.
model_name = "bert-large-cased"
split = "test"

# Available evaluation datasets; the active one is picked by position.
evaluation_datasets = ['OIE2016', 'NYT', 'PENN', 'WEB']
e_d = evaluation_datasets[3]

def create_result_files(measures, k):
  """Return one result-file name per measure, in the order given.

  Each name has the form ``<split>-<e_d>-<model_name>-<measure>-<k>.txt`` and
  is built from the module-level configuration of this cell.
  """
  prefix = '-'.join([split, e_d, model_name])
  return [prefix + '-' + measure + '-' + str(k) + '.txt' for measure in measures]


# The following function was copied and adapted from the following GitHub repository:
# Author:  jhlau
# Repository: https://github.com/jhlau/acceptability-prediction-in-context/
# file: https://github.com/jhlau/acceptability-prediction-in-context/blob/master/code/compute_model_score.py
# License: Apache License 2.0
# I have made some modifications to adapt it to my needs.

def model_score(tokenize_input, model, tokenizer, device):
  """Score a tokenised sentence with a masked LM, masking one token at a time.

  For every position in ``tokenize_input`` a copy of ``[CLS] tokens [SEP]`` is
  built with that position replaced by ``[MASK]``. All copies go through
  ``model`` as a single batch, and the log-probability given to the original
  token at its masked position is accumulated.

  Args:
    tokenize_input: list of token strings (without special tokens).
    model: callable taking the id tensor and ``token_type_ids``; its first
      output is indexed as ``[row, position]`` to get vocabulary scores.
    tokenizer: provides ``convert_tokens_to_ids``.
    device: device the input tensors are created on.

  Returns:
    ``(lp, p)``: the summed log-probability and ``exp(lp)``.
  """
  framed_tokens = ["[CLS]"] + tokenize_input + ["[SEP]"]
  # Positions of the real tokens inside the framed sequence ([CLS] is index 0).
  positions = range(1, len(tokenize_input) + 1)

  id_rows = []
  for pos in positions:
    masked_copy = list(framed_tokens)
    masked_copy[pos] = '[MASK]'
    id_rows.append(tokenizer.convert_tokens_to_ids(masked_copy))
  # Single-segment input: every token gets segment id 0.
  segment_rows = [[0] * len(framed_tokens) for _ in positions]

  tokens_tensor = torch.tensor(id_rows, device=device)
  segment_tensor = torch.tensor(segment_rows, device=device)

  with torch.no_grad():
    predictions = model(tokens_tensor, token_type_ids=segment_tensor)[0]

  lp = 0.0
  for row, pos in enumerate(positions):
    # Row `row` is the copy in which position `pos` was masked.
    vocab_probs = softmax(predictions[row, pos].cpu().numpy())
    target_id = tokenizer.convert_tokens_to_ids([framed_tokens[pos]])[0]
    lp += np.log(vocab_probs[target_id])
  return lp, np.exp(lp)

# Measures this cell knows how to score.
_MLM_MEASURES = ('Prob', 'MeanP', 'LogProb', 'MeanLP', 'NormLP_sub', 'NormLP_div', 'SLOR')


def _search_results_path(dataset, split_name):
  """Return the Drive path of the search-result JSON for ``dataset``."""
  root = '/content/drive/MyDrive/thesis/'
  if dataset == 'OIE2016':
    # Only OIE2016 has two files; anything other than "test" selects the train file.
    stem = 'test' if split_name == "test" else 'train'
    return root + 'OIE2016/' + stem + '_search_res.json'
  if dataset in ('WEB', 'PENN', 'NYT'):
    return root + dataset + '_search_res.json'
  raise ValueError(f'Unknown evaluation dataset: {dataset!r}')


def _load_search_results(input_file_path):
  """Read a search-result JSON file and return ``(sentences, search_results)``.

  ``sentences`` maps each key to its sentence with the ``$input_txt:$ `` marker
  removed. ``search_results`` maps the same key to a list of
  ``[triple_text, v[1]]`` entries, where ``triple_text`` joins the first span,
  the middle part of the candidate key and the second span with `` [SEP] ``.
  """
  with open(input_file_path, 'r', encoding='utf-8') as file:
    data_dict = json.load(file)

  pattern = r'\$input_txt:\$ '
  sentences = {}
  search_results = {}
  for key, value in data_dict.items():
    sentence = re.sub(pattern, '', value[0][0])
    sentences[key] = sentence
    candidates = []
    for cand_key, cand in value[0][1]["deduplicated:"].items():
      # cand[2] holds (start, end) character offsets into the sentence.
      np_pair = [sentence[start:end] for start, end in cand[2]]
      triple_text = np_pair[0] + ' [SEP] ' + cand_key.split(' [SEP] ')[1] + ' [SEP] ' + np_pair[1]
      candidates.append([triple_text, cand[1]])
    search_results[key] = candidates
  return sentences, search_results


def _unigram_logprob(tokens, unigram_freq, unigram_total, triple):
  """Sum the unigram log-probabilities of ``tokens``.

  Tokens with a zero count are skipped. A token missing from ``unigram_freq``
  is skipped as well, and ``triple`` is printed so the gap can be traced.
  """
  uni_lp = 0.0
  for w in tokens:
    try:
      count = unigram_freq[w]
    except KeyError:
      print(triple)
      continue
    if count > 0 and unigram_total > 0:
      uni_lp += math.log(float(count) / unigram_total)
  return uni_lp


def _write_ranked(handle, sentence, block_id, ranked):
  """Write the sentence line followed by one tab-separated line per triple.

  ``ranked`` is a sequence of ``(triple, value)`` pairs. Each line holds the
  id, the three quoted parts of the triple text and ``-(1 - value)``.
  """
  handle.write(sentence + "\n")
  for triple, value in ranked:
    parts = triple[0].split("[SEP] ")
    try:
      first, middle, last = parts[0], parts[1], parts[2]
    except IndexError:
      # Malformed triple text: report the id and keep going.
      print(block_id)
      continue
    handle.write('\t'.join([
      str(block_id),
      '"' + first + '"',
      '"' + middle + '"',
      '"' + last + '"',
      str(-(1 - value)),
    ]) + '\n')


def _mlm_measure_scores(prob, logprob, uni_lp, sentence_len):
  """Map each measure name to ``(ranking score, value written to file)``."""
  return {
    'Prob': (prob, prob),
    'MeanP': (prob / sentence_len, prob),
    'LogProb': (logprob, prob),
    'MeanLP': (logprob / sentence_len, prob),
    'NormLP_sub': (logprob - uni_lp, prob),
    'NormLP_div': (-(logprob / uni_lp), prob),
    'SLOR': ((logprob - uni_lp) / sentence_len, prob),
  }


def run_bert_model(model, tokenizer, device, measures, top_k=3):
  """Score every candidate triple with the masked LM and write the ``top_k``
  best per sentence to one result file per measure.

  The evaluation dataset, split and unigram table are chosen from the
  module-level ``e_d``, ``split`` and ``model_name``; file names come from
  ``create_result_files`` and scores from ``model_score``.

  Args:
    model: masked language model passed through to ``model_score``.
    tokenizer: tokenizer matching ``model``.
    device: torch device passed through to ``model_score``.
    measures: measure names to write; each must be one of ``_MLM_MEASURES``.
      One file is written per entry, in this order.
    top_k: number of best-ranked triples kept per sentence and measure.

  Raises:
    ValueError: for an unknown measure, model name or evaluation dataset.
    ZeroDivisionError: when a candidate text is empty or its unigram
      log-probability is 0.
  """
  from contextlib import ExitStack

  unsupported = [m for m in measures if m not in _MLM_MEASURES]
  if unsupported:
    raise ValueError(f'Unsupported measures: {unsupported}')

  unigram_files = {
    "bert-large-uncased": '/content/drive/MyDrive/thesis/bert-large-uncased-bookcorpus-wikipedia.pickle',
    "bert-large-cased": '/content/drive/MyDrive/thesis/bert-large-cased-bookcorpus-wikipedia.pickle',
  }
  if model_name not in unigram_files:
    raise ValueError(f'No unigram table for model: {model_name!r}')
  # NOTE: pickle can execute arbitrary code on load; only use trusted files here.
  with open(unigram_files[model_name], "rb") as fh:
    unigram_freq = pickle.load(fh)
  unigram_total = sum(unigram_freq.values())

  sentences, search_results = _load_search_results(_search_results_path(e_d, split))

  print("Selecting top: ", top_k)
  results_files = create_result_files(measures, top_k)

  with ExitStack() as stack:
    handles = [stack.enter_context(open(path, "w", encoding="utf-8")) for path in results_files]
    # NOTE(review): the id advances once per measure, not once per sentence, so
    # ids within a single file are not consecutive. Kept as in the original
    # output; confirm against the downstream evaluation before changing.
    block_id = 0
    for key, value in tqdm(sentences.items()):
      scored = []
      for triple in search_results[key]:
        input_text = triple[0].replace("[SEP] ", "")
        tokenize_input = tokenizer.tokenize(input_text)

        logprob, prob = model_score(tokenize_input, model, tokenizer, device)
        # NOTE(review): length is the character count of the text, not the
        # number of tokens; confirm this normalisation is intended.
        sentence_len = len(input_text)

        uni_lp = _unigram_logprob(tokenize_input, unigram_freq, unigram_total, triple)
        scored.append((triple, _mlm_measure_scores(prob, logprob, uni_lp, sentence_len)))

      for measure, handle in zip(measures, handles):
        ranked = sorted(scored, key=lambda item: item[1][measure][0], reverse=True)[:top_k]
        block_id = block_id + 1
        _write_ranked(handle, value, block_id,
                      [(triple, scores[measure][1]) for triple, scores in ranked])

  return

if __name__ == "__main__":
  # Seed every RNG in use so repeated runs are comparable.
  seed = 42
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

  # Pick the device; the CUDA generators are seeded only when a GPU is present.
  if not torch.cuda.is_available():
    print('Using CPU')
    device_name = 'cpu'
    device = torch.device(device_name)
  else:
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(seed)
    device_name = torch.cuda.get_device_name(0)
    print(f'GPU: {device_name}')

  # Pretrained masked LM and its tokenizer, both loaded by model name.
  model = BertForMaskedLM.from_pretrained(model_name)
  tokenizer = BertTokenizer.from_pretrained(model_name)
  model = model.to(device)

  print(f'For model: {model_name}, eval dataset: {e_d}, and split: {split}')
  measures = ['Prob', 'MeanP', 'LogProb', 'MeanLP', 'NormLP_sub', 'NormLP_div', 'SLOR']

  run_bert_model(model, tokenizer, device, measures)


GPU: NVIDIA A100-SXM4-40GB


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

For model: bert-large-cased, eval dataset: WEB, and split: test
Selecting top:  3


  0%|          | 0/442 [00:00<?, ?it/s]

In [None]:

'''Configuration and result-file naming for the cell that scores candidate
triples with the gpt base model.
'''
# Base model and data split used throughout this cell.
model_name = "gpt2-medium"
split = "test"

# Available evaluation datasets; the active one is picked by position.
evaluation_datasets = ['OIE2016', 'NYT', 'PENN', 'WEB']
e_d = evaluation_datasets[0]

def create_result_files(measures, k):
  """Return one result-file name per measure, in the order given.

  Each name has the form ``<split>-<e_d>-<model_name>-<measure>-<k>.txt`` and
  is built from the module-level configuration of this cell.
  """
  prefix = '-'.join([split, e_d, model_name])
  return [prefix + '-' + measure + '-' + str(k) + '.txt' for measure in measures]


# The following function was copied and adapted from the following GitHub repository:
# Author:  jhlau
# Repository: https://github.com/jhlau/acceptability-prediction-in-context/
# file: https://github.com/jhlau/acceptability-prediction-in-context/blob/master/code/compute_model_score.py
# License: Apache License 2.0
# I have made some modifications to adapt it to my needs.

def model_score(tokenize_input, model, tokenizer, device):
  """Return the total log-probability the LM assigns to a tokenised sentence.

  The token ids are prefixed with 50256 (the ``<|endoftext|>`` token) so the
  first real token also has a left context. The model is called with
  ``labels`` equal to its input; its first output is taken to be the mean
  per-token loss (see the Hugging Face ``GPT2LMHeadModel`` documentation) and
  is turned back into a summed log-probability by negating it and multiplying
  by the number of tokens.

  Args:
    tokenize_input: list of token strings.
    model: callable taking the id tensor and ``labels``; ``output[0]`` is the loss.
    tokenizer: provides ``convert_tokens_to_ids``.
    device: device the input tensor is created on.

  Returns:
    The summed log-probability as a Python float.
  """
  token_ids = [50256] + tokenizer.convert_tokens_to_ids(tokenize_input)
  tensor_input = torch.tensor([token_ids], device=device)

  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    loss = model(tensor_input, labels=tensor_input)

  return float(loss[0]) * -1.0 * len(tokenize_input)

# Measures this cell knows how to score.
_GPT_MEASURES = ('LogProb', 'MeanLP', 'NormLP_sub', 'NormLP_div', 'SLOR')


def _search_results_path(dataset, split_name):
  """Return the Drive path of the search-result JSON for ``dataset``."""
  root = '/content/drive/MyDrive/thesis/'
  if dataset == 'OIE2016':
    # Only OIE2016 has two files; anything other than "test" selects the train file.
    stem = 'test' if split_name == "test" else 'train'
    return root + 'OIE2016/' + stem + '_search_res.json'
  if dataset in ('WEB', 'PENN', 'NYT'):
    return root + dataset + '_search_res.json'
  raise ValueError(f'Unknown evaluation dataset: {dataset!r}')


def _load_search_results(input_file_path):
  """Read a search-result JSON file and return ``(sentences, search_results)``.

  ``sentences`` maps each key to its sentence with the ``$input_txt:$ `` marker
  removed. ``search_results`` maps the same key to a list of
  ``[triple_text, v[1]]`` entries, where ``triple_text`` joins the first span,
  the middle part of the candidate key and the second span with `` [SEP] ``.
  """
  with open(input_file_path, 'r', encoding='utf-8') as file:
    data_dict = json.load(file)

  pattern = r'\$input_txt:\$ '
  sentences = {}
  search_results = {}
  for key, value in data_dict.items():
    sentence = re.sub(pattern, '', value[0][0])
    sentences[key] = sentence
    candidates = []
    for cand_key, cand in value[0][1]["deduplicated:"].items():
      # cand[2] holds (start, end) character offsets into the sentence.
      np_pair = [sentence[start:end] for start, end in cand[2]]
      triple_text = np_pair[0] + ' [SEP] ' + cand_key.split(' [SEP] ')[1] + ' [SEP] ' + np_pair[1]
      candidates.append([triple_text, cand[1]])
    search_results[key] = candidates
  return sentences, search_results


def _unigram_logprob(tokens, unigram_freq, unigram_total, triple):
  """Sum the unigram log-probabilities of ``tokens``.

  Tokens with a zero count are skipped. A token missing from ``unigram_freq``
  is skipped as well, and ``triple`` is printed so the gap can be traced.
  """
  uni_lp = 0.0
  for w in tokens:
    try:
      count = unigram_freq[w]
    except KeyError:
      print(triple)
      continue
    if count > 0 and unigram_total > 0:
      uni_lp += math.log(float(count) / unigram_total)
  return uni_lp


def _write_ranked(handle, sentence, block_id, ranked):
  """Write the sentence line followed by one tab-separated line per triple.

  ``ranked`` is a sequence of ``(triple, value)`` pairs. Each line holds the
  id, the three quoted parts of the triple text and ``-(1 - value)``.
  """
  handle.write(sentence + "\n")
  for triple, value in ranked:
    parts = triple[0].split("[SEP] ")
    try:
      first, middle, last = parts[0], parts[1], parts[2]
    except IndexError:
      # Malformed triple text: report the id and keep going.
      print(block_id)
      continue
    handle.write('\t'.join([
      str(block_id),
      '"' + first + '"',
      '"' + middle + '"',
      '"' + last + '"',
      str(-(1 - value)),
    ]) + '\n')


def _gpt_measure_scores(logprob, uni_lp, sentence_len):
  """Map each measure name to its score (used both for ranking and output)."""
  return {
    'LogProb': logprob,
    'MeanLP': logprob / sentence_len,
    'NormLP_sub': logprob - uni_lp,
    'NormLP_div': -(logprob / uni_lp),
    'SLOR': (logprob - uni_lp) / sentence_len,
  }


def run_bert_model(model, tokenizer, device, measures, top_k=None):
  """Score every candidate triple with the GPT-2 LM and write the best ones per
  sentence to one result file per measure.

  Despite its name, this cell's version scores with the causal LM via
  ``model_score``. The evaluation dataset and split come from the module-level
  ``e_d`` and ``split``; file names come from ``create_result_files``.

  Args:
    model: language model passed through to ``model_score``.
    tokenizer: tokenizer matching ``model``.
    device: torch device passed through to ``model_score``.
    measures: measure names to write; each must be one of ``_GPT_MEASURES``.
      One file is written per entry, in this order.
    top_k: number of best-ranked triples kept per sentence and measure. When
      omitted, 3 is used for OIE2016 and 1 for every other dataset.

  Raises:
    ValueError: for an unknown measure or evaluation dataset.
    ZeroDivisionError: when a candidate text is empty or its unigram
      log-probability is 0.
  """
  from contextlib import ExitStack

  unsupported = [m for m in measures if m not in _GPT_MEASURES]
  if unsupported:
    raise ValueError(f'Unsupported measures: {unsupported}')

  # NOTE: pickle can execute arbitrary code on load; only use trusted files here.
  with open('/content/drive/MyDrive/thesis/gpt-openwebtext.pickle', "rb") as fh:
    unigram_freq = pickle.load(fh)
  unigram_total = sum(unigram_freq.values())

  sentences, search_results = _load_search_results(_search_results_path(e_d, split))

  if top_k is None:
    top_k = 3 if e_d == "OIE2016" else 1
  print("Selecting top: ", top_k)
  results_files = create_result_files(measures, top_k)

  with ExitStack() as stack:
    handles = [stack.enter_context(open(path, "w", encoding="utf-8")) for path in results_files]
    # NOTE(review): the id advances once per measure, not once per sentence, so
    # ids within a single file are not consecutive. Kept as in the original
    # output; confirm against the downstream evaluation before changing.
    block_id = 0
    for key, value in tqdm(sentences.items()):
      scored = []
      for triple in search_results[key]:
        input_text = triple[0].replace("[SEP] ", "")
        tokenize_input = tokenizer.tokenize(input_text)

        logprob = model_score(tokenize_input, model, tokenizer, device)
        # NOTE(review): length is the character count of the text, not the
        # number of tokens; confirm this normalisation is intended.
        sentence_len = len(input_text)

        uni_lp = _unigram_logprob(tokenize_input, unigram_freq, unigram_total, triple)
        scored.append((triple, _gpt_measure_scores(logprob, uni_lp, sentence_len)))

      for measure, handle in zip(measures, handles):
        ranked = sorted(scored, key=lambda item: item[1][measure], reverse=True)[:top_k]
        block_id = block_id + 1
        # The written value is derived from the measure's own score here.
        _write_ranked(handle, value, block_id,
                      [(triple, scores[measure]) for triple, scores in ranked])

  return

if __name__ == "__main__":
  # Seed every RNG in use so repeated runs are comparable.
  seed = 42
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)

  # Pick the device; the CUDA generators are seeded only when a GPU is present.
  if not torch.cuda.is_available():
    print('Using CPU')
    device_name = 'cpu'
    device = torch.device(device_name)
  else:
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(seed)
    device_name = torch.cuda.get_device_name(0)
    print(f'GPU: {device_name}')

  # Pretrained causal LM and its tokenizer, both loaded by model name.
  model = GPT2LMHeadModel.from_pretrained(model_name)
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  model = model.to(device)

  print(f'For model: {model_name}, eval dataset: {e_d}, and split: {split}')
  measures = ['LogProb', 'MeanLP', 'NormLP_sub', 'NormLP_div', 'SLOR']

  run_bert_model(model, tokenizer, device, measures)


GPU: NVIDIA A100-SXM4-40GB
For model: gpt2-medium, eval dataset: OIE2016, and split: test
Selecting top:  3


  0%|          | 0/635 [00:00<?, ?it/s]