In [None]:
import os
import numpy as np
import copy

In [None]:
!pip install transformers
from transformers import pipeline, set_seed
from transformers import GPT2Tokenizer, GPT2Model

import spacy
from spacy import displacy

In [None]:
# Make the project folder the working directory so that the relative paths
# used by the np.save(...) checkpoints later in the notebook land in it.
# NOTE(review): the path looks like a Colab Google Drive mount; no mount call
# is visible in this part of the notebook, so confirm Drive is mounted before
# this cell runs (os.chdir raises FileNotFoundError otherwise).
os.chdir('/content/drive/MyDrive/ML_Diss')

In [None]:
# The notebook's import cell only brings in GPT2Tokenizer and GPT2Model, so
# TFGPT2LMHeadModel would raise NameError on a fresh kernel. Import it here so
# this cell works on its own. The TensorFlow variant is used because the
# generation cells encode prompts with return_tensors='tf'.
from transformers import TFGPT2LMHeadModel

# Pre-trained GPT-2 tokenizer and language-modelling model used by every
# generation cell below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no dedicated padding token; reuse EOS as the pad token for generation.
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

In [None]:
# Masking training and test sets:
#
# For every training entry, cut the claim off right before its last named
# entity, so that only the prefix is left as a generation prompt. Claims in
# which spaCy finds no entity are replaced by the sentinel string 'Empty'.

# Work on a deep copy so the original training data stays untouched.
masked_train_data = copy.deepcopy(full_train_data_new)
NER = spacy.load("en_core_web_sm")

# enumerate() supplies the progress index directly; the previous
# list.index(entry) lookup was O(n) per entry and returned the wrong position
# whenever two entries compared equal.
for entry_idx, entry in enumerate(masked_train_data):
    entities = NER(entry['claim']).ents
    if not entities:
        entry['claim'] = 'Empty'
        continue
    # Keep the entity in its own variable. The original code assigned it to
    # masked_train_data, which destroyed the dataset being iterated and left
    # masked_entity undefined.
    masked_entity = entities[-1]
    # Use the span's character offset instead of str.index(text): index()
    # finds the FIRST occurrence of the text, which truncates too early when
    # the same string also appears earlier in the claim.
    masked_entity_idx = masked_entity.start_char
    entry['claim'] = entry['claim'][:masked_entity_idx]
    print(f'Entry {entry_idx} done!')

In [None]:
# For every dev entry, cut the claim off right before its last named entity,
# so that only the prefix is left as a generation prompt. Claims in which
# spaCy finds no entity are replaced by the sentinel string 'Empty'.

# Work on a deep copy so the original dev data stays untouched.
masked_dev_data = copy.deepcopy(full_dev_data_new)
NER = spacy.load("en_core_web_sm")

# enumerate() supplies the progress index directly; the previous
# list.index(entry) lookup was O(n) per entry and returned the wrong position
# whenever two entries compared equal.
for entry_idx, entry in enumerate(masked_dev_data):
    entities = NER(entry['claim']).ents
    if not entities:
        entry['claim'] = 'Empty'
        continue
    masked_entity = entities[-1]
    # Use the span's character offset instead of str.index(text): index()
    # finds the FIRST occurrence of the text, which truncates too early when
    # the same string also appears earlier in the claim.
    masked_entity_idx = masked_entity.start_char
    entry['claim'] = entry['claim'][:masked_entity_idx]
    print(f'Entry {entry_idx} done!')

In [None]:
# Begin with GPT-2-mask:
#
# For each masked training claim, generate beam-search continuations with
# GPT-2, score every candidate against the prompt with sentence-level BLEU and
# store the highest-scoring candidate as the entry's 'evidence'.
from nltk.translate.bleu_score import sentence_bleu

# Deep copy so the masked data itself is not modified by adding 'evidence'.
full_GPT2_train_data = copy.deepcopy(masked_train_data)

set_seed(42)

# Number of beams and of returned candidate sequences per prompt.
num_candidates = 5

for count, entry in enumerate(full_GPT2_train_data):
    prompt = entry['claim']
    input_ids = tokenizer.encode(prompt, return_tensors='tf')
    output = model.generate(
        input_ids,
        max_length=50,
        num_beams=num_candidates, num_return_sequences=num_candidates,
        early_stopping=True)
    # Decode every returned sequence. The original loop was mis-indented (a
    # syntax error) and appended the list to itself instead of the decoded text.
    candidate_sequences = [
        tokenizer.decode(output[i], skip_special_tokens=True)
        for i in range(num_candidates)
    ]
    # BLEU reference is the tokenised prompt; keep the best-scoring candidate.
    reference = [prompt.split()]
    bleu_scores = [sentence_bleu(reference, candidate.split()) for candidate in candidate_sequences]
    ev_statement = candidate_sequences[bleu_scores.index(max(bleu_scores))]
    entry['evidence'] = ev_statement
    print(f'Done with {count}th entry')
    # save to Google Drive folder after every 100 entries:
    # (the original saved the dev data under the training file name here)
    if count % 100 == 0:
        np.save('full_GPT2_train_data.npy', full_GPT2_train_data)
        print('Saved training set!')

# Final save: the periodic checkpoint above misses the entries processed after
# the last multiple of 100.
np.save('full_GPT2_train_data.npy', full_GPT2_train_data)
print('Saved training set!')


In [None]:
# GPT-2-mask on the dev set: for each masked dev claim, generate beam-search
# continuations with GPT-2, score every candidate against the prompt with
# sentence-level BLEU and store the highest-scoring one as 'evidence'.

# Deep copy so the masked data itself is not modified by adding 'evidence'.
full_GPT2_dev_data = copy.deepcopy(masked_dev_data)

set_seed(42)

# Number of beams and of returned candidate sequences per prompt.
num_candidates = 5

for count, entry in enumerate(full_GPT2_dev_data):
    prompt = entry['claim']
    input_ids = tokenizer.encode(prompt, return_tensors='tf')
    output = model.generate(
        input_ids,
        max_length=50,
        num_beams=num_candidates, num_return_sequences=num_candidates,
        early_stopping=True)
    # Decode every returned sequence. The original loop was mis-indented (a
    # syntax error) and appended the list to itself instead of the decoded text.
    candidate_sequences = [
        tokenizer.decode(output[i], skip_special_tokens=True)
        for i in range(num_candidates)
    ]
    # BLEU reference is the tokenised prompt; keep the best-scoring candidate.
    reference = [prompt.split()]
    bleu_scores = [sentence_bleu(reference, candidate.split()) for candidate in candidate_sequences]
    ev_statement = candidate_sequences[bleu_scores.index(max(bleu_scores))]
    entry['evidence'] = ev_statement
    print(f'Done with {count}th entry')
    # save to Google Drive folder after every 100 entries:
    # Write to a dev-specific file: the original reused
    # 'full_GPT2_train_data.npy', overwriting the training checkpoint.
    if count % 100 == 0:
        np.save('full_GPT2_dev_data.npy', full_GPT2_dev_data)
        print('Saved test set!')

# Final save: the periodic checkpoint above misses the entries processed after
# the last multiple of 100.
np.save('full_GPT2_dev_data.npy', full_GPT2_dev_data)
print('Saved test set!')

In [None]:
# Now, we process GPT-2-no-mask:
#
# Same generation and BLEU selection as the masked variant, but prompted with
# the full, unmasked training claims.

set_seed(42)

# Start from the unmasked training data. The original left this list empty
# (so every save wrote an empty array) and instead looped over, and
# overwrote the 'evidence' of, the masked full_GPT2_train_data.
full_GPT2_train_data_no_mask = copy.deepcopy(full_train_data_new)

# Number of beams and of returned candidate sequences per prompt.
num_candidates = 5

for count, entry in enumerate(full_GPT2_train_data_no_mask):
    # The original never assigned prompt in this loop, so a stale value left
    # over from an earlier cell was reused for every entry.
    prompt = entry['claim']
    input_ids = tokenizer.encode(prompt, return_tensors='tf')
    output = model.generate(
        input_ids,
        max_length=50,
        num_beams=num_candidates, num_return_sequences=num_candidates,
        early_stopping=True)
    # Decode every returned sequence. The original loop was mis-indented (a
    # syntax error) and appended the list to itself instead of the decoded text.
    candidate_sequences = [
        tokenizer.decode(output[i], skip_special_tokens=True)
        for i in range(num_candidates)
    ]
    # BLEU reference is the tokenised prompt; keep the best-scoring candidate.
    reference = [prompt.split()]
    bleu_scores = [sentence_bleu(reference, candidate.split()) for candidate in candidate_sequences]
    ev_statement = candidate_sequences[bleu_scores.index(max(bleu_scores))]
    entry['evidence'] = ev_statement
    print(f'Done with {count}th entry')
    # save to Google Drive folder after every 100 entries:
    if count % 100 == 0:
        np.save('full_GPT2_train_data_no_mask.npy', full_GPT2_train_data_no_mask)
        print('Saved training set!')

# Final save: the periodic checkpoint above misses the entries processed after
# the last multiple of 100.
np.save('full_GPT2_train_data_no_mask.npy', full_GPT2_train_data_no_mask)
print('Saved training set!')

In [None]:
# GPT-2-no-mask on the dev set: same generation and BLEU selection as the
# masked variant, but prompted with the full, unmasked dev claims.

set_seed(42)

# Start from the unmasked dev data. The original left this list empty (so
# every save wrote an empty array) and instead looped over, and overwrote the
# 'evidence' of, the masked full_GPT2_dev_data.
full_GPT2_dev_data_no_mask = copy.deepcopy(full_dev_data_new)

# Number of beams and of returned candidate sequences per prompt.
num_candidates = 5

for count, entry in enumerate(full_GPT2_dev_data_no_mask):
    # The original never assigned prompt in this loop, so a stale value left
    # over from an earlier cell was reused for every entry.
    prompt = entry['claim']
    input_ids = tokenizer.encode(prompt, return_tensors='tf')
    output = model.generate(
        input_ids,
        max_length=50,
        num_beams=num_candidates, num_return_sequences=num_candidates,
        early_stopping=True)
    # Decode every returned sequence. The original loop was mis-indented (a
    # syntax error) and appended the list to itself instead of the decoded text.
    candidate_sequences = [
        tokenizer.decode(output[i], skip_special_tokens=True)
        for i in range(num_candidates)
    ]
    # BLEU reference is the tokenised prompt; keep the best-scoring candidate.
    reference = [prompt.split()]
    bleu_scores = [sentence_bleu(reference, candidate.split()) for candidate in candidate_sequences]
    ev_statement = candidate_sequences[bleu_scores.index(max(bleu_scores))]
    entry['evidence'] = ev_statement
    print(f'Done with {count}th entry')
    # save to Google Drive folder after every 100 entries:
    if count % 100 == 0:
        np.save('full_GPT2_dev_data_no_mask.npy', full_GPT2_dev_data_no_mask)
        print('Saved test set!')

# Final save: the periodic checkpoint above misses the entries processed after
# the last multiple of 100.
np.save('full_GPT2_dev_data_no_mask.npy', full_GPT2_dev_data_no_mask)
print('Saved test set!')