### Environment Preparation

In [None]:
# code to set the python version to 3.8
# !sudo update-alternatives --config python3
# !python -V
# !sudo apt-get install python3-pip
# !python -m pip install --upgrade pip
# !pip install ipykernel

In [None]:
!pip install torch torchvision transformers
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive. The cells below read the data, cached
# embeddings and checkpoints through the relative path ./drive/MyDrive/NLP_ass3,
# which presumably relies on the working directory being /content - TODO confirm.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from torch.utils.data import Dataset
import json
import random
from tqdm import tqdm
import torch.nn.functional as F
import torch
import numpy as np
from datetime import datetime
from transformers import AutoTokenizer, AutoModel
import wandb
from torch.utils.data import DataLoader
import torch.optim as optim
import os
import pickle

### Evidence Retrieval Dataset

In [None]:
# function to merge two dictionaries
def Merge(dict1, dict2):
    """Return a new dictionary containing the entries of both arguments.

    Neither input is modified. When a key is present in both, the value
    from ``dict2`` is the one kept.
    """
    combined = dict(dict1)
    combined.update(dict2)
    return combined

In [None]:
# Process the training dataset
class TrainDataset(Dataset):
  """Claims with their gold evidence ids, batched for retrieval training.

  ``__getitem__`` yields ``[lower-cased claim text, list of gold evidence ids]``.
  ``collate_fn`` turns a list of such items into tokenized claims and exactly
  ``evidence_samples`` tokenized evidence passages (gold passages first, then
  randomly drawn ones), together with the number of gold passages per claim.
  """

  # Directory that holds the claim and evidence JSON files.
  DATA_DIR = "./drive/MyDrive/NLP_ass3/data"

  def __init__(self, mode, tokenizer, evidence_samples, max_length):
    """Load the claims for ``mode`` and the full evidence collection.

    Args:
      mode: "train_dev" merges train-claims.json and dev-claims.json; any
        other value reads "<mode>-claims.json".
      tokenizer: callable invoked as ``tokenizer(list_of_str, max_length=...,
        padding=True, truncation=True, return_tensors="pt")`` whose result is
        indexed with "input_ids" and "attention_mask".
      evidence_samples: number of evidence passages in every collated batch.
      max_length: value forwarded to the tokenizer as ``max_length``.
    """
    self.mode = mode
    self.tokenizer = tokenizer
    self.evidence_samples = evidence_samples
    self.max_length = max_length

    # use both the train and dev datasets to train the model
    if mode == "train_dev":
      train_dataset = self._load_json("train-claims.json")
      dev_dataset = self._load_json("dev-claims.json")
      self.dataset = Merge(train_dataset, dev_dataset)
    else:
      self.dataset = self._load_json("{}-claims.json".format(mode))
    self.claim_ids = list(self.dataset.keys())

    self.evidences = self._load_json("evidence.json")
    self.evidence_ids = list(self.evidences.keys())

  def _load_json(self, file_name):
    """Parse one JSON file from DATA_DIR; the handle is closed even on error."""
    with open("{}/{}".format(self.DATA_DIR, file_name), "r", encoding="utf-8") as f:
      return json.load(f)

  def __len__(self):
    return len(self.claim_ids)

  def __getitem__(self, index):
    """Return ``[lower-cased claim text, list of its gold evidence ids]``."""
    data = self.dataset[self.claim_ids[index]]
    return [data["claim_text"].lower(), list(data["evidences"])]

  def collate_fn(self, batch):
    """Build one training batch from ``__getitem__`` items.

    Returns a dict with "query_input_ids", "query_attention_mask",
    "evidence_input_ids", "evidence_attention_mask" (tokenizer outputs) and
    "answer_lens": for each claim, how many consecutive evidence rows are its
    gold passages (claim 0 owns the first ``answer_lens[0]`` rows, and so on).

    Raises:
      ValueError: if the evidence collection has too few unused passages to
        pad the batch up to ``evidence_samples``.
    """
    queries = []
    evidences = []
    answer_lens = []
    for query, evidence in batch:
      queries.append(query)
      evidences.extend(evidence)
      answer_lens.append(len(evidence))

    # Too many gold passages: keep the first evidence_samples of them and
    # shrink answer_lens accordingly so every span still refers to rows that
    # exist (a claim whose gold passages were all cut ends up with 0).
    if len(evidences) > self.evidence_samples:
      evidences = evidences[:self.evidence_samples]
      remaining = self.evidence_samples
      clipped_lens = []
      for answer_len in answer_lens:
        kept = min(answer_len, remaining)
        clipped_lens.append(kept)
        remaining -= kept
      answer_lens = clipped_lens

    evidences_text = [self.evidences[evidence_id].lower() for evidence_id in evidences]

    # Too few: pad with randomly chosen passages that are not in the batch yet.
    missing = self.evidence_samples - len(evidences)
    if missing > 0:
      used_ids = set(evidences)
      if missing > len(self.evidence_ids) - len(used_ids):
        raise ValueError(
            "evidence_samples={} exceeds the number of distinct evidence passages available".format(
                self.evidence_samples))
      while missing > 0:
        evidence_id = random.choice(self.evidence_ids)
        if evidence_id in used_ids:
          continue
        used_ids.add(evidence_id)
        evidences.append(evidence_id)
        evidences_text.append(self.evidences[evidence_id].lower())
        missing -= 1

    query_text_token = self.tokenizer(
        queries,
        max_length=self.max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    evidences_text_token = self.tokenizer(
        evidences_text,
        max_length=self.max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    encoding_dict = {"query_input_ids" : query_text_token["input_ids"],
            "evidence_input_ids" : evidences_text_token["input_ids"],
            "query_attention_mask" : query_text_token["attention_mask"],
            "evidence_attention_mask" : evidences_text_token["attention_mask"],
            "answer_lens" : answer_lens}
    return encoding_dict


In [None]:
# Process the validation (dev) or unlabelled test dataset
class ValidateDataset(Dataset):
  """Claims used for evaluation ("dev"-style files) or prediction ("test").

  ``__getitem__`` yields ``[lower-cased claim text, raw claim record, claim id]``.
  ``collate_fn`` tokenizes the claims and carries the raw records and ids
  along; for every mode other than "test" it also returns the gold evidence
  ids of each claim.
  """

  def __init__(self, mode, tokenizer, max_length):
    """Load the claim file for ``mode``.

    Args:
      mode: "test" reads test-claims-unlabelled.json; any other value reads
        "<mode>-claims.json".
      tokenizer: callable invoked as ``tokenizer(list_of_str, max_length=...,
        padding=True, truncation=True, return_tensors="pt")`` whose result is
        indexed with "input_ids" and "attention_mask".
      max_length: value forwarded to the tokenizer as ``max_length``.
    """
    self.tokenizer = tokenizer
    self.mode = mode
    self.max_length = max_length

    if mode != "test":
      claims_path = "./drive/MyDrive/NLP_ass3/data/{}-claims.json".format(mode)
    else:
      claims_path = "./drive/MyDrive/NLP_ass3/data/test-claims-unlabelled.json"

    # The context manager closes the handle even when parsing fails.
    with open(claims_path, "r", encoding="utf-8") as f:
      self.dataset = json.load(f)

    # read the claim ids to a list
    self.claim_ids = list(self.dataset.keys())

  def __len__(self):
    return len(self.claim_ids)

  def __getitem__(self, index):
    """Return ``[lower-cased claim text, claim record, claim id]``."""
    claim_id = self.claim_ids[index]
    data = self.dataset[claim_id]
    return [data["claim_text"].lower(), data, claim_id]

  def collate_fn(self, batch):
    """Tokenize the claims of ``batch`` and keep their records and ids.

    Returns a dict with "query_input_ids", "query_attention_mask", "datas"
    (the raw claim records), "claim_ids" and, unless mode is "test",
    "evidences" (one list of gold evidence ids per claim).
    """
    has_labels = self.mode != "test"

    queries = []
    datas = []
    claim_ids = []
    evidences = []
    for query, data, claim_id in batch:
      queries.append(query)
      datas.append(data)
      claim_ids.append(claim_id)
      if has_labels:
        evidences.append(data["evidences"])

    query_text_token = self.tokenizer(
      queries,
      max_length=self.max_length,
      padding=True,
      truncation=True,
      return_tensors="pt"
    )

    encoding_dict = {"query_input_ids" : query_text_token["input_ids"],
              "query_attention_mask" : query_text_token["attention_mask"],
              "datas" : datas,
              "claim_ids" : claim_ids}
    # labelled files additionally expose the gold evidence ids
    if has_labels:
      encoding_dict["evidences"] = evidences
    return encoding_dict


In [None]:
# Process the evidence dataset
class EvidenceDataset(Dataset):
  """The full evidence collection, served as ``[evidence id, evidence text]``.

  ``collate_fn`` lower-cases and tokenizes the passages of a batch and keeps
  their ids in the same order.
  """

  def __init__(self, tokenizer, max_length):
    """Load evidence.json.

    Args:
      tokenizer: callable invoked as ``tokenizer(list_of_str, max_length=...,
        padding=True, truncation=True, return_tensors="pt")`` whose result is
        indexed with "input_ids" and "attention_mask".
      max_length: value forwarded to the tokenizer as ``max_length``.
    """
    self.tokenizer = tokenizer
    self.max_length = max_length

    # The context manager closes the handle even when parsing fails.
    with open("./drive/MyDrive/NLP_ass3/data/evidence.json", "r", encoding="utf-8") as f:
      self.evidences = json.load(f)

    self.evidences_ids = list(self.evidences.keys())

  def __len__(self):
    return len(self.evidences_ids)

  def __getitem__(self, index):
    """Return ``[evidence id, evidence text]`` (text is not lower-cased here)."""
    evidences_id = self.evidences_ids[index]
    return [evidences_id, self.evidences[evidences_id]]

  def collate_fn(self, batch):
    """Tokenize the passages of ``batch``.

    Returns a dict with "evidence_input_ids", "evidence_attention_mask"
    (tokenizer outputs for the lower-cased texts) and "evidences_ids".
    """
    evidences_ids = [evidences_id for evidences_id, _ in batch]
    evidences = [evidence.lower() for _, evidence in batch]

    evidences_text_token = self.tokenizer(
        evidences,
        max_length=self.max_length,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    encoding_dict = {"evidence_input_ids" : evidences_text_token["input_ids"],
               "evidence_attention_mask" : evidences_text_token["attention_mask"],
               "evidences_ids" : evidences_ids}

    return encoding_dict

### Auxiliary Functions

In [None]:
# function to set the random seed
def setup_seed(seed):
    """Seed torch (CPU and all CUDA devices), NumPy and ``random`` with ``seed``."""
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [None]:
def set_cuda(batch):
  """Move the tensor entries of ``batch`` to the compute device, in place.

  Only the keys "query_input_ids", "evidence_input_ids",
  "query_attention_mask" and "evidence_attention_mask" are touched; every
  other entry is left as it is. The target is the GPU when CUDA is
  available and the CPU otherwise, the same rule train() and predict() use
  for the model, so a CPU-only host no longer fails here. Returns None.
  """
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  tensor_keys = ("query_input_ids", "evidence_input_ids",
                 "query_attention_mask", "evidence_attention_mask")
  for key in tensor_keys:
    if key in batch:
      batch[key] = batch[key].to(device)

### Generate and save the evidence embeddings and ids

In [None]:
# the function to get the evidence embeddings
def Generate_E_Embedding():
  """Embed every evidence passage with pretrained bert-base-uncased and cache the result.

  Writes two files under ./drive/MyDrive/NLP_ass3/embedding_ids:
    * e_embedding.pth - the L2-normalised first-token embeddings, stored
      transposed so that one column corresponds to one passage;
    * e_ids.pkl       - the evidence ids, in the same order as the columns.

  The cache reflects the pretrained weights only; it does not follow any
  later fine-tuning of the encoder.
  """
  # define the model_name
  model_name = "bert-base-uncased"

  # get the tokenizer from the specific model
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  # initialize the pretrained model
  model = AutoModel.from_pretrained(model_name)

  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)
  model.eval()

  evidence_dataset = EvidenceDataset(tokenizer, max_length=128)
  evidence_DataLoader = DataLoader(evidence_dataset, batch_size=128, shuffle=False, num_workers=8, collate_fn=evidence_dataset.collate_fn)

  e_embeddings = []
  e_ids = []
  # Pure inference: no_grad keeps autograd from recording a graph per batch.
  with torch.no_grad():
    for batch in tqdm(evidence_DataLoader):
      set_cuda(batch)
      e_outputs = model(input_ids=batch["evidence_input_ids"], attention_mask=batch["evidence_attention_mask"])
      # first-token vector of the last hidden layer, normalised, kept on the CPU
      first_token = e_outputs.last_hidden_state[:, 0, :]
      e_embeddings.append(F.normalize(first_token).cpu())
      e_ids.extend(batch["evidences_ids"])

  e_embeddings = torch.cat(e_embeddings, dim=0).t()

  torch.save(e_embeddings, "./drive/MyDrive/NLP_ass3/embedding_ids/e_embedding.pth")

  with open("./drive/MyDrive/NLP_ass3/embedding_ids/e_ids.pkl", 'wb') as file:
    pickle.dump(e_ids, file)

  # Report sizes only: the per-batch tensor is gone by now and the id list
  # is far too long to print usefully.
  print(e_embeddings.shape)
  print(len(e_ids))

In [None]:
# Generate_E_Embedding()

In [None]:
# the function to get the evidence embeddings
def embed_evidence(evidence_DataLoader, model):
  """Embed every passage served by ``evidence_DataLoader`` with ``model``.

  Args:
    evidence_DataLoader: iterable of batches holding "evidence_input_ids",
      "evidence_attention_mask" and "evidences_ids".
    model: encoder whose output exposes ``last_hidden_state``. It is switched
      to eval mode and left in eval mode.

  Returns:
    (e_embeddings, e_ids): the L2-normalised first-token embeddings on the
    CPU, transposed so that column ``j`` belongs to ``e_ids[j]``.
  """
  # set the model to evaluate mode
  model.eval()
  e_embeddings = []
  e_ids = []
  # Pure inference: no_grad keeps autograd from recording a graph per batch.
  with torch.no_grad():
    for batch in tqdm(evidence_DataLoader):
      set_cuda(batch)
      e_outputs = model(input_ids=batch["evidence_input_ids"], attention_mask=batch["evidence_attention_mask"])
      # first-token vector of the last hidden layer, normalised, kept on the CPU
      first_token = e_outputs.last_hidden_state[:, 0, :]
      e_embeddings.append(F.normalize(first_token).cpu())
      e_ids.extend(batch["evidences_ids"])

  e_embeddings = torch.cat(e_embeddings, dim=0).t()
  return e_embeddings, e_ids

### Crucial Functions

In [None]:
def evaluate(val_DataLoader, evidence_embeddings, evidence_ids, model, top_evidence):
  """Retrieve ``top_evidence`` passages per claim and report the mean F-score.

  Args:
    val_DataLoader: iterable of batches holding "query_input_ids",
      "query_attention_mask" and "evidences" (gold evidence ids per claim).
    evidence_embeddings: CPU matrix multiplied on the right of the query
      embeddings, i.e. one column per passage.
    evidence_ids: evidence id for each column of ``evidence_embeddings``.
    model: encoder whose output exposes ``last_hidden_state``.
    top_evidence: number of passages retrieved for each claim.

  Returns:
    The mean per-claim F-score (also printed). A claim with no gold passage
    among its retrieved ones scores 0.

  The model is put in eval mode for the pass and switched to train mode
  before returning.
  """
  # set the model to evaluate mode
  model.eval()

  f_scores = []
  # Pure inference: no_grad keeps autograd from recording a graph per batch.
  with torch.no_grad():
    for batch in tqdm(val_DataLoader):
      set_cuda(batch)
      q_outputs = model(input_ids=batch["query_input_ids"], attention_mask=batch["query_attention_mask"])
      q_embedding = q_outputs.last_hidden_state[:, 0, :]

      # transfer the q_embedding to cpu
      q_embedding_cpu = F.normalize(q_embedding).cpu()

      # score every passage for every claim and keep the best ones
      similarity_scores = torch.mm(q_embedding_cpu, evidence_embeddings)
      batch_e_ids = torch.topk(similarity_scores, k=top_evidence, dim=1).indices.tolist()

      for gold_ids, top_k_ids in zip(batch["evidences"], batch_e_ids):
        top_e_ids = [evidence_ids[column] for column in top_k_ids]
        retrieved = set(top_e_ids)
        correct_evidence = [e_id for e_id in gold_ids if e_id in retrieved]

        if len(correct_evidence) > 0:
          recall = len(correct_evidence) / len(gold_ids)
          precision = len(correct_evidence) / len(top_e_ids)
          f_score = (2 * precision * recall) / (precision + recall)
        else:
          f_score = 0
        f_scores.append(f_score)

  f_score_final = np.mean(f_scores)
  print("\nEvidence Retrieval F-score = %.3f\n" % f_score_final)

  # set the model back to train mode
  model.train()

  return f_score_final

In [None]:
def train(model_name, epoch, batch_size, max_length, evidence_samples, top_evidence, model_path, refresh_evidence=True):
  """Fine-tune the encoder for evidence retrieval and keep the best checkpoint.

  The model is evaluated on the dev claims once before training and after
  every epoch; whenever the F-score beats the best seen so far the weights
  are saved to ./drive/MyDrive/NLP_ass3/checkpoints/<MM-DD>/best_state_dict.bin.
  Loss and F-score are logged to wandb.

  Args:
    model_name: name passed to AutoModel / AutoTokenizer.from_pretrained.
    epoch: number of passes over the training claims.
    batch_size: claims per training / validation batch.
    max_length: tokenizer max_length for claims and evidence.
    evidence_samples: evidence passages per training batch.
    top_evidence: passages retrieved per claim during evaluation.
    model_path: sub-directory of the checkpoints folder to resume from; ""
      starts from the pretrained weights.
    refresh_evidence: when True (default) the evidence collection is
      re-embedded with the current weights before every evaluation, so
      queries and passages always come from the same encoder. When False
      the cache under embedding_ids/ is loaded once and reused; that cache is
      written by Generate_E_Embedding from pretrained bert-base-uncased, so
      it stops matching the model as soon as the weights are updated. Use
      False only for quick smoke tests.
  """
  # initiate the wandb
  wandb.init(project="Task1 Evidence Retrieval", name="BERT")

  # set the random seed of the model
  setup_seed(42)

  # create the folder to save the model trained
  month_date = datetime.now().strftime("%m-%d")
  checkpoints_dir_path = f"./drive/MyDrive/NLP_ass3/checkpoints/{month_date}"
  os.makedirs(checkpoints_dir_path, exist_ok=True)

  # use the GPU when there is one
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  # initialize the pretrained model, optionally resuming from a checkpoint
  model = AutoModel.from_pretrained(model_name)
  if model_path != "":
    checkpoint_file = os.path.join("./drive/MyDrive/NLP_ass3/checkpoints", model_path, "best_state_dict.bin")
    # map_location lets a checkpoint saved on a GPU be restored on any device
    model.load_state_dict(torch.load(checkpoint_file, map_location=device))
  model.to(device)
  # set model to train mode
  model.train()

  # get the tokenizer from the specific model
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  # initialize the datasets and read to the DataLoader for feeding into the model
  train_dataset = TrainDataset("train", tokenizer, evidence_samples, max_length)
  val_dataset = ValidateDataset("dev", tokenizer, max_length)
  evidence_dataset = EvidenceDataset(tokenizer, max_length)

  train_DataLoader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8, collate_fn=train_dataset.collate_fn)
  val_DataLoader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=8, collate_fn=val_dataset.collate_fn)
  evidence_DataLoader = DataLoader(evidence_dataset, batch_size=128, shuffle=False, num_workers=8, collate_fn=evidence_dataset.collate_fn)

  # TODO: here to change the optimizer
  # TODO: adjust the lr
  optimizer = optim.Adam(model.parameters(), lr=2e-5)

  # counters for logging
  update_count = 0
  avg_loss = 0
  wandb_freq = 20

  # evidence embeddings used for scoring
  if refresh_evidence:
    evidence_embeddings, evidence_ids = embed_evidence(evidence_DataLoader, model)
  else:
    evidence_embeddings = torch.load("./drive/MyDrive/NLP_ass3/embedding_ids/e_embedding.pth")
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load a cache this notebook produced itself.
    with open(r"./drive/MyDrive/NLP_ass3/embedding_ids/e_ids.pkl", "rb") as file:
      evidence_ids = pickle.load(file)

  # baseline score before any update; a checkpoint is only saved if beaten
  f_score = evaluate(val_DataLoader, evidence_embeddings, evidence_ids, model, top_evidence)
  wandb.log({"f_score": f_score}, step=0)
  max_f_score = f_score

  for epoch_num in range(1, epoch + 1):
    print("##################################################Training##################################################")
    for batch in tqdm(train_DataLoader):
      # start training
      optimizer.zero_grad()

      # put the data in batch onto the compute device
      set_cuda(batch)

      # encode claims and evidence with the same model
      q_outputs = model(input_ids=batch["query_input_ids"], attention_mask=batch["query_attention_mask"])
      e_outputs = model(input_ids=batch["evidence_input_ids"], attention_mask=batch["evidence_attention_mask"])

      # normalised first-token embeddings, so the dot product is a cosine
      q_embeddings = F.normalize(q_outputs.last_hidden_state[:, 0, :])
      e_embeddings = F.normalize(e_outputs.last_hidden_state[:, 0, :])

      loss = _retrieval_loss(q_embeddings, e_embeddings, batch["answer_lens"])
      if loss is None:
        # no claim in this batch has a gold passage among the candidates
        continue

      # backward the loss, update the parameters in the model
      loss.backward()
      avg_loss = avg_loss + loss.item()
      optimizer.step()

      update_count += 1
      if update_count % wandb_freq == 0:
        wandb.log({"loss": avg_loss / wandb_freq}, step=update_count)
        avg_loss = 0

    print("##################################################Evaluate##################################################")
    # evaluate the model every epoch and save the one with the best f_score
    if refresh_evidence:
      # the weights changed, so the passage embeddings have to be rebuilt
      evidence_embeddings, evidence_ids = embed_evidence(evidence_DataLoader, model)
    f_score = evaluate(val_DataLoader, evidence_embeddings, evidence_ids, model, top_evidence)
    wandb.log({"f_score": f_score}, step=update_count)

    if f_score > max_f_score:
      max_f_score = f_score
      torch.save(model.state_dict(), os.path.join(checkpoints_dir_path, "best_state_dict.bin"))
      print("\nThis is the", epoch_num, "epoch", "the max f_score is", max_f_score)
  # finish the wandb
  wandb.finish()


def _retrieval_loss(q_embeddings, e_embeddings, answer_lens):
  """Mean negative log-softmax score of each claim's gold passages.

  Row ``i`` of ``q_embeddings`` is scored against every row of
  ``e_embeddings``; the gold passages of claim ``i`` are the consecutive
  rows that follow those of claim ``i - 1``, ``answer_lens[i]`` of them.
  Claims whose span is empty (or lies entirely past the last candidate) are
  left out, because the mean of an empty slice would be NaN. Returns None
  when no claim contributes.
  """
  # cosine similarity between the queries and evidences
  cos_similarities = torch.mm(q_embeddings, e_embeddings.t())
  # log_softmax is numerically safer than log(softmax(...))
  neg_log_probs = -F.log_softmax(cos_similarities, dim=1)

  candidate_count = neg_log_probs.size(1)
  per_claim_losses = []
  start_index = 0
  for row, answer_len in enumerate(answer_lens):
    end_index = min(start_index + answer_len, candidate_count)
    if end_index > start_index:
      per_claim_losses.append(torch.mean(neg_log_probs[row, start_index:end_index]))
    start_index = end_index

  if not per_claim_losses:
    return None
  return torch.stack(per_claim_losses).mean()

In [None]:
def predict(model_name, batch_size, max_length, top_evidence, model_path, refresh_evidence=True):
  """Retrieve evidence for the unlabelled test claims with a fine-tuned model.

  For every test claim the ``top_evidence`` best-scoring evidence ids are
  stored under the claim's "evidences" key and the result is written to
  ./drive/MyDrive/NLP_ass3/data/test-claims-retrieved.json.

  Args:
    model_name: name passed to AutoModel / AutoTokenizer.from_pretrained.
    batch_size: claims per batch.
    max_length: tokenizer max_length for claims and evidence.
    top_evidence: number of passages retrieved per claim.
    model_path: sub-directory of the checkpoints folder holding
      best_state_dict.bin; must be non-empty.
    refresh_evidence: when True (default) the evidence collection is embedded
      with the loaded checkpoint, so claims and passages come from the same
      weights. When False the cache under embedding_ids/ is used instead;
      that cache is written by Generate_E_Embedding from pretrained
      bert-base-uncased, so it only fits a checkpoint it was regenerated for.

  Raises:
    AssertionError: if ``model_path`` is empty.
  """
  # get the tokenizer from the specific model
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  # initialize the pretrained model
  model = AutoModel.from_pretrained(model_name)

  # load the best finetuned parameters
  assert model_path
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  checkpoint_file = os.path.join("./drive/MyDrive/NLP_ass3/checkpoints", model_path, "best_state_dict.bin")
  # map_location lets a checkpoint saved on a GPU be restored on any device
  model.load_state_dict(torch.load(checkpoint_file, map_location=device))

  model.to(device)
  # eval mode switches dropout off, which inference needs to be deterministic
  model.eval()

  # load the test and evidence datasets
  test_dataset = ValidateDataset("test", tokenizer, max_length)
  evidence_dataset = EvidenceDataset(tokenizer, max_length)

  test_DataLoader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=8, collate_fn=test_dataset.collate_fn)
  evidence_DataLoader = DataLoader(evidence_dataset, batch_size=128, shuffle=False, num_workers=8, collate_fn=evidence_dataset.collate_fn)

  if refresh_evidence:
    evidence_embeddings, evidence_ids = embed_evidence(evidence_DataLoader, model)
  else:
    evidence_embeddings = torch.load("./drive/MyDrive/NLP_ass3/embedding_ids/e_embedding.pth")
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load a cache this notebook produced itself.
    with open(r"./drive/MyDrive/NLP_ass3/embedding_ids/e_ids.pkl", "rb") as file:
      evidence_ids = pickle.load(file)

  output = {}
  # Pure inference: no_grad keeps autograd from recording a graph per batch.
  with torch.no_grad():
    for batch in tqdm(test_DataLoader):
      set_cuda(batch)
      q_outputs = model(input_ids=batch["query_input_ids"], attention_mask=batch["query_attention_mask"])
      q_embedding = q_outputs.last_hidden_state[:, 0, :]

      # transfer the q_embedding to cpu
      q_embedding_cpu = F.normalize(q_embedding).cpu()

      # score every passage for every claim and keep the best ones
      similarity_scores = torch.mm(q_embedding_cpu, evidence_embeddings)
      batch_e_ids = torch.topk(similarity_scores, k=top_evidence, dim=1).indices.tolist()

      for index, data in enumerate(batch["datas"]):
        data["evidences"] = [evidence_ids[column] for column in batch_e_ids[index]]
        output[batch["claim_ids"][index]] = data

  with open("./drive/MyDrive/NLP_ass3/data/test-claims-retrieved.json", 'w') as fout:
    json.dump(output, fout)

In [None]:
# Settings
# number of passes over the training claims (train() loops over range(epoch))
epoch = 10
# claims per training / validation batch; the evidence loader uses a fixed 128
batch_size = 8
# tokenizer max_length for claims and evidence (longer inputs are truncated)
max_length = 128
# evidence passages per training batch: the gold passages of the batch,
# cut or padded with randomly drawn passages to exactly this many
evidence_samples = 64
# sub-directory of ./drive/MyDrive/NLP_ass3/checkpoints holding
# best_state_dict.bin to start from; "" starts from the pretrained weights
model_path = ""
# number of passages retrieved per claim
# TODO, adjust the top_evidence number
top_evidence = 3
# name of the pretrained model handed to AutoModel / AutoTokenizer.from_pretrained
# TODO, use different pretrained model
model_name = "bert-base-uncased"

train(model_name, epoch, batch_size, max_length, evidence_samples, top_evidence, model_path)

# predict() asserts that model_path is non-empty, so set model_path to a
# checkpoint directory before enabling the call below.
# predict(model_name, batch_size, max_length, top_evidence, model_path)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 20/20 [00:07<00:00,  2.74it/s]



Evidence Retrieval F-score = 0.022

##################################################Training##################################################


100%|██████████| 154/154 [00:38<00:00,  4.00it/s]


##################################################Evaluate##################################################


100%|██████████| 20/20 [00:07<00:00,  2.71it/s]



Evidence Retrieval F-score = 0.000

##################################################Training##################################################


100%|██████████| 154/154 [00:38<00:00,  4.01it/s]


##################################################Evaluate##################################################


100%|██████████| 20/20 [00:06<00:00,  3.06it/s]



Evidence Retrieval F-score = 0.000

##################################################Training##################################################


  4%|▍         | 6/154 [00:03<01:19,  1.85it/s]


KeyboardInterrupt: ignored