### Environment Preparation

In [1]:
# code to set the python version to 3.8
# !sudo update-alternatives --config python3
# !python -V
# !sudo apt-get install python3-pip
# !python -m pip install --upgrade pip
# !pip install ipykernel

In [2]:
!pip install torch torchvision transformers
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://

In [3]:
# Mount Google Drive so the dataset, checkpoints and prediction files are
# reachable. The rest of the notebook uses relative "./drive/MyDrive/..."
# paths, so it presumably assumes the working directory is /content
# (NOTE(review): confirm before running outside Colab).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
from torch.utils.data import Dataset
import json
import random
from transformers import AutoTokenizer, AutoModel
import wandb
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch import nn
import os
import torch
import numpy as np
from datetime import datetime
from tqdm import tqdm

### CLS Model

In [5]:
class CLS_Model(nn.Module):
    """Pretrained transformer encoder topped with a small classification head.

    The final hidden state of the first token of every sequence is fed through
    a two-layer MLP that emits one raw score (logit) per class.

    Args:
        model_name: Name or path handed to ``AutoModel.from_pretrained``.
        dropout: Accepted for backward compatibility but currently unused,
            because the dropout layer of the head is disabled.
        num_labels: Number of output classes (defaults to the 4 claim labels).
    """

    def __init__(self, model_name, dropout=0.5, num_labels=4):
        super().__init__()
        self.pretrained_model = AutoModel.from_pretrained(model_name)
        hidden_size = self.pretrained_model.config.hidden_size

        # The head must return *raw* logits: CrossEntropyLoss applies its own
        # log-softmax. A trailing ReLU used to clamp every negative score to 0,
        # which zeroes the gradient for those classes and makes argmax fall
        # back to class 0 on ties. ReLU has no parameters, so removing it keeps
        # the state-dict keys (cls_models.0.* / cls_models.2.*) unchanged and
        # existing checkpoints still load.
        self.cls_models = nn.Sequential(
            # nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, num_labels),
        )

    def forward(self, input_ids, attn_mask):
        """Return class logits for a batch of tokenized inputs.

        Args:
            input_ids: Token ids passed to the encoder as ``input_ids``.
            attn_mask: Mask passed to the encoder as ``attention_mask``.

        Returns:
            The classification head's output for the first token of each
            sequence (one row of ``num_labels`` logits per input sequence).
        """
        e_outputs = self.pretrained_model(input_ids=input_ids, attention_mask=attn_mask)
        e_logits = e_outputs.last_hidden_state
        # Position 0 along the sequence axis is used as the sequence summary.
        cls_tokens = e_logits[:, 0, :]
        logits = self.cls_models(cls_tokens)
        return logits

### CLS Dataset

In [6]:
def Merge(dict1, dict2):
    """Return a new dictionary containing the entries of both inputs.

    Neither argument is modified. When a key appears in both dictionaries the
    value from ``dict2`` takes precedence.
    """
    merged = {}
    merged.update(dict1)
    merged.update(dict2)
    return merged

In [7]:
class CLSDataset(Dataset):
    """Claim/evidence dataset for the claim-label classifier.

    Every item is one claim whose text is concatenated with the text of all
    evidences listed for it, separated by the literal string ``[SEP]``.

    Args:
        mode: Which split to load.
            ``"train_dev"`` merges ``train-claims.json`` and ``dev-claims.json``;
            ``"test"`` reads ``predict/v1/test-claims-retrieved.json`` and yields
            no labels; any other value ``m`` reads ``data/{m}-claims.json``.
        tokenizer: Callable used by ``collate_fn`` to tokenize a list of texts.
        label_dic: Mapping from the ``claim_label`` string to its class index.
        max_length: Maximum token length passed to the tokenizer.
    """

    @staticmethod
    def _load_json(path):
        """Read one JSON file; the handle is closed even if parsing fails."""
        with open(path, "r") as f:
            return json.load(f)

    def __init__(self, mode, tokenizer, label_dic, max_length):
        self.mode = mode
        self.tokenizer = tokenizer
        self.max_length = max_length

        if mode == "train_dev":
            # use both the train and dev datasets to train the model
            train_dataset = self._load_json("./drive/MyDrive/NLP_ass3/data/train-claims.json")
            dev_dataset = self._load_json("./drive/MyDrive/NLP_ass3/data/dev-claims.json")
            self.dataset = Merge(train_dataset, dev_dataset)
        elif mode == "test":
            self.dataset = self._load_json("./drive/MyDrive/NLP_ass3/predict/v1/test-claims-retrieved.json")
        else:
            # open the train/dev file
            self.dataset = self._load_json("./drive/MyDrive/NLP_ass3/data/{}-claims.json".format(mode))
        self.claim_ids = list(self.dataset.keys())

        # evidence id -> evidence text
        self.evidences = self._load_json("./drive/MyDrive/NLP_ass3/data/evidence.json")
        self.evidence_ids = list(self.evidences.keys())

        self.label_dic = label_dic

    def __len__(self):
        """Number of claims in the loaded split."""
        return len(self.claim_ids)

    def __getitem__(self, index):
        """Build the model input for the claim at position ``index``.

        Returns:
            ``[input_text, label, data, claim_id]`` where ``input_text`` is the
            lower-cased claim followed by its lower-cased evidences joined with
            ``[SEP]``, ``label`` is the class index (``None`` in ``"test"``
            mode), ``data`` is the raw claim record and ``claim_id`` its key.
        """
        claim_id = self.claim_ids[index]
        data = self.dataset[claim_id]

        # claim first, then every evidence in the order the record lists them
        texts = [data["claim_text"].lower()]
        texts.extend(self.evidences[e_id].lower() for e_id in data["evidences"])
        input_text = "[SEP]".join(texts)

        # test records carry no gold label
        label = self.label_dic[data["claim_label"]] if self.mode != "test" else None
        return [input_text, label, data, claim_id]

    def collate_fn(self, batch):
        """Tokenize a list of ``__getitem__`` results into one padded batch.

        Returns:
            A dict with ``text_input_ids`` and ``text_attn_mask`` (taken from
            the tokenizer output), the raw records under ``datas``, the claim
            ids under ``claims_ids`` and, outside ``"test"`` mode, a
            ``LongTensor`` of class indices under ``label``.
        """
        inputs = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        datas = [item[2] for item in batch]
        claim_ids = [item[3] for item in batch]

        input_texts_tokens = self.tokenizer(
            inputs,
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors="pt"
        )

        encoding_dict = {
            "text_input_ids": input_texts_tokens["input_ids"],
            "text_attn_mask": input_texts_tokens["attention_mask"],
            "datas": datas,
            "claims_ids": claim_ids
        }

        if self.mode != "test":
            encoding_dict["label"] = torch.LongTensor(labels)

        return encoding_dict

### Auxiliary Functions

In [8]:
def setup_seed(seed):
    """Seed every random number generator this notebook relies on.

    Covers PyTorch (CPU and all CUDA devices), NumPy's global generator and
    Python's ``random`` module, so repeated runs draw the same numbers.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

In [9]:
def set_cuda(batch, device=None):
    """Move the tensor entries of a collated batch to the compute device, in place.

    Only the keys ``text_input_ids``, ``text_attn_mask`` and ``label`` are
    moved; any other entry of ``batch`` is left untouched, and keys that are
    absent (e.g. ``label`` for test batches) are skipped.

    Args:
        batch: Dict produced by the dataset's ``collate_fn``.
        device: Target device. Defaults to CUDA when it is available and to
            the CPU otherwise, matching the device the model is placed on, so
            the function no longer fails on machines without a GPU.

    Returns:
        None. ``batch`` is modified in place.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for key in ("text_input_ids", "text_attn_mask", "label"):
        if key in batch:
            batch[key] = batch[key].to(device)

### Crucial Functions

In [10]:
def evaluate(cls_model, val_dataloader, epoch_num):
    """Compute the classification accuracy of ``cls_model`` on a dataloader.

    The model is switched to eval mode for the pass and back to train mode
    before returning.

    Args:
        cls_model: Model called with ``input_ids`` and ``attn_mask`` keyword
            arguments and returning per-class logits.
        val_dataloader: Iterable of batches holding ``text_input_ids``,
            ``text_attn_mask`` and ``label``.
        epoch_num: Epoch number, used only in the printed report.

    Returns:
        Fraction of correctly predicted examples; ``0.0`` when the dataloader
        yields no examples.
    """
    # set model to evaluate mode
    cls_model.eval()
    all_count = 0
    correct_count = 0

    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for batch in tqdm(val_dataloader):
            set_cuda(batch)
            # Call the module itself (not .forward) so nn.Module hooks run.
            logits = cls_model(input_ids=batch["text_input_ids"], attn_mask=batch["text_attn_mask"])
            predictions = logits.argmax(-1)

            # Compare the whole batch at once instead of one element at a time.
            correct_count += (predictions == batch["label"]).sum().item()
            all_count += predictions.numel()

    # Guard against an empty dataloader instead of dividing by zero.
    accuracy = correct_count / all_count if all_count else 0.0
    print("\nThis is epoch", epoch_num, ", the accuracy =", accuracy)

    # set model to train mode
    cls_model.train()
    return accuracy


In [11]:
def train(model_name, epochs, batch_size, max_length, model_path):
    """Fine-tune the classifier and keep the checkpoint with the best accuracy.

    The average training loss is logged to Weights & Biases every 20 optimizer
    steps; after each epoch the model is evaluated and, when the accuracy
    improves, its state dict is written to
    ``./drive/MyDrive/NLP_ass3/CLS_checkpoints/<MM-DD>/best_state_dict.bin``.

    Args:
        model_name: Pretrained model/tokenizer name.
        epochs: Number of passes over the training data.
        batch_size: Batch size of both dataloaders.
        max_length: Maximum token length used by the tokenizer.
        model_path: Sub-directory of ``CLS_checkpoints`` holding a
            ``best_state_dict.bin`` to resume from; ``""`` starts from the
            pretrained weights.
    """
    # initiate the wandb
    wandb.init(project="Task2 Text Classification", name="CLS_Bert_without_dropout")

    try:
        setup_seed(42)
        # create the folder to save the model trained
        month_date = datetime.now().strftime("%m-%d")
        checkpoints_dir_path = f"./drive/MyDrive/NLP_ass3/CLS_checkpoints/{month_date}"
        os.makedirs(checkpoints_dir_path, exist_ok=True)

        # initialize the pretrained model
        CLS_model = CLS_Model(model_name, dropout=0.5)
        if model_path != "":
            state_path = os.path.join("./drive/MyDrive/NLP_ass3/CLS_checkpoints", model_path, "best_state_dict.bin")
            # Load onto the CPU first so a checkpoint saved on a GPU can be
            # restored on any machine; the model is moved to `device` below.
            CLS_model.load_state_dict(torch.load(state_path, map_location="cpu"))

        # use GPU to train the model when one is available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        CLS_model.to(device)
        CLS_model.train()

        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # transform label to num
        label_dic = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2, "DISPUTED": 3}

        # NOTE(review): "train_dev" already contains the dev claims, so the
        # accuracy measured on "dev" below is computed on data the model is
        # trained on and is not a held-out estimate.
        train_Dataset = CLSDataset("train_dev", tokenizer, label_dic, max_length)
        val_Dataset = CLSDataset("dev", tokenizer, label_dic, max_length)

        train_DataLoader = DataLoader(train_Dataset, batch_size=batch_size, shuffle=True, num_workers=4,
                                      collate_fn=train_Dataset.collate_fn)
        val_DataLoader = DataLoader(val_Dataset, batch_size=batch_size, shuffle=False, num_workers=4,
                                    collate_fn=val_Dataset.collate_fn)

        Loss_Function = CrossEntropyLoss()
        optimizer = Adam(CLS_model.parameters(), lr=2e-5)

        # counters for logging and model selection
        wandb_freq = 20  # log the mean loss every `wandb_freq` optimizer steps
        update_count = 0
        avg_loss = 0
        max_accuracy = 0

        for epoch in range(epochs):
            epoch_num = epoch + 1
            print("########################################Training########################################")
            for batch in tqdm(train_DataLoader):
                optimizer.zero_grad()

                # put the data in batch onto the compute device
                set_cuda(batch)

                # forward pass, loss and back propagation
                # (call the module itself, not .forward, so nn.Module hooks run)
                logits = CLS_model(input_ids=batch["text_input_ids"], attn_mask=batch["text_attn_mask"])
                loss = Loss_Function(logits, batch["label"])
                loss.backward()
                avg_loss += loss.item()

                # apply the gradient update
                optimizer.step()
                update_count += 1

                if update_count % wandb_freq == 0:
                    wandb.log({"loss": avg_loss / wandb_freq}, step=update_count)
                    avg_loss = 0

            print("########################################Evaluate########################################")
            # evaluate the model every epoch and save the one with the best accuracy
            accuracy = evaluate(CLS_model, val_DataLoader, epoch_num)
            wandb.log({"accuracy": accuracy}, step=update_count)

            if accuracy > max_accuracy:
                max_accuracy = accuracy
                torch.save(CLS_model.state_dict(), os.path.join(checkpoints_dir_path, "best_state_dict.bin"))
                print("\nThis is the", epoch_num, "epoch", "the max acc is", max_accuracy)
    except BaseException:
        # Close the wandb run as failed so an interrupted or crashed training
        # does not leave a dangling run behind, then re-raise the error.
        wandb.finish(exit_code=1)
        raise

    # finish the wandb
    wandb.finish()

In [12]:
def predict(model_name, batch_size, max_length, model_path):
    """Label the test claims with a trained checkpoint and save the result.

    Loads ``best_state_dict.bin`` from
    ``./drive/MyDrive/NLP_ass3/CLS_checkpoints/<model_path>``, predicts a label
    for every claim of the ``"test"`` dataset and writes the claim records,
    with ``claim_label`` filled in, to
    ``./drive/MyDrive/NLP_ass3/data/test-claims-predictions.json``.

    Args:
        model_name: Pretrained model/tokenizer name the checkpoint was built on.
        batch_size: Batch size of the test dataloader.
        max_length: Maximum token length used by the tokenizer.
        model_path: Sub-directory of ``CLS_checkpoints`` containing the
            checkpoint; must be non-empty.

    Raises:
        AssertionError: If ``model_path`` is empty.
    """
    # Fail fast, before the pretrained model is downloaded and built.
    assert model_path, "predict() needs the checkpoint directory in model_path"

    label_dic = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2, "DISPUTED": 3}
    label_list = ["SUPPORTS", "REFUTES", "NOT_ENOUGH_INFO", "DISPUTED"]

    # initialize the pretrained model and restore the trained weights
    CLS_model = CLS_Model(model_name, dropout=0.5)
    state_path = os.path.join("./drive/MyDrive/NLP_ass3/CLS_checkpoints", model_path, "best_state_dict.bin")
    # Load onto the CPU first so a checkpoint saved on a GPU can be restored
    # on any machine; the model is moved to `device` right after.
    CLS_model.load_state_dict(torch.load(state_path, map_location="cpu"))

    # use the GPU when available and set the model mode to evaluate
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    CLS_model.to(device)
    CLS_model.eval()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    test_Dataset = CLSDataset("test", tokenizer, label_dic, max_length)
    test_DataLoader = DataLoader(test_Dataset, batch_size=batch_size, shuffle=False, num_workers=4, collate_fn=test_Dataset.collate_fn)

    # generate the output: claim id -> claim record with the predicted label
    output = {}
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for batch in tqdm(test_DataLoader):
            set_cuda(batch)
            # Call the module itself (not .forward) so nn.Module hooks run.
            logits = CLS_model(input_ids=batch["text_input_ids"], attn_mask=batch["text_attn_mask"])
            predictions = logits.argmax(-1).tolist()

            for claim_id, data, prediction in zip(batch["claims_ids"], batch["datas"], predictions):
                data["claim_label"] = label_list[prediction]
                output[claim_id] = data

    # The context manager closes (and flushes) the file even if dumping fails.
    with open("./drive/MyDrive/NLP_ass3/data/test-claims-predictions.json", 'w') as fout:
        json.dump(output, fout)


In [13]:
def main():
    """Entry point of the notebook: run training with the settings below.

    An empty ``model_path`` is passed, so ``train`` does not load a saved
    checkpoint. The prediction step is left disabled.
    """
    settings = {
        "model_name": "bert-base-uncased",  # alternative: "roberta-base"
        "epochs": 10,
        "batch_size": 8,
        "max_length": 512,
        "model_path": "",
    }

    train(
        settings["model_name"],
        settings["epochs"],
        settings["batch_size"],
        settings["max_length"],
        settings["model_path"],
    )

    # predict(model_name, batch_size, max_length, model_path)

In [17]:
# Run the entry point defined in main().
main()

[34m[1mwandb[0m: Currently logged in as: [33m377188503[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666987836667128, max=1.0)…

Problem at: <ipython-input-11-ef20f1e3aeb5> 3 train


CommError: ignored