# **The task2 file should be run on the Google Colab Pro with "V100 GPU + High RAM"**

# **Task-2**

### Reuse the code for Dataset from task-1

In [1]:
import numpy as np
import torch.nn as nn  # neutral network "https://pytorch.org/docs/stable/nn.html"; pos_weight "https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html"
import torch.nn.functional as F  # F.relu "https://pytorch.org/docs/stable/generated/torch.nn.functional.relu.html"
import torch.optim as optim  # Adam optimizer "https://pytorch.org/docs/stable/optim.html" & "https://pytorch.org/docs/stable/generated/torch.optim.Adam.html"

import torch, os, random, csv, multiprocessing, time, logging

import nltk
nltk.download('punkt')  # google colab warning "https://www.nltk.org/api/nltk.tokenize.punkt.html", which was helpful for the tokenization process

from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score
'''
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html#sklearn.metrics.precision_score
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
'''

from torch.utils.data import DataLoader  # loading the data in the software engineering style "https://pytorch.org/tutorials/beginner/basics/data_tutorial.html"
from tqdm import tqdm  # visualising the process
from torch.utils.data import Dataset  # try the software engineering style for dataset not pandas flow "https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset"
from nltk import word_tokenize  # nltk tokenization tool "https://www.nltk.org/api/nltk.tokenize.html"
from gensim.models import word2vec, Word2Vec  # word2vec model "https://radimrehurek.com/gensim/models/word2vec.html"
from transformers import BertTokenizer, BertModel  # hugging face:) for BERT tokenizer and model "https://huggingface.co/bert-base-cased"


class BaseDataset(Dataset):
    def __init__(self, csv_file_path, max_length=512, return_text=True, cache_path=None):
        self.max_length = max_length  # the maximum length of samples
        self.return_text = return_text  # whether return ['sentences_text']
        self.cache_path = cache_path  # .pth :preprocessed data or weights (or actually any python object)

        if cache_path is None or not os.path.exists(cache_path):
            # load raw data
            try:
                with open(csv_file_path, 'r', encoding='utf-8') as f:
                    csv_reader = csv.reader(f)
                    lines = [row for row in csv_reader]
            except:
                with open(csv_file_path, 'r', encoding='gbk') as f:
                    csv_reader = csv.reader(f)
                    lines = [row for row in csv_reader]

            data = []
            for line in lines[1:]:
                id_, title, plot_synopsis = line[:3]
                sentences_text = self.tokenize_fn(plot_synopsis)

                label = np.array([int(i) for i in line[3:]])  # labels
                data.append({
                    'ID': id_,
                    'title': title,
                    'sentences_text': sentences_text,
                    'label': label
                })

            self.data = data  # get the input data

    def preprocess_text(self, text):
        return np.zeros((len(text), 384))

    def tokenize_fn(self, text):
        text = text.replace('.', ' . ').lower()  # considering word + point('.') + word without blanks
        tokenized_text = word_tokenize(text)  # nltk tokenizer
        sentences_text = []
         # some special tags should be considered after observing the dataset
        for s in tokenized_text:
            if ',' in s and s != ',':
                s = '#NUMBER#'  # the format like word + comma(',') had been processed through the nltk tokenizer
            if s.isdigit():
                s = '#NUMBER#'
            if ':' in s and s != ':':
                s = '#TIME'
            sentences_text.append(s)
        return sentences_text

    def __len__(self):
        return len(self.data)


# basic class for dataset training
class TrainDataset(BaseDataset):
    def __init__(self, *args):
        super(TrainDataset, self).__init__(*args)
        self.mode = 'train'  # model.train()

    def aug(self, sentence, p=0.5):
        # data argumentation was dropped in task-2 as poor performance
        return sentence

    def __getitem__(self, idx):
        # get data by random idx generated by pytorch itself.
        id_ = self.data[idx]['ID']
        text = self.data[idx]['sentences_text']
        label = torch.tensor(self.data[idx]['label'])
        if 'sentences_emb' in self.data[idx].keys():
            emb = self.data[idx]['sentences_emb']
            emb = self.pad_sequence(emb)
            emb = self.aug(emb)
            if self.return_text:
                return id_, text, emb, label
            else:
                return emb, label
        else:
            return id_, text, label


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Reuse the code for word2vec embeddings from task-1

In [2]:
# word2vec
def build_word2vec_model(train_file_name):
    # load and write sentences to a file
    dataset = TrainDataset(train_file_name)
    with open('sentences.txt', 'w') as f:
        for i in range(len(dataset)):
            f.write(' '.join(dataset[i][1]) + '\n')  # 0:id_, 1:text, 2:label

    # build and save model
    all_sentences = list(word2vec.LineSentence('sentences.txt'))
    model = Word2Vec(all_sentences, vector_size=384, min_count=5, window=5, sg=0, workers=multiprocessing.cpu_count())  # CBOW: context->target
    model.save('word2vec.model')

    # check model
    model = word2vec.Word2Vec.load('word2vec.model')
    word2vec_vocab_key = list(model.wv.key_to_index.keys())  # # unique words
    print(len(word2vec_vocab_key))  # how many unique words in the corpus(vocabulary)


# build word embeddings
t0 = time.time()
build_word2vec_model('./data/Training-dataset.csv')
t1 = time.time()
print('Time for generating Word2Vector Embeddings: %.2f ms'%(1000*(t1-t0)))


39828
Time for generating Word2Vector Embeddings: 78555.28 ms


## Task-2-1 Input Features for Bi-LSTM and BERT

In [3]:
class TestDataset(BaseDataset):
    def __init__(self, *args):
        super(TestDataset, self).__init__(*args)

    def get_all_ids(self):
        return [self.data[idx]['ID'] for idx in range(len(self.data))]  # all the ['ID']

    def __getitem__(self, idx):
        # get data by random idx generated by pytorch itself.
        id_ = self.data[idx]['ID']
        text = self.data[idx]['sentences_text']
        # label = torch.tensor(self.data[idx]['label'])
        if 'sentences_emb' in self.data[idx].keys():
            emb = self.data[idx]['sentences_emb']
            emb = self.pad_sequence(emb)
            if self.return_text:
                return id_, text, emb
            else:
                return emb
        else:
            return id_, text


# utilize word2vector embeddings as input features of Bi-LSTM
class LSTMDataset(BaseDataset):
    def __init__(self, *args):
        super(LSTMDataset, self).__init__(*args)
        self.emb_dim = 384

        # if data had been cached, directly loading the pth cached file
        # otherwise, run the preprocess deriving the 'word2vec.model'
        if self.cache_path is None or not os.path.exists(self.cache_path):
            # load word2vector model
            word2vec_model = word2vec.Word2Vec.load('word2vec.model')
            word2vec_vocab_key = list(word2vec_model.wv.key_to_index.keys())  # unique words
            word2vec_vocab = torch.tensor(np.array([word2vec_model.wv[key] for key in word2vec_vocab_key]))  # embedding index according to these unique words

            self.word2vec_vocab = torch.cat([word2vec_vocab, torch.zeros(1, self.emb_dim)], 0)  # considering the unkown tag
            self.vocab2idx = word2vec_model.wv.key_to_index  # corpus(vocabulary) {word: index}

            for i in range(len(self.data)):
                sentences_text = self.data[i]['sentences_text']
                emb_idx = [self.vocab2idx.get(word, -1) for word in sentences_text]  # get the sepcific index one by one; if cannot get, the unkown tag was used
                emb_idx = torch.tensor(emb_idx)
                self.data[i]['sentences_emb'] = self.word2vec_vocab[emb_idx]  # create the ['sentences_emb']

            if self.cache_path is not None:
                save_dir = '/'.join(self.cache_path.split('/')[:-1])
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                torch.save(self.data, self.cache_path)  # save operation
        else:
            with open(self.cache_path, 'rb') as f:
                self.data = torch.load(self.cache_path)  # pth cached file

    # align the sentence length for batch training
    def pad_sequence(self, sequence):
        output = torch.zeros(self.max_length, self.emb_dim)
        if len(sequence) > self.max_length:
            output = sequence[:self.max_length]
        else:
            output[:len(sequence)] = sequence
        return output


# utilize the BERT embedding index as input features
class BERTDataset(BaseDataset):
    def __init__(self, *args):
        model_name = 'bert-base-cased'
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)

        super(BERTDataset, self).__init__(*args)
        self.emb_dim = 768

        if self.cache_path is None or not os.path.exists(self.cache_path):
            for i in range(len(self.data)):
                sentences_text = self.data[i]['sentences_text']
                self.data[i]['sentences_emb'] = self.get_embedding(sentences_text)  # create the ['sentences_emb']

            if self.cache_path is not None:
                save_dir = '/'.join(self.cache_path.split('/')[:-1])
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                torch.save(self.data, self.cache_path)  # save operation
        else:
            with open(self.cache_path, 'rb') as f:
                self.data = torch.load(self.cache_path)  # pth cached file

    # align the sentence length for batch training
    def pad_sequence(self, sequence):
        output = torch.zeros(self.max_length)
        if len(sequence) > self.max_length:
            output = sequence[:self.max_length]
        else:
            output[:len(sequence)] = sequence
        return output

    @torch.no_grad()
    def get_embedding(self, tokens):
        tokens = ['[CLS]'] + tokens + ['[SEP]']  # BERT requirements
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_ids = torch.tensor([input_ids]).cuda()
        input_ids = input_ids.squeeze()

        return input_ids.data.cpu().long()  # embedding index not weight

    def tokenize_fn(self, text):
        return self.tokenizer.tokenize(text)


# multiple inheritance for these classes
class LSTMTrainDataset(TrainDataset, LSTMDataset):
    def __init__(self, *args):
        super(LSTMTrainDataset, self).__init__(*args)

class LSTMTestDataset(TestDataset, LSTMDataset):
    def __init__(self, *args):
        super(LSTMTestDataset, self).__init__(*args)

class BERTVecTrainDataset(TrainDataset, BERTDataset):
    def __init__(self, *args):
        super(BERTVecTrainDataset, self).__init__(*args)

class BERTVecTestDataset(TestDataset, BERTDataset):
    def __init__(self, *args):
        super(BERTVecTestDataset, self).__init__(*args)


## Task-2-2 Define Models (Bi-LSTM & BERT)

In [4]:
'''
Multi-label Classifier constructed by RNN.
'''
class RNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size  # the size of LSTM's hidden layer
        self.num_layers = num_layers  # the amount of hidden layers

        # Define LSTM Layer
        # batch was the first position and the dropout was ignored due to the under-fitting concern
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True, dropout=0)

        # Output Layer(classification head)
        # these layers were set FROM the direction of the output, not the direction of the time step
        self.bn0 = nn.BatchNorm1d(2*hidden_size)  # bidirectional=True
        self.fc1 = nn.Linear(2*hidden_size, 2*hidden_size)
        self.bn1 = nn.BatchNorm1d(2*hidden_size)
        self.fc2 = nn.Linear(2*hidden_size, output_size)  # 9 classes

    def forward(self, x):
        emb, _ = self.rnn(x)
        emb = emb.mean(dim=1)  # mean() at the direction of time steps considering the meaning of the whole sentence
        out = self.bn0(emb)
        out = self.fc1(out)
        out = self.bn1(out)
        out = F.relu(out)  # activation
        out = self.fc2(out)  # without sigmoid
        return out


'''
Multi-label Classifier constructed by BERT.
'''
class BERTClassifier(nn.Module):
    def __init__(self, output_size, model_name='bert-base-cased', hidden_size=768):
        super(BERTClassifier, self).__init__()
        self.bert_model = BertModel.from_pretrained(model_name)  # pre-train

        # Output Layer(classification head)
        self.fc1 = nn.Linear(768, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)  # 9 classes

    def forward(self, x):
        outputs = self.bert_model(x.long())  # pass the sequence to catch temporal information
        emb = outputs[0]  # hidden states
        emb = emb.mean(dim=1)  # consider the meaning of the whole sentence, not [cls]
        out = self.fc1(emb)
        out = self.bn1(out)
        out = F.relu(out)  # activation
        out = self.fc2(out)  # without sigmoid
        return out


## Task-2-3 Evaluator

In [5]:
# the wrapper of metrics
class Evaluator(object):
    def __init__(self) -> None:
        super(Evaluator, self).__init__()
        self.preds = []
        self.targets = []

    def add_batch(self, preds, targets):
        # add the preds and targes into the model evaluator
        if isinstance(preds, torch.Tensor):
            preds = preds.cpu().numpy()
            targets = targets.cpu().numpy()
        self.preds.append(preds)
        self.targets.append(targets)

    def run(self):
        preds = np.concatenate(self.preds, axis=0)  # flat the each batch data
        targets = np.concatenate(self.targets, axis=0)  # flat the each batch data

        results = []
        for i in range(9):
            y_pred = preds[:, i]
            y_true = targets[:, i]
            acc = accuracy_score(y_true, y_pred)
            bacc = balanced_accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred, average=None)
            rec = recall_score(y_true, y_pred, average=None)
            f1 = f1_score(y_true, y_pred, average=None)
            # although this task was related to the multi-label classifer, one lable could be checked and viewed as the binary classification
            # for exmaple, this movie should be at label 2 or this movie should not be label 2
            results.append({
                'bacc': bacc,
                'acc': acc,
                'prec_cls_0': prec[0],
                'prec_cls_1': prec[1],
                'rec_cls_0': rec[0],
                'rec_cls_1': rec[1],
                'f1_cls_0': f1[0],
                'f1_cls_1': f1[1],
            })

        return results

    def reset(self):
        del self.preds
        del self.targets
        self.preds = []  # new list
        self.targets = []  # new list


## Task-2-4 Training

In [6]:
for EXPERIMENT_NAME in ['bert', 'lstm']:
    # BERT was listed first considering the V100 GPU RAM or mac mps

    # parameters
    HIDDEN_SIZE = 256  # the hidden layer
    NUM_LAYERS = 3  # the number of LSTM layers
    NUM_CLASSES = 9  # the amount of classes for the movie
    LR = 1e-5  # learning rate
    DECAY = 1e-4  # weight decay
    EPOCHS = 5  # trainning epoch
    BATCH_SIZE = 4  # batch size
    DEVICE = 'cuda:0'  # 'cuda:0' or 'cpu', Google Colab pro and M1pro mps both didnt have enough GPU RAM when batch_size=16 or 8, but OK for 4 on the V100
    MAX_LENGTH = 512

    TRAIN_DATA_PATH = 'data/Training-dataset.csv'  # fixed
    VAL_DATA_PATH = 'data/Task-2-validation-dataset.csv'  # fixed and provided from the blackboard to optimize the classifier
    SAVE_DIR = 'model_weights/' + EXPERIMENT_NAME  # the best weight in the pth format
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    if 'lstm' in EXPERIMENT_NAME:
        TrainDatasetCls = LSTMTrainDataset
        TestDatasetCls = LSTMTrainDataset
        TRAIN_DATA_CACHE_PAT = TRAIN_DATA_PATH.replace('.csv', '+word2vec.pth')  # the name of cached file
        VAL_DATA_CACHE_PAT = VAL_DATA_PATH.replace('.csv', '+word2vec.pth')  #  the name of cached file

        # init model
        model = RNNClassifier(384, HIDDEN_SIZE, NUM_LAYERS, NUM_CLASSES)

    elif 'bert' in EXPERIMENT_NAME:
        TrainDatasetCls = BERTVecTrainDataset
        TestDatasetCls = BERTVecTrainDataset
        TRAIN_DATA_CACHE_PAT = TRAIN_DATA_PATH.replace('.csv', '+bert.pth')  # the name of cached file
        VAL_DATA_CACHE_PAT = VAL_DATA_PATH.replace('.csv', '+bert.pth')  # the name of cached file

        # init model
        model = BERTClassifier(NUM_CLASSES, 'bert-base-cased', HIDDEN_SIZE)

    else:
        raise NotImplementedError

    # setup logger visualisation and recording
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    os.makedirs('log', exist_ok=True)
    fh = logging.FileHandler('log/%s.log'%EXPERIMENT_NAME, mode='w')
    fh.setLevel(logging.INFO)
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    logger.addHandler(ch)
    logger.addHandler(fh)

    # CELoss for classification using reweight
    # The pos_weight was used to consider the "negtive/positive" ratio of samples in this unbalanced dataset; the positive samples were assigned larger weights
    # "The test data will contain a large number of documents that you will be expected to process."
    # "The test documents will be in the same format and with a similar distribution as the training and development data."
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([5.7, 3.8, 5, 49, 1, 4, 3.2, 32, 1.9], device=DEVICE))

    # load data
    train_set = TrainDatasetCls(TRAIN_DATA_PATH, MAX_LENGTH, False, TRAIN_DATA_CACHE_PAT)  # cache path could be used
    val_set = TestDatasetCls(VAL_DATA_PATH, MAX_LENGTH, False, VAL_DATA_CACHE_PAT)  # cache path could be used
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_set, batch_size=256)

    # optimizer
    model.to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=DECAY)  # use ADAM optimizer, which was less sensitive to the learning rate, helpful to the fine-tuning, and punish model
    evaluator = Evaluator()
    best_val_bacc = 0.0

    # trainning loops with DataLoader
    for epoch in range(EPOCHS):
        model.train()
        for inputs, labels in tqdm(train_loader):
            outputs = model(inputs.to(DEVICE))  # pass the sequence to catch temporal information
            labels = labels.to(DEVICE)
            loss = criterion(outputs, labels.float()).mean()  # mean() loss of multi labels
            optimizer.zero_grad()  # without gradient accumulation
            loss.backward()  # backpropagation
            optimizer.step()  # update

        # start validation
        model.eval()
        with torch.no_grad():
            evaluator.reset()  # make sure the unique test/validation for every time, dropping old val recordings
            for inputs, labels in val_loader:
                outputs = model(inputs.to(DEVICE))  # pass the sequence to catch temporal information
                outputs = torch.sigmoid(outputs)  # socres -> probablity
                evaluator.add_batch((outputs > 0.5).long(), labels)  # add labels
            eval_metrics = evaluator.run()

        # save evaluation results
        logger.info(f"Epoch {epoch+1}/{EPOCHS}:")
        val_bacc = 0
        for subset in range(9):
            logger.info(f"Class {subset+1}:")
            logger.info('  '.join(['%s: %.2f'%(k,v) for k,v in eval_metrics[subset].items()]))
            val_bacc += eval_metrics[subset]['bacc']
        val_bacc /= 9.0  # average was compulsory to examine whole performance of the multi-label classifier

        # save best model
        if val_bacc > best_val_bacc:
            best_val_bacc = val_bacc
            torch.save(model.state_dict(), os.path.join(SAVE_DIR, "best_weights.pth"))
        logger.info(f"Epoch {epoch+1}/{EPOCHS}, Validation Balanced Acc: {val_bacc:.4f}")
        logger.info("\n")
        torch.save(model.state_dict(), os.path.join(SAVE_DIR, "latest_weights.pth"))


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
DEBUG:filelock:Attempting to acquire lock 132716760062112 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22.lock
Attempting to acquire lock 132716760062112 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22.lock
DEBUG:filelock:Lock 132716760062112 acquired on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22.lock
Lock 132716760062112 acquired on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 29
https://huggingface.co:443 

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 132716760062112 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22.lock
Attempting to release lock 132716760062112 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22.lock
DEBUG:filelock:Lock 132716760062112 released on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22.lock
Lock 132716760062112 released on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/e3c6d456fb2616f01a9a6cd01a1be1a36353ed22.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/vocab.txt HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/vocab.txt HTTP/1.1" 200 0
DEBUG:filelock:Attempting to acquire lock 132716760070608 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/2ea941cc79a6f3d7985ca6991ef4f67dad62af04.lock
Attempting to 

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 132716760070608 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/2ea941cc79a6f3d7985ca6991ef4f67dad62af04.lock
Attempting to release lock 132716760070608 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/2ea941cc79a6f3d7985ca6991ef4f67dad62af04.lock
DEBUG:filelock:Lock 132716760070608 released on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/2ea941cc79a6f3d7985ca6991ef4f67dad62af04.lock
Lock 132716760070608 released on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/2ea941cc79a6f3d7985ca6991ef4f67dad62af04.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/added_tokens.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/added_tokens.json HTTP/1.1" 404 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/special_tokens_map.json HTTP/1.1" 404 0
https://huggingface.co:443 "HEAD 

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 132716760062016 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/1ab2a0d23e5a032b6dcd6a3d0976c2af4d2c27f8.lock
Attempting to release lock 132716760062016 on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/1ab2a0d23e5a032b6dcd6a3d0976c2af4d2c27f8.lock
DEBUG:filelock:Lock 132716760062016 released on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/1ab2a0d23e5a032b6dcd6a3d0976c2af4d2c27f8.lock
Lock 132716760062016 released on /root/.cache/huggingface/hub/.locks/models--bert-base-cased/1ab2a0d23e5a032b6dcd6a3d0976c2af4d2c27f8.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve

## Task-2-5 Run and Save Results for Task-2

In [7]:
for EXPERIMENT_NAME in ['bert', 'lstm']:

    # parameters
    HIDDEN_SIZE = 256 # size of LSTM's hidden layer
    NUM_LAYERS = 3 # number of layers
    NUM_CLASSES = 9 # class_num
    BATCH_SIZE = 16 # batch size
    DEVICE = 'cuda:0' # 'cuda:0' or 'cpu'
    MAX_LENGTH = 512

    ######
    TEST_DATA_PATH = 'data/Task-2-test-dataset1.csv'  ###### change this to show the validation results ######
    ######

    if EXPERIMENT_NAME == 'bert':
        method_id = 'c'
    else:
        method_id = 'b'

    SAVE_PREDICTION_PATH = 'data/10879229-Task2-method-%s.csv'%method_id
    MODEL_PATH = 'model_weights/' + EXPERIMENT_NAME + '/best_weights.pth'


    if 'lstm' in EXPERIMENT_NAME:
        TestDatasetCls = LSTMTestDataset
        TEST_DATA_CACHE_PAT = TEST_DATA_PATH.replace('.csv', '+word2vec.pth')

        # init model
        model = RNNClassifier(384, HIDDEN_SIZE, NUM_LAYERS, NUM_CLASSES)

    elif 'bert' in EXPERIMENT_NAME:
        TestDatasetCls = BERTVecTestDataset
        TEST_DATA_CACHE_PAT = TEST_DATA_PATH.replace('.csv', '+bert.pth')

        # init model
        model = BERTClassifier(NUM_CLASSES, 'bert-base-cased', HIDDEN_SIZE)

    else:
        raise NotImplementedError

    # load data
    test_set = TestDatasetCls(TEST_DATA_PATH, MAX_LENGTH, False, TEST_DATA_CACHE_PAT)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE)

    # load pretrained checkpoint
    model.load_state_dict(torch.load(MODEL_PATH))
    model.to(DEVICE)

    # start testing
    model.eval()  # not train!
    preds = []
    t0 = time.time()
    with torch.no_grad():
        for inputs in test_loader:
            outputs = model(inputs.to(DEVICE))
            outputs = torch.sigmoid(outputs)
            preds.append((outputs > 0.5).long().cpu())
    t1 = time.time()
    print('Test time of %s model: %.2f ms'%(EXPERIMENT_NAME, 1000*(t1-t0)))

    preds = torch.cat(preds).numpy()  # ready to write
    ids = test_set.get_all_ids()  # ready to write

    with open(SAVE_PREDICTION_PATH, 'w') as f:
        for i in range(len(preds)):
            f.write(','.join([ids[i]] + [str(i) for i in list(preds[i])]) + '\n')  # output the result file


DEBUG:urllib3.connectionpool:Resetting dropped connection: huggingface.co
Resetting dropped connection: huggingface.co
Resetting dropped connection: huggingface.co
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0
h

Test time of bert model: 10923.54 ms


INFO:gensim.utils:loading Word2Vec object from word2vec.model
loading Word2Vec object from word2vec.model
loading Word2Vec object from word2vec.model
DEBUG:smart_open.smart_open_lib:{'uri': 'word2vec.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
{'uri': 'word2vec.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
{'uri': 'word2vec.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loading wv recursively from word2vec.model.wv.* with mmap=None
loading wv recursively from word2vec.model.wv.* with mmap=None
loading wv recursively from word2vec.model.wv.* with mma

Test time of lstm model: 3001.99 ms
