In [None]:
!pip install transformers

In [1]:
import os
import random

import pandas as pd
import numpy as np

from transformers import AutoModelForSequenceClassification
from transformers import AutoConfig
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import get_linear_schedule_with_warmup

import IPython

import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
model_name = "bert-base-uncased"
MAX_INPUT_LENGTH = 192
bert_tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Data Loader

In [4]:
# LOAD DATA

class Dataset:
    def __init__(self, file_path:str) -> None:
        self.emo2id = {
            "anger": 0,
            "disgust": 1,
            "fear": 2,
            "guilt": 3,
            "joy": 4,
            "sadness": 5,
            "shame": 6
        }
        self.df = pd.read_csv(file_path,
                 encoding="utf-8",
                 delimiter=",",
                 names=["Emotion","Text"])
        full_dataset = self.formalize_rawdata()


        # Calculate the number of samples to include in each set.
        train_size = int(0.8 * len(full_dataset))
        val_size = int(0.1 * len(full_dataset)) 
        test_size = len(full_dataset) - train_size - val_size

        self.train_set, self.validation_set, self.test_set = \
            random_split(full_dataset, [train_size, val_size, test_size])
    
    def formalize_rawdata(self):
        # list [ [input id, attention mask, label]]
        # TODO: preprocess the inputs
        # process the outputs

        input_ids = []
        attention_masks = []
        labels = []
        for _, row in self.df.iterrows():
            label, sentence = row['Emotion'], row['Text']
            # process the input
            # print("debug-------------", sentence, type(sentence))
            if not isinstance(sentence, str):
                continue
            encode_dict = bert_tokenizer.encode_plus(
                        sentence,            
                        add_special_tokens = True,
                        max_length = MAX_INPUT_LENGTH,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt')
            input_id, att_mask = encode_dict['input_ids'], encode_dict['attention_mask']
            # process the output
            label_id = self.emo2id[label]

            input_ids.append(input_id)
            attention_masks.append(att_mask)
            labels.append(label_id)
        
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        labels = torch.tensor(labels, dtype=torch.long)
        print(input_ids.shape, attention_masks.shape, labels.shape)
        # print(input_ids[3], attention_masks[3], labels[3])
        # print(self.df.iloc[3])

        return TensorDataset(input_ids, attention_masks, labels)


#emo_dataset = EMODataset("/content/drive/MyDrive/2021/Pytorch/data.csv")
drive.mount('/content/drive', force_remount=True)
emo_dataset = EMODataset("./drive/My Drive/Teams_Lab/isear/data.csv")


Mounted at /content/drive


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([7632, 192]) torch.Size([7632, 192]) torch.Size([7632])


In [5]:
print(len(emo_dataset.test_set.indices))
print(len(emo_dataset.train_set.indices))

764
6105


In [6]:
test_loader = DataLoader(
    emo_dataset.test_set,
    sampler = RandomSampler(emo_dataset.test_set), 
    batch_size = 16
)

In [7]:
print(next(iter(test_loader)))

[tensor([[ 101,  100, 2158,  ...,    0,    0,    0],
        [ 101,  100, 2026,  ...,    0,    0,    0],
        [ 101,  100, 2026,  ...,    0,    0,    0],
        ...,
        [ 101,  100, 8682,  ...,    0,    0,    0],
        [ 101,  100, 3663,  ...,    0,    0,    0],
        [ 101,  100,  100,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([2, 3, 4, 3, 6, 3, 3, 0, 2, 3, 3, 3, 4, 3, 6, 4])]


In [8]:
for batch in test_loader:
    print(batch)
    break

[tensor([[ 101,  100, 2000,  ...,    0,    0,    0],
        [ 101,  100, 2018,  ...,    0,    0,    0],
        [ 101,  100, 2204,  ...,    0,    0,    0],
        ...,
        [ 101,  100, 2514,  ...,    0,    0,    0],
        [ 101,  100, 2383,  ...,    0,    0,    0],
        [ 101,  100, 2001,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([3, 0, 5, 1, 3, 5, 2, 3, 4, 4, 0, 0, 3, 1, 0, 1])]


# Model

In [9]:
# Objects


class EMOModel:
    def __init__(self, num_labels:int=7) -> None:
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.num_labels = num_labels
        self.config.output_attentions = True
        print("config", self.config)

        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            config=self.config
        )


emo_model = EMOModel(7)


config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_attentions": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
# Empty cache of GPU
torch.cuda.empty_cache()

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [11]:
def set_random_seed(seed_val:int=42):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

class Trainer:
    def __init__(self, train_loader:DataLoader, valid_loader:DataLoader, model:EMOModel) -> None:
        self.train_loader = train_loader
        self.valid_loader = valid_loader
        self.model = model 
        self.epochs = 1
        self.seed_val = 41
        self.print_interval = 1
        self.device = "cpu"

        self.optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, 
                            num_warmup_steps = 0, # Default value in run_glue.py
                            num_training_steps = len(self.train_loader)*self.epochs)
        

    def flat_accuracy(self, preds, labels):
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

    
    def train(self):
        set_random_seed(self.seed_val)
        self.model.train() # can be unset using model.eval()
        # model.to_device(device)
        if self.device == "gpu":
            self.model.cuda()
        for epoch_i in range(0, self.epochs):
            self.train_one_epoch()

    def train_one_epoch(self):
        total_train_loss = 0
        for step, batch in enumerate(self.train_loader):
            if step and step % self.print_interval == 0:
                print("finish 10 batches")
                print(f"avg loss: {total_train_loss/step}")
            
            b_input_ids = batch[0].to(self.device)
            b_input_mask = batch[1].to(self.device)
            b_labels = batch[2].to(self.device)   

            self.model.zero_grad()

            
            loss, logits, attentions = self.model(input_ids=b_input_ids, 
                                            attention_mask=b_input_mask, 
                                            labels=b_labels,
                                            return_dict = False)
            print(attentions[-1].size())
            print(len(attentions))
            total_train_loss += loss.item() # item on cpu, loss on gpu

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            # update models
            self.optimizer.step()
            self.scheduler.step()
            

            if step > 3:
                break

        

trainer = Trainer(test_loader, test_loader, emo_model.model)
trainer.train()



torch.Size([16, 12, 192, 192])
12
finish 10 batches
avg loss: 2.0366389751434326
torch.Size([16, 12, 192, 192])
12
finish 10 batches
avg loss: 2.018377423286438
torch.Size([16, 12, 192, 192])
12
finish 10 batches
avg loss: 1.9998279809951782
torch.Size([16, 12, 192, 192])
12
finish 10 batches
avg loss: 1.9994991719722748
torch.Size([16, 12, 192, 192])
12
