In [9]:
from google.colab import drive

In [1]:
import pandas as pd

In [2]:
import torch
import torch.nn.functional as F
import torch.optim as optim

In [3]:
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
%%capture
!pip install -U transformers huggingface_hub

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [10]:
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
df = pd.read_csv('/content/drive/MyDrive/datasets/smile-annotations-final.csv', names = ['id', 'text', 'category'])

In [12]:
df['category'].unique()

array(['nocode', 'happy', 'not-relevant', 'angry', 'disgust|angry',
       'disgust', 'happy|surprise', 'sad', 'surprise', 'happy|sad',
       'sad|disgust', 'sad|angry', 'sad|disgust|angry'], dtype=object)

In [13]:
labels = ['happy', 'angry', 'disgust', 'sad', 'surprise']

In [14]:
df = df[df['category'].isin(labels)][['text', 'category']]

In [15]:
id2label = {i:l for i, l in enumerate(labels)}
label2id = {l:i for i, l in id2label.items()}

In [16]:
id2label

{0: 'happy', 1: 'angry', 2: 'disgust', 3: 'sad', 4: 'surprise'}

In [17]:
label2id

{'happy': 0, 'angry': 1, 'disgust': 2, 'sad': 3, 'surprise': 4}

In [18]:
df['label'] = df['category'].apply(lambda x: label2id[x])

In [19]:
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
happy,1137
angry,57
surprise,35
sad,32
disgust,6


In [20]:
x_train, x_val, y_train, y_val = train_test_split(
    df['text'],
    df['label'],
    test_size = 0.2,
    stratify = df['label'])

In [21]:
y_train.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,909
1,45
4,28
3,26
2,5


In [22]:
y_val.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,228
1,12
4,7
3,6
2,1


In [23]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [24]:
train_encoding = tokenizer.batch_encode_plus(x_train.values,
                                             padding = True,
                                             truncation = True,
                                             return_tensors = 'pt')
val_encoding = tokenizer.batch_encode_plus(x_val.values,
                                           padding = True,
                                           truncation = True,
                                           return_tensors = 'pt')

In [25]:
val_encoding

{'input_ids': tensor([[  101,  2633,  2081,  ...,     0,     0,     0],
        [  101,  1030,  7920,  ...,     0,     0,     0],
        [  101,  5667,  4086,  ...,     0,     0,     0],
        ...,
        [  101,  1030, 12245,  ...,     0,     0,     0],
        [  101,  2307,  2154,  ...,     0,     0,     0],
        [  101,  1030,  1040,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [26]:
Dtr = TensorDataset(train_encoding['input_ids'], train_encoding['attention_mask'], torch.tensor(y_train.values))
Dvl = TensorDataset(val_encoding['input_ids'], val_encoding['attention_mask'], torch.tensor(y_val.values))

In [27]:
DLtr = DataLoader(Dtr, batch_size = 32, shuffle = True)
DLvl = DataLoader(Dvl, batch_size = 32, shuffle = False)

In [28]:
DLtr

<torch.utils.data.dataloader.DataLoader at 0x7b7ceaa90d40>

In [29]:
model = BertForSequenceClassification.from_pretrained('google-bert/bert-base-uncased',
                                                      num_labels = len(labels))

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [31]:
model.parameters

In [32]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [33]:
# labels = F.one_hote(torch.tensor[df['label'].values], len(labels))

In [34]:
optimizer = optim.AdamW(model.parameters())

In [36]:
num_epochs = 5

for epoch in range(num_epochs):

    model.train()
    total_train_loss = 0.0

    for batch in DLtr:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(DLtr)
#val
    model.eval()
    total_val_loss = 0.0

    with torch.no_grad():
        for batch in DLvl:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(DLvl)

    print(
        f"Epoch {epoch+1}/{num_epochs} | "
        f"Train loss: {avg_train_loss:.4f} | "
        f"Val loss: {avg_val_loss:.4f}"
    )

Epoch 1/5 | Train loss: 0.4754 | Val loss: 0.4551
Epoch 2/5 | Train loss: 0.4722 | Val loss: 0.4600
Epoch 3/5 | Train loss: 0.4730 | Val loss: 0.4645
Epoch 4/5 | Train loss: 0.4685 | Val loss: 0.4672
Epoch 5/5 | Train loss: 0.4737 | Val loss: 0.4634


In [37]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
repo_name = "azizdevlab/bert-emotions"
model.push_to_hub(repo_name, commit_message="Fine-tuned BERT for emotions")
tokenizer.push_to_hub(repo_name, commit_message="Tokenizer for emotions")


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...7btubd_/model.safetensors:   0%|          | 14.2kB /  438MB            

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/azizdevlab/bert-emotions/commit/1e6429bfbddff414884712bd6090b504ccc94a9d', commit_message='Tokenizer for emotions', commit_description='', oid='1e6429bfbddff414884712bd6090b504ccc94a9d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/azizdevlab/bert-emotions', endpoint='https://huggingface.co', repo_type='model', repo_id='azizdevlab/bert-emotions'), pr_revision=None, pr_num=None)