# Mounting Google Drive in Colab

In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem so the notebook can read and
# write files under the mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
%cd drive/MyDrive/LG/

/content/drive/MyDrive/LG


# Installing the libraries we need

In [None]:
!pip install torch
!pip install transformers
!pip install datasets
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.8 MB/s[0m eta [36m0:00:0

# Data pre-processing

In [None]:
def preprocess_data(texts, labels):
    """Flatten per-dialogue utterances and labels into two flat lists.

    Args:
        texts: iterable of dialogues, each an iterable of utterances.
        labels: iterable of per-dialogue label sequences, paired with
            ``texts`` positionally.

    Returns:
        A ``(flatten_texts, flatten_labels)`` tuple of lists holding every
        utterance and every label in dialogue order. Pairing uses ``zip``,
        so trailing entries of the longer input are ignored.
    """
    paired = list(zip(texts, labels))

    flatten_texts = [utterance for dialogue, _ in paired for utterance in dialogue]
    flatten_labels = [emotion for _, emotion_list in paired for emotion in emotion_list]

    return flatten_texts, flatten_labels

In [None]:
from datasets import load_dataset

# Fetch the "daily_dialog" dataset through the `datasets` loader, then, for
# each split, pull the 'dialog' and 'emotion' columns and flatten them with
# preprocess_data into one flat text list and one flat label list.
dataset = load_dataset("daily_dialog")

# ---- Train split ---------------------------------------------------------------
train_split = dataset['train']
texts_train, labels_train = train_split['dialog'], train_split['emotion']
flatten_texts_train, flatten_labels_train = preprocess_data(texts_train, labels_train)

# ---- Validation split ----------------------------------------------------------
validation_split = dataset['validation']
texts_validation, labels_validation = validation_split['dialog'], validation_split['emotion']
flatten_texts_validation, flatten_labels_validation = preprocess_data(texts_validation, labels_validation)

# ---- Test split ----------------------------------------------------------------
test_split = dataset['test']
texts_test, labels_test = test_split['dialog'], test_split['emotion']
flatten_texts_test, flatten_labels_test = preprocess_data(texts_test, labels_test)

Downloading builder script:   0%|          | 0.00/4.85k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.49k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11118 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Sanity check on dataset sizes.
# First the number of rows (one label list per dialogue) in each split ...
for dialogue_labels in (labels_train, labels_validation, labels_test):
    print(len(dialogue_labels))

# ... then the number of individual labels once every dialogue is flattened.
for utterance_labels in (flatten_labels_train, flatten_labels_validation, flatten_labels_test):
    print(len(utterance_labels))

11118
1000
1000
87170
8069
7740


# Creating the model

In [None]:
# from transformers import BertTokenizer
# # Load the BERT tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Instantiate the DistilBERT tokenizer from the uncased base checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Every split is encoded with the same settings: pad/truncate each text to
# exactly 128 tokens and return PyTorch tensors.
tokenize_options = dict(padding='max_length', truncation=True, max_length=128, return_tensors='pt')

encoded_texts_train = tokenizer(flatten_texts_train, **tokenize_options)
encoded_texts_validation = tokenizer(flatten_texts_validation, **tokenize_options)
encoded_texts_test = tokenizer(flatten_texts_test, **tokenize_options)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Mini-batch size shared by all three loaders.
batch_size = 8

# ---- Train: label tensor -> (input_ids, attention_mask, label) dataset -> loader
# Only the training loader shuffles its samples.
labels_tensor_train = torch.tensor(flatten_labels_train)
dataset_train = TensorDataset(
    encoded_texts_train['input_ids'],
    encoded_texts_train['attention_mask'],
    labels_tensor_train,
)
dataloader_train = DataLoader(dataset_train, shuffle=True, batch_size=batch_size)

# ---- Validation: same layout, fixed order
labels_tensor_validation = torch.tensor(flatten_labels_validation)
dataset_validation = TensorDataset(
    encoded_texts_validation['input_ids'],
    encoded_texts_validation['attention_mask'],
    labels_tensor_validation,
)
dataloader_validation = DataLoader(dataset_validation, shuffle=False, batch_size=batch_size)

# ---- Test: same layout, fixed order
labels_tensor_test = torch.tensor(flatten_labels_test)
dataset_test = TensorDataset(
    encoded_texts_test['input_ids'],
    encoded_texts_test['attention_mask'],
    labels_tensor_test,
)
dataloader_test = DataLoader(dataset_test, shuffle=False, batch_size=batch_size)


In [None]:
# Preview the first few flattened training labels. Printing the whole list
# would dump every label of the training split into the notebook output.
print(flatten_labels_train[:20])

# Train

In [None]:
import torch.nn as nn
from transformers import BertForSequenceClassification
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm

# Fine-tune DistilBERT as a 7-class sequence classifier: train for a fixed
# number of epochs, evaluate on the validation loader after each epoch, and
# checkpoint the weights whenever the validation loss improves.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Using device:", device)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=7)
model.to(device)

# Define loss function and optimizer
# NOTE: loss_fn is not used by the loops below - they pass `labels=` to the
# model and read the loss from `outputs.loss`. It is kept so later cells that
# may reference it keep working.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
# Cut the learning rate by 10x once the validation loss has not improved for
# more than `patience` epochs.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, verbose=True)

# Training and validation loop
num_epochs = 10
best_loss = None

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_length = 0
    correct_predictions = 0

    # Wrap the loader under a *separate* name. Rebinding `dataloader_train`
    # here would stack one tqdm wrapper on top of another every epoch and
    # leave the name pointing at a progress bar instead of the DataLoader.
    train_progress = tqdm(dataloader_train, desc="Training Iteration")

    for batch in train_progress:
        input_ids, attention_mask, label = batch

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        batch_len = len(label)
        # outputs.loss is already averaged over the batch, so weight it by the
        # batch size before summing; dividing by the sample count afterwards
        # then yields a true per-sample mean (also correct for a short final
        # batch).
        total_loss += loss.item() * batch_len
        total_length += batch_len

        preds = torch.argmax(outputs.logits, dim=1)
        # Accumulate a plain int so the accuracy computation below does not
        # depend on the counter having been turned into a tensor.
        correct_predictions += (preds == label).sum().item()

        loss.backward()
        optimizer.step()

    # max(..., 1) keeps an empty loader from raising a division error.
    accuracy = correct_predictions / max(total_length, 1)
    avg_loss = total_loss / max(total_length, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_loss}, Training Accuracy: {accuracy}")


    model.eval()
    total_val_loss = 0.0
    total_val_length = 0
    correct_val_predictions = 0

    with torch.no_grad():
        for batch in dataloader_validation:
            input_ids, attention_mask, label = batch

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            label = label.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=label)
            loss = outputs.loss
            batch_len = len(label)
            # Same batch-size weighting as in the training loop.
            total_val_loss += loss.item() * batch_len
            total_val_length += batch_len

            val_preds = torch.argmax(outputs.logits, dim=1)
            correct_val_predictions += (val_preds == label).sum().item()

    val_accuracy = correct_val_predictions / max(total_val_length, 1)
    avg_val_loss = total_val_loss / max(total_val_length, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}, Validation Accuracy: {val_accuracy}")

    scheduler.step(avg_val_loss)

    # Save the model if the validation loss is the best we've seen so far.
    # Compare against None explicitly: `not best_loss` would also be true for
    # a legitimate best loss of 0.0.
    if best_loss is None or avg_val_loss < best_loss:
        torch.save(model.state_dict(), 'emotion_detection_model.pt')
        best_loss = avg_val_loss

Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Iteration: 100%|██████████| 10897/10897 [06:37<00:00, 27.40it/s]


Epoch 1/10, Training Loss: 0.0510809865770666, Training Accuracy: 0.8597109317779541
Epoch 1/10, Validation Loss: 0.032726290087347294, Validation Accuracy: 0.9015986919403076


Training Iteration: 100%|██████████| 10897/10897 [06:36<00:00, 27.48it/s]


Epoch 2/10, Training Loss: 0.04130890183941917, Training Accuracy: 0.876907229423523
Epoch 2/10, Validation Loss: 0.032274370476069, Validation Accuracy: 0.9055644869804382


Training Iteration: 100%|██████████| 10897/10897 [06:36<00:00, 27.49it/s]


Epoch 3/10, Training Loss: 0.033085598825077535, Training Accuracy: 0.895801305770874
Epoch 3/10, Validation Loss: 0.034555868261055375, Validation Accuracy: 0.9037055373191833


Training Iteration: 100%|██████████| 10897/10897 [06:35<00:00, 27.55it/s]


Epoch 4/10, Training Loss: 0.026507989047764798, Training Accuracy: 0.9156476259231567
Epoch 4/10, Validation Loss: 0.03752613845779484, Validation Accuracy: 0.8947824835777283


Training Iteration: 100%|██████████| 10897/10897 [06:35<00:00, 27.54it/s]


Epoch 5/10, Training Loss: 0.0221642719521938, Training Accuracy: 0.9288057684898376
Epoch 5/10, Validation Loss: 0.04252559269376171, Validation Accuracy: 0.896021842956543


Training Iteration: 100%|██████████| 10897/10897 [06:35<00:00, 27.54it/s]


Epoch 6/10, Training Loss: 0.019449057870400386, Training Accuracy: 0.9375932216644287
Epoch 6/10, Validation Loss: 0.045657204153918725, Validation Accuracy: 0.8931714296340942
Epoch 00006: reducing learning rate of group 0 to 2.0000e-06.


Training Iteration: 100%|██████████| 10897/10897 [06:36<00:00, 27.52it/s]


Epoch 7/10, Training Loss: 0.01344705943573346, Training Accuracy: 0.9550648331642151
Epoch 7/10, Validation Loss: 0.05254458080335649, Validation Accuracy: 0.8916842341423035


Training Iteration: 100%|██████████| 10897/10897 [06:35<00:00, 27.54it/s]


Epoch 8/10, Training Loss: 0.011827620998255945, Training Accuracy: 0.9584949016571045
Epoch 8/10, Validation Loss: 0.054976001586879535, Validation Accuracy: 0.8925517201423645


Training Iteration: 100%|██████████| 10897/10897 [06:35<00:00, 27.53it/s]


Epoch 9/10, Training Loss: 0.01094888302748495, Training Accuracy: 0.9602386355400085
Epoch 9/10, Validation Loss: 0.06157056166515045, Validation Accuracy: 0.8932953476905823


Training Iteration: 100%|██████████| 10897/10897 [06:35<00:00, 27.53it/s]


Epoch 10/10, Training Loss: 0.010428805229919354, Training Accuracy: 0.9614087343215942
Epoch 10/10, Validation Loss: 0.06313456858899866, Validation Accuracy: 0.8937910795211792
Epoch 00010: reducing learning rate of group 0 to 2.0000e-07.


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Export the fine-tuned model and its tokenizer in Hugging Face format.
model_save_path = "distilbert-emotion-detector"

# After training, `model` holds the *last* epoch's weights. Restore the
# best-validation checkpoint written by the training cell first, so the
# exported directory contains the same weights that the test cell evaluates.
model.load_state_dict(torch.load('emotion_detection_model.pt', map_location=device))

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('distilbert-emotion-detector/tokenizer_config.json',
 'distilbert-emotion-detector/special_tokens_map.json',
 'distilbert-emotion-detector/vocab.txt',
 'distilbert-emotion-detector/added_tokens.json')

In [None]:
# Evaluate the best checkpoint (lowest validation loss) on the test split.
# Rebuild the architecture, then overwrite its weights with the checkpoint.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=7)

# Load the model weights from a file.
# map_location lets a checkpoint saved from a GPU run be loaded on whatever
# device is available now (e.g. a CPU-only runtime).
state_dict = torch.load('emotion_detection_model.pt', map_location=device)

# Apply the weights to the model
model.load_state_dict(state_dict)

model.to(device)

model.eval()
total_test_loss = 0.0
total_test_length = 0
correct_test_predictions = 0

with torch.no_grad():
    for batch in dataloader_test:
        input_ids, attention_mask, label = batch

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        batch_len = len(label)
        # outputs.loss is a per-batch mean; weight it by the batch size so the
        # final division by the sample count gives a true per-sample average.
        total_test_loss += loss.item() * batch_len
        total_test_length += batch_len

        test_preds = torch.argmax(outputs.logits, dim=1)
        # Plain-int accumulation keeps the accuracy computation valid even if
        # the loader yields no batches.
        correct_test_predictions += (test_preds == label).sum().item()

# max(..., 1) keeps an empty loader from raising a division error.
test_accuracy = correct_test_predictions / max(total_test_length, 1)
avg_test_loss = total_test_loss / max(total_test_length, 1)
print(f"Test Loss: {avg_test_loss}, Test Accuracy: {test_accuracy}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss: 0.05174991291735889, Test Accuracy: 0.8543927073478699
