In [None]:
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import sklearn
from sklearn.model_selection import train_test_split
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

In [None]:
HYPERPARAMETERS_TUNING = False

# Set seed
# One shared value keeps NumPy, PyTorch and the stdlib generator seeded alike.
SEED = 42
for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
    seed_rng(SEED)

# Load Data

In [None]:
# Labelled YouTube comments stored on the mounted Google Drive;
# only the comment text and its emotion label are read.
csv_path = '/content/drive/MyDrive/Colab Notebooks/youtube-sentiments/youtube_labeled_edited.csv'
df = pd.read_csv(csv_path, usecols=['text', 'emotion'])

df

# Process Data

In [None]:
# Features (comment text) and targets (emotion label) as separate Series
x, y = df['text'], df['emotion']

In [None]:
# Get number of emotions to classify
# EMOTIONS holds the distinct values of the 'emotion' column; N_EMOTIONS is
# their count and is later passed as num_labels when the classifier is loaded.
EMOTIONS = df['emotion'].unique()
N_EMOTIONS = len(EMOTIONS)
# Bare expression so the notebook displays the class count.
N_EMOTIONS

In [None]:
# Class id -> emotion label; a label's position in the tuple is its numeric id.
decode_map = dict(enumerate((
    'constructive feedback/idea',
    'negative',
    'neutral/other',
    'positive',
    'sadness',
)))

In [None]:
# Encode classes
# Invert decode_map once (label -> class id) instead of scanning it for every row.
encode_map = {label: class_id for class_id, label in decode_map.items()}
# Encode from the raw column rather than from `y` so re-running this cell is safe.
y = df['emotion'].map(encode_map)
# Fail with a clear message if the data holds labels that decode_map does not know.
unknown_labels = df.loc[y.isna(), 'emotion'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f'Labels missing from decode_map: {unknown_labels.tolist()}')
y = y.astype(int)

### SMOTE oversampling and random undersampling

https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/

In [None]:
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.pipeline import Pipeline

In [None]:
# # Define pipeline
# over = SMOTE(sampling_strategy=0.1)
# under = RandomUnderSampler(sampling_strategy=0.5)
# steps = [('o', over), ('u', under)]
# pipeline = Pipeline(steps=steps)
# # Transform dataset
# x, y = pipeline.fit_resample(x, y)

### Split data

In [None]:
# Hold out 20% of the rows for testing; shuffle=False keeps the original row order.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)

### Text Augmentation
* Spelling Augmenter
* Contextual Word Embeddings Augmenter
* Synonym Augmenter
* Antonym Augmenter
* Random Word Augmenter
* Contextual Word Embeddings for Sentence Augmenter

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

1. Character Augmenters (augmenters that work on character level)

In [None]:
# Keyboard Augmenter
def keyboard_augmenter(text):
    """Return keyboard-typo variants of ``text``.

    The previous body was a copy of ``spelling_augmenter`` (it built
    ``naw.SpellingAug``), so the "keyboard" step only duplicated the spelling
    augmentation. It now uses the character-level ``nac.KeyboardAug``.

    Args:
        text: the text to augment.

    Returns:
        The result of ``KeyboardAug.augment(text, n=3)``, i.e. up to three
        augmented variants are requested.
    """
    print('Keyboard Augmenter...')
    # Character-level augmenter; simulates keyboard typing errors per the nlpaug docs.
    aug = nac.KeyboardAug()
    keyboard_aug = aug.augment(text, n=3)
    return keyboard_aug

2. Word Augmenters (augmenters that work on word level)

In [None]:
# Spelling Augmenter #
# Substitute word by spelling mistake words dictionary
def spelling_augmenter(text):
    """Augment ``text`` with ``naw.SpellingAug``, requesting ``n=3`` outputs.

    A new augmenter is created on every call; its ``augment`` result is
    returned unchanged.
    """
    print('Spelling Augmenter...')
    return naw.SpellingAug().augment(text, n=3)

# Contextual Word Embeddings Augmenter #
# Insert word by contextual word embeddings
def insert_contextual_word_embeddings_augmenter(text):
    """Augment ``text`` with ``ContextualWordEmbsAug`` in "insert" mode.

    The augmenter is built from ``bert-base-uncased`` on every call and its
    ``augment`` result is returned unchanged.
    """
    print('Insert Contextual Word Embeddings Augmenter...')
    inserter = naw.ContextualWordEmbsAug(action="insert", model_path='bert-base-uncased')
    return inserter.augment(text)

# Contextual Word Embeddings Augmenter #
# Substitute word by contextual word embeddings
def substitute_contextual_word_embeddings_augmenter(text):
    """Augment ``text`` with ``ContextualWordEmbsAug`` in "substitute" mode.

    The augmenter is built from ``bert-base-uncased`` on every call and its
    ``augment`` result is returned unchanged.
    """
    print('Substitute Contextual Word Embeddings Augmenter...')
    substituter = naw.ContextualWordEmbsAug(action="substitute", model_path='bert-base-uncased')
    return substituter.augment(text)


# Synonym Augmenter #
# Substitute word by WordNet's synonym
def synonym_augmenter(text):
    """Augment ``text`` with ``naw.SynonymAug`` using the ``wordnet`` source.

    Returns the augmenter's ``augment`` result unchanged.
    """
    print('Synonym Augmenter...')
    return naw.SynonymAug(aug_src='wordnet').augment(text)


# Antonym Augmenter #
# Substitute word by antonym
def antonym_augmenter(text):
    """Augment ``text`` with a default-configured ``naw.AntonymAug``.

    Returns the augmenter's ``augment`` result unchanged.
    """
    print('Antonym Augmenter...')
    return naw.AntonymAug().augment(text)


# Random Word Augmenter #
# Swap word randomly
def swap_random_word_augmenter(text):
    """Augment ``text`` with ``naw.RandomWordAug`` configured with ``action="swap"``.

    Returns the augmenter's ``augment`` result unchanged.
    """
    print('Swap Random Word Augmenter...')
    swapper = naw.RandomWordAug(action="swap")
    return swapper.augment(text)

# Random Word Augmenter #
# Delete word randomly
def delete_random_word_augmenter(text):
    """Augment ``text`` with a default-configured ``naw.RandomWordAug``.

    No ``action`` is passed, so the library default applies
    (presumably "delete", as the heading says — TODO confirm).
    Returns the augmenter's ``augment`` result unchanged.
    """
    print('Delete Random Word Augmenter...')
    remover = naw.RandomWordAug()
    return remover.augment(text)

3. Sentence Augmenters (augmenters that work on sentence level)

In [None]:
# Contextual Word Embeddings #
# Insert sentence by contextual word embeddings
def contextual_word_embeddings_sentence_augmenter(text):
    """Augment ``text`` with ``nas.ContextualWordEmbsForSentenceAug`` (``gpt2``).

    The augmenter is created on every call and its ``augment`` result is
    returned unchanged.
    """
    print('Contextual Word Embeddings for Sentence Augmenter...')
    sentence_aug = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2')
    return sentence_aug.augment(text)

In [None]:
def augment_data(X, labels, is_train_data):
    """Append augmented copies of the examples to a data split.

    Every enabled augmenter is applied to each selected example, and the
    augmented texts are appended after the original ones:

    * training data: classes 0, 1 and 4 keep up to two variants per augmenter,
      every other class keeps one;
    * test data: classes 0, 1 and 4 keep one variant per augmenter, other
      classes are not augmented.

    Compared with the previous version, texts and labels are converted to
    plain lists once up front. This removes the label-based ``labels[idx]``
    lookup (which was only correct for a Series indexed 0..n-1) and the
    repeated ``labels.to_list()`` calls inside the loop.

    Args:
        X: iterable of texts (pandas Series or list).
        labels: iterable of integer class ids aligned with ``X``.
        is_train_data: selects the training or the test augmentation policy.

    Returns:
        Tuple ``(texts, labels)`` of two new lists: the originals followed by
        the augmented examples. The inputs are not modified.

    Raises:
        KeyError: if a label is outside the five known class ids (0-4) when
            the per-class summary is computed.
    """
    # Classes that receive the heavier augmentation policy.
    boosted_classes = {0, 1, 4}

    augmenters = [
        keyboard_augmenter,
        spelling_augmenter,
        # insert_contextual_word_embeddings_augmenter,
        # substitute_contextual_word_embeddings_augmenter,
        # synonym_augmenter,
        # antonym_augmenter,
        # swap_random_word_augmenter,
        # delete_random_word_augmenter,
        # contextual_word_embeddings_sentence_augmenter
    ]

    print('Augmenting', 'training data' if is_train_data else 'test data', '\n')

    # Copy into positional lists so the pairing of text and label never depends
    # on a pandas index, and so the caller's objects are left untouched.
    X = list(X)
    labels = list(labels)

    X_augmented = []
    augmented_labels = []

    for example, label in zip(X, labels):
        is_boosted = label in boosted_classes
        if is_train_data:
            n_keep = 2 if is_boosted else 1
        else:
            n_keep = 1 if is_boosted else 0
        if n_keep == 0:
            # Do not even run the augmenters for examples that are not augmented.
            continue

        for augmenter in augmenters:
            variants = augmenter(example)
            # An augmenter may hand back a single string instead of a list;
            # wrap it so slicing does not cut the text into characters.
            if isinstance(variants, str):
                variants = [variants]
            variants = list(variants[:n_keep])
            X_augmented.extend(variants)
            augmented_labels.extend([label] * len(variants))

    X.extend(X_augmented)
    print('X data after augmentation:', len(X))

    labels.extend(augmented_labels)
    print('Labels data after augmentation:',len(labels))

    # Count augmented data by class
    count = {
        0: 0,
        1: 0,
        2: 0,
        3: 0,
        4: 0,
    }

    for class_id in labels:
        count[class_id] += 1

    print('-'*34)
    print(' constructive feedback/idea |', count[0])
    print(' negative', ' '*17 ,  '|', count[1])
    print(' neutral/other', ' '*12 ,  '|', count[2])
    print(' positive', ' '*17 ,  '|', count[3])
    print(' sadness', ' '*18 ,  '|', count[4])
    print('-'*34)

    return X, labels

In [None]:
# Augment both splits; augment_data returns plain Python lists (texts, labels).
# NOTE: the split variables are rebound here, so re-running this cell passes
# the already-augmented lists back into augment_data.
X_train, y_train = augment_data(X_train, y_train, is_train_data=True)
X_test, y_test = augment_data(X_test, y_test, is_train_data=False)

In [None]:
# Sanity check: texts and labels of each split must have matching lengths
print(f'X_train: {len(X_train)} | X_test: {len(X_test)}')
print(f'y_train: {len(y_train)} | y_test: {len(y_test)}')

### Tokenizer and Encoding

In [None]:
# Tokenizer for the 'bert-base-multilingual-cased' checkpoint, the same
# checkpoint name the classifier is loaded from; use_fast=True is requested.
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased', use_fast=True)

In [None]:
# Encode data
# augment_data returns plain lists, so calling `.to_list()` on X_test raised
# AttributeError. list(...) accepts both lists and pandas Series.
X_train_encoded = tokenizer(list(X_train), truncation=True, padding=True, return_tensors="pt")
X_test_encoded = tokenizer(list(X_test), truncation=True, padding=True, return_tensors="pt")

### Training and Testing Datasets

In [None]:
# Training Data
# Each dataset item is (input_ids, attention_mask, label).
train_data = torch.utils.data.TensorDataset(
    X_train_encoded['input_ids'],
    X_train_encoded['attention_mask'],
    # list(...) works whether the labels are a list or a pandas Series
    torch.tensor(list(y_train))
)
train_dataloader = torch.utils.data.DataLoader(
    train_data,
    batch_size=8,
    shuffle=True
)

# Testing Data
test_data = torch.utils.data.TensorDataset(
    X_test_encoded['input_ids'],
    X_test_encoded['attention_mask'],
    # y_test is a list after augment_data, so the former `.to_list()` call
    # raised AttributeError
    torch.tensor(list(y_test))
)
test_dataloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=8,
    num_workers=2
)

# Model

In [None]:
# Load the pretrained multilingual BERT checkpoint with a sequence-classification
# head that has one output per emotion class (num_labels=N_EMOTIONS).
model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=N_EMOTIONS)

# Bare expression so the notebook displays the model's repr.
model

In [None]:
# CUDA
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = model.to(device)

# Hyper-parameters Tuning

In [None]:
from functools import partial
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [None]:
def train_cifar(config, train_loader=None, test_loader=None):
    """Ray Tune trainable: fine-tune multilingual BERT and report validation metrics.

    A fresh 'bert-base-multilingual-cased' classifier is trained for a fixed
    number of epochs. After every epoch the model is evaluated on
    ``test_loader`` and the mean validation loss and accuracy are sent to Tune
    with ``tune.report``, so a trial scheduler receives one result per epoch.

    Fixes compared with the previous version:
      * the LR schedule length now matches the epochs actually run (it was
        sized for 10 epochs while only 5 were trained);
      * the average training loss uses ``train_loader`` instead of the global
        ``train_dataloader``;
      * the running loss is accumulated as a Python float (``loss.item()``)
        rather than as a tensor that carries autograd history;
      * the model is switched between ``train()`` and ``eval()`` modes;
      * the validation loss is computed from the logits; it was previously
        computed from the argmax class indices;
      * per-batch debug prints were removed.

    Args:
        config: dict with the sampled hyper-parameters. ``'lr'`` and
            ``'num_warmup_steps'`` are read here.
            NOTE(review): ``'batch_size'`` is not read, because the loaders
            arrive already built.
        train_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        test_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.

    Raises:
        ValueError: if either loader is missing.
    """
    if train_loader is None or test_loader is None:
        raise ValueError('train_cifar requires both train_loader and test_loader')

    n_epochs = 5

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=N_EMOTIONS)
    # CUDA and parallel training (if available)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    # if torch.cuda.device_count() > 1:
    #     model = nn.DataParallel(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=config['lr'], correct_bias=False)
    steps_per_epoch = len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['num_warmup_steps'],
        # Total optimizer steps that will really be taken
        num_training_steps=steps_per_epoch * n_epochs,
    )

    print('Training...')
    for epoch in range(n_epochs):
        print('Epoch:', epoch+1)

        # ---- training pass ----
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_loader):
            # Zero model gradients
            model.zero_grad()

            # Get input data and move them to device
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            # Predict
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Get loss, calculate and clip gradients, and update parameters
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Update scheduler
            scheduler.step()
            # .item() detaches the value so no autograd history is accumulated
            total_loss += loss.item()

        avg_train_loss = total_loss / max(steps_per_epoch, 1)
        print('Loss:', avg_train_loss)
        print("\n")

        # ---- validation pass ----
        model.eval()
        correct = 0
        total = 0
        val_loss = 0.0
        val_steps = 0

        with torch.no_grad():
            for data in test_loader:
                # Get data and move them to the right device
                text, attention, labels = data
                text, attention, labels = text.to(device), attention.to(device), labels.to(device)
                # Get predictions from model
                logits = model(text, attention).logits
                # Most likely class for every example of the batch
                predictions = logits.argmax(dim=1)
                total += labels.size(0)
                correct += (predictions == labels).sum().item()
                # Cross-entropy needs the raw logits, not the predicted class ids
                val_loss += criterion(logits, labels).item()
                val_steps += 1

        # One report per epoch lets the scheduler compare trials early.
        tune.report(loss=val_loss / max(val_steps, 1), accuracy=correct / max(total, 1))

    print("Finished Training")

In [None]:
if HYPERPARAMETERS_TUNING:
    # Hyper-parameters configuration
    # NOTE(review): 'batch_size' is sampled, but the loaders handed to the
    # trainable below are already built, so it does not appear to affect trials.
    config = {
        'lr': tune.loguniform(1e-6, 1e-1),
        'batch_size': tune.choice([2, 4, 8,]),
        'num_warmup_steps': tune.choice([0, 100, 200, 500])
    }

    # ASHAScheduler terminates bad performing trials early
    asha_scheduler = ASHAScheduler(
            metric="loss",
            mode="min",
            max_t=5,
            grace_period=1,
            reduction_factor=2
    )

    reporter = CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])

    # Request a GPU per trial only when one exists, matching the CPU fallback
    # used elsewhere in the notebook.
    gpus_per_trial = 1 if torch.cuda.is_available() else 0

    result = tune.run(
        # Bind the pre-built data loaders so Tune only has to supply `config`
        partial(train_cifar, train_loader=train_dataloader, test_loader=test_dataloader),
        resources_per_trial={ "cpu": 1, "gpu": gpus_per_trial },
        config=config,
        num_samples=5,
        scheduler=asha_scheduler,
        progress_reporter=reporter,
    )

# Training

In [None]:
# Empty CUDA cache
# Only meaningful on a GPU: release cached blocks, then show the memory summary.
if device == 'cuda':
    torch.cuda.empty_cache()
    memory_report = torch.cuda.memory_summary(device=device, abbreviated=True)
    print(memory_report)

In [None]:
# TRAINING
EPOCHS = 10

# Model on training mode
model.train()
# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
steps_per_epoch = len(train_dataloader)
# Linear decay without warm-up over all optimizer steps of the run
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=steps_per_epoch*EPOCHS)

# Training loop
print('Training...\n')
for epoch in range(EPOCHS):
    print('-'*100)
    print('Epoch:', epoch+1)

    # Plain float accumulator: summing the loss tensors themselves would carry
    # autograd history along for the whole epoch.
    total_loss = 0.0

    for batch in tqdm(train_dataloader):
        # Zero model gradients
        model.zero_grad()

        # Get input data and move them to device
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Predict
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Get loss, calculate and clip gradients, and update parameters
        loss = outputs[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Update scheduler
        scheduler.step()
        # Update total loss (detached Python number)
        total_loss += loss.item()

    # max(..., 1) avoids a division by zero when the loader is empty
    avg_train_loss = total_loss / max(steps_per_epoch, 1)
    print('Loss:', avg_train_loss)
    print('-'*100)
    print("\n")

# Testing

In [None]:
# Model on CUDA
model = model.to(device)
# Model on evaluation mode
model.eval()

correct = 0
total = 0
predicted_all = []
true_all = []

with torch.no_grad():
    for text, attention, labels in test_dataloader:
        # Move the batch to the device the model lives on
        text, attention, labels = text.to(device), attention.to(device), labels.to(device)
        # Get predictions from model
        outputs = model(text, attention)
        # Most likely class for the whole batch in one call; this replaces the
        # per-example torch.max / .item() loop.
        predictions = outputs.logits.argmax(dim=1)
        # Calculate total
        total += labels.size(0)
        # Calculate number of correct classification
        correct += (predictions == labels).sum().item()
        # Store all predictions and labels
        predicted_all.extend(predictions.tolist())
        true_all.extend(labels.tolist())

# An empty test loader yields nan instead of a ZeroDivisionError
accuracy_pct = (100 * correct / total) if total else float('nan')
print(f'Testing accuracy: {accuracy_pct}%')

### F-1 Score

In [None]:
# The closer to 1.0 the better
# f1_score expects (y_true, y_pred). The arguments were swapped before, which
# weights the per-class scores by the predicted instead of the true class counts.
sklearn.metrics.f1_score(true_all, predicted_all, average='weighted')

### Cohen's Kappa statistic

In [None]:
# Rule-of-thumb interpretation used in this notebook:
# Bad model: less than 0.60
# Good model: 0.60-0.80
# Excellent: more than 0.80
# NOTE(review): predictions are passed first and true labels second here;
# kappa is presumably symmetric in its two raters, so the order should not
# change the value — confirm against the scikit-learn documentation.
sklearn.metrics.cohen_kappa_score(predicted_all, true_all)

### Tabulate correctly and incorrectly classified examples

In [None]:
# Translate numeric class ids back to their emotion names
kappa_predicted_decoded = [decode_map[class_id] for class_id in predicted_all]
kappa_true_decoded = [decode_map[class_id] for class_id in true_all]

# Per true class: how many examples were classified correctly / incorrectly
res = {
    emotion: { 'correct': 0, 'not_correct': 0 }
    for emotion in (
        'neutral/other',
        'positive',
        'negative',
        'constructive feedback/idea',
        'sadness',
    )
}

for true_label, predicted_label in zip(kappa_true_decoded, kappa_predicted_decoded):
    outcome = 'correct' if true_label == predicted_label else 'not_correct'
    res[true_label][outcome] += 1


neutral_c, neutral_nc = res['neutral/other']['correct'], res['neutral/other']['not_correct']

positive_c, positive_nc = res['positive']['correct'], res['positive']['not_correct']

negative_c, negative_nc = res['negative']['correct'], res['negative']['not_correct']

constructive_c, constructive_nc = res['constructive feedback/idea']['correct'], res['constructive feedback/idea']['not_correct']

sadness_c, sadness_nc = res['sadness']['correct'], res['sadness']['not_correct']


# Fixed-width text table: one column per class, one row per outcome
print('='*102)
print('|', ' '*21, '|', 'neutral/other', '|', 'positive', '|', 'negative', '|', 'constructive feedback/idea', '|', 'sadness', '|')
print('|', '='*98, '|')
print('|', 'correctly predicted  ', '|', neutral_c, ' '*10, '|', positive_c, ' '*5, '|', negative_c, ' '*5, '|', constructive_c, ' '*23, '|', sadness_c, ' '*4, '|')
print('|', '='*98, '|')
print('|', 'incorrectly predicted', '|', neutral_nc, ' '*10, '|', positive_nc, ' '*5, '|', negative_nc, ' '*5, '|', constructive_nc, ' '*23, '|', sadness_nc, ' '*4, '|')
print('='*102)

### Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Get confusion matrix
# Called with the true labels first and the predictions second.
confusion_matrix = sklearn.metrics.confusion_matrix(true_all, predicted_all)

# Plot confusion matrix
# The figure is created before the seaborn settings below are applied.
plt.figure(figsize=(5, 5), dpi=100)
# Cubehelix palette requested as a matplotlib colormap (as_cmap=True)
cmap = sns.cubehelix_palette(50, hue=0.05, rot=0, light=0.9, dark=0, as_cmap=True)
# NOTE(review): sns.set presumably changes plotting defaults globally, which
# would also affect figures drawn in later cells — confirm.
sns.set(font_scale=2.5)
# annot=True writes the counts into the cells; the colour bar is turned off.
sns.heatmap(confusion_matrix, annot=True, cmap=cmap, cbar=False)

# New Predictions

In [None]:
sentence = """
    Programming is yelling at a computer what to do in a made-up cyberlanguage and the computer ignoring what you said because you missed a comma.
"""

# Tokenize the single example into PyTorch tensors
encoded_sentence = tokenizer(sentence, truncation=True, padding=True, return_tensors="pt")

# The encoded tensors live on the CPU, so run inference there.
# Note: this moves the global model off the GPU.
model = model.to('cpu')
model.eval()

with torch.no_grad():
    logits = model(encoded_sentence.input_ids, encoded_sentence.attention_mask).logits
    # Use torch's own argmax instead of handing a torch tensor to NumPy
    prediction = torch.argmax(logits, dim=-1)

# Bare expression so the notebook displays the predicted emotion name.
decode_map[prediction.item()]

# Save Model and Tokenizer

In [None]:
# Write the model and the tokenizer to separate directories under /tmp.
# NOTE(review): /tmp is typically ephemeral — confirm the artifacts are copied
# somewhere persistent if they must outlive the session.
model.save_pretrained('/tmp/model')
tokenizer.save_pretrained('/tmp/tokenizer')