In [None]:
import random
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

In [None]:
# Seed NumPy, PyTorch and Python's built-in RNG with the same value so that
# repeated runs of the notebook draw the same random numbers.
for seed_rng in (np.random.seed, torch.manual_seed, random.seed):
    seed_rng(42)

# Load Data

In [None]:
# Read only the 'text' and 'emotion' columns of the labeled CSV.
csv_path = '/content/drive/MyDrive/Colab Notebooks/youtube-sentiments/youtube_labeled_edited.csv'
df = pd.read_csv(csv_path, usecols=['text', 'emotion'])

# Show the loaded frame as the cell output.
df

# Process Data

In [None]:
# Inputs come from the 'text' column, targets from the 'emotion' column.
x, y = df['text'], df['emotion']

In [None]:
# Distinct values found in the 'emotion' column, and how many there are.
emotion_column = df['emotion']
EMOTIONS = emotion_column.unique()
N_EMOTIONS = len(EMOTIONS)

# The count is the cell output.
N_EMOTIONS

In [None]:
# Class id -> emotion label. Ids are assigned by position in this tuple,
# starting at 0, so the order of the entries matters.
decode_map = dict(enumerate((
    'constructive feedback/idea',
    'negative',
    'neutral/other',
    'positive',
    'sadness',
)))

In [None]:
# Encode classes
# Invert decode_map once (label text -> class id) instead of scanning the
# whole dictionary for every row.
encode_map = {label: class_id for class_id, label in decode_map.items()}

# Encode from the raw 'emotion' column rather than from `y` itself, so that
# re-running this cell gives the same result instead of failing on
# already-encoded integers.
encoded_labels = df['emotion'].map(encode_map)

# Labels missing from decode_map (or missing values) map to NaN; stop here
# with a message naming them rather than failing later with an opaque error.
not_encoded = encoded_labels.isna()
if not_encoded.any():
    unknown_labels = df.loc[not_encoded, 'emotion'].unique().tolist()
    raise ValueError(f"Labels not present in decode_map: {unknown_labels}")

y = encoded_labels.astype('int64')

In [None]:
# Hold out 20% of the rows for testing. shuffle=False keeps the rows in their
# original order, so the split is the same on every run.
split_options = {'test_size': 0.2, 'shuffle': False}
X_train, X_test, y_train, y_test = train_test_split(x, y, **split_options)

### Tokenizer and Encoding

In [None]:
# Load the fast tokenizer for the multilingual cased BERT checkpoint.
tokenizer_checkpoint = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, use_fast=True)

In [None]:
# Encode data
# Both splits are tokenized with the same settings: truncation enabled,
# padding enabled, and PyTorch tensors as the return type.
tokenize_options = dict(truncation=True, padding=True, return_tensors="pt")
X_train_encoded = tokenizer(X_train.to_list(), **tokenize_options)
X_test_encoded = tokenizer(X_test.to_list(), **tokenize_options)

### Training and Testing Datasets

In [None]:
# Training Data
# Each dataset item is (input_ids, attention_mask, label).
# The labels go through .to_list() (the same conversion used for the test
# labels) so the tensor is built from plain Python ints. Passing the pandas
# Series directly makes the conversion depend on the Series index, which is
# fragile if the split is ever shuffled.
train_data = torch.utils.data.TensorDataset(
    X_train_encoded['input_ids'],
    X_train_encoded['attention_mask'],
    torch.tensor(y_train.to_list())
)
# Batches of 8, reshuffled on every pass over the training data.
train_dataloader = torch.utils.data.DataLoader(
    train_data,
    batch_size=8,
    shuffle=True
)

# Testing Data
# Each dataset item is (input_ids, attention_mask, label).
test_labels = torch.tensor(y_test.to_list())
test_data = torch.utils.data.TensorDataset(
    *(X_test_encoded[field] for field in ('input_ids', 'attention_mask')),
    test_labels,
)
# Batches of 8 in dataset order, loaded by two worker processes.
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=8, num_workers=2)

# Model

In [None]:
# Multilingual cased BERT with a sequence-classification head that has one
# output per distinct emotion found in the data.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=N_EMOTIONS,
)

# Show the model as the cell output.
model

In [None]:
# CUDA
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Move the model to the selected device.
model = model.to(device)

# Training

In [None]:
# On GPU runs only: release cached CUDA memory, then print an abbreviated
# memory summary for the device.
running_on_gpu = device == 'cuda'
if running_on_gpu:
    torch.cuda.empty_cache()
    gpu_memory_report = torch.cuda.memory_summary(device=device, abbreviated=True)
    print(gpu_memory_report)

In [None]:
# TRAINING
EPOCHS = 1

# Model on training mode
model.train()
# Define optimizer and scheduler. The schedule is linear with no warmup steps
# and spans every optimizer step of the run (batches per epoch * epochs).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader)*EPOCHS)

# Training loop
print('Training...\n')
for epoch in range(EPOCHS):
    print('-'*100)
    print('Epoch:', epoch+1)

    # Sum of the per-batch losses for this epoch, used for the average below.
    total_loss = 0.0

    for batch in tqdm(train_dataloader):
        # Zero model gradients
        model.zero_grad()

        # Get input data and move them to device
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Predict; passing labels makes the model return the loss as its
        # first output.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Get loss, calculate and clip gradients, and update parameters
        loss = outputs[0]
        # Accumulate the batch loss as a plain Python float. Without this the
        # reported epoch average would always be 0.
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Update scheduler
        scheduler.step()

    # Mean loss per batch over the epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    print('Loss:', avg_train_loss)
    print('-'*100)
    print("\n")

# Testing

In [None]:
# Make sure the model is on the selected device (GPU if available, else CPU)
model = model.to(device)
# Model on evaluation mode
model.eval()

correct = 0
total = 0

# Gradients are not needed for evaluation.
with torch.no_grad():
    for data in test_dataloader:
        # Get data and move them to the right device
        text, attention, labels = data
        text, attention, labels = text.to(device), attention.to(device), labels.to(device)
        # Get predictions from model
        outputs = model(input_ids=text, attention_mask=attention)
        # Predicted class per sample: index of the largest logit along the
        # class dimension. The argmax is taken row by row, so every sample
        # gets its own prediction rather than the first sample's.
        predictions = outputs.logits.argmax(dim=1)
        # Calculate total
        total += labels.size(0)
        # Calculate number of correct classification
        correct += (predictions == labels).sum().item()

print(f'Testing accuracy: {(100 * correct / total)}%')

# New Predictions

In [None]:
sentence = """
    Programming is yelling at a computer what to do in a made-up cyberlanguage and the computer ignoring what you said because you missed a comma.
"""

# Tokenize the single sentence into PyTorch tensors.
encoded_sentence = tokenizer(sentence, truncation=True, padding=True, return_tensors="pt")

# The encoded sentence is never moved off the CPU, so run the model there too.
model = model.to('cpu')
model.eval()

with torch.no_grad():
    sentence_outputs = model(
        input_ids=encoded_sentence.input_ids,
        attention_mask=encoded_sentence.attention_mask,
    )
    # Index of the largest logit along the class dimension. Using torch's
    # argmax on that dimension avoids routing a tensor through NumPy and
    # avoids argmax-ing over the flattened logits.
    prediction = sentence_outputs.logits.argmax(dim=-1)

# Translate the predicted class id back to its label (cell output).
decode_map[prediction.item()]

# Save Model and Tokenizer

In [None]:
# Write the model and the tokenizer to separate directories under /tmp.
model_dir, tokenizer_dir = '/tmp/model', '/tmp/tokenizer'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)