In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datetime import datetime

# Load the excerpts annotated with LLaMA emotion labels (the input path
# repeats the CSV file name as its parent directory).
df = pd.read_csv('/kaggle/input/msc-thesis-dataset/full_with_llama_emotions.csv/full_with_llama_emotions.csv')

# Columns that the rest of this script never reads; drop them up front.
columns_to_drop = [
    'Unnamed: 0',
    'excerpt_id',
    'excerpt_value',
    'source_url',
    'source_id',
    'source_label',
    'source_date',
    'relevantExcerpt',
    'geoname_id',
    'countries',
    'sentiment',
    'time',
    'place',
    'id',
]
df = df.drop(columns=columns_to_drop)

class EmotionDataset(Dataset):
    """Map-style dataset that pairs text excerpts with integer emotion labels.

    Items are tokenized lazily, one excerpt per ``__getitem__`` call.

    Args:
        excerpts: Indexable collection of texts; each item is coerced with ``str``.
        labels: Indexable collection of integer class ids aligned with ``excerpts``.
        tokenizer: Object exposing ``encode_plus`` with the keyword arguments
            used below (a ``BertTokenizer`` in this script).
        max_len: Length every token sequence is padded or truncated to.
    """

    def __init__(self, excerpts, labels, tokenizer, max_len):
        self.excerpts, self.labels = excerpts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of excerpts held by the dataset."""
        return len(self.excerpts)

    def __getitem__(self, idx):
        """Tokenize the excerpt at ``idx`` and bundle it with its label.

        Returns:
            dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(str(self.excerpts[idx]), **tokenizer_options)

        # Flatten each encoded tensor to 1-D so the DataLoader can stack items
        # into (batch, sequence) tensors.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Parameters
MAX_LEN = 512
BATCH_SIZE = 16
EPOCHS = 3
SEED = 42  # one seed for the data splits and for torch's random number generator

# Seed torch before any shuffling or weight initialisation so repeated runs
# start from the same random state (GPU kernels may still add small noise).
torch.manual_seed(SEED)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Map emotions to integers; ids follow the order in which each emotion first
# appears in the 'emotion_llama' column.
label_dict = {label: idx for idx, label in enumerate(df['emotion_llama'].unique())}
# ``map`` is a plain per-value dictionary lookup and yields the integer ids
# directly, which is all that is needed for a category -> id conversion.
df['emotion_labels'] = df['emotion_llama'].map(label_dict)

# Split data into training/validation and test sets with stratified sampling.
# ``random_state`` pins the split so reported metrics refer to the same rows
# on every run.
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df['excerpt_value_cleaned'],
    df['emotion_labels'],
    test_size=0.3,  # Adjust the test set size as needed
    stratify=df['emotion_labels'],
    random_state=SEED,
)

# Split the training/validation set into separate training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts,
    train_val_labels,
    test_size=0.2,  # Adjust the validation set size as needed
    stratify=train_val_labels,
    random_state=SEED,
)

# Reset index for all the resulting sets so that EmotionDataset can index
# them with positions 0..n-1.
train_texts = train_texts.reset_index(drop=True)
val_texts = val_texts.reset_index(drop=True)
test_texts = test_texts.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)
test_labels = test_labels.reset_index(drop=True)

# Create datasets
train_dataset = EmotionDataset(train_texts, train_labels, tokenizer, MAX_LEN)
val_dataset = EmotionDataset(val_texts, val_labels, tokenizer, MAX_LEN)
test_dataset = EmotionDataset(test_texts, test_labels, tokenizer, MAX_LEN)

# Data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Load BERT with a classification head sized to the number of emotions
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_dict))
# Use PyTorch's AdamW: the transformers implementation is deprecated in favour
# of it. eps and weight_decay are set explicitly to keep the hyper-parameters
# the deprecated optimizer used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop: one optimizer step per batch, mean batch loss reported per epoch
model.train()
for epoch in range(EPOCHS):
    print('started the epoch')
    total_loss = 0
    for batch in train_loader:
        # Move exactly the tensors the model consumes onto the training device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }
        model.zero_grad()
        # Passing labels makes the model compute the classification loss itself.
        loss = model(**model_inputs).loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()  # running sum of the per-batch losses

    average_loss = total_loss / len(train_loader)  # mean over the batches of this epoch
    print(f'Epoch {epoch + 1}/{EPOCHS}, Average Loss: {average_loss:.4f}, Time: {datetime.now().strftime("%H:%M:%S")}')


# Switch to evaluation mode before running the evaluation below
model.eval()

def evaluate(loader, set_name):
    """Run the global ``model`` over ``loader`` and report per-class metrics.

    Predictions are the argmax over the model's logits. The resulting
    classification report is printed and also written to
    ``/kaggle/working/{set_name}_classification_report.txt``.

    Args:
        loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        set_name: Name of the evaluated split; used in the printed heading
            and in the report file name.
    """
    # Put the model in evaluation mode here rather than trusting the caller,
    # so the function stays correct if it is invoked after further training.
    model.eval()

    predictions = []
    true_labels = []
    # Gradients are never needed during evaluation; disable them once for the
    # whole pass instead of re-entering the context on every batch.
    with torch.no_grad():
        for batch in loader:
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host, so they are not copied to
            # the device and back.
            true_labels.extend(batch['labels'].tolist())

    # Report: class ids in ascending order with their matching emotion names
    labels_expected = list(range(len(label_dict)))
    target_names = [label for label, index in sorted(label_dict.items(), key=lambda x: x[1])]
    # NOTE(review): zero_division=1 scores undefined precision/recall as 1.0,
    # which can inflate averaged metrics for classes that are never predicted
    # - confirm this is intended.
    report = classification_report(true_labels, predictions, labels=labels_expected, target_names=target_names, zero_division=1)
    print(f"{set_name} Set Classification Report:")
    print(report)

    # Save the report; an explicit encoding keeps the file independent of the
    # platform default.
    with open(f'/kaggle/working/{set_name}_classification_report.txt', 'w', encoding='utf-8') as file:
        file.write(report)

# Evaluate on the validation set first, then on the test set
for eval_loader, split_name in ((val_loader, "Validation"), (test_loader, "Test")):
    evaluate(eval_loader, split_name)