In [None]:
!pip install -q transformers datasets accelerate evaluate

# Load and Preprocess the Dataset

In [None]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer
import random
import nltk
from nltk.corpus import wordnet

# Silence the Hugging Face tokenizers parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Seed Python's RNG so the random choices made by the augmentation helpers
# (random.choice / random.random / random.sample) are repeatable across runs.
# NOTE: row selection via DataFrame.sample in augment_data is not controlled
# by this seed.
random.seed(42)

# Check for GPU availability and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
df = pd.read_csv("train_stockemo.csv")

# Print column names for debugging
print("Dataset Columns:", df.columns)

# Normalize column names (lowercase, no surrounding whitespace) so the
# lookups below do not depend on the exact header spelling in the CSV.
df.columns = df.columns.str.lower().str.strip()

# Fail early if the text or label column is absent.
required_columns = {"processed", "emo_label"}
if not required_columns.issubset(df.columns):
    raise ValueError(f"Dataset is missing required columns. Found: {df.columns}")

# Mapping from the fine-grained emotion labels to the merged categories
# used for training.
label_merging = {
    "ambiguous": "neutral_belief",
    "belief": "neutral_belief",
    "amusement": "positive_outlook",
    "excitement": "positive_outlook",
    "optimism": "positive_outlook",
    "anger": "anger_disgust",
    "disgust": "anger_disgust",
    "anxiety": "anxiety_uncertainty",
    "panic": "anxiety_uncertainty",
    "confusion": "anxiety_uncertainty",
    "depression": "sadness_depression",
    "surprise": "surprise"
}

# Apply mapping to dataset
df["merged_label"] = df["emo_label"].map(label_merging)

# Series.map yields NaN for any value missing from the mapping. Surface that
# here with a clear message instead of letting NaN labels propagate into the
# label-id assignment further down.
unmapped_labels = df.loc[df["merged_label"].isna(), "emo_label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Found emo_label values with no merged category: {sorted(map(str, unmapped_labels))}"
    )

# WordNet is required by synonym_replacement.
nltk.download('wordnet')

# Data Augmentation Techniques
def synonym_replacement(sentence, n=2):
    """Replace up to ``n`` randomly chosen words with a WordNet synonym.

    Args:
        sentence: Whitespace-separated text to augment.
        n: Number of replacement attempts. Each attempt picks one word at
            random from the original sentence; all occurrences of that word
            are replaced.

    Returns:
        The augmented sentence. The input is returned unchanged when it
        contains no words, and a word is left alone when WordNet offers no
        lemma that differs from it.
    """
    words = sentence.split()
    if not words:
        # random.choice raises IndexError on an empty sequence.
        return sentence

    new_words = words.copy()
    for _ in range(n):
        word = random.choice(words)
        # Walk every lemma of every synset: the very first lemma is often the
        # word itself, which would make the replacement a no-op. Multi-word
        # lemmas use underscores in WordNet, so turn them back into spaces.
        candidates = (
            lemma.name().replace("_", " ")
            for synset in wordnet.synsets(word)
            for lemma in synset.lemmas()
        )
        synonym = next(
            (candidate for candidate in candidates if candidate.lower() != word.lower()),
            None,
        )
        if synonym is not None:
            new_words = [synonym if w == word else w for w in new_words]
    return " ".join(new_words)

def random_deletion(sentence, p=0.2):
    """Randomly drop words from a sentence.

    Args:
        sentence: Whitespace-separated text to augment.
        p: Probability of removing each individual word.

    Returns:
        The sentence with some words removed. Sentences with zero or one word
        are returned unchanged; if every word happens to be dropped, the
        first word is kept so the result is never empty.
    """
    words = sentence.split()
    if len(words) <= 1:
        # Nothing to delete (this also avoids indexing into an empty list below).
        return sentence
    new_words = [word for word in words if random.random() > p]
    return " ".join(new_words) if new_words else words[0]

def word_swap(sentence, n=2):
    """Swap two randomly chosen word positions, ``n`` times.

    Args:
        sentence: Whitespace-separated text to augment.
        n: Number of swaps to perform.

    Returns:
        The sentence with its words reordered. Sentences with fewer than two
        words are returned unchanged because there is nothing to swap.
    """
    words = sentence.split()
    if len(words) < 2:
        # random.sample cannot draw two distinct positions from fewer than two.
        return sentence

    new_words = words.copy()
    positions = range(len(words))
    for _ in range(n):
        idx1, idx2 = random.sample(positions, 2)
        new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
    return " ".join(new_words)

def augment_data(df, category, num_samples=500):
    """Generate synthetic samples for one merged emotion category.

    Each new sample is produced by drawing one existing row of the category
    at random and applying one randomly chosen augmentation
    (synonym replacement, random deletion or word swap) to its text.

    Args:
        df: DataFrame with at least ``processed`` (text) and ``merged_label``
            columns.
        category: Value of ``merged_label`` to augment.
        num_samples: Number of synthetic rows to generate.

    Returns:
        A DataFrame with columns ``processed`` and ``merged_label`` holding
        ``num_samples`` rows (empty when ``num_samples`` is not positive).

    Raises:
        ValueError: If samples are requested for a category that has no rows
            with text to draw from.
    """
    columns = ["processed", "merged_label"]
    if num_samples <= 0:
        return pd.DataFrame(columns=columns)

    # Rows without text cannot be augmented (NaN has no .split()).
    subset = df[df["merged_label"] == category].dropna(subset=["processed"])
    if subset.empty:
        raise ValueError(
            f"Cannot augment category {category!r}: no rows with text found for it."
        )

    augmenters = [synonym_replacement, random_deletion, word_swap]
    new_samples = []
    for _ in range(num_samples):
        text = subset.sample(n=1).iloc[0]["processed"]
        augment = random.choice(augmenters)
        new_samples.append({"processed": augment(text), "merged_label": category})

    return pd.DataFrame(new_samples, columns=columns)

# Number of synthetic samples to generate for each under-represented category.
augmentation_targets = {
    "surprise": 1000,
    "sadness_depression": 1000,
    "neutral_belief": 500,
    "anger_disgust": 500,
    "anxiety_uncertainty": 200,
}
augmented_data = pd.concat(
    [augment_data(df, category, count) for category, count in augmentation_targets.items()],
    ignore_index=True,
)

# NOTE(review): augmentation happens before the train/test split performed
# later in this cell, so augmented variants of a text can end up on both sides
# of the split and inflate evaluation scores. Consider splitting first and
# augmenting only the training portion.

# Merge augmented data with original. ignore_index gives the combined frame a
# fresh 0..n-1 index; without it the augmented rows repeat index values that
# already exist in the original frame.
df = pd.concat([df, augmented_data], ignore_index=True)

# Create new label-to-ID mapping
new_labels = sorted(df["merged_label"].unique())  # Sort for consistency
label2id = {label: idx for idx, label in enumerate(new_labels)}
df["label"] = df["merged_label"].map(label2id)

# Print category distribution after augmentation
print("Updated Emotion Category Distribution:")
print(df["merged_label"].value_counts())

# Save the augmented dataset
df.to_csv("train_stockemo_augmented.csv", index=False)

# Optional: Display as a bar chart
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
df["merged_label"].value_counts().plot(kind="bar", color="skyblue", edgecolor="black")
plt.title("Emotion Category Distribution (After Augmentation)")
plt.xlabel("Emotion Category")
plt.ylabel("Number of Samples")
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Print new label mapping for reference
print("New Label Mapping:", label2id)

# Convert to Hugging Face Dataset (keeping only necessary columns).
# preserve_index=False keeps the pandas index out of the dataset columns.
dataset = Dataset.from_pandas(df[["processed", "label"]], preserve_index=False)

# Load RoBERTa tokenizer
MODEL_NAME = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of texts taken from the ``processed`` column.

    Texts are truncated to at most 512 tokens but deliberately NOT padded
    here: padding every example to 512 tokens makes each batch far larger
    than short texts need. Padding is left to the Trainer, which is handed
    the tokenizer and therefore pads each batch to its longest member.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            must contain a ``processed`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["processed"], truncation=True, max_length=512)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split dataset into training (80%) and test (20%). A fixed seed keeps the
# split identical across runs so reported metrics are comparable.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

# Remove text column after tokenization
tokenized_datasets = tokenized_datasets.remove_columns(["processed"])

print("Dataset successfully tokenized and split!")


Dataset Columns: Index(['id', 'date', 'ticker', 'emo_label', 'senti_label', 'original',
       'processed'],
      dtype='object')
Label Mapping: {'ambiguous': 0, 'amusement': 1, 'anger': 2, 'anxiety': 3, 'belief': 4, 'confusion': 5, 'depression': 6, 'disgust': 7, 'excitement': 8, 'optimism': 9, 'panic': 10, 'surprise': 11}


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

✅ Dataset successfully tokenized and split!


# Define Model & Training Config

In [2]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
import torch

# Load the model with one output per merged emotion category. Storing the
# id <-> label mappings in the model config makes the saved model report the
# category names instead of generic LABEL_<n> identifiers at inference time.
num_labels = len(label2id)  # Get total number of emotion labels
id2label = {idx: label for label, idx in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Move model to GPU if available
model = model.to(device)

# Define training arguments.
# `eval_strategy` is the current name of the deprecated `evaluation_strategy`
# argument; it must match `save_strategy` for load_best_model_at_end to work.
training_args = TrainingArguments(
    output_dir="./roberta_emotion",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # Use fp16 only if a CUDA GPU is available
)

# Define evaluation metrics (Accuracy & F1-score)
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``accuracy`` and ``f1``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    shared_kwargs = {"predictions": predicted_ids, "references": labels}

    accuracy_result = accuracy.compute(**shared_kwargs)
    f1_result = f1.compute(average="macro", **shared_kwargs)
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

# Train Model

In [3]:
import time
from transformers import Trainer, TrainerCallback
from tqdm import tqdm
from transformers import AutoTokenizer
import torch

# Custom progress bar callback
class AccuracyProgressBar(TrainerCallback):
    """Trainer callback showing a tqdm progress bar with an ETA and eval accuracy.

    Args:
        total_steps: Caller's estimate of the total number of optimizer steps.
            It is replaced by the Trainer's own step count once training
            starts, so an imprecise estimate only affects the initial display.
    """

    def __init__(self, total_steps):
        self.total_steps = total_steps  # Total number of training steps
        self.pbar = tqdm(total=total_steps, desc="Training Progress", unit="step")
        self.start_time = time.time()

    def on_train_begin(self, args, state, control, **kwargs):
        """Align the bar with the Trainer's step count and start the ETA clock."""
        # The Trainer knows the exact number of steps; prefer it over the
        # estimate supplied at construction time.
        if state.max_steps and state.max_steps > 0:
            self.total_steps = state.max_steps
            self.pbar.total = state.max_steps
            self.pbar.refresh()
        # Measure elapsed time from the start of training rather than from
        # the moment the callback object was created.
        self.start_time = time.time()

    def on_step_end(self, args, state, control, **kwargs):
        """Update after each training step (batch)."""
        elapsed_time = time.time() - self.start_time
        avg_time_per_step = elapsed_time / max(state.global_step, 1)
        # Clamp so the ETA never goes negative if the step count overshoots.
        remaining_steps = max(self.total_steps - state.global_step, 0)
        eta = avg_time_per_step * remaining_steps

        self.pbar.update(1)
        self.pbar.set_postfix(ETA=f"{eta:.2f}s")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Log accuracy in real-time during evaluation."""
        if metrics and "eval_accuracy" in metrics:
            acc = metrics["eval_accuracy"]
            tqdm.write(f"\n📊 Accuracy: {acc:.4f}")

    def on_train_end(self, args, state, control, **kwargs):
        """Close progress bar at the end of training."""
        self.pbar.close()

# The tokenizer loaded during preprocessing is reused here; reloading it is
# unnecessary, and tokenizers are not placed on a device.

# Estimate the number of training steps. Steps per epoch are rounded UP
# because the final, partially filled batch of each epoch is still a step.
# NOTE(review): this assumes a single device and no gradient accumulation,
# which matches the TrainingArguments configured in this notebook.
train_batch_size = training_args.per_device_train_batch_size
steps_per_epoch = (len(tokenized_datasets["train"]) + train_batch_size - 1) // train_batch_size
total_steps = int(steps_per_epoch * training_args.num_train_epochs)

# Initialize Trainer. The model was already moved to `device` when it was
# created, and the Trainer handles device placement itself.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    # Use processing_class instead of tokenizer to avoid deprecation warning
    processing_class=tokenizer,
)

# Attach the custom progress bar callback
trainer.add_callback(AccuracyProgressBar(total_steps))

# Start training
trainer.train()





  trainer = Trainer(


RuntimeError: MPS backend out of memory (MPS allocated: 8.91 GB, other allocations: 130.67 MB, max allowed: 9.07 GB). Tried to allocate 128.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

# Save and Test the Model

In [None]:
# Run a final evaluation pass on the held-out split and report the metrics.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be loaded as a self-contained model later on.
output_dir = "./roberta_emotion_model"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model saved successfully!")


# Testing

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the saved model directory, which
# holds both the fine-tuned weights and the tokenizer.
model_dir = "./roberta_emotion_model"
classifier = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

# Classify one example sentence as a quick sanity check.
text = "The stock market crash has caused widespread panic among investors."
result = classifier(text)

print("Prediction:", result)
