In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import random
import torch
from datasets import Dataset
from sklearn.metrics import f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)
import matplotlib.pyplot as plt

In [3]:
# Load the dataset
splits = {'train': 'train.csv', 'test': 'test.csv'}
train_data = pd.read_csv("hf://datasets/holistic-ai/EMGSD/" + splits["train"])
test_data = pd.read_csv("hf://datasets/holistic-ai/EMGSD/" + splits["test"])

In [4]:
# Use a subset of the data for faster training
sample_ratio = 0.001
train_data = train_data.sample(frac=sample_ratio, random_state=42)
test_data = test_data.sample(frac=sample_ratio, random_state=42)

# Prepare train and test sets by using both training and testing data
X_train, y_train = train_data["text"].values.tolist(), train_data["category"].values.tolist()
X_test, y_test = test_data["text"].values.tolist(), test_data["category"].values.tolist()

In [5]:
# Function to compute Macro F1 score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average='macro')  # Use macro F1
    return {"f1": f1}

In [6]:
# Convert to Hugging Face dataset format
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

# Map labels to IDs
label2id = {
    'stereotype': 0,
    'unrelated': 1,
    'neutral': 2,
}

id2label = {v: k for k, v in label2id.items()}

def map_labels(example):
    example['label'] = label2id[example['label']]
    return example

# Apply the mapping to your dataset
train_dataset = train_dataset.map(map_labels)
test_dataset = test_dataset.map(map_labels)

# Random Model Prediction
random.seed(42)
random_predictions = [random.choice(y_test) for _ in range(len(y_test))]

# Evaluate the model
f1 = f1_score(y_test, random_predictions, average='macro')
print(f"F1 Score: {f1}")

Map: 100%|██████████| 46/46 [00:00<?, ? examples/s]
Map: 100%|██████████| 11/11 [00:00<?, ? examples/s]

F1 Score: 0.35714285714285715





In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from codecarbon import EmissionsTracker

# TF-IDF Vectorizer
X_train = train_dataset['text']
y_train = train_dataset['label']  
X_test = test_dataset['text']
y_test = test_dataset['label']

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression Model
model = LogisticRegression()

# Tracking emissions with CodeCarbon
tracker = EmissionsTracker()
tracker.start()

# Fit the model
model.fit(X_train_tfidf, y_train)

# Evaluate the model
predictions = model.predict(X_test_tfidf)
f1 = f1_score(y_test, predictions, average='macro')

emissions = tracker.stop()
print(f"F1 Score: {f1}")
print(f"Training carbon emissions: {emissions} kg")

[codecarbon INFO @ 16:47:19] [setup] RAM Tracking...
[codecarbon INFO @ 16:47:19] [setup] GPU Tracking...
[codecarbon INFO @ 16:47:19] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 16:47:19] [setup] CPU Tracking...
[codecarbon INFO @ 16:47:21] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i7-13700H
[codecarbon INFO @ 16:47:21] >>> Tracker's metadata:
[codecarbon INFO @ 16:47:21]   Platform system: Windows-10-10.0.22631-SP0
[codecarbon INFO @ 16:47:21]   Python version: 3.10.15
[codecarbon INFO @ 16:47:21]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 16:47:21]   Available RAM : 31.679 GB
[codecarbon INFO @ 16:47:21]   CPU count: 20
[codecarbon INFO @ 16:47:21]   CPU model: 13th Gen Intel(R) Core(TM) i7-13700H
[codecarbon INFO @ 16:47:21]   GPU count: 1
[codecarbon INFO @ 16:47:21]   GPU model: 1 x NVIDIA GeForce RTX 4060 Laptop GPU
[codecarbon INFO @ 16:47:24] Energy consumed for RAM : 0.000000 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 16:47:24]

F1 Score: 0.17777777777777778
Training carbon emissions: 5.0235711066259276e-08 kg


In [8]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import matplotlib.pyplot as plt


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

# Tokenization function
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)
    
# Apply the tokenizer to the dataset
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 46/46 [00:00<00:00, 2812.02 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 692.99 examples/s]


In [9]:
# Load pre-trained ALBERT model with classification head
#model = AutoModelForSequenceClassification.from_pretrained(
#    "albert-base-v2", 
#    num_labels=3, 
#    label2id=label2id,
#    id2label=id2label
#)

model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3,
    label2id=label2id,
    id2label=id2label
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # "mps" For macOS (Apple Silicon)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [10]:
# Fine-tuning the model and save the best model
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    num_train_epochs=3,
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="f1",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,  # Use macro F1 computation
)

  trainer = Trainer(
[codecarbon INFO @ 16:47:25] [setup] RAM Tracking...
[codecarbon INFO @ 16:47:25] [setup] GPU Tracking...
[codecarbon INFO @ 16:47:25] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 16:47:25] [setup] CPU Tracking...
[codecarbon INFO @ 16:47:26] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i7-13700H
[codecarbon INFO @ 16:47:26] >>> Tracker's metadata:
[codecarbon INFO @ 16:47:26]   Platform system: Windows-10-10.0.22631-SP0
[codecarbon INFO @ 16:47:26]   Python version: 3.10.15
[codecarbon INFO @ 16:47:26]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 16:47:26]   Available RAM : 31.679 GB
[codecarbon INFO @ 16:47:26]   CPU count: 20
[codecarbon INFO @ 16:47:26]   CPU model: 13th Gen Intel(R) Core(TM) i7-13700H
[codecarbon INFO @ 16:47:26]   GPU count: 1
[codecarbon INFO @ 16:47:26]   GPU model: 1 x NVIDIA GeForce RTX 4060 Laptop GPU


In [11]:
# Tracking emissions with CodeCarbon
tracker = EmissionsTracker()
tracker.start()

#trainer.train()

emissions = tracker.stop()

print()
print(f"Training carbon emissions: {emissions} kg")

[codecarbon INFO @ 16:47:30] [setup] RAM Tracking...
[codecarbon INFO @ 16:47:30] [setup] GPU Tracking...
[codecarbon INFO @ 16:47:30] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 16:47:30] [setup] CPU Tracking...
[codecarbon INFO @ 16:47:31] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i7-13700H
[codecarbon INFO @ 16:47:31] >>> Tracker's metadata:
[codecarbon INFO @ 16:47:31]   Platform system: Windows-10-10.0.22631-SP0
[codecarbon INFO @ 16:47:31]   Python version: 3.10.15
[codecarbon INFO @ 16:47:31]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 16:47:31]   Available RAM : 31.679 GB
[codecarbon INFO @ 16:47:31]   CPU count: 20
[codecarbon INFO @ 16:47:31]   CPU model: 13th Gen Intel(R) Core(TM) i7-13700H
[codecarbon INFO @ 16:47:31]   GPU count: 1
[codecarbon INFO @ 16:47:31]   GPU model: 1 x NVIDIA GeForce RTX 4060 Laptop GPU
[codecarbon INFO @ 16:47:34] Energy consumed for RAM : 0.000000 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 16:47:34]


Training carbon emissions: 2.751324741017056e-08 kg


In [12]:
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    TrainingArguments,
    AdamW,
    get_linear_schedule_with_warmup
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
from datasets import Dataset, concatenate_datasets
import pandas as pd
import re
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
import nlpaug.augmenter.word as naw

# **1. Preprocess Text**
def preprocess_text(examples):
    text = examples['text']
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    examples['text'] = text
    return examples

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_text)
test_dataset = test_dataset.map(preprocess_text)

# **2. Convert to Pandas DataFrame for Oversampling**
train_df = train_dataset.to_pandas()

# Separate features and labels
X = train_df[['text']]
y = train_df['label']

# **3. Over-Sample Minority Classes**
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Convert back to Hugging Face Dataset
resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
resampled_dataset = Dataset.from_pandas(resampled_df)

# **4. Data Augmentation**
def augment_text(examples):
    aug = naw.SynonymAug()
    augmented_texts = [" ".join(text) if isinstance(text, list) else text for text in examples['text']]
    return {"text": augmented_texts, "label": examples['label']}

# Apply augmentation
augmented_dataset = resampled_dataset.map(augment_text, batched=True)

# Combine with resampled dataset
combined_dataset = concatenate_datasets([resampled_dataset, augmented_dataset])

# **5. Tokenization**
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train_dataset = combined_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# **6. Update Labels and Compute Class Weights**
train_labels = tokenized_train_dataset['label']

# Check class distribution
label_counts = Counter(train_labels)
print("Class distribution after oversampling and augmentation:", label_counts)

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = class_weights / class_weights.sum()
class_weights = torch.tensor(class_weights, dtype=torch.float)

# **7. Determine Device**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **8. Load Config and Customize Model**
label2id = {0: 'class_0', 1: 'class_1', 2: 'class_2'}
id2label = {v: k for k, v in label2id.items()}

config = AutoConfig.from_pretrained(
    "roberta-large",
    num_labels=3,
    label2id=label2id,
    id2label=id2label
)

from transformers import RobertaForSequenceClassification

class CustomRoberta(RobertaForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        # Customize the classifier
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(config.hidden_size, 512),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(512, config.num_labels)
        )

model = CustomRoberta.from_pretrained("roberta-large", config=config)
model.to(device)
class_weights = class_weights.to(device)

# **9. Define Training Arguments**
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=100,
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
)

# **10. Initialize Optimizer and Scheduler**
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)

total_steps = (
    len(tokenized_train_dataset) // training_args.per_device_train_batch_size
) * training_args.num_train_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=total_steps
)

# **11. Define Custom Trainer Class**
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to(device)
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Reshape logits to match the shape of labels
        logits = logits.view(-1, model.config.num_labels)
        labels = labels.view(-1)
        # Compute custom loss with class weights
        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

    def training_step(self, model, inputs, num_items_in_batch=None):
        model.train()
        inputs = self._prepare_inputs(inputs)
        loss = self.compute_loss(model, inputs)
        return loss

# **12. Define Compute Metrics Function**
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds, average='macro')
    }

# **13. Initialize the Custom Trainer**
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)


Map: 100%|██████████| 46/46 [00:00<00:00, 19253.37 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 1067.23 examples/s]
Map: 100%|██████████| 54/54 [00:02<00:00, 22.87 examples/s]
Map: 100%|██████████| 108/108 [00:00<00:00, 5777.65 examples/s]
Map: 100%|██████████| 11/11 [00:00<?, ? examples/s]


Class distribution after oversampling and augmentation: Counter({2: 36, 1: 36, 0: 36})


Some weights of CustomRoberta were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = CustomTrainer(
[codecarbon INFO @ 16:47:37] [setup] RAM Tracking...
[codecarbon INFO @ 16:47:37] [setup] GPU Tracking...
[codecarbon INFO @ 16:47:37] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 16:47:37] [setup] CPU Tracking...
[codecarbon INFO @ 16:47:39] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i7-13700H
[codecarbon INFO @ 16:47:39] >>> Tracker's metadata:
[codecarbon INFO @ 16:47:39]   Platform system: Windows-10-10.0.22631-SP0
[codecarbon INFO @ 16:47:39]   Python version: 3.10.15
[codecarbon INFO @ 16:47:39]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 16:47:39]   Available RAM : 31.679 GB
[codecarbon INFO @ 16:47:39]   

In [None]:
# Start training
trainer.train()

  0%|          | 0/70 [00:00<?, ?it/s]

ValueError: Expected input batch_size (4096) to match target batch_size (8).

[codecarbon INFO @ 16:47:57] Energy consumed for RAM : 0.000050 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 16:47:57] Energy consumed for all GPUs : 0.000030 kWh. Total GPU Power : 7.231286163651481 W
[codecarbon INFO @ 16:47:57] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 16:47:57] 0.000257 kWh of electricity used since the beginning.
[codecarbon INFO @ 16:48:12] Energy consumed for RAM : 0.000099 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 16:48:12] Energy consumed for all GPUs : 0.000062 kWh. Total GPU Power : 7.75361859759768 W
[codecarbon INFO @ 16:48:12] Energy consumed for all CPUs : 0.000354 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 16:48:12] 0.000516 kWh of electricity used since the beginning.
[codecarbon INFO @ 16:48:27] Energy consumed for RAM : 0.000148 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 16:48:27] Energy consumed for all GPUs : 0.000097 kWh. Total GPU Power : 8.2896173270296

In [None]:
from sklearn.metrics import f1_score

# Making predictions on the test set
preds = trainer.predict(tokenized_test_dataset).predictions.argmax(-1)
f1 = f1_score(tokenized_test_dataset['label'], preds, average='macro')
print(f"Macro F1 Score: {f1:.3f}")

[codecarbon INFO @ 15:36:43] Energy consumed for RAM : 0.001484 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 15:36:43] Energy consumed for all GPUs : 0.001217 kWh. Total GPU Power : 6.149029723460931 W
[codecarbon INFO @ 15:36:43] Energy consumed for all CPUs : 0.005317 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 15:36:43] 0.008018 kWh of electricity used since the beginning.
100%|██████████| 2/2 [00:00<00:00,  3.24it/s]

Macro F1 Score: 0.208





[codecarbon INFO @ 15:36:49] Energy consumed for RAM : 0.001732 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 15:36:49] Energy consumed for all GPUs : 0.001377 kWh. Total GPU Power : 7.0567263850462405 W
[codecarbon INFO @ 15:36:49] Energy consumed for all CPUs : 0.006202 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 15:36:49] 0.009311 kWh of electricity used since the beginning.
[codecarbon INFO @ 15:36:58] Energy consumed for RAM : 0.001534 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 15:36:58] Energy consumed for all GPUs : 0.001250 kWh. Total GPU Power : 7.9407787181463885 W
[codecarbon INFO @ 15:36:58] Energy consumed for all CPUs : 0.005494 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 15:36:58] 0.008278 kWh of electricity used since the beginning.
[codecarbon INFO @ 15:37:04] Energy consumed for RAM : 0.001781 kWh. RAM Power : 11.879536628723145 W
[codecarbon INFO @ 15:37:04] Energy consumed for all GPUs : 0.001405 kWh. Total GPU Power : 6.6732879261