In [1]:
!pip install codecarbon

Collecting codecarbon
  Downloading codecarbon-2.7.4-py3-none-any.whl.metadata (8.7 kB)
Collecting fief-client[cli] (from codecarbon)
  Downloading fief_client-0.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting questionary (from codecarbon)
  Downloading questionary-2.0.1-py3-none-any.whl.metadata (5.4 kB)
Collecting rapidfuzz (from codecarbon)
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting jwcrypto<2.0.0,>=1.4 (from fief-client[cli]->codecarbon)
  Downloading jwcrypto-1.5.6-py3-none-any.whl.metadata (3.1 kB)
Collecting yaspin (from fief-client[cli]->codecarbon)
  Downloading yaspin-3.1.0-py3-none-any.whl.metadata (14 kB)
Collecting prompt_toolkit<=3.0.36,>=2.0 (from questionary->codecarbon)
  Downloading prompt_toolkit-3.0.36-py3-none-any.whl.metadata (7.0 kB)
Collecting termcolor<2.4.0,>=2.2.0 (from yaspin->fief-client[cli]->codecarbon)
  Downloading termcolor-2.3.0-py3-none-any.whl.metadata (5.3 kB)
Downloading

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import random
import torch
from datasets import Dataset
from sklearn.metrics import f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)
import matplotlib.pyplot as plt

In [3]:
# Load the dataset
splits = {'train': 'train.csv', 'test': 'test.csv'}
train_data = pd.read_csv("hf://datasets/holistic-ai/EMGSD/" + splits["train"])
test_data = pd.read_csv("hf://datasets/holistic-ai/EMGSD/" + splits["test"])

In [4]:
# Use a subset of the data for faster training
sample_ratio = 0.001
train_data = train_data.sample(frac=sample_ratio, random_state=42)
test_data = test_data.sample(frac=sample_ratio, random_state=42)

# Prepare train and test sets by using both training and testing data
X_train, y_train = train_data["text"].values.tolist(), train_data["category"].values.tolist()
X_test, y_test = test_data["text"].values.tolist(), test_data["category"].values.tolist()

In [5]:
# Function to compute Macro F1 score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average='macro')  # Use macro F1
    return {"f1": f1}

In [6]:
# Convert to Hugging Face dataset format
train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

# Map labels to IDs
label2id = {
    'stereotype': 0,
    'unrelated': 1,
    'neutral': 2,
}

id2label = {v: k for k, v in label2id.items()}

def map_labels(example):
    example['label'] = label2id[example['label']]
    return example

# Apply the mapping to your dataset
train_dataset = train_dataset.map(map_labels)
test_dataset = test_dataset.map(map_labels)

# Random Model Prediction
random.seed(42)
random_predictions = [random.choice(y_test) for _ in range(len(y_test))]

# Evaluate the model
f1 = f1_score(y_test, random_predictions, average='macro')
print(f"F1 Score: {f1}")

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

F1 Score: 0.35714285714285715


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from codecarbon import EmissionsTracker

# TF-IDF Vectorizer
X_train = train_dataset['text']
y_train = train_dataset['label']  
X_test = test_dataset['text']
y_test = test_dataset['label']

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression Model
model = LogisticRegression()

# Tracking emissions with CodeCarbon
tracker = EmissionsTracker()
tracker.start()

# Fit the model
model.fit(X_train_tfidf, y_train)

# Evaluate the model
predictions = model.predict(X_test_tfidf)
f1 = f1_score(y_test, predictions, average='macro')

emissions = tracker.stop()
print(f"F1 Score: {f1}")
print(f"Training carbon emissions: {emissions} kg")

[codecarbon INFO @ 11:08:48] [setup] RAM Tracking...
[codecarbon INFO @ 11:08:48] [setup] GPU Tracking...
[codecarbon INFO @ 11:08:48] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 11:08:48] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 11:08:49] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 11:08:49] >>> Tracker's metadata:
[codecarbon INFO @ 11:08:49]   Platform system: Linux-6.6.56+-x86_64-with-glibc2.35
[codecarbon INFO @ 11:08:49]   Python version: 3.10.14
[codecarbon INFO @ 11:08:49]   CodeCarbon version: 2.7.4
[codecarbon INFO @ 11:08:49]   Available RAM : 31.351 GB
[codecarbon INFO @ 11:08:49]   CPU count: 4
[codecarbon INFO @ 11:08:49]   CPU model: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 11:08:49]   GPU count: 2
[codecarbon INFO @ 11:08:49]   GPU model: 2 x Tesla T4
[codecarbon INFO @ 11:08:52] Saving emissions data to file

F1 Score: 0.17777777777777778
Training carbon emissions: 3.78075521780836e-07 kg


  df = pd.concat([df, pd.DataFrame.from_records([dict(total.values)])])


In [8]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import matplotlib.pyplot as plt


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

# Tokenization function
def tokenize_function(example):
    return tokenizer(example['text'], padding='max_length', truncation=True)
    
# Apply the tokenizer to the dataset
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]



Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [9]:
# Load pre-trained ALBERT model with classification head
#model = AutoModelForSequenceClassification.from_pretrained(
#    "albert-base-v2", 
#    num_labels=3, 
#    label2id=label2id,
#    id2label=id2label
#)

model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3,
    label2id=label2id,
    id2label=id2label
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # "mps" For macOS (Apple Silicon)
model.to(device)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [10]:
# Fine-tuning the model and save the best model
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    num_train_epochs=3,
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="f1",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,  # Use macro F1 computation
)

[codecarbon INFO @ 11:08:58] [setup] RAM Tracking...
[codecarbon INFO @ 11:08:58] [setup] GPU Tracking...
[codecarbon INFO @ 11:08:58] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 11:08:58] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at \sys\class\powercap\intel-rapl to measure CPU

[codecarbon INFO @ 11:08:59] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 11:08:59] >>> Tracker's metadata:
[codecarbon INFO @ 11:08:59]   Platform system: Linux-6.6.56+-x86_64-with-glibc2.35
[codecarbon INFO @ 11:08:59]   Python version: 3.10.14
[codecarbon INFO @ 11:08:59]   CodeCarbon version: 2.7.4
[codecarbon INFO @ 11:08:59]   Available RAM : 31.351 GB
[codecarbon INFO @ 11:08:59]   CPU count: 4
[codecarbon INFO @ 11:08:59]   CPU model: Intel(R) Xeon(R) CPU @ 2.00GHz
[codecarbon INFO @ 11:08:59]   GPU count: 2
[codecarbon INFO @ 11:08:59]   GPU model: 2 x Tesla T4
[codecarbon INFO @ 11:09:03] Saving emissions data to file

In [11]:
# Tracking emissions with CodeCarbon
tracker = EmissionsTracker()
tracker.start()

#trainer.train()

emissions = tracker.stop()

print()
print(f"Training carbon emissions: {emissions} kg")

[codecarbon ERROR @ 11:09:03] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.



Training carbon emissions: None kg


In [13]:
!pip install nlpaug

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting gdown>=4.0.0 (from nlpaug)
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown, nlpaug
Successfully installed gdown-5.2.0 nlpaug-1.1.11


In [14]:
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    TrainingArguments,
    AdamW,
    get_linear_schedule_with_warmup
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
from datasets import Dataset, concatenate_datasets
import pandas as pd
import re
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
import nlpaug.augmenter.word as naw

# **1. Preprocess Text**
def preprocess_text(examples):
    text = examples['text']
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    examples['text'] = text
    return examples

# Apply preprocessing
train_dataset = train_dataset.map(preprocess_text)
test_dataset = test_dataset.map(preprocess_text)

# **2. Convert to Pandas DataFrame for Oversampling**
train_df = train_dataset.to_pandas()

# Separate features and labels
X = train_df[['text']]
y = train_df['label']

# **3. Over-Sample Minority Classes**
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# Convert back to Hugging Face Dataset
resampled_df = pd.concat([X_resampled, y_resampled], axis=1)
resampled_dataset = Dataset.from_pandas(resampled_df)

# **4. Data Augmentation**
def augment_text(examples):
    aug = naw.SynonymAug()
    augmented_texts = [" ".join(text) if isinstance(text, list) else text for text in examples['text']]
    return {"text": augmented_texts, "label": examples['label']}

# Apply augmentation
augmented_dataset = resampled_dataset.map(augment_text, batched=True)

# Combine with resampled dataset
combined_dataset = concatenate_datasets([resampled_dataset, augmented_dataset])

# **5. Tokenization**
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train_dataset = combined_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# **6. Update Labels and Compute Class Weights**
train_labels = tokenized_train_dataset['label']

# Check class distribution
label_counts = Counter(train_labels)
print("Class distribution after oversampling and augmentation:", label_counts)

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = class_weights / class_weights.sum()
class_weights = torch.tensor(class_weights, dtype=torch.float)

# **7. Determine Device**
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# **8. Load Config and Customize Model**
label2id = {0: 'class_0', 1: 'class_1', 2: 'class_2'}
id2label = {v: k for k, v in label2id.items()}

config = AutoConfig.from_pretrained(
    "roberta-large",
    num_labels=3,
    label2id=label2id,
    id2label=id2label
)

from transformers import RobertaForSequenceClassification

class CustomRoberta(RobertaForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        # Customize the classifier
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(config.hidden_size, 512),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(512, config.num_labels)
        )

model = CustomRoberta.from_pretrained("roberta-large", config=config)
model.to(device)
class_weights = class_weights.to(device)

# **9. Define Training Arguments**
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=100,
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
)

# **10. Initialize Optimizer and Scheduler**
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)

total_steps = (
    len(tokenized_train_dataset) // training_args.per_device_train_batch_size
) * training_args.num_train_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=total_steps
)

# **11. Define Custom Trainer Class**
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to(device)
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Reshape logits to match the shape of labels
        logits = logits.view(-1, model.config.num_labels)
        labels = labels.view(-1)
        # Compute custom loss with class weights
        loss_fct = CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

    def training_step(self, model, inputs, num_items_in_batch=None):
        model.train()
        inputs = self._prepare_inputs(inputs)
        loss = self.compute_loss(model, inputs)
        return loss

# **12. Define Compute Metrics Function**
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(p.label_ids, preds),
        'f1': f1_score(p.label_ids, preds, average='macro')
    }

# **13. Initialize the Custom Trainer**
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/108 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Class distribution after oversampling and augmentation: Counter({2: 36, 1: 36, 0: 36})


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of CustomRoberta were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[codecarbon ERROR @ 11:10:38] Error: Another instance of codecarbon is already running. Turn off the other instance to be able to run this one. Exiting.


In [None]:
# Start training
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [None]:
from sklearn.metrics import f1_score

# Making predictions on the test set
preds = trainer.predict(tokenized_test_dataset).predictions.argmax(-1)
f1 = f1_score(tokenized_test_dataset['label'], preds, average='macro')
print(f"Macro F1 Score: {f1:.3f}")