## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
)
import warnings
warnings.filterwarnings('ignore')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: NVIDIA GeForce GTX 1650 with Max-Q Design


## 2. Improved Text Cleaning

In [3]:
def improved_text_cleaning(text):
    if pd.isna(text):
        return ""

    text = str(text)

    text = re.sub(r'http\S+|www\S+|https\S+', '[URL]', text, flags=re.MULTILINE)

    text = re.sub(r'@(\w+)', r'[USER]', text)

    text = re.sub(r'#(\w+)', r'\1', text)

    text = re.sub(r'(\!)\1{4,}', r'!!!', text)
    text = re.sub(r'(\?)\1{4,}', r'???', text)
    text = re.sub(r'(\.)\{4,}', r'...', text)

    text = re.sub(r'\s+', ' ', text)

    return text.strip()

## 3. Load and Prepare Data

In [12]:
df_train = pd.read_csv('iSarcasmEval_EN/train.En.csv', index_col=0)
df_test = pd.read_csv('iSarcasmEval_EN/task_A_En_test.csv')

print(f"Training data shape: {df_train.shape}")
print(f"Test data shape: {df_test.shape} \n")


class_counts = df_train['sarcastic'].value_counts()
print(f"Non-Sarcastic (0): {class_counts[0]} ({class_counts[0]/len(df_train)*100:.1f}%)")
print(f"Sarcastic (1):     {class_counts[1]} ({class_counts[1]/len(df_train)*100:.1f}%)")
print(f"\nImbalance Ratio: {class_counts[0]/class_counts[1]:.2f}")

Training data shape: (3468, 9)
Test data shape: (1400, 2) 

Non-Sarcastic (0): 2601 (75.0%)
Sarcastic (1):     867 (25.0%)

Imbalance Ratio: 3.00


In [14]:
df_train['text_cleaned'] = df_train['tweet'].apply(improved_text_cleaning)
df_test['text_cleaned'] = df_test['tweet'].apply(improved_text_cleaning)

## 4. Create Dataset Class with Tokenization

In [15]:
class SarcasmDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

## 5. Configuration - Model and Training Parameters

In [17]:
CONFIG = {
    'model_name': 'roberta-base',  
    
    'num_epochs': 5, 
    'batch_size': 16,  
    'learning_rate': 1e-5, 
    'weight_decay': 0.01,
    'warmup_ratio': 0.1,
    'max_length': 128,

    'use_focal_loss': True,
    'focal_alpha': [0.25, 0.75],    
    'focal_gamma': 2.0,

    'validation_split': 0.2,
    'random_seed': 42,

}

## 6. Focal Loss Implementation

In [18]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=[0.25, 0.75], gamma=2.0):
        super(FocalLoss, self).__init__()
        if isinstance(alpha, list):
            self.alpha = torch.tensor(alpha, dtype=torch.float32)
        else:
            self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
    
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)

        alpha_t = self.alpha.to(inputs.device)[targets]

        focal_loss = alpha_t * (1 - pt) ** self.gamma * ce_loss

        return focal_loss.mean()

## 7. Custom Trainer with Focal Loss

In [19]:
class FocalLossTrainer(Trainer):
  
    def __init__(self, *args, focal_loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.focal_loss_fn = focal_loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
     
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        if self.focal_loss_fn is not None:
            loss = self.focal_loss_fn(logits, labels)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss


def compute_metrics(eval_pred):
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    f1 = f1_score(labels, predictions, pos_label=1)
    precision = precision_score(labels, predictions, pos_label=1, zero_division=0)
    recall = recall_score(labels, predictions, pos_label=1)
    accuracy = accuracy_score(labels, predictions)

    f1_macro = f1_score(labels, predictions, average='macro')

    return {
        'f1_sarcastic': f1,           
        'f1_macro': f1_macro,         
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
    }

## 8. Prepare Data for Training

In [25]:
X_train_full = df_train['text_cleaned'].values
y_train_full = df_train['sarcastic'].values

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=CONFIG['validation_split'],
    random_state=CONFIG['random_seed'],
    stratify=y_train_full
)

X_test = df_test['text_cleaned'].values
y_test = df_test['sarcastic'].values

print(f"Training samples:   {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples:       {len(X_test)}")

print("\nClass distribution in splits:")
print(f"Train - Non-sarcastic: {(y_train==0).sum()},     Sarcastic: {(y_train==1).sum()}")
print(f"Val   - Non-sarcastic: {(y_val==0).sum()},      Sarcastic: {(y_val==1).sum()}")
print(f"Test  - Non-sarcastic: {(y_test==0).sum()},     Sarcastic: {(y_test==1).sum()}")

Training samples:   2774
Validation samples: 694
Test samples:       1400

Class distribution in splits:
Train - Non-sarcastic: 2080,     Sarcastic: 694
Val   - Non-sarcastic: 521,      Sarcastic: 173
Test  - Non-sarcastic: 1200,     Sarcastic: 200


## 9. Load Model and Tokenizer

In [27]:
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['model_name'],
    num_labels=2
)

model = model.to(device)

train_dataset = SarcasmDataset(X_train, y_train, tokenizer, CONFIG['max_length'])
val_dataset = SarcasmDataset(X_val, y_val, tokenizer, CONFIG['max_length'])
test_dataset = SarcasmDataset(X_test, y_test, tokenizer, CONFIG['max_length'])

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 10. Setup Training with Focal Loss

In [30]:
if CONFIG['use_focal_loss']:
    focal_loss = FocalLoss(
        alpha=CONFIG['focal_alpha'],
        gamma=CONFIG['focal_gamma']
    )
    print(f"   Alpha: {CONFIG['focal_alpha']} [non-sarcastic, sarcastic]")
    print(f"   Gamma: {CONFIG['focal_gamma']}")
else:
    focal_loss = None

training_args = TrainingArguments(
    num_train_epochs=CONFIG['num_epochs'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'] * 2,
    learning_rate=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
    warmup_ratio=CONFIG['warmup_ratio'],

    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,

    load_best_model_at_end=True,
    metric_for_best_model="f1_sarcastic",  
    greater_is_better=True,

    fp16=torch.cuda.is_available(),
    dataloader_num_workers=0,

    save_total_limit=2,
    seed=CONFIG['random_seed'],
    report_to="none"
)

trainer = FocalLossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    focal_loss_fn=focal_loss
)

   Alpha: [0.25, 0.75] [non-sarcastic, sarcastic]
   Gamma: 2.0


## 11. Train the Model

In [None]:
train_result = trainer.train()


print(f"Training loss: {train_result.training_loss:.4f}")

val_results = trainer.evaluate()
print("\nValidation Results:")
print(f"  F1 (Sarcastic):  {val_results['eval_f1_sarcastic']:.4f} ({val_results['eval_f1_sarcastic']*100:.2f}%)")
print(f"  F1 (Macro):      {val_results['eval_f1_macro']:.4f}")
print(f"  Precision:       {val_results['eval_precision']:.4f}")
print(f"  Recall:          {val_results['eval_recall']:.4f}")
print(f"  Accuracy:        {val_results['eval_accuracy']:.4f}")

Epoch,Training Loss,Validation Loss


## 12. Test Set Evaluation (Default Threshold 0.5)

In [None]:
model.eval()
all_probabilities = []
all_predictions = []

with torch.no_grad():
    for i in range(0, len(test_dataset), CONFIG['batch_size'] * 2):
        batch_indices = range(i, min(i + CONFIG['batch_size'] * 2, len(test_dataset)))

        batch_input_ids = torch.stack([test_dataset[j]['input_ids'] for j in batch_indices]).to(device)
        batch_attention_mask = torch.stack([test_dataset[j]['attention_mask'] for j in batch_indices]).to(device)

        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        probs = F.softmax(outputs.logits, dim=1)

        all_probabilities.extend(probs.cpu().numpy())
        preds = torch.argmax(probs, dim=1).cpu().numpy()
        all_predictions.extend(preds)

all_probabilities = np.array(all_probabilities)

f1_sarc = f1_score(y_test, all_predictions, pos_label=1)
precision_sarc = precision_score(y_test, all_predictions, pos_label=1)
recall_sarc = recall_score(y_test, all_predictions, pos_label=1)
accuracy = accuracy_score(y_test, all_predictions)

print(f"\nResults with threshold 0.5:")
print(f"  F1 (Sarcastic):  {f1_sarc:.4f} ({f1_sarc*100:.2f}%)")
print(f"  Precision:       {precision_sarc:.4f}")
print(f"  Recall:          {recall_sarc:.4f}")
print(f"  Accuracy:        {accuracy:.4f}")

print(classification_report(
    y_test,
    all_predictions,
    target_names=['Non-Sarcastic (0)', 'Sarcastic (1)'],
    digits=4
))

TEST SET EVALUATION (Threshold 0.5)

Results with threshold 0.5:
  F1 (Sarcastic):  0.4150 (41.50%)
  Precision:       0.2876
  Recall:          0.7450
  Accuracy:        0.7000

Classification Report:
                   precision    recall  f1-score   support

Non-Sarcastic (0)     0.9422    0.6925    0.7983      1200
    Sarcastic (1)     0.2876    0.7450    0.4150       200

         accuracy                         0.7000      1400
        macro avg     0.6149    0.7188    0.6067      1400
     weighted avg     0.8487    0.7000    0.7435      1400


Confusion Matrix:
  True Negatives:  831
  False Positives: 369
  False Negatives: 51
  True Positives:  149

Results with threshold 0.5:
  F1 (Sarcastic):  0.4150 (41.50%)
  Precision:       0.2876
  Recall:          0.7450
  Accuracy:        0.7000

Classification Report:
                   precision    recall  f1-score   support

Non-Sarcastic (0)     0.9422    0.6925    0.7983      1200
    Sarcastic (1)     0.2876    0.7450    0.41

## 13. Threshold Optimization for Maximum F1

In [None]:
thresholds = np.arange(0.1, 0.9, 0.05)
best_threshold = 0.5
best_f1 = 0
threshold_results = []

print(f"{'Threshold':<12} {'F1':<10} {'Precision':<12} {'Recall':<10} {'Accuracy':<10}")
print("-" * 60)

for threshold in thresholds:
    preds = (all_probabilities[:, 1] >= threshold).astype(int)

    f1 = f1_score(y_test, preds, pos_label=1)
    precision = precision_score(y_test, preds, pos_label=1, zero_division=0)
    recall = recall_score(y_test, preds, pos_label=1)
    acc = accuracy_score(y_test, preds)

    threshold_results.append({
        'threshold': threshold,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'accuracy': acc
    })

    if f1 > best_f1:
        best_f1 = f1
        best_threshold = threshold

    marker = " ‚Üê BEST" if threshold == best_threshold and f1 == best_f1 else ""
    print(f"{threshold:<12.2f} {f1:<10.4f} {precision:<12.4f} {recall:<10.4f} {acc:<10.4f}{marker}")


print(f"OPTIMAL THRESHOLD: {best_threshold:.2f}")
print(f"Maximum F1-Score: {best_f1:.4f} ({best_f1*100:.2f}%)")

THRESHOLD OPTIMIZATION
Finding optimal threshold to maximize F1 score for SARCASTIC class...

Threshold    F1         Precision    Recall     Accuracy  
------------------------------------------------------------
0.10         0.2782     0.1622       0.9750     0.2771     ‚Üê BEST
0.15         0.2966     0.1758       0.9500     0.3564     ‚Üê BEST
0.20         0.3112     0.1871       0.9250     0.4150     ‚Üê BEST
0.25         0.3257     0.1993       0.8900     0.4736     ‚Üê BEST
0.30         0.3448     0.2147       0.8750     0.5250     ‚Üê BEST
0.35         0.3571     0.2267       0.8400     0.5679     ‚Üê BEST
0.40         0.3744     0.2426       0.8200     0.6086     ‚Üê BEST
0.45         0.3865     0.2575       0.7750     0.6486     ‚Üê BEST
0.50         0.4150     0.2876       0.7450     0.7000     ‚Üê BEST
0.55         0.4319     0.3113       0.7050     0.7350     ‚Üê BEST
0.60         0.4355     0.3275       0.6500     0.7593     ‚Üê BEST
0.65         0.4624     0.3762       0

## 14. Final Results with Optimal Threshold

In [None]:
final_predictions = (all_probabilities[:, 1] >= best_threshold).astype(int)

final_f1 = f1_score(y_test, final_predictions, pos_label=1)
final_precision = precision_score(y_test, final_predictions, pos_label=1)
final_recall = recall_score(y_test, final_predictions, pos_label=1)
final_accuracy = accuracy_score(y_test, final_predictions)
final_f1_macro = f1_score(y_test, final_predictions, average='macro')


print(f"FINAL RESULTS (Optimal Threshold: {best_threshold:.2f})")
print(f"\nPerformance Metrics:")
print(f"  F1 (Sarcastic):  {final_f1:.4f} ({final_f1*100:.2f}%)")
print(f"  F1 (Macro):      {final_f1_macro:.4f} ({final_f1_macro*100:.2f}%)")
print(f"  Precision:       {final_precision:.4f}")
print(f"  Recall:          {final_recall:.4f}")
print(f"  Accuracy:        {final_accuracy:.4f}")


print(classification_report(
    y_test,
    final_predictions,
    target_names=['Non-Sarcastic (0)', 'Sarcastic (1)'],
    digits=4
))


minority_recall = final_cm[1][1] / (final_cm[1][0] + final_cm[1][1])
minority_precision = final_cm[1][1] / (final_cm[0][1] + final_cm[1][1]) if (final_cm[0][1] + final_cm[1][1]) > 0 else 0

print(f"\nüí° Minority Class Analysis:")
print(f"  Sarcastic samples in test: {final_cm[1][0] + final_cm[1][1]}")
print(f"  Correctly identified: {final_cm[1][1]} ({minority_recall*100:.1f}%)")
print(f"  Precision: {minority_precision*100:.1f}%")

FINAL RESULTS (Optimal Threshold: 0.70)

üìä Performance Metrics:
  F1 (Sarcastic):  0.4967 (49.67%) ‚≠ê
  F1 (Macro):      0.6995 (69.95%)
  Precision:       0.4431
  Recall:          0.5650
  Accuracy:        0.8364

Detailed Classification Report:
                   precision    recall  f1-score   support

Non-Sarcastic (0)     0.9240    0.8817    0.9023      1200
    Sarcastic (1)     0.4431    0.5650    0.4967       200

         accuracy                         0.8364      1400
        macro avg     0.6836    0.7233    0.6995      1400
     weighted avg     0.8553    0.8364    0.8444      1400


üìä Confusion Matrix:
  True Negatives:  1058 (correct non-sarcastic)
  False Positives: 142 (non-sarcastic classified as sarcastic)
  False Negatives: 87 (sarcastic classified as non-sarcastic)
  True Positives:  113 (correct sarcastic) ‚≠ê

üí° Minority Class Analysis:
  Sarcastic samples in test: 200
  Correctly identified: 113 (56.5%)
  Precision: 44.3%
