In [1]:
!pip install -q transformers torch scikit-learn


In [2]:
import pandas as pd
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [3]:

# Dataset B — labeled: rows lacking either the tweet text or its label are discarded
train_df = pd.read_csv("/content/Arabic.csv").dropna(subset=["tweet", "label"])

# Dataset A — unlabeled: rows lacking the text are discarded
real_df = pd.read_csv("/content/merged_twitterdata.csv").dropna(subset=["text"])


# Task
Fix the `NameError` in the `label_map` application by changing `df_train` to `train_df` and apply the label mapping to the `train_df` DataFrame.

## Requirement Analysis & Data Organization

### Subtask:
Correcting the label mapping application for the labeled dataset (`train_df`) and preparing both the labeled and unlabeled datasets (`real_df`) for subsequent steps. This includes fixing the `NameError` in the `label_map` application and ensuring data readiness as per the methodology's first step.


**Reasoning**:
An earlier version of this cell raised a `NameError` because it referred to `df_train` instead of `train_df`. The cell below uses the correct DataFrame name: it normalises the `label` strings (trimmed and lower-cased), maps them through `label_map`, and stores the resulting integer ids in a new `label_id` column.



In [4]:
# Mapping from the normalised label strings to the integer class ids used for training.
label_map = {
    "not": 0,
    "offensive": 1
}

# Trim whitespace and lower-case first so that variants such as " Offensive "
# still match a key of label_map.
train_df["label"] = train_df["label"].astype(str).str.strip().str.lower()
train_df["label_id"] = train_df["label"].map(label_map)

print(train_df[["label", "label_id"]].head())
print("NaN in label_id:", train_df["label_id"].isna().sum())
print("Label counts:\n", train_df["label"].value_counts())

# Fail fast on labels that are not in label_map. They map to NaN, which turns
# label_id into a float column and breaks the integer-based steps that follow
# (stratified split, np.bincount, torch.long label tensors).
unmapped_labels = train_df.loc[train_df["label_id"].isna(), "label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels missing from label_map: {sorted(unmapped_labels)}")

# Pin the dtype explicitly so downstream code always receives integer class ids.
train_df["label_id"] = train_df["label_id"].astype("int64")


       label  label_id
0  offensive         1
1  offensive         1
2  offensive         1
3  offensive         1
4  offensive         1
NaN in label_id: 0
Label counts:
 label
not          7364
offensive    3867
Name: count, dtype: int64


# Stratified Train–Validation–Test Split (70/15/15) to Detect Overfitting


In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# Inputs are the raw tweet strings; targets are the integer class ids.
X = train_df["tweet"].values
y = train_df["label_id"].values

# First cut: keep 70% for training and hold out 30%, preserving the class ratio.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Second cut: halve the held-out part into validation and test (15% each overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

for split_name, split_texts in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    print(f"{split_name} size:", len(split_texts))

print("\nClass distribution:")
for split_name, split_labels in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"{split_name}:", np.bincount(split_labels))


Train size: 7861
Val size: 1685
Test size: 1685

Class distribution:
Train: [5154 2707]
Val: [1105  580]
Test: [1105  580]


# Setup LLM Environment and Load Pre-trained Model



In [6]:
!pip install -q transformers torch scikit-learn


In [7]:
import warnings

# Install a catch-all "ignore" filter so library warnings do not clutter cell output.
warnings.simplefilter("ignore")

print("Environment ready: Transformers, Torch, and Scikit-learn")


Environment ready: Transformers, Torch, and Scikit-learn


**Reasoning**:
Now that the necessary libraries are installed, the next step is to import `AutoTokenizer` and `AutoModelForSequenceClassification` from the `transformers` library and then load a pre-trained Arabic transformer model and its tokenizer, as per the subtask instructions.



# Load Pre-trained Arabic BERT Model and Tokenizer


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Seed PyTorch's global RNG before the model is built. The classification head is
# not part of the checkpoint and is initialised randomly, so without a seed its
# starting weights (and every later draw from the global generator) differ per run.
torch.manual_seed(42)

# Pre-trained Arabic BERT checkpoint that is fine-tuned in this notebook
model_name = "aubmindlab/bert-base-arabertv2"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Tokenizer for {model_name} loaded successfully.")

# Load model for binary sequence classification. id2label / label2id are written
# into the model config so the class names are saved together with the weights
# instead of the generic LABEL_0 / LABEL_1 placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "not", 1: "offensive"},
    label2id={"not": 0, "offensive": 1},
)
print(f"Model for {model_name} loaded successfully.")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Model moved to device: {device}")


tokenizer_config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Tokenizer for aubmindlab/bert-base-arabertv2 loaded successfully.


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model for aubmindlab/bert-base-arabertv2 loaded successfully.
Model moved to device: cuda


# Handle Class Imbalance (Train Only)


In [9]:
import numpy as np
import torch
from torch.utils.data import WeightedRandomSampler

# Inverse-frequency weighting computed from the training labels only.
class_counts = np.bincount(y_train)                 # number of samples per class id
class_weights = 1.0 / class_counts                  # the rarer class gets the larger weight
sample_weights = np.take(class_weights, y_train)    # class weight looked up per sample

# One pass draws as many indices as there are training samples, with replacement.
train_sampler = WeightedRandomSampler(
    torch.tensor(sample_weights, dtype=torch.double),
    len(sample_weights),
    True,
)

print("Train class counts:", class_counts)
print("Train class weights:", class_weights)
print("Balanced sampler ready ")


Train class counts: [5154 2707]
Train class weights: [0.00019402 0.00036941]
Balanced sampler ready 


# Tokenize Data and Build Datasets


In [10]:
from torch.utils.data import TensorDataset

max_length = 128

# Tokenizer options shared by all three splits: pad, truncate to max_length tokens,
# and return PyTorch tensors.
encode_kwargs = dict(
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt",
)

train_encodings = tokenizer(list(X_train), **encode_kwargs)
val_encodings = tokenizer(list(X_val), **encode_kwargs)
test_encodings = tokenizer(list(X_test), **encode_kwargs)

# Class ids as int64 tensors
y_train_t = torch.tensor(y_train, dtype=torch.long)
y_val_t = torch.tensor(y_val, dtype=torch.long)
y_test_t = torch.tensor(y_test, dtype=torch.long)

# Each dataset item is the triple (input_ids, attention_mask, label).
train_dataset = TensorDataset(
    train_encodings["input_ids"], train_encodings["attention_mask"], y_train_t
)
val_dataset = TensorDataset(
    val_encodings["input_ids"], val_encodings["attention_mask"], y_val_t
)
test_dataset = TensorDataset(
    test_encodings["input_ids"], test_encodings["attention_mask"], y_test_t
)

print("Datasets ready:", len(train_dataset), len(val_dataset), len(test_dataset))


Datasets ready: 7861 1685 1685


# Create DataLoaders


In [11]:
from torch.utils.data import DataLoader

batch_size = 16

# Training batches are drawn through the class-balancing sampler.
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

# The evaluation loaders iterate their datasets in order (no shuffling).
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)

print("DataLoaders ready:", len(train_loader), len(val_loader), len(test_loader))


DataLoaders ready: 492 106 106


# Training Loop with Validation


In [12]:
from torch.optim import AdamW
from tqdm import tqdm

# AdamW over every model parameter (encoder and classification head alike)
optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 3

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")

    # ----- Training pass: one optimizer step per batch -----
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc="Training"):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses
    avg_train_loss = train_loss / len(train_loader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # ----- Validation pass: no gradient tracking, no parameter updates -----
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs.loss, outputs.logits

            val_loss += loss.item()

            # Predicted class = index of the largest logit per row
            preds = logits.argmax(dim=1)
            correct += torch.eq(preds, labels).sum().item()
            total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct / total

    print(f"Validation loss: {avg_val_loss:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")



Epoch 1/3


Training: 100%|██████████| 492/492 [02:10<00:00,  3.76it/s]


Average training loss: 0.2918


Validation: 100%|██████████| 106/106 [00:08<00:00, 13.09it/s]


Validation loss: 0.1705
Validation accuracy: 0.9329

Epoch 2/3


Training: 100%|██████████| 492/492 [02:11<00:00,  3.75it/s]


Average training loss: 0.1384


Validation: 100%|██████████| 106/106 [00:08<00:00, 13.13it/s]


Validation loss: 0.1708
Validation accuracy: 0.9407

Epoch 3/3


Training: 100%|██████████| 492/492 [02:11<00:00,  3.73it/s]


Average training loss: 0.0913


Validation: 100%|██████████| 106/106 [00:08<00:00, 13.07it/s]

Validation loss: 0.1632
Validation accuracy: 0.9424





# Final Evaluation on Test Set


In [13]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

model.eval()

all_preds = []
all_labels = []

# Inference only: gradients are disabled and the labels are not passed to the model.
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Accuracy = fraction of positions where prediction and gold label agree
test_accuracy = (np.asarray(all_preds) == np.asarray(all_labels)).mean()
print(f"Test Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1
class_names = ["not", "offensive"]
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=class_names))

print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_preds))


Test Accuracy: 0.9407

Classification Report:
              precision    recall  f1-score   support

         not       0.98      0.93      0.95      1105
   offensive       0.88      0.96      0.92       580

    accuracy                           0.94      1685
   macro avg       0.93      0.94      0.94      1685
weighted avg       0.94      0.94      0.94      1685


Confusion Matrix:
[[1029   76]
 [  24  556]]


# Optional: Inference on Unlabeled Data (real_df)


## Tokenize Unlabeled Data


In [14]:
import torch

max_length = 128  # keep consistent with training

# Cast every cell to str before handing the texts to the tokenizer.
real_texts = list(real_df["text"].astype(str))

real_encodings = tokenizer(
    real_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
)

print("Real data tokenized ")
for tensor_name in ("input_ids", "attention_mask"):
    print(f"{tensor_name}:", real_encodings[tensor_name].shape)


Real data tokenized 
input_ids: torch.Size([218, 128])
attention_mask: torch.Size([218, 128])


## Predict Labels for real_df


In [15]:
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

model.eval()

# Unlabeled data: each dataset item is just (input_ids, attention_mask).
real_dataset = TensorDataset(real_encodings["input_ids"], real_encodings["attention_mask"])
real_loader = DataLoader(real_dataset, batch_size=32, shuffle=False)

all_real_preds = []

with torch.no_grad():
    for batch in real_loader:
        input_ids, attention_mask = (t.to(device) for t in batch)

        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        all_real_preds.extend(logits.argmax(dim=1).cpu().numpy())

# Attach the numeric predictions, then translate them back to label names.
id2label = {0: "not", 1: "offensive"}
real_df["pred_id"] = all_real_preds
real_df["pred_label"] = real_df["pred_id"].map(id2label)

print("Predictions added ")
print(real_df[["text", "pred_label"]].head())
print("\nPred label counts:")
print(real_df["pred_label"].value_counts())


Predictions added 
                                                text pred_label
0  كلب سئم من نباح كلب صغير وقام رمية في حمام الس...        not
1  يا ابن الحمير انا ما غلطت عليك ولا سبيتك لكن ا...        not
2  بعد فضيحة سرقة لوحاتها .. الفنانة الدنماركية ل...  offensive
3                   طيب والله فضيحة لو ماحسبها بلنتي        not
4                                           كلب ومات        not

Pred label counts:
pred_label
not          183
offensive     35
Name: count, dtype: int64


# Human-Labeled Dataset (Manual Annotation)

## Load and Inspect Human-Labeled Twitter Data

In [16]:
# Reader options for the annotated export: semicolon-delimited, read as utf-8-sig
# with the Python parsing engine.
human_csv_options = {"sep": ";", "encoding": "utf-8-sig", "engine": "python"}

human_df = pd.read_csv(
    "/content/merged_twitterdata with human classification.csv",
    **human_csv_options,
)

print(human_df.columns)
print(human_df.shape)
human_df.head()


Index(['url', 'twitterUrl', 'id', 'text', 'retweetCount', 'replyCount',
       'likeCount', 'quoteCount', 'createdAt', 'bookmarkCount', 'isRetweet',
       'isQuote', 'classification'],
      dtype='object')
(218, 13)


Unnamed: 0,url,twitterUrl,id,text,retweetCount,replyCount,likeCount,quoteCount,createdAt,bookmarkCount,isRetweet,isQuote,classification
0,https://x.com/Gxxzi/status/1987582511455572351,https://twitter.com/Gxxzi/status/1987582511455...,1.98758e+18,كلب سئم من نباح كلب صغير وقام رمية في حمام الس...,19,2,92,0,Sun Nov 09 18:06:12 +0000 2025,29,False,True,not
1,https://x.com/Asem_a/status/1987570973743128746,https://twitter.com/Asem_a/status/198757097374...,1.98757e+18,يا ابن الحمير انا ما غلطت عليك ولا سبيتك لكن ا...,0,0,0,0,Sun Nov 09 17:20:22 +0000 2025,0,False,True,offensive
2,https://x.com/FaisalIdri61604/status/198756789...,https://twitter.com/FaisalIdri61604/status/198...,1.98757e+18,بعد فضيحة سرقة لوحاتها .. الفنانة الدنماركية ل...,0,0,0,0,Sun Nov 09 17:08:07 +0000 2025,0,False,False,not
3,https://x.com/Fallzhrani/status/19875611389265...,https://twitter.com/Fallzhrani/status/19875611...,1.98756e+18,طيب والله فضيحة لو ماحسبها بلنتي,0,0,0,0,Sun Nov 09 16:41:17 +0000 2025,0,False,False,not
4,https://x.com/ksa702aaa/status/198753138641334...,https://twitter.com/ksa702aaa/status/198753138...,1.98753e+18,كلب ومات,0,0,0,0,Sun Nov 09 14:43:03 +0000 2025,0,False,True,offensive


## Inspect Human Label Distribution

In [17]:
# Normalise the annotation strings (trim whitespace, lower-case) before counting them.
normalised_labels = human_df["classification"].astype(str).str.strip().str.lower()
human_df["classification"] = normalised_labels
print(human_df["classification"].value_counts())


classification
not          162
offensive     56
Name: count, dtype: int64


# Model vs Human Comparison

## Generate Model Predictions for Human-Labeled Tweets

In [18]:
import torch
from torch.utils.data import DataLoader, TensorDataset

max_length = 128

texts = list(human_df["text"].astype(str))

enc = tokenizer(
    texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt"
)

dataset = TensorDataset(enc["input_ids"], enc["attention_mask"])
loader = DataLoader(dataset, batch_size=32, shuffle=False)

model.eval()
preds = []

# Batched inference without gradient tracking; the loader keeps the row order of
# human_df, so preds lines up with the DataFrame rows.
with torch.no_grad():
    for batch in loader:
        input_ids, attention_mask = (t.to(device) for t in batch)
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        preds.extend(logits.argmax(dim=1).cpu().numpy())

# Store the predictions as label names next to the human annotation.
id2label = {0: "not", 1: "offensive"}
human_df["model_pred"] = [id2label[class_id] for class_id in preds]

print(human_df[["text", "classification", "model_pred"]].head(10))
print("\nModel prediction counts:")
print(human_df["model_pred"].value_counts())


                                                text classification model_pred
0  كلب سئم من نباح كلب صغير وقام رمية في حمام الس...            not        not
1  يا ابن الحمير انا ما غلطت عليك ولا سبيتك لكن ا...      offensive        not
2  بعد فضيحة سرقة لوحاتها .. الفنانة الدنماركية ل...            not  offensive
3                   طيب والله فضيحة لو ماحسبها بلنتي            not        not
4                                           كلب ومات      offensive        not
5  @magdi_khalil الخنزير مجدي خليل صليبي متعصب حا...      offensive        not
6  @abdullh_132 كذاب اغلب البنات تحررو من النقاب ...      offensive  offensive
7                     كيمي يا ورع انقلع خل ماكس يعدي      offensive        not
8                     @samerabdullah06 كل خرا يا كلب      offensive        not
9               @hamed4343 قسم انك حمار بيشة????????      offensive        not

Model prediction counts:
model_pred
not          191
offensive     27
Name: count, dtype: int64


# Model Evaluation on Full Human-Labeled Dataset (Unbalanced Test)

In [19]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import torch

# Encode human-labeled texts (unbalanced)
human_enc = tokenizer(
    human_df["text"].astype(str).tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Convert labels. Annotations outside label_map (for example the string "nan"
# produced by astype(str) on an empty cell) map to NaN; stop here instead of
# silently evaluating against corrupted label ids.
label_map = {"not": 0, "offensive": 1}
human_label_ids = human_df["classification"].map(label_map)
if human_label_ids.isna().any():
    unknown_labels = sorted(
        human_df.loc[human_label_ids.isna(), "classification"].astype(str).unique()
    )
    raise ValueError(f"Human labels missing from label_map: {unknown_labels}")

y_human = torch.tensor(human_label_ids.values, dtype=torch.long)

# Create dataset and loader
human_dataset = TensorDataset(
    human_enc["input_ids"],
    human_enc["attention_mask"],
    y_human
)

human_loader = DataLoader(
    human_dataset,
    batch_size=16,
    shuffle=False
)

# Evaluation (inference only, no gradient tracking)
model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for batch in human_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_true.extend(labels.cpu().numpy())

print("Unbalanced Human Test Accuracy:",
      accuracy_score(all_true, all_preds))

# labels=[0, 1] pins the row/column order so that target_names always lines up
# with the class ids, even if one class is absent from the data or predictions.
print("\nClassification Report (Human Unbalanced):")
print(classification_report(all_true, all_preds,
                            labels=[0, 1],
                            target_names=["not", "offensive"]))

print("\nConfusion Matrix:")
print(confusion_matrix(all_true, all_preds, labels=[0, 1]))


Unbalanced Human Test Accuracy: 0.7293577981651376

Classification Report (Human Unbalanced):
              precision    recall  f1-score   support

         not       0.77      0.91      0.83       162
   offensive       0.44      0.21      0.29        56

    accuracy                           0.73       218
   macro avg       0.61      0.56      0.56       218
weighted avg       0.69      0.73      0.69       218


Confusion Matrix:
[[147  15]
 [ 44  12]]


# Model vs Human Annotation Evaluation

## Confusion Matrix and Classification Report (Human vs Model)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix

# Both columns hold label strings ("not" / "offensive").
y_true = human_df["classification"]
y_pred = human_df["model_pred"]

# Pass the label order explicitly. Without `labels=`, scikit-learn sorts the labels
# it finds in the data, so target_names would only line up by coincidence and any
# unexpected label value would shift or break the report.
class_names = ["not", "offensive"]

print("Classification Report (Human vs Model):")
print(classification_report(
    y_true,
    y_pred,
    labels=class_names,
    target_names=class_names
))

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred, labels=class_names))


Classification Report (Human vs Model):
              precision    recall  f1-score   support

         not       0.77      0.91      0.83       162
   offensive       0.44      0.21      0.29        56

    accuracy                           0.73       218
   macro avg       0.61      0.56      0.56       218
weighted avg       0.69      0.73      0.69       218


Confusion Matrix:
[[147  15]
 [ 44  12]]


In [21]:
# Write the fine-tuned model first, then the tokenizer, into the same directory.
save_dir = "/content/arabert_abuse_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("Saved to:", save_dir)


Saved to: /content/arabert_abuse_model


In [22]:
# Archive the saved model directory recursively. Because an absolute path is given,
# the entries are stored under content/arabert_abuse_model/ inside the zip.
!zip -r arabert_abuse_model.zip /content/arabert_abuse_model


  adding: content/arabert_abuse_model/ (stored 0%)
  adding: content/arabert_abuse_model/tokenizer.json (deflated 73%)
  adding: content/arabert_abuse_model/tokenizer_config.json (deflated 90%)
  adding: content/arabert_abuse_model/vocab.txt (deflated 62%)
  adding: content/arabert_abuse_model/special_tokens_map.json (deflated 80%)
  adding: content/arabert_abuse_model/model.safetensors (deflated 7%)
  adding: content/arabert_abuse_model/config.json (deflated 49%)


# Human-Labeled Dataset as Test Set Only

### Create a balanced TEST set from human_df

In [23]:
import pandas as pd

# Work on a clean copy
test_df = human_df.copy()

# Basic cleaning (ensure correct labels)
test_df["classification"] = test_df["classification"].astype(str).str.strip().str.lower()

# Balance the test set: take the same number from each class
min_n = test_df["classification"].value_counts().min()

# Sample every class separately and concatenate the pieces. This selects the same
# rows as groupby(...).apply(lambda x: x.sample(...)) did, but does not depend on
# apply() handing the grouping column to the function, which pandas has deprecated
# and which would otherwise drop the "classification" column from the result.
per_class_samples = [
    class_rows.sample(n=min_n, random_state=42)
    for _, class_rows in test_df.groupby("classification")
]

balanced_test_df = (
    pd.concat(per_class_samples)
      .sample(frac=1, random_state=42)   # shuffle
      .reset_index(drop=True)
)

print("Original test distribution:")
print(test_df["classification"].value_counts())

print("\nBalanced test distribution:")
print(balanced_test_df["classification"].value_counts())

print("\nBalanced test size:", balanced_test_df.shape)
balanced_test_df.head()


Original test distribution:
classification
not          162
offensive     56
Name: count, dtype: int64

Balanced test distribution:
classification
not          56
offensive    56
Name: count, dtype: int64

Balanced test size: (112, 14)


Unnamed: 0,url,twitterUrl,id,text,retweetCount,replyCount,likeCount,quoteCount,createdAt,bookmarkCount,isRetweet,isQuote,classification,model_pred
0,https://x.com/rahaf_et97/status/17411799632172...,https://twitter.com/rahaf_et97/status/17411799...,1.74118e+18,نتكلم عن هالشي؟ ترا الام و اخواته يكونو رافضين...,1,0,3,0,Sat Dec 30 19:30:27 +0000 2023,0,False,False,not,not
1,https://x.com/nfc_mr1/status/1079486039444598790,https://twitter.com/nfc_mr1/status/10794860394...,1.07949e+18,#خيسوس_يعبث_بالهلال\n\nنادي فضيحه وفشلنا وسمعت...,0,0,0,0,Sun Dec 30 21:15:02 +0000 2018,0,False,False,offensive,not
2,https://x.com/HyAlhzm/status/1605602075039059968,https://twitter.com/HyAlhzm/status/16056020750...,1.6056e+18,?? مـ?ـــ الخاصــ ?? \nالسلام عليكم ابي مدرس ...,0,87,20,0,Wed Dec 21 16:32:19 +0000 2022,0,False,False,not,not
3,https://x.com/aabdulsalam9999/status/174123718...,https://twitter.com/aabdulsalam9999/status/174...,1.74124e+18,@s2daf مدري والله ?? لو قلت ايه بكون نصاب ولو ...,0,0,1,0,Sat Dec 30 23:17:49 +0000 2023,0,False,False,not,not
4,https://x.com/fat12alshareef/status/1595976777...,https://twitter.com/fat12alshareef/status/1595...,1.59598e+18,المرأة تصل قوتها إلى أضعاف قوة الرجل في مرحلت...,0,0,2,0,Fri Nov 25 03:04:50 +0000 2022,0,False,False,not,not


# Model Evaluation on Balanced Human-Labeled Test Set

In [24]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Encode balanced test texts
test_enc = tokenizer(
    balanced_test_df["text"].astype(str).tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
)

# Convert labels to numeric
label_map = {"not": 0, "offensive": 1}

# Dedicated names are used for the balanced human set. Reusing y_test /
# test_dataset / test_loader would overwrite the held-out split created earlier,
# so re-running the "Final Evaluation on Test Set" cell afterwards would score
# this human-labeled sample instead, without any error.
y_balanced = torch.tensor(
    balanced_test_df["classification"].map(label_map).values
)

balanced_dataset = TensorDataset(
    test_enc["input_ids"],
    test_enc["attention_mask"],
    y_balanced
)

balanced_loader = DataLoader(balanced_dataset, batch_size=16, shuffle=False)

# Evaluation (inference only, no gradient tracking)
model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for batch in balanced_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_true.extend(labels.cpu().numpy())

# Metrics. labels=[0, 1] pins the class order so target_names always lines up,
# even if the model never predicts one of the classes on this small sample.
print("Balanced Test Accuracy:", accuracy_score(all_true, all_preds))
print("\nClassification Report:")
print(classification_report(all_true, all_preds, labels=[0, 1], target_names=["not", "offensive"]))

print("\nConfusion Matrix:")
print(confusion_matrix(all_true, all_preds, labels=[0, 1]))


Balanced Test Accuracy: 0.5714285714285714

Classification Report:
              precision    recall  f1-score   support

         not       0.54      0.93      0.68        56
   offensive       0.75      0.21      0.33        56

    accuracy                           0.57       112
   macro avg       0.65      0.57      0.51       112
weighted avg       0.65      0.57      0.51       112


Confusion Matrix:
[[52  4]
 [44 12]]


In [25]:
# ===============================
# Save evaluation results (TXT + CSV)
# ===============================

import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# all_true / all_preds are the plain lists filled by the most recently executed
# evaluation cell; several cells reuse these two names.
y_true = all_true
y_pred = all_preds

# Guard against stale state: refuse to write anything unless the lists belong to
# balanced_test_df (same length, same true labels in the same row order).
expected_ids = balanced_test_df["classification"].map({"not": 0, "offensive": 1}).tolist()
if len(y_pred) != len(expected_ids) or [int(t) for t in y_true] != expected_ids:
    raise ValueError(
        "all_true/all_preds do not match balanced_test_df; "
        "re-run the balanced evaluation cell before saving results."
    )

# -------- TXT report --------
txt_path = "balanced_test_results.txt"

with open(txt_path, "w", encoding="utf-8") as f:
    f.write("Balanced Human-Labeled Test Results\n")
    f.write("=" * 40 + "\n\n")

    f.write(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}\n\n")

    f.write("Classification Report:\n")
    # labels=[0, 1] pins the class order so target_names always lines up.
    f.write(
        classification_report(
            y_true,
            y_pred,
            labels=[0, 1],
            target_names=["not", "offensive"]
        )
    )
    f.write("\n\nConfusion Matrix:\n")
    f.write(str(confusion_matrix(y_true, y_pred, labels=[0, 1])))

print(f"TXT results saved to: {txt_path}")

# -------- CSV predictions --------
csv_path = "balanced_test_predictions.csv"

# One row per balanced-test tweet, with the human label and the model's label as names.
results_df = balanced_test_df.copy()
results_df["true_label"] = results_df["classification"]
results_df["pred_label"] = ["not" if p == 0 else "offensive" for p in y_pred]

results_df.to_csv(csv_path, index=False, encoding="utf-8-sig")

print(f"CSV predictions saved to: {csv_path}")

results_df.head()


TXT results saved to: balanced_test_results.txt
CSV predictions saved to: balanced_test_predictions.csv


Unnamed: 0,url,twitterUrl,id,text,retweetCount,replyCount,likeCount,quoteCount,createdAt,bookmarkCount,isRetweet,isQuote,classification,model_pred,true_label,pred_label
0,https://x.com/rahaf_et97/status/17411799632172...,https://twitter.com/rahaf_et97/status/17411799...,1.74118e+18,نتكلم عن هالشي؟ ترا الام و اخواته يكونو رافضين...,1,0,3,0,Sat Dec 30 19:30:27 +0000 2023,0,False,False,not,not,not,not
1,https://x.com/nfc_mr1/status/1079486039444598790,https://twitter.com/nfc_mr1/status/10794860394...,1.07949e+18,#خيسوس_يعبث_بالهلال\n\nنادي فضيحه وفشلنا وسمعت...,0,0,0,0,Sun Dec 30 21:15:02 +0000 2018,0,False,False,offensive,not,offensive,not
2,https://x.com/HyAlhzm/status/1605602075039059968,https://twitter.com/HyAlhzm/status/16056020750...,1.6056e+18,?? مـ?ـــ الخاصــ ?? \nالسلام عليكم ابي مدرس ...,0,87,20,0,Wed Dec 21 16:32:19 +0000 2022,0,False,False,not,not,not,not
3,https://x.com/aabdulsalam9999/status/174123718...,https://twitter.com/aabdulsalam9999/status/174...,1.74124e+18,@s2daf مدري والله ?? لو قلت ايه بكون نصاب ولو ...,0,0,1,0,Sat Dec 30 23:17:49 +0000 2023,0,False,False,not,not,not,not
4,https://x.com/fat12alshareef/status/1595976777...,https://twitter.com/fat12alshareef/status/1595...,1.59598e+18,المرأة تصل قوتها إلى أضعاف قوة الرجل في مرحلت...,0,0,2,0,Fri Nov 25 03:04:50 +0000 2022,0,False,False,not,not,not,not
