# Arabic Dialect Identification and Dialectness Scoring (Sentence-Level)
*Done by:* Ameera Attiah

## Config & Imports

In [1]:
import torch
torch.cuda.set_device(1)                # use the second GPU
torch.set_default_device("cuda:1")      # make cuda:1 the default
print("Using:", torch.cuda.get_device_name(1))


Using: NVIDIA RTX A4500


In [2]:
# üß† Step 1: Config & Imports

import torch
import torch.nn as nn
from torch.utils.data import Dataset as TorchDataset, DataLoader
from datasets import Dataset as HFDataset          # HuggingFace (aliased)
from transformers import AutoTokenizer, AutoModel
import numpy as np
from sklearn.metrics import (
    f1_score, mean_squared_error, mean_absolute_error,
    classification_report, hamming_loss, confusion_matrix
)
from scipy.stats import pearsonr, spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data.dataloader import default_collate


config = {
    "base_model": "UBC-NLP/MARBERT",
    "num_labels": 5,
    "batch_size": 8,
    "num_epochs": 3,
    "learning_rate": 2e-5,
    "max_length": 128,
    "classification_threshold": 0.5,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

print("‚úÖ Configuration loaded")
print(f"üì¶ Using device: {config['device']}")



  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Configuration loaded
üì¶ Using device: cuda


In [3]:
import torch

# Check if GPU is available
print("CUDA Available:", torch.cuda.is_available())

# Check current device
if torch.cuda.is_available():
    print("Current GPU:", torch.cuda.get_device_name(0))
    print("GPU Memory (Allocated):", round(torch.cuda.memory_allocated(0) / 1024**3, 2), "GB")
    print("GPU Memory (Reserved):", round(torch.cuda.memory_reserved(0) / 1024**3, 2), "GB")
else:
    print("Using CPU")


CUDA Available: True
Current GPU: NVIDIA RTX A4500
GPU Memory (Allocated): 0.0 GB
GPU Memory (Reserved): 0.0 GB


In [4]:
import re

# Arabic text cleaner
def clean_arabic_text(text):
    # Remove tatweel (ŸÄ)
    text = text.replace("ŸÄ", "")
    # Reduce elongation: more than 2 repeated letters ‚Üí keep only 2
    text = re.sub(r'([\u0621-\u064A])\1{2,}', r'\1\1', text)
    # Normalize Arabic letters
    text = re.sub(r'[ÿ•ÿ£ÿ¢ÿß]', 'ÿß', text)
    text = re.sub(r'Ÿâ', 'Ÿä', text)
    text = re.sub(r'ÿ§', 'Ÿà', text)
    text = re.sub(r'ÿ¶', 'Ÿä', text)
    text = re.sub(r'ÿ©', 'Ÿá', text)
    return text


## Load & Encode AOC-ALDi Dataset

In [5]:
# üß† Step 2: Load & Encode Dataset

print("üì• Loading AOC-ALDi dataset...")
DIALECT2IDX = {
    "egyptian": 0,
    "levantine": 1,
    "gulf": 2,
    "maghrebi": 3,
    "msa": 4
}
IDX2DIALECT = {i: d for d, i in DIALECT2IDX.items()}

dataset = load_dataset("arbml/AOC_ALDi", split="train")

import ast

def encode_labels(row):
    multi_label = [0] * len(DIALECT2IDX)
    regression = [0.0] * len(DIALECT2IDX)

    dialects = row["dialect"]
    scores = row["dialectness_level"]

    # If they‚Äôre strings, convert them
    if isinstance(dialects, str):
        import ast
        try:
            dialects = ast.literal_eval(dialects)
        except:
            dialects = []
    if isinstance(scores, str):
        import ast
        try:
            scores = ast.literal_eval(scores)
        except:
            scores = []

    # Now iterate safely
    for d, s in zip(dialects, scores):
        if d in DIALECT2IDX:
            idx = DIALECT2IDX[d]
            multi_label[idx] = 1
            regression[idx] = float(s)

    return {
        "multi_label": multi_label,
        "regression": regression
    }
# Step 1: Add a dominant dialect label (used for balancing)
def extract_main_dialect(row):
    # Choose the first dialect listed (most confident)
    if isinstance(row["dialect"], list):
        return row["dialect"][0]
    elif isinstance(row["dialect"], str) and row["dialect"].startswith("["):
        import ast
        return ast.literal_eval(row["dialect"])[0]
    else:
        return row["dialect"]
        
processed_dataset = dataset.map(encode_labels)
processed_dataset = processed_dataset.map(lambda row: {"main_dialect": extract_main_dialect(row)})

üì• Loading AOC-ALDi dataset...


NameError: name 'load_dataset' is not defined

In [None]:
# CHANGE: make sure we always have a single main label index for CE training
def extract_main_dialect_and_idx(row):
    d = row["dialect"]
    if isinstance(d, list) and len(d) > 0:
        main = d[0]
    elif isinstance(d, str) and d.startswith("["):
        main = ast.literal_eval(d)[0]
    else:
        main = d
    main_idx = DIALECT2IDX.get(main, 4)  # default to MSA if weird
    return {"main_dialect": main, "main_idx": main_idx}

processed_dataset = dataset.map(encode_labels)
processed_dataset = processed_dataset.map(extract_main_dialect_and_idx)

# === Balancing (simple, keep your idea): upsample Maghrebi ===

from random import choices

maghrebi_rows = [row for row in processed_dataset if row["main_dialect"] == "maghrebi"]
non_maghrebi_rows = [row for row in processed_dataset if row["main_dialect"] != "maghrebi"]

avg_samples_per_class = len(non_maghrebi_rows) // 4 if len(non_maghrebi_rows) > 0 else len(maghrebi_rows)
upsampled_maghrebi = choices(maghrebi_rows, k=avg_samples_per_class)

balanced_rows = non_maghrebi_rows + upsampled_maghrebi
balanced_dataset = HFDataset.from_list(balanced_rows)
print(f"‚úÖ Balanced dataset size: {len(balanced_dataset)} (Maghrebi upsampled to {len(upsampled_maghrebi)})")


In [None]:
# CHANGE: simple train/val split (stratified-ish by shuffling per class)
from collections import defaultdict
by_cls = defaultdict(list)
for row in balanced_dataset:
    by_cls[row["main_idx"]].append(row)

train_rows, val_rows = [], []
for _, rows in by_cls.items():
    n = len(rows)
    split = max(1, int(0.85 * n))
    train_rows += rows[:split]
    val_rows   += rows[split:]

train_ds_raw = HFDataset.from_list(train_rows)
val_ds_raw   = HFDataset.from_list(val_rows)
print(f"‚úÖ Train/Val sizes: {len(train_ds_raw)} / {len(val_ds_raw)}")


## Tokenizer + Custom Dataset Class

In [None]:
# üß† Step 3: Tokenizer + Custom Dataset

print("üî§ Loading MARBERT tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config["base_model"])

# CHANGE: add 'label_ce' as a single int (main dialect) in __getitem__
class AOC_Dataset(TorchDataset):
    def __init__(self, dataset, tokenizer):
        self.dataset = dataset          # <-- this is an HF dataset (HFDataset)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset[idx]

        # pull the text; your processed HF rows must have "sentence", "multi_label", "regression"
        text = row["sentence"]
        if isinstance(text, list):
            text = " ".join(text)

        text = clean_arabic_text(text)
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=config["max_length"],
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels_cls": torch.tensor(row["multi_label"], dtype=torch.float),
            "labels_reg": torch.tensor(row["regression"], dtype=torch.float),
        }


    def __len__(self):
        return len(self.dataset)

train_ds = AOC_Dataset(train_ds_raw, tokenizer)
val_ds   = AOC_Dataset(val_ds_raw, tokenizer)

print("‚úÖ Tokenizer loaded and dataset class ready")


## MARBERT Multi-Task Model

In [None]:
# üß† Step 4: Multi-Task MARBERT Model

print("üß† Initializing MARBERT-based model...")

class BertForMultiTask(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained(config["base_model"])
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, config["num_labels"])
        self.regressor = nn.Linear(hidden_size, config["num_labels"])

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits_cls = self.classifier(pooled_output)  # No sigmoid: BCEWithLogits handles that
        logits_reg = torch.sigmoid(self.regressor(pooled_output))  # Output between 0‚Äì1
        return logits_cls, logits_reg

print("‚úÖ Model class defined")


## DataLoader, Model, Optimizer, and Losses

In [None]:
# üß† Step 5: DataLoader, Model, Optimizer, Losses

print("üì¶ Creating DataLoader...")
# CHANGE: make train/val loaders
train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=config["batch_size"], shuffle=False)

print("‚öôÔ∏è Initializing model, optimizer, and loss functions...")
model = BertForMultiTask().to(config["device"])
optimizer = torch.optim.AdamW(model.parameters(), lr=config["learning_rate"])

# CHANGE: loss functions
loss_fn_cls = nn.BCEWithLogitsLoss()
loss_fn_reg = nn.MSELoss()
lambda_reg = 0.2  # start small so classifier fixes itself

b = next(iter(train_loader))
print({k: v.shape if hasattr(v, "shape") else type(v) for k, v in b.items()})

print("‚úÖ Ready to train")


## Training Loop

In [None]:
# üîÅ Step 6: Training Loop (multi-label BCE + small reg weight)

lambda_reg = 0.2  # <- keep regression, but don't let it dominate

print("üöÄ Starting training...")
for epoch in range(config["num_epochs"]):
    model.train()
    total_loss = 0.0

    for step, batch in enumerate(train_loader):   # <-- use train_loader
        input_ids = batch["input_ids"].to(config["device"])
        attention_mask = batch["attention_mask"].to(config["device"])

        # MULTI-LABEL targets (multi-hot)
        labels_cls = batch["labels_cls"].to(config["device"])   # <‚Äî CHANGE: use labels_cls (multi-label)
        labels_reg = batch["labels_reg"].to(config["device"])

        # Forward
        logits_cls, logits_reg = model(input_ids, attention_mask)

        # Losses (BCE for multilabel; MSE for regression)
        loss_cls = loss_fn_cls(logits_cls, labels_cls)          # <‚Äî CHANGE: BCEWithLogitsLoss on multi-hot
        loss_reg = loss_fn_reg(logits_reg, labels_reg)
        loss = loss_cls + lambda_reg * loss_reg

        # Backprop
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        if step % 50 == 0:
            print(f"üü° Epoch {epoch+1} Step {step}/{len(train_loader)} | Loss: {loss.item():.4f}")

    avg_loss = total_loss / max(1, len(train_loader))
    print(f"‚úÖ Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")

    # ---- quick MULTI-LABEL validation on val_loader ----
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in val_loader:  # <-- use val_loader
            ids  = batch["input_ids"].to(config["device"])
            mask = batch["attention_mask"].to(config["device"])

            logits_cls, _ = model(ids, mask)
            probs = torch.sigmoid(logits_cls).cpu().numpy()  # (B, 5)
            preds = (probs >= config["classification_threshold"]).astype(int)

            y_true.append(batch["labels_cls"].cpu().numpy())
            y_pred.append(preds)

    y_true = np.vstack(y_true)
    y_pred = np.vstack(y_pred)
    print(classification_report(y_true, y_pred, target_names=list(DIALECT2IDX.keys()), digits=3))

# Save
torch.save(model.state_dict(), "marbert_sentence_model2.pt")
print("üíæ Model saved to marbert_sentence_model2.pt")


## Evaluation Loop

In [None]:
# üß™ Step 7: Evaluation Loop (multi-label, on VAL set)

print("üîç Running evaluation...")
model.eval()

all_preds_cls, all_labels_cls = [], []
all_preds_reg, all_labels_reg = [], []

with torch.no_grad():
    for batch in val_loader:  # <-- use val_loader, not 'dataloader'
        input_ids = batch["input_ids"].to(config["device"])
        attention_mask = batch["attention_mask"].to(config["device"])

        # Ground-truth
        labels_cls = batch["labels_cls"].cpu().numpy()
        labels_reg = batch["labels_reg"].cpu().numpy()

        # Forward pass
        logits_cls, logits_reg = model(input_ids, attention_mask)

        # MULTI-LABEL predictions (sigmoid + threshold)
        probs_cls = torch.sigmoid(logits_cls).cpu().numpy()
        preds_cls = (probs_cls >= config["classification_threshold"]).astype(int)

        # Regression outputs (your reg head already uses sigmoid in the model)
        preds_reg = logits_reg.cpu().numpy()

        # Collect
        all_preds_cls.append(preds_cls)
        all_labels_cls.append(labels_cls)
        all_preds_reg.append(preds_reg)
        all_labels_reg.append(labels_reg)

print("‚úÖ Evaluation complete")

# Stack to arrays for metrics in block 3
all_preds_cls  = np.vstack(all_preds_cls)
all_labels_cls = np.vstack(all_labels_cls)
all_preds_reg  = np.vstack(all_preds_reg)
all_labels_reg = np.vstack(all_labels_reg)


## Full Metrics & Visualizations

In [None]:
# üìä Step 8: Full Metrics + Visualizations (multi-label)

print("üìä Generating metrics and plots...")

# ---- Classification (multi-label) ----
print("\nüìÑ Classification Report:")
print(classification_report(all_labels_cls, all_preds_cls, target_names=list(DIALECT2IDX.keys()), digits=3))
print("‚úÖ Micro F1:", f1_score(all_labels_cls, all_preds_cls, average='micro'))
print("‚úÖ Macro F1:", f1_score(all_labels_cls, all_preds_cls, average='macro'))
print("üìâ Hamming Loss:", hamming_loss(all_labels_cls, all_preds_cls))

# ---- Regression ----
print("\nüìà Regression Metrics:")
mse = mean_squared_error(all_labels_reg, all_preds_reg)
mae = mean_absolute_error(all_labels_reg, all_preds_reg)
print(f"‚úÖ MSE: {mse:.4f}")
print(f"‚úÖ MAE: {mae:.4f}")

# Pearson and Spearman per dialect
for i, dialect in enumerate(DIALECT2IDX.keys()):
    pearson = pearsonr(all_labels_reg[:, i], all_preds_reg[:, i])[0]
    spearman = spearmanr(all_labels_reg[:, i], all_preds_reg[:, i])[0]
    print(f"{dialect:10} | Pearson: {pearson:.4f} | Spearman: {spearman:.4f}")

# ---- Per-dialect confusion matrices (multi-label: 2x2 per class) ----
print("\nüìä Confusion Matrices (per dialect, multi-label):")
for i, dialect in enumerate(DIALECT2IDX.keys()):
    y_true_bin = all_labels_cls[:, i].astype(int)
    y_pred_bin = all_preds_cls[:, i].astype(int)
    cm = confusion_matrix(y_true_bin, y_pred_bin, labels=[0,1])
    plt.figure(figsize=(3, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
                xticklabels=["Pred 0", "Pred 1"], yticklabels=["True 0", "True 1"])
    plt.title(f"Confusion Matrix: {dialect}")
    plt.tight_layout()
    plt.show()

# ---- Regression scatter ----
print("üìà Plotting regression scatter plot...")
plt.figure(figsize=(7, 5))
for i, dialect in enumerate(DIALECT2IDX.keys()):
    plt.scatter(all_labels_reg[:, i], all_preds_reg[:, i], label=dialect, alpha=0.5)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("True Dialectness Score")
plt.ylabel("Predicted Score")
plt.title("Dialectness Regression: True vs Predicted")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()




---
# Inference on the Fineweb2 dataset


##  Load Your Trained Model

In [None]:
from transformers import AutoTokenizer
import torch

# 1Ô∏è‚É£ Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERT")

# 2Ô∏è‚É£ Initialize model architecture
model = BertForMultiTask()

# 3Ô∏è‚É£ Load weights
model.load_state_dict(torch.load("marbert_sentence_model2.pt", map_location="cuda" if torch.cuda.is_available() else "cpu"))

# 4Ô∏è‚É£ Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


In [None]:
# gulf dialect
# test_sentence = """
# Ÿàÿ¥ ÿßŸÑÿ≥ÿßŸÑŸÅÿ© Ÿäÿß ŸàŸÑÿØÿü ÿ£ŸÜÿß ÿßŸÖÿ≥ ŸÉŸÜÿ™ ÿ±ÿßŸäÿ≠ ŸÑŸÑÿ®ÿ± ŸÖÿπ ÿßŸÑÿ±ÿ®ÿπÿå Ÿàÿ¥ÿ®ŸëŸäŸÜÿß ÿßŸÑŸÜÿßÿ± Ÿàÿ≥ŸàŸäŸÜÿß ŸÇŸáŸàÿ© Ÿàÿ¥ÿßŸäÿå Ÿàÿ®ÿØŸäŸÜÿß ŸÜÿ≥ŸàŸÑŸÅ ÿπŸÜ ÿ£ŸäÿßŸÖ ÿßŸÑÿ∑Ÿäÿ®ŸäŸÜ.
# ŸÖÿ± ÿßŸÑŸàŸÇÿ™ ÿ®ÿ≥ÿ±ÿπÿ©ÿå ŸàŸÉŸÑ Ÿàÿßÿ≠ÿØ ŸÇÿßŸÖ Ÿäÿ≠ŸÉŸä ÿπŸÜ ŸÖŸàÿßŸÇŸÅŸá ŸäŸàŸÖ ŸÉÿßŸÜ ÿ®ÿßŸÑÿ´ÿßŸÜŸàŸäÿ©. ÿ®ÿπÿØŸäŸÜ ÿ∑ŸÇŸäŸÜÿß ÿßŸÑÿπÿ¥ÿßÿ° ÿ±ÿ≤ ŸàŸÑÿ≠ŸÖÿå ŸàŸÑÿß ÿ£ÿ≠ŸÑŸâ.
# ÿ±ÿ¨ÿπŸÜÿß ŸÑŸÑÿ®Ÿäÿ™ ŸÇÿ®ŸÑ ÿßŸÑŸÅÿ¨ÿ±ÿå ÿ™ÿπÿ®ÿßŸÜŸäŸÜ ŸÑŸÉŸÜ ŸÖÿ®ÿ≥Ÿàÿ∑ŸäŸÜÿå ŸÖÿß ŸÅŸä ŸÖÿ´ŸÑ ÿ¨ŸÖÿπÿ© ÿßŸÑÿ±ÿ®ÿπ ÿ®ÿßŸÑÿ®ÿ±.
# """


# egyptian dialect
# test_sentence = """
# ÿ£ŸÜÿß ÿ®ÿµÿ±ÿßÿ≠ÿ© ÿ≤ŸáŸÇÿ™ ŸÖŸÜ ÿßŸÑÿ≤ÿ≠ŸÖÿ© ŸàÿßŸÑÿπŸäÿßÿ∑ ÿ®ÿ™ÿßÿπ ŸÉŸÑ ŸäŸàŸÖ ŸÅŸä ÿßŸÑÿ¥ÿ∫ŸÑÿå ŸäÿπŸÜŸä ŸÖÿßŸÅŸäÿ¥ ŸäŸàŸÖ ÿ®ŸäÿπÿØŸëŸä ŸÖŸÜ ÿ∫Ÿäÿ± ŸÖÿß ÿßŸÑŸÖŸàÿßÿµŸÑÿßÿ™ ÿ™ÿπŸÖŸÑ ŸÅŸäÿß ŸÖŸÇŸÑÿ®!
# ÿßŸÑŸÜŸáÿßÿ±ÿØÿ© ŸÖÿ´ŸÑÿßŸãÿå ÿµÿ≠Ÿäÿ™ ŸÖÿ™ÿ£ÿÆÿ± ÿπŸÑÿ¥ÿßŸÜ ÿßŸÑŸÉŸáÿ±ÿ®ÿß ŸÇÿ∑ÿπÿ™ÿå ŸàŸÜÿ≤ŸÑÿ™ ÿ£ÿ¨ÿ±Ÿä ÿπŸÑŸâ ÿßŸÑŸÖŸäŸÉÿ±Ÿàÿ®ÿßÿµÿå ÿ®ÿ≥ ÿßŸÑÿ≥ŸàÿßŸÇ ŸÇÿ±ÿ± ŸäÿßÿÆÿØ ÿßŸÑŸÑŸÅÿ© ŸÉŸÑŸáÿß ŸÇÿ®ŸÑ ŸÖÿß ŸäŸàÿµŸÑŸÜŸä.
# ÿπÿØŸëŸäŸÜÿß ŸÖŸÜ ÿπŸÜÿØ ÿßŸÑŸÉŸàÿ®ÿ±Ÿä ÿßŸÑŸÑŸä ÿØÿßŸäŸÖŸãÿß ÿ≤ÿ≠ŸÖÿ©ÿå ŸàÿßŸÑŸÜÿßÿ≥ ŸÇÿßÿπÿØÿ© ÿ®ÿ™ÿ≤ÿπŸÇ ŸÑÿ®ÿπÿ∂ ŸÖŸÜ ÿ∫Ÿäÿ± ÿ≥ÿ®ÿ®ÿå ŸàŸÉÿ£ŸÜ ÿßŸÑÿπÿµÿ®Ÿäÿ© ÿ®ŸÇÿ™ ÿ∑ÿ®ŸäÿπŸäÿ© ÿπŸÜÿØŸÜÿß.
# ÿ®ÿπÿØ ŸÉÿØŸá ŸàÿµŸÑÿ™ ÿßŸÑÿ¥ÿ∫ŸÑÿå ŸàÿßŸÑŸÖÿØŸäÿ± ÿ£ŸàŸÑ ŸÖÿß ÿ¥ÿßŸÅŸÜŸä ŸÇÿßŸÑŸä: "ÿßÿ™ÿ£ÿÆÿ±ÿ™ ÿ™ÿßŸÜŸäÿü" Ÿàÿ£ŸÜÿß ÿ®ÿµŸäÿ™ ŸÑŸá ŸàŸÇŸÑÿ™: "ŸàÿßŸÑŸÑŸá ÿ∫ÿµÿ® ÿπŸÜŸä".
# ŸäÿπŸÜŸä ŸáŸà ŸÅÿßŸÉÿ± ÿ•ŸÜŸÜÿß ÿ®ŸÜÿ≠ÿ® ÿßŸÑÿ™ÿ£ÿÆŸäÿ±ÿü ŸáŸà ŸÖÿ¥ ÿ≠ÿßÿ≥ÿ≥ ÿ®ÿßŸÑŸÑŸä ÿ®ŸÜÿ¥ŸàŸÅŸá ŸÉŸÑ ŸäŸàŸÖ ŸÅŸä ÿßŸÑÿ¥ÿßÿ±ÿπÿü
# ÿßŸÑŸÖŸáŸÖÿå ŸäŸàŸÖ ÿπÿØŸâ ŸàŸäÿß ÿπÿßŸÑŸÖ ÿ®ŸÉÿ±ÿß ŸáŸäÿ®ŸÇŸâ ŸÅŸäŸá ÿ•ŸäŸá!
# """

# levantine dialect
# test_sentence = """
# ŸÖÿ®ÿßÿ±ÿ≠ ŸÜÿ≤ŸÑÿ™ ÿπÿßŸÑÿ≥ŸàŸÇ ŸÖÿπ ÿ±ŸÅŸäŸÇÿ™Ÿäÿå ŸàŸÉÿßŸÜ ŸÅŸä ŸÉÿ™Ÿäÿ± ÿπÿ¨ŸÇÿ© ÿ®ÿ≥ ÿßŸÑÿ¨Ÿà ŸÉÿßŸÜ ÿ≠ŸÑŸà. ÿßÿ¥ÿ™ÿ±ŸäŸÜÿß ÿ¥ŸàŸäÿ© ÿÆÿ∂ÿ±ÿ© ŸàŸÅŸàÿßŸÉŸáÿå Ÿàÿ®ÿπÿØŸäŸÜ ÿ±ÿ≠ŸÜÿß ŸÜÿ¥ÿ±ÿ® ŸÇŸáŸàÿ© ÿ®ÿ¥ÿßÿ±ÿπ ÿßŸÑÿ≠ŸÖÿ±ÿß.
# ŸÇÿπÿØŸÜÿß ÿ¥Ÿä ÿ≥ÿßÿπÿ©ÿå ÿ∂ÿ≠ŸÉŸÜÿß Ÿàÿ≠ŸÉŸäŸÜÿß ÿπŸÜ ÿßŸÑÿ¥ÿ∫ŸÑ ŸàÿßŸÑÿ≠Ÿäÿßÿ©. ÿ®ÿπÿØŸäŸÜ ÿ•ÿ¨ÿß ÿ£ÿÆŸàŸáÿß ÿ®ÿ≥Ÿäÿßÿ±ÿ™Ÿá Ÿàÿ£ÿÆÿØŸÜÿß ÿπÿßŸÑÿ®Ÿäÿ™. ÿπŸÜÿ¨ÿØ ŸÉÿßŸÜ ŸÜŸáÿßÿ± ŸÉÿ™Ÿäÿ± ŸÖŸáÿ∂ŸàŸÖ.
# """

# maghrabi dialect
test_sentence = """
ÿßŸÑŸäŸàŸÖ ÿ®ŸÉÿ±Ÿä ŸÖÿ¥Ÿäÿ™ ŸÑŸÑŸÖÿßÿ±ÿ¥Ÿä ŸÜÿ¥ÿ±Ÿä ÿ¥ŸàŸäÿ© ÿÆÿ∂ÿ±ÿ©ÿå ŸÑŸÇŸäÿ™ ÿßŸÑÿØŸÜŸäÿß ÿπÿßŸÖÿ±ÿ© ŸàÿßŸÑŸÜÿßÿ≥ ŸÉŸäÿ™ÿ≥ÿßÿ®ŸÇŸà ÿ®ÿßÿ¥ Ÿäÿ¥ÿ±Ÿà ŸÇÿ®ŸÑ ŸÖÿß ÿ™ÿ≥ÿßŸÑŸä ÿßŸÑÿµÿ®ÿßÿ≠.
ÿ¥ÿ±Ÿäÿ™ ŸÖÿ∑Ÿäÿ¥ÿ©ÿå ÿ®ÿµŸÑÿ©ÿå Ÿàÿ¥Ÿä ÿ¥ŸàŸäÿ© ÿØŸäÿßŸÑ ÿßŸÑŸÜÿπŸÜÿßÿπ. ŸàŸÖŸÜ ÿ®ÿπÿØ ŸÖÿ¥Ÿäÿ™ ÿπŸÜÿØ ÿßŸÑÿ¨ÿßÿ±ÿ© ŸÜÿ¥ÿ±ÿ®Ÿà ÿ£ÿ™ÿßŸä ŸàŸÜÿØŸàŸäŸà ÿπŸÑŸâ ŸàŸÑÿßÿØŸÜÿß.
ÿßŸÑÿØŸÜŸäÿß ÿ≤ŸàŸäŸÜÿ© ŸàŸÑŸÉŸÜ ÿÆÿßÿµŸÜÿß ŸÜÿ™ŸáŸÑÿßŸà ŸÅÿ®ÿπÿ∂Ÿäÿßÿ™ŸÜÿß.
"""


cleaned_sentence = clean_arabic_text(test_sentence)

encoded = tokenizer(cleaned_sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)

# ‚úÖ Fix for unexpected keyword
if "token_type_ids" in encoded:
    del encoded["token_type_ids"]

encoded = {k: v.to(device) for k, v in encoded.items()}

with torch.no_grad():
    logits_cls, logits_reg = model(**encoded)
    probs_cls = torch.sigmoid(logits_cls).cpu().numpy()[0]
    preds_cls = (probs_cls > 0.5).astype(int)
    preds_reg = logits_reg.cpu().numpy()[0]

DIALECTS = ["egyptian", "levantine", "gulf", "maghrebi", "msa"]
predicted_dialects = [DIALECTS[i] for i, v in enumerate(preds_cls) if v == 1]

print("üìù Input:", test_sentence)
print("üßº Cleaned:", cleaned_sentence)
print("üß† Predicted Dialects:", predicted_dialects)
for i, d in enumerate(DIALECTS):
    print(f" - {d}: {preds_reg[i]:.3f}")


## Load & Preprocess FineWeb2 Dataset

In [None]:
from datasets import load_dataset

# ‚úÖ All Arabic dialect subsets in FineWeb2
dialects = [
    "acm_Arab",  # Iraqi
    # "aeb_Arab",  # Tunisian
    "apc_Arab",  # Levantine
    "arb_Arab",  # MSA
    "arq_Arab",  # Algerian
    "ars_Arab",  # Najdi (Saudi)
    "ary_Arab",  # Moroccan
    "arz_Arab",  # Egyptian
    "ayp_Arab",  # North Mesopotamian
    "shu_Arab",  # Chadian/Sudanese
]

# üì¶ Stream & sample texts from each dialect
samples = []
samples_per_dialect = 100  # adjust as needed

for dialect in dialects:
    print(f"üì• Sampling from: {dialect}")
    try:
        dataset = load_dataset("HuggingFaceFW/fineweb-2", dialect, split="train", streaming=True)
        for i, sample in enumerate(dataset):
            samples.append(sample["text"])
            if i + 1 >= samples_per_dialect:
                break
    except Exception as e:
        print(f"‚ùå Error loading {dialect}: {e}")


## Tokenize the Sentences

In [None]:
encoded_batch = tokenizer(samples, padding=True, truncation=True, max_length=128, return_tensors="pt")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoded_batch = {k: v.to(device) for k, v in encoded_batch.items()}
model.to(device)


## Run the Model for Inference

In [None]:
with torch.no_grad():
    logits_cls, logits_reg = model(encoded_batch["input_ids"], encoded_batch["attention_mask"])

# Convert logits to predictions
probs_cls = torch.sigmoid(logits_cls).cpu().numpy()
preds_reg = logits_reg.cpu().numpy()

# Threshold classification (multi-label)
import numpy as np
preds_cls = (probs_cls > 0.5).astype(int)


## Display Results

In [None]:
DIALECTS = ["egyptian", "levantine", "gulf", "maghrebi", "msa"]

for sentence, cls, reg in zip(samples, preds_cls, preds_reg):
    sentence = clean_arabic_text(sentence)
    predicted_dialects = [DIALECTS[j] for j, val in enumerate(cls) if val == 1]
    regression_scores = dict(zip(DIALECTS, reg))

    print("üìù Text:", sentence[:60])
    print("üß† Predicted Dialects:", predicted_dialects)
    for d in DIALECTS:
        print(f" - {d}: {regression_scores[d]:.3f}")
    print("=" * 50)


In [None]:
# Check if GPU is available
print("CUDA Available:", torch.cuda.is_available())

# Check current device
if torch.cuda.is_available():
    print("Current GPU:", torch.cuda.get_device_name(0))
    print("GPU Memory (Allocated):", round(torch.cuda.memory_allocated(0) / 1024**3, 2), "GB")
    print("GPU Memory (Reserved):", round(torch.cuda.memory_reserved(0) / 1024**3, 2), "GB")
else:
    print("Using CPU")


## Inference on Synthetic Data 
made using ChatGPT, 100 document samples of each dialect (egyptian, levantine, gulf, maghrabi, msa)

In [None]:
import json
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import torch
import re
from random import sample as random_sample

# ‚úÖ Define dialect mappings
DIALECTS = ["egyptian", "levantine", "gulf", "maghrebi", "msa"]
DIALECT2IDX = {d: i for i, d in enumerate(DIALECTS)}
IDX2DIALECT = {i: d for i, d in enumerate(DIALECTS)}

# üß† Re-initialize the model architecture
model = BertForMultiTask().to(config["device"])

# üîÅ Load trained weights
checkpoint_path = "marbert_sentence_model2.pt"
model.load_state_dict(torch.load(checkpoint_path, map_location=config["device"]))

print("‚úÖ Model loaded from checkpoint.")

# üì• Load JSON
with open("dialect_documents.json", "r", encoding="utf-8") as f:
    dialect_docs = json.load(f)

# üß™ Flatten all samples
test_samples = []
for label, docs in dialect_docs.items():
    for doc in docs:
        test_samples.append({"text": doc["text"], "label": label})

def split_sentences(text):
    """
    Splits the input Arabic text into sentences using simple punctuation heuristics.
    """
    # Normalize spacing and remove excessive newlines
    text = re.sub(r'\s+', ' ', text.strip())
    
    # Split on Arabic and standard sentence-ending punctuation
    sentence_enders = re.compile(r'(?<=[.!ÿü\n])\s+')
    sentences = sentence_enders.split(text)
    
    # Remove empty or very short entries
    return [s.strip() for s in sentences if len(s.strip()) > 5]

# ‚úÖ Predict sentence-wise
def predict_doc_label_and_score(text):
    model.eval()
    sentences = split_sentences(text)
    sentence_preds = []
    sentence_scores = []

    for sentence in sentences:
        sentence = clean_arabic_text(sentence)
        encoded = tokenizer(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=config["max_length"],
            return_tensors="pt"
        ).to(config["device"])

        with torch.no_grad():
            logits_cls, logits_reg = model(encoded["input_ids"], encoded["attention_mask"])
            
            # Keep probabilities for ALL classes (multi-label)
            probs_cls = torch.sigmoid(logits_cls).cpu().numpy()[0]  # shape (5,)
            
            # Instead of one-hot argmax, keep full probabilities
            sentence_preds.append(probs_cls)
            
            # Regression as before
            pred_reg = torch.sigmoid(logits_reg).cpu().numpy()[0]
            sentence_scores.append(pred_reg)


    # ‚úÖ Aggregate both classification (votes) and regression (avg score)
    votes = np.sum(sentence_preds, axis=0)
    avg_reg = np.mean(sentence_scores, axis=0)

    boost = np.ones(len(DIALECTS))
    boost[DIALECT2IDX["maghrebi"]] = 5.0  # strong boost
    boost[DIALECT2IDX["gulf"]] = 1.6      # optional
    
    combined = (0.75 * votes + 0.25 * avg_reg) * boost
    

    pred_idx = np.argmax(combined)
    pred_label = IDX2DIALECT[pred_idx]

    return pred_label, avg_reg.tolist()

# üöÄ Predict all
predictions = []
for sample in tqdm(test_samples, desc="üîç Predicting dialects"):
    text = sample["text"]
    true = sample["label"]
    pred, scores = predict_doc_label_and_score(text)
    predictions.append({
        "text": text,
        "true": true,
        "pred": pred,
        "scores": scores
    })

# üìå Show 5 samples per dialect
grouped = defaultdict(list)
for p in predictions:
    grouped[p["true"]].append(p)

print("\nüìå Sample Predictions (5 per dialect):")
for d in DIALECTS:
    print(f"\n=== {d.upper()} ===")
    for s in random_sample(grouped[d], 1):
        snippet = s["text"][:150].replace('\n', ' ')
        print(f"True: {s['true']:<10} | Pred: {s['pred']:<10} | Scores: {np.round(s['scores'], 2)}")
        print(f"Text: {snippet}...\n")

# üìä Accuracy
summary = defaultdict(lambda: {"correct": 0, "wrong": 0})
for p in predictions:
    if p["true"] == p["pred"]:
        summary[p["true"]]["correct"] += 1
    else:
        summary[p["true"]]["wrong"] += 1

print("\nüìä Dialect Classification Accuracy:\n")
print(f"{'Dialect':<12} {'Correct':>7} {'Wrong':>7} {'Total':>7} {'Accuracy':>9}")
print("-" * 45)
for d in DIALECTS:
    correct = summary[d]["correct"]
    wrong = summary[d]["wrong"]
    total = correct + wrong
    acc = 100 * correct / total if total > 0 else 0
    print(f"{d:<12} {correct:>7} {wrong:>7} {total:>7} {acc:>8.2f}%")


In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Define the dialect labels (must match your data)
dialects = ["egyptian", "levantine", "gulf", "maghrebi", "msa"]

# Extract true and predicted labels from your prediction results
y_true = [p["true"] for p in predictions]
y_pred = [p["pred"] for p in predictions]

# Compute confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=dialects)

# Plot the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=dialects, yticklabels=dialects)
plt.xlabel("Predicted Dialect")
plt.ylabel("Actual Dialect")
plt.title("Dialect Classification Confusion Matrix")
plt.tight_layout()
plt.show()


In [None]:
print("\n‚ùå Misclassified Samples:\n")

for item in predictions:
    if item["true"] != item["pred"]:
        text_snippet = item["text"][:200].replace("\n", " ")
        print(f"True: {item['true']:<10} | Predicted: {item['pred']:<10} | Text: {text_snippet}...\n")
