<a href="https://colab.research.google.com/github/alhanaya/alhanaya/blob/main/Arabic_Text_Classification_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# üá∏üá¶ Arabic Text Classification (Colab-Ready)
**Goal:** Train a small multilingual model to classify Arabic text into 3 sentiments (negative=0, neutral=1, positive=2).  
Works out‚Äëof‚Äëthe‚Äëbox on **Google Colab** (CPU/GPU).

**What you'll get:**
- Data setup (either Google Drive or inline sample)
- Training with ü§ó Transformers + Datasets
- Evaluation (accuracy/F1)
- Inference cell to test your own sentences


In [12]:

# ‚úÖ If you're on Google Colab, you can set the runtime to GPU for faster training:
# Runtime > Change runtime type > Hardware accelerator: GPU

!pip -q install "transformers>=4.44.0" "datasets>=2.20.0" "accelerate>=0.30.0" "evaluate>=0.4.2"


In [13]:
import transformers, datasets, accelerate, evaluate
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)


transformers: 4.55.4
datasets: 4.0.0



## Option A) Use your data from Google Drive (JSONL)
Your files should be in your Drive, e.g. `/content/drive/MyDrive/ai-train/data/train.jsonl` and `dev.jsonl`, each line like:
```json
{"text":"ÿßŸÑÿÆÿØŸÖÿ© ÿ≥Ÿäÿ¶ÿ©","label":0}
```


In [14]:

USE_DRIVE = False  # ‚¨ÖÔ∏è set to True if your data is on Google Drive

if USE_DRIVE:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    TRAIN_PATH = "/content/drive/MyDrive/ai-train/data/train.jsonl"
    DEV_PATH   = "/content/drive/MyDrive/ai-train/data/dev.jsonl"
else:
    TRAIN_PATH = "train.jsonl"
    DEV_PATH   = "dev.jsonl"



## Option B) Use small inline sample (quick start)
Runs even without Drive. You can replace with your own later.


In [15]:

sample_train = [
    {"text":"ÿßŸÑÿÆÿØŸÖÿ© ÿ≥Ÿäÿ¶ÿ© ŸàÿßŸÑÿ™ÿ£ÿÆŸäÿ± ÿ∫Ÿäÿ± ŸÖŸÇÿ®ŸàŸÑ.","label":0},
    {"text":"ÿßŸÑÿ∑ŸÑÿ® ŸàÿµŸÑ ŸÅŸä ÿßŸÑŸàŸÇÿ™ ÿßŸÑŸÖÿ≠ÿØÿØ ŸàÿßŸÑÿ¨ŸàÿØÿ© ŸÖŸÖÿ™ÿßÿ≤ÿ©.","label":2},
    {"text":"ÿßŸÑÿ™ÿ∑ÿ®ŸäŸÇ ÿπÿßÿØŸä ŸàŸÖÿß ŸÑÿßÿ≠ÿ∏ÿ™ ŸÅÿ±ŸÇ ŸÉÿ®Ÿäÿ±.","label":1},
    {"text":"ÿ™ÿ¨ÿ±ÿ®ÿ© ŸÖÿ≤ÿπÿ¨ÿ© ŸàÿßŸÑÿØÿπŸÖ ÿßŸÑŸÅŸÜŸä ŸÑÿß Ÿäÿ±ÿØ.","label":0},
    {"text":"ÿßŸÑÿ™ÿ∫ŸÑŸäŸÅ ÿ±ÿßÿ¶ÿπ ŸàÿßŸÑŸÖŸÜÿ™ÿ¨ ŸÖÿ´ŸÑ ÿßŸÑŸàÿµŸÅ ÿ™ŸÖÿßŸÖŸãÿß.","label":2},
    {"text":"ŸÑÿß ÿ¨ÿØŸäÿØ Ÿäÿ∞ŸÉÿ±ÿå ÿßŸÑÿ™ÿ¨ÿ±ÿ®ÿ© ŸÖÿ™Ÿàÿ≥ÿ∑ÿ©.","label":1},
    {"text":"ÿßŸÑÿ™ÿ≠ÿØŸäÿ´ ÿßŸÑÿ£ÿÆŸäÿ± ÿ≥ÿ®ÿ® ÿ£ÿπÿ∑ÿßŸÑ ŸÉÿ´Ÿäÿ±ÿ©.","label":0},
    {"text":"ÿßŸÑÿ≥ÿπÿ± ŸÖŸÜÿßÿ≥ÿ® ŸÖŸÇÿßÿ®ŸÑ ÿßŸÑŸÖÿ≤ÿßŸäÿß ÿßŸÑŸÖŸÇÿØŸÖÿ©.","label":2},
    {"text":"ÿßŸÑÿÆÿØŸÖÿ© ŸÖŸÇÿ®ŸàŸÑÿ© ŸÑŸÉŸÜ ÿ™ÿ≠ÿ™ÿßÿ¨ ÿ™ÿ≠ÿ≥ŸäŸÜ.","label":1},
    {"text":"ÿßŸÑÿ™ÿ≥ŸÑŸäŸÖ ÿ™ÿ£ÿÆÿ± ŸÉÿ´Ÿäÿ±ÿßŸã ŸàŸáÿ∞ÿß ŸÖÿ≠ÿ®ÿ∑.","label":0},
    {"text":"ÿßŸÑŸàÿßÿ¨Ÿáÿ© ŸÖÿ±ÿ™ÿ®ÿ© Ÿàÿ≥ŸáŸÑÿ© ÿßŸÑÿßÿ≥ÿ™ÿÆÿØÿßŸÖ.","label":2},
    {"text":"ŸÑÿß ÿ£ÿ≥ÿ™ÿ∑Ÿäÿπ ÿßŸÑÿ≠ŸÉŸÖ ÿ≠ÿßŸÑŸäÿßŸãÿå ŸÖÿß ÿ¨ÿ±ÿ®ÿ™ ŸÉŸÑ ÿ¥Ÿäÿ°.","label":1}
]

sample_dev = [
    {"text":"ÿ¨ŸàÿØÿ© ÿ±ÿØŸäÿ¶ÿ© ŸàÿÆÿØŸÖÿ© ÿ®ÿ∑Ÿäÿ¶ÿ©.","label":0},
    {"text":"ÿßŸÑÿ£ŸÖŸàÿ± ÿ∑ÿ®ŸäÿπŸäÿ© ÿ≠ÿ™Ÿâ ÿßŸÑÿ¢ŸÜ.","label":1},
    {"text":"ŸÜÿ™Ÿäÿ¨ÿ© ŸÖŸÖÿ™ÿßÿ≤ÿ© ŸàŸÅÿ±ŸäŸÇ ŸÖŸÖŸäÿ≤.","label":2},
    {"text":"Ÿàÿßÿ¨Ÿáÿ™ ŸÖÿ¥ÿßŸÉŸÑ ÿ®ÿπÿØ ÿßŸÑÿ™ÿ≠ÿØŸäÿ´.","label":0},
    {"text":"ÿßŸÑÿÆÿØŸÖÿ© ŸÑÿß ÿ®ÿ£ÿ≥ ÿ®Ÿáÿß ÿ®ÿ¥ŸÉŸÑ ÿπÿßŸÖ.","label":1},
    {"text":"ÿ£ŸÜÿµÿ≠ ÿ®Ÿá ÿ®ÿ¥ÿØÿ©!","label":2}
]

# Write files only if not using Drive
import json, os
if not os.path.exists(TRAIN_PATH):
    with open(TRAIN_PATH, "w", encoding="utf-8") as f:
        for row in sample_train:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")
if not os.path.exists(DEV_PATH):
    with open(DEV_PATH, "w", encoding="utf-8") as f:
        for row in sample_dev:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

print("Train file:", TRAIN_PATH)
print("Dev file  :", DEV_PATH)


Train file: train.jsonl
Dev file  : dev.jsonl



## Load dataset
We use ü§ó Datasets with JSON Lines format.


In [16]:

from datasets import load_dataset

data_files = {"train": TRAIN_PATH, "validation": DEV_PATH}
ds = load_dataset("json", data_files=data_files)
ds


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 13
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 6
    })
})


## Tokenization & Model
We'll use a lightweight multilingual model: `distilbert-base-multilingual-cased` (good enough for demo).


In [17]:
with open("train.jsonl", "a", encoding="utf-8") as f:
    f.write('{"text": "ÿßŸÑÿÆÿØŸÖÿ© ŸÖŸÖÿ™ÿßÿ≤ÿ© Ÿàÿ±ÿÆŸäÿµÿ©", "label": 2}\n')


In [18]:
# 1) ÿ™ÿ´ÿ®Ÿäÿ™ Gradio (ŸÖÿ±ÿ© Ÿàÿßÿ≠ÿØÿ©)
!pip -q install gradio

# 2) Ÿàÿßÿ¨Ÿáÿ© ÿßŸÑÿßÿ≥ÿ™ÿØŸÑÿßŸÑ
import torch, gradio as gr

# ÿßÿ≥ÿ™ÿÆÿØÿßŸÖ ÿ£ÿ≥ŸÖÿßÿ° ÿßŸÑÿ™ÿµŸÜŸäŸÅÿßÿ™ ÿ•ŸÜ ŸÉÿßŸÜÿ™ ŸÖÿπÿ±ŸëŸÅÿ© ŸÖÿ≥ÿ®ŸÇŸãÿßÿå Ÿàÿ•ŸÑÿß ŸÜŸÇÿ±ÿ£Ÿáÿß ŸÖŸÜ config
def _get_labels(num_classes):
    if 'label_names' in globals() and isinstance(label_names, (list, tuple)) and len(label_names)==num_classes:
        return list(label_names)
    id2label = getattr(getattr(model, 'config', None), 'id2label', None)
    if isinstance(id2label, dict) and len(id2label)==num_classes:
        # ÿßÿ∂ŸÖŸÜ ÿ™ÿ±ÿ™Ÿäÿ®Ÿáÿß 0..N-1
        return [id2label[i] for i in range(num_classes)]
    return [f"label_{i}" for i in range(num_classes)]

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

# ÿØÿßŸÑÿ© Ÿàÿßÿ≠ÿØÿ© ÿ™Ÿèÿ±ÿ¨ÿπ ÿØÿ±ÿ¨ÿßÿ™ ŸÉŸÑ ÿßŸÑŸÅÿ¶ÿßÿ™ (ŸÖŸÜÿßÿ≥ÿ®ÿ© ŸÑŸÄ gr.Label)
def gr_predict(text):
    if not text or not text.strip():
        return {"(ÿ£ÿØÿÆŸÑ ŸÜÿµŸãÿß ÿπÿ±ÿ®ŸäÿßŸã)": 1.0}
    with torch.no_grad():
        enc = tokenizer(
            text, return_tensors="pt",
            truncation=True, padding=True, max_length=128
        )
        enc = {k: v.to(device) for k, v in enc.items()}
        logits = model(**enc).logits  # [1, num_labels]
        probs = torch.softmax(logits, dim=-1).squeeze(0).tolist()
    labels = _get_labels(len(probs))
    return {labels[i]: float(probs[i]) for i in range(len(labels))}

# ÿ£ŸÖÿ´ŸÑÿ© ÿ¨ÿßŸáÿ≤ÿ©
examples = [
    "ÿßŸÑÿßŸÇÿ™ÿµÿßÿØ ÿßŸÑÿπÿßŸÑŸÖŸä ŸäŸàÿßÿ¨Ÿá ÿ™ÿ≠ÿØŸäÿßÿ™ ÿπÿØŸäÿØÿ©.",
    "ÿ≠ŸÇŸÇ ÿßŸÑŸÜÿßÿØŸä ŸÅŸàÿ≤Ÿãÿß ŸÖŸáŸÖŸãÿß ŸÅŸä ÿ®ÿ∑ŸàŸÑÿ© ÿßŸÑÿØŸàÿ±Ÿä.",
    "ÿ£ÿ∑ŸÑŸÇÿ™ ÿßŸÑÿ¥ÿ±ŸÉÿ© Ÿáÿßÿ™ŸÅŸãÿß ÿ¨ÿØŸäÿØŸãÿß ÿ®ŸÖŸàÿßÿµŸÅÿßÿ™ ŸÇŸàŸäÿ©.",
    "ÿ£ÿπŸÑŸÜÿ™ ÿßŸÑŸàÿ≤ÿßÿ±ÿ© ÿπŸÜ ŸÇÿ±ÿßÿ±ÿßÿ™ ÿ™ŸÜÿ∏ŸäŸÖŸäÿ© ÿ¨ÿØŸäÿØÿ©.",
]

# 3) ÿ•ŸÜÿ¥ÿßÿ° Ÿàÿ™ÿ¥ÿ∫ŸäŸÑ ÿßŸÑŸàÿßÿ¨Ÿáÿ©
demo = gr.Interface(
    fn=gr_predict,
    inputs=gr.Textbox(lines=4, label="ÿ£ÿØÿÆŸÑ ŸÜÿµŸãÿß ÿπÿ±ÿ®ŸäŸãÿß ŸÑŸÑÿ™ÿµŸÜŸäŸÅ"),
    outputs=gr.Label(num_top_classes=3, label="ÿ£ÿπŸÑŸâ ÿßŸÑÿ™ÿµŸÜŸäŸÅÿßÿ™"),
    title="ÿ™ÿµŸÜŸäŸÅ ÿßŸÑŸÜÿµŸàÿµ ÿßŸÑÿπÿ±ÿ®Ÿäÿ©",
    description="ÿßŸÉÿ™ÿ® ÿ¨ŸÖŸÑÿ© ÿπÿ±ÿ®Ÿäÿ© Ÿàÿ≥ŸäŸèÿ∏Ÿáÿ± ÿßŸÑŸÜŸÖŸàÿ∞ÿ¨ ÿ£ÿπŸÑŸâ ÿßŸÑŸÅÿ¶ÿßÿ™ ŸÖÿπ ÿßŸÑÿßÿ≠ÿ™ŸÖÿßŸÑÿßÿ™.",
    examples=examples,
)

demo.launch(share=False)  # ÿßÿ¨ÿπŸÑŸá True ŸÑŸà ÿ™ÿ≠ÿ® ÿ±ÿßÿ®ÿ∑Ÿãÿß ÿπÿßŸÖŸãÿß (ÿßÿÆÿ™Ÿäÿßÿ±Ÿä)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [19]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-multilingual-cased"
tok = AutoTokenizer.from_pretrained(model_name)

def tok_fn(batch):
    return tok(batch["text"], truncation=True, padding="max_length", max_length=128)

ds_tok = ds.map(tok_fn, batched=True, remove_columns=[c for c in ds["train"].column_names if c not in ["text","label"]])

# Ensure 'labels' column exists
for split in ds_tok.keys():
    if "label" in ds_tok[split].column_names and "labels" not in ds_tok[split].column_names:
        ds_tok[split] = ds_tok[split].rename_column("label", "labels")

num_labels = 3  # 0=negative, 1=neutral, 2=positive
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



## Train
One short epoch for demo. Increase epochs and data for better results.


In [20]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [21]:
import os
os.environ["WANDB_DISABLED"] = "true"   # ÿ™ÿπÿ∑ŸäŸÑ wandb

from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"]
    }

args = TrainingArguments(
    output_dir="runs/mini",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    logging_steps=20,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    tokenizer=tok,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate()
metrics


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss


Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/gradio/queueing.py", line 626, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/route_utils.py", line 349, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 2274, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/gradio/blocks.py", line 1781, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
           ^^^^^

{'eval_loss': 1.0975292921066284,
 'eval_accuracy': 0.3333333333333333,
 'eval_f1_macro': 0.16666666666666666,
 'eval_runtime': 3.3531,
 'eval_samples_per_second': 1.789,
 'eval_steps_per_second': 0.298,
 'epoch': 1.0}


## Inference
Try your own sentences.


In [22]:

import torch

id2label = {0:"ÿ≥ŸÑÿ®Ÿä", 1:"ÿ≠ŸäÿßÿØŸä", 2:"ÿ•Ÿäÿ¨ÿßÿ®Ÿä"}

def predict(texts):
    enc = tok(texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        out = model(**enc)
        preds = out.logits.argmax(dim=-1).cpu().tolist()
    return [id2label[p] for p in preds]

examples = [
    "ÿßŸÑÿÆÿØŸÖÿ© ÿ≥Ÿäÿ¶ÿ© ÿ¨ÿØŸãÿß ŸàÿßŸÑÿ™ÿ£ÿÆŸäÿ± ŸÖÿ≤ÿπÿ¨.",
    "ÿßŸÑŸàÿ∂ÿπ ÿπÿßÿØŸä ŸàŸÖÿß ÿπŸÜÿØŸä ÿ™ÿπŸÑŸäŸÇ.",
    "ŸÖŸÖÿ™ÿßÿ≤! ÿ™ÿ¨ÿ±ÿ®ÿ© ÿ±ÿßÿ¶ÿπÿ© Ÿàÿ£ŸÜÿµÿ≠ ÿ®Ÿáÿß."
]

predict(examples)


['ÿ≠ŸäÿßÿØŸä', 'ÿ≠ŸäÿßÿØŸä', 'ÿ≠ŸäÿßÿØŸä']