<a href="https://colab.research.google.com/github/YoussefDiaa1/Project-ITI/blob/main/Deep_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [10]:
import os

# اختار مسار آمن
os.chdir("/content")

print("Now working inside:", os.getcwd())


Now working inside: /content


In [11]:
OUTPUT_DIR = "prepared_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Created:", os.path.abspath(OUTPUT_DIR))


Created: /content/prepared_data


In [8]:
!mkdir -p /content/prepared_data
OUTPUT_DIR = "/content/prepared_data"


shell-init: error retrieving current directory: getcwd: cannot access parent directories: No such file or directory


In [13]:
# file: prepare_multilang_reviews.py
import re
import os
import json
import unicodedata
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.utils import resample

# ---------- إعدادات ----------
AR_FILE = "/content/drive/MyDrive/ITI_Final_AI_Project/Review_Classification_Data_AR.csv"
EN_FILE = "/content/drive/MyDrive/ITI_Final_AI_Project/Review_Classification_Data_EN.csv"
OUTPUT_DIR = "prepared_data"
RANDOM_STATE = 42
TEST_SIZE = 0.1   # نسبة الاختبار النهائية
VAL_SIZE = 0.1    # نسبة الفاليدشن من الباقي بعد استقطاع الاختبار
OVERSAMPLE = True  # لو True سيقوم بزيادة عينات الفئات الصغيرة
OUTPUT_DIR = "./prepared_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- دوال مساعدة للتنظيف ----------
def remove_urls(text):
    return re.sub(r'http\S+|www\.\S+', ' ', text)

def remove_html(text):
    return re.sub(r'<[^>]+>', ' ', text)

def remove_control_chars(text):
    return ''.join(ch for ch in text if unicodedata.category(ch)[0] != "C")

def normalize_whitespace(text):
    return re.sub(r'\s+', ' ', text).strip()

# Arabic-specific normalization (light)
AR_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]')
def normalize_arabic(text):
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_control_chars(text)
    text = AR_DIACRITICS.sub('', text)            # remove tashkeel
    text = re.sub(r'[\u0640]', '', text)          # remove tatweel
    # normalize Alef variants
    text = re.sub(r'[إأآﺇﺃآ]', 'ا', text)
    # normalize Ya and Alef Maqsura
    text = re.sub(r'[ى]', 'ي', text)
    # hamza forms to bare hamza
    text = re.sub(r'[ؤئ]', 'ء', text)
    # optionally keep ة as is (tāʾ marbuta) — we keep it
    # remove extra punctuation except basic
    text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
    return normalize_whitespace(text)

# English-specific normalization (light)
def normalize_english(text):
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_control_chars(text)
    text = text.lower()
    text = re.sub(r'[^0-9a-zA-Z\s\']', ' ', text)  # keep apostrophes
    return normalize_whitespace(text)

# generic cleaner that calls language-specific normalizers
def clean_text(text, lang):
    if not isinstance(text, str):
        text = str(text) if pd.notna(text) else ""
    text = text.strip()
    if lang == "ar":
        return normalize_arabic(text)
    else:
        return normalize_english(text)

# ---------- محاولة اكتشاف أعمدة النص والـ label تلقائيًا ----------
def pick_text_label_columns(df):
    text_candidates = ["text", "review", "content", "sentence", "comment", "body"]
    label_candidates = ["label", "labels", "sentiment", "rating", "target", "class"]
    text_col = None
    label_col = None
    cols = [c.lower() for c in df.columns]

    # find text col
    for cand in text_candidates:
        for c in df.columns:
            if c.lower() == cand:
                text_col = c
                break
        if text_col: break

    # fallback: any object/string column
    if text_col is None:
        for c in df.columns:
            if df[c].dtype == object:
                text_col = c
                break

    # find label col
    for cand in label_candidates:
        for c in df.columns:
            if c.lower() == cand:
                label_col = c
                break
        if label_col: break

    # fallback: any numeric or categorical column with few unique values
    if label_col is None:
        for c in df.columns:
            if df[c].dtype in [int, float] or df[c].dtype == object:
                if df[c].nunique() <= 20 and df[c].nunique() > 1:
                    label_col = c
                    break

    return text_col, label_col

# ---------- قراءة الملفات وتوحيد الأعمدة ----------
def read_and_prepare(path, lang_tag):
    df = pd.read_csv(path)
    text_col, label_col = pick_text_label_columns(df)
    if text_col is None or label_col is None:
        raise ValueError(f"Couldn't auto-detect text/label columns in {path}. Cols: {df.columns.tolist()}")
    df = df[[text_col, label_col]].rename(columns={text_col: "text", label_col: "label"})
    df["lang"] = lang_tag
    # clean
    df["text"] = df["text"].apply(lambda t: clean_text(t, lang_tag))
    # drop empty texts
    df = df[df["text"].str.strip() != ""]
    # reset index
    df = df.reset_index(drop=True)
    return df

# ---------- دمج الملفين ----------
df_ar = read_and_prepare(AR_FILE, "ar")
df_en = read_and_prepare(EN_FILE, "en")
df = pd.concat([df_ar, df_en], ignore_index=True)
print("Merged dataset size:", len(df))
print("Languages distribution:", df["lang"].value_counts().to_dict())

# ---------- تأكد من أن الـ labels هي أرقام 0..K-1 (map إذا كانت نصية) ----------
df["label"] = df["label"].astype(str).str.strip()  # نخليها كلها نصوص

label_values = sorted(df["label"].unique().tolist())
label_map = {lab: i for i, lab in enumerate(label_values)}

df["label"] = df["label"].map(label_map)  # حول النصوص لأرقام

print("Label mapping:", label_map)
print("Label distribution:", df["label"].value_counts().to_dict())

# حفظ الـ mapping
with open(os.path.join(OUTPUT_DIR, "label_map.json"), "w", encoding="utf-8") as f:
    json.dump(label_map, f, ensure_ascii=False, indent=2)

# ---------- حل مشكلة عدم التوازن (اختياري - oversample) ----------
if OVERSAMPLE:
    counts = df["label"].value_counts()
    max_count = counts.max()
    dfs = []
    for label_val, group in df.groupby("label"):
        if len(group) < max_count:
            upsampled = resample(group, replace=True, n_samples=max_count, random_state=RANDOM_STATE)
            dfs.append(upsampled)
        else:
            dfs.append(group)
    df = pd.concat(dfs).sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    print("After oversampling label distribution:", df["label"].value_counts().to_dict())

# ---------- تقسيم Stratified إلى train / val / test ----------
# أولاً فصل الاختبار النهائي
trainval_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df["label"])
# ثم من الباقي نأخذ validation
val_relative_size = VAL_SIZE / (1.0 - TEST_SIZE)
train_df, val_df = train_test_split(trainval_df, test_size=val_relative_size, random_state=RANDOM_STATE, stratify=trainval_df["label"])

print("Final sizes -> train:", len(train_df), "val:", len(val_df), "test:", len(test_df))
print("Train label dist:", train_df["label"].value_counts().to_dict())

# ---------- حفظ CSVs جاهزة للتدريب (text,label,lang) ----------
for name, d in [("train.csv", train_df), ("validation.csv", val_df), ("test.csv", test_df), ("merged_all.csv", df)]:
    out_path = os.path.join(OUTPUT_DIR, name)
    d[["text", "label", "lang"]].to_csv(out_path, index=False)
    print("Saved:", out_path)

print("Done. Files in:", OUTPUT_DIR)


Merged dataset size: 81078
Languages distribution: {'en': 50000, 'ar': 31078}
Label mapping: {'0': 0, '1': 1, 'negative': 2, 'positive': 3}
Label distribution: {2: 25000, 3: 25000, 1: 18406, 0: 12672}
After oversampling label distribution: {3: 25000, 0: 25000, 2: 25000, 1: 25000}
Final sizes -> train: 80000 val: 10000 test: 10000
Train label dist: {0: 20000, 2: 20000, 3: 20000, 1: 20000}
Saved: ./prepared_data/train.csv
Saved: ./prepared_data/validation.csv
Saved: ./prepared_data/test.csv
Saved: ./prepared_data/merged_all.csv
Done. Files in: ./prepared_data


In [14]:
# file: train_multilang_reviews.py

import os
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# -----------------------------
# إعداد المسارات والمتغيرات
# -----------------------------
DATA_DIR = "./prepared_data"       # المكان اللي فيه CSVs
MODEL_NAME = "xlm-roberta-base"    # أو "xlm-roberta-large" لو GPU كبير
OUTPUT_DIR = "./multilang_model"
NUM_LABELS = 4                     # حسب الـ label mapping اللي عندك
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

os.makedirs(OUTPUT_DIR, exist_ok=True)

# -----------------------------
# تحميل البيانات من CSV
# -----------------------------
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "validation.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

# -----------------------------
# Tokenizer
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(batch["text"].tolist(),
                     padding="max_length",
                     truncation=True,
                     max_length=MAX_LEN,
                     return_tensors="pt")

# -----------------------------
# Dataset class لـ Hugging Face
# -----------------------------
class ReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx],
                             padding="max_length",
                             truncation=True,
                             max_length=MAX_LEN,
                             return_tensors="pt")
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = ReviewsDataset(train_df)
val_dataset   = ReviewsDataset(val_df)
test_dataset  = ReviewsDataset(test_df)

# -----------------------------
# Metrics function
# -----------------------------
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision_macro": precision_score(labels, preds, average="macro"),
        "recall_macro": recall_score(labels, preds, average="macro")
    }

# -----------------------------
# تحميل موديل للتصنيف
# -----------------------------
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# -----------------------------
# TrainingArguments
# -----------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    fp16=torch.cuda.is_available()
)

# -----------------------------
# Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# -----------------------------
# تدريب الموديل
# -----------------------------
trainer.train()

# -----------------------------
# تقييم على Test set
# -----------------------------
results = trainer.evaluate(test_dataset)
print("Test set results:", results)

# -----------------------------
# حفظ الموديل النهائي
# -----------------------------
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model and tokenizer saved to {OUTPUT_DIR}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'