<a href="https://colab.research.google.com/github/YoussefDiaa1/Project-ITI/blob/main/Deep_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [18]:
import os

# اختار مسار آمن
os.chdir("/content")

print("Now working inside:", os.getcwd())


Now working inside: /content


In [19]:
OUTPUT_DIR = "prepared_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Created:", os.path.abspath(OUTPUT_DIR))


Created: /content/prepared_data


In [20]:
!mkdir -p /content/prepared_data
OUTPUT_DIR = "/content/prepared_data"


In [31]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
import os
import json
import unicodedata
import pandas as pd
from collections import Counter
from sklearn.utils import resample
from imblearn.over_sampling import RandomOverSampler
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


In [21]:
# ---------- إعدادات ----------
AR_FILE = "/content/drive/MyDrive/ITI_Final_AI_Project/Review_Classification_Data_AR.csv"
EN_FILE = "/content/drive/MyDrive/ITI_Final_AI_Project/Review_Classification_Data_EN.csv"
OUTPUT_DIR = "./prepared_data"
RANDOM_STATE = 42
TEST_SIZE = 0.1   # نسبة الاختبار النهائية
VAL_SIZE = 0.1    # نسبة الفاليدشن من الباقي بعد استقطاع الاختبار
OVERSAMPLE = True  # لو True سيقوم بزيادة عينات الفئات الصغيرة

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [22]:
def prepare_ar(path):
    df = pd.read_csv(path)
    # نفترض الأعمدة: cleaned_review_description, label
    df = df.rename(columns={"cleaned_review_description": "text", "label": "label"})
    df["lang"] = "ar"
    # حذف الصفوف الفارغة في النص
    df = df[df["text"].notna() & (df["text"].astype(str).str.strip() != "")]
    return df[["text", "label", "lang"]]

In [23]:
def prepare_ar(path):
    df = pd.read_csv(path)
    # نفترض الأعمدة: cleaned_review_description, label
    df = df.rename(columns={"cleaned_review_description": "text", "label": "label"})
    df["lang"] = "ar"
    # حذف الصفوف الفارغة في النص
    df = df[df["text"].notna() & (df["text"].astype(str).str.strip() != "")]
    return df[["text", "label", "lang"]]

In [24]:
def prepare_en(path):
    df = pd.read_csv(path)
    # نفترض الأعمدة: review, sentiment
    df = df.rename(columns={"review": "text", "sentiment": "label"})
    df["lang"] = "en"
    # لو القيم نصية 'positive'/'negative' نحولهم لأرقام
    if df["label"].dtype == object:
        df["label"] = df["label"].astype(str).str.strip().str.lower()
        df["label"] = df["label"].map({"negative": 0, "positive": 1}).where(
            df["label"].isin([0, 1]), other=df["label"]
        )
    # حذف الصفوف الفارغة في النص
    df = df[df["text"].notna() & (df["text"].astype(str).str.strip() != "")]
    return df[["text", "label", "lang"]]

In [27]:
df_ar = prepare_ar(AR_FILE)
df_en = prepare_en(EN_FILE)

df = pd.concat([df_ar, df_en], ignore_index=True)
print("Merged dataset size:", len(df))
print("Languages distribution:", df["lang"].value_counts().to_dict())

Merged dataset size: 81079
Languages distribution: {'en': 50000, 'ar': 31079}


In [28]:
if df["label"].dtype == object:
    df["label"] = df["label"].astype(str).str.strip()
    label_values = sorted(df["label"].unique().tolist())
    label_map = {lab: i for i, lab in enumerate(label_values)}
    df["label"] = df["label"].map(label_map)
    # حفظ الـ mapping
    with open(os.path.join(OUTPUT_DIR, "label_map.json"), "w", encoding="utf-8") as f:
        json.dump(label_map, f, ensure_ascii=False, indent=2)
    print("Label mapping (saved):", label_map)
else:
    # لو الأرقام 0/1 موجودة، نحفظ خريطة بسيطة
    uniques = sorted(df["label"].unique().tolist())
    label_map = {str(v): int(v) for v in uniques}
    with open(os.path.join(OUTPUT_DIR, "label_map.json"), "w", encoding="utf-8") as f:
        json.dump(label_map, f, ensure_ascii=False, indent=2)
    print("Numeric labels detected, label_map saved:", label_map)

print("Label distribution:", df["label"].value_counts().to_dict())

Label mapping (saved): {'0': 0, '1': 1, 'negative': 2, 'positive': 3}
Label distribution: {2: 25000, 3: 25000, 1: 18407, 0: 12672}


In [29]:
OVERSAMPLE = True
RANDOM_STATE = 42
if OVERSAMPLE:
    counts = df["label"].value_counts()
    max_count = counts.max()
    dfs = []
    for label_val, group in df.groupby("label"):
        if len(group) < max_count:
            upsampled = resample(
                group,
                replace=True,
                n_samples=max_count,
                random_state=RANDOM_STATE
            )
            dfs.append(upsampled)
        else:
            dfs.append(group)
    df = pd.concat(dfs).sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    print("After oversampling label distribution:", df["label"].value_counts().to_dict())


After oversampling label distribution: {3: 25000, 0: 25000, 2: 25000, 1: 25000}


In [30]:
trainval_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=df["label"])
val_relative_size = VAL_SIZE / (1.0 - TEST_SIZE)
train_df, val_df = train_test_split(trainval_df, test_size=val_relative_size, random_state=RANDOM_STATE, stratify=trainval_df["label"])

print("Final sizes -> train:", len(train_df), "val:", len(val_df), "test:", len(test_df))

# ---------- حفظ CSVs جاهزة للتدريب (text,label,lang) ----------
for name, d in [("train.csv", train_df), ("validation.csv", val_df), ("test.csv", test_df), ("merged_all.csv", df)]:
    out_path = os.path.join(OUTPUT_DIR, name)
    d[["text", "label", "lang"]].to_csv(out_path, index=False)
    print("Saved:", out_path)

print("Done. Files in:", OUTPUT_DIR)

Final sizes -> train: 80000 val: 10000 test: 10000
Saved: ./prepared_data/train.csv
Saved: ./prepared_data/validation.csv
Saved: ./prepared_data/test.csv
Saved: ./prepared_data/merged_all.csv
Done. Files in: ./prepared_data


In [32]:
# -----------------------------
# إعداد المسارات والمتغيرات
# -----------------------------
DATA_DIR = "./prepared_data"       # المكان اللي فيه CSVs
MODEL_NAME = "xlm-roberta-base"    # أو "xlm-roberta-large" لو GPU كبير
OUTPUT_DIR = "./multilang_model"
NUM_LABELS = 4                     # حسب الـ label mapping اللي عندك
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [33]:
# -----------------------------
# تحميل البيانات من CSV
# -----------------------------
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "validation.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [34]:
# -----------------------------
# Tokenizer
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(batch["text"].tolist(),
                     padding="max_length",
                     truncation=True,
                     max_length=MAX_LEN,
                     return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [35]:
# -----------------------------
# Dataset class لـ Hugging Face
# -----------------------------
class ReviewsDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.texts = df["text"].tolist()
        self.labels = df["label"].tolist()
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx],
                             padding="max_length",
                             truncation=True,
                             max_length=MAX_LEN,
                             return_tensors="pt")
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_dataset = ReviewsDataset(train_df)
val_dataset   = ReviewsDataset(val_df)
test_dataset  = ReviewsDataset(test_df)

In [36]:
# -----------------------------
# Metrics function
# -----------------------------
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "precision_macro": precision_score(labels, preds, average="macro"),
        "recall_macro": recall_score(labels, preds, average="macro")
    }


In [42]:
# -----------------------------
# تحميل موديل للتصنيف
# -----------------------------
model_T = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# -----------------------------
# TrainingArguments
# -----------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    fp16=torch.cuda.is_available()
)

In [44]:
# -----------------------------
# Trainer
# -----------------------------
trainer = Trainer(
    model=model_T,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# -----------------------------
# تدريب الموديل
# -----------------------------
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mamrhassanhd[0m ([33mamrhassanhd-ieee[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Precision Macro,Recall Macro
1,0.3161,0.303574,0.877,0.876898,0.878283,0.877
2,0.2664,0.286916,0.896,0.895989,0.896185,0.896
3,0.2082,0.312626,0.9014,0.901383,0.901656,0.9014


TrainOutput(global_step=15000, training_loss=0.2893467235565186, metrics={'train_runtime': 4253.1296, 'train_samples_per_second': 56.429, 'train_steps_per_second': 3.527, 'total_flos': 3.157389361152e+16, 'train_loss': 0.2893467235565186, 'epoch': 3.0})

In [None]:
# -----------------------------
# تقييم على Test set
# -----------------------------
results = trainer.evaluate(test_dataset)
print("Test set results:", results)

Test set results: {'eval_loss': 0.3137753903865814, 'eval_accuracy': 0.901, 'eval_f1_macro': 0.9009891693486665, 'eval_precision_macro': 0.9011521077161999, 'eval_recall_macro': 0.901, 'eval_runtime': 42.1546, 'eval_samples_per_second': 237.222, 'eval_steps_per_second': 14.826, 'epoch': 3.0}


In [None]:
# -----------------------------
# حفظ الموديل النهائي
# -----------------------------
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model and tokenizer saved to {OUTPUT_DIR}")

Model and tokenizer saved to ./multilang_model


In [16]:
# import shutil

# OUTPUT_DIR = "./multilang_model"
# ZIP_PATH = OUTPUT_DIR + ".zip"
# shutil.make_archive(OUTPUT_DIR, 'zip', OUTPUT_DIR)

# print("Created ZIP file:", ZIP_PATH)


In [14]:
# from huggingface_hub import HfApi, upload_folder, create_repo

# # تسجيل الدخول
# from huggingface_hub import login
# login()  # حط الـ token هنا

# # اسم الريبو
# REPO_ID = "amrhassank/Review_Classification"

# # إنشاء الريبو لو مش موجود
# create_repo(repo_id=REPO_ID, exist_ok=True)  # exist_ok=True مش هيعمل مشكلة لو الريبو موجود بالفعل

# # رفع الموديل
# OUTPUT_DIR = "./multilang_model"
# upload_folder(
#     repo_id=REPO_ID,
#     folder_path=OUTPUT_DIR,
#     path_in_repo="."
# )

# print("✅ Model uploaded successfully to:", f"https://huggingface.co/{REPO_ID}")


In [15]:
# !pip install -U "huggingface_hub[cli]" --quiet
# !hf auth login
# !hf upload amrhassank/Review_Classification ./multilang_model

In [12]:
# pip install gradio transformers torch

In [None]:
# import gradio as gr
# import torch
# from transformers import pipeline

# # -----------------------------
# # تحميل الموديل
# # -----------------------------
# MODEL_PATH = "./multilang_model"  # نفس مكان حفظ الموديل
# device = 0 if torch.cuda.is_available() else -1

# classifier = pipeline(
#     "text-classification",
#     model=MODEL_PATH,
#     tokenizer=MODEL_PATH,
#     device=device
# )
# label_map = {
#     "LABEL_0": "negative",
#     "LABEL_1": "positive",
#     "LABEL_2": "negative",
#     "LABEL_3": "positive"
# }
# # -----------------------------
# # دالة التوقع
# # -----------------------------
# def predict_review(text):
#     result = classifier(text)[0]
#     label = result["label"]
#     score = result["score"]
#     mapped_label = label_map.get(label, "unknown")
#     return f"Predicted sentiment: {mapped_label}, confidence: {score:.4f}"

# # -----------------------------
# # واجهة Gradio
# # -----------------------------
# interface = gr.Interface(
#     fn=predict_review,
#     inputs=gr.Textbox(lines=5, placeholder="اكتب مراجعة هنا بالعربي أو الإنجليزي..."),
#     outputs="text",
#     title="Multilingual Review Classifier",
#     description="ادخل مراجعة بالإنجليزي أو العربي، واحصل على توقع الإيجابي/السلبي للموديل."
# )

# # تشغيل الواجهة
# interface.launch(share=True)  # share=True لو عايز رابط مباشر للويب


In [None]:
from transformers import pipeline

classifier = pipeline("text-classification", model="amrhassank/Review_Classification")
result = classifier("This product is amazing!")
print(result)


In [11]:
import gradio as gr
from transformers import pipeline

classifier = pipeline("text-classification", model="amrhassank/Review_Classification")

id2label = {
    "LABEL_0": "negative",
    "LABEL_1": "positive",
    "LABEL_2": "negative",
    "LABEL_3": "positive"
}

def classify_text(text):
    result = classifier(text)[0]
    return id2label[result["label"]], float(result["score"])

demo = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=3, placeholder="Enter your review here..."),
    outputs=["label", "number"],
    title="Review Classification",
    description="Enter a review and the model will classify it."
)

demo.launch(share=True)


Device set to use cpu


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c1318968c457482981.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/923 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'LABEL_1', 'score': 0.8720827698707581}]
