In [None]:
# +code
#=====>0 تثبيت المكتبات الضرورية ---
!pip install transformers datasets torch scikit-learn nltk imbalanced-learn fastapi uvicorn -q


In [None]:
# +code
#====>1  استيراد المكتبات ---
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import nltk
from nltk.corpus import stopwords
from imblearn.over_sampling import RandomOverSampler

# Download the NLTK stopwords corpus (the Arabic list is read from it in a later cell)
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# +code
#====>1  استيراد المكتبات ---
import pandas as pd

#====>2  Build one example DataFrame per data source ---
# Each source uses its own column names; the two-row frames below mimic that.

# GitHub dataset
data_github = {
    'Message': ["لقد ربحت جائزة اضغط على الرابط", "موعد اجتماعك غداً الساعة 10"],
    'Category': ["spam", "ham"]
}
df_github = pd.DataFrame(data_github)

# UCI dataset
data_uci = {
    'V1': ["ارسل معلوماتك البنكية", "محاضرة اليوم الساعة 2"],
    'V2': ["spam", "ham"]
}
df_uci = pd.DataFrame(data_uci)

# Mendeley dataset
data_mendeley = {
    'text_message': ["تحقق من رصيدك الآن", "الاجتماع تم تأجيله"],
    'class': ["spam", "ham"]
}
df_mendeley = pd.DataFrame(data_mendeley)

# ===> 3 Rename the columns so every source shares the same schema ---
df_github.columns = ['text', 'label']
df_uci.columns = ['text', 'label']
df_mendeley.columns = ['text', 'label']

#====>4  Merge all sources into one frame ---
df_all = pd.concat([df_github, df_uci, df_mendeley], ignore_index=True)

#====>5  Convert the labels to 0 and 1 ---
LABEL_TO_ID = {'ham': 0, 'spam': 1}


def encode_labels(labels):
    """Map ham/spam label strings to 0/1 integers.

    Labels are compared case-insensitively and with surrounding whitespace
    removed. A plain ``Series.map`` would silently turn any other value into
    NaN (and the column into float); this helper raises instead.

    Args:
        labels: pandas Series of label values.

    Returns:
        pandas Series of int64 values, 0 for ham and 1 for spam.

    Raises:
        ValueError: if any value is neither 'ham' nor 'spam' after normalization.
    """
    normalized = labels.astype(str).str.strip().str.lower()
    unknown = sorted(set(normalized) - set(LABEL_TO_ID))
    if unknown:
        raise ValueError(f"Unexpected label values: {unknown}")
    return normalized.map(LABEL_TO_ID).astype('int64')


df_all['label'] = encode_labels(df_all['label'])

#====>6  Final check ---
print("Merged Dataset:")
print(df_all)
print("\nColumn Names:", df_all.columns)
print("Labels distribution:\n", df_all['label'].value_counts())


Merged Dataset:
                             text  label
0  لقد ربحت جائزة اضغط على الرابط      1
1     موعد اجتماعك غداً الساعة 10      0
2           ارسل معلوماتك البنكية      1
3           محاضرة اليوم الساعة 2      0
4              تحقق من رصيدك الآن      1
5              الاجتماع تم تأجيله      0

Column Names: Index(['text', 'label'], dtype='object')
Labels distribution:
 label
1    3
0    3
Name: count, dtype: int64


In [None]:
import re
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK 'stopwords' and 'punkt' packages are available locally
nltk.download('stopwords')
nltk.download('punkt')

# Set of Arabic stopwords; clean_text() drops any token found in it
arabic_stopwords = set(stopwords.words('arabic'))

def clean_text(text, stop_words=None):
    """Normalize an Arabic message into a space-joined string of tokens.

    Processing steps: lowercase, remove punctuation, digits and diacritics,
    drop stopwords, and rewrite the alef variants (U+0622, U+0623, U+0625)
    to bare alef (U+0627).

    Stopwords are checked twice per token: once on the spelling as written
    and once after alef normalization. Checking only after normalization
    means a stopword stored with a hamza-carrying alef (for example U+0625
    in "إلى") could never match, because the token has already been
    rewritten to bare alef by then.

    Args:
        text: value to clean; converted with ``str()``.
        stop_words: optional collection of stopwords to remove. Defaults to
            the module-level ``arabic_stopwords`` set.

    Returns:
        str: the remaining tokens joined by single spaces ("" if none remain).
    """
    if stop_words is None:
        stop_words = arabic_stopwords

    text = str(text).lower()
    # Anything that is neither a word character nor whitespace is dropped;
    # combining marks are not word characters for `re`, so they go here too.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Tanween, short vowels and shadda (U+064B..U+0651), kept as an explicit step.
    text = re.sub(r'[\u064B-\u0651]', '', text)

    cleaned_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        token = re.sub(r'[\u0622\u0623\u0625]', '\u0627', token)
        if token in stop_words:
            continue
        cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens)

# Store the cleaned version of every message next to the raw text
df_all['clean_text'] = df_all['text'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Make sure the column holds plain strings, with missing values replaced by ""
df_all['clean_text'] = df_all['clean_text'].fillna("").astype(str)

# Word and character counts of the cleaned text
df_all['word_count'] = df_all['clean_text'].apply(lambda x: len(x.split()))
df_all['char_count'] = df_all['clean_text'].apply(lambda x: len(x))

# Spam-indicator keywords
keywords = ['رابط','ربح','تحقق','مكافأة','ارسال']

# clean_text() rewrites the alef variants (U+0622, U+0623, U+0625) to bare alef,
# so a keyword written with one of them (e.g. 'مكافأة') would never be found in
# the cleaned text. Apply the same mapping to the search term; the column name
# keeps the keyword exactly as listed above.
ALEF_VARIANTS_TO_BARE = str.maketrans('\u0622\u0623\u0625', '\u0627' * 3)
for kw in keywords:
    needle = kw.translate(ALEF_VARIANTS_TO_BARE)
    df_all[f'keyword_{kw}'] = df_all['clean_text'].apply(lambda x, needle=needle: x.count(needle))

# Feature columns
feature_cols = ['word_count', 'char_count'] + [f'keyword_{kw}' for kw in keywords]

print("Feature columns:", feature_cols)
print(df_all.head())


Feature columns: ['word_count', 'char_count', 'keyword_رابط', 'keyword_ربح', 'keyword_تحقق', 'keyword_مكافأة', 'keyword_ارسال']
                             text  label                  clean_text  \
0  لقد ربحت جائزة اضغط على الرابط      1  لقد ربحت جائزة اضغط الرابط   
1     موعد اجتماعك غداً الساعة 10      0         موعد اجتماعك الساعة   
2           ارسل معلوماتك البنكية      1       ارسل معلوماتك البنكية   
3           محاضرة اليوم الساعة 2      0         محاضرة اليوم الساعة   
4              تحقق من رصيدك الآن      1             تحقق رصيدك الان   

   word_count  char_count  keyword_رابط  keyword_ربح  keyword_تحقق  \
0           5          26             1            1             0   
1           3          19             0            0             0   
2           3          21             0            0             0   
3           3          19             0            0             0   
4           3          15             0            0             1   

   keyword_مكافأة 

In [None]:
# +code
!pip install transformers torch --quiet
from transformers import AutoTokenizer

# Hugging Face checkpoint name; reused below when the classification model is loaded
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize every cleaned message in one call, with truncation and padding enabled
encodings = tokenizer(df_all['clean_text'].tolist(), truncation=True, padding=True)


In [None]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `khawlah` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `khawlah`


In [None]:
# +code
import torch

class SMSDataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-sample sequence;
    ``labels`` is a sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for sample `idx`, plus its label.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap the full-corpus encodings and labels, then print the first item as a sanity check.
# Requires `encodings` from the tokenizer cell; running this cell before that one
# raises NameError.
dataset = SMSDataset(encodings, df_all['label'].tolist())
print("Dataset example:", dataset[0])


NameError: name 'encodings' is not defined

In [None]:
# +code
!pip install scikit-learn --quiet
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
# +code
# Load the checkpoint named by `model_name` with a sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Note: num_labels=2 because messages are classified as either spam or ham


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# +code
def compute_metrics(p):
    """Compute accuracy, F1, precision and recall for binary predictions.

    Args:
        p: object exposing ``predictions`` (per-class scores, reduced with
            argmax over the last axis) and ``label_ids`` (true labels).
            Presumably a transformers ``EvalPrediction`` - TODO confirm.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    preds = p.predictions.argmax(-1)  # pick the class with the highest score
    # zero_division=0 reports undefined precision/recall (no positive
    # predictions or no positive labels) as 0 without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(p.label_ids, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [None]:
# +code
# تحديث مكتبة transformers
!pip install --upgrade transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# +code
from transformers import TrainingArguments

# NOTE(review): no Trainer is constructed in the visible cells; the training
# cell further down runs a manual loop with its own batch size and learning rate.
training_args = TrainingArguments(
    output_dir="./results",             # model and logs are saved here
    num_train_epochs=3,                 # number of epochs
    per_device_train_batch_size=4,      # batch size during training
    per_device_eval_batch_size=4,       # batch size during evaluation
    logging_dir="./logs",               # directory where logs are written
    logging_steps=10,                   # log every 10 steps
    learning_rate=5e-5,
    weight_decay=0.01
)


In [None]:
# +code
from transformers import AutoModelForSequenceClassification

# Load the Arabic BERT model with 2 classes (spam / ham).
# This rebinds `model_name` and `model`, replacing any model loaded by an earlier cell.
model_name = "aubmindlab/bert-base-arabertv02"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# +code
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# ==== Split the data into train and test sets ====
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df_all['clean_text'].tolist(),   # texts
    df_all['label'].tolist(),        # labels
    test_size=0.2,                   # 20% held out for testing
    random_state=42,
    stratify=df_all['label']         # keep the spam/ham ratio in both splits
)

# ==== Convert the texts to encodings with the tokenizer ====
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

# ==== Build a Dataset for each split ====
# The dataset class defined in this notebook is SMSDataset; the name
# `SpamDataset` is not defined anywhere and raised NameError on a fresh kernel.
train_dataset = SMSDataset(train_encodings, train_labels)
test_dataset = SMSDataset(test_encodings, test_labels)

# ==== Build a DataLoader for each split ====
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

print("Train/Test datasets and DataLoaders are ready!")


Train/Test datasets and DataLoaders are ready!


In [None]:
# +code
import torch
from torch import nn
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ==== Select the device (GPU when available) ====
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print("Using device:", device)

# ==== Optimizer ====
optimizer = AdamW(model.parameters(), lr=5e-5)

# ==== Number of epochs ====
epochs = 3

# ==== Training loop ====
for epoch in range(epochs):
    model.train()  # put the model in training mode
    total_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()

        # Move the batch to the selected device (GPU/CPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass; the loss is read from the model output
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass + parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)

    # ==== Evaluation after every epoch ====
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # ==== Metrics ====
    acc = accuracy_score(all_labels, all_preds)
    # zero_division=0: when the model predicts no positive sample, precision is
    # undefined; report it as 0 without emitting a warning on every epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='binary', zero_division=0
    )

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Test Acc: {acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")


Using device: cpu


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/3 | Train Loss: 0.7728 | Test Acc: 0.5000 | Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/3 | Train Loss: 0.4504 | Test Acc: 0.5000 | Precision: 0.0000 | Recall: 0.0000 | F1: 0.0000
Epoch 3/3 | Train Loss: 0.3485 | Test Acc: 0.5000 | Precision: 0.5000 | Recall: 1.0000 | F1: 0.6667


In [None]:
# Final evaluation on the test set
model.eval()
all_preds = []
all_probs = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=1)  # per-class probabilities
        preds = torch.argmax(probs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_probs.extend(probs.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Metrics
acc = accuracy_score(all_labels, all_preds)
# zero_division=0: undefined precision/recall (no positive predictions or
# labels) is reported as 0 without emitting a warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='binary', zero_division=0
)

print("=== Final Test Metrics ===")
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


NameError: name 'model' is not defined

In [None]:
# +code
from fastapi import FastAPI
from pydantic import BaseModel
import torch

# ==== Initialize the application ====
app = FastAPI()

# ==== Model + tokenizer are expected to exist already (created in earlier cells) ====
model.eval()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# ==== Request data model ====
# Request body accepted by the /predict/ endpoint: a single string field `text`.
class Message(BaseModel):
    text: str

# ==== Prediction function ====
def predict_message(text, confidence_threshold=0.6):
    """Classify one message as spam / not spam with the fine-tuned model.

    The model in this notebook is trained on ``clean_text`` output, so the
    incoming text goes through the same cleaning before tokenization.
    Feeding raw text would present the model with punctuation, digits and
    stopwords it never saw during training.

    Args:
        text: raw message text.
        confidence_threshold: minimum probability of the predicted class for
            a definite answer; below it the message is reported as suspected.

    Returns:
        dict with "classification" (str) and "confidence" (probability of
        the predicted class as a percentage, rounded to 2 decimals).
    """
    cleaned = clean_text(text)
    encoding = tokenizer([cleaned], truncation=True, padding=True, return_tensors="pt")
    encoding = {key: val.to(device) for key, val in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)
        probs = torch.softmax(outputs.logits, dim=1)
        pred_class = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred_class].item()

    if confidence < confidence_threshold:   # the model is not sure either way
        result = "This message is suspected to be a scam."
    else:
        result = "spam" if pred_class == 1 else "Not spam"

    return {"classification": result, "confidence": round(confidence*100, 2)}

# ==== Create the endpoint ====
# POST /predict/ takes a Message body and returns the dict built by predict_message().
@app.post("/predict/")
def predict(message: Message):
    return predict_message(message.text)


In [None]:
# تثبيت ngrok لتشغيل API على Colab
!pip install pyngrok -q
from pyngrok import ngrok


In [None]:
# تشغيل FastAPI
!pip install fastapi uvicorn -q

# استدعاء كود FastAPI
# تأكد أن الكود يحتوي:
# - تعريف app = FastAPI()
# - predict_message() + class Message(BaseModel)
# - endpoint /predict/


In [None]:
# +code
import nest_asyncio
import uvicorn
from pyngrok import ngrok

# Allow uvicorn to run inside Colab
nest_asyncio.apply()

# ==== Register the ngrok authtoken ====
# The token is a secret and must not be stored in the notebook. It is read
# from the NGROK_AUTHTOKEN environment variable, or prompted for without echo.
# Any token that was ever committed in a notebook should be revoked in the
# ngrok dashboard and replaced.
import os
from getpass import getpass

ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Open an ngrok tunnel to port 8000
public_url = ngrok.connect(addr="8000", proto="http")
print("Public URL:", public_url)

# Run uvicorn to serve the FastAPI app
uvicorn.run(app, host="0.0.0.0", port=8000)


In [None]:
# Illustrative payload only (a bare dict expression, echoed as the cell output).
# NOTE(review): predict_message() returns just "classification" and "confidence";
# the "text" key shown here is not part of what /predict/ sends back.
{
  "text": "تحقق من رصيدك الآن!",
  "classification": "spam",
  "confidence": 92.5
}



{'text': 'تحقق من رصيدك الآن!', 'classification': 'spam', 'confidence': 92.5}

In [3]:
import psycopg2
import json
from datetime import datetime

import os

# Connection settings for psycopg2.connect(**DB_CONFIG).
# Secrets must never be written into the notebook. Every value is read from
# the standard PostgreSQL environment variables (PGDATABASE, PGUSER,
# PGPASSWORD, PGHOST, PGPORT). Unset password/host stay None, and psycopg2
# ignores None-valued keyword arguments.
# Any project URL or API key that was previously embedded here should be
# treated as exposed and rotated.
DB_CONFIG = {
    "dbname": os.environ.get("PGDATABASE", "postgres"),   # database name
    "user": os.environ.get("PGUSER", "postgres"),         # database user
    "password": os.environ.get("PGPASSWORD"),
    "host": os.environ.get("PGHOST"),
    "port": os.environ.get("PGPORT", "5432"),
}

def log_sms_scan_result(sender_id: str, message_content: str, classification: str, score: float, details: dict):
    """
    Record the result of an SMS scan in the sms_safe_scans table.

    Logging is best-effort: any failure (serialization, connection, insert)
    is printed and swallowed so the caller is never interrupted.

    Args:
        sender_id: identifier of the message sender.
        message_content: text of the scanned message.
        classification: classification result to store.
        score: score to store with the classification.
        details: extra information; serialized to a JSON string before insert.

    Returns:
        None
    """
    conn = None
    # Initialized up front so the cleanup in `finally` never touches an
    # unbound name when connect() or cursor() fails.
    cursor = None
    try:
        # 1. Serialize the details to a JSON string for the TEXT/JSONB column
        details_json = json.dumps(details)

        # 2. Connect to the database
        conn = psycopg2.connect(**DB_CONFIG)
        cursor = conn.cursor()

        # 3. Build the INSERT statement (values are passed as parameters)
        insert_query = """
        INSERT INTO public.sms_safe_scans (
            sender_id,
            message_content,
            type,                     -- سنفترض أنه 'ML_Classification' مؤقتاً
            classification_response,
            score,
            details
        ) VALUES (%s, %s, %s, %s, %s, %s);
        """

        # 4. Execute the statement
        cursor.execute(insert_query, (
            sender_id,
            message_content,
            'ML_Classification',
            classification,
            score,
            details_json
        ))

        # 5. Commit the changes
        conn.commit()
        print("Scan result logged successfully.")

    except Exception as error:  # psycopg2.Error is a subclass of Exception
        print(f"Error while connecting to PostgreSQL or logging data: {error}")
    finally:
        # 6. Release whatever was actually opened
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()