### 1. Use the Hugging Face `datasets` library to download the `arabic-generated-abstracts` dataset directly into a Python environment (Google Colab).

In [None]:
# !pip install datasets
# !pip install python-dotenv


In [None]:
# Authenticate against the Hugging Face Hub with a token taken from the
# environment, so the token is never written into the notebook itself.
from dotenv import load_dotenv
import os
from huggingface_hub import login

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# os.getenv returns None when HF_TOKEN is not defined.
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

In [None]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# No split is requested, so every available split is downloaded.
dataset = load_dataset("KFUPM-JRCAI/arabic-generated-abstracts")
# Printing the result lists the splits with their columns and row counts.
print(dataset)


In [None]:
import pandas as pd
# Combine all splits into one df_human
splits = ["by_polishing", "from_title", "from_title_and_content"]

# NOTE(review): rows concatenated here carry no marker of the split they came
# from, and `df_human` is reassigned by the next cell, which adds that marker.
df_human = pd.concat([dataset[s].to_pandas() for s in splits], ignore_index=True)

In [None]:
# Convert each split to pandas and tag every row with the split it came from,
# then stack the three frames under a fresh 0..n-1 index.
dfs = [
    dataset[split_name].to_pandas().assign(source_split=split_name)
    for split_name in ("by_polishing", "from_title", "from_title_and_content")
]

df_human = pd.concat(dfs, ignore_index=True)

In [None]:
# Build one long-format table with a row per abstract.
# Label convention used in this notebook: 1 = human-written, 0 = AI-generated.
AI_MODEL_COLUMNS = {
    "allam": "allam_generated_abstract",
    "jais": "jais_generated_abstract",
    "llama": "llama_generated_abstract",
    "openai": "openai_generated_abstract",
}

# Walk the needed columns in lockstep instead of DataFrame.iterrows(), which
# materialises a Series per row and is much slower. The output order is the
# same as before: for each source row, one record per model in the order above.
ai_rows = [
    {
        "abstract_text": text,
        "source_split": source_split,
        "generated_by": model_name,
        "label": 0,  # AI
    }
    for source_split, *generated_texts in zip(
        df_human["source_split"],
        *(df_human[column] for column in AI_MODEL_COLUMNS.values()),
    )
    for model_name, text in zip(AI_MODEL_COLUMNS, generated_texts)
]

# Convert to dataframe (explicit columns keep the schema even with no rows)
df_ai = pd.DataFrame(
    ai_rows, columns=["abstract_text", "source_split", "generated_by", "label"]
)

# Create human dataframe
# NOTE(review): one human row is emitted per df_human row. If the three splits
# contain the same original abstracts (TODO confirm against the dataset card),
# each human text appears once per split and should be de-duplicated before
# the data is split into train/validation/test.
df_h = pd.DataFrame({
    "abstract_text": df_human["original_abstract"],
    "source_split": df_human["source_split"],
    "generated_by": "human",
    "label": 1
})

# Final unified dataset: human rows first, then AI rows
df = pd.concat([df_h, df_ai], ignore_index=True)

print("Final unified dataset shape:", df.shape)
df.head(10)

In [None]:
pip install xlsxwriter

In [None]:
# Write the unified frame to an Excel workbook in the working directory.
output_file = 'original_data.xlsx'

# index=False leaves the DataFrame index out of the sheet.
df.to_excel(
        output_file,
        index=False,
        engine='xlsxwriter'
    )


In [None]:
print(df.columns)

###  Perform initial data exploration:

#### 1- Load and inspect the dataset structure (columns, data types).


In [None]:
# Inspect column names and data types for one split (e.g., 'by_polishing')
print("\nFeatures in 'by_polishing':")
print(dataset['by_polishing'].features)

# Print the split's summary representation (its columns and number of rows)
print("\nDataset info for 'by_polishing':")
print(dataset['by_polishing'])




#### 2- Check the distribution of the target variable (label: human vs. AI-generated)


In [None]:
# Count the rows of each class (label 1 = human, label 0 = AI) and report
# both the absolute numbers and their share of the labelled rows.
num_human = int((df["label"] == 1).sum())
num_ai = int((df["label"] == 0).sum())

total = num_human + num_ai
human_share = round(num_human / total * 100, 2)
ai_share = round(num_ai / total * 100, 2)

print("\n===== Target Variable Distribution =====")
print("Human-written abstracts:", num_human)
print("AI-generated abstracts:", num_ai)
print("Human %:", human_share)
print("AI %:", ai_share)

#### 3- Assess data quality: check for missing values, duplicates, and inconsistencies:


Missing values → any None/NaN in columns

Duplicates → same abstract appearing multiple times

Inconsistencies → like empty strings " " or unusual data

In [None]:
# Data-quality report: missing cells, repeated rows/values and blank strings.
print("\n===== Missing Values =====")
missing_per_column = df.isnull().sum()
print(missing_per_column)

print("\n===== Duplicate Rows =====")
duplicate_row_count = df.duplicated().sum()
print("Total duplicate rows:", duplicate_row_count)

print("\n===== Duplicate values per column =====")
for column_name, column in df.items():
    print(f"{column_name}: {column.duplicated().sum()}")

print("\n===== Empty / Blank Values =====")
for column_name, column in df.items():
    # A value counts as blank when its string form is empty after stripping.
    blank_count = sum(1 for value in column if str(value).strip() == "")
    print(f"{column_name}: {blank_count}")

# task 2.1: Arabic Text Preprocessing


In [None]:

import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
from datasets import load_dataset


In [None]:
# Download required NLTK resources
nltk.download('stopwords')

In [None]:
# Check columns
print(df.head())

In [None]:
#Define Arabic text cleaning functions
# Remove tashkeel (diacritics)
def remove_diacritics(text):
    """Strip Arabic diacritical marks (tashkeel) from ``text``.

    Deletes every code point in U+0617-U+061A and U+064B-U+0652; all other
    characters are returned unchanged.
    """
    tashkeel_pattern = r'[\u0617-\u061A\u064B-\u0652]'
    return re.compile(tashkeel_pattern).sub('', text)

In [None]:
# Normalize Arabic text
def normalize_arabic(text):
    """Collapse common Arabic spelling variants and drop non-Arabic characters.

    The substitutions run in order: alef variants become bare alef, alef
    maqsura becomes yeh, waw-with-hamza becomes waw, yeh-with-hamza becomes
    yeh and teh marbuta becomes heh. Finally every run of characters outside
    U+0600-U+06FF (other than the space) is replaced by a single space.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ئ", "ي"),
        ("ة", "ه"),
        ("[^؀-ۿ ]+", " "),  # remove non-Arabic chars
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
# Initialize stopwords and stemmer
# preprocess_text() filters stopwords *after* remove_diacritics() and
# normalize_arabic() have rewritten the tokens, so the stopword list must also
# be available in that normalized spelling; otherwise entries containing
# hamza forms, alef maqsura or teh marbuta can never match a token.
# The raw spellings are kept too, so everything that matched before still does.
_raw_arabic_stopwords = stopwords.words("arabic")
arabic_stopwords = set(_raw_arabic_stopwords) | {
    normalized
    for normalized in (
        normalize_arabic(remove_diacritics(word)).strip()
        for word in _raw_arabic_stopwords
    )
    # Skip entries that normalize to nothing or break into several tokens.
    if normalized and " " not in normalized
}
stemmer = ISRIStemmer()

In [None]:
# Full preprocessing pipeline
def preprocess_text(text):
    """Clean one abstract and return it as a space-joined string of stems.

    Steps: cast to ``str``, remove diacritics, normalize letter variants,
    split on whitespace, drop tokens found in ``arabic_stopwords`` and stem
    the remaining tokens with ``stemmer``.
    """
    normalized = normalize_arabic(remove_diacritics(str(text)))
    stems = (
        stemmer.stem(token)
        for token in normalized.split()
        if token not in arabic_stopwords
    )
    return " ".join(stems)

In [None]:
# Apply preprocessing
# Column inventory of the unified frame; it is not referenced again in this cell.
text_columns = [
    'abstract_text',
    'source_split',
    'generated_by',
    'label',
]

# Apply preprocessing on the unified abstract text column
# The cleaned text goes into a new column; the original text is left untouched.
df["abstract_text_clean"] = df["abstract_text"].apply(preprocess_text)

print("Preprocessing complete! Here are the new columns:")
print(df.columns)

df.head(2)



## Features Engineering


In [None]:
#important library
import re
import math
import numpy as np
import pandas as pd
import unicodedata
from collections import Counter
from datasets import load_dataset
import regex as re2  # للاستخدام المتقدم (Arabic support)

In [None]:
#Helper functions


def simple_word_tokenize(text):
    """
    Tokenize text into words / symbols with Arabic support.

    Returns a list holding, in order of appearance, each run of Arabic-script
    characters, each run of other word characters, and each single non-space,
    non-word character (punctuation / symbols).

    Non-string input (e.g. None or NaN) yields an empty list, matching the
    behaviour of sentence_tokenize() and paragraph_tokenize().
    """
    if not isinstance(text, str):
        return []
    return re2.findall(r"\p{Arabic}+|\w+|[^\s\w]", text, flags=re2.VERSION1)

def sentence_tokenize(text):
    """
    Break text into sentences at whitespace that follows a terminator.

    Terminators are '.', '?', '!', the Arabic question mark (U+061F) and the
    Arabic semicolon (U+061B). Returns the stripped, non-empty pieces; any
    non-string input gives an empty list.
    """
    if isinstance(text, str):
        pieces = (
            piece.strip()
            for piece in re.split(r'(?<=[\.\?\!\u061F\u061B])\s+', text)
        )
        return [piece for piece in pieces if piece]
    return []

def paragraph_tokenize(text):
    """
    Break text into paragraphs at blank lines.

    A boundary is two line breaks (LF or CRLF) that may be separated and
    surrounded by other whitespace. Returns the stripped, non-empty
    paragraphs; any non-string input gives an empty list.
    """
    if isinstance(text, str):
        chunks = re.split(r'\s*\n\s*\n\s*|\s*\r\n\s*\r\n\s*', text.strip())
        return [chunk.strip() for chunk in chunks if chunk.strip()]
    return []


In [None]:
# Column names to use
original_text_columns = "abstract_text"
clean_text_columns = "abstract_text_clean"


def _tokens_from_clean_text(value):
    """Tokenize a cleaned abstract, dropping whitespace-only tokens; non-strings give []."""
    if not isinstance(value, str):
        return []
    return [token for token in simple_word_tokenize(value) if token.strip()]


def _word_tokens_only(tokens):
    """Keep the tokens that contain at least one word character."""
    return [token for token in tokens if re.search(r'\w', token)]


# 1. Tokens (from the cleaned text)
df["tokens"] = df[clean_text_columns].apply(_tokens_from_clean_text)

# 2. Words (subset of the clean tokens)
df["words"] = df["tokens"].apply(_word_tokens_only)

# 3. Sentences (from the original text, which still has its punctuation)
df["sentences"] = df[original_text_columns].apply(sentence_tokenize)

# 4. Paragraphs (from the original text, which still has its line breaks)
df["paragraphs"] = df[original_text_columns].apply(paragraph_tokenize)

print("Feature engineering completed! Columns now:")
print(df.columns)
df.head(2)


In [None]:
# Column names to use
# NOTE(review): these repeat, with identical values, the assignments made at
# the top of the previous cell.
original_text_columns = "abstract_text"
clean_text_columns = "abstract_text_clean"


In [None]:
# Feature 2: share of letter characters among all characters (letters / C)
import regex as _re

feature = f'{original_text_columns}_f002_letters_over_C'

def _ratio_letters(t):
    """Return (# Unicode letters) / (# characters) of ``t``; 0.0 for missing or empty text."""
    if not pd.notna(t):
        return 0.0
    as_text = str(t)
    if not as_text:
        return 0.0
    letter_count = len(_re.findall(r'\p{L}', as_text, flags=_re.VERSION1))
    return letter_count / len(as_text)

df[feature] = df[original_text_columns].apply(_ratio_letters)

In [None]:
# Feature 21: Brunet's W measure (approx)
#Brunet's W هو مقياس لتنوع المفردات في النص.
#قيمة W أقل → النص أكثر تنوعًا.
def _brunet_W(words, alpha=0.172):
    """Calculates Brunet's W measure of lexical diversity."""
    N = len(words) # Total tokens
    freq = Counter([w.lower() for w in words])
    V = len(freq) # Total types

    if N > 0 and V > 0:
        try:
            # W = N ^ (V ^ (-alpha))
            return N ** (V ** (-alpha))
        except OverflowError: # Handle potential large number errors gracefully
            return 0.0
    return 0.0

# Compute Brunet's W from each row's word list and store it in a column whose
# name is prefixed with the clean-text column name.
feature_name = f'{clean_text_columns}_f021_brunet_W'
df[feature_name] = df["words"].apply(_brunet_W)

In [None]:
# (40) Sentences length frequency distribution
def _sentence_length_histogram(sentences):
    """Map sentence length (in tokens) -> number of sentences of that length."""
    if not sentences:
        return {}
    token_counts = [len(simple_word_tokenize(sentence)) for sentence in sentences]
    return dict(Counter(token_counts))


df['f040_Sentence_length_frequency_distribution'] = df["sentences"].apply(
    _sentence_length_histogram
)

In [None]:
pip install pyarabic

In [None]:
import regex as _re          # better than "re" for Arabic letters
import pyarabic.araby as ar  # optional: Arabic normalization & letter handling

In [None]:
# HuggingFace tokenizer for embeddings
from transformers import AutoTokenizer
from collections import Counter

In [None]:
# 59. Number of words found in the first 500 positions of the word-embedding
# vocabulary (use the vocabulary aligned with the LLM in use).
# I.e. count how many words of each text are among the 500 lowest-id entries
# of the tokenizer vocabulary.


# Load tokenizer (aligned to LLM)
# NOTE(review): "bert-base-uncased" is an English checkpoint; confirm that its
# first 500 vocabulary entries can match Arabic words at all, otherwise point
# this at the Arabic / multilingual model actually used downstream.
model_name = "bert-base-uncased"   # change to your embedding model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Extract the 500 vocabulary tokens with the lowest ids. get_vocab() returns a
# token -> id mapping, so sort by id explicitly instead of relying on the
# iteration order of the returned dict.
_vocab = tokenizer.get_vocab()
top_500_vocab = set(sorted(_vocab, key=_vocab.get)[:500])


# Feature: words found in the top-500 embedding positions

feature = f"{clean_text_columns}_f059_words_in_top500"

def _count_words_in_top500(text):
    """Count the words of ``text`` whose lower-cased form is in ``top_500_vocab``.

    The text is cast to ``str`` and passed through pyarabic's hamza and
    ligature normalization and tashkeel stripping before runs of letters are
    extracted as words. ``None`` yields 0.
    """
    if text is None:
        return 0

    # Convert to string + normalize Arabic
    s = str(text)
    s = ar.normalize_hamza(s)             # normalize different hamza forms
    s = ar.normalize_ligature(s)          # normalize Arabic ligatures
    s = ar.strip_tashkeel(s)              # remove diacritics

    # every maximal run of letters is one word
    words = _re.findall(r"\p{L}+", s, flags=_re.VERSION1)

    # count matches with the top-500 tokenizer entries
    return sum(1 for w in words if w.lower() in top_500_vocab)

# Apply feature
df[feature] = df[clean_text_columns].apply(_count_words_in_top500)

In [None]:
df.head(5)

In [None]:
#  لتطبيق الميزة 78
!pip install transformers torch


In [None]:
# Feature 78: Perplexity score under an Arabic causal language model.
# Perplexity measures how well the language model predicts the text: the
# lower the value, the more "expected" the text is for the model.
# Required libraries
import math
import warnings

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name_ppx = "aubmindlab/aragpt2-base"   # Arabic GPT2 model
tokenizer_ppx = AutoTokenizer.from_pretrained(model_name_ppx)
model_ppx = AutoModelForCausalLM.from_pretrained(model_name_ppx)
model_ppx.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_ppx.to(device)

# Feature 78 name
feature_ppx = f"{clean_text_columns}_f078_perplexity"

def _calculate_perplexity_ar(text):
    """Return the perplexity of ``text`` under ``model_ppx``, or None.

    The text is cast to ``str``, stripped and truncated to 512 tokens.
    None is returned when the text is empty, when it tokenizes to fewer than
    two tokens (no next-token prediction exists), when the result is not a
    finite number, or when scoring raises; in the last case a RuntimeWarning
    carrying the exception is emitted instead of hiding the failure.
    """
    text = str(text).strip()
    if not text:
        return None

    try:
        inputs = tokenizer_ppx(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=512
        )

        input_ids = inputs.input_ids.to(device)

        # The LM loss averages over next-token predictions, so it is only
        # defined when there are at least two tokens.
        if input_ids.shape[1] < 2:
            return None

        with torch.no_grad():
            outputs = model_ppx(input_ids, labels=input_ids)
            loss = outputs.loss

        perplexity = float(torch.exp(loss).cpu().item())

    except Exception as exc:
        # Best effort: one bad row must not abort the whole column, but the
        # cause should stay visible (identical warnings are shown once).
        warnings.warn(f"Perplexity computation failed: {exc!r}", RuntimeWarning)
        return None

    return perplexity if math.isfinite(perplexity) else None

# Apply Feature 78 to the clean text column
# NOTE(review): the clean column holds stemmed, stopword-free text; confirm
# that scoring it (rather than the original abstract) is intended.
df[feature_ppx] = df[clean_text_columns].apply(
    lambda t: _calculate_perplexity_ar(t) if pd.notna(t) else None
)





In [None]:
# Write the frame, which by now also holds the cleaned text and the engineered
# feature columns, to a second Excel workbook.
output_file = 'clean_data.xlsx'

# index=False leaves the DataFrame index out of the sheet.
df.to_excel(
        output_file,
        index=False,
        engine='xlsxwriter'
    )


#Split the data



In [None]:
from sklearn.model_selection import train_test_split

# Every source abstract contributes one human row and four AI rows, so the
# labels are imbalanced (about 1:4). Stratify both steps on the label so the
# train / validation / test partitions keep that ratio.
# NOTE(review): the split is row-wise, so the human text and the AI texts
# derived from the same source abstract can land in different partitions;
# consider a group-aware split if that counts as leakage for this task.

# First split: Train 70%, Temp 30%
train_df, temp_df = train_test_split(
    df, test_size=0.30, random_state=42, shuffle=True, stratify=df["label"]
)

# Second split: Temp 30% → 15% Validation, 15% Test
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, random_state=42, shuffle=True, stratify=temp_df["label"]
)

# Show sizes
print("TOTAL:", len(df))
print("TRAIN:", len(train_df))
print("VAL:", len(val_df))
print("TEST:", len(test_df))

#TF-IDF Features from Cleaned Text


In [None]:
#apply with abstract_text_clean only
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over word unigrams and bigrams, capped at 5000 vocabulary entries
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), analyzer='word')

# Learn vocabulary and IDF weights on the training split only, then reuse the
# fitted vectorizer for validation and test.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["abstract_text_clean"])
X_val_tfidf = tfidf_vectorizer.transform(val_df["abstract_text_clean"])
X_test_tfidf = tfidf_vectorizer.transform(test_df["abstract_text_clean"])

print("TF-IDF shapes:")
for split_label, matrix in (
    ("Train:", X_train_tfidf),
    ("Validation:", X_val_tfidf),
    ("Test:", X_test_tfidf),
):
    print(split_label, matrix.shape)

##Define X and y

In [None]:
from scipy.sparse import hstack


In [None]:
# Select the numeric engineered features (the label and text-like columns are excluded)
EXCLUDED_COLS = ['label', 'abstract_text', 'abstract_text_clean',
                 'tokens', 'words', 'sentences', 'paragraphs', 'abstract_text_pos_tags']
# Columns that are numeric AND not in the exclusion list
numeric_cols = [
    col for col in train_df.select_dtypes(include=np.number).columns.tolist()
    if col not in EXCLUDED_COLS
]

# Engineered features can be missing (the perplexity feature is None for
# empty text) or infinite, and a NaN / inf in the feature matrix makes the
# downstream estimators fail. Impute with medians learned from the training
# split only, so nothing from validation / test leaks into the fill values.
# Rows without missing values are passed through unchanged.
_train_numeric = train_df[numeric_cols].replace([np.inf, -np.inf], np.nan)
_numeric_fill_values = _train_numeric.median().fillna(0.0)


def _numeric_feature_array(frame):
    """Return the numeric feature columns of ``frame`` as a dense float array, NaN/inf imputed."""
    cleaned = frame[numeric_cols].replace([np.inf, -np.inf], np.nan)
    return cleaned.fillna(_numeric_fill_values).to_numpy(dtype=float)


# Dense arrays, ready to be stacked next to the sparse TF-IDF matrices.
# NOTE(review): these columns are on a different scale from TF-IDF weights;
# consider scaling them for the linear models.
X_train_num_array = _numeric_feature_array(train_df)
X_val_num_array   = _numeric_feature_array(val_df)
X_test_num_array  = _numeric_feature_array(test_df)


# Target variable
y_train = train_df["label"]
y_val   = val_df["label"]
y_test  = test_df["label"]

# Features: TF-IDF columns followed by the engineered numeric columns
X_train = hstack([X_train_tfidf, X_train_num_array]).tocsr()
X_val = hstack([X_val_tfidf, X_val_num_array]).tocsr()
X_test = hstack([X_test_tfidf, X_test_num_array]).tocsr()

print("X and y are ready for ML models.")
print("Train:", X_train.shape, y_train.shape)
print("Validation:", X_val.shape, y_val.shape)
print("Test:", X_test.shape, y_test.shape)

#Build Machine learning Models

##1-Baseline Model (Naïve Bayes & Logistic Regression)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the model
# max_iter=1000 gives the solver more iterations to converge; random_state
# fixes the seed.
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Train on training set
lr_model.fit(X_train, y_train)

# Predict on validation set
y_val_pred = lr_model.predict(X_val)

# Evaluate on validation set
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report (Validation):")
print(classification_report(y_val, y_val_pred))

In [None]:
# Evaluation
# Predict on test set
y_test_pred = lr_model.predict(X_test)

# Evaluate on test set
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred))

# Optional: confusion matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_test_pred)
# Draw on an explicit Axes instead of the implicit pyplot "current" axes.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix - Logistic Regression")
plt.show()

## 2- Traditional Machine Learning Models (Support Vector Machine (SVM), Random Forest, XGBoost), evaluated on the validation set

In [None]:
from sklearn.svm import LinearSVC
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

# If X_train is sparse (e.g. TF-IDF), we do not use StandardScaler with with_mean=True
n_components = 300  # can be tuned
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Reduce the feature matrix to n_components dimensions, then fit a linear SVM
# on the reduced representation.
svm_pipeline = Pipeline([
    ('svd', svd),
    ('svm', LinearSVC(C=1.0, max_iter=10000, random_state=42))
])

svm_pipeline.fit(X_train, y_train)
y_val_pred_svm = svm_pipeline.predict(X_val)

from sklearn.metrics import accuracy_score, classification_report
print("Linear SVM Validation Accuracy:", accuracy_score(y_val, y_val_pred_svm))
print(classification_report(y_val, y_val_pred_svm))


In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# XGBoost configured for large / sparse data
# NOTE(review): `use_label_encoder` is kept as in the original; check whether
# the installed xgboost version still accepts it.
xgb_model = xgb.XGBClassifier(
    n_estimators=200,      # number of trees
    max_depth=6,           # maximum depth of each tree
    learning_rate=0.1,     # learning rate
    subsample=0.8,         # row subsampling: speeds up training and limits overfitting
    colsample_bytree=0.8,  # feature subsampling per tree
    use_label_encoder=False,
    eval_metric='logloss', # binary log-loss: labels are 0/1 ('mlogloss' is the multiclass metric)
    n_jobs=-1,             # use all available cores
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict on the validation set and check performance
y_val_pred_xgb = xgb_model.predict(X_val)
print("XGBoost Validation Accuracy:", accuracy_score(y_val, y_val_pred_xgb))
print(classification_report(y_val, y_val_pred_xgb))

In [None]:
#Evaluation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Keep the fitted models in a dict so they can be evaluated in one loop
models = {
    'LinearSVM': svm_pipeline,
    'XGBoost': xgb_model
}

# Evaluate every model on the test set
for name, model in models.items():
    # Predict on the test set
    y_test_pred = model.predict(X_test)

    # Print the metrics
    print(f"\n===== {name} Test Evaluation =====")
    print("Accuracy:", accuracy_score(y_test, y_test_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_test_pred))

    # Confusion matrix, drawn on an explicit Axes
    cm = confusion_matrix(y_test, y_test_pred)
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
                xticklabels=True, yticklabels=True, ax=ax)
    ax.set_title(f'Confusion Matrix - {name}', fontsize=14)
    ax.set_xlabel('Predicted', fontsize=12)
    ax.set_ylabel('Actual', fontsize=12)
    plt.show()

#Build Deep learning Models

## Build a neural network classifier (a simple feedforward network) on top of the extracted BERT embeddings, or fine-tune the pre-trained BERT model for the classification task.

#Step1: Extract BERT Embeddings (Sentence-level)

In [None]:
!pip install sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
import numpy as np


import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running embeddings on:", device)

# Load a relatively small model to keep embedding computation fast on CPU
bert_model = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    device=device
)

# Columns to exclude from the text that gets embedded.
# combine_text_columns() stringifies and concatenates every column that is
# NOT listed here. With only "label" and "abstract_text" excluded, the
# embedded text also contained `generated_by` (which is "human" exactly for
# the label-1 rows, i.e. it reveals the target), `source_split`, and the
# stringified token lists and numeric features. Keep only the cleaned
# abstract so the classifier sees nothing but the text itself.
EMBEDDING_TEXT_COLUMN = "abstract_text_clean"
exclude_cols = [col for col in train_df.columns if col != EMBEDDING_TEXT_COLUMN]

# Helper that merges all remaining (non-excluded) columns into one text per row
def combine_text_columns(df, exclude_cols):
    """Join every column of ``df`` not listed in ``exclude_cols`` into one string per row.

    Values are cast to ``str`` and joined with single spaces in column order.
    Every column left in becomes part of the text, so the caller must exclude
    any column that should not be seen by the model.
    """
    kept_columns = [name for name in df.columns if name not in exclude_cols]
    as_strings = df[kept_columns].astype(str)
    return as_strings.agg(" ".join, axis=1).tolist()

# Tokenizer used to shorten texts that are too long
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
max_tokens = 128  # tokens kept per text, to speed up computation on CPU

def truncate_texts(texts, max_length=max_tokens):
    """Cut each text down to at most ``max_length`` tokenizer tokens.

    The kept tokens are turned back into text with
    ``tokenizer.convert_tokens_to_string`` rather than joined with spaces:
    ``tokenize`` returns sub-word pieces, and space-joining them would insert
    spaces inside words and leave the tokenizer's piece markers in the text
    that is later embedded.
    """
    return [
        tokenizer.convert_tokens_to_string(tokenizer.tokenize(text)[:max_length])
        for text in texts
    ]

# --- Prepare the data ---
# For each split: build one text per row from the non-excluded columns, then
# truncate it to the token budget.
train_texts = truncate_texts(combine_text_columns(train_df, exclude_cols))
val_texts   = truncate_texts(combine_text_columns(val_df, exclude_cols))
test_texts  = truncate_texts(combine_text_columns(test_df, exclude_cols))

# --- Helper that computes embeddings batch by batch (to speed things up on CPU) ---
def encode_in_batches(texts, model, batch_size=128):
    """Encode ``texts`` with ``model.encode`` in chunks of ``batch_size`` and stack the results row-wise."""
    batch_starts = range(0, len(texts), batch_size)
    chunks = [
        model.encode(
            texts[start:start + batch_size],
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        for start in batch_starts
    ]
    return np.vstack(chunks)

# --- Create the embeddings ---
X_train_emb = encode_in_batches(train_texts, bert_model, batch_size=128)
X_val_emb   = encode_in_batches(val_texts, bert_model, batch_size=128)
X_test_emb  = encode_in_batches(test_texts, bert_model, batch_size=128)

# --- Target variables ---
# These rebind y_train / y_val / y_test to NumPy arrays taken from the label column.
y_train = train_df["label"].values
y_val   = val_df["label"].values
y_test  = test_df["label"].values

# --- Print the shapes as a sanity check ---
print("Train embedding shape:", X_train_emb.shape)
print("Validation embedding shape:", X_val_emb.shape)
print("Test embedding shape:", X_test_emb.shape)


##Step 2: Build a Feedforward Neural Network

In [None]:
#import tensorflow as tf
# NOTE: this import rebinds the name `models`, which an earlier cell used for
# a dict of fitted estimators.
from tensorflow.keras import layers, models

# Basic feedforward classifier on embeddings
# Input width equals the embedding dimension; two ReLU hidden layers, each
# followed by 30% dropout, feed a single sigmoid output unit.
ffnn_model = models.Sequential([
    layers.Input(shape=(X_train_emb.shape[1],)),
    layers.Dense(256, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.3),
    layers.Dense(1, activation="sigmoid")   # binary classification
])

# Binary cross-entropy matches the single sigmoid output and 0/1 labels.
ffnn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

ffnn_model.summary()

##Step3: Train the Model

In [None]:
# Train for 10 epochs with mini-batches of 32 samples; the validation data is
# evaluated after each epoch and `history` keeps the returned training record.
history = ffnn_model.fit(
    X_train_emb, y_train,
    validation_data=(X_val_emb, y_val),
    epochs=10,
    batch_size=32
)

##Step 4: Evaluate on Test Set

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict
# The network ends in a single sigmoid unit, so predict() returns an array of
# shape (n_samples, 1). Threshold at 0.5 and flatten to a 1-D label vector so
# the predictions have the same shape as y_test.
y_test_pred = (ffnn_model.predict(X_test_emb) > 0.5).astype(int).ravel()

print("Test Accuracy:", accuracy_score(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

##Save Models

In [None]:
import os
import joblib
from tensorflow.keras.models import Model as KerasModel

def save_all_models(models_dict, save_dir="models"):
    """Save every model in ``models_dict`` into ``save_dir``.

    Parameters
    ----------
    models_dict : dict
        Maps a file stem to a fitted model object.
    save_dir : str
        Target directory; created if it does not exist.

    Keras models are written with their own ``save()`` method to
    ``<name>.keras``; pickling is not a supported way to persist them. Every
    other object is pickled with joblib to ``<name>.pkl``, as before.
    """
    # Create save folder
    os.makedirs(save_dir, exist_ok=True)

    for model_name, model_obj in models_dict.items():
        if isinstance(model_obj, KerasModel):
            file_path = os.path.join(save_dir, f"{model_name}.keras")
            model_obj.save(file_path)
            print(f"[Saved] Keras model → {file_path}")
        else:
            file_path = os.path.join(save_dir, f"{model_name}.pkl")
            joblib.dump(model_obj, file_path)
            print(f"[Saved] Pickle model → {file_path}")

    print("\nAll models saved successfully!")

In [None]:
import os
# save_all_models() also creates its target directory, so this call is a
# harmless repeat for the default "models" folder.
os.makedirs("models", exist_ok=True)
# The keys become the file stems of the saved models.
models_dict = {
    "lr_model": lr_model,
    "svm": svm_pipeline,
    "xgboost": xgb_model,
    "ffnn": ffnn_model
}

save_all_models(models_dict)