In [None]:
!pip install -q transformers torch scikit-learn bitsandbytes accelerate peft --upgrade bitsandbytes

In [None]:
import pandas as pd
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Read the raw reports, give the two Arabic-named columns English names,
# and discard any row that is missing either the text or its category.
train_df = (
    pd.read_csv("/content/Arabic_Reports(3000).csv")
    .rename(columns={'البلاغ': 'tweet', 'التصنيف': 'label'})
    .dropna(subset=["tweet", "label"])
)


In [None]:
# Category name -> integer class id.
# The keys are lowercase on purpose: the "label" column is stripped and
# lower-cased just below, and Series.map only matches exact strings, so
# mixed-case keys would match nothing and leave every label_id as NaN.
label_map = {
   "profile hacking identity theft": 0,
   "ewallet related fraud": 1,
   "fraud callvishing": 2
}

# Normalise the raw category text before looking it up in label_map.
train_df["label"] = train_df["label"].astype(str).str.strip().str.lower()
train_df["label_id"] = train_df["label"].map(label_map)

# Sanity checks: a non-zero NaN count means some categories are not in label_map.
print(train_df[["label", "label_id"]].head())
print("NaN in label_id:", train_df["label_id"].isna().sum())
print("Label counts:\n", train_df["label"].value_counts())


# Train–Validation–Test Split to Reduce Overfitting


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Lowercase keys so they match the already lower-cased 'label' column.
label_map = {
   "profile hacking identity theft": 0,
   "ewallet related fraud": 1,
   "fraud callvishing": 2
}

train_df["label_id"] = train_df["label"].map(label_map)

# Rows whose label is not a key of label_map get NaN above; drop them so
# the ids can be cast to int.
train_df_cleaned = train_df.dropna(subset=["label_id"])

X = train_df_cleaned["tweet"].values
y = train_df_cleaned["label_id"].values.astype(int)

# First cut: 70% train, 30% held out (stratified on the class id).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Second cut: halve the held-out part into validation and test (15% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# Report split sizes and per-class counts.
for split_name, split_texts in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    print(f"{split_name} size:", len(split_texts))

print("\nClass distribution:")
for split_name, split_labels in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"{split_name}:", np.bincount(split_labels))

# Setup LLM Environment and Load Pre-trained Model



In [None]:
!pip install -q transformers torch scikit-learn bitsandbytes accelerate peft --upgrade bitsandbytes

In [None]:
import warnings
# Install an "ignore" filter with no category/module restriction, i.e. every
# warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — confirm that blanket suppression is intended.
warnings.filterwarnings("ignore")

print("Environment ready: Transformers, Torch, and Scikit-learn")


# Load Pre-trained Arabic BERT Model and Tokenizer


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Pre-trained Arabic BERT checkpoint used as the backbone
model_name = "aubmindlab/bert-base-arabertv2"

# Pick the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(f"Tokenizer for {model_name} loaded successfully.")

# Sequence-classification model with a 3-way output head
# (one logit per class id 0, 1, 2)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
print(f"Model for {model_name} loaded successfully.")

# nn.Module.to moves the parameters in place
model.to(device)
print(f"Model moved to device: {device}")


# Tokenize Data and Build Datasets


In [None]:
from torch.utils.data import TensorDataset

max_length = 128


def _encode_texts(texts):
    """Tokenize `texts` with the global tokenizer and return PyTorch tensors.

    Padding and truncation are enabled, with `max_length` as the token limit.
    """
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )


def _as_tensor_dataset(encodings, targets):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    return TensorDataset(encodings["input_ids"], encodings["attention_mask"], targets)


# Tokenize each split
train_encodings = _encode_texts(X_train)
val_encodings = _encode_texts(X_val)
test_encodings = _encode_texts(X_test)

# Labels as int64 tensors
y_train_t, y_val_t, y_test_t = (
    torch.tensor(split_labels, dtype=torch.long)
    for split_labels in (y_train, y_val, y_test)
)

# One dataset per split
train_dataset = _as_tensor_dataset(train_encodings, y_train_t)
val_dataset = _as_tensor_dataset(val_encodings, y_val_t)
test_dataset = _as_tensor_dataset(test_encodings, y_test_t)

print("Datasets ready:", len(train_dataset), len(val_dataset), len(test_dataset))


# Create DataLoaders


In [None]:
from torch.utils.data import DataLoader

batch_size = 16

# Training batches are reshuffled every epoch (shuffling is preferred for training)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation and test batches keep the dataset order
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_size, shuffle=False)
    for split_dataset in (val_dataset, test_dataset)
)

# Number of batches per split
print("DataLoaders ready:", len(train_loader), len(val_loader), len(test_loader))


# Training Loop with Validation


In [None]:
from torch.optim import AdamW
from tqdm import tqdm

# AdamW over every model parameter
optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 20

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")

    # ----- training pass -----
    model.train()
    train_batch_losses = []

    for batch in tqdm(train_loader, desc="Training"):
        # Each batch is (input_ids, attention_mask, labels); move all to the device
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_batch_losses.append(loss.item())

    # Mean of the per-batch losses
    train_loss = sum(train_batch_losses, 0.0)
    avg_train_loss = train_loss / len(train_loader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # ----- validation pass (no gradients) -----
    model.eval()
    val_batch_losses = []
    correct = total = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            val_batch_losses.append(loss.item())

            # Predicted class = index of the largest logit
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    val_loss = sum(val_batch_losses, 0.0)
    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct / total

    print(f"Validation loss: {avg_val_loss:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")


# Adding Early Stopping

In [None]:
from torch.optim import AdamW
from tqdm import tqdm
import torch

# ===== Fresh, reproducible starting point =====
# Re-create the classifier from the pre-trained checkpoint so this run does not
# continue from weights that an earlier training cell already fine-tuned on the
# same `model` object; otherwise the early-stopping criterion would be measured
# from an already-trained model and the cell would give different results each
# time it is re-run.
torch.manual_seed(42)  # fixes the classifier-head init and the shuffle order
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.to(device)

# ===== Optimizer =====
optimizer = AdamW(model.parameters(), lr=2e-5)

# ===== Training parameters =====
epochs = 20
patience = 3  # number of epochs without improvement before stopping early
best_val_loss = float('inf')
counter = 0
best_saved = False  # becomes True once a checkpoint has been written by this run

# Where the best checkpoint is written
best_model_path = "best_model.pt"

# Per-epoch history, kept for plotting later
train_losses_list = []
val_losses_list = []
val_acc_list = []

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")

    # ===== Training =====
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_loader, desc="Training"):
        # Each batch is (input_ids, attention_mask, labels)
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    train_losses_list.append(avg_train_loss)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # ===== Validation =====
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits

            val_loss += loss.item()
            preds = torch.argmax(logits, dim=1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = correct / total
    val_losses_list.append(avg_val_loss)
    val_acc_list.append(val_accuracy)

    print(f"Validation loss: {avg_val_loss:.4f}")
    print(f"Validation accuracy: {val_accuracy:.4f}")

    # ===== Early Stopping =====
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        counter = 0
        torch.save(model.state_dict(), best_model_path)  # save the best model so far
        best_saved = True
        print("Best model saved!")
    else:
        counter += 1
        print(f"EarlyStopping counter: {counter}/{patience}")
        if counter >= patience:
            print("Early stopping triggered!")
            break

# ===== Restore the best checkpoint =====
# After the loop `model` holds the weights of the LAST epoch, which by
# construction is up to `patience` epochs past the best one. Reload the best
# checkpoint so that later evaluation cells score the selected model.
if best_saved:
    model.load_state_dict(torch.load(best_model_path, map_location=device))
    print(f"Best model weights restored (val loss {best_val_loss:.4f}).")


# Final Evaluation on Test Set


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import numpy as np
import torch

# Evaluate the checkpoint selected by early stopping, not whatever weights
# happen to be in memory: after training stops, `model` still holds the
# last-epoch weights unless the best checkpoint is reloaded.
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        # Each batch is (input_ids, attention_mask, labels)
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Labels are not passed: only the logits are needed here
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Overall accuracy and unweighted mean of the per-class F1 scores
test_accuracy = accuracy_score(all_labels, all_preds)
macro_f1 = f1_score(all_labels, all_preds, average='macro')

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Macro F1-score: {macro_f1:.4f}")

# Display names ordered by class id, for the report rows
label_map = {
    "Profile Hacking Identity Theft": 0,
    "EWallet Related Fraud": 1,
    "Fraud CallVishing": 2
}
id2label = {v: k for k, v in label_map.items()}
target_names = [id2label[i] for i in range(len(id2label))]

print("\nClassification Report:")
print(classification_report(
    all_labels,
    all_preds,
    target_names=target_names,
    digits=4
))

print("\nConfusion Matrix:")
print(confusion_matrix(all_labels, all_preds))


# Evaluation on the Full Dataset (AllData, 3000 Reports) as a Test Set

In [None]:
import pandas as pd
import numpy as np

# Load the human-labeled dataset.
# NOTE(review): this is the same CSV path the training/validation/test splits
# were built from earlier in this notebook, so most of these rows were seen
# during training. Scores on this set are therefore optimistic and are not an
# independent measure of generalization — confirm whether a separate file was
# intended.
human_df = pd.read_csv('/content/Arabic_Reports(3000).csv')

# Rename the Arabic column names to English
human_df = human_df.rename(columns={'البلاغ': 'tweet', 'التصنيف': 'label'})

# Drop rows with missing values in 'tweet' or 'label'
human_df = human_df.dropna(subset=["tweet", "label"])

# Normalise the label text: string, stripped, lowercase
human_df["label"] = human_df["label"].astype(str).str.strip().str.lower()

# Map labels to numerical IDs (lowercase keys to match the normalised column)
label_map = {
   "profile hacking identity theft": 0,
   "ewallet related fraud": 1,
   "fraud callvishing": 2
}
human_df["label_id"] = human_df["label"].map(label_map)

# Drop rows whose label is not a key of label_map
human_df = human_df.dropna(subset=["label_id"])
if human_df.empty:
    raise ValueError("No rows left after label mapping; check the label values in the CSV.")

print("Human-labeled dataset loaded and processed:")
print(human_df.head())
print("\nNaN in label_id after mapping and dropping:", human_df["label_id"].isna().sum())
print("Label counts after initial processing:\n", human_df["label"].value_counts())

# Balance the dataset by down-sampling every class to the size of the smallest.
# Each group is sampled explicitly and the pieces concatenated, instead of
# groupby(...).apply(...): apply's handling of the grouping column is deprecated
# in recent pandas and the column is excluded in later versions, which would
# drop "label_id" from the result. Iterating the groups keeps the column and
# draws the same rows (same per-group seed, same group order).
min_samples = human_df["label_id"].value_counts().min()
human_df_balanced = pd.concat(
    [
        class_rows.sample(min_samples, random_state=42)
        for _, class_rows in human_df.groupby("label_id")
    ]
).reset_index(drop=True)

print("\nBalanced dataset label counts:\n", human_df_balanced["label_id"].value_counts())

# Extract texts and integer class ids for evaluation
X_human_test = human_df_balanced["tweet"].values
y_human_test = human_df_balanced["label_id"].values.astype(int)

print("\nShape of X_human_test:", X_human_test.shape)
print("Shape of y_human_test:", y_human_test.shape)


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import numpy as np

# 1. Load the best saved model weights.
#    map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(best_model_path, map_location=device))
print("Best model weights loaded successfully.")

# 2. Tokenize the X_human_test data
human_test_encodings = tokenizer(
    list(X_human_test),
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt"
)
print("X_human_test tokenized.")

# 3. Convert y_human_test to a torch.long tensor
y_human_test_t = torch.tensor(y_human_test, dtype=torch.long)
print("y_human_test converted to tensor.")

# 4. Create a TensorDataset named human_test_dataset
human_test_dataset = TensorDataset(
    human_test_encodings["input_ids"],
    human_test_encodings["attention_mask"],
    y_human_test_t
)
print("human_test_dataset created.")

# 5. Create a DataLoader named human_test_loader (order preserved)
human_test_loader = DataLoader(
    human_test_dataset,
    batch_size=batch_size,
    shuffle=False
)
print("human_test_loader created.")

# 6. Set the model to evaluation mode
model.eval()

# 7. Collect all predictions and true labels
all_human_preds = []
all_human_labels = []

# 8. Run inference without gradients
with torch.no_grad():
    for batch in tqdm(human_test_loader, desc="Evaluating Human Test Set"):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_human_preds.extend(preds.cpu().numpy())
        all_human_labels.extend(labels.cpu().numpy())

# 9. Calculate and print accuracy score and macro F1-score
human_test_accuracy = accuracy_score(all_human_labels, all_human_preds)
human_macro_f1 = f1_score(all_human_labels, all_human_preds, average='macro')

print(f"\nHuman Test Set Accuracy: {human_test_accuracy:.4f}")
print(f"Human Test Set Macro F1-score: {human_macro_f1:.4f}")

# Class names ordered by class id, for the report rows
id2label = {v: k for k, v in label_map.items()}
target_names = [id2label[i] for i in range(len(id2label))]

# 10. Build the classification report once; it is both printed and saved
human_report = classification_report(
    all_human_labels,
    all_human_preds,
    target_names=target_names,
    digits=4
)
print("\nHuman Test Set Classification Report:")
print(human_report)

# 11. Confusion matrix (rows = true class, columns = predicted class)
human_confusion = confusion_matrix(all_human_labels, all_human_preds)
print("\nHuman Test Set Confusion Matrix:")
print(human_confusion)

# 12. Persist the results. The notebook's summary refers to these two files,
#     so they are written here to make the run reproduce them.
with open("human_test_evaluation_results.txt", "w", encoding="utf-8") as results_file:
    results_file.write(f"Human Test Set Accuracy: {human_test_accuracy:.4f}\n")
    results_file.write(f"Human Test Set Macro F1-score: {human_macro_f1:.4f}\n")
    results_file.write("\nHuman Test Set Classification Report:\n")
    results_file.write(human_report)
    results_file.write("\nHuman Test Set Confusion Matrix:\n")
    results_file.write(f"{human_confusion}\n")

pd.DataFrame({
    "true_label_id": [int(i) for i in all_human_labels],
    "predicted_label_id": [int(i) for i in all_human_preds],
    "true_label": [id2label[int(i)] for i in all_human_labels],
    "predicted_label": [id2label[int(i)] for i in all_human_preds],
}).to_csv("human_test_predictions.csv", index=False)
print("\nSaved human_test_evaluation_results.txt and human_test_predictions.csv")

## Summary:

### Q&A
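The model's performance on the balanced human-labeled evaluation set is very strong, with high accuracy and macro F1-score across all three classes. Note, however, that this set is sampled from the same file (`Arabic_Reports(3000).csv`) that supplied the training and validation data, so it overlaps with examples the model was trained on; these scores are therefore likely optimistic and should not be read as evidence of generalization to unseen reports.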

### Data Analysis Key Findings
*   The model achieved an accuracy of 0.9760 and a macro F1-score of 0.9760 on the balanced human-labeled test set.
*   Per-class performance was strong, with 'ewallet related fraud' showing exceptional metrics (Precision: 0.9980, Recall: 0.9950, F1-score: 0.9965).
*   The class 'profile hacking identity theft' had a precision of 0.9576, recall of 0.9720, and F1-score of 0.9648.
*   The class 'fraud callvishing' had a precision of 0.9727, recall of 0.9610, and F1-score of 0.9668.
*   The confusion matrix revealed that the primary misclassifications occurred between 'fraud callvishing' and 'profile hacking identity theft', with 39 instances of 'fraud callvishing' incorrectly classified as 'profile hacking identity theft', and 26 instances of 'profile hacking identity theft' misclassified as 'fraud callvishing'.
*   Evaluation results, including accuracy, classification report, and confusion matrix, were saved to `human_test_evaluation_results.txt`.
*   True labels and predicted labels were saved to `human_test_predictions.csv` for further detailed analysis.

### Insights or Next Steps
*   Investigate the specific characteristics of the 65 misclassified instances between 'fraud callvishing' and 'profile hacking identity theft' to identify patterns or ambiguous features that confuse the model.
*   Consider error analysis on the `human_test_predictions.csv` file to understand the types of errors and potentially refine the model or data preprocessing for these specific cases.


# Qwen

In [None]:
# Install or update bitsandbytes
!pip install -U bitsandbytes
# Update transformers as well
!pip install -U transformers
# Restart the kernel after installing


In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score


In [None]:
# =========================
# Environment setup
# =========================
!pip install -q transformers accelerate bitsandbytes --upgrade

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
from tqdm import tqdm
import gc

# =========================
# Load the data
# =========================
label_map = {
    "profile hacking identity theft": 0,
    "ewallet related fraud": 1,
    "fraud callvishing": 2
}

# Read, rename the Arabic columns, drop incomplete rows, normalise the label
# text, map it to a class id, and drop rows whose label is not in label_map.
df = (
    pd.read_csv("/content/Arabic_Reports(3000).csv")
    .rename(columns={'البلاغ': 'tweet', 'التصنيف': 'label'})
    .dropna(subset=["tweet", "label"])
    .assign(label=lambda frame: frame["label"].astype(str).str.strip().str.lower())
    .assign(label_id=lambda frame: frame["label"].map(label_map))
    .dropna(subset=["label_id"])
)

X = df["tweet"].values
y = df["label_id"].values.astype(int)

# Split the data 70-15-15 (stratified on the class id)
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)

print("Train:", len(X_train), "Val:", len(X_val), "Test:", len(X_test))



In [None]:
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
import torch
from tqdm import tqdm

# -------------------------------
# 1) Load the Qwen model
# -------------------------------
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")  # pad on the left
if tokenizer.pad_token is None:
    # Batched inputs need a padding token; fall back to EOS when none is defined
    tokenizer.pad_token = tokenizer.eos_token

# The training/evaluation cells pass one class id per example as `labels` and
# read the prediction with logits.argmax(dim=1), i.e. they expect logits of
# shape (batch, num_classes). A causal-LM head instead returns per-token
# vocabulary logits and expects per-token labels, so the backbone is loaded
# with a 3-way sequence-classification head.
# NOTE(review): the weights are kept in float16 as before; fine-tuning pure
# fp16 weights with AdamW can be numerically unstable — confirm the loss stays
# finite, or switch dtype if memory allows.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    torch_dtype=torch.float16,
    device_map="auto"
)

# The classification head uses the pad token id to locate the last real token
# of each sequence in a padded batch, so the model config must know it.
model.config.pad_token_id = tokenizer.pad_token_id

# Device that input batches must be moved to (where device_map placed the model's first parameters)
device = next(model.parameters()).device


In [None]:
# ================================
# 5) Tokenization, datasets and loaders
# ================================
max_length = 128
batch_size = 16

def tokenize_data(texts):
    """Tokenize `texts` with the current (Qwen) tokenizer.

    Args:
        texts: iterable of strings.

    Returns:
        The tokenizer output as PyTorch tensors, with padding and truncation
        enabled and `max_length` as the token limit.
    """
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

train_encodings = tokenize_data(X_train)
val_encodings   = tokenize_data(X_val)
test_encodings  = tokenize_data(X_test)

# Convert the labels to tensors
y_train_t = torch.tensor(y_train, dtype=torch.long)
y_val_t   = torch.tensor(y_val, dtype=torch.long)
y_test_t  = torch.tensor(y_test, dtype=torch.long)

# Build datasets and loaders from the encodings produced above. Without this,
# the training and evaluation cells below would pick up the `train_loader`,
# `val_loader` and `test_loader` left over from the AraBERT section, whose
# token ids come from a different tokenizer and vocabulary.
train_dataset = TensorDataset(train_encodings["input_ids"], train_encodings["attention_mask"], y_train_t)
val_dataset   = TensorDataset(val_encodings["input_ids"], val_encodings["attention_mask"], y_val_t)
test_dataset  = TensorDataset(test_encodings["input_ids"], test_encodings["attention_mask"], y_test_t)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)   # reshuffled each epoch
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print("DataLoaders ready:", len(train_loader), len(val_loader), len(test_loader))


In [None]:
from torch.optim import AdamW

# Optimiser and early-stopping state
optimizer = AdamW(model.parameters(), lr=2e-5)
best_val_loss = float('inf')
patience = 3
counter = 0

for epoch in range(10):
    # ----- training pass -----
    model.train()
    train_loss = 0
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels)
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # ----- validation pass (no gradients) -----
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            val_loss += outputs.loss.item()
            # Predicted class = index of the largest logit along dim 1
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    val_acc = correct / total
    print(f"Epoch {epoch+1} Val Acc: {val_acc:.4f}")

    # ----- early stopping on validation loss -----
    improved = avg_val_loss < best_val_loss
    if improved:
        # New best: remember it, reset the patience counter, checkpoint the weights
        best_val_loss = avg_val_loss
        counter = 0
        torch.save(model.state_dict(), "best_qwen2_cls.pt")
        continue

    counter += 1
    if counter >= patience:
        print("Early Stopping Triggered")
        break


In [None]:
# Restore the checkpoint written by the training loop and switch to eval mode
best_state = torch.load("best_qwen2_cls.pt")
model.load_state_dict(best_state)
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        # Each batch is (input_ids, attention_mask, labels)
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit along dim 1
        preds = outputs.logits.argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Accuracy, per-class report and confusion matrix on the test split
print("Test Accuracy", accuracy_score(all_labels, all_preds))
print(classification_report(all_labels, all_preds))
print(confusion_matrix(all_labels, all_preds))
