In [1]:
import torch
# Environment sanity check: report the installed PyTorch build and whether a
# CUDA device is visible ("True" is the value you want for GPU training).
torch_version = torch.__version__
cuda_visible = torch.cuda.is_available()
print(f"PyTorch Version: {torch_version}")
print(f"CUDA Available: {cuda_visible}")

PyTorch Version: 2.5.1+cu121
CUDA Available: True


In [2]:
import pandas as pd
import numpy as np
import torch
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Select the compute device: use CUDA when it is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"‚úÖ Training on Device: {device.upper()}")

if device != "cuda":
    print("‚ö†Ô∏è WARNING: You are running on CPU. This will be very slow!")
else:
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    # Total memory of device 0, scaled by 1e9 so it is printed in GB.
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"   VRAM: {vram:.2f} GB")

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ Training on Device: CUDA
   GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU
   VRAM: 6.44 GB


In [3]:
# 1. Load Data
# The location defaults to the original local path but can be overridden with the
# PROPAGANDA_DATA_PATH environment variable so the notebook runs on other machines.
data_path = os.environ.get(
    "PROPAGANDA_DATA_PATH",
    r"C:\Users\Ahmed\OneDrive\Desktop\NLP\NLP_Project_Propaganda\data\processed\arabic_propaganda_dataset.csv",
)

if not os.path.exists(data_path):
    raise FileNotFoundError(f"‚ùå File not found: {data_path}")

df = pd.read_csv(data_path)

# 2. Rename columns for Hugging Face
df = df[['Text', 'Final_Label']].rename(columns={'Text': 'text', 'Final_Label': 'label'})

# Rows without text cannot be tokenized later, so remove them here and say so.
missing_text = int(df['text'].isna().sum())
if missing_text:
    print(f"   - Dropping {missing_text} rows with missing text")
    df = df.dropna(subset=['text'])

# 3. Map Labels to Numbers
label_map = {'Non-Propaganda': 0, 'Propaganda': 1}
mapped_labels = df['label'].map(label_map)
# Series.map yields NaN for any value that is not a key of label_map; fail loudly
# instead of letting NaN / float labels flow into the split and the trainer.
if mapped_labels.isna().any():
    unexpected = df.loc[mapped_labels.isna(), 'label'].unique().tolist()
    raise ValueError(
        f"Unexpected label values {unexpected}; expected one of {list(label_map)}"
    )
df['label'] = mapped_labels.astype('int64')

# 4. Split (80% Train, 20% Test), stratified so both splits keep the class ratio
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
assert len(train_df) + len(test_df) == len(df)

# 5. Convert to Hugging Face Dataset format
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

print("‚úÖ Data Loaded successfully.")
print(f"   - Training Samples: {len(train_df)}")
print(f"   - Testing Samples:  {len(test_df)}")

‚úÖ Data Loaded successfully.
   - Training Samples: 5073
   - Testing Samples:  1269


In [4]:
# AraBERT checkpoint identifier; the tokenizer is loaded from the same name that
# is later used to load the classification model.
model_name = "aubmindlab/bert-base-arabertv02"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens; the fixed,
    short length keeps memory use low enough for a 6GB GPU.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw inputs.

    Returns:
        Whatever the tokenizer call returns for those inputs.
    """
    texts = examples["text"]
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(texts, **encoding_options)

print("‚è≥ Tokenizing data... (Please wait)")
# Apply the same batched tokenization to both splits, train first and then test.
train_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)
print("‚úÖ Tokenization Complete.")

‚è≥ Tokenizing data... (Please wait)


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5073/5073 [00:00<00:00, 15865.78 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1269/1269 [00:00<00:00, 15838.30 examples/s]

‚úÖ Tokenization Complete.





In [5]:
# 1. Define Metric Function (F1 Macro is required)
def compute_metrics(eval_pred):
    """Compute macro-averaged F1 and accuracy from evaluation predictions.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. ``logits`` may
            also be a tuple of outputs, in which case its first entry is taken
            as the class scores.

    Returns:
        dict: ``{"f1_macro": <macro F1>, "accuracy": <accuracy>}``.
    """
    logits, labels = eval_pred
    # Some models hand back several outputs as a tuple; the scores come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    # Predicted class = index of the largest score along the last axis.
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 makes the policy explicit: a class that is never predicted
    # contributes an F1 of 0 to the macro average.
    f1 = f1_score(labels, predictions, average='macro', zero_division=0)
    acc = accuracy_score(labels, predictions)
    return {"f1_macro": f1, "accuracy": acc}

# 2. Load Model onto the selected device
print("‚è≥ Loading AraBERT Model...")
# num_labels=2 matches the binary label mapping (0 / 1) used for the dataset.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
# Report the device actually used: `device` is "cpu" when CUDA is unavailable,
# so the message must not claim GPU unconditionally.
print(f"‚úÖ Model loaded to {'GPU' if device == 'cuda' else 'CPU'}.")

‚è≥ Loading AraBERT Model...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


‚úÖ Model loaded to GPU.


In [7]:
# Optimized for RTX 3050 (6GB VRAM)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,              # How many times to look at the data
    per_device_train_batch_size=4,   # Small batch size to avoid out-of-memory crashes
    per_device_eval_batch_size=8,    # Evaluation uses less memory, so 8 is fine
    gradient_accumulation_steps=4,   # Accumulate 4 steps = Effective Batch Size 16
    eval_strategy="epoch",           # Evaluate every epoch (formerly spelled `evaluation_strategy`)
    save_strategy="epoch",           # Save model every epoch
    learning_rate=3e-5,
    # Mixed precision saves memory and speeds up training, but needs a GPU; enable
    # it only when CUDA is present so the CPU fallback still works.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,          # Higher F1 is better; stated explicitly
    logging_steps=50,
    save_total_limit=2,              # Keep at most 2 checkpoints to save disk space
)

# NOTE(review): the test split is used here as the evaluation set that drives
# `load_best_model_at_end`, so the checkpoint is selected on the same data that is
# later reported as the test result. A separate validation split would avoid this.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics,
)
print("‚úÖ Trainer Initialized.")

‚úÖ Trainer Initialized.


In [8]:
# Start fine-tuning. The value returned by `trainer.train()` is left as the cell's
# final expression so the notebook displays it.
print("üöÄ Starting Training...")
training_output = trainer.train()
training_output

üöÄ Starting Training...


Epoch,Training Loss,Validation Loss,F1 Macro,Accuracy
1,0.6471,0.638636,0.395426,0.654058
2,0.6538,0.635127,0.464504,0.661151
3,0.6379,0.636727,0.50833,0.65327


TrainOutput(global_step=954, training_loss=0.643982821290598, metrics={'train_runtime': 304.7118, 'train_samples_per_second': 49.946, 'train_steps_per_second': 3.131, 'total_flos': 1001071787880960.0, 'train_loss': 0.643982821290598, 'epoch': 3.0})

In [9]:
# 1. Final Evaluation
print("\nüìä Final Test Set Evaluation:")
# Name the evaluated split explicitly so the reported numbers are tied to the
# test set rather than to whatever default the trainer was configured with.
stats = trainer.evaluate(eval_dataset=test_tokenized)
print(stats)

# 2. Save the Best Model
# The target directory defaults to the original local path but can be overridden
# with the PROPAGANDA_MODEL_DIR environment variable.
save_path = os.environ.get(
    "PROPAGANDA_MODEL_DIR",
    r"C:\Users\Ahmed\OneDrive\Desktop\NLP\NLP_Project_Propaganda\models\arabert_propaganda_model",
)
# Model weights and tokenizer files go to the same directory.
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print("------------------------------------------------")
print(f"‚úÖ Model Saved successfully to: {save_path}")
print(f"üèÜ Final F1 Score: {stats['eval_f1_macro']:.4f}")
print("------------------------------------------------")


üìä Final Test Set Evaluation:


{'eval_loss': 0.6367270350456238, 'eval_f1_macro': 0.508330104610616, 'eval_accuracy': 0.653270291568164, 'eval_runtime': 4.1562, 'eval_samples_per_second': 305.328, 'eval_steps_per_second': 38.256, 'epoch': 3.0}
------------------------------------------------
‚úÖ Model Saved successfully to: C:\Users\Ahmed\OneDrive\Desktop\NLP\NLP_Project_Propaganda\models\arabert_propaganda_model
üèÜ Final F1 Score: 0.5083
------------------------------------------------


# Phase 2 Report: Comparative Study & Implementation

**Project Title:** Propaganda Detection in Arabic Narratives (Idea 6)
**Course:** AIS411 - NLP
**Phase Status:** ✅ Completed
**Date:** January 3, 2026

## 1. Objective
The goal of Phase 2 was to implement and compare two distinct classification models to detect propaganda in Arabic text. This satisfies the course requirement: *"Comparative study and implementation, each member implements a different model."*

## 2. Models Implemented

### **Model A: Deep Learning (Transformer)**
* **Architecture:** `AraBERT-Base-v02` (Pre-trained BERT model for Arabic).
* **Why Chosen:** State-of-the-art performance on Arabic NLP tasks due to its ability to capture contextual meaning.
* **Training Setup:**
    * **Framework:** Hugging Face Transformers & PyTorch.
    * **Optimization:** Mixed Precision (FP16) + Gradient Accumulation (steps=4) to accommodate the RTX 3050 6GB GPU.
    * **Hyperparameters:** Epochs=3, Batch Size=4 (Effective 16), Learning Rate=3e-5.

### **Model B: Baseline (Classical ML)**
* **Architecture:** Logistic Regression.
* **Feature Extraction:** TF-IDF (Term Frequency-Inverse Document Frequency) limited to top 5,000 features.
* **Why Chosen:** Serves as a fast, interpretable benchmark to measure the "lift" provided by Deep Learning and to establish a performance floor.
* **Configuration:** `class_weight='balanced'` was explicitly used to address the dataset's 2:1 class imbalance.

## 3. Comparative Results

| Metric | Model A (AraBERT) | Model B (Baseline) | Interpretation |
| :--- | :--- | :--- | :--- |
| **Accuracy** | **~65.3%** | 54.5% | AraBERT achieves higher overall accuracy by predicting the majority class frequently. |
| **F1 Macro** | 0.508 | **0.526** | **Baseline Wins.** The statistical model better handles the minority class due to explicit class weighting. |
| **Training Time** | ~5 minutes (GPU; logged `train_runtime` ≈ 305 s for 3 epochs) | < 10 seconds (CPU) | Baseline is significantly more computationally efficient. |

## 4. Analysis & Conclusion
* **Imbalance Challenge:** The dataset contains significantly more "Propaganda" samples (4,150) than "Non-Propaganda" samples (2,192).
* **Model Behavior:**
    * **AraBERT:** Converged towards a "majority class classifier." Its accuracy (~65%) is essentially the share of "Propaganda" samples in the dataset (4,150 / 6,342 ≈ 65.4%), i.e. what is obtained by guessing "Propaganda" for most inputs. This "laziness" resulted in a lower F1 Macro score (0.508).
    * **Baseline:** The use of `class_weight='balanced'` penalized mistakes on the minority class. While this lowered total accuracy (more false positives), it resulted in a more robust F1 Macro score (0.526), making it a "fairer" classifier.
* **Verdict:** Deep Learning (AraBERT) has higher potential capacity but requires specific optimization techniques (like Class Weights or Loss Function modification) to handle imbalanced data effectively. This optimization is the primary objective for Phase 3.

## 5. Deliverables
* **Saved Models:**
    * `models/arabert_propaganda_model/` (PyTorch Weights & Tokenizer)
    * `models/baseline_model.pkl` (Scikit-Learn Pipeline)
* **Notebooks:**
    * `02_Model_Training_AraBERT.ipynb`: Deep Learning implementation.
    * `03_Model_Comparison_Baseline.ipynb`: Classical ML implementation.

---
### Next Step: Phase 3 (Optimization)
The next phase will focus on surpassing the baseline by forcing AraBERT to learn from the minority class.
* **Action:** Implement **Class Weights** in the Trainer to penalize the model for ignoring "Non-Propaganda" samples.
* **Target:** F1 Macro > 0.60.