In [1]:
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Baseline experiment: TF-IDF features + Logistic Regression on the Arabic
# propaganda dataset. Loads the CSV, makes a stratified train/test split, fits
# the pipeline, prints macro-F1 / accuracy / per-class report, and saves the
# fitted pipeline with joblib.

# 0. Configuration
# The project root defaults to the original author's local checkout; set the
# PROPAGANDA_PROJECT_ROOT environment variable to run the notebook elsewhere.
PROJECT_ROOT = Path(
    os.environ.get(
        "PROPAGANDA_PROJECT_ROOT",
        r"C:\Users\Ahmed\OneDrive\Desktop\NLP\NLP_Project_Propaganda",
    )
)
data_path = PROJECT_ROOT / "data" / "processed" / "arabic_propaganda_dataset.csv"
model_path = PROJECT_ROOT / "models" / "baseline_model.pkl"

TEXT_COL = "Text"            # column holding the raw text
LABEL_COL = "Final_Label"    # column holding the class label
TEST_SIZE = 0.2              # fraction of rows held out for evaluation
SEED = 42                    # shared by the split and the classifier
MAX_FEATURES = 5000          # vocabulary cap for TF-IDF

# 1. Load Data
df = pd.read_csv(data_path)

# Fail early with a readable message instead of a bare KeyError further down.
missing_cols = [col for col in (TEXT_COL, LABEL_COL) if col not in df.columns]
if missing_cols:
    raise KeyError(f"{data_path} is missing required column(s): {missing_cols}")

# 2. Split
# Stratified on the label so train and test keep the same class proportions.
# NOTE(review): the original comment says this is the same split as the AraBERT
# run; that only holds if that run uses the same CSV, test_size, random_state
# and stratification -- confirm against the AraBERT notebook.
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COL],
    df[LABEL_COL],
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=df[LABEL_COL],
)

# 3. Build Pipeline (TF-IDF + Logistic Regression)
# TF-IDF turns each text into a sparse numeric vector; max_features keeps only
# the MAX_FEATURES most frequent terms in the training corpus.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=MAX_FEATURES)),
    ('clf', LogisticRegression(
        class_weight='balanced',  # reweight classes inversely to their frequency
        solver='liblinear',
        random_state=SEED,        # liblinear consumes random_state; pin it for reproducible re-runs
    )),
])

# 4. Train
print("‚è≥ Training Baseline Model...")
pipeline.fit(X_train, y_train)

# 5. Evaluate
print("‚úÖ Evaluation on Test Set:")
predictions = pipeline.predict(X_test)

# Macro-F1 averages the per-class F1 scores with equal weight per class.
f1 = f1_score(y_test, predictions, average='macro')
acc = accuracy_score(y_test, predictions)

print(f"üèÜ Baseline F1 Macro: {f1:.4f}")
print(f"üìä Baseline Accuracy: {acc:.4f}")
print("\nDetailed Report:\n")
print(classification_report(y_test, predictions))

# 6. Save Model
# Create the target directory first so a fresh checkout does not fail only
# after training has already finished.
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)
print("‚úÖ Baseline Model Saved.")

‚è≥ Training Baseline Model...
‚úÖ Evaluation on Test Set:
üèÜ Baseline F1 Macro: 0.5257
üìä Baseline Accuracy: 0.5453

Detailed Report:

                precision    recall  f1-score   support

Non-Propaganda       0.38      0.49      0.43       439
    Propaganda       0.68      0.57      0.62       830

      accuracy                           0.55      1269
     macro avg       0.53      0.53      0.53      1269
  weighted avg       0.58      0.55      0.56      1269

‚úÖ Baseline Model Saved.
