# Fake News Detector

### Installing Necessary Libraries

In [None]:
!pip install farasa
import re
import nltk
from nltk.corpus import stopwords

import pandas as pd
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer
from farasa.pos import FarasaPOSTagger




### Loading the data

In [None]:
# Read the raw training set. The file is parsed with header=None, so every
# row (including the first one) is treated as data.
path = './data_set/train_set.csv'
fake_data = pd.read_csv(path, header=None)

# Fail fast when nothing was loaded; otherwise confirm success.
if fake_data.empty:
    raise ValueError("The DataFrame is empty. Please check the CSV file ❌")
print("DataFrame loaded successfully ✅")




### Data Preview

In [4]:
# Show the first rows of the loaded frame as a quick sanity check.
fake_data.head()



In [5]:
# (number of rows, number of columns) of the loaded frame.
fake_data.shape



In [6]:
# Column labels of the loaded frame; the CSV was read with header=None, so
# these are positional labels rather than names taken from the file.
fake_data.columns



### Preprocessing Text


In [None]:
# Patterns and tables are built once so that row-wise use (e.g. through
# DataFrame.apply) does not rebuild them on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_DIACRITICS_RE = re.compile(r'[\u064B-\u065F]')
_NON_ARABIC_LETTER_RE = re.compile(r'[^ء-ي\s]')
_WHITESPACE_RE = re.compile(r'\s+')
_LETTER_NORMALIZATION = str.maketrans({
    'إ': 'ا', 'أ': 'ا', 'آ': 'ا',  # Alif variants -> bare Alif
    'ة': 'ه',                      # Teh Marbuta -> Heh
    'ى': 'ي',                      # Alif Maksura -> Ya
    'ؤ': 'ء', 'ئ': 'ء',            # Hamza carriers -> bare Hamza
})


def clean_text(text):
    """Clean and normalize Arabic text.

    Steps, in order:
      1. remove URLs and HTML tags;
      2. remove diacritics (tashkeel);
      3. normalize Alif, Teh Marbuta, Ya and Hamza variants;
      4. drop every character that is neither in the range ء-ي nor
         whitespace (digits, Latin letters and punctuation are removed);
      5. collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw text. Any falsy or non-string value is accepted.

    Returns:
        The cleaned string, or "" when ``text`` is empty or not a ``str``.
    """
    if not text or not isinstance(text, str):
        return ""

    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _DIACRITICS_RE.sub('', text)
    text = text.translate(_LETTER_NORMALIZATION)
    text = _NON_ARABIC_LETTER_RE.sub('', text)

    # Whitespace is collapsed last: removing characters in the step above can
    # leave doubled or trailing spaces behind (e.g. around a deleted number).
    return _WHITESPACE_RE.sub(' ', text).strip()

### Text Cleaning and Normalization (Using Farasa)

### Linguistic and Stylistic Feature Extraction

In [18]:
import pandas as pd
from transformers import pipeline, BertTokenizer, BertForSequenceClassification
import torch

# Load the preprocessed dataset; the first CSV row is used as the header.
# The code further down reads a "cleaned_text" column from this frame.
fake_data = pd.read_csv("preprocessed_100_rows.csv", header=0)

# Sentiment-analysis pipeline; its predicted label is used below as a
# linguistic feature (intended as a proxy for alarmist or exaggerated tone).
# NOTE(review): confirm this model identifier resolves on the Hugging Face Hub.
sentiment_analyzer = pipeline("sentiment-analysis", model="aubmindlab/bert-base-arabic-sentiment")

# Tokenizer and sequence-classification model used for the 3-way
# reliable / doubtful / fake prediction.
# NOTE(review): unless this checkpoint already ships a fine-tuned 3-class
# head, the classification layer is presumably freshly initialized and its
# predictions are arbitrary until the model is fine-tuned — confirm.
model_name = "aubmindlab/bert-base-arabic"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 classes: reliable, doubtful, fake

# Linguistic feature extraction for a single document
def extract_linguistic_features(text):
    """Extract linguistic features from one piece of text.

    Currently the only feature is the label predicted by the module-level
    ``sentiment_analyzer`` pipeline.

    Args:
        text: The text to analyze. ``None`` and NaN are treated as missing;
            NaN is what pandas yields when an empty string is round-tripped
            through a CSV file.

    Returns:
        A dict with one key, ``"sentiment"``: the label string of the first
        pipeline result, or ``None`` when ``text`` is missing.
    """
    # ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return {"sentiment": None}

    # Truncate so documents longer than the model's maximum sequence length
    # are shortened instead of raising inside the model.
    sentiment = sentiment_analyzer(text, truncation=True)

    # Add more feature extraction here (e.g., keyword-based detection, sentiment score)
    return {
        "sentiment": sentiment[0]['label'],  # label names depend on the model's config
    }

# AraBERT classification of a single document (reliable, doubtful, fake)
def classify_text_with_arabert(text):
    """Classify one piece of text with the module-level ``model``.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    at most 512 tokens) and passed through the model with gradient tracking
    disabled.

    Args:
        text: The text to classify. ``None`` and NaN are treated as missing;
            NaN is what pandas yields when an empty string is round-tripped
            through a CSV file.

    Returns:
        The index of the highest logit as an ``int`` (0: reliable,
        1: doubtful, 2: fake), or ``None`` when ``text`` is missing.
    """
    # ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return None

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # inference only; no gradients needed
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=-1).item()

# Readable names for the class indices returned by the classifier
class_mapping = {0: "reliable", 1: "doubtful", 2: "fake"}

# Per-row feature dict computed from the cleaned text
fake_data["linguistic_features"] = fake_data["cleaned_text"].apply(extract_linguistic_features)

# Per-row class index, converted straight to its readable label
fake_data["classification"] = (
    fake_data["cleaned_text"]
    .apply(classify_text_with_arabert)
    .map(class_mapping)
)

# Persist the enriched frame (without the index column) and show a sample
fake_data.to_csv("preprocessed_with_classification.csv", index=False)
print(fake_data.head())






### Transformer Models for Text Classification

### Enriching Models with LLMs Specialized in Arabic