### Fake News Detector

#### Import Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import nltk
import re
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer
from farasa.pos import FarasaPOSTagger
from farasa.ner import FarasaNamedEntityRecognizer
from tqdm import tqdm
from joblib import Parallel, delayed
tqdm.pandas()
# Initialize Farasa tools
segmenter = FarasaSegmenter()
pos_tagger = FarasaPOSTagger()
stemmer = FarasaStemmer()
ner = FarasaNamedEntityRecognizer()

#### Load Data Set

In [2]:
# put the path for the data set here
path = '../data_set/train_set.csv'
path_2 = '../data_set/algerian_dialect_news.csv'
df = pd.read_csv(path, encoding='utf-8', skiprows=range(1, 3001))
df = df.dropna(subset=['text'])  # Drop rows where 'text' is NaN

In [3]:
df.shape

(5069, 3)

In [4]:
df.head()

Unnamed: 0,label,source,text
0,1,youtube,توفى ولا لبارح بصح غير الخبر مزال ماتنشر
1,1,paraphrased,متحديًا تحذيرات السكان لين يرفضون مغادرة المبا...
2,0,translated,وقالت الصين بلي الدبلوماسية لازما باش شبه الج...
3,1,manual,لاقتصاد تع الصين بدا يكبر فلقرن لعشرين
4,0,translated,الصين عطات مليار مساعدات عسكرية مجانية لأفريقيا


In [5]:
df.columns

Index(['label', 'source', 'text'], dtype='object')

In [6]:
# Count null rows
null_rows_count = df.isnull().sum().sum()
print(f"Number of null rows: {null_rows_count}")

# Count duplicated rows
duplicated_rows_count = df.duplicated().sum()
print(f"Number of duplicated rows: {duplicated_rows_count}")

Number of null rows: 0
Number of duplicated rows: 0


## NLP Processing

#### Text Cleaning

In [7]:
# -------------------------
# Arabic Text Normalization
def normalize_arabic(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)                   # Remove HTML tags
    # Remove emojis
    text = re.sub(r"[^\w\s,]", "", text, flags=re.UNICODE)  # Remove emojis
    # Normalize Arabic text
    text = re.sub(r"[إأآٱ]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("[ًٌٍَُِّْ]", "", text)  # Remove diacritics
    text = re.sub(r"[^\w\s]", "", text)   # Remove punctuation
    return text.strip()

# Load custom Arabic stopwords
with open("../data_set/algerian_arabic_stopwords.txt", "r", encoding="utf-8") as f:
    custom_stopwords = set(word.strip() for word in f.readlines())
# -------------------------
# Text Segmentation (Sentence Splitting)
def segment_text(text):
    # Farasa returns segmented text with morphological boundaries marked
    segmented = segmenter.segment(text)
    # Return as is or with your custom separator
    return segmented

# -------------------------
# Lemmatization (Root Extraction)
def lemmatize_text(text):
    return " ".join(stemmer.stem(text))

# -------------------------
# POS Tagging
def pos_tag_text(text):
    return " ".join(pos_tagger.tag(text))

# -------------------------
# -------------------------
# Stopword Removal
def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word not in custom_stopwords])

# -------------------------
# Text Segmentation using Farasa
def segment_text(text):
    segmented = segmenter.segment(text)
    return segmented

# -------------------------
# Lemmatization using Farasa
def lemmatize_text(text):
    stemmed = stemmer.stem(text)
    return stemmed

# -------------------------
# POS Tagging using Farasa
def pos_tag_text(text):
    tagged = pos_tagger.tag(text)
    return tagged
# -----------------------

# Apply stopword removal
tqdm.pandas(desc="🛑 Removing Stopwords")
df["cleaned_text"] = df["text"].progress_apply(remove_stopwords)

# Apply normalization
tqdm.pandas(desc="🔤 Normalizing")
df["normalized_text"] = df["cleaned_text"].progress_apply(normalize_arabic)

# Apply segmentation
tqdm.pandas(desc="📌 Segmenting")
df["segmented_text"] = df["normalized_text"].progress_apply(segment_text)

# Apply stemming/lemmatization and convert to token list
tqdm.pandas(desc="🌱 Stemming")
df["stemmed_tokens"] = df["normalized_text"].progress_apply(lambda text: stemmer.stem(text))
df["stemmed_tokens"] = df["stemmed_tokens"].apply(lambda text: text.split())

# Apply POS tagging and convert to token list
tqdm.pandas(desc="📌 POS Tagging")
df["pos_tags"] = df["normalized_text"].progress_apply(lambda text: pos_tagger.tag(text))
df["pos_tags"] = df["pos_tags"].apply(lambda text: text.split())

# Save the processed dataset
df.to_csv("cleaned_dataset.csv", index=False, encoding="utf-8-sig")
print("✅ NLP Preprocessing Complete! Ready for AraBERT tokenizer.")

🛑 Removing Stopwords: 100%|██████████| 5069/5069 [00:00<00:00, 234282.77it/s]
🔤 Normalizing: 100%|██████████| 5069/5069 [00:00<00:00, 77290.82it/s]
📌 Segmenting: 100%|██████████| 5069/5069 [4:00:17<00:00,  2.84s/it]  
🌱 Stemming: 100%|██████████| 5069/5069 [3:59:39<00:00,  2.84s/it]  
📌 POS Tagging: 100%|██████████| 5069/5069 [13:08:05<00:00,  9.33s/it]  

✅ NLP Preprocessing Complete! Ready for AraBERT tokenizer.





In [None]:
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer
from farasa.pos import FarasaPOSTagger
from farasa.ner import FarasaNamedEntityRecognizer
import re
# Initialize Farasa tools
segmenter = FarasaSegmenter()
pos_tagger = FarasaPOSTagger()
stemmer = FarasaStemmer()
ner = FarasaNamedEntityRecognizer()


In [27]:
def normalize_arabic(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)                   # Remove HTML tags
    # Remove emojis
    text = re.sub(r"[^\w\s,]", "", text, flags=re.UNICODE)  # Remove emojis
    # Normalize Arabic text
    text = re.sub(r"[إأآٱ]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ه", "ة", text)
    text = re.sub("[ًٌٍَُِّْ]", "", text)  # Remove diacritics
    text = re.sub(r"[^\w\s]", "", text)   # Remove punctuation
    return text.strip()

def segment_text(text):
    segmented = segmenter.segment(text)
    return segmented

def lemmatize_text(text):
    stemmed = stemmer.stem(text)
    return stemmed

with open("../data_set/algerian_arabic_stopwords.txt", "r", encoding="utf-8") as f:
    custom_stopwords = set(word.strip() for word in f.readlines())

def remove_stopwords(text):
    words = text.split()
    return " ".join([word for word in words if word not in custom_stopwords])

In [28]:
text ="محمد صلاح يقود نادي ليفربول للفوز الكبير 🏆 على مانشستر يونايتد"

#ليفربول #محمد_صلاح #الدوري_الإنجليزي"

print("Original Text:", text)

# Remove stopwords from the text
text_without_stopwords = remove_stopwords(text)

# Normalize the text after removing stopwords
normalized_text = normalize_arabic(text_without_stopwords)
print("Normalized Text:", normalized_text)

stemmed_text = stemmer.stem(normalized_text)
print("Stemmed Text:", stemmed_text)

# Now POS tagging on stemmed text
pos_tags_split = pos_tagger.tag(stemmed_text).split()
print("POS Tags (Split):")
for tag in pos_tags_split:
    print(tag)

Original Text: محمد صلاح يقود نادي ليفربول للفوز الكبير 🏆 على مانشستر يونايتد
Normalized Text: محمد صلاح يقود نادي ليفربول للفوز الكبير  مانشستر يونايتد
Stemmed Text: محمد صلاح قاد نادي ليفربول فوز كبير مانشستر يونايتد
POS Tags (Split):
S/S
محمد/NOUN-MS
صلاح/NOUN-MS
قاد/V
نادي/NOUN-MS
ليفربول/NOUN-MS
فوز/NOUN-MS
كبير/ADJ-MS
مانشستر/NOUN-MS
يونايتد/NOUN-MS
E/E


In [25]:
# Your input text
text = "محمد صلاح يقود نادي ليفربول للفوز الكبير 🏆 على مانشستر يونايتد"

# Print the original text
print("Original Text:", text)

# Normalize the text
normalized_text = normalize_arabic(text)
print("Normalized Text:", normalized_text)

# Stem the normalized text
stemmed_text = stemmer.stem(normalized_text)
print("Stemmed Text:", stemmed_text)

# Function to split text into words
def split_text_to_words(text):
    # Split the text by spaces or punctuation
    return [word for word in text.split() if word not in ['.', ',', '!', '?', '…', '(', ')']]

# Split normalized and stemmed text into words
normalized_words = split_text_to_words(normalized_text)
stemmed_words = split_text_to_words(stemmed_text)

# Output the word lists
print("\nNormalized Text as Words:", normalized_words)
print("\nAfter processing:", stemmed_words)


Original Text: محمد صلاح يقود نادي ليفربول للفوز الكبير 🏆 على مانشستر يونايتد
Normalized Text: محمد صلاح يقود نادي ليفربول للفوز الكبير  علي مانشستر يونايتد
Stemmed Text: محمد صلاح قاد نادي ليفربول فوز كبير علي مانشستر يونايتد

Normalized Text as Words: ['محمد', 'صلاح', 'يقود', 'نادي', 'ليفربول', 'للفوز', 'الكبير', 'علي', 'مانشستر', 'يونايتد']

After processing: ['محمد', 'صلاح', 'قاد', 'نادي', 'ليفربول', 'فوز', 'كبير', 'علي', 'مانشستر', 'يونايتد']
