### Load necessary libraries

In [17]:
!pip install sentence_transformers

^C


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Local directory the tokenizer and the classifier are both read from
# (edit this if your copy lives somewhere else).
model_path = "./arabic-news-classifier"

# Restore the tokenizer first, then the sequence-classification model,
# from the same directory.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_path),
    AutoModelForSequenceClassification.from_pretrained(model_path),
)

In [18]:
import re
from farasa.segmenter import FarasaSegmenter

# Create one FarasaSegmenter instance at notebook level. segment_text() below
# calls its .segment() method, so this statement has to run before that
# function is used.
segmenter = FarasaSegmenter()

# Normalize Arabic text
def normalize_arabic(text):
    """Clean ``text`` by applying a fixed sequence of regex substitutions.

    The substitutions run in the order listed below, each one operating on the
    output of the previous one. Leading and trailing whitespace is stripped at
    the end; whitespace inside the string is left as it is (so removing a
    token can leave two consecutive spaces behind).

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    substitutions = (
        # URLs: anything starting with http / www / https up to the next whitespace
        (r"http\S+|www\S+|https\S+", ""),
        # HTML-style tags: shortest "<...>" spans
        (r"<.*?>", ""),
        # Everything that is not a word character, whitespace or "," --
        # this is the step that drops emojis and most punctuation
        (r"[^\w\s,]", ""),
        # Alef variants -> bare alef
        (r"[إأآٱ]", "ا"),
        # Alef maqsura -> ya
        ("ى", "ي"),
        # Ta marbuta -> ha
        ("ة", "ه"),
        # Diacritic marks
        ("[ًٌٍَُِّْ]", ""),
        # Remaining non-word, non-whitespace characters (i.e. the "," kept above)
        (r"[^\w\s]", ""),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        # re.UNICODE is already the default for str patterns; it is passed
        # explicitly for every rule, as the original did for one of them.
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.UNICODE)
    return cleaned.strip()

# Segment text using Farasa
def segment_text(text):
    """Pass ``text`` to the notebook-level ``segmenter`` and return what it produces.

    Args:
        text: The string handed to ``segmenter.segment``.

    Returns:
        Whatever ``segmenter.segment(text)`` returns.
    """
    segmented = segmenter.segment(text)
    return segmented

# Run one example sentence through both preprocessing stages:
# normalisation first, then segmentation of the normalised string.
input_text = "الجزائر تتأهل إلى كأس العالم 2022 بعد فوزها على الكاميرون"
normalized_text = normalize_arabic(input_text)
segmented_text = segment_text(normalized_text)

# Show the segmented form so the result can be checked by eye
print(f"Segmented Text: {segmented_text}")


Segmented Text: ال+جزائر تتاهل الي كاس ال+عالم 2022 بعد فوز+ها علي ال+كاميرون


In [19]:
def predict(text, max_length=512):
    """Classify one text and return the index of its highest-scoring class.

    Uses the notebook-level ``tokenizer`` and ``model``. The encoded inputs are
    moved to the device the model lives on, so the function also works after
    the model has been moved off the CPU (e.g. ``model.to("cuda")``).

    Args:
        text: The text to classify. Pass a single text: the final ``.item()``
            call only works when the prediction tensor has exactly one element.
        max_length: Maximum number of tokens kept by the tokenizer; longer
            inputs are truncated. Defaults to 512, the previously hard-coded
            value.

    Returns:
        int: The class id with the largest logit.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Keep the input tensors on the same device as the model weights;
    # otherwise the forward pass raises a device-mismatch error.
    inputs = inputs.to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    return predicted_class_id

In [20]:
# Classify the normalised (not the segmented) form of the example sentence.
# NOTE(review): segmented_text is computed earlier but only printed here, never
# passed to predict(); confirm which form the model expects as input.
prediction = predict(normalized_text)

# Maps the class id returned by predict() to a readable label.
# NOTE(review): this id -> label assignment is an assumption made in this
# notebook; verify it against the labels the classifier was trained with.
# A class id that is missing from this dict raises KeyError on the last print.
label_map = {0: "real", 1: "fake"}  # placeholder mapping; adjust to the real labels
print(f"Input text: {input_text}")
print(f"Normalized text: {normalized_text}")
print(f"Segmented text: {segmented_text}")
print(f"Predicted class ID: {prediction}")
print(f"Predicted label: {label_map[prediction]}")

Input text: الجزائر تتأهل إلى كأس العالم 2022 بعد فوزها على الكاميرون
Normalized text: الجزائر تتاهل الي كاس العالم 2022 بعد فوزها علي الكاميرون
Segmented text: ال+جزائر تتاهل الي كاس ال+عالم 2022 بعد فوز+ها علي ال+كاميرون
Predicted class ID: 1
Predicted label: fake
