In [54]:
import re
import json
from typing import Dict
from collections import Counter
from glob import glob
import pymupdf
from tqdm import tqdm
import pyarabic.araby as araby

In [7]:
pdf_directory = glob("../data/downloaded_pdfs/*.pdf")

In [39]:
def detect_language(text: str) -> str:
    """
    Detect whether text is primarily Arabic, English, or Mixed

    Args:
        text (str): Input text to analyze
        threshold (float): Minimum ratio (0.0-1.0) of minority language to consider text as mixed.
                         Default is 0.2 (20%)

    Returns:
        str: 'ar' for Arabic, 'en' for English, 'mixed' for bilingual text
    """
    if not isinstance(text, str):
        raise ValueError(f"Invalid text: {text}")

    text = text.lower()
    text = re.sub(r"[^a-zA-Z\u0600-\u06FF]", "", text)
    letter_counts = Counter(text)

    arabic_alphabet = set(araby.LETTERS)
    english_alphabet = set("abcdefghijklmnopqrstuvwxyz")

    arabic_count = sum(letter_counts[char] for char in arabic_alphabet)
    english_count = sum(letter_counts[char] for char in english_alphabet)

    total_count = arabic_count + english_count

    if total_count == 0:
        return {"ar": 0, "en": 0}

    arabic_ratio = arabic_count / total_count
    english_ratio = english_count / total_count

    return {"ar": arabic_ratio, "en": english_ratio}

In [40]:
def get_text_from_pdf(pdf_path: str) -> str:
    content = []
    doc = pymupdf.open(pdf_path)
    for page in doc:
        text = page.get_text()
        content.append(text)
    return " ".join(content)


In [41]:
def create_lang_report(pdf_path: str) -> Dict:
    text = get_text_from_pdf(pdf_path)
    lang = detect_language(text)
    return {"pdf_path": pdf_path, **lang}

In [None]:
lang_reports = [create_lang_report(path) for path in tqdm(pdf_directory)]

In [48]:
filtered = [lang_report for lang_report in lang_reports if lang_report["ar"] > 0.75]

In [50]:
len(filtered)

597

In [51]:
filtered[0]

{'pdf_path': '../data/downloaded_pdfs/27_03_2024_59d14eda.pdf',
 'ar': 0.9619351209450591,
 'en': 0.03806487905494094}

In [52]:
filtered_paths = [lang_report["pdf_path"] for lang_report in lang_reports]

In [53]:
filtered_paths

['../data/downloaded_pdfs/27_03_2024_59d14eda.pdf',
 '../data/downloaded_pdfs/W14-0903_9bd93c02.pdf',
 '../data/downloaded_pdfs/content_760c3e7d.pdf',
 '../data/downloaded_pdfs/Lughat_ul_Arabia1_d11c33ac.pdf',
 '../data/downloaded_pdfs/2205_26cdaa2b.pdf',
 '../data/downloaded_pdfs/2025.wacl-1.5_3e53d96e.pdf',
 '../data/downloaded_pdfs/_f286e5ec.pdf',
 '../data/downloaded_pdfs/viewcontent_ea19d5dc.pdf',
 '../data/downloaded_pdfs/news-premier-commits-to-strong-diverse-communities-20230414-langAR_7ff4b266.pdf',
 '../data/downloaded_pdfs/77732218289fa2d7791d13217fb50e416704e597f2df7_9ceb783a.pdf',
 '../data/downloaded_pdfs/document_19fdbd93.pdf',
 '../data/downloaded_pdfs/ar-conclusion-report_d552385b.pdf',
 '../data/downloaded_pdfs/Rendok_a_youth_secret_language_in_Sudan_46d8b6d7.pdf',
 '../data/downloaded_pdfs/Langar-History-2-4-April-2020_8e2ba696.pdf',
 '../data/downloaded_pdfs/Pamphlet-Langar_b60d2462.pdf',
 '../data/downloaded_pdfs/d63d64b3cd69e1fd62259d66611596fe23cd_700f7921.pdf',


In [57]:
json.dump(filtered_paths, open("../data/filtered_pdfs.json", "w"), indent=4, ensure_ascii=False)