            # 01 - Hadith Dataset EDA

            This notebook explores the downloaded hadith data, focusing on `bukhari.json` from `hadith-json`.

            It covers:
            - Basic corpus overview and random samples.
            - Arabic text statistics.
            - Arabic normalization examples with `ArabicNormalizer`.
            - Simple entity-signal pattern counts for later NER bootstrapping.
            - Export of a unified CSV for downstream modeling.
            


In [1]:
from __future__ import annotations

import json
import re
import sys
from collections import Counter
from pathlib import Path

import pandas as pd
from IPython.display import display

ROOT = Path.cwd().resolve()
if not (ROOT / "src").exists():
    ROOT = ROOT.parent.resolve()

if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.preprocessing.normalize import ArabicNormalizer

DATA_DIR = ROOT / "data" / "raw"
BUKHARI_PATH = DATA_DIR / "hadith_json" / "bukhari.json"
UNIFIED_PATH = DATA_DIR / "hadith_unified.csv"

if not BUKHARI_PATH.exists():
    raise FileNotFoundError(
        f"{BUKHARI_PATH} not found. Run `python scripts/download_data.py` first."
    )



            ## A) Load `bukhari.json` and inspect dataset shape

            This section loads the JSON, creates a flat pandas DataFrame, then prints:
            - total hadith count,
            - 5 random Arabic/English samples,
            - hadith distribution per book and chapter.
            


In [2]:
def as_text(value: object) -> str:
    return value if isinstance(value, str) else ""


def load_hadith_records(path: Path) -> list[dict]:
    with path.open("r", encoding="utf-8") as file:
        payload = json.load(file)

    if isinstance(payload, list):
        return [item for item in payload if isinstance(item, dict)]

    if isinstance(payload, dict):
        for key in ("hadiths", "data", "results"):
            value = payload.get(key)
            if isinstance(value, list):
                return [item for item in value if isinstance(item, dict)]

    raise ValueError("Unsupported hadith JSON format.")


def record_to_row(record: dict, default_book: str = "bukhari") -> dict:
    english = record.get("english")
    english_narrator = ""
    english_text = ""

    if isinstance(english, dict):
        english_narrator = as_text(english.get("narrator"))
        english_text = as_text(english.get("text"))
    elif isinstance(english, str):
        english_text = english

    return {
        "hadith_id": record.get("id")
        or record.get("hadith_id")
        or record.get("hadithNumber")
        or record.get("number"),
        "book": record.get("book")
        or record.get("bookName")
        or record.get("bookId")
        or default_book,
        "chapter": record.get("chapter")
        or record.get("chapterTitle")
        or record.get("chapterId")
        or "unknown",
        "arabic_text": as_text(
            record.get("arabic")
            or record.get("arab")
            or record.get("text")
            or record.get("arabic_text")
        ),
        "english_narrator": english_narrator
        or as_text(record.get("english_narrator")),
        "english_text": english_text
        or as_text(record.get("english_text"))
        or as_text(record.get("englishTranslation")),
    }


hadith_records = load_hadith_records(BUKHARI_PATH)
df = pd.DataFrame(record_to_row(record) for record in hadith_records)

df["arabic_text"] = df["arabic_text"].fillna("").astype(str)
df["english_narrator"] = df["english_narrator"].fillna("").astype(str)
df["english_text"] = df["english_text"].fillna("").astype(str)

print(f"Total number of hadiths: {len(df):,}")

print()
print("Sample of 5 random hadiths (Arabic + English):")
sample = df.sample(n=min(5, len(df)), random_state=42)[
    ["hadith_id", "arabic_text", "english_text"]
]
display(sample)

print()
print("Distribution of hadiths per book:")
book_distribution = (
    df["book"]
    .fillna("unknown")
    .astype(str)
    .value_counts()
    .rename_axis("book")
    .reset_index(name="hadith_count")
)
display(book_distribution)

print()
print("Distribution of hadiths per chapter (top 30):")
chapter_distribution = (
    df["chapter"]
    .fillna("unknown")
    .astype(str)
    .value_counts()
    .head(30)
    .rename_axis("chapter")
    .reset_index(name="hadith_count")
)
display(chapter_distribution)


Total number of hadiths: 7,277

Sample of 5 random hadiths (Arabic + English):


Unnamed: 0,hadith_id,arabic_text,english_text
6059,6060,حَدَّثَنَا يَحْيَى بْنُ قُزَعَةَ، حَدَّثَنَا إ...,"The Prophet (ﷺ) said ""Five things are in accor..."
6010,6011,حَدَّثَنَا أَبُو نُعَيْمٍ، حَدَّثَنَا عُمَرُ ب...,I entered (the house) along with Allah's Messe...
4475,4476,حَدَّثَنِي إِبْرَاهِيمُ بْنُ مُوسَى، أَخْبَرَن...,"Ibn `Abbas recited. ""No doubt! They fold up th..."
3689,3690,حَدَّثَنَا سُلَيْمَانُ بْنُ حَرْبٍ، حَدَّثَنَا...,The Prophet (ﷺ) recited Surat An-Najam and pro...
4515,4516,حَدَّثَنَا يَعْقُوبُ بْنُ إِبْرَاهِيمَ، حَدَّث...,"(regarding): 'Neither say your, prayer aloud, ..."



Distribution of hadiths per book:


Unnamed: 0,book,hadith_count
0,1,7277



Distribution of hadiths per chapter (top 30):


Unnamed: 0,chapter,hadith_count
0,65,499
1,64,488
2,56,294
3,10,266
4,78,250
5,25,247
6,97,188
7,77,185
8,34,184
9,67,183


            ## B) Arabic text statistics

            Computes:
            - average text length (characters and words),
            - percentage of hadiths containing tashkeel,
            - top 30 most frequent Arabic words.
            


In [3]:
TASHKEEL_RE = re.compile(r"[ؐ-ًؚ-ٰٟ]")
NON_ARABIC_RE = re.compile(r"[^؀-ۿ]+")

arabic_series = df["arabic_text"].fillna("").astype(str).str.strip()

avg_chars = arabic_series.str.len().mean()
avg_words = arabic_series.str.split().str.len().mean()
tashkeel_pct = arabic_series.apply(lambda text: bool(TASHKEEL_RE.search(text))).mean() * 100

print(f"Average length (characters): {avg_chars:.2f}")
print(f"Average length (words): {avg_words:.2f}")
print(f"Percentage containing tashkeel: {tashkeel_pct:.2f}%")

tokens = []
for text in arabic_series:
    cleaned = NON_ARABIC_RE.sub(" ", text)
    tokens.extend(token for token in cleaned.split() if token)

top_words = pd.DataFrame(Counter(tokens).most_common(30), columns=["word", "count"])
display(top_words)



Average length (characters): 591.67
Average length (words): 75.58
Percentage containing tashkeel: 100.00%


Unnamed: 0,word,count
0,قَالَ,16573
1,الله,15139
2,عَنْ,14448
3,حَدَّثَنَا,13653
4,عليه,11003
5,صلى,11001
6,وسلم,10825
7,ـ,10540
8,اللَّهِ,9737
9,بْنُ,9406


            ## C) ArabicNormalizer before/after examples

            Applies `ArabicNormalizer` to a random sample to show how text is normalized.
            


In [4]:
normalizer = ArabicNormalizer()

sample_before = arabic_series.sample(n=min(5, len(arabic_series)), random_state=7).reset_index(drop=True)
sample_after = sample_before.apply(normalizer.normalize)

normalization_preview = pd.DataFrame({
    "before": sample_before,
    "after": sample_after,
})
display(normalization_preview)



Unnamed: 0,before,after
0,حَدَّثَنَا أَبُو نُعَيْمٍ، حَدَّثَنَا إِسْرَائ...,حدثنا ابو نعيم، حدثنا اسرائيل، عن مخارق، عن طا...
1,حَدَّثَنَا مُحَمَّدُ بْنُ أَبِي بَكْرٍ، حَدَّث...,حدثنا محمد بن ابي بكر، حدثنا فضيل بن سليمان، ع...
2,حَدَّثَنِي عُبَيْدُ بْنُ إِسْمَاعِيلَ، عَنْ أَ...,حدثني عبيد بن اسماعيل، عن ابي اسامه، عن عبيد ا...
3,حَدَّثَنَا أَبُو الْيَمَانِ، أَخْبَرَنَا شُعَي...,حدثنا ابو اليمان، اخبرنا شعيب، حدثنا ابو الزنا...
4,حَدَّثَنَا عَلِيُّ بْنُ عَيَّاشٍ، حَدَّثَنَا أ...,حدثنا علي بن عياش، حدثنا ابو غسان، محمد بن مطر...


    ## D) Potential entity signal counts

    Counts simple lexical patterns that can seed future NER bootstrapping rules:
    - `حدثنا` (narrated to us)
    - `عن` (from/about)
    - `رواه` (reported by)
    - `قال` (said)
    


In [5]:
signal_terms = {
    "حدثنا": "narrated to us",
    "عن": "from/about",
    "رواه": "reported by",
    "قال": "said",
}

signal_rows = []
for term, meaning in signal_terms.items():
    count = int(arabic_series.str.count(re.escape(term)).sum())
    signal_rows.append({"pattern": term, "meaning": meaning, "count": count})

signal_df = pd.DataFrame(signal_rows).sort_values("count", ascending=False).reset_index(drop=True)
display(signal_df)



Unnamed: 0,pattern,meaning,count
0,عن,from/about,4136
1,قال,said,1
2,حدثنا,narrated to us,0
3,رواه,reported by,0


            ## E) Build and save unified CSV

            Creates a unified DataFrame with columns:
            `[hadith_id, book, chapter, arabic_text, arabic_normalized, english_narrator, english_text]`

            Then saves it to `data/raw/hadith_unified.csv`.
            


In [6]:
df["arabic_normalized"] = arabic_series.apply(normalizer.normalize)

unified_df = df[
    [
        "hadith_id",
        "book",
        "chapter",
        "arabic_text",
        "arabic_normalized",
        "english_narrator",
        "english_text",
    ]
].copy()

UNIFIED_PATH.parent.mkdir(parents=True, exist_ok=True)
unified_df.to_csv(UNIFIED_PATH, index=False, encoding="utf-8")

print(f"Saved unified CSV to: {UNIFIED_PATH}")
display(unified_df.head())



Saved unified CSV to: C:\Users\diaab\islamic-ner\data\raw\hadith_unified.csv


Unnamed: 0,hadith_id,book,chapter,arabic_text,arabic_normalized,english_narrator,english_text
0,1,1,1,حَدَّثَنَا الْحُمَيْدِيُّ عَبْدُ اللَّهِ بْنُ ...,حدثنا الحميدي عبد الله بن الزبير ، قال : حدثنا...,Narrated 'Umar bin Al-Khattab:,"I heard Allah's Messenger (ﷺ) saying, ""The rew..."
1,2,1,1,حَدَّثَنَا عَبْدُ اللَّهِ بْنُ يُوسُفَ، قَالَ ...,حدثنا عبد الله بن يوسف، قال اخبرنا مالك، عن هش...,Narrated 'Aisha:,(the mother of the faithful believers) Al-Hari...
2,3,1,1,حَدَّثَنَا يَحْيَى بْنُ بُكَيْرٍ، قَالَ حَدَّث...,حدثنا يحيي بن بكير، قال حدثنا الليث، عن عقيل، ...,Narrated 'Aisha (the mother of the faithful be...,The commencement of the Divine Inspiration to ...
3,4,1,1,قَالَ ابْنُ شِهَابٍ وَأَخْبَرَنِي أَبُو سَلَمَ...,قال ابن شهاب واخبرني ابو سلمه بن عبد الرحمن، ا...,Narrated Jabir bin 'Abdullah Al-Ansari (while ...,"""While I was walking, all of a sudden I heard ..."
4,5,1,1,حَدَّثَنَا مُوسَى بْنُ إِسْمَاعِيلَ، قَالَ حَد...,حدثنا موسي بن اسماعيل، قال حدثنا ابو عوانه، قا...,Narrated Said bin Jubair:,Ibn 'Abbas in the explanation of the statement...
