In [2]:
import re
import pandas as pd
from typing import Dict


# Pipeline

In [3]:
# Per-language configuration for the keyword analysis.
# Each language code maps to:
#   - "text_col": name of the dataframe column that holds the question text
#   - "regex":    feature name -> compiled pattern. analyze_df turns every
#                 feature name into "<feature>_present" / "<feature>_count" columns.
# Several patterns contain more than one capture group, so re.findall returns
# one tuple per match for them; downstream code only uses the number of matches.
LANG_CONFIG = {
    # Arabic patterns are plain alternations without \b anchors, so each
    # alternative also matches inside a longer word.
    # NOTE(review): short stems (e.g. the first medical keyword) may over-match
    # because of this — confirm that substring matching is intended.
    "ar": {
        "text_col": "Question",
        "regex": {
            "medical_keywords": re.compile(
                r"(الم|نزيف|افرازات|حمل|عملية|طفل|اختبار|دواء|جرعة|حمية|ارتفاع ضغط|سكري|حرارة|كسر|التهاب)",
                re.IGNORECASE
            ),
            "rare_procedures": re.compile(
                r"(ربط عنق الرحم|ICSI|CRP|قسطرة|استئصال|منظار|تصوير بالموجات فوق الصوتية|عملية قيصرية|تحليل TSH|علاج دوائي)",
                re.IGNORECASE
            ),
            # The unit suffix is optional, so any run of 1-3 digits matches on its
            # own, and a longer digit run is split into several matches.
            # Unlike the other Arabic patterns this one is compiled without
            # re.IGNORECASE, so the Latin units (bpm, CRP, Hb) are case-sensitive.
            "numbers_and_labs": re.compile(
                r"(\d{1,3}(\.\d+)?\s*(ملغ|مل|سم|مم|نبض|bpm|CRP|Hb|هرمون)?)"
            ),
            "negation_or_reassurance": re.compile(
                r"(لا تقلقي|طبيعي|لا يوجد مشكلة|آمن|لا خطورة)", re.IGNORECASE
            ),
            "sensitive_topics": re.compile(
                r"(الجماع|القذف|الدورة الشهرية|الحمل|الرضاعة|الخصوبة|المني|الفرج|الثدي)", re.IGNORECASE
            ),
            "polite_phrases": re.compile(
                r"(السلام عليكم|مرحبا|شكرا|من فضلك|جزاك الله خيرا)", re.IGNORECASE
            ),
            "narrative_style": re.compile(
                r"(أشعر|أعاني|لاحظت|منذ|خلال|اليوم|فجأة|بعد أن|في الوقت الحالي|كنت|صرت|بدأت)", re.IGNORECASE
            ),
        }
    },

    # English patterns are wrapped in \b...\b, so they only match whole
    # words/phrases; all of them are case-insensitive.
    "en": {
        "text_col": "Question_eng",
        "regex": {
            "medical_keywords": re.compile(
                r"\b(pain|swelling|cyst|discharge|infection|CRP|testosterone|surgery|fever|treatment|ulcer|cough|shortness of breath)\b",
                re.IGNORECASE
            ),
            "rare_procedures": re.compile(
                r"\b(ICSI|cervical cerclage|CBK test|echocardiogram|catheterization|angioplasty|laparoscopy|endoscopy)\b",
                re.IGNORECASE
            ),
            # The unit suffix is optional here as well, so a bare 1-3 digit number
            # counts as a match. Because of the surrounding \b, a run of four or
            # more digits does not match at all (the Arabic pattern would split it).
            "numbers_and_labs": re.compile(
                r"\b(\d{1,3}(\.\d+)?\s*(mg|ml|g|cm|mmHg|bpm|CRP|Hb|testosterone)?)\b",
                re.IGNORECASE
            ),
            "negation_or_reassurance": re.compile(
                r"\b(don't worry|it's normal|no problem|minor|safe|harmless|not serious)\b",
                re.IGNORECASE
            ),
            "sensitive_topics": re.compile(
                r"\b(sexual|ejaculation|menstrual|pregnancy|reproductive|contraceptive|intercourse|masturbation|breast|fertility)\b",
                re.IGNORECASE
            ),
            "polite_phrases": re.compile(
                r"\b(peace be upon you|hello|hi|thank you|please)\b",
                re.IGNORECASE
            ),
            "narrative_style": re.compile(
                r"\b(I (feel|have|suffer|noticed)|Recently|For the last|Since|Today|Yesterday)\b",
                re.IGNORECASE
            ),
        }
    }
}


In [45]:
def analyze_df(df: pd.DataFrame, lang: str) -> pd.DataFrame:
    """
    Compute per-row regex features for the text column of one language.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe; must contain the text column configured for `lang`
        in LANG_CONFIG. Missing values are treated as empty strings.
    lang : str
        Key into LANG_CONFIG selecting the text column and the patterns.

    Returns
    -------
    pd.DataFrame
        One row per input row (in input order, with a fresh positional index)
        and, for every configured feature, two int64 columns:
        - '<feature>_present': 1 if the pattern matched at least once, else 0
        - '<feature>_count'  : number of non-overlapping matches
        The columns are always present, even when `df` has no rows, so that
        downstream aggregations keep one entry per feature.

    Raises
    ------
    KeyError
        If `lang` is not configured or the text column is missing from `df`.
    """
    cfg = LANG_CONFIG[lang]
    text_col = cfg["text_col"]
    patterns = cfg["regex"]

    # Declare the column layout up front: building the frame from an empty
    # list of rows would otherwise produce a frame with no columns at all.
    columns = [
        f"{name}{suffix}"
        for name in patterns
        for suffix in ("_present", "_count")
    ]

    rows = []
    for text in df[text_col].fillna("").astype(str):
        row = {}
        for name, pattern in patterns.items():
            n_matches = len(pattern.findall(text))
            row[f"{name}_present"] = int(n_matches > 0)
            row[f"{name}_count"] = n_matches
        rows.append(row)

    # Cast explicitly so the empty case has the same integer dtype as the
    # non-empty one (an empty frame would otherwise default to object).
    return pd.DataFrame(rows, columns=columns).astype("int64")


In [48]:
from typing import Literal

def analysis_pipeline(
    df: pd.DataFrame,
    lang: str,
    mode: Literal["present", "count"] = "count"
):
    """
    Unified analysis pipeline for medical text datasets.

    The text column is scanned once with analyze_df; every group statistic is
    then aggregated from that shared per-row feature table.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe. Must contain a 'Trigger' column (0/1).
        May optionally contain a 'prediction' column (0/1).
    lang : str
        Language code ('ar' or 'en'), used to select text column
        and language-specific regex patterns.
    mode : {'present', 'count'}, default='count'
        - 'present': use binary keyword presence per row (recommended for statistics)
        - 'count'  : use total keyword occurrences (useful for qualitative analysis)

    Returns
    -------
    dict
        Dictionary containing aggregated keyword statistics (pd.Series indexed
        by '<feature>_present' or '<feature>_count') for:
        - TRIGGER / NON_TRIGGER
        - FP / FN (if prediction column exists)
        plus length diagnostics as floats (if prediction column exists):
        - FP_avg_len_chars / FN_avg_len_chars
        - FP_avg_len_words / FN_avg_len_words
        A group without rows yields zeros for every feature and NaN for its
        length diagnostics.

    Raises
    ------
    AssertionError
        If the 'Trigger' column is missing or `mode` is invalid.
    """

    assert "Trigger" in df.columns, "Trigger column is required"
    assert mode in {"present", "count"}, "mode must be 'present' or 'count'"

    suffix = "_present" if mode == "present" else "_count"

    # One regex pass over all rows; FP/FN are subsets of NON_TRIGGER/TRIGGER,
    # so analysing each subgroup separately would rescan the same text.
    features = analyze_df(df, lang).filter(like=suffix)

    def _group_sum(mask: pd.Series) -> pd.Series:
        # analyze_df returns rows in df order with a fresh positional index,
        # so the mask is applied by position rather than by label.
        selected = mask.fillna(False).to_numpy(dtype=bool)
        return features.loc[selected].sum()

    out = {}

    # ---------- Trigger vs Non-trigger ----------
    is_trigger = df["Trigger"] == 1
    is_non_trigger = df["Trigger"] == 0

    out["TRIGGER"] = _group_sum(is_trigger)
    out["NON_TRIGGER"] = _group_sum(is_non_trigger)

    # ---------- FP / FN (only if prediction exists) ----------
    if "prediction" in df.columns:
        is_fp = (df["prediction"] == 1) & is_non_trigger
        is_fn = (df["prediction"] == 0) & is_trigger

        out["FP"] = _group_sum(is_fp)
        out["FN"] = _group_sum(is_fn)

        # ---------- Length diagnostics ----------
        text_col = LANG_CONFIG[lang]["text_col"]
        fp_text = df.loc[is_fp, text_col]
        fn_text = df.loc[is_fn, text_col]

        out["FP_avg_len_chars"] = fp_text.str.len().mean()
        out["FN_avg_len_chars"] = fn_text.str.len().mean()

        # Word length = number of whitespace-separated tokens
        out["FP_avg_len_words"] = fp_text.str.split().str.len().mean()
        out["FN_avg_len_words"] = fn_text.str.split().str.len().mean()

    return out


In [66]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, fisher_exact

def significance_from_pipeline(df: pd.DataFrame, results_dict: dict = None, lang: str = None, alpha: float = 0.05):
    """
    Compute statistical significance of features between two groups:
    - Full dataset: Trigger vs Non-Trigger
    - Result dataset: FP vs FN

    For every keyword a 2x2 table (group x keyword present/absent) is built,
    tested, and annotated with an odds ratio and the enriched group.

    Parameters
    ----------
    df : pd.DataFrame
        Original dataset containing 'Trigger' and optionally 'prediction' columns.
        Group sizes are derived from it.
    results_dict : dict, optional
        Per-group keyword counts (pd.Series), as produced by
        analysis_pipeline(..., mode="present"). Counts must be numbers of rows,
        not numbers of occurrences. Either exactly two groups, or the full
        analysis_pipeline output, in which case FP/FN are used when `df` has a
        'prediction' column and TRIGGER/NON_TRIGGER otherwise.
        If None, analysis_pipeline is run on `df`.
    lang : str, optional
        Language key for analysis_pipeline (needed if results_dict is None)
    alpha : float
        Significance threshold.

    Returns
    -------
    pd.DataFrame
        Keyword-level table sorted by p-value with:
        - counts per group ('<group>_with')
        - p_value
        - Test: 'Fisher Exact' or 'Chi-square'
        - Significant
        - Odds_Ratio (first group relative to second group)
        - Enriched_in: group name, 'None' if the odds ratio is not above or
          below 1, or 'Z' when the result is not significant

    Raises
    ------
    ValueError
        If the two groups cannot be determined, if FP/FN are requested without
        a 'prediction' column, or if a keyword count exceeds its group size.
    """
    has_prediction = "prediction" in df.columns

    # Determine the two groups, computing results_dict if it was not provided
    if results_dict is None:
        assert lang is not None, "Must provide 'lang' if results_dict is not given."
        results_dict = analysis_pipeline(df, lang=lang, mode="present")
        keys = ["FP", "FN"] if has_prediction else ["TRIGGER", "NON_TRIGGER"]
    else:
        keys = list(results_dict.keys())
        if len(keys) != 2:
            # Full analysis_pipeline output: pick the pair by the same rule
            # that is used when the results are computed here.
            if has_prediction and {"FP", "FN"} <= set(keys):
                keys = ["FP", "FN"]
            elif {"TRIGGER", "NON_TRIGGER"} <= set(keys):
                keys = ["TRIGGER", "NON_TRIGGER"]
            else:
                raise ValueError("results_dict must have exactly two keys.")

    group1, group2 = keys

    # Group sizes are looked up by group NAME, so the order of the keys in
    # results_dict cannot pair a group's counts with the other group's size.
    trigger = df["Trigger"]
    group_sizes = {
        "TRIGGER": int((trigger == 1).sum()),
        "NON_TRIGGER": int((trigger == 0).sum()),
    }
    if has_prediction:
        prediction = df["prediction"]
        group_sizes["FP"] = int(((prediction == 1) & (trigger == 0)).sum())
        group_sizes["FN"] = int(((prediction == 0) & (trigger == 1)).sum())

    def _size_of(group, position):
        if group in group_sizes:
            return group_sizes[group]
        if group in ("FP", "FN"):
            raise ValueError(f"Group '{group}' requires a 'prediction' column in df.")
        # Unrecognised group names keep the positional convention:
        # first group = Trigger rows, second group = Non-Trigger rows.
        return group_sizes["TRIGGER"] if position == 0 else group_sizes["NON_TRIGGER"]

    total1 = _size_of(group1, 0)
    total2 = _size_of(group2, 1)

    counts1 = results_dict[group1]
    counts2 = results_dict[group2]

    columns = [
        "Keyword",
        f"{group1}_with",
        f"{group2}_with",
        "p_value",
        "Significant",
        "Test",
        "Odds_Ratio",
        "Enriched_in",
    ]
    records = []

    for kw in counts1.index:
        a = counts1[kw]
        c = counts2.get(kw, 0)  # keyword absent from the second group
        b = total1 - a
        d = total2 - c

        if min(a, b, c, d) < 0:
            raise ValueError(
                f"Count for '{kw}' exceeds its group size; "
                "results_dict must hold per-row presence counts (mode='present')."
            )

        table = np.array([[a, b], [c, d]])

        # Chi-square is only used when both the observed and the expected
        # frequency of every cell are at least 5; otherwise fall back to
        # Fisher's exact test. The observed check runs first, so the expected
        # table is never computed for an all-zero table.
        use_fisher = bool((table < 5).any())
        if not use_fisher:
            expected = np.outer(table.sum(axis=1), table.sum(axis=0)) / table.sum()
            use_fisher = bool((expected < 5).any())

        if use_fisher:
            odds_ratio, p_value = fisher_exact(table)
            test = "Fisher Exact"
        else:
            _, p_value, _, _ = chi2_contingency(table)
            # All observed cells are >= 5 here, so the ratio is well defined
            odds_ratio = float(a * d) / float(b * c)
            test = "Chi-square"

        # Determine enrichment (a NaN odds ratio falls through to "None")
        if odds_ratio > 1:
            enriched_in = group1
        elif odds_ratio < 1:
            enriched_in = group2
        else:
            enriched_in = "None"

        significant = bool(p_value < alpha)
        if not significant:
            # Placeholder label used for non-significant rows
            enriched_in = "Z"

        records.append({
            "Keyword": kw.replace("_present", ""),
            f"{group1}_with": a,
            f"{group2}_with": c,
            "p_value": p_value,
            "Significant": significant,
            "Test": test,
            "Odds_Ratio": odds_ratio,
            "Enriched_in": enriched_in
        })

    # Explicit columns keep sort_values working when there are no keywords
    return pd.DataFrame(records, columns=columns).sort_values("p_value")


# Test here

## Show for Full Dataset

In [67]:
# Load the full dataset; the path is relative to the notebook's working directory.
df_memo = pd.read_csv("../Eval/Memo_Dataset.csv")
# Language key into LANG_CONFIG: selects the text column and the regex set.
test_lang = "ar"

# Per-group keyword presence counts (number of rows in which each pattern matched).
# mode="present" is required here: the significance step below builds its
# contingency tables from row counts, not from occurrence totals.
results_ara_memo_full = analysis_pipeline(df_memo, lang=test_lang, mode="present")

# Keyword-level significance between the two groups found in the results above
sig_df = significance_from_pipeline(df=df_memo, results_dict=results_ara_memo_full, lang=test_lang)

# Bare last expression so the notebook renders the table
sig_df


Unnamed: 0,Keyword,TRIGGER_with,NON_TRIGGER_with,p_value,Significant,Test,Odds_Ratio,Enriched_in
3,negation_or_reassurance,415,1042,6.768711e-134,True,Chi-square,0.241691,NON_TRIGGER
2,numbers_and_labs,2490,2365,1.4156469999999999e-37,True,Chi-square,0.625999,NON_TRIGGER
4,sensitive_topics,472,595,8.629937000000001e-23,True,Chi-square,0.534349,NON_TRIGGER
0,medical_keywords,4148,2651,1.490572e-09,True,Chi-square,1.239728,TRIGGER
6,narrative_style,1766,1184,0.08973565,False,Chi-square,1.075904,Z
5,polite_phrases,1221,911,0.2199133,False,Chi-square,0.941889,Z
1,rare_procedures,90,59,0.6947113,False,Chi-square,1.083479,Z


## Test on ALL models

In [93]:
import os
import pandas as pd

# Assuming analysis_pipeline and significance_from_pipeline are already defined

def run_directory_analysis(dir_path, mode="present", save_txt=True, txt_path="model_analysis_summary.txt"):
    """
    Run the keyword and significance analysis for every CSV in a directory.

    Each CSV is analysed once per language ('ar' and 'en'). For every
    (file, language) pair the per-group counts and the significance table are
    printed, and the significant features are collected. A summary table is
    printed at the end.

    Parameters
    ----------
    dir_path : str
        Directory to scan; only entries ending in '.csv' are read. Files are
        processed in sorted name order so the report is reproducible.
    mode : {'present', 'count'}, default='present'
        Mode of the counts table that is displayed. The significance test
        always uses presence counts, because its contingency tables need the
        number of rows per group rather than the number of occurrences.
    save_txt : bool, default=True
        If True, also write the report to `txt_path`.
    txt_path : str
        Destination of the text report (UTF-8, overwritten if it exists).

    Returns
    -------
    dict
        Maps '<model name>_<lang>' to the list of significant keywords, where
        the model name is the file name up to the first '_202'.
    """
    all_sig_features = {}  # to summarize later
    lines_to_save = []

    def emit(console_obj, saved_text=None):
        # Print to the notebook and queue the matching line for the text file;
        # saved_text overrides what is stored when the two renderings differ.
        print(console_obj)
        lines_to_save.append(console_obj if saved_text is None else saved_text)

    languages = ["ar", "en"]
    sep_line = "=" * 80
    rule = "-" * 80

    # os.listdir returns entries in arbitrary order; sort for a stable report
    csv_files = sorted(name for name in os.listdir(dir_path) if name.endswith(".csv"))

    for file in csv_files:
        # Determine model name (strip timestamp)
        base_name = file.split("_202")[0]

        df = pd.read_csv(os.path.join(dir_path, file))

        for lang in languages:
            emit(sep_line)
            emit(f"MODEL / CSV: {base_name} | on col LANGUAGE: {lang}")
            emit(rule)

            results = analysis_pipeline(df, lang=lang, mode=mode)

            # Compare FP vs FN when predictions exist, otherwise Trigger vs Non-Trigger
            if "FP" in results and "FN" in results:
                group_keys = ("FP", "FN")
            else:
                group_keys = ("TRIGGER", "NON_TRIGGER")
            results_filtered = {k: results[k] for k in group_keys}

            counts_df = pd.DataFrame(results_filtered)
            emit(f"{group_keys[0]} / {group_keys[1]} Counts:")
            emit(counts_df, counts_df.to_string())

            # The significance test needs per-row presence counts; occurrence
            # totals can exceed the group size and would corrupt the 2x2 tables.
            if mode == "present":
                presence = results_filtered
            else:
                presence_all = analysis_pipeline(df, lang=lang, mode="present")
                presence = {k: presence_all[k] for k in group_keys}

            sig_df = significance_from_pipeline(df=df, results_dict=presence, lang=lang)

            emit(rule)
            emit("Significance Table:")
            emit(sig_df, sig_df.to_string(index=False))
            emit(sep_line + "\n\n", sep_line + "\n")

            # Record which features are significant for this model+lang
            is_significant = sig_df["Significant"].astype(bool)
            all_sig_features[f"{base_name}_{lang}"] = sig_df.loc[is_significant, "Keyword"].tolist()

    # Summary table at the end: one column per model/language, padded with NaN.
    # dtype=object keeps empty feature lists from defaulting to a float column.
    summary_df = pd.DataFrame(
        {key: pd.Series(features, dtype=object) for key, features in all_sig_features.items()}
    )
    emit("SUMMARY OF SIGNIFICANT FEATURES PER MODEL / LANGUAGE:")
    emit(summary_df, summary_df.to_string())

    # Save to txt if requested
    if save_txt:
        with open(txt_path, "w", encoding="utf-8") as f:
            f.writelines(f"{line}\n" for line in lines_to_save)

    return all_sig_features


In [94]:
# Directory whose CSV files are analysed (relative to the notebook's working directory)
dir_path = "results/"
# Runs the analysis for every CSV and both languages. With the default
# save_txt=True this also writes the report to "model_analysis_summary.txt".
# all_sig maps "<model>_<lang>" to the list of significant keywords.
all_sig = run_directory_analysis(dir_path, mode="present")

MODEL / CSV: Claude-Haiku-4.5_Arabic | on col LANGUAGE: ar
--------------------------------------------------------------------------------
FP / FN Counts:
                                  FP   FN
medical_keywords_present         282  208
rare_procedures_present           11    3
numbers_and_labs_present         214  126
negation_or_reassurance_present   87   24
sensitive_topics_present          49   40
polite_phrases_present            83   59
narrative_style_present          136   80
--------------------------------------------------------------------------------
Significance Table:
                   Keyword  FP_with  FN_with   p_value  Significant  \
3  negation_or_reassurance       87       24  0.000002         True   
2         numbers_and_labs      214      126  0.005989         True   
6          narrative_style      136       80  0.049033         True   
1          rare_procedures       11        3  0.111305        False   
0         medical_keywords      282      208  0.7332

In [98]:
import pandas as pd
from tabulate import tabulate  # pip install tabulate

# all_sig maps "<model>_<lang>" -> list of significant feature names
# (the value returned by run_directory_analysis).

# Length of the longest feature list; default=0 keeps an empty all_sig from
# raising "max() arg is an empty sequence".
max_len = max((len(v) for v in all_sig.values()), default=0)

# Pad every list with empty strings so all of them have the same length
padded_features = {k: v + [""]*(max_len - len(v)) for k, v in all_sig.items()}

# One row per model/language: transpose, then move the key into a column.
# All column names are assigned in one step, so no separate rename is needed.
summary_df = pd.DataFrame(padded_features).T.reset_index()
summary_df.columns = ['Model / Language'] + [f'Feature_{i+1}' for i in range(max_len)]

# Render as a grid table
table_str = tabulate(summary_df, headers='keys', tablefmt='grid', showindex=False)

# Build the report lines once so the console and the file get identical text
summary_banner = "="*120
summary_title = "SUMMARY OF SIGNIFICANT FEATURES PER MODEL / LANGUAGE (TRANSPOSED)"
summary_lines = [summary_banner, summary_title, summary_banner, table_str]

for line in summary_lines:
    print(line)

# Save to a new text file (no trailing newline after the table)
txt_path = "summary_significant_features.txt"
with open(txt_path, "w", encoding="utf-8") as f:
    f.write("\n".join(summary_lines))

print(f"\n✅ Transposed summary saved to: {txt_path}")


SUMMARY OF SIGNIFICANT FEATURES PER MODEL / LANGUAGE (TRANSPOSED)
+----------------------------------------------+-------------------------+-------------------------+-------------------------+------------------+------------------+
| Model / Language                             | Feature_1               | Feature_2               | Feature_3               | Feature_4        | Feature_5        |
| Claude-Haiku-4.5_Arabic_ar                   | negation_or_reassurance | numbers_and_labs        | narrative_style         |                  |                  |
+----------------------------------------------+-------------------------+-------------------------+-------------------------+------------------+------------------+
| Claude-Haiku-4.5_Arabic_en                   | numbers_and_labs        | narrative_style         |                         |                  |                  |
+----------------------------------------------+-------------------------+-------------------------+---------