In [None]:
import pandas as pd
import google.generativeai as genai
import os
import time

# === Configure Gemini API ===
# Prefer a key supplied through the GOOGLE_API_KEY environment variable so the
# secret does not have to live in the notebook; the inline placeholder is kept
# as a fallback for when the variable is unset.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "..............."))  # Replace the placeholder with your actual key if the variable is unset

# === Load significant directional asymmetry results ===
input_path = "directional_asymmetry_significance_1_vs_2.csv"
df = pd.read_csv(input_path)

# === Filter significant pairs ===
# Keep only rows whose "Significant" column is exactly "Yes"; .copy() yields an
# independent frame so a column can be added to it without affecting `df`.
df_sig = df[df["Significant"] == "Yes"].copy()

# === Initialize Gemini model ===
model = genai.GenerativeModel("gemini-1.5-flash")

# === Helper: Refined prompt for biomedical insight ===
def query_gemini(cell1, cell2, pathology="colorectal cancer"):
    """Ask the module-level Gemini `model` to interpret a directional cell-pair asymmetry.

    Args:
        cell1: Name of the first cell type, interpolated into the prompt.
        cell2: Name of the second cell type, interpolated into the prompt.
        pathology: Disease context named in the prompt.

    Returns:
        The model's response text with surrounding whitespace stripped, or the
        sentinel string "LLM query failed" if the request (or reading the
        response text) raises any exception. The error is printed, not raised.
    """
    # The prompt is four sentences separated by single spaces.
    sentences = (
        f"As a biomedical expert specializing in tumor microenvironment, interpret the clinical relevance of directional spatial proximity between '{cell1}' and '{cell2}' in {pathology}.",
        f"Discuss how the imbalance in proximity (e.g., '{cell1}'→'{cell2}' vs '{cell2}'→'{cell1}') may reflect tumor-stroma dynamics, immune modulation, or therapeutic resistance.",
        "Frame the insight in terms of potential biomarker value, prognostic significance, or implications for targeted therapy.",
        "Summarize the relevance clearly for integration into an Excel report used by clinical researchers.",
    )
    prompt = " ".join(sentences)

    try:
        reply = model.generate_content(prompt)
        answer = reply.text.strip()
    except Exception as e:
        # Best-effort: one failed pair must not abort the whole batch.
        print(f" Error querying Gemini for {cell1} ↔ {cell2}: {e}")
        answer = "LLM query failed"
    return answer

# === Query Gemini for each significant pair ===
# One request per row, in row order, so `insights` lines up with `df_sig`.
insights = []
for cell1, cell2 in zip(df_sig["CellType1"], df_sig["CellType2"]):
    print(f" Querying Gemini for {cell1} ↔ {cell2}...")
    insights.append(query_gemini(cell1, cell2))
    time.sleep(1)  # polite delay

# === Append insights to dataframe ===
df_sig["LLM_Clinical_Insight"] = insights

# === Save enriched results ===
# The output file sits next to the input, with a suffix inserted before ".csv".
output_path = input_path.replace(".csv", "_with_clinical_insights.csv")
df_sig.to_csv(output_path, index=False)
print(f"\n Enriched results saved to: {output_path}")

In [6]:
from pathlib import Path
import pandas as pd
import numpy as np

# Paths
# Default `sig_path` for run_confounding_mediating_analysis: the significance
# table from which FDR, "Significant" and "LLM_Clinical_Insight" are read.
# NOTE(review): the name carries a "-2" suffix that the earlier cell's output
# file name does not have — confirm this is the intended file.
DEFAULT_SIG_PATH = Path("directional_asymmetry_significance_1_vs_2_with_clinical_insights-2.csv")
# Default `counts_path` for run_confounding_mediating_analysis: the table that
# is merged in on CellType_A / CellType_B and supplies the count columns.
DEFAULT_COUNTS_PATH = Path("asymmetry_vs_cellcount_analysis.csv")
# Output CSV location used by run_confounding_mediating_analysis.
OUTPUT_PATH = Path("confounding_mediation_tags.csv")

# Helpers
def _safe_float(x):
    try:
        return float(x)
    except Exception:
        return np.nan

def sign_match(delta_metric: float, delta_count: float, tol: float = 1e-6) -> bool:
    """Report whether two deltas point in the same direction.

    Args:
        delta_metric: Change in the proximity metric.
        delta_count: Change in the cell count.
        tol: Magnitudes below this are treated as "no change".

    Returns:
        True only when both deltas are non-NaN, both have magnitude of at
        least `tol`, and both have the same sign. Always a plain Python bool.
    """
    # NaN compares False against everything, so it would slip past the
    # tolerance test below; reject it explicitly.
    if np.isnan(delta_metric) or np.isnan(delta_count):
        return False
    if abs(delta_metric) < tol or abs(delta_count) < tol:
        return False
    # np.sign(...) == np.sign(...) yields numpy.bool_; convert so the return
    # type matches the annotation on every path.
    return bool(np.sign(delta_metric) == np.sign(delta_count))

def load_data(sig_path: Path, counts_path: Path) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Read the significance and count CSVs and strip whitespace from their column names.

    Args:
        sig_path: Path of the significance CSV.
        counts_path: Path of the cell-count CSV.

    Returns:
        A `(sig, counts)` pair of DataFrames, in that order.
    """
    frames = []
    for csv_path in (sig_path, counts_path):
        frame = pd.read_csv(csv_path)
        # Headers may carry stray spaces; normalise them so later lookups by
        # exact column name succeed.
        frame.columns = [name.strip() for name in frame.columns]
        frames.append(frame)
    sig, counts = frames
    return sig, counts

def flag_confounding(row, count_fc, min_diff):
    """Flag a pair whose asymmetry change moves together with large cell-count changes.

    A row is a confounding candidate only when all of the following hold:
    both the A and B counts change by at least `count_fc`-fold (in either
    direction) between the "1_" and "2_" columns, the change in A→B has the
    same sign as the change in the A count, the change in B→A has the same
    sign as the change in the B count, and |Observed_Diff_cnt| >= `min_diff`.

    Args:
        row: A mapping-like row (supports `.get`) holding the merged columns.
        count_fc: Fold-change threshold applied to the counts.
        min_diff: Minimum absolute observed difference.

    Returns:
        `(is_candidate, reason)` where `reason` is a text summary of the
        numbers that were compared.
    """
    eps = 1e-9  # avoids dividing by zero when a count is 0

    a_before = _safe_float(row.get("1_A_count"))
    a_after = _safe_float(row.get("2_A_count"))
    b_before = _safe_float(row.get("1_B_count"))
    b_after = _safe_float(row.get("2_B_count"))
    observed_diff = _safe_float(row.get("Observed_Diff_cnt"))

    # Fold changes, compared symmetrically on a log2 scale.
    fold_a = (a_after + eps) / (a_before + eps)
    fold_b = (b_after + eps) / (b_before + eps)
    log_cutoff = np.log2(count_fc)
    a_shifted = abs(np.log2(fold_a)) >= log_cutoff
    b_shifted = abs(np.log2(fold_b)) >= log_cutoff

    # Changes in the directional metric and in the raw counts.
    metric_delta_ab = _safe_float(row.get("2_A→B")) - _safe_float(row.get("1_A→B"))
    metric_delta_ba = _safe_float(row.get("2_B→A")) - _safe_float(row.get("1_B→A"))
    count_delta_a = a_after - a_before
    count_delta_b = b_after - b_before

    checks = (
        a_shifted,
        b_shifted,
        sign_match(metric_delta_ab, count_delta_a),
        sign_match(metric_delta_ba, count_delta_b),
        abs(observed_diff) >= min_diff,
    )
    is_candidate = all(checks)

    summary = "; ".join(
        [
            f"fc_A={fold_a:.2f}, fc_B={fold_b:.2f}",
            f"Δ(A→B)={metric_delta_ab:.2f} vs ΔA_cnt={count_delta_a:.0f}",
            f"Δ(B→A)={metric_delta_ba:.2f} vs ΔB_cnt={count_delta_b:.0f}",
            f"Observed_Diff={observed_diff:.2f}",
        ]
    )
    return is_candidate, summary

# Maps a mechanism label to the lowercase substrings that assign that label
# when found in an insight text. Matching is plain substring search on the
# lowercased text, so entries here must be lowercase.
# NOTE(review): "pd1" does not match the hyphenated spelling "pd-1" — confirm
# whether that spelling should also be listed.
MECHANISM_KEYWORDS = {
    "tls": ["tls"],
    "m2": ["m2", "cd163"],
    "ecm": ["ecm", "stroma", "smooth muscle", "desmoplas"],
    "checkpoint": ["pd-l1", "pd1", "ctla", "checkpoint"],
    "tgfb": ["tgf"],
    "chemoattract": ["chemoattract", "ccl", "cxcl"],
    "angiogenesis": ["angio"],
    "exclusion": ["exclusion"],
}

def infer_mediator_labels(insight_text):
    """Return the mechanism labels whose keywords occur in `insight_text`.

    Args:
        insight_text: Free-text insight. Any non-string value (None, or the
            float NaN that pandas uses for an empty CSV cell) is treated as
            having no text.

    Returns:
        A list of label names from MECHANISM_KEYWORDS, in the dictionary's
        order; empty when nothing matches or no text was given.
    """
    # NaN is a truthy float, so `insight_text or ""` would pass it through and
    # `.lower()` would raise AttributeError; check the type instead.
    if not isinstance(insight_text, str):
        return []
    text = insight_text.lower()
    return [label for label, keys in MECHANISM_KEYWORDS.items() if any(k in text for k in keys)]

def flag_mediating(sig_row, confounding_flag, alpha, min_diff):
    """Flag a pair as a mediating candidate.

    A row qualifies when it is significant (FDR_adjusted_p_value <= `alpha`,
    or its "Significant" value reads yes/true/1), |Observed_Diff_sig| is at
    least `min_diff`, its "LLM_Clinical_Insight" text yields at least one
    mechanism label, and it was not already flagged as confounding.

    Args:
        sig_row: A mapping-like row (supports `.get`) holding the merged columns.
        confounding_flag: Result of the confounding check for the same row.
        alpha: FDR significance threshold.
        min_diff: Minimum absolute observed difference.

    Returns:
        `(mediating, reason, labels)`: the flag, a text summary of the inputs
        to the decision, and the list of inferred mechanism labels.
    """
    fdr = _safe_float(sig_row.get("FDR_adjusted_p_value"))
    observed_diff = _safe_float(sig_row.get("Observed_Diff_sig"))
    labels = infer_mediator_labels(sig_row.get("LLM_Clinical_Insight", ""))

    # Significance may come from the numeric FDR or from the textual flag.
    significant_text = str(sig_row.get("Significant", "")).strip().lower()
    passes_fdr = not np.isnan(fdr) and fdr <= alpha
    is_significant = passes_fdr or significant_text in {"yes", "true", "1"}

    if confounding_flag:
        # A confounded pair is never reported as mediating.
        mediating = False
    else:
        mediating = bool(is_significant and abs(observed_diff) >= min_diff and len(labels) > 0)

    reason = f"FDR={fdr:.4g}, significant={is_significant}, diff={observed_diff:.2f}, labels={labels}"
    return mediating, reason, labels

def run_confounding_mediating_analysis(alpha=0.05, count_fc=2.0, min_diff=5.0,
                                       sig_path=DEFAULT_SIG_PATH, counts_path=DEFAULT_COUNTS_PATH,
                                       output_path=OUTPUT_PATH, top_n=10):
    """Tag each cell-type pair as a confounding and/or mediating candidate and save a CSV.

    Loads the significance and count tables, left-merges them on the cell-type
    pair, evaluates `flag_confounding` and `flag_mediating` for every merged
    row, writes the tagged table, and prints a short summary.

    Args:
        alpha: FDR threshold passed to `flag_mediating`.
        count_fc: Count fold-change threshold passed to `flag_confounding`.
        min_diff: Minimum absolute observed difference used by both checks.
        sig_path: CSV with the significance results.
        counts_path: CSV with the per-pair counts.
        output_path: Where the tagged CSV is written (defaults to OUTPUT_PATH).
        top_n: Number of mediating examples printed in the summary.

    Returns:
        None. Results are written to `output_path` and summarised on stdout.
    """
    output_path = Path(output_path)
    sig, counts = load_data(sig_path, counts_path)
    # Left merge keeps every significance row; pairs absent from the count
    # table get NaN count columns. Column names present in both tables receive
    # the "_sig" / "_cnt" suffixes — the code below reads "Observed_Diff_sig",
    # so presumably both inputs carry an "Observed_Diff" column.
    merged = sig.merge(
        counts,
        left_on=["CellType1", "CellType2"],
        right_on=["CellType_A", "CellType_B"],
        how="left",
        suffixes=("_sig", "_cnt"),
    )
    conf_flags, conf_reasons, med_flags, med_reasons, med_labels_all = [], [], [], [], []

    for _, row in merged.iterrows():
        conf, conf_reason = flag_confounding(row, count_fc, min_diff)
        # The mediating check needs the confounding verdict for the same row.
        med, med_reason, labels = flag_mediating(row, conf, alpha, min_diff)
        conf_flags.append(conf)
        conf_reasons.append(conf_reason)
        med_flags.append(med)
        med_reasons.append(med_reason)
        med_labels_all.append(",".join(labels) if labels else "")

    merged["confounding_candidate"] = conf_flags
    merged["confounding_reason"] = conf_reasons
    merged["mediating_candidate"] = med_flags
    merged["mediating_reason"] = med_reasons
    merged["mediator_labels"] = med_labels_all

    out_cols = [
        "CellType1", "CellType2", "Asymmetry_P1", "Asymmetry_P2", "Observed_Diff_sig",
        "FDR_adjusted_p_value", "confounding_candidate", "confounding_reason",
        "mediating_candidate", "mediating_reason", "mediator_labels"
    ]
    out_df = merged[out_cols].copy()
    output_path.parent.mkdir(parents=True, exist_ok=True)
    out_df.to_csv(output_path, index=False)

    print(f"[done] Wrote: {output_path}")
    print(f"[summary] total={len(out_df)}, confounding={int(out_df['confounding_candidate'].sum())}, mediating={int(out_df['mediating_candidate'].sum())}")
    print("Top mediating examples:")
    # Cast to bool so the filter is a boolean mask even when the table has
    # zero rows (an empty column assigned from a list is not bool-typed).
    mediating_mask = out_df["mediating_candidate"].astype(bool)
    top_med = out_df[mediating_mask].sort_values("Observed_Diff_sig", ascending=False).head(top_n)
    for _, r in top_med.iterrows():
        print(f"  {r['CellType1']} ↔ {r['CellType2']}: diff={r['Observed_Diff_sig']:.2f}, labels={r['mediator_labels']}")

# Run the analysis with the default thresholds and default input paths
run_confounding_mediating_analysis()

[done] Wrote: confounding_mediation_tags.csv
[summary] total=93, confounding=12, mediating=74
Top mediating examples:
  B cells ↔ Tregs: diff=56.45, labels=tgfb,chemoattract
  CD163+ macros ↔ B cells: diff=53.44, labels=m2,ecm,checkpoint
  smooth muscle ↔ B cells: diff=52.69, labels=ecm,checkpoint
  stroma ↔ B cells: diff=48.35, labels=m2,ecm,checkpoint,tgfb,chemoattract
  B cells ↔ CD68+ macros: diff=46.68, labels=m2,ecm,angiogenesis
  tumor cells ↔ B cells: diff=44.04, labels=checkpoint
  granulocytes ↔ adipocytes: diff=36.58, labels=m2,ecm,checkpoint,angiogenesis
  B cells ↔ adipocytes: diff=35.87, labels=ecm
  CD8+ T cells ↔ generic immune: diff=34.53, labels=m2,ecm,checkpoint
  smooth muscle ↔ generic immune: diff=33.76, labels=ecm,checkpoint,tgfb,chemoattract
