# 目的

pyscriptを動かす

# セットアップ

In [None]:
import sys
import os
from pathlib import Path

# Move to the nearest directory (the current one or one of its ancestors)
# in which an entry whose stem is `LICENSE` is visible.
target_dir = "LICENSE"
path_start = Path.cwd()
# `parents` ends with the filesystem root, so the root itself is inspected too
# and starting *at* the root no longer skips the search silently.
for path_current in (path_start, *path_start.parents):
    # Compare stems so that LICENSE, LICENSE.md, LICENSE.txt, ... all match.
    if any(p.stem == target_dir for p in path_current.iterdir()):
        break
else:
    # No candidate directory contained a matching entry.
    raise FileNotFoundError(f"{target_dir} directory not found")

os.chdir(path_current)
print(Path.cwd())

In [None]:
# Make `path_current` and its `src` sub-directory importable.
sys.path.extend([str(path_current), str(path_current / "src")])

In [None]:
from pathlib import Path
from pprint import pprint
from collections import Counter

# Short aliases for the notebook; `P` is what the later cells use to print counts.
P = print
PP = pprint
C = Counter

from collections.abc import Iterator

# 実験

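- 実験内容: IMPC の統計結果 (statistical-results-ALL) と Phenodigm データを取得し、必要な列に絞り込んだうえで有意な表現型レコードを抽出する。最後に Exoc6 を例に抽出結果を確認する。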

In [None]:
from __future__ import annotations

import csv
import io
from pathlib import Path

from directory_manager import make_directories
from io_handler import download_file, load_csv_as_dicts, save_csv

# IMPC data release to work with; interpolated (as "23.0") into the download URL
# and into the name of the cached CSV file.
IMPC_RELEASE = 23.0

In [None]:

# ---------------------------------------------------------
# Preparation: working-directory layout
# ---------------------------------------------------------

ROOT_DIR = Path("TSUMUGI")
sub_dirs: list[str] = [".temp"]

# NOTE(review): make_directories is a project helper; presumably it creates
# ROOT_DIR and each listed sub-directory -- confirm in directory_manager.
make_directories(ROOT_DIR, sub_dirs)

# Scratch directory used by the download cells for cached files.
TEMPDIR = ROOT_DIR / ".temp"

In [None]:
# Fetch the Phenodigm table only when no cached copy exists in TEMPDIR.
if not Path(TEMPDIR, "impc_phenodigm.csv").exists():
    url_phenodigm = "https://github.com/whri-phenogenomics/disease_models/raw/main/disease_models_app/data/phenodigm_matches_dr20.1.txt"

    error_message = "Please manually download impc phenodigm data (impc_phenodigm.csv) from https://diseasemodels.research.its.qmul.ac.uk/."

    # NOTE(review): download_file is a project helper; presumably it returns the
    # response body as text and reports error_message on failure -- confirm in io_handler.
    phenodigm_tsv = download_file(url_phenodigm, error_message)
    # The downloaded text is parsed as tab-separated rows ...
    reader = csv.reader(io.StringIO(phenodigm_tsv), delimiter="\t")
    phenodigm_csv = (row for row in reader)
    # ... and handed row by row to save_csv (presumably written comma-separated -- confirm).
    save_csv(phenodigm_csv, Path(TEMPDIR, "impc_phenodigm.csv"))

In [None]:
# Fetch the IMPC statistical results only when no cached copy for this release exists.
if not Path(TEMPDIR, f"statistical_all_{IMPC_RELEASE}.csv").exists():
    url_impc = f"https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/release-{IMPC_RELEASE}/results/statistical-results-ALL.csv.gz"

    # Point at the same release as url_impc so the hint stays correct when IMPC_RELEASE changes.
    error_message = f"Please manually download impc statistical data (statistical_results_ALL.csv) from https://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/release-{IMPC_RELEASE}/results/."

    # NOTE(review): the URL targets a .csv.gz archive while the result is wrapped in
    # io.StringIO below; presumably download_file decompresses and decodes it -- confirm in io_handler.
    statistical_all = download_file(url_impc, error_message)
    reader = csv.reader(io.StringIO(statistical_all), delimiter=",")
    statistical_all_rows = (row for row in reader)
    save_csv(statistical_all_rows, Path(TEMPDIR, f"statistical_all_{IMPC_RELEASE}.csv"))

In [None]:
# =========================================
# Filter significant genes
# Relies on the fact that significant genes are the records in which mp_term_name is present
# =========================================

# Columns kept from the statistical results; the trailing comments label column groups.
columns = [
    "marker_symbol",
    "marker_accession_id",
    "mp_term_name",
    "mp_term_id",
    "p_value",
    "effect_size",
    "female_ko_effect_p_value",
    "male_ko_effect_p_value",
    "female_ko_parameter_estimate",
    "sex_effect_p_value",
    "male_ko_parameter_estimate",  # sex differences
    "genotype_effect_p_value",
    "genotype_effect_parameter_estimate",
    "zygosity",  # zygosity
    "pipeline_name",
    "procedure_name",  # life-stage
    "allele_symbol",  # map to Phenodigm
]

# NOTE(review): load_csv_as_dicts is a project helper; per the annotation it yields one
# dict per CSV row -- confirm in io_handler.
records: Iterator[dict[str, str]] = load_csv_as_dicts(Path(TEMPDIR, f"statistical_all_{IMPC_RELEASE}.csv"))

In [None]:
def subset_columns(records: Iterator[dict[str, str]], columns: list[str]) -> Iterator[dict[str, str]]:
    """Lazily project every record onto ``columns``.

    Each yielded dict has exactly the keys listed in ``columns`` (in that order);
    a column that is absent from a record is filled with an empty string.
    """
    for row in records:
        projected: dict[str, str] = {}
        for name in columns:
            projected[name] = row.get(name, "")
        yield projected



In [None]:
# Build the lazy column projection; no record is read until records_subset is consumed.
records_subset = subset_columns(records, columns)


In [None]:
# Materialise the generator into a list so the cells below can iterate it more than once.
records_subset = list(records_subset)
P(len(records_subset))

# Disabled debugging aid: print the first record whose p_value is missing or empty.
# for record in records_subset:
#     p_value = record.get("p_value")
#     if not p_value:
#         print(record)
#         print(p_value is None)
#         print(p_value == "")
#         print(f"p_value is missing or empty in record: {p_value}")
#         break

In [None]:
# procedure_name values that correspond to embryo phenotyping
embryo_phenotyping = [
    "Gross Morphology Embryo E9.5",
    "Viability E9.5 Secondary Screen",
    "OPT E9.5",
    "MicroCT E9.5",
    "Gross Morphology Placenta E9.5",
    "Gross Morphology Embryo E12.5",
    "Embryo LacZ",
    "Gross Morphology Placenta E12.5",
    "Viability E12.5 Secondary Screen",
    "Viability E14.5-E15.5 Secondary Screen",
    "Gross Morphology Placenta E14.5-E15.5",
    "MicroCT E14.5-E15.5",
    "Gross Morphology Embryo E14.5-E15.5",
    "Viability E18.5 Secondary Screen",
    "MicroCT E18.5",
    "Gross Morphology Embryo E18.5",
    "Gross Morphology Placenta E18.5",
]

# Build the lookup set once: the membership test below runs for every record,
# and a hash lookup avoids scanning the list each time.
embryo_procedures = frozenset(embryo_phenotyping)

# Bare expression: the distinct pipeline_name values of the embryo-phenotyping records.
{record["pipeline_name"] for record in records_subset if record["procedure_name"] in embryo_procedures}

In [None]:

def extract_significant_phenotypes(
    records: Iterator[dict[str, str]], threshold: float = 10**(-4)
) -> list[dict[str, str | float]]:
    """Return the significant phenotype records without exact duplicates.

    A record with an empty ``mp_term_name`` is skipped. Any other record is kept when

    * ``p_value`` is empty while ``effect_size`` is present, or
    * at least one of ``p_value``, ``female_ko_effect_p_value`` and
      ``male_ko_effect_p_value`` is strictly below ``threshold``.

    Args:
        records: Records whose values are strings; an empty or missing numeric
            field is treated as +inf.
        threshold: Strict upper bound applied to the three p-values.

    Returns:
        Shallow copies of the kept records, in the order they were first seen.
        Records that agree on every key and value appear only once.

    Raises:
        ValueError: If a non-empty numeric field cannot be parsed by ``float``.
    """
    inf = float("inf")

    def as_float(text):
        # Empty string / None means "missing" and is mapped to +inf.
        return float(text) if text else inf

    unique_significants = []
    seen = set()
    for record in records:
        # Skip records whose mp_term_name is an empty string
        if not record.get("mp_term_name"):
            continue

        # Parse all four fields up front so malformed numbers are always reported.
        p_value = as_float(record.get("p_value"))
        female_ko_effect_p_value = as_float(record.get("female_ko_effect_p_value"))
        male_ko_effect_p_value = as_float(record.get("male_ko_effect_p_value"))
        effect_size = as_float(record.get("effect_size"))

        missing_p_with_effect = p_value == inf and effect_size != inf
        below_threshold = (
            p_value < threshold
            or female_ko_effect_p_value < threshold
            or male_ko_effect_p_value < threshold
        )
        if not (missing_p_with_effect or below_threshold):
            continue

        # Deduplicate on the full key/value content while keeping first-seen order,
        # so the result no longer depends on string hash randomisation.
        fingerprint = frozenset(record.items())
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique_significants.append(dict(record))

    return unique_significants


In [None]:
# Run the significance filter on the column-subset records and print how many records remain.
significants = extract_significant_phenotypes(records_subset, threshold=10**(-4))
P(len(significants))

In [None]:
from __future__ import annotations

from collections.abc import Iterator

# Stand-in for a missing numeric value: _to_float_or_inf maps empty/None fields to it.
INF = float("inf")


def subset_columns(records: Iterator[dict[str, str]], columns: list[str]) -> Iterator[dict[str, str]]:
    """Lazily reduce each record to the requested columns.

    Every yielded dict carries exactly the keys in ``columns``; a key the source
    record lacks is given the empty string as its value.
    """
    for source in records:
        yield {key: source[key] if key in source else "" for key in columns}


def _to_float_or_inf(x) -> float:
    """Convert a string to float; empty/None becomes +Inf."""
    return float(x) if x not in (None, "") else INF


def _normalized_record(record: dict[str, str]) -> dict[str, float | str]:
    """Copy ``record`` and replace its four numeric fields by floats.

    ``p_value``, ``female_ko_effect_p_value``, ``male_ko_effect_p_value`` and
    ``effect_size`` are each passed through ``_to_float_or_inf``; every other
    entry is carried over unchanged. The input dict itself is not modified.
    """
    numeric_fields = (
        "p_value",
        "female_ko_effect_p_value",
        "male_ko_effect_p_value",
        "effect_size",
    )
    # Work on a shallow copy so the caller's record stays untouched.
    normalized: dict[str, float | str] = dict(record)
    for field in numeric_fields:
        normalized[field] = _to_float_or_inf(record.get(field))
    return normalized


def _is_significant(rec: dict[str, float | str], threshold: float) -> bool:
    """Significance rule:
    - If p_value is Inf and effect_size is finite -> keep.
    - OR any of the three p-values is below threshold -> keep."""
    if rec["p_value"] == INF and rec["effect_size"] != INF:
        return True
    return (
        rec["p_value"] < threshold
        or rec["female_ko_effect_p_value"] < threshold
        or rec["male_ko_effect_p_value"] < threshold
    )


def extract_significant_phenotypes(
    records: Iterator[dict[str, str]], threshold: float = 1e-4
) -> list[dict[str, float | str]]:
    """Collect the significant phenotype records, each distinct record once.

    Records with an empty ``mp_term_name`` are ignored. The remaining ones are
    normalized with ``_normalized_record`` and kept when ``_is_significant``
    accepts them. Two kept records are duplicates when all their keys and
    (normalized) values match; only the first occurrence is returned, and the
    result follows the order of first occurrence.
    """
    kept: list[dict[str, float | str]] = []
    fingerprints: set[tuple[tuple[str, float | str], ...]] = set()

    for raw in records:
        # Nothing to report for a record without a phenotype term.
        if not raw.get("mp_term_name"):
            continue

        candidate = _normalized_record(raw)
        if not _is_significant(candidate, threshold):
            continue

        # Items sorted by key give a hashable identity that ignores key order.
        identity = tuple(sorted(candidate.items()))
        if identity in fingerprints:
            continue
        fingerprints.add(identity)
        kept.append(candidate)

    return kept


In [None]:
# Re-run the filter (same call as in the earlier cell) and print the number of records returned.
significants = extract_significant_phenotypes(records_subset, threshold=10**(-4))
P(len(significants))

In [None]:
# Number of distinct marker_symbol values among the significant records.
symbols = {record["marker_symbol"] for record in significants}
P(len(symbols))

In [None]:
# Number of distinct mp_term_name values among the significant records.
mp_term_names = {record["mp_term_name"] for record in significants}
P(len(mp_term_names))

In [None]:

# Significant records whose marker_symbol is Exoc6.
exoc6 = [record for record in significants if record["marker_symbol"] == "Exoc6"]

In [None]:
# Bare expression: the distinct mp_term_name values among the Exoc6 records.
{record["mp_term_name"] for record in exoc6}

In [None]:
# Rebind `records` to the Exoc6 rows of the column subset (taken before any significance filtering).
records = [record for record in records_subset if record["marker_symbol"] == "Exoc6"]

In [None]:
# Inline, instrumented copy of the significance filter, applied to `records` only.
# Note that it rebinds the notebook-level names `threshold` and `significants`.
threshold = 10**(-4)

significants = []
for record in records:
    # Skip records whose mp_term_name is an empty string
    if not record.get("mp_term_name"):
        continue

    p_value = record.get("p_value")
    female_ko_effect_p_value = record.get("female_ko_effect_p_value")
    male_ko_effect_p_value = record.get("male_ko_effect_p_value")
    effect_size = record.get("effect_size")

    # Convert everything to float; empty strings become Inf
    p_value = float(p_value) if p_value else float("inf")
    female_ko_effect_p_value = float(female_ko_effect_p_value) if female_ko_effect_p_value else float("inf")
    male_ko_effect_p_value = float(male_ko_effect_p_value) if male_ko_effect_p_value else float("inf")
    effect_size = float(effect_size) if effect_size else float("inf")

    # Kept without a p-value when an effect size is recorded
    if p_value == float("inf") and effect_size != float("inf"):
        significants.append(record)
    elif p_value < threshold or female_ko_effect_p_value < threshold or male_ko_effect_p_value < threshold:
        # Trace the terms that pass through one of the p-value thresholds
        print(record["mp_term_name"])
        significants.append(record)

    # Dump every record annotated "hyperactivity", whether or not it was kept
    if record["mp_term_name"] == "hyperactivity":
        print(record)
# --- Remove duplicates (order does not matter, so do it in one go with a set) ---
unique_significants = [
    dict(t) for t in {frozenset(r.items()) for r in significants}
]

# Bare expression: the distinct mp_term_name values that survived the filter.
{record["mp_term_name"] for record in unique_significants}