# Missing pmi values

## Check if missing in MIT_ROSMAP

This script loads two AnnData .h5ad files (FUJITA and MIT) in read-only backed mode and extracts the set of unique individualID values from adata.obs. It then reads ROSMAP_clinical.csv, subsets the clinical table to rows whose individualID appears in either AnnData file (union of IDs), and checks whether the clinical pmi column contains missing values for those matched individuals. It reports the number of matched rows and the number of rows with missing pmi. When any are found, it also writes out the list of individuals with missing pmi and the list of individuals present in the AnnData files but absent from the clinical CSV.

In [2]:
import anndata as ad
import pandas as pd

# Labelled AnnData inputs: each key is the label used in printed output,
# each value is the path to the .h5ad file to inspect.
FILES = {
    "FUJITA": "../../celltypist/fujita_celltypist_GPU_counts_only.h5ad",
    "MIT": "../../celltypist/mit_celltypist_GPU_counts_only.h5ad",
}

# Clinical metadata table that the h5ad individual IDs are matched against
CLINICAL_CSV = "../../QCAndScrublet/ROSMAP_clinical.csv"  # adjust path if needed
# Name of the individual-identifier column, expected in both adata.obs and the clinical CSV
ID_COL = "individualID"
# Name of the PMI column checked for missing values (presumably post-mortem interval; confirm)
PMI_COL = "pmi"

def get_unique_individual_ids(h5ad_path: str) -> pd.Index:
    """Return the unique, non-empty ``ID_COL`` values found in an .h5ad file's obs.

    Args:
        h5ad_path: Path to the AnnData .h5ad file to inspect.

    Returns:
        A ``pd.Index`` of unique ID strings, with missing and empty values removed.

    Raises:
        KeyError: If ``adata.obs`` has no ``ID_COL`` column.
    """
    # backed="r" avoids loading X into memory
    adata = ad.read_h5ad(h5ad_path, backed="r")
    try:
        if ID_COL not in adata.obs.columns:
            raise KeyError(f"{h5ad_path}: obs is missing column '{ID_COL}'")
        # Normalize to string dtype, drop missing/empty, keep unique
        ids = adata.obs[ID_COL].astype("string").dropna()
        ids = ids[ids.str.len() > 0]
        return pd.Index(ids.unique())
    finally:
        # Backed mode keeps the underlying file open; release the handle explicitly
        adata.file.close()

# 1) Gather the unique individualID values of each file and accumulate their union
ids_by_file = {}
all_ids = pd.Index([], dtype="object")

for label, h5ad_file in FILES.items():
    file_ids = get_unique_individual_ids(h5ad_file)
    print(f"{label}: {len(file_ids):,} unique {ID_COL}")
    # Keep the per-file IDs and fold them into the running union
    ids_by_file[label] = file_ids
    all_ids = all_ids.union(file_ids)

print(f"\nUNION: {len(all_ids):,} unique {ID_COL} across all files\n")

# 2) Subset ROSMAP_clinical.csv to those IDs and check missing PMI
clin = pd.read_csv(CLINICAL_CSV)

# Fail early with a clear message if an expected column is absent
for required_col in (ID_COL, PMI_COL):
    if required_col not in clin.columns:
        raise KeyError(f"{CLINICAL_CSV} is missing column '{required_col}'")

# Use the same string dtype as the h5ad-derived IDs before matching with isin()
clin[ID_COL] = clin[ID_COL].astype("string")

subset = clin[clin[ID_COL].isin(all_ids)].copy()
print(f"Clinical rows matching UNION IDs: {len(subset):,}")

# Coerce PMI to numeric (non-numeric becomes NaN) so unparseable entries count
# as missing, consistent with how the adata.obs checks treat PMI
missing_pmi_mask = pd.to_numeric(subset[PMI_COL], errors="coerce").isna()
missing_pmi_rows = subset.loc[missing_pmi_mask, [ID_COL, PMI_COL]]

print(f"Rows with missing {PMI_COL}: {missing_pmi_rows.shape[0]:,}")

if not missing_pmi_rows.empty:
    missing_ids = missing_pmi_rows[ID_COL].dropna().unique()
    print(f"Unique {ID_COL} with missing {PMI_COL}: {len(missing_ids):,}")
    # Print a small sample; the full list is saved below
    print("Sample IDs:", list(missing_ids[:25]))

    # Save the full list of individuals with missing PMI
    pd.Series(missing_ids, name=ID_COL).to_csv("clinical_missing_pmi_individualIDs.txt", index=False)
    print('Saved: clinical_missing_pmi_individualIDs.txt')

# Also report which AnnData IDs have no row in the clinical CSV
found_ids = pd.Index(subset[ID_COL].dropna().unique())
not_found = all_ids.difference(found_ids)
print(f"\nIDs present in h5ad UNION but not found in clinical CSV: {len(not_found):,}")
if len(not_found) > 0:
    pd.Series(not_found, name=ID_COL).to_csv("clinical_missing_rows_for_individualIDs.txt", index=False)
    print("Saved: clinical_missing_rows_for_individualIDs.txt")


FUJITA: 367 unique individualID
MIT: 357 unique individualID

UNION: 517 unique individualID across all files

Clinical rows matching UNION IDs: 517
Rows with missing pmi: 3
Unique individualID with missing pmi: 3
Sample IDs: ['R8432213', 'R9053141', 'R4258320']
Saved: clinical_missing_pmi_individualIDs.txt

IDs present in h5ad UNION but not found in clinical CSV: 0


## Check if missing pmi in unique individualID

This script loads each AnnData .h5ad file (FUJITA and MIT) in backed read-only mode and uses adata.obs to evaluate pmi completeness per individualID. Non-numeric pmi values are coerced to missing. For each individual, it computes: total number of cells/rows, number of missing pmi values, whether any pmi is missing, number of non-missing pmi values, how many unique non-missing PMI values occur (to detect inconsistencies), and the min/max PMI observed for that individual. It prints a quick per-file summary, saves one combined per-individual report (covering both files) to CSV, and writes one text file per input listing the individuals that have at least one missing pmi.

In [3]:
import anndata as ad
import pandas as pd

# Labelled AnnData inputs: each key is the label used in printed output and
# output file names, each value is the path to the .h5ad file to inspect.
FILES = {
    "FUJITA": "../../celltypist/fujita_celltypist_GPU_counts_only.h5ad",
    "MIT": "../../celltypist/mit_celltypist_GPU_counts_only.h5ad",
}

# Name of the individual-identifier column in adata.obs
ID_COL = "individualID"
# Name of the PMI column in adata.obs that is checked for missing values
PMI_COL = "pmi"

def check_missing_pmi_by_individual(h5ad_path: str, label: str) -> pd.DataFrame:
    """Summarize PMI completeness per individual for one .h5ad file.

    Args:
        h5ad_path: Path to the AnnData .h5ad file to inspect.
        label: Short name for the file; stored in the ``file`` column and used
            in error messages.

    Returns:
        One row per ``ID_COL`` value with the columns ``file``, ``n_cells``,
        ``n_pmi_missing``, ``any_pmi_missing``, ``n_pmi_present``,
        ``pmi_unique_non_na``, ``pmi_min`` and ``pmi_max``, sorted so that
        individuals with the most missing PMI values come first.

    Raises:
        KeyError: If ``adata.obs`` lacks ``ID_COL`` or ``PMI_COL``.
    """
    # backed="r" avoids loading X into memory
    adata = ad.read_h5ad(h5ad_path, backed="r")
    try:
        for col in (ID_COL, PMI_COL):
            if col not in adata.obs.columns:
                raise KeyError(f"{label} ({h5ad_path}): obs is missing column '{col}'")

        # Copy only the needed columns so the file can be closed right away
        obs = adata.obs[[ID_COL, PMI_COL]].copy()
    finally:
        # Backed mode keeps the underlying file open; release the handle explicitly
        adata.file.close()

    # Normalize ID: string dtype, drop missing and empty values
    obs[ID_COL] = obs[ID_COL].astype("string")
    obs = obs.dropna(subset=[ID_COL])
    obs = obs[obs[ID_COL].str.len() > 0]

    # Coerce PMI to numeric (non-numeric becomes NaN)
    obs[PMI_COL] = pd.to_numeric(obs[PMI_COL], errors="coerce")

    # Per-individual missingness, using vectorised group aggregations:
    # size() counts all rows, count() only non-NaN rows, nunique() ignores NaN
    g = obs.groupby(ID_COL, observed=True)[PMI_COL]
    n_cells = g.size()
    n_present = g.count()
    n_missing = n_cells - n_present
    out = pd.DataFrame({
        "file": label,
        "n_cells": n_cells,
        "n_pmi_missing": n_missing,
        "any_pmi_missing": n_missing > 0,
        "n_pmi_present": n_present,
        "pmi_unique_non_na": g.nunique(),
        "pmi_min": g.min(),
        "pmi_max": g.max(),
    }).reset_index()

    return out.sort_values(["any_pmi_missing", "n_pmi_missing"], ascending=[False, False])

# Build one per-individual report per input file, print a short summary, and
# record every file written so the final listing always matches FILES.
all_reports = []
saved_files = []
for label, path in FILES.items():
    rep = check_missing_pmi_by_individual(path, label)
    all_reports.append(rep)

    n_ids = rep[ID_COL].nunique()
    n_bad = rep["any_pmi_missing"].sum()
    print(f"{label}: {n_ids:,} individuals total; {n_bad:,} have ≥1 missing {PMI_COL}")

    # Save per-file list of IDs with missing PMI
    bad_ids = rep.loc[rep["any_pmi_missing"], ID_COL]
    ids_file = f"{label}_individualIDs_with_missing_pmi.txt"
    bad_ids.to_csv(ids_file, index=False)
    saved_files.append(ids_file)

# Combined report across all input files
combined_file = "missing_pmi_by_individual_across_files.csv"
combined = pd.concat(all_reports, ignore_index=True)
combined.to_csv(combined_file, index=False)

print("\nSaved:")
for saved in [combined_file, *saved_files]:
    print(f" - {saved}")


FUJITA: 367 individuals total; 1 have ≥1 missing pmi
MIT: 357 individuals total; 2 have ≥1 missing pmi

Saved:
 - missing_pmi_by_individual_across_files.csv
 - FUJITA_individualIDs_with_missing_pmi.txt
 - MIT_individualIDs_with_missing_pmi.txt


## Check missing pmi cells

This script works at the cell level. For each file it coerces pmi to numeric (non-numeric values become missing), counts how many cells (rows of adata.obs) have a missing pmi and what percentage of all cells that represents, and reports how many unique individualID values those missing-pmi cells belong to. It also writes one CSV per file containing the individualID and pmi of every cell whose pmi is missing.

In [5]:
import anndata as ad
import pandas as pd

# Labelled AnnData inputs: each key is the label used in printed output and
# output file names, each value is the path to the .h5ad file to inspect.
FILES = {
    "FUJITA": "../../celltypist/fujita_celltypist_GPU_counts_only.h5ad",
    "MIT": "../../celltypist/mit_celltypist_GPU_counts_only.h5ad",
}

# Name of the individual-identifier column in adata.obs
ID_COL = "individualID"
# Name of the PMI column in adata.obs that is checked for missing values
PMI_COL = "pmi"

# For each input file: count cells with missing PMI, report how many
# individuals they belong to, and save those cells to a per-file CSV.
for label, path in FILES.items():
    adata = ad.read_h5ad(path, backed="r")
    try:
        for col in (PMI_COL, ID_COL):
            if col not in adata.obs.columns:
                raise KeyError(f"{label} ({path}): obs is missing column '{col}'")

        # Work only with the columns we need; the copy lets the file be closed right away
        obs = adata.obs[[ID_COL, PMI_COL]].copy()
    finally:
        # Backed mode keeps the underlying file open; release the handle explicitly
        adata.file.close()

    # Coerce PMI to numeric (non-numeric -> NaN), then check missing at the cell/row level
    obs[PMI_COL] = pd.to_numeric(obs[PMI_COL], errors="coerce")
    # Compute the missingness mask once and reuse it below
    pmi_missing = obs[PMI_COL].isna()

    n_cells = len(obs)
    n_missing = int(pmi_missing.sum())
    pct_missing = (n_missing / n_cells * 100) if n_cells else 0.0

    print(f"{label}: {n_missing:,} / {n_cells:,} cells have missing {PMI_COL} ({pct_missing:.4f}%)")

    # Also report how many individuals those missing cells belong to
    missing_individuals = obs.loc[pmi_missing, ID_COL].astype("string").dropna().unique()
    print(f"{label}: missing-{PMI_COL} cells span {len(missing_individuals):,} unique {ID_COL}\n")

    # Save the list of missing rows (could be large)
    obs.loc[pmi_missing, [ID_COL, PMI_COL]].to_csv(f"{label}_cells_with_missing_pmi.csv", index=False)


FUJITA: 4,133 / 1,304,391 cells have missing pmi (0.3169%)
FUJITA: missing-pmi cells span 1 unique individualID

MIT: 4,887 / 1,807,427 cells have missing pmi (0.2704%)
MIT: missing-pmi cells span 2 unique individualID



## Check if pmi is missing across all cells within missing-pmi individualID

This script summarizes PMI completeness per individualID by counting, for each individual, how many cells have pmi missing vs present. It then keeps only individuals with at least one missing PMI cell and classifies each as:

- `ALL_MISSING`: the individual has zero cells with PMI present (PMI entirely absent for that donor in that file)
- `PARTIAL_MISSING`: the individual has a mix of present and missing PMI values (inconsistent metadata within the donor)

Finally, it prints and saves a per-individual table with n_cells, n_pmi_missing, n_pmi_present, and the missingness type.

In [6]:
import anndata as ad
import pandas as pd

# Labelled AnnData inputs: each key is the label used in printed output and
# output file names, each value is the path to the .h5ad file to inspect.
FILES = {
    "FUJITA": "../../celltypist/fujita_celltypist_GPU_counts_only.h5ad",
    "MIT": "../../celltypist/mit_celltypist_GPU_counts_only.h5ad",
}

# Name of the individual-identifier column in adata.obs
ID_COL = "individualID"
# Name of the PMI column in adata.obs that is checked for missing values
PMI_COL = "pmi"

# For each input file: find individuals with missing PMI and classify each as
# ALL_MISSING (no cell has PMI) or PARTIAL_MISSING (some cells have PMI).
for label, path in FILES.items():
    adata = ad.read_h5ad(path, backed="r")
    try:
        # Fail with a clear message instead of an opaque indexing error
        for col in (ID_COL, PMI_COL):
            if col not in adata.obs.columns:
                raise KeyError(f"{label} ({path}): obs is missing column '{col}'")

        obs = adata.obs[[ID_COL, PMI_COL]].copy()
    finally:
        # Backed mode keeps the underlying file open; release the handle explicitly
        adata.file.close()

    # Normalize ID: string dtype, drop missing and empty values
    obs[ID_COL] = obs[ID_COL].astype("string")
    obs = obs.dropna(subset=[ID_COL])
    obs = obs[obs[ID_COL].str.len() > 0]

    # Make PMI numeric so non-numeric strings become NaN
    obs[PMI_COL] = pd.to_numeric(obs[PMI_COL], errors="coerce")

    # Per-individual: how many cells have PMI missing vs present?
    # size() counts all rows of a group, count() only the non-NaN ones.
    g = obs.groupby(ID_COL, observed=True)[PMI_COL]
    n_cells = g.size()
    n_present = g.count()
    per_ind = pd.DataFrame({
        "n_cells": n_cells,
        "n_pmi_missing": n_cells - n_present,
        "n_pmi_present": n_present,
    }).reset_index()

    # Only individuals with any missing PMI
    bad = per_ind[per_ind["n_pmi_missing"] > 0].copy()

    print(f"\n{label}: individuals with missing PMI = {len(bad):,}")
    if bad.empty:
        continue

    # Classify: partial missing vs all missing. Done after the empty check and
    # with a vectorised map, because a row-wise apply on an empty frame does
    # not produce a Series that can be assigned to a single column.
    bad["missing_type"] = bad["n_pmi_present"].eq(0).map(
        {True: "ALL_MISSING", False: "PARTIAL_MISSING"}
    )

    print(bad.sort_values(["missing_type", "n_pmi_missing"], ascending=[True, False]))

    # Save details
    bad.to_csv(f"{label}_pmi_missingness_by_individual.csv", index=False)
    print(f"Saved: {label}_pmi_missingness_by_individual.csv")



FUJITA: individuals with missing PMI = 1
    individualID  n_cells  n_pmi_missing  n_pmi_present missing_type
327     R9053141     4133           4133              0  ALL_MISSING
Saved: FUJITA_pmi_missingness_by_individual.csv

MIT: individuals with missing PMI = 2
    individualID  n_cells  n_pmi_missing  n_pmi_present missing_type
287     R8432213     4097           4097              0  ALL_MISSING
120     R4258320      790            790              0  ALL_MISSING
Saved: MIT_pmi_missingness_by_individual.csv
