## Methylation Data Preprocessing

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/a-nadeem9/brca-epigenetic-age-acceleration-immune-multiomics-classifier/blob/main/notebooks/01c_methylation_preprocessing.ipynb)


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import h5py

with h5py.File('../data/methylation_matched_final.h5', 'r') as f:
    print("Available datasets:")
    for key in f.keys():
        print(key)

Available datasets:
methylation


In [3]:
df = pd.read_hdf('../data/methylation_matched_final.h5', key='methylation')


In [4]:
# Display the first few rows of the dataframe
df.head()

Unnamed: 0_level_0,TCGA-C8-A27B-01A-11D-A16A-05,TCGA-E9-A3Q9-01A-11D-A21R-05,TCGA-C8-A3M8-01A-11D-A212-05,TCGA-E9-A5FL-01A-11D-A27Y-05,TCGA-A2-A25F-01A-11D-A16A-05,TCGA-A1-A0SE-01A-11D-A10P-05,TCGA-A7-A425-01A-11D-A244-05,TCGA-E9-A1N8-01A-11D-A145-05,TCGA-D8-A1X8-01A-11D-A14N-05,TCGA-E2-A3DX-01A-21D-A212-05,...,TCGA-BH-A0AZ-01A-21D-A12R-05,TCGA-AO-A0JC-01A-11D-A10P-05,TCGA-B6-A0X1-01A-11D-A10A-05,TCGA-E9-A22H-01A-11D-A161-05,TCGA-AO-A03L-01A-41D-A10P-05,TCGA-AC-A6NO-01A-12D-A33F-05,TCGA-EW-A2FS-01A-11D-A17F-05,TCGA-BH-A1FG-01A-11D-A13K-05,TCGA-LL-A7T0-01A-31D-A357-05,TCGA-V7-A7HQ-01A-11D-A33F-05
ProbeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cg13869341,,,,,,,,,,,...,,,,,,,,,,
cg14008030,,,,,,,,,,,...,,,,,,,,,,
cg12045430,,,,,,,,,,,...,,,,,,,,,,
cg20826792,,,,,,,,,,,...,,,,,,,,,,
cg00381604,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# 1. FILTER PROBES by missingness (per‐probe, across samples)
probe_missing = df.isna().mean(axis=1)                # fraction missing for each probe
keep_probes = probe_missing[probe_missing <= 0.15].index
df_filt = df.loc[keep_probes]
print(f"{df_filt.shape[0]} probes remain after filtering >15% missing")

398255 probes remain after filtering >15% missing


In [6]:
# 2. DROP sex-chromosome probes via raw GitHub manifest  
manifest_url = (
    "https://github.com/zhou-lab/InfiniumAnnotationV1/"
    "raw/main/Anno/HM450/HM450.hg38.manifest.tsv.gz"
)
# Read only the probe ID and chromosome columns from the gzipped TSV
manifest = pd.read_csv(
    manifest_url,
    sep="\t",
    compression="gzip",
    usecols=["Probe_ID", "CpG_chrm"]
)
# Standardize chromosome field
manifest["CHR"] = (
    manifest["CpG_chrm"]
    .astype(str)
    .str.replace("^chr", "", regex=True)
    .str.upper()
)
# Keep only autosomes 1–22
autos = {str(i) for i in range(1, 23)}
auto_probes = set(manifest.loc[manifest["CHR"].isin(autos), "Probe_ID"])

# Filter your already‐missingness‐filtered df_filt down to autosomal probes
df_filt = df_filt.loc[df_filt.index.intersection(auto_probes)]
print(f"{df_filt.shape[0]} autosomal probes remain")


388945 autosomal probes remain


In [None]:
#df_filt.to_hdf('../data/methylation_matched_final_filtered.h5', key='methylation')

In [1]:
import pandas as pd

In [2]:
df = pd.read_hdf('../data/methylation_matched_final_filtered.h5', key='methylation')

In [3]:

from sklearn.impute import KNNImputer

# 3. KNN-impute the autosomal, ≤15%-missing probes
imputer = KNNImputer(n_neighbors=5)

# Transpose so samples are rows
df_T = df.T                    # shape: (n_samples, 388945 probes)
imp_array = imputer.fit_transform(df_T)

# Back to probes×samples with the right labels
df_imp = pd.DataFrame(
    imp_array,
    index=df_T.index,               # sample IDs
    columns=df_T.columns            # probe IDs
).T

print("Imputed matrix shape:", df_imp.shape,
      "— total NAs:", df_imp.isna().sum().sum())


MemoryError: Unable to allocate 2.01 GiB for an array with shape (388945, 693) and data type float64