In [1]:
import pandas as pd

# Known imprinting regions: comma-separated file whose first row is the header.
imprnt = pd.read_csv("ICR_known_regions_hg38.csv", sep=",", header=0)

# Combined methylation matrix. The 'Chromosome' column is renamed to 'Chr'
# right after loading so both tables use the same chromosome column name.
cpg_matrix = pd.read_csv(
    "/data/cephfs-1/work/groups/kühnen/users/cama15_c/meth_matrix_files/meth_matrix_10_samples.csv",
    sep=",",
).rename(columns={'Chromosome': 'Chr'})

In [2]:
# Sanity check: list the column names of the loaded methylation matrix.
print(cpg_matrix.columns)

Index(['Chr', 'Start', 'End', 'Naive_1_frac', 'Naive_1_cov',
       'Formative_1_frac', 'Formative_1_cov', 'Primed_1_frac', 'Primed_1_cov',
       'Formative_Neuron_1_frac', 'Formative_Neuron_1_cov',
       'Primed_Neuron_1_frac', 'Primed_Neuron_1_cov', 'Naive_2_frac',
       'Naive_2_cov', 'Formative_2_frac', 'Formative_2_cov', 'Primed_2_frac',
       'Primed_2_cov', 'Formative_Neuron_2_frac', 'Formative_Neuron_2_cov',
       'Primed_Neuron_2_frac', 'Primed_Neuron_2_cov'],
      dtype='object')


In [3]:
# Optional: speed up the chromosome comparisons by using categorical dtypes.
cpg_matrix["Chr"] = cpg_matrix["Chr"].astype("category")
imprnt["Chr"] = imprnt["Chr"].astype("category")

# Coverage thresholds: a CpG is kept when at least MIN_SAMPLES samples
# have a coverage of at least MIN_COVERAGE.
MIN_COVERAGE = 10
MIN_SAMPLES = 3

# Identify coverage columns (ending in '_cov').
cov_cols = [col for col in cpg_matrix.columns if col.endswith('_cov')]

# Boolean row mask: True where enough samples reach the coverage threshold.
cpg_filtered = (cpg_matrix[cov_cols] >= MIN_COVERAGE).sum(axis=1) >= MIN_SAMPLES
cpg_matrix_filtered = cpg_matrix[cpg_filtered].copy()

# Match CpGs lying entirely within each imprinting region.
# The lookup runs on the coverage-filtered matrix; previously the unfiltered
# matrix was queried here, so the filter above had no effect on the output.
matches = []
for chr_, start, end in zip(imprnt["Chr"], imprnt["Start"], imprnt["End"]):
    subset = cpg_matrix_filtered[
        (cpg_matrix_filtered["Chr"] == chr_) &
        (cpg_matrix_filtered["Start"] >= start) &
        (cpg_matrix_filtered["End"] <= end)
    ]
    if not subset.empty:
        matches.append(subset)

# Concatenate all matches into one DataFrame (ordered region by region).
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the same columns when no region contains any CpG.
# NOTE(review): a CpG inside several overlapping regions appears once per
# region in the result — confirm whether duplicates should be dropped.
if matches:
    matched_cpgs = pd.concat(matches, ignore_index=True)
else:
    matched_cpgs = cpg_matrix_filtered.iloc[0:0].copy()

# Save to file.
matched_cpgs.to_csv("cpgs_within_imprint_regions.csv", sep=",", index=False)
