In [1]:
import pandas as pd

# Read the metastable epiallele list (comma-delimited, first row is the header)
# and discard its "Hg19 data" column in the same expression.
epialleles = pd.read_csv(
    "/data/cephfs-1/work/groups/kühnen/users/cama15_c/metastable_epiallele_analysis/10_samples_work/metastable_epiallele_list.csv",
    sep=",",
    header=0,
).drop(columns=["Hg19 data"])

In [2]:
# Two extra regions to analyse alongside the listed epialleles, given
# column-wise (each Start must be smaller than its End).
additional = pd.DataFrame({
    "Chr": ["chr2", "chr2"],
    "Start": [25161685, 113235385],
    "End": [25161767, 113235510],
})

# Stack the extra regions underneath the epiallele list with a fresh 0..n-1 index
epialleles = pd.concat((epialleles, additional), axis=0, ignore_index=True)

# Read the combined methylation matrix (comma-delimited, first row is the header)
cpg_matrix = pd.read_csv(
    "/data/cephfs-1/work/groups/kühnen/users/cama15_c/meth_matrix_files/meth_matrix_10_samples.csv",
    sep=",",
    header=0,
)

In [3]:
# Normalise the chromosome column name: 'Chromosome' becomes 'Chr', replacing
# any pre-existing 'Chr' column.
#
# The whole step is guarded on 'Chromosome' still being present so the cell is
# safe to re-run: after the first run 'Chr' IS the renamed chromosome column,
# and dropping it unconditionally would silently delete it.
if 'Chromosome' in cpg_matrix.columns:
    cpg_matrix = (
        cpg_matrix
        .drop(columns=['Chr'], errors='ignore')  # no-op when there is no old 'Chr'
        .rename(columns={'Chromosome': 'Chr'})
    )

In [4]:
# Print the column index of cpg_matrix to check the column names at this point.
print(cpg_matrix.columns)

Index(['Chr', 'Start', 'End', 'Naive_1_frac', 'Naive_1_cov',
       'Formative_1_frac', 'Formative_1_cov', 'Primed_1_frac', 'Primed_1_cov',
       'Formative_Neuron_1_frac', 'Formative_Neuron_1_cov',
       'Primed_Neuron_1_frac', 'Primed_Neuron_1_cov', 'Naive_2_frac',
       'Naive_2_cov', 'Formative_2_frac', 'Formative_2_cov', 'Primed_2_frac',
       'Primed_2_cov', 'Formative_Neuron_2_frac', 'Formative_Neuron_2_cov',
       'Primed_Neuron_2_frac', 'Primed_Neuron_2_cov'],
      dtype='object')


In [5]:
# Optional: Speed up by converting to categorical types
cpg_matrix["Chr"] = cpg_matrix["Chr"].astype("category")
epialleles["Chr"] = epialleles["Chr"].astype("category")

# Coverage thresholds: a row is kept when at least MIN_SAMPLES of its
# coverage columns have a value >= MIN_COVERAGE.
MIN_COVERAGE = 10
MIN_SAMPLES = 3

# Identify coverage columns (ending in '_cov')
cov_cols = [col for col in cpg_matrix.columns if col.endswith('_cov')]

# Boolean row mask, then an independent copy of the rows that pass it
cpg_filtered = (cpg_matrix[cov_cols] >= MIN_COVERAGE).sum(axis=1) >= MIN_SAMPLES
cpg_matrix_filtered = cpg_matrix[cpg_filtered].copy()

# Collect, per region, the rows lying entirely inside it
# (same Chr, Start >= region start, End <= region end).
# itertuples over just the three needed columns avoids building a Series per
# row as iterrows does. A row that falls inside several regions is collected
# once per region, exactly as before.
matches = []
for region in epialleles[["Chr", "Start", "End"]].itertuples(index=False):
    subset = cpg_matrix_filtered[
        (cpg_matrix_filtered["Chr"] == region.Chr) &
        (cpg_matrix_filtered["Start"] >= region.Start) &
        (cpg_matrix_filtered["End"] <= region.End)
    ]
    if not subset.empty:
        matches.append(subset)

# Concatenate all matches into one DataFrame.
# pd.concat raises ValueError on an empty list, so when nothing matched fall
# back to an empty frame that still carries the expected columns.
if matches:
    matched_cpgs = pd.concat(matches, ignore_index=True)
else:
    matched_cpgs = cpg_matrix_filtered.iloc[0:0].copy()

# Save to file (tab-separated, without the index column)
matched_cpgs.to_csv("cpgs_within_metastable_epialleles.tsv", sep="\t", index=False)