
### SV–Disease Association Analysis
Aim: For SVs in high LD with SNPs from the GWAS Catalog, matched EHR data from 1,027 long-read samples were used to explore potential SV–disease associations.

In [None]:
# EHR information extraction
import os
import pandas as pd
from google.cloud import bigquery

client = bigquery.Client()

# Dataset that holds the EHR tables, read from the WORKSPACE_CDR environment variable.
# Fail early: an unset variable would otherwise produce table names starting with "None.".
CDR_version = os.getenv("WORKSPACE_CDR")
if not CDR_version:
    raise RuntimeError("Environment variable WORKSPACE_CDR is not set; cannot locate the CDR dataset.")

sample_ids_file = "/home/jupyter/process/sample_IDs"  # 1027 LR samples

# One person_id per line. Blank lines are skipped: an empty entry would previously
# have produced an invalid "IN (1,2,,3)" list in the SQL text.
with open(sample_ids_file, "r") as f:
    sample_ids = [line.strip() for line in f if line.strip()]

if not sample_ids:
    raise ValueError(f"No sample IDs found in {sample_ids_file}")

non_numeric_ids = [sample_id for sample_id in sample_ids if not sample_id.isdigit()]
if non_numeric_ids:
    raise ValueError(
        f"Non-numeric sample IDs in {sample_ids_file}: {non_numeric_ids[:5]}"
    )

# Comma-joined form kept for any later cell that still refers to it.
sample_ids_str = ",".join(sample_ids)

# The ID list is passed as an array query parameter instead of being pasted into the
# SQL three times. Only the dataset name has to be interpolated, because identifiers
# cannot be parameterized.
query = f"""
WITH combined_data AS (
    SELECT person_id, condition_concept_id AS concept_id, 'condition' AS source
    FROM `{CDR_version}.condition_occurrence`
    WHERE person_id IN UNNEST(@sample_ids)

    UNION ALL

    SELECT person_id, observation_concept_id AS concept_id, 'observation' AS source
    FROM `{CDR_version}.observation`
    WHERE person_id IN UNNEST(@sample_ids)

    UNION ALL

    SELECT person_id, measurement_concept_id AS concept_id, 'measurement' AS source
    FROM `{CDR_version}.measurement`
    WHERE person_id IN UNNEST(@sample_ids)
)
SELECT person_id, concept_id, source FROM combined_data
"""

job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter(
            "sample_ids", "INT64", [int(sample_id) for sample_id in sample_ids]
        )
    ]
)

df_samples = client.query(query, job_config=job_config).to_dataframe()

# One row per (person_id, concept_id, source) record returned by the query;
# repeated records are not collapsed here.
output_file = "/home/jupyter/process/AoU_LR_samples_EHR.csv"
df_samples.to_csv(output_file, index=False)


In [2]:
import pandas as pd

samples_file = "/home/jupyter/process/AoU_LR_samples_EHR.csv"
df_samples = pd.read_csv(samples_file)

# Keep only the rows that came from the condition table.
df_conditions = df_samples.loc[
    df_samples["source"] == "condition", ["person_id", "concept_id"]
]

vocabulary_file = "/home/jupyter/process/vocabulary_SNOMED/CONCEPT.csv"
vocab_columns = ["concept_id", "concept_name", "concept_class_id", "standard_concept"]

# Read only the columns that are used, and read the text columns as strings.
# This avoids type inference over the unused columns (the saved output of this cell
# shows a pandas warning pointing at this read_csv call) and keeps names that look
# numeric exactly as written in the file.
# NOTE(review): if this vocabulary file is written without quoting, a stray '"' in a
# concept name can merge rows under pandas' default quoting; consider
# quoting=csv.QUOTE_NONE after confirming the file format.
df_vocab = pd.read_csv(
    vocabulary_file,
    sep="\t",
    usecols=vocab_columns,
    dtype={"concept_name": str, "concept_class_id": str, "standard_concept": str},
)

# usecols does not guarantee column order, so restore the intended order explicitly.
df_vocab = df_vocab[vocab_columns]

# Left join: conditions whose concept_id is absent from the vocabulary are kept with
# empty annotation columns.
df_merged = pd.merge(df_conditions, df_vocab, on="concept_id", how="left")

output_file = "/home/jupyter/process/LR_samples_condition_info.csv"
df_merged.to_csv(output_file, index=False)

  df_vocab = pd.read_csv(vocabulary_file, sep="\t")


In [4]:
# Write a de-duplicated copy of the condition table: the header line is copied first,
# then the sorted, unique lines that do not contain "person_id" are appended.
!head -n 1 "/home/jupyter/process/LR_samples_condition_info.csv"  > "/home/jupyter/process/LR_samples_condition_info.unique.csv" && sort "/home/jupyter/process/LR_samples_condition_info.csv" |uniq |grep -v person_id >> "/home/jupyter/process/LR_samples_condition_info.unique.csv"

In [5]:
import pandas as pd

df = pd.read_csv("/home/jupyter/process/LR_samples_condition_info.unique.csv")

# Inclusive bounds on the number of distinct persons recorded with a condition.
# NOTE(review): 838 is presumably an upper prevalence cap chosen relative to the
# 1,027-sample cohort — confirm how it was derived.
MIN_SAMPLES_PER_DISEASE = 10
MAX_SAMPLES_PER_DISEASE = 838

# Number of distinct persons per condition name (rows without a name are not counted).
disease_counts = df.groupby('concept_name')['person_id'].nunique()

# Series.between is inclusive on both ends, matching ">= 10 and <= 838".
valid_diseases = disease_counts[
    disease_counts.between(MIN_SAMPLES_PER_DISEASE, MAX_SAMPLES_PER_DISEASE)
].index

filtered_df = df[df['concept_name'].isin(valid_diseases)]

filtered_df.to_csv("/home/jupyter/process/filtered_LR_samples_condition_info.csv", index=False)

In [7]:
# List every distinct value of the first comma-separated field (the person_id column)
# from the non-header lines of the merged condition table, one ID per line.
!grep -v person_id /home/jupyter/process/LR_samples_condition_info.csv |awk -F "," '{print $1}'  |sort |uniq > /home/jupyter/process/LR_samples_with_condition.txt

In [11]:
# Identification of SVs associated with general diseases
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from statsmodels.stats.contingency_tables import Table2x2
from multiprocessing import Pool
import math
import os

# Input SV table, output association table, and number of worker processes.
INPUT_FILE = "/home/jupyter/process/filtered_genomewide_SVs.phase1_LD.tsv.gz"
OUTPUT_FILE = "/home/jupyter/process/SV_Disease_associations.genomewide.tsv"
NUM_JOBS = 4

# Analysis cohort: one sample ID per line, blank lines ignored, IDs kept as strings.
with open("/home/jupyter/process/LR_samples_with_condition.txt") as f:
    all_samples = set(filter(None, map(str.strip, f)))

# Names of the diseases to test: one per line, blank lines ignored.
with open("/home/jupyter/process/phase1_important_diseases.general.txt") as f:
    important_diseases = set(filter(None, map(str.strip, f)))

# Everything is read as text so person IDs compare equal to the cohort IDs above.
ehr_df = pd.read_csv("/home/jupyter/process/LR_samples_condition_info.unique.csv", dtype=str)

# Keep rows that are class "Disorder", flagged standard ("S"), and on the disease list.
is_disorder = ehr_df["concept_class_id"] == "Disorder"
is_standard = ehr_df["standard_concept"] == "S"
is_selected = ehr_df["concept_name"].isin(important_diseases)
ehr_df = ehr_df[is_disorder & is_standard & is_selected]

# disease name -> set of person IDs recorded with that disease
phenotype_to_samples = {
    name: set(map(str, person_ids))
    for name, person_ids in ehr_df.groupby("concept_name")["person_id"]
}


def run_fisher(sv_id, disease, meta_info, phenotype_to_samples, all_samples, alpha=0.05):
    """Test one SV–disease pair with Fisher's exact test.

    Builds the 2x2 table (SV carrier / non-carrier) x (with / without disease)
    over ``all_samples`` and returns one result row.

    Parameters
    ----------
    sv_id : hashable
        Key into ``meta_info``.
    disease : hashable
        Key into ``phenotype_to_samples``.
    meta_info : dict
        Maps ``sv_id`` to ``(gene_names, sens07_id, sv_samples)``, where
        ``sv_samples`` is the set of carrier sample IDs.
    phenotype_to_samples : dict
        Maps a disease to the set of sample IDs recorded with it.
    all_samples : set
        Cohort the test is restricted to.
    alpha : float, optional
        ``1 - alpha`` is the nominal coverage of the confidence interval.

    Returns
    -------
    list or None
        ``[sv_id, gene_names, sens07_id, number of carriers, disease, the four
        cell counts, odds ratio, CI lower, CI upper, p-value]``, or ``None`` when
        any cell of the table is zero (the pair is then not reported).
    """
    from statistics import NormalDist

    gene_names, sens07_id, sv_samples = meta_info[sv_id]
    disease_samples = phenotype_to_samples[disease] & all_samples

    sv_case = len(sv_samples & disease_samples)
    sv_non_case = len(sv_samples - disease_samples)

    non_sv_samples = all_samples - sv_samples
    non_sv_case = len(non_sv_samples & disease_samples)
    non_sv_non_case = len(non_sv_samples - disease_samples)

    # Tables with an empty cell are skipped, so every count below is >= 1.
    if min(sv_case, sv_non_case, non_sv_case, non_sv_non_case) == 0:
        return None

    table = [[sv_case, sv_non_case],
             [non_sv_case, non_sv_non_case]]

    odds_ratio, pvalue = fisher_exact(table)

    # Wald (normal-approximation) interval on the log odds ratio.
    # The previous call, Table2x2(table).oddsratio_confint(method="exact"), produced
    # this same interval: statsmodels only implements the normal approximation, so
    # the "exact" label was misleading. Computing it directly also removes the broad
    # `except Exception`, which could hide real errors behind NaN bounds.
    # If an exact conditional interval is wanted, see scipy.stats.contingency.odds_ratio.
    z = NormalDist().inv_cdf(1 - alpha / 2)
    log_or = math.log((sv_case * non_sv_non_case) / (sv_non_case * non_sv_case))
    log_or_se = math.sqrt(
        1 / sv_case + 1 / sv_non_case + 1 / non_sv_case + 1 / non_sv_non_case
    )
    ci_lower = math.exp(log_or - z * log_or_se)
    ci_upper = math.exp(log_or + z * log_or_se)

    return [
        sv_id,
        gene_names,
        sens07_id,
        len(sv_samples),
        disease,
        sv_case,
        sv_non_case,
        non_sv_case,
        non_sv_non_case,
        round(odds_ratio, 3),
        round(ci_lower, 3),
        round(ci_upper, 3),
        pvalue,
    ]


def chunk_tasks(tasks, n_chunks):
    """Split ``tasks`` into at most ``n_chunks`` consecutive slices.

    Every slice has ``ceil(len(tasks) / n_chunks)`` items except possibly the
    last one. An empty ``tasks`` gives an empty list.
    """
    if not tasks:
        return []
    step = math.ceil(len(tasks) / n_chunks)
    pieces = []
    for start in range(0, len(tasks), step):
        pieces.append(tasks[start:start + step])
    return pieces


def run_chunk(chunk, meta_info, phenotype_to_samples, all_samples):
    """Run ``run_fisher`` on every (sv_id, disease) pair in ``chunk``.

    Returns the result rows in input order, leaving out pairs for which
    ``run_fisher`` returned ``None``.
    """
    outcomes = (
        run_fisher(sv_id, disease, meta_info, phenotype_to_samples, all_samples)
        for sv_id, disease in chunk
    )
    return [row for row in outcomes if row is not None]


def main():
    """Run every SV–disease Fisher test for this cell's cohort and write the results.

    Uses the module-level configuration (``INPUT_FILE``, ``OUTPUT_FILE``,
    ``NUM_JOBS``) and the module-level ``all_samples`` / ``phenotype_to_samples``.
    SVs with fewer than 10 carriers or fewer than 10 non-carriers inside
    ``all_samples`` are skipped. Every remaining SV is paired with every disease
    in ``phenotype_to_samples``, the pairs are split across ``NUM_JOBS`` worker
    processes, and the rows returned by ``run_chunk`` are written to
    ``OUTPUT_FILE`` as a tab-separated table.

    NOTE(review): the ``pvalue`` column is written exactly as returned by the
    test; no multiple-testing correction is applied here.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"Input file {INPUT_FILE} not found, exiting...")
        return

    print(f"Processing genome-wide SV–disease associations with {NUM_JOBS} jobs...")

    sv_df = pd.read_csv(INPUT_FILE, sep="\t", dtype=str)

    def _parse_sample_ids(field):
        # With dtype=str an empty field is read as NaN (a float); calling .split()
        # on it would raise, so treat it as "no carriers".
        if not isinstance(field, str):
            return set()
        # Strip whitespace so "a, b" matches the cohort IDs; drop empty tokens.
        return {token.strip() for token in field.split(",") if token.strip()}

    task_list = []
    meta_info = {}
    duplicate_ids = 0

    # Iterate over the needed columns directly; cheaper than DataFrame.iterrows().
    rows = zip(
        sv_df["1KG_ID"],
        sv_df["Gene_Names"],
        sv_df["sens07_ID"],
        sv_df["Sample_IDs"].map(_parse_sample_ids),
    )
    for sv_id, gene_names, sens07_id, carriers in rows:
        sv_samples = carriers & all_samples
        control_samples = all_samples - sv_samples

        if len(sv_samples) < 10 or len(control_samples) < 10:
            continue

        # meta_info is keyed by sv_id. A repeated ID used to overwrite the earlier
        # entry while its tests were queued again, giving duplicated output rows.
        # Keep the first occurrence and report how many rows were skipped.
        if sv_id in meta_info:
            duplicate_ids += 1
            continue

        meta_info[sv_id] = (gene_names, sens07_id, sv_samples)
        task_list.extend((sv_id, disease) for disease in phenotype_to_samples)

    if duplicate_ids:
        print(f"Warning: skipped {duplicate_ids} rows with a repeated 1KG_ID")

    print(f"{len(task_list)} SV–disease pairs to test")

    if not task_list:
        print("No SV–disease pairs passed filters.")
        return

    task_chunks = chunk_tasks(task_list, NUM_JOBS)

    # Each worker gets one chunk plus the shared lookup tables.
    with Pool(NUM_JOBS) as pool:
        chunked_results = pool.starmap(
            run_chunk,
            [
                (chunk, meta_info, phenotype_to_samples, all_samples)
                for chunk in task_chunks
            ],
        )

    results = [r for chunk in chunked_results for r in chunk]

    output_df = pd.DataFrame(
        results,
        columns=[
        "AoU_ID", "Gene_Names", "sens07_ID", "Sample_Count", "Disease",
        "SV_with_Disease", "SV_without_Disease", "NonSV_with_Disease", "NonSV_without_Disease",
        "Odds_ratio", "CI_lower", "CI_upper", "pvalue"
        ],
    )
    output_df.to_csv(OUTPUT_FILE, sep="\t", index=False)
    print(f"Saved {OUTPUT_FILE}")


# Entry point: run the analysis when this cell is executed directly.
if __name__ == "__main__":
    main()


Processing genome-wide SV–disease associations with 4 jobs...
2638740 SV–disease pairs to test
Saved /home/jupyter/process/SV_Disease_associations.genomewide.tsv


In [1]:
# Identification of SVs in females associated with female-specific diseases
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from statsmodels.stats.contingency_tables import Table2x2
from multiprocessing import Pool
import math
import os

# Input SV table, output association table, and number of worker processes.
INPUT_FILE = "/home/jupyter/process/filtered_genomewide_SVs.phase1_LD.tsv.gz"
OUTPUT_FILE = "/home/jupyter/process/SV_Disease_associations.female.tsv"
NUM_JOBS = 4

# Female cohort: one sample ID per line, blank lines ignored, IDs kept as strings.
with open("/home/jupyter/process/AoU_LR_condition_female_samples") as f:
    all_samples = set(filter(None, map(str.strip, f)))

# Names of the female-specific diseases to test: one per line, blank lines ignored.
with open("/home/jupyter/process/phase1_important_diseases.female.txt") as f:
    important_diseases = set(filter(None, map(str.strip, f)))

# Everything is read as text so person IDs compare equal to the cohort IDs above.
ehr_df = pd.read_csv("/home/jupyter/process/LR_samples_condition_info.unique.csv", dtype=str)

# Keep rows that are class "Disorder", flagged standard ("S"), and on the disease list.
is_disorder = ehr_df["concept_class_id"] == "Disorder"
is_standard = ehr_df["standard_concept"] == "S"
is_selected = ehr_df["concept_name"].isin(important_diseases)
ehr_df = ehr_df[is_disorder & is_standard & is_selected]

# disease name -> set of person IDs recorded with that disease
phenotype_to_samples = {
    name: set(map(str, person_ids))
    for name, person_ids in ehr_df.groupby("concept_name")["person_id"]
}


def run_fisher(sv_id, disease, meta_info, phenotype_to_samples, all_samples, alpha=0.05):
    """Test one SV–disease pair with Fisher's exact test.

    Builds the 2x2 table (SV carrier / non-carrier) x (with / without disease)
    over ``all_samples`` and returns one result row.

    Parameters
    ----------
    sv_id : hashable
        Key into ``meta_info``.
    disease : hashable
        Key into ``phenotype_to_samples``.
    meta_info : dict
        Maps ``sv_id`` to ``(gene_names, sens07_id, sv_samples)``, where
        ``sv_samples`` is the set of carrier sample IDs.
    phenotype_to_samples : dict
        Maps a disease to the set of sample IDs recorded with it.
    all_samples : set
        Cohort the test is restricted to.
    alpha : float, optional
        ``1 - alpha`` is the nominal coverage of the confidence interval.

    Returns
    -------
    list or None
        ``[sv_id, gene_names, sens07_id, number of carriers, disease, the four
        cell counts, odds ratio, CI lower, CI upper, p-value]``, or ``None`` when
        any cell of the table is zero (the pair is then not reported).
    """
    from statistics import NormalDist

    gene_names, sens07_id, sv_samples = meta_info[sv_id]
    disease_samples = phenotype_to_samples[disease] & all_samples

    sv_case = len(sv_samples & disease_samples)
    sv_non_case = len(sv_samples - disease_samples)

    non_sv_samples = all_samples - sv_samples
    non_sv_case = len(non_sv_samples & disease_samples)
    non_sv_non_case = len(non_sv_samples - disease_samples)

    # Tables with an empty cell are skipped, so every count below is >= 1.
    if min(sv_case, sv_non_case, non_sv_case, non_sv_non_case) == 0:
        return None

    table = [[sv_case, sv_non_case],
             [non_sv_case, non_sv_non_case]]

    odds_ratio, pvalue = fisher_exact(table)

    # Wald (normal-approximation) interval on the log odds ratio.
    # The previous call, Table2x2(table).oddsratio_confint(method="exact"), produced
    # this same interval: statsmodels only implements the normal approximation, so
    # the "exact" label was misleading. Computing it directly also removes the broad
    # `except Exception`, which could hide real errors behind NaN bounds.
    # If an exact conditional interval is wanted, see scipy.stats.contingency.odds_ratio.
    z = NormalDist().inv_cdf(1 - alpha / 2)
    log_or = math.log((sv_case * non_sv_non_case) / (sv_non_case * non_sv_case))
    log_or_se = math.sqrt(
        1 / sv_case + 1 / sv_non_case + 1 / non_sv_case + 1 / non_sv_non_case
    )
    ci_lower = math.exp(log_or - z * log_or_se)
    ci_upper = math.exp(log_or + z * log_or_se)

    return [
        sv_id,
        gene_names,
        sens07_id,
        len(sv_samples),
        disease,
        sv_case,
        sv_non_case,
        non_sv_case,
        non_sv_non_case,
        round(odds_ratio, 3),
        round(ci_lower, 3),
        round(ci_upper, 3),
        pvalue,
    ]


def chunk_tasks(tasks, n_chunks):
    """Split ``tasks`` into at most ``n_chunks`` consecutive slices.

    Every slice has ``ceil(len(tasks) / n_chunks)`` items except possibly the
    last one. An empty ``tasks`` gives an empty list.
    """
    if not tasks:
        return []
    step = math.ceil(len(tasks) / n_chunks)
    pieces = []
    for start in range(0, len(tasks), step):
        pieces.append(tasks[start:start + step])
    return pieces


def run_chunk(chunk, meta_info, phenotype_to_samples, all_samples):
    """Run ``run_fisher`` on every (sv_id, disease) pair in ``chunk``.

    Returns the result rows in input order, leaving out pairs for which
    ``run_fisher`` returned ``None``.
    """
    outcomes = (
        run_fisher(sv_id, disease, meta_info, phenotype_to_samples, all_samples)
        for sv_id, disease in chunk
    )
    return [row for row in outcomes if row is not None]


def main():
    """Run every SV–disease Fisher test for this cell's cohort and write the results.

    Uses the module-level configuration (``INPUT_FILE``, ``OUTPUT_FILE``,
    ``NUM_JOBS``) and the module-level ``all_samples`` / ``phenotype_to_samples``.
    SVs with fewer than 10 carriers or fewer than 10 non-carriers inside
    ``all_samples`` are skipped. Every remaining SV is paired with every disease
    in ``phenotype_to_samples``, the pairs are split across ``NUM_JOBS`` worker
    processes, and the rows returned by ``run_chunk`` are written to
    ``OUTPUT_FILE`` as a tab-separated table.

    NOTE(review): the ``pvalue`` column is written exactly as returned by the
    test; no multiple-testing correction is applied here.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"Input file {INPUT_FILE} not found, exiting...")
        return

    print(f"Processing genome-wide SV–disease associations with {NUM_JOBS} jobs...")

    sv_df = pd.read_csv(INPUT_FILE, sep="\t", dtype=str)

    def _parse_sample_ids(field):
        # With dtype=str an empty field is read as NaN (a float); calling .split()
        # on it would raise, so treat it as "no carriers".
        if not isinstance(field, str):
            return set()
        # Strip whitespace so "a, b" matches the cohort IDs; drop empty tokens.
        return {token.strip() for token in field.split(",") if token.strip()}

    task_list = []
    meta_info = {}
    duplicate_ids = 0

    # Iterate over the needed columns directly; cheaper than DataFrame.iterrows().
    rows = zip(
        sv_df["1KG_ID"],
        sv_df["Gene_Names"],
        sv_df["sens07_ID"],
        sv_df["Sample_IDs"].map(_parse_sample_ids),
    )
    for sv_id, gene_names, sens07_id, carriers in rows:
        sv_samples = carriers & all_samples
        control_samples = all_samples - sv_samples

        if len(sv_samples) < 10 or len(control_samples) < 10:
            continue

        # meta_info is keyed by sv_id. A repeated ID used to overwrite the earlier
        # entry while its tests were queued again, giving duplicated output rows.
        # Keep the first occurrence and report how many rows were skipped.
        if sv_id in meta_info:
            duplicate_ids += 1
            continue

        meta_info[sv_id] = (gene_names, sens07_id, sv_samples)
        task_list.extend((sv_id, disease) for disease in phenotype_to_samples)

    if duplicate_ids:
        print(f"Warning: skipped {duplicate_ids} rows with a repeated 1KG_ID")

    print(f"{len(task_list)} SV–disease pairs to test")

    if not task_list:
        print("No SV–disease pairs passed filters.")
        return

    task_chunks = chunk_tasks(task_list, NUM_JOBS)

    # Each worker gets one chunk plus the shared lookup tables.
    with Pool(NUM_JOBS) as pool:
        chunked_results = pool.starmap(
            run_chunk,
            [
                (chunk, meta_info, phenotype_to_samples, all_samples)
                for chunk in task_chunks
            ],
        )

    results = [r for chunk in chunked_results for r in chunk]

    output_df = pd.DataFrame(
        results,
        columns=[
        "AoU_ID", "Gene_Names", "sens07_ID", "Sample_Count", "Disease",
        "SV_with_Disease", "SV_without_Disease", "NonSV_with_Disease", "NonSV_without_Disease",
        "Odds_ratio", "CI_lower", "CI_upper", "pvalue"
        ],
    )
    output_df.to_csv(OUTPUT_FILE, sep="\t", index=False)
    print(f"Saved {OUTPUT_FILE}")


# Entry point: run the analysis when this cell is executed directly.
if __name__ == "__main__":
    main()


Processing genome-wide SV–disease associations with 4 jobs...
107982 SV–disease pairs to test
Saved /home/jupyter/process/SV_Disease_associations.female.tsv


In [2]:
# Identification of SVs in males associated with male-specific diseases
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from statsmodels.stats.contingency_tables import Table2x2
from multiprocessing import Pool
import math
import os

# Input SV table, output association table, and number of worker processes.
INPUT_FILE = "/home/jupyter/process/filtered_genomewide_SVs.phase1_LD.tsv.gz"
OUTPUT_FILE = "/home/jupyter/process/SV_Disease_associations.male.tsv"
NUM_JOBS = 4

# Male cohort: one sample ID per line, blank lines ignored, IDs kept as strings.
with open("/home/jupyter/process/AoU_LR_condition_male_samples") as f:
    all_samples = set(filter(None, map(str.strip, f)))

# Names of the male-specific diseases to test: one per line, blank lines ignored.
with open("/home/jupyter/process/phase1_important_diseases.male.txt") as f:
    important_diseases = set(filter(None, map(str.strip, f)))

# Everything is read as text so person IDs compare equal to the cohort IDs above.
ehr_df = pd.read_csv("/home/jupyter/process/LR_samples_condition_info.unique.csv", dtype=str)

# Keep rows that are class "Disorder", flagged standard ("S"), and on the disease list.
is_disorder = ehr_df["concept_class_id"] == "Disorder"
is_standard = ehr_df["standard_concept"] == "S"
is_selected = ehr_df["concept_name"].isin(important_diseases)
ehr_df = ehr_df[is_disorder & is_standard & is_selected]

# disease name -> set of person IDs recorded with that disease
phenotype_to_samples = {
    name: set(map(str, person_ids))
    for name, person_ids in ehr_df.groupby("concept_name")["person_id"]
}


def run_fisher(sv_id, disease, meta_info, phenotype_to_samples, all_samples, alpha=0.05):
    """Test one SV–disease pair with Fisher's exact test.

    Builds the 2x2 table (SV carrier / non-carrier) x (with / without disease)
    over ``all_samples`` and returns one result row.

    Parameters
    ----------
    sv_id : hashable
        Key into ``meta_info``.
    disease : hashable
        Key into ``phenotype_to_samples``.
    meta_info : dict
        Maps ``sv_id`` to ``(gene_names, sens07_id, sv_samples)``, where
        ``sv_samples`` is the set of carrier sample IDs.
    phenotype_to_samples : dict
        Maps a disease to the set of sample IDs recorded with it.
    all_samples : set
        Cohort the test is restricted to.
    alpha : float, optional
        ``1 - alpha`` is the nominal coverage of the confidence interval.

    Returns
    -------
    list or None
        ``[sv_id, gene_names, sens07_id, number of carriers, disease, the four
        cell counts, odds ratio, CI lower, CI upper, p-value]``, or ``None`` when
        any cell of the table is zero (the pair is then not reported).
    """
    from statistics import NormalDist

    gene_names, sens07_id, sv_samples = meta_info[sv_id]
    disease_samples = phenotype_to_samples[disease] & all_samples

    sv_case = len(sv_samples & disease_samples)
    sv_non_case = len(sv_samples - disease_samples)

    non_sv_samples = all_samples - sv_samples
    non_sv_case = len(non_sv_samples & disease_samples)
    non_sv_non_case = len(non_sv_samples - disease_samples)

    # Tables with an empty cell are skipped, so every count below is >= 1.
    if min(sv_case, sv_non_case, non_sv_case, non_sv_non_case) == 0:
        return None

    table = [[sv_case, sv_non_case],
             [non_sv_case, non_sv_non_case]]

    odds_ratio, pvalue = fisher_exact(table)

    # Wald (normal-approximation) interval on the log odds ratio.
    # The previous call, Table2x2(table).oddsratio_confint(method="exact"), produced
    # this same interval: statsmodels only implements the normal approximation, so
    # the "exact" label was misleading. Computing it directly also removes the broad
    # `except Exception`, which could hide real errors behind NaN bounds.
    # If an exact conditional interval is wanted, see scipy.stats.contingency.odds_ratio.
    z = NormalDist().inv_cdf(1 - alpha / 2)
    log_or = math.log((sv_case * non_sv_non_case) / (sv_non_case * non_sv_case))
    log_or_se = math.sqrt(
        1 / sv_case + 1 / sv_non_case + 1 / non_sv_case + 1 / non_sv_non_case
    )
    ci_lower = math.exp(log_or - z * log_or_se)
    ci_upper = math.exp(log_or + z * log_or_se)

    return [
        sv_id,
        gene_names,
        sens07_id,
        len(sv_samples),
        disease,
        sv_case,
        sv_non_case,
        non_sv_case,
        non_sv_non_case,
        round(odds_ratio, 3),
        round(ci_lower, 3),
        round(ci_upper, 3),
        pvalue,
    ]


def chunk_tasks(tasks, n_chunks):
    """Split ``tasks`` into at most ``n_chunks`` consecutive slices.

    Every slice has ``ceil(len(tasks) / n_chunks)`` items except possibly the
    last one. An empty ``tasks`` gives an empty list.
    """
    if not tasks:
        return []
    step = math.ceil(len(tasks) / n_chunks)
    pieces = []
    for start in range(0, len(tasks), step):
        pieces.append(tasks[start:start + step])
    return pieces


def run_chunk(chunk, meta_info, phenotype_to_samples, all_samples):
    """Run ``run_fisher`` on every (sv_id, disease) pair in ``chunk``.

    Returns the result rows in input order, leaving out pairs for which
    ``run_fisher`` returned ``None``.
    """
    outcomes = (
        run_fisher(sv_id, disease, meta_info, phenotype_to_samples, all_samples)
        for sv_id, disease in chunk
    )
    return [row for row in outcomes if row is not None]


def main():
    """Run every SV–disease Fisher test for this cell's cohort and write the results.

    Uses the module-level configuration (``INPUT_FILE``, ``OUTPUT_FILE``,
    ``NUM_JOBS``) and the module-level ``all_samples`` / ``phenotype_to_samples``.
    SVs with fewer than 10 carriers or fewer than 10 non-carriers inside
    ``all_samples`` are skipped. Every remaining SV is paired with every disease
    in ``phenotype_to_samples``, the pairs are split across ``NUM_JOBS`` worker
    processes, and the rows returned by ``run_chunk`` are written to
    ``OUTPUT_FILE`` as a tab-separated table.

    NOTE(review): the ``pvalue`` column is written exactly as returned by the
    test; no multiple-testing correction is applied here.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"Input file {INPUT_FILE} not found, exiting...")
        return

    print(f"Processing genome-wide SV–disease associations with {NUM_JOBS} jobs...")

    sv_df = pd.read_csv(INPUT_FILE, sep="\t", dtype=str)

    def _parse_sample_ids(field):
        # With dtype=str an empty field is read as NaN (a float); calling .split()
        # on it would raise, so treat it as "no carriers".
        if not isinstance(field, str):
            return set()
        # Strip whitespace so "a, b" matches the cohort IDs; drop empty tokens.
        return {token.strip() for token in field.split(",") if token.strip()}

    task_list = []
    meta_info = {}
    duplicate_ids = 0

    # Iterate over the needed columns directly; cheaper than DataFrame.iterrows().
    rows = zip(
        sv_df["1KG_ID"],
        sv_df["Gene_Names"],
        sv_df["sens07_ID"],
        sv_df["Sample_IDs"].map(_parse_sample_ids),
    )
    for sv_id, gene_names, sens07_id, carriers in rows:
        sv_samples = carriers & all_samples
        control_samples = all_samples - sv_samples

        if len(sv_samples) < 10 or len(control_samples) < 10:
            continue

        # meta_info is keyed by sv_id. A repeated ID used to overwrite the earlier
        # entry while its tests were queued again, giving duplicated output rows.
        # Keep the first occurrence and report how many rows were skipped.
        if sv_id in meta_info:
            duplicate_ids += 1
            continue

        meta_info[sv_id] = (gene_names, sens07_id, sv_samples)
        task_list.extend((sv_id, disease) for disease in phenotype_to_samples)

    if duplicate_ids:
        print(f"Warning: skipped {duplicate_ids} rows with a repeated 1KG_ID")

    print(f"{len(task_list)} SV–disease pairs to test")

    if not task_list:
        print("No SV–disease pairs passed filters.")
        return

    task_chunks = chunk_tasks(task_list, NUM_JOBS)

    # Each worker gets one chunk plus the shared lookup tables.
    with Pool(NUM_JOBS) as pool:
        chunked_results = pool.starmap(
            run_chunk,
            [
                (chunk, meta_info, phenotype_to_samples, all_samples)
                for chunk in task_chunks
            ],
        )

    results = [r for chunk in chunked_results for r in chunk]

    output_df = pd.DataFrame(
        results,
        columns=[
        "AoU_ID", "Gene_Names", "sens07_ID", "Sample_Count", "Disease",
        "SV_with_Disease", "SV_without_Disease", "NonSV_with_Disease", "NonSV_without_Disease",
        "Odds_ratio", "CI_lower", "CI_upper", "pvalue"
        ],
    )
    output_df.to_csv(OUTPUT_FILE, sep="\t", index=False)
    print(f"Saved {OUTPUT_FILE}")


# Entry point: run the analysis when this cell is executed directly.
if __name__ == "__main__":
    main()


Processing genome-wide SV–disease associations with 4 jobs...
29502 SV–disease pairs to test
Saved /home/jupyter/process/SV_Disease_associations.male.tsv
