# Create the Excel file for analysis with every table joined

In [1]:
import pandas as pd
import sqlalchemy as sa

from analysis.database import get_engine

## Conditions


In [2]:
with get_engine() as engine:
    conditions_df = pd.read_sql(
        """
    SELECT i.individual_id, c.condition, ic.has_condition
    FROM individual i
    JOIN individual_condition ic
    ON i.individual_id = ic.individual_id
    JOIN condition c
    ON ic.condition_id = c.condition_id
    """,
        engine,
    )

    conditions_df = conditions_df.pivot(index="individual_id",
                                        columns="condition",
                                        values="has_condition")

    # remove the condition column name
    conditions_df.columns.name = None

conditions_df.head()

Unnamed: 0_level_0,Adult-onset primary generalised epilepsy,Arrhythmia at rest,Arrhythmogenic right ventricular cardiomyopathy,Arteriovenous malformation,Ascending aortic aneurysm,Atrial fibrillation,Atrial flutter,Atrial standstill,Atrial tachycardia,Attention deficit hyperactivity disorder,...,Sudden cardiac death,Sudden infant death syndrome,Supraventricular tachycardia,Syncope,Syncope exercise/stress induced,Third-degree atrioventricular block,Unspecified premature ventricular contractions,Ventricular fibrillation,Ventricular tachycardia (unspecified),Weight loss
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,False,,,False,,,
3,,,,,,False,,,False,,...,,,,True,True,,False,,,
4,,False,,,,False,,,True,,...,,,,False,,,False,,,
5,,,,,,,,,,,...,,,,True,,,,,,


In [3]:
with get_engine() as engine:
    other_condition_info = pd.read_sql(
        """
    SELECT i.individual_id, c.condition, ic.age_of_onset, ic.description, ic.onset_symptoms, ic.age_of_presentation, ic.primary_diagnosis
    FROM individual i
    LEFT JOIN individual_condition ic
    ON i.individual_id = ic.individual_id
    LEFT JOIN condition c
    ON ic.condition_id = c.condition_id
    """,
        engine,
    )

other_condition_info.head()

Unnamed: 0,individual_id,condition,age_of_onset,description,onset_symptoms,age_of_presentation,primary_diagnosis
0,1,Heart Structure Abnormality,,,,,
1,1,Baseline/resting electrocardiogram abnormality,,,,,
2,1,Polymorphic ventricular tachycardia,,,,,
3,1,Exercise/stress induced polymorphic ventricula...,,,,,
4,2,Heart Structure Abnormality,,,,,


In [4]:
description_and_onset_symptoms = other_condition_info.groupby(
    "individual_id").agg(
    {
        "description": lambda x: "; ".join(x.dropna()),
        "onset_symptoms": lambda x: "; ".join(x.dropna())
    }
)
description_and_onset_symptoms.rename(
    columns={
        "description": "condition_descriptions",
        "onset_symptoms": "condition_onset_symptoms"
    },
    inplace=True
)
description_and_onset_symptoms.head()

Unnamed: 0_level_0,condition_descriptions,condition_onset_symptoms
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,,
2,,none
3,,
4,,
5,,


In [5]:
# primary diagnosis (whatever is true for the individual)
primary_diagnosis = other_condition_info.groupby("individual_id").apply(
    lambda x: ", ".join(x[x["primary_diagnosis"] == 1]["condition"].values) if
    x["primary_diagnosis"].sum() > 0 else None,
    include_groups=False
)
primary_diagnosis = primary_diagnosis.rename("primary_diagnosis")
primary_diagnosis.head()

individual_id
1    Catecholaminergic polymorphic ventricular tach...
2    Catecholaminergic polymorphic ventricular tach...
3    Catecholaminergic polymorphic ventricular tach...
4    Catecholaminergic polymorphic ventricular tach...
5    Catecholaminergic polymorphic ventricular tach...
Name: primary_diagnosis, dtype: object

In [6]:
age_of_onsets = other_condition_info.pivot(
    index="individual_id",
    columns="condition",
    values=["age_of_onset", "onset_symptoms", "age_of_presentation"],
)
age_of_onsets.columns.name = None
# drop any columns that are all NaN
age_of_onsets = age_of_onsets.dropna(axis=1, how="all")
age_of_onsets.columns = [
    "::".join(col).strip().strip(":") for col in age_of_onsets.columns.values
]

age_of_onsets.head()

Unnamed: 0_level_0,age_of_onset::Catecholaminergic polymorphic ventricular tachycardia 1,onset_symptoms::Catecholaminergic polymorphic ventricular tachycardia 1,age_of_presentation::Catecholaminergic polymorphic ventricular tachycardia 1
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,,,36.0
2,,none,41.0
3,,,16.0
4,2.0,,
5,10.0,,28.0


In [7]:
# join all the tables together
condition_info_df = conditions_df.join(primary_diagnosis).join(
    age_of_onsets).join(description_and_onset_symptoms)

condition_info_df = condition_info_df.reset_index()

condition_info_df.head()

Unnamed: 0,individual_id,Adult-onset primary generalised epilepsy,Arrhythmia at rest,Arrhythmogenic right ventricular cardiomyopathy,Arteriovenous malformation,Ascending aortic aneurysm,Atrial fibrillation,Atrial flutter,Atrial standstill,Atrial tachycardia,...,Unspecified premature ventricular contractions,Ventricular fibrillation,Ventricular tachycardia (unspecified),Weight loss,primary_diagnosis,age_of_onset::Catecholaminergic polymorphic ventricular tachycardia 1,onset_symptoms::Catecholaminergic polymorphic ventricular tachycardia 1,age_of_presentation::Catecholaminergic polymorphic ventricular tachycardia 1,condition_descriptions,condition_onset_symptoms
0,1,,,,,,,,,,...,,,,,Catecholaminergic polymorphic ventricular tach...,,,36.0,,
1,2,,,,,,,,,,...,False,,,,Catecholaminergic polymorphic ventricular tach...,,none,41.0,,none
2,3,,,,,,False,,,False,...,False,,,,Catecholaminergic polymorphic ventricular tach...,,,16.0,,
3,4,,False,,,,False,,,True,...,False,,,,Catecholaminergic polymorphic ventricular tach...,2.0,,,,
4,5,,,,,,,,,,...,,,,,Catecholaminergic polymorphic ventricular tach...,10.0,,28.0,,


In [8]:
# see which individuals are missing from the condition_info_df
with get_engine() as engine:
    individual_ids = pd.read_sql("SELECT individual_id FROM individual", engine)

set(individual_ids["individual_id"]) - set(condition_info_df["individual_id"])

set()

In [9]:
from pathlib import Path

# temp dave this for later
output = Path("../data/04_create_excel_file")

if not output.exists():
    print("Creating output directory")
    output.mkdir()

condition_info_df.to_excel(output / "01_condition_info.xlsx", index=False)

## Family History

In [10]:
with get_engine() as engine:
    family_member_history = pd.read_sql(
        """
    SELECT i.individual_id,
           c.condition,
           fmh.has_condition,
           kn.name AS relationship
    FROM individual i
             LEFT JOIN family_history_record fhr
                       ON i.individual_id = fhr.individual_id
             LEFT JOIN condition c
                       ON fhr.condition_id = c.condition_id
             JOIN family_member_history fmh
                       ON fhr.family_history_record_id =
                          fmh.family_history_record_id
             JOIN kinship_name kn
                       ON fmh.kinship_name_id = kn.kinship_name_id
    ORDER BY i.individual_id
    """,
        engine,
    )

family_member_history = family_member_history.pivot(
    index="individual_id", columns=["condition", "relationship"],
    values="has_condition"
)
family_member_history.columns.name = None
family_member_history.columns = [
    "::".join(col).strip().strip(":") for col in
    family_member_history.columns.values
]

family_member_history.head()

Unnamed: 0_level_0,Sudden cardiac death::Mother,Sudden cardiac death::Father
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,True,False
10,True,False
11,True,False
14,True,False
15,True,False


In [11]:
with get_engine() as engine:
    condition_num_family_members = pd.read_sql(
        """
    SELECT i.individual_id,
           c.condition,
           num_family_members
    FROM individual i
             JOIN family_history_record fhr
                       ON i.individual_id = fhr.individual_id
             JOIN condition c
                       ON fhr.condition_id = c.condition_id
    WHERE num_family_members IS NOT NULL
    ORDER BY num_family_members DESC
    """,
        engine,
    )
condition_num_family_members = condition_num_family_members.pivot(
    index="individual_id", columns="condition", values="num_family_members"
)
condition_num_family_members.columns = [
    f"{col}::num_family_members" if col != "individual_id" else col
    for col in condition_num_family_members.columns.values
]
condition_num_family_members.columns.name = None
condition_num_family_members.head()

Unnamed: 0_level_0,Sudden cardiac death::num_family_members
individual_id,Unnamed: 1_level_1
4,0
9,0
13,0
14,0
15,0


In [12]:
family_history = family_member_history.join(condition_num_family_members)
family_history.reset_index(inplace=True)
family_history.to_excel(output / "02_family_history.xlsx", index=False)
family_history.head()

Unnamed: 0,individual_id,Sudden cardiac death::Mother,Sudden cardiac death::Father,Sudden cardiac death::num_family_members
0,1,True,False,
1,10,True,False,
2,11,True,False,
3,14,True,False,0.0
4,15,True,False,0.0


## Treatments

In [13]:
with get_engine() as engine:
    treatments = pd.read_sql(
        """
    SELECT i.individual_id,
           tr.treatment_taken,
           tr.effective,
           t.treatment_name
    FROM individual i
             JOIN treatment_record tr
                       ON i.individual_id = tr.patient_id
             LEFT JOIN treatment t
                       ON tr.treatment_id = t.treatment_id
    ORDER BY i.individual_id
    """,
        engine,
    )

treatments = treatments.pivot(
    index="individual_id",
    columns="treatment_name",
    values=["treatment_taken", "effective"],
)
treatments.columns.name = None
treatments.columns = [
    "::".join(col).strip().strip(":") for col in treatments.columns.values
]
treatments.reset_index(inplace=True)
treatments.head()

Unnamed: 0,individual_id,treatment_taken::Beta blocker,treatment_taken::Catheter ablation,treatment_taken::Enalapril,treatment_taken::Flecainide,treatment_taken::Implantable cardioverter-defibrillator,treatment_taken::Left cardiac sympathetic denervation,treatment_taken::Verapamil,effective::Beta blocker,effective::Catheter ablation,effective::Enalapril,effective::Flecainide,effective::Implantable cardioverter-defibrillator,effective::Left cardiac sympathetic denervation,effective::Verapamil
0,2,True,False,False,True,False,False,False,False,,,True,,,
1,4,True,False,False,False,False,False,False,,,,,,,
2,6,False,,,,,,,,,,,,,
3,7,False,,,,,,,,,,,,,
4,8,False,False,False,False,True,False,False,,,,,True,,


In [14]:
treatments.to_excel(output / "03_treatments.xlsx", index=False)

## Variants

In [15]:
from sqlalchemy import text

with get_engine() as engine:
    with engine.connect() as conn:
        conn.execute(text("REFRESH MATERIALIZED VIEW individuals_mv"))
        conn.commit()

    variant_info = pd.read_sql("""
    SELECT i.individual_id,
    iv.variant_id,
    vvm.clinvar_variation_id,
    vvm.clinical_significance AS clinvar_clinical_significance,
    vvm.clinvar_conditions,
    v.hgvs_string,
    sv.p_hgvs_string,
    vvm.c_edit_type as cdna_change_type,
    vvm.p_edit_type as protein_change_type,
    z.zygosity,
    vi.variant_inheritance AS inheritance,
    vvm.exons,
    imv.exon_start,
    imv.exon_end,
    vvm.structure_domains
    FROM individual i
    LEFT JOIN individual_variant iv
    ON i.individual_id = iv.individual_id
    LEFT JOIN variant v 
    ON iv.variant_id = v.variant_id
    LEFT JOIN zygosity z
    ON iv.zygosity_id = z.zygosity_id
    LEFT JOIN variant_inheritance vi
    ON iv.variant_inheritance_id = vi.variant_inheritance_id
    LEFT JOIN sequence_variant sv
    ON v.sequence_variant_id = sv.sequence_variant_id
    LEFT JOIN variant_view_mv vvm
    ON v.variant_id = vvm.variant_id
    LEFT JOIN individuals_mv imv
    ON i.individual_id = imv.individual_id
    ORDER BY i.individual_id
    """, engine)

variant_info.head()

Unnamed: 0,individual_id,variant_id,clinvar_variation_id,clinvar_clinical_significance,clinvar_conditions,hgvs_string,p_hgvs_string,cdna_change_type,protein_change_type,zygosity,inheritance,exons,exon_start,exon_end,structure_domains
0,1,609,218487.0,Benign,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.13564-41A>G,,Substitution,,heterozygous,inherited,,,,
1,2,599,201405.0,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14885A>G,NP_001026.2:p.(Tyr4962Cys),Substitution,Substitution,,,"[105, 106)",105.0,106.0,"[{'structure_id': 19, 'structure_domain': 'Act..."
2,3,681,235053.0,Likely pathogenic,,NM_001035.3:c.14173T>A,NP_001026.2:p.(Tyr4725Asn),Substitution,Substitution,,,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra..."
3,4,3698,1067931.0,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14174A>G,NP_001026.2:p.(Tyr4725Cys),Substitution,Substitution,heterozygous,,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra..."
4,5,3698,1067931.0,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14174A>G,NP_001026.2:p.(Tyr4725Cys),Substitution,Substitution,,spontaneous,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra..."


In [16]:
variant_info["clinvar_variation_id"] = variant_info[
    "clinvar_variation_id"].fillna(-1).astype(int)
variant_info["clinvar_uri"] = variant_info.apply(
    lambda
        x: f"https://www.ncbi.nlm.nih.gov/clinvar/variation/{x['clinvar_variation_id']}"
    if x["clinvar_variation_id"] != -1 else None,
    axis=1,
)
# rearrange
variant_info_cols = []
for col in variant_info.columns:
    if col == "clinvar_uri":
        continue

    variant_info_cols.append(col)

    if col == "clinvar_variation_id":
        variant_info_cols.append("clinvar_uri")

variant_info = variant_info[variant_info_cols]

variant_info.head()

Unnamed: 0,individual_id,variant_id,clinvar_variation_id,clinvar_uri,clinvar_clinical_significance,clinvar_conditions,hgvs_string,p_hgvs_string,cdna_change_type,protein_change_type,zygosity,inheritance,exons,exon_start,exon_end,structure_domains
0,1,609,218487,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Benign,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.13564-41A>G,,Substitution,,heterozygous,inherited,,,,
1,2,599,201405,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14885A>G,NP_001026.2:p.(Tyr4962Cys),Substitution,Substitution,,,"[105, 106)",105.0,106.0,"[{'structure_id': 19, 'structure_domain': 'Act..."
2,3,681,235053,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Likely pathogenic,,NM_001035.3:c.14173T>A,NP_001026.2:p.(Tyr4725Asn),Substitution,Substitution,,,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra..."
3,4,3698,1067931,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14174A>G,NP_001026.2:p.(Tyr4725Cys),Substitution,Substitution,heterozygous,,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra..."
4,5,3698,1067931,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14174A>G,NP_001026.2:p.(Tyr4725Cys),Substitution,Substitution,,spontaneous,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra..."


In [17]:
# get the number of individuals with more than one variant
variant_info["individual_id"].value_counts().value_counts()

count
1    1341
2       1
Name: count, dtype: int64

In [19]:
# structure inforemation
domains_df = pd.read_csv("./data_commit/ryr2_subdomains.csv")
domains_df

Unnamed: 0,domains,subdomains,subdomain_precedence
0,NTD,NTD-A,
1,SPRY,NTD-B,
2,JSol,NSol,
3,BSol,SPRY1,
4,SCLP,SPRY2,
5,CSol,SPRY3,
6,TaF,RY1&2,
7,TM,JSol,
8,CTD,BSol1,
9,,BSol2,


In [22]:
import numpy as np


def find_domain(structure_domain: list[dict], domains: set[str]):
    if not structure_domain or np.any(pd.isna(structure_domain)):
        return None

    in_domains = []

    for d in structure_domain:
        # example value: [{'structure_id': 27, 'structure_domain': 'C-terminal domain ', 'structure_domain_symbol': 'CTD'}, {'structure_id': 19, 'structure_domain': 'Activation core and channel', 'structure_domain_symbol': None}]
        domain_symbol = d["structure_domain_symbol"]

        if domain_symbol is None or pd.isna(domain_symbol):
            continue

        if domain_symbol in domains:
            in_domains.append(domain_symbol)

    if len(in_domains) == 0:
        print(
            f"No domains found: {structure_domain}. Setting to full name if structure_domain is length 1."
        )
        return return_first_full_domain_name(structure_domain)

    if len(in_domains) == 1:
        return in_domains[0]

    raise ValueError(f"Multiple domains found: {in_domains}")


def find_subdomain(
        structure_domain: list[dict], subdomain_with_precedence: dict[str, bool]
):
    if not structure_domain or np.any(pd.isna(structure_domain)):
        return None

    in_subdomains = []
    for d in structure_domain:

        domain_symbol = d["structure_domain_symbol"]

        if domain_symbol is None or pd.isna(domain_symbol):
            continue

        if domain_symbol in subdomain_with_precedence:
            in_subdomains.append(domain_symbol)

    if len(in_subdomains) == 0:
        print(
            f"No subdomains found: {structure_domain}. Setting to full name if structure_domain is length 1."
        )

        return return_first_full_domain_name(structure_domain)

    if len(in_subdomains) == 1:
        return in_subdomains[0]

    # make sure only 1 has precedence
    with_precedence = [d for d in in_subdomains if subdomain_with_precedence[d]]
    if len(with_precedence) == 1:
        return with_precedence[0]

    raise ValueError(f"Multiple subdomains found: {in_subdomains}")


def return_first_full_domain_name(structure_domain: list[dict]):
    if not structure_domain:
        return None

    if len(structure_domain) != 1:
        raise ValueError(
            f"Multiple domains found: {structure_domain}"
        )

    return structure_domain[0]["structure_domain"]


variant_info["domain"] = variant_info["structure_domains"].apply(
    lambda x: find_domain(
        x, set(d.strip() for d in domains_df["domains"].values if pd.notna(d))
    )
)

variant_info["subdomain"] = variant_info["structure_domains"].apply(
    lambda x: find_subdomain(
        x,
        {
            d.strip(): p if pd.notna(p) else False
            for d, p in zip(
            domains_df["subdomains"].values,
            domains_df["subdomain_precedence"].values,
        )
            if pd.notna(d)
        },
    )
)
variant_info.head()

No domains found: [{'structure_id': 19, 'structure_domain': 'Activation core and channel', 'structure_domain_symbol': None}]. Setting to full name if structure_domain is length 1.
No domains found: [{'structure_id': 19, 'structure_domain': 'Activation core and channel', 'structure_domain_symbol': None}]. Setting to full name if structure_domain is length 1.
No domains found: [{'structure_id': 19, 'structure_domain': 'Activation core and channel', 'structure_domain_symbol': None}]. Setting to full name if structure_domain is length 1.
No subdomains found: [{'structure_id': 19, 'structure_domain': 'Activation core and channel', 'structure_domain_symbol': None}]. Setting to full name if structure_domain is length 1.
No subdomains found: [{'structure_id': 19, 'structure_domain': 'Activation core and channel', 'structure_domain_symbol': None}]. Setting to full name if structure_domain is length 1.
No subdomains found: [{'structure_id': 19, 'structure_domain': 'Activation core and channel', 

Unnamed: 0,individual_id,variant_id,clinvar_variation_id,clinvar_uri,clinvar_clinical_significance,clinvar_conditions,hgvs_string,p_hgvs_string,cdna_change_type,protein_change_type,zygosity,inheritance,exons,exon_start,exon_end,structure_domains,domain,subdomain
0,1,609,218487,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Benign,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.13564-41A>G,,Substitution,,heterozygous,inherited,,,,,,
1,2,599,201405,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14885A>G,NP_001026.2:p.(Tyr4962Cys),Substitution,Substitution,,,"[105, 106)",105.0,106.0,"[{'structure_id': 19, 'structure_domain': 'Act...",CTD,CTD
2,3,681,235053,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Likely pathogenic,,NM_001035.3:c.14173T>A,NP_001026.2:p.(Tyr4725Asn),Substitution,Substitution,,,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra...",TM,pVSD
3,4,3698,1067931,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14174A>G,NP_001026.2:p.(Tyr4725Cys),Substitution,Substitution,heterozygous,,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra...",TM,pVSD
4,5,3698,1067931,https://www.ncbi.nlm.nih.gov/clinvar/variation...,Pathogenic/Likely pathogenic,[{'condition': 'Catecholaminergic polymorphic ...,NM_001035.3:c.14174A>G,NP_001026.2:p.(Tyr4725Cys),Substitution,Substitution,,spontaneous,"[99, 100)",99.0,100.0,"[{'structure_id': 23, 'structure_domain': 'Tra...",TM,pVSD


In [23]:
variant_info.to_excel(output / "04_variant_info.xlsx")

## Legacy in silico variant pathogenicity predictions

- SIFT
- PolyPhen
- Fathmm
- Provean

These were included in the original Excel file, but AlphaFold mutation predictions are not currently included

In [24]:
with get_engine() as engine:
    variant_path_pred = pd.read_sql("""
    SELECT pp.variant_id,
    p.model,
    pp.prediction,
    pp.score
    FROM pathogenicity_prediction pp
    JOIN pathogenicity_predictor p
    ON pp.predictor_id = p.predictor_id
    """, engine)

variant_path_pred = variant_path_pred.pivot(
    index="variant_id",
    columns="model",
    values=["prediction", "score"],
)
variant_path_pred.columns.name = None
variant_path_pred.columns = [
    "::".join(col).strip().strip(":") for col in
    variant_path_pred.columns.values
]
variant_path_pred.reset_index(inplace=True)
variant_path_pred.to_excel(output / "05_variant_path_predictions.xlsx",
                           index=False)
variant_path_pred.head()

Unnamed: 0,variant_id,prediction::fathmm,prediction::polyphen,prediction::provean,prediction::sift,score::fathmm,score::polyphen,score::provean,score::sift
0,1,,damaging,,,-3.55,1.0,-5.05,0.001
1,2,,damaging,,,-2.88,1.0,-5.22,0.001
2,4,,damaging,,,-5.5,1.0,-7.06,0.0
3,5,,damaging,,,-4.61,0.954,-6.46,0.035
4,6,,,,,-4.49,0.006,-4.0,0.001


## The rest

- Publications
- Demographics

In [25]:
with get_engine() as engine:
    individual_info = pd.read_sql("""
    SELECT i.individual_id,
    iis.value AS sex,
    p.publication_id,
    p.title,
    p.first_author,
    p.reference,
    p.doi,
    p.year,
    pd.name as publication_database,
    pd.resource_uri,
    ptd.resource_id,
    p.publication_type_id,
    pt.publication_type,
    p.rob_publication_type_id,
    prt.rob_publication_type
    FROM individual i
    LEFT JOIN individual_sex iis
        ON i.individual_sex_id = iis.individual_sex_id
    LEFT JOIN individual_to_publication itp
        ON i.individual_id = itp.individual_id
    LEFT JOIN publication p
        ON itp.publication_id = p.publication_id
    LEFT JOIN publication_to_database ptd
        ON p.publication_id = ptd.publication_id
    LEFT JOIN publication_database pd
        ON ptd.database_id = pd.database_id
    LEFT JOIN publication_type pt
        ON p.publication_type_id = pt.publication_type_id
    LEFT JOIN publication_rob_type prt
        ON p.rob_publication_type_id = prt.rob_publication_type_id
    ORDER BY i.individual_id
    """, engine)

individual_info["resource_uri"] = individual_info.apply(
    lambda x: f"{x['resource_uri']}{x['resource_id']}" if x[
        "resource_id"] else None,
    axis=1,
)
individual_info["doi_uri"] = individual_info.apply(
    lambda x: f"https://doi.org/{x['doi']}" if x["doi"] else None, axis=1
)
# rearrange so doi_uri is next to resource_id
individual_info_cols = []
for col in individual_info.columns:
    if col == "doi_uri":
        continue

    individual_info_cols.append(col)

    if col == "resource_id":
        individual_info_cols.append("doi_uri")

individual_info = individual_info[individual_info_cols]

individual_info.head()

Unnamed: 0,individual_id,sex,publication_id,title,first_author,reference,doi,year,publication_database,resource_uri,resource_id,doi_uri,publication_type_id,publication_type,rob_publication_type_id,rob_publication_type
0,1,,1,Familial Evaluation in Catecholaminergic Polym...,Van Der Werf C,,10.1161/CIRCEP.112.970517,2012,PubMed,https://pubmed.ncbi.nlm.nih.gov/22787013,22787013,https://doi.org/10.1161/CIRCEP.112.970517,5,Article,1,Cohort or Cross-Sectional Like Study
1,2,female,2,Flecainide therapy reduces exercise-induced ve...,"van der Werf, C",,10.1016/j.jacc.2011.01.026,2011,PubMed,https://pubmed.ncbi.nlm.nih.gov/21616285,21616285,https://doi.org/10.1016/j.jacc.2011.01.026,5,Article,1,Cohort or Cross-Sectional Like Study
2,3,male,3,Structural abnormalities on cardiac magnetic r...,"Gerber, D",,10.1016/j.jacep.2020.03.006,2020,PubMed,https://pubmed.ncbi.nlm.nih.gov/32553227,32553227,https://doi.org/10.1016/j.jacep.2020.03.006,3,Research Letter,1,Cohort or Cross-Sectional Like Study
3,4,female,4,Genetic Background of Catecholaminergic Polymo...,"Kawamura, M",,10.1253/circj.cj-12-1460,2013,PubMed,https://pubmed.ncbi.nlm.nih.gov/23595086,23595086,https://doi.org/10.1253/circj.cj-12-1460,5,Article,3,Case Series
4,5,male,5,Gender differences in the inheritance mode of ...,"Ohno, S.",,10.1371/journal.pone.0131517,2015,PubMed,https://pubmed.ncbi.nlm.nih.gov/26114861,26114861,https://doi.org/10.1371/journal.pone.0131517,5,Article,3,Case Series


In [26]:
individual_info["individual_id"].value_counts().value_counts()

count
1    1342
Name: count, dtype: int64

In [27]:
individual_info.to_excel(output / "06_individual_info.xlsx", index=False)

## Merge all the tables together

In [28]:
df_all = pd.merge(
    individual_info,
    variant_info,
    on="individual_id",
    how="outer",
)
df_all = pd.merge(
    df_all,
    variant_path_pred,
    on="variant_id",
    how="left"
)
df_all = pd.merge(
    df_all,
    condition_info_df,
    on="individual_id",
    how="left",
)
df_all = pd.merge(
    df_all,
    family_history,
    on="individual_id",
    how="left",
)
df_all = pd.merge(
    df_all,
    treatments,
    on="individual_id",
    how="left",
)
df_all.head()

Unnamed: 0,individual_id,sex,publication_id,title,first_author,reference,doi,year,publication_database,resource_uri,...,treatment_taken::Implantable cardioverter-defibrillator,treatment_taken::Left cardiac sympathetic denervation,treatment_taken::Verapamil,effective::Beta blocker,effective::Catheter ablation,effective::Enalapril,effective::Flecainide,effective::Implantable cardioverter-defibrillator,effective::Left cardiac sympathetic denervation,effective::Verapamil
0,1,,1,Familial Evaluation in Catecholaminergic Polym...,Van Der Werf C,,10.1161/CIRCEP.112.970517,2012,PubMed,https://pubmed.ncbi.nlm.nih.gov/22787013,...,,,,,,,,,,
1,2,female,2,Flecainide therapy reduces exercise-induced ve...,"van der Werf, C",,10.1016/j.jacc.2011.01.026,2011,PubMed,https://pubmed.ncbi.nlm.nih.gov/21616285,...,False,False,False,False,,,True,,,
2,3,male,3,Structural abnormalities on cardiac magnetic r...,"Gerber, D",,10.1016/j.jacep.2020.03.006,2020,PubMed,https://pubmed.ncbi.nlm.nih.gov/32553227,...,,,,,,,,,,
3,4,female,4,Genetic Background of Catecholaminergic Polymo...,"Kawamura, M",,10.1253/circj.cj-12-1460,2013,PubMed,https://pubmed.ncbi.nlm.nih.gov/23595086,...,False,False,False,,,,,,,
4,5,male,5,Gender differences in the inheritance mode of ...,"Ohno, S.",,10.1371/journal.pone.0131517,2015,PubMed,https://pubmed.ncbi.nlm.nih.gov/26114861,...,,,,,,,,,,


In [29]:
df_all["individual_id"].value_counts().value_counts()

count
1    1341
2       1
Name: count, dtype: int64

In [30]:
import datetime

date = datetime.datetime.now().strftime("%Y-%m-%d")

In [31]:
from openpyxl.styles import Font
from openpyxl.cell import Cell
from contextlib import contextmanager
import openpyxl
import openpyxl.utils


@contextmanager
def open_workbook(filename: str):
    wb = None

    try:
        wb = openpyxl.load_workbook(filename)
        yield wb
        wb.save(filename)
    except Exception as e:
        print("Something went wrong: ", e)
    finally:
        if wb:
            wb.close()


excel_filename = Path(f"../data/individual_all_data-{date}.xlsx")

df_all.to_excel(excel_filename, sheet_name="all_data")

with open_workbook(excel_filename) as wb:
    sheet = wb["all_data"]

    col_to_idx = {
        row[i].value: i
        for row in sheet.iter_rows(min_row=1, max_row=1)
        for i in range(len(row))
    }
    for i, row in enumerate(sheet.iter_rows(min_row=2)):
        for col in ["resource_uri", "doi_uri", "clinvar_uri"]:
            cell: Cell = row[col_to_idx[col]]

            if isinstance(cell.value, str) and cell.value.startswith("http"):
                cell.hyperlink = cell.value
                cell.font = Font(color="0000FF", underline="single")