# Create the Excel file for analysis with every table joined

In [1]:
import pandas as pd
import sqlalchemy as sa

from analysis.database import get_engine

## Conditions


In [3]:
# Long-format query: individual_id, condition name and the has_condition flag
# for every individual_condition record.
CONDITION_FLAGS_QUERY = """
    SELECT i.individual_id, c.condition, ic.has_condition
    FROM individual i
    JOIN individual_condition ic
    ON i.individual_id = ic.individual_id
    JOIN condition c
    ON ic.condition_id = c.condition_id
    """

with get_engine() as engine:
    conditions_long = pd.read_sql(CONDITION_FLAGS_QUERY, engine)

# Reshape to one row per individual and one column per condition.
# rename_axis(columns=None) clears the leftover "condition" label on the
# column axis so it does not show up as a header in the table.
conditions_df = conditions_long.pivot(
    index="individual_id",
    columns="condition",
    values="has_condition",
).rename_axis(columns=None)

conditions_df.head()

Unnamed: 0_level_0,Adult-onset primary generalised epilepsy,Arrhythmia at rest,Arrhythmogenic right ventricular cardiomyopathy,Arteriovenous malformation,Ascending aortic aneurysm,Atrial fibrillation,Atrial flutter,Atrial standstill,Atrial tachycardia,Attention deficit hyperactivity disorder,...,Sudden cardiac death,Sudden infant death syndrome,Supraventricular tachycardia,Syncope,Syncope exercise/stress induced,Third-degree atrioventricular block,Unspecified premature ventricular contractions,Ventricular fibrillation,Ventricular tachycardia (unspecified),Weight loss
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,False,,,False,,,
3,,,,,,False,,,False,,...,,,,True,True,,False,,,
4,,False,,,,False,,,True,,...,,,,False,,,False,,,
5,,,,,,,,,,,...,,,,True,,,,,,


In [22]:
# LEFT JOINs keep every row of `individual`, including individuals that have
# no individual_condition record (their condition fields come back as NULL).
CONDITION_DETAILS_QUERY = """
    SELECT i.individual_id, c.condition, ic.age_of_onset, ic.description, ic.onset_symptoms, ic.age_of_presentation, ic.primary_diagnosis
    FROM individual i
    LEFT JOIN individual_condition ic
    ON i.individual_id = ic.individual_id
    LEFT JOIN condition c
    ON ic.condition_id = c.condition_id
    """

with get_engine() as engine:
    other_condition_info = pd.read_sql(CONDITION_DETAILS_QUERY, engine)

other_condition_info.head()

Unnamed: 0,individual_id,condition,age_of_onset,description,onset_symptoms,age_of_presentation,primary_diagnosis
0,1,Heart Structure Abnormality,,,,,
1,1,Baseline/resting electrocardiogram abnormality,,,,,
2,1,Polymorphic ventricular tachycardia,,,,,
3,1,Exercise/stress induced polymorphic ventricula...,,,,,
4,2,Heart Structure Abnormality,,,,,


In [23]:
def _join_non_null(values):
    """Concatenate the non-null entries of a Series, separated by "; "."""
    return "; ".join(values.dropna())


# Collapse the free-text fields to a single string per individual, then give
# the columns names that will not clash once joined with the other tables.
description_and_onset_symptoms = (
    other_condition_info
    .groupby("individual_id")
    .agg({"description": _join_non_null, "onset_symptoms": _join_non_null})
    .rename(
        columns={
            "description": "condition_descriptions",
            "onset_symptoms": "condition_onset_symptoms",
        }
    )
)
description_and_onset_symptoms.head()

Unnamed: 0_level_0,condition_descriptions,condition_onset_symptoms
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,,
2,,none
3,,
4,,
5,,


In [24]:
def _primary_condition_names(group):
    """Return the conditions flagged as primary diagnosis for one individual.

    The names of all rows whose ``primary_diagnosis`` equals 1 are joined with
    ", ". Returns None when the group's ``primary_diagnosis`` values do not sum
    to a positive number (i.e. nothing is flagged).
    """
    if group["primary_diagnosis"].sum() > 0:
        is_primary = group["primary_diagnosis"] == 1
        return ", ".join(group.loc[is_primary, "condition"].values)
    return None


# One entry per individual: the condition(s) marked as their primary diagnosis.
primary_diagnosis = (
    other_condition_info
    .groupby("individual_id")
    .apply(_primary_condition_names, include_groups=False)
    .rename("primary_diagnosis")
)
primary_diagnosis.head()

individual_id
1    Catecholaminergic polymorphic ventricular tach...
2    Catecholaminergic polymorphic ventricular tach...
3    Catecholaminergic polymorphic ventricular tach...
4    Catecholaminergic polymorphic ventricular tach...
5    Catecholaminergic polymorphic ventricular tach...
Name: primary_diagnosis, dtype: object

In [25]:
# Per-condition fields to spread into wide form (one column per
# field/condition pair, one row per individual).
onset_fields = ["age_of_onset", "onset_symptoms", "age_of_presentation"]

onset_wide = other_condition_info.pivot(
    index="individual_id",
    columns="condition",
    values=onset_fields,
)
# Clear the name attribute on the column index; the labels themselves are
# rebuilt as flat strings below.
onset_wide.columns.name = None

# Keep only the field/condition columns that hold at least one value.
age_of_onsets = onset_wide.dropna(axis=1, how="all")

# Flatten the two-level column labels into "field::condition" strings.
age_of_onsets.columns = [
    "::".join(label).strip().strip(":") for label in age_of_onsets.columns
]

age_of_onsets.head()

Unnamed: 0_level_0,age_of_onset::Catecholaminergic polymorphic ventricular tachycardia 1,onset_symptoms::Catecholaminergic polymorphic ventricular tachycardia 1,age_of_presentation::Catecholaminergic polymorphic ventricular tachycardia 1
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,,,36.0
2,,none,41.0
3,,,16.0
4,2.0,,
5,10.0,,28.0


In [26]:
# Attach each per-individual table to the condition flags in turn. Every join
# is a left join on the individual_id index, so the rows of conditions_df
# determine which individuals end up in the result.
condition_info_df = conditions_df
for extra_table in (
    primary_diagnosis,
    age_of_onsets,
    description_and_onset_symptoms,
):
    condition_info_df = condition_info_df.join(extra_table)

condition_info_df.head()

Unnamed: 0_level_0,Adult-onset primary generalised epilepsy,Arrhythmia at rest,Arrhythmogenic right ventricular cardiomyopathy,Arteriovenous malformation,Ascending aortic aneurysm,Atrial fibrillation,Atrial flutter,Atrial standstill,Atrial tachycardia,Attention deficit hyperactivity disorder,...,Unspecified premature ventricular contractions,Ventricular fibrillation,Ventricular tachycardia (unspecified),Weight loss,primary_diagnosis,age_of_onset::Catecholaminergic polymorphic ventricular tachycardia 1,onset_symptoms::Catecholaminergic polymorphic ventricular tachycardia 1,age_of_presentation::Catecholaminergic polymorphic ventricular tachycardia 1,condition_descriptions,condition_onset_symptoms
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,Catecholaminergic polymorphic ventricular tach...,,,36.0,,
2,,,,,,,,,,,...,False,,,,Catecholaminergic polymorphic ventricular tach...,,none,41.0,,none
3,,,,,,False,,,False,,...,False,,,,Catecholaminergic polymorphic ventricular tach...,,,16.0,,
4,,False,,,,False,,,True,,...,False,,,,Catecholaminergic polymorphic ventricular tach...,2.0,,,,
5,,,,,,,,,,,...,,,,,Catecholaminergic polymorphic ventricular tach...,10.0,,28.0,,


In [27]:
# Sanity check: every individual in the database must appear in the joined
# table. conditions_df was built with inner joins, so an individual without
# any condition record would otherwise be dropped from the export unnoticed.
with get_engine() as engine:
    individual_ids = pd.read_sql("SELECT individual_id FROM individual", engine)

missing_individual_ids = set(individual_ids["individual_id"]) - set(
    condition_info_df.index
)

# Fail loudly instead of relying on someone reading the cell output, so a
# Restart & Run All cannot write an incomplete Excel file.
assert not missing_individual_ids, (
    f"{len(missing_individual_ids)} individual(s) missing from "
    f"condition_info_df, e.g. {sorted(missing_individual_ids)[:10]}"
)

# Still display the (empty) set as the cell output.
missing_individual_ids

set()

In [29]:
from pathlib import Path

# Temporarily save this table; directory for the intermediate Excel files
# written by this notebook.
output = Path("../data/04_create_excel_file")

if not output.exists():
    print("Creating output directory")
    # parents=True: also create ../data when it is missing (a bare mkdir()
    # raises FileNotFoundError in that case).
    # exist_ok=True: tolerate the directory appearing between the check above
    # and this call.
    output.mkdir(parents=True, exist_ok=True)

condition_info_df.to_excel(output / "01_condition_info.xlsx")

Creating output directory


# Family History