# Create Pivot Tables

Create a pivot table of patients and conditions. Then combine all individuals and their variants into a single table for analysis.

In [1]:
import pandas as pd
import sqlalchemy as sa

In [2]:
# Load the project settings through the local helpers package. Per the original
# note the database connection is read from a .env file (that happens inside
# get_settings, which is not visible here — confirm if it matters).
# `settings.postgresql_dsn` is what the engine cell below consumes.
from helpers.settings import get_settings

settings = get_settings()

Loading settings ...


In [3]:
# Build the SQLAlchemy engine from the configured PostgreSQL DSN (as a plain string).
dsn_text = str(settings.postgresql_dsn)
engine = sa.create_engine(dsn_text)

In [4]:
# Long-format condition flags: one row per individual_condition record,
# carrying the condition name and its has_condition value.
condition_flags_sql = """
SELECT i.individual_id, c.condition, ic.has_condition
FROM individual i
JOIN individual_condition ic
ON i.individual_id = ic.individual_id
JOIN condition c
ON ic.condition_id = c.condition_id
"""
df = pd.read_sql(sql=condition_flags_sql, con=engine)
df.head()

Unnamed: 0,individual_id,condition,has_condition
0,1,Catecholaminergic polymorphic ventricular tach...,True
1,1,Heart Structure Abnormality,False
2,1,Baseline/resting electrocardiogram abnormality,False
3,1,Polymorphic ventricular tachycardia,True
4,1,Exercise/stress induced polymorphic ventricula...,True


In [5]:
# Reshape to wide form: one row per individual, one column per condition holding
# its has_condition value. individual_id is then restored as a regular column,
# which leaves the frame on a plain 0..n-1 index.
df2 = df.pivot(
    index="individual_id", columns="condition", values="has_condition"
).reset_index()

df2.head()

condition,individual_id,Adult-onset primary generalised epilepsy,Arrhythmia at rest,Arrhythmogenic right ventricular cardiomyopathy,Arteriovenous malformation,Ascending aortic aneurysm,Atrial fibrillation,Atrial flutter,Atrial standstill,Atrial tachycardia,...,Sudden cardiac death,Sudden infant death syndrome,Supraventricular tachycardia,Syncope,Syncope exercise/stress induced,Third-degree atrioventricular block,Unspecified premature ventricular contractions,Ventricular fibrillation,Ventricular tachycardia (unspecified),Weight loss
0,1,,,,,,,,,,...,,,,,,,,,,
1,2,,,,,,,,,,...,,,,False,,,False,,,
2,3,,,,,,False,,,False,...,,,,True,True,,False,,,
3,4,,False,,,,False,,,True,...,,,,False,,,False,,,
4,5,,,,,,,,,,...,,,,True,,,,,,


In [6]:
# Locate the output directory: a "data" folder next to the kernel's current
# working directory. os.getcwd() replaces shelling out to `pwd`, so the cell
# works on any platform and outside an IPython shell.
import os

notebook_path = os.getcwd()

data_dir = os.path.join(notebook_path, '..', 'data')

if not os.path.isdir(data_dir):
    print(f'Creating directory: {data_dir}')
# exist_ok makes the cell safe to re-run and avoids failing if the directory
# appears between the check above and the creation.
os.makedirs(data_dir, exist_ok=True)

In [7]:
# Write the wide condition table to CSV. individual_id is a regular column of
# df2, so the positional index is left out of the file.
df_file = os.path.join(data_dir, "individual_conditions.csv")
df2.to_csv(path_or_buf=df_file, index=False)

# Combine purified data with rest of condition data

In [8]:
# Per-condition detail fields (age of onset, free-text description, onset
# symptoms). LEFT JOINs keep individuals even when they have no condition rows.
condition_details_sql = """
SELECT i.individual_id, c.condition, ic.age_of_onset, ic.description, ic.onset_symptoms
FROM individual i
LEFT JOIN individual_condition ic
ON i.individual_id = ic.individual_id
LEFT JOIN condition c
ON ic.condition_id = c.condition_id
"""
df_all_condition = pd.read_sql(sql=condition_details_sql, con=engine)
df_all_condition.head()

Unnamed: 0,individual_id,condition,age_of_onset,description,onset_symptoms
0,1,Catecholaminergic polymorphic ventricular tach...,,,
1,1,Heart Structure Abnormality,,,
2,1,Baseline/resting electrocardiogram abnormality,,,
3,1,Polymorphic ventricular tachycardia,,,
4,1,Exercise/stress induced polymorphic ventricula...,,,


In [9]:
# Summarise column dtypes and non-null counts of the condition detail table.
df_all_condition.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13774 entries, 0 to 13773
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   individual_id   13774 non-null  int64  
 1   condition       13774 non-null  object 
 2   age_of_onset    411 non-null    float64
 3   description     264 non-null    object 
 4   onset_symptoms  314 non-null    object 
dtypes: float64(1), int64(1), object(3)
memory usage: 538.2+ KB


In [10]:
# Preview: collapse every non-null description of an individual into a single
# comma-separated string (empty string when there are none).
descriptions_grouped = df_all_condition.groupby("individual_id")["description"]
descriptions_grouped.apply(lambda texts: ", ".join(texts.dropna())).reset_index()

Unnamed: 0,individual_id,description
0,1,
1,2,
2,3,
3,4,
4,5,
...,...,...
1350,1351,"Left ventricular hypertrabeculation at apex, B..."
1351,1352,
1352,1353,Abnormal RV relaxation
1353,1354,Prominent left ventricular trabeculation


In [11]:
# Spread the onset details into one column per (field, condition) pair, then
# discard the pairs that carry no data at all.
onset_wide = (
    df_all_condition.pivot(
        index="individual_id",
        columns="condition",
        values=["age_of_onset", "onset_symptoms"],
    )
    .reset_index()
    .dropna(axis=1, how="all")
)
# Flatten the two-level column labels to "field::condition"; the trailing-colon
# strip turns the ("individual_id", "") label back into plain "individual_id".
onset_wide.columns = [
    "::".join(levels).strip().strip(":") for levels in onset_wide.columns.values
]
# Use individual_id as the index so the descriptions can be joined on it.
onset_wide = onset_wide.set_index("individual_id")

# All non-null descriptions of an individual, merged into one comma-separated string.
descriptions = df_all_condition.groupby("individual_id")["description"].apply(
    lambda texts: ", ".join(texts.dropna())
)

# Attach the merged descriptions and expose them as "other conditions".
df3_pivot = onset_wide.join(descriptions).rename(
    columns={"description": "other conditions"}
)
df3_pivot.head()

Unnamed: 0_level_0,age_of_onset::Catecholaminergic polymorphic ventricular tachycardia 1,onset_symptoms::Catecholaminergic polymorphic ventricular tachycardia 1,other conditions
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,,,
2,,none,
3,,,
4,2.0,,
5,10.0,,


In [12]:
# save to csv
# individual_id is the index of df3_pivot, so the index has to be written out;
# passing index=False here would leave the rows without any identifier. This
# also matches how the other pivot tables in this notebook are exported.
df3_pivot.to_csv(
    os.path.join(data_dir, "individual_conditions_descriptions.csv"),
)

## Family History Conditions

In [13]:
# Family history flags per individual, condition and relative (kinship name).
# NOTE(review): the inner JOINs on family_member_history / kinship_name come
# after the LEFT JOINs, so individuals without member-level history rows are
# filtered out despite the LEFT JOINs — confirm that this is intended.
family_member_sql = """
SELECT i.individual_id,
       c.condition,
       fmh.has_condition,
       kn.name AS relationship
FROM individual i
         LEFT JOIN family_history_record fhr
                   ON i.individual_id = fhr.individual_id
         LEFT JOIN condition c
                   ON fhr.condition_id = c.condition_id
         JOIN family_member_history fmh
                   ON fhr.family_history_record_id =
                      fmh.family_history_record_id
         JOIN kinship_name kn
                   ON fmh.kinship_name_id = kn.kinship_name_id
ORDER BY i.individual_id
"""
df4_pre = pd.read_sql(sql=family_member_sql, con=engine)
df4_pre.head()

Unnamed: 0,individual_id,condition,has_condition,relationship
0,1,Sudden cardiac death,True,Mother
1,1,Sudden cardiac death,False,Father
2,10,Sudden cardiac death,True,Mother
3,10,Sudden cardiac death,False,Father
4,11,Sudden cardiac death,True,Mother


In [14]:
# One column per (condition, relationship) pair holding has_condition,
# with the two-level labels flattened to "condition::relationship".
family_flags = df4_pre.pivot(
    index="individual_id",
    columns=["condition", "relationship"],
    values="has_condition",
).reset_index()
family_flags.columns = [
    "::".join(levels).strip().strip(":") for levels in family_flags.columns.values
]
df4_1_pivot = family_flags.set_index("individual_id")
df4_1_pivot.head()

Unnamed: 0_level_0,Sudden cardiac death::Mother,Sudden cardiac death::Father
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,True,False
10,True,False
11,True,False
14,True,False
15,True,False


In [15]:
# Family history records that state how many family members are affected,
# largest counts first.
family_counts_sql = """
SELECT i.individual_id,
       c.condition,
       num_family_members
FROM individual i
         JOIN family_history_record fhr
                   ON i.individual_id = fhr.individual_id
         JOIN condition c
                   ON fhr.condition_id = c.condition_id
WHERE num_family_members IS NOT NULL
ORDER BY num_family_members DESC
"""
df4_2_pre = pd.read_sql(sql=family_counts_sql, con=engine)
df4_2_pre.head()

Unnamed: 0,individual_id,condition,num_family_members
0,991,Sudden cardiac death,36
1,952,Sudden cardiac death,36
2,954,Sudden cardiac death,36
3,956,Sudden cardiac death,36
4,958,Sudden cardiac death,36


In [16]:
# One column per condition holding the number of affected family members.
df4_2_pivot = df4_2_pre.pivot(
    index="individual_id", columns="condition", values="num_family_members"
).reset_index()

# Tag every condition column with "::num_family_members"; the key column keeps its name.
suffixed_labels = []
for label in df4_2_pivot.columns.values:
    if label == "individual_id":
        suffixed_labels.append(label)
    else:
        suffixed_labels.append(f"{label}::num_family_members")
df4_2_pivot.columns = suffixed_labels

df4_2_pivot = df4_2_pivot.set_index("individual_id")
df4_2_pivot.head()

Unnamed: 0_level_0,Sudden cardiac death::num_family_members
individual_id,Unnamed: 1_level_1
4,0
9,0
13,0
14,0
15,0


In [17]:
# Merge the per-relative flags with the family-member counts on individual_id.
# An outer join is used because an individual may appear in only one of the two.
df4_pivot = df4_1_pivot.join(other=df4_2_pivot, how="outer")
df4_pivot.head()

Unnamed: 0_level_0,Sudden cardiac death::Mother,Sudden cardiac death::Father,Sudden cardiac death::num_family_members
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,True,False,
4,,,0.0
9,,,0.0
10,True,False,
11,True,False,


In [18]:
# Export the family history table; the individual_id index is written as the first column.
family_csv_path = os.path.join(data_dir, "individual_family_conditions.csv")
df4_pivot.to_csv(family_csv_path)

## Treatments

In [19]:
# Treatment records per individual: whether each treatment was taken and
# whether it was effective, together with the treatment name.
treatments_sql = """
SELECT i.individual_id,
       tr.treatment_taken,
       tr.effective,
       t.treatment_name
FROM individual i
         JOIN treatment_record tr
                   ON i.individual_id = tr.patient_id
         LEFT JOIN treatment t
                   ON tr.treatment_id = t.treatment_id
ORDER BY i.individual_id
"""
df5 = pd.read_sql(sql=treatments_sql, con=engine)
df5.head()

Unnamed: 0,individual_id,treatment_taken,effective,treatment_name
0,2,True,False,Beta blocker
1,2,False,,Verapamil
2,2,True,True,Flecainide
3,2,False,,Enalapril
4,2,False,,Left cardiac sympathetic denervation


In [20]:
# One column per (field, treatment) pair for treatment_taken and effective,
# with the two-level labels flattened to "field::treatment".
treatments_wide = df5.pivot(
    index="individual_id",
    columns="treatment_name",
    values=["treatment_taken", "effective"],
).reset_index()
treatments_wide.columns = [
    "::".join(levels).strip().strip(":") for levels in treatments_wide.columns.values
]
df5_pivot = treatments_wide.set_index("individual_id")
df5_pivot.head()

Unnamed: 0_level_0,treatment_taken::Beta blocker,treatment_taken::Catheter ablation,treatment_taken::Enalapril,treatment_taken::Flecainide,treatment_taken::Implantable cardioverter-defibrillator,treatment_taken::Left cardiac sympathetic denervation,treatment_taken::Verapamil,effective::Beta blocker,effective::Catheter ablation,effective::Enalapril,effective::Flecainide,effective::Implantable cardioverter-defibrillator,effective::Left cardiac sympathetic denervation,effective::Verapamil
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,True,False,False,True,False,False,False,False,,,True,,,
4,True,False,False,False,False,False,False,,,,,,,
6,False,,,,,,,,,,,,,
7,False,,,,,,,,,,,,,
8,False,False,False,False,True,False,False,,,,,True,,


In [21]:
# Export the treatment table; the individual_id index is written as the first column.
treatments_csv_path = os.path.join(data_dir, "individual_treatments.csv")
df5_pivot.to_csv(treatments_csv_path)

# Combine all data

In [22]:
# Core per-individual data: sex, variant (with zygosity, inheritance and HGVS
# strings) and the publication the individual was reported in. Every table is
# LEFT JOINed so individuals are kept even when a piece is missing; an
# individual with several variants or publications yields several rows.
individual_core_sql = """
SELECT i.individual_id,
       iis.value              AS sex,
       iv.variant_id,
       z.zygosity,
       vi.variant_inheritance AS inheritance,
       v.hgvs_string,
       sv.p_hgvs_string,
       itp.publication_id,
       p.title,
       p.first_author,
       p.pmid,
       p.reference,
       p.doi,
       p.year,
       i.extra_information
FROM individual i
         LEFT JOIN individual_sex iis
                   ON i.individual_sex_id = iis.individual_sex_id
         LEFT JOIN individual_variant iv
                   ON i.individual_id = iv.individual_id
         LEFT JOIN zygosity z
                   ON iv.zygosity_id = z.zygosity_id
         LEFT JOIN variant_inheritance vi
                   ON iv.variant_inheritance_id = vi.variant_inheritance_id
         LEFT JOIN variant v
                   ON iv.variant_id = v.variant_id
         LEFT JOIN individual_to_publication itp
                   ON i.individual_id = itp.individual_id
         LEFT JOIN publication p
                   ON itp.publication_id = p.publication_id
         LEFT JOIN sequence_variant sv
                   ON v.sequence_variant_id = sv.sequence_variant_id
ORDER BY i.individual_id;
"""
df_rest = pd.read_sql(sql=individual_core_sql, con=engine)
df_rest.head()

Unnamed: 0,individual_id,sex,variant_id,zygosity,inheritance,hgvs_string,publication_id,title,first_author,pmid,reference,doi,year,extra_information
0,1,,8027,heterozygous,inherited,NM_001035.3:c.13564-41A>G,1,Familial Evaluation in Catecholaminergic Polym...,Van Der Werf C,22787013.0,https://www.ncbi.nlm.nih.gov/pubmed?cmd=Retrie...,,2012.0,"{'basic_science': False, 'maternity_paternity_..."
1,2,female,7854,,,NM_001035.3:c.14885A>G,2,Flecainide therapy reduces exercise-induced ve...,"van der Werf, C",21616285.0,https://www.ncbi.nlm.nih.gov/pubmed/21616285,,2011.0,{'basic_science': False}
2,3,male,7490,,,NM_001035.3:c.14173T>A,3,Structural abnormalities on cardiac magnetic r...,"Gerber, D",32553227.0,https://www.sciencedirect.com/science/article/...,,2020.0,"{'basic_science': False, 'maternity_paternity_..."
3,4,female,7491,heterozygous,,NM_001035.3:c.14174A>G,4,Genetic Background of Catecholaminergic Polymo...,"Kawamura, M",23595086.0,https://www.ncbi.nlm.nih.gov/pubmed/23595086,,2013.0,{'basic_science': False}
4,5,male,7491,,spontaneous,NM_001035.3:c.14174A>G,5,Gender differences in the inheritance mode of ...,"Ohno, S.",26114861.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,,2015.0,"{'basic_science': False, 'maternity_paternity_..."


In [23]:
# Combine everything into one wide table keyed by individual_id.
#
# df2 holds individual_id as an ordinary column on a 0-based positional index,
# so joining it as-is would match rows by position instead of by individual and
# attach the wrong condition flags. Index it by individual_id first so all four
# joins below align on the same key. (The guard keeps the cell working if df2
# is already indexed by individual_id.)
conditions_by_individual = (
    df2.set_index("individual_id") if "individual_id" in df2.columns else df2
)
df_all = (
    df_rest.set_index("individual_id")
    .join(conditions_by_individual)
    .join(df3_pivot)
    .join(df4_pivot)
    .join(df5_pivot)
)
# No stray individual_id column is left to drop: the key lives in the index.
df_all.head()

Unnamed: 0_level_0,sex,variant_id,zygosity,inheritance,hgvs_string,publication_id,title,first_author,pmid,reference,...,treatment_taken::Implantable cardioverter-defibrillator,treatment_taken::Left cardiac sympathetic denervation,treatment_taken::Verapamil,effective::Beta blocker,effective::Catheter ablation,effective::Enalapril,effective::Flecainide,effective::Implantable cardioverter-defibrillator,effective::Left cardiac sympathetic denervation,effective::Verapamil
individual_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,8027,heterozygous,inherited,NM_001035.3:c.13564-41A>G,1,Familial Evaluation in Catecholaminergic Polym...,Van Der Werf C,22787013.0,https://www.ncbi.nlm.nih.gov/pubmed?cmd=Retrie...,...,,,,,,,,,,
2,female,7854,,,NM_001035.3:c.14885A>G,2,Flecainide therapy reduces exercise-induced ve...,"van der Werf, C",21616285.0,https://www.ncbi.nlm.nih.gov/pubmed/21616285,...,False,False,False,False,,,True,,,
3,male,7490,,,NM_001035.3:c.14173T>A,3,Structural abnormalities on cardiac magnetic r...,"Gerber, D",32553227.0,https://www.sciencedirect.com/science/article/...,...,,,,,,,,,,
4,female,7491,heterozygous,,NM_001035.3:c.14174A>G,4,Genetic Background of Catecholaminergic Polymo...,"Kawamura, M",23595086.0,https://www.ncbi.nlm.nih.gov/pubmed/23595086,...,False,False,False,,,,,,,
5,male,7491,,spontaneous,NM_001035.3:c.14174A>G,5,Gender differences in the inheritance mode of ...,"Ohno, S.",26114861.0,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,...,,,,,,,,,,


In [1]:
import datetime

# Today's date as YYYY-MM-DD; used to stamp the export file names below.
date = datetime.date.today().isoformat()

date

'2024-07-05'

In [None]:
# Save the combined table to an Excel workbook stamped with the export date.
excel_path = os.path.join(data_dir, f"individual_all_data-{date}.xlsx")
df_all.to_excel(excel_path)

In [None]:
# Load every publication (not only those linked to an individual), ordered by title.
publications_sql = """
SELECT p.publication_id, p.title, p.first_author, p.pmid, p.reference, p.doi, p.year
FROM publication p
ORDER BY p.title
"""
df_publications = pd.read_sql(sql=publications_sql, con=engine)
df_publications.head()

In [None]:
# Write the publication list to a date-stamped CSV without the positional index.
publications_csv_path = os.path.join(data_dir, f"publications-{date}.csv")
df_publications.to_csv(publications_csv_path, index=False)