# Summary Statistics


In [9]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import sqlalchemy as sa
from sqlalchemy import text

In [10]:
# find the current working directory
import os
import IPython

notebook_path = IPython.get_ipython().getoutput('pwd')[0]

data_dir = os.path.join(notebook_path, '..', 'data')

if not os.path.exists(data_dir):
    print(f'Creating directory: {data_dir}')
    os.makedirs(data_dir)

In [11]:
from datetime import datetime

date = datetime.now().strftime("%Y-%m-%d")

# create dir for figures
figures_dir = os.path.join(notebook_path, "..", "figures", "analysis_5", date)

if not os.path.exists(figures_dir):
    print(f"Creating directory: {figures_dir}")
    os.makedirs(figures_dir)

In [12]:
sns.set_style("ticks")

DEFAULT_COLOR = sns.color_palette("husl", 9)[6]

In [13]:
def add_bar_labels(ax, fmt="{:.0f}", ignore_zero=True):
    for bar in ax.patches:

        if bar.get_height() == 0 and ignore_zero:
            continue

        # Use ax.text() to add text over the bars
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
                fmt.format(bar.get_height()),
                ha='center', va='bottom')


def save_current_plot(name: str):
    for fmt in ["png", "pdf", "svg"]:
        plt.savefig(
            os.path.join(figures_dir, name + '.' + fmt), dpi=300)

In [14]:
# read db connection from .env file
from helpers.settings import get_settings

settings = get_settings()

In [15]:
engine = sa.create_engine(settings.postgresql_dsn.__str__())

In [55]:
def qry(sql: str):
    with engine.connect() as conn:
        res = conn.execute(text(sql)).scalar()

    return res


tot_variants = qry(
    """
    SELECT COUNT(DISTINCT variant_id)
    FROM individual_variant
    """
)

print(f"Number of variants: {tot_variants} (TOTAL)")

num_cpvt_variants = qry(
    """
    SELECT COUNT(DISTINCT variant_id) AS num_patients
    FROM individual_variant
    JOIN cpvt_patients_v cp
    ON cp.individual_id = individual_variant.individual_id
    """
)

print(f"Number of variants in CPVT patients: {num_cpvt_variants}")

num_tot_patients = qry(
    """
    SELECT COUNT(individual_id) AS num_patients
    FROM individual
"""
)
print(f"Number of patients: {num_tot_patients} (TOTAL)")

num_cpvt_patients = qry(
    """
    SELECT COUNT(DISTINCT individual_id) AS num_cpvt_patients
    FROM cpvt_patients_v
    """
)

print(f"Number of CPVT patients: {num_cpvt_patients}")

num_pubs = qry(
    """
    SELECT COUNT(DISTINCT p.publication_id)
    FROM publication p
    JOIN individual_to_publication itp 
    ON p.publication_id = itp.publication_id
    """
)

print(f"Number of publications: {num_pubs}")

hgvs_p_parsable = qry(
    """
    SELECT COUNT(DISTINCT sv.sequence_variant_id) AS num_protein_variants_hgvs_parsed
    FROM individual_variant iv
    JOIN variant v
    ON iv.variant_id = v.variant_id
    JOIN sequence_variant sv
    ON v.sequence_variant_id = sv.sequence_variant_id
    WHERE sv.p_hgvs_string IS NOT NULL
    """
)

print(f"Number of PROTEIN variant_ids with parsed HGVS: {hgvs_p_parsable} (TOTAL)")

hgvs_cpvt_p_parsable = qry(
    """
    SELECT COUNT(DISTINCT sv.sequence_variant_id) AS num_protein_variants_hgvs_parsed
    FROM individual_variant iv
    JOIN variant v
    ON iv.variant_id = v.variant_id
    JOIN sequence_variant sv
    ON v.sequence_variant_id = sv.sequence_variant_id
    JOIN cpvt_patients_v cp
    ON cp.individual_id = iv.individual_id
    WHERE sv.p_hgvs_string IS NOT NULL
    """
)

print(f"Number of PROTEIN variant_ids with parsed HGVS in CPVT patients: {hgvs_cpvt_p_parsable}")

hgvs_variants_total_patients = qry(
    """
    SELECT COUNT(DISTINCT sv.sequence_variant_id) AS num_hgvs_parsable
    FROM individual_variant iv
    JOIN variant v
    ON iv.variant_id = v.variant_id
    JOIN sequence_variant sv
    ON v.sequence_variant_id = sv.sequence_variant_id
    WHERE sv.sequence_variant_id IS NOT NULL
    """
)

print(f"Number of unique variant_ids (protein/dna) parsable: {hgvs_variants_total_patients} in (TOTAL)")

hgvs_any_parsable_cpvt = qry(
    """
    SELECT COUNT(DISTINCT sv.sequence_variant_id) AS num_hgvs_parsable
    FROM individual_variant iv
    JOIN variant v
    ON iv.variant_id = v.variant_id
    JOIN sequence_variant sv
    ON v.sequence_variant_id = sv.sequence_variant_id
    JOIN cpvt_patients_v cp
    ON cp.individual_id = iv.individual_id
    WHERE sv.sequence_variant_id IS NOT NULL
    """
)

print(f"Number of unique variant_ids (protein/dna) parsable: {hgvs_any_parsable_cpvt} in CPVT patients")

hgvs_only_hgvs_str = qry(
    """
    SELECT COUNT(DISTINCT sv.p_hgvs_string) AS num_unique_proteins
    FROM individual_variant iv
    JOIN variant v
    ON iv.variant_id = v.variant_id
    JOIN sequence_variant sv
    ON v.sequence_variant_id = sv.sequence_variant_id
    WHERE sv.sequence_variant_id IS NOT NULL
    AND sv.p_hgvs_string IS NOT NULL
    """
)

print(f"Number of unique protein HGVS STRINGS: {hgvs_only_hgvs_str} (TOTAL)")

hgvs_only_hgvs_str_cpvt = qry(
    """
    SELECT COUNT(DISTINCT sv.p_hgvs_string) AS num_unique_proteins
    FROM individual_variant iv
    JOIN variant v
    ON iv.variant_id = v.variant_id
    JOIN sequence_variant sv
    ON v.sequence_variant_id = sv.sequence_variant_id
    JOIN cpvt_patients_v cp
    ON cp.individual_id = iv.individual_id
    WHERE sv.sequence_variant_id IS NOT NULL
    AND sv.p_hgvs_string IS NOT NULL
    """
)

print(f"Number of unique protein HGVS STRINGS: {hgvs_only_hgvs_str_cpvt} in CPVT patients")

Number of variants: 393 (TOTAL)
Number of variants in CPVT patients: 284
Number of patients: 1347 (TOTAL)
Number of CPVT patients: 968
Number of publications: 225
Number of PROTEIN variant_ids with parsed HGVS: 382 (TOTAL)
Number of PROTEIN variant_ids with parsed HGVS in CPVT patients: 275
Number of unique variant_ids (protein/dna) parsable: 392 in (TOTAL)
Number of unique variant_ids (protein/dna) parsable: 283 in CPVT patients
Number of unique protein HGVS STRINGS: 381 (TOTAL)
Number of unique protein HGVS STRINGS: 274 in CPVT patients


In [32]:
# get the list of variants not parsable
variants_not_parsable = pd.read_sql("""
SELECT v.variant_id,
    v.hgvs_string
FROM variant v
JOIN individual_variant iv
ON v.variant_id = iv.variant_id
WHERE v.sequence_variant_id IS NULL
""", engine)
variants_not_parsable

Unnamed: 0,variant_id,hgvs_string
0,9225,exon 3 deletion
1,9225,exon 3 deletion
2,9225,exon 3 deletion
3,9225,exon 3 deletion
4,9225,exon 3 deletion
5,9225,exon 3 deletion
6,9225,exon 3 deletion
7,9225,exon 3 deletion
8,9225,exon 3 deletion
9,9225,exon 3 deletion


In [28]:
def add_pct(df: pd.DataFrame, cat_col: str, col: str = "num_patients"):
    df["percentage"] = df[col] / df[col].sum() * 100

    df["percentage_exclude_nan"] = df.dropna(subset=[cat_col])[col] / df.dropna(subset=[cat_col])[col].sum() * 100

    return df

# Sex of patients

In [30]:
# get number male, female, NAN
sex_agg = pd.read_sql("""
SELECT s.value AS sex,
    COUNT(DISTINCT individual_id) AS num_patients
FROM individual i 
LEFT JOIN individual_sex s
ON i.individual_sex_id = s.individual_sex_id
GROUP BY s.value 
""", engine)
add_pct(sex_agg, "sex")
sex_agg

Unnamed: 0,sex,num_patients,percentage,percentage_exclude_nan
0,female,645,47.884187,54.987212
1,male,528,39.198218,45.012788
2,,174,12.917595,


In [31]:
# only cpvt patients
only_cpvt_sex_agg = pd.read_sql("""
SELECT s.value AS sex,
    COUNT(DISTINCT i.individual_id) AS num_patients
FROM cpvt_patients_v cpv 
LEFT JOIN individual i
ON cpv.individual_id = i.individual_id
LEFT JOIN individual_sex s
ON i.individual_sex_id = s.individual_sex_id
GROUP BY s.value
""", engine)
add_pct(only_cpvt_sex_agg, "sex")
only_cpvt_sex_agg

Unnamed: 0,sex,num_patients,percentage,percentage_exclude_nan
0,female,464,47.933884,56.862745
1,male,352,36.363636,43.137255
2,,152,15.702479,


# Inheritance

In [34]:
# mutation inheritance
inheritance_agg = pd.read_sql("""
SELECT vi.variant_inheritance AS inheritance,
    COUNT (DISTINCT iv.individual_id) AS num_patients
FROM individual_variant iv
JOIN variant_inheritance vi
ON iv.variant_inheritance_id = vi.variant_inheritance_id
GROUP BY variant_inheritance
""", engine)
add_pct(inheritance_agg, "inheritance")
inheritance_agg

Unnamed: 0,inheritance,num_patients,percentage,percentage_exclude_nan
0,inherited,251,71.509972,71.509972
1,spontaneous,100,28.490028,28.490028


In [35]:
total_inheritance_records = inheritance_agg['num_patients'].sum()
total_inheritance_records

np.int64(351)

In [36]:
# get edit type of p variant
p_edit_type = pd.read_sql("""
SELECT et.name AS edit_type,
    COUNT(DISTINCT v.variant_id) AS num_variants
FROM individual_variant iv
JOIN variant v
ON iv.variant_id = v.variant_id
LEFT JOIN  sequence_variant sv
ON v.sequence_variant_id = sv.sequence_variant_id
LEFT JOIN edit_type et
ON sv.p_edit_type = et.edit_type_id
GROUP BY name
""", engine)
# calculate pct
add_pct(p_edit_type, "edit_type", "num_variants")
p_edit_type

Unnamed: 0,edit_type,num_variants,percentage,percentage_exclude_nan
0,Deletion,5,1.272265,1.308901
1,Frameshift,1,0.254453,0.26178
2,Insertion,3,0.763359,0.78534
3,Substitution,373,94.910941,97.643979
4,,11,2.798982,


In [37]:
p_edit_type_by_patient = pd.read_sql("""
SELECT et.name AS edit_type,
    COUNT(iv.individual_id) AS num_patients
FROM individual_variant iv
JOIN variant v
ON iv.variant_id = v.variant_id
LEFT JOIN  sequence_variant sv
ON v.sequence_variant_id = sv.sequence_variant_id
LEFT JOIN edit_type et
ON sv.p_edit_type = et.edit_type_id
WHERE name IS NOT NULL
GROUP BY name
""", engine)
# calculate pct
add_pct(p_edit_type_by_patient, "edit_type", "num_patients")
p_edit_type_by_patient

Unnamed: 0,edit_type,num_patients,percentage,percentage_exclude_nan
0,Deletion,26,1.977186,1.977186
1,Insertion,4,0.304183,0.304183
2,Substitution,1282,97.490494,97.490494
3,Frameshift,3,0.228137,0.228137


In [38]:
zygosity_agg = pd.read_sql("""
                           SELECT z.zygosity,
                            COUNT(iv.individual_id) AS num_patients
                            FROM individual_variant iv
                            JOIN zygosity z
                            ON iv.zygosity_id = z.zygosity_id
                            GROUP BY z.zygosity
                            """, engine)
add_pct(zygosity_agg, "zygosity")
zygosity_agg

Unnamed: 0,zygosity,num_patients,percentage,percentage_exclude_nan
0,heterozygous,632,99.059561,99.059561
1,homozygous,6,0.940439,0.940439


In [17]:
total_zygosity_records = zygosity_agg['num_patients'].sum()
total_zygosity_records

np.int64(638)

# Only CPVT Patients


In [49]:
# conditions
conditions_to_graph = [
    "Exercise/stress induced polymorphic ventricular tachycardia",
    "Syncope exercise/stress induced",
    "Exercise/stress induced sudden cardiac arrest",
    "Baseline/resting electrocardiogram abnormality",
    "Heart Structure Abnormality",
]


def make_condition_qry(
        condition: str
):
    stmt = text("""
    SELECT
    c.condition,
    ic.has_condition,
    COUNT(ic.individual_id) AS num_patients
    FROM individual_condition ic
    JOIN condition c
    ON c.condition_id = ic.condition_id
    WHERE c.condition ILIKE :condition
    AND ic.has_condition IS NOT NULL
    AND ic.individual_id IN (
        SELECT individual_id
        FROM individual_condition ic2
        JOIN condition c2
        ON c2.condition_id = ic2.condition_id
        WHERE c2.condition = 'Catecholaminergic polymorphic ventricular tachycardia 1'
        AND ic2.has_condition = true
    )   
    GROUP BY c.condition, ic.has_condition
    ORDER BY c.condition, ic.has_condition
    """)

    return stmt.bindparams(condition=condition)


all_dfs = []

for condition in conditions_to_graph:
    df_condition = pd.read_sql(make_condition_qry(condition), engine)
    add_pct(df_condition, "has_condition")

    # print total in group
    total_in_group = df_condition['num_patients'].sum()
    print(f"Total patients with {condition}: {total_in_group}")

    all_dfs.append(df_condition)

# merge all dataframes
all_conditions = pd.concat(all_dfs)
all_conditions

Total patients with Exercise/stress induced polymorphic ventricular tachycardia: 323
Total patients with Syncope exercise/stress induced: 271
Total patients with Exercise/stress induced sudden cardiac arrest: 113
Total patients with Baseline/resting electrocardiogram abnormality: 465
Total patients with Heart Structure Abnormality: 494


Unnamed: 0,condition,has_condition,num_patients,percentage,percentage_exclude_nan
0,Exercise/stress induced polymorphic ventricula...,False,7,2.167183,2.167183
1,Exercise/stress induced polymorphic ventricula...,True,316,97.832817,97.832817
0,Syncope exercise/stress induced,False,9,3.321033,3.321033
1,Syncope exercise/stress induced,True,262,96.678967,96.678967
0,Exercise/stress induced sudden cardiac arrest,False,16,14.159292,14.159292
1,Exercise/stress induced sudden cardiac arrest,True,97,85.840708,85.840708
0,Baseline/resting electrocardiogram abnormality,False,388,83.44086,83.44086
1,Baseline/resting electrocardiogram abnormality,True,77,16.55914,16.55914
0,Heart Structure Abnormality,False,449,90.890688,90.890688
1,Heart Structure Abnormality,True,45,9.109312,9.109312


# TREATMENTS

Must `JOIN` with `cpvt_patients_v` to get only CPVT patients

In [61]:
# get num patients with detailed treatment records
num_patients = pd.read_sql("""
SELECT COUNT(DISTINCT patient_id) AS num_patients
FROM treatment_record tr
JOIN treatment t
ON tr.treatment_id = t.treatment_id
WHERE tr.patient_id IN (
    SELECT individual_id
    FROM cpvt_patients_v
) AND
tr.treatment_taken IS NOT NULL
""", engine)
num_patients

Unnamed: 0,num_patients
0,605


In [62]:
# beta blocker + any other treatment
num_patients_bb = pd.read_sql("""
SELECT COUNT(DISTINCT patient_id) AS num_patients
FROM treatment_record tr
JOIN treatment t
ON tr.treatment_id = t.treatment_id
WHERE tr.patient_id IN (
    SELECT individual_id
    FROM cpvt_patients_v
) AND
tr.treatment_taken = TRUE AND
treatment_name = 'Beta blocker'
""", engine)
num_patients_bb

Unnamed: 0,num_patients
0,434


In [63]:
num_patients_bb['num_patients'] / num_patients['num_patients']

0    0.717355
Name: num_patients, dtype: float64

In [19]:
# get num patients with beta blocker + 
# Verapamil
# Flecainide
# Enalapril

num_patients_poly_pharm = pd.read_sql("""
SELECT COUNT(DISTINCT patient_id) AS num_patients
FROM treatment_record tr
JOIN treatment t
ON tr.treatment_id = t.treatment_id
WHERE tr.patient_id IN (
    SELECT individual_id
    FROM cpvt_patients_v
) AND
tr.treatment_taken = TRUE AND 
tr.patient_id IN (
    SELECT tr.patient_id
    FROM treatment_record tr
    JOIN treatment t
    ON tr.treatment_id = t.treatment_id
    WHERE t.treatment_name = 'Beta blocker'
    AND tr.treatment_taken = TRUE
)
AND tr.patient_id IN (
    SELECT tr.patient_id
    FROM treatment_record tr
    JOIN treatment t
    ON tr.treatment_id = t.treatment_id
    WHERE t.treatment_name IN ('Flecainide', 'Verapamil', 'Enalapril')
    AND tr.treatment_taken = TRUE
)
""", engine)

num_patients_poly_pharm

Unnamed: 0,num_patients
0,132


In [20]:
# calculate percentage
num_patients_poly_pharm['num_patients'] / num_patients['num_patients']

0    0.218182
Name: num_patients, dtype: float64

In [21]:
# invasive treatment
# Left cardiac sympathetic denervation
# Implantable cardioverter-defibrillator
# Catheter ablation
num_patients_invasive = pd.read_sql("""
SELECT COUNT(DISTINCT patient_id) AS num_patients
FROM treatment_record tr
JOIN treatment t
ON tr.treatment_id = t.treatment_id
WHERE tr.patient_id IN (
    SELECT individual_id
    FROM cpvt_patients_v
) AND
tr.treatment_taken = TRUE AND 
tr.patient_id IN (
    SELECT tr.patient_id
    FROM treatment_record tr
    JOIN treatment t
    ON tr.treatment_id = t.treatment_id
    WHERE t.treatment_name = 'Beta blocker'
    AND tr.treatment_taken = TRUE
)
AND tr.patient_id IN (
    SELECT tr.patient_id
    FROM treatment_record tr
    JOIN treatment t
    ON tr.treatment_id = t.treatment_id
    WHERE t.treatment_name IN ('Left cardiac sympathetic denervation', 'Implantable cardioverter-defibrillator', 'Catheter ablation')
    AND tr.treatment_taken = TRUE
)
""", engine)

num_patients_invasive

Unnamed: 0,num_patients
0,130


In [22]:
num_patients_invasive['num_patients'] / num_patients['num_patients']

0    0.214876
Name: num_patients, dtype: float64