# Summary Statistics


In [1]:
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import sqlalchemy as sa
from sqlalchemy import text

In [2]:
# find the current working directory
import os
import IPython

# Resolve the working directory in-process. os.getcwd() avoids spawning a
# shell for `pwd`, so this also works when the notebook is executed outside an
# IPython kernel and on platforms that have no `pwd` command.
notebook_path = os.getcwd()

# The data directory sits next to the notebook directory (../data).
data_dir = os.path.join(notebook_path, '..', 'data')

# Only announce (and create) the directory when it is missing. exist_ok=True
# keeps the creation safe if the directory appears between check and call.
if not os.path.exists(data_dir):
    print(f'Creating directory: {data_dir}')
    os.makedirs(data_dir, exist_ok=True)

In [3]:
from datetime import datetime

# Today's date (YYYY-MM-DD); used as the last path component so that each
# day's run writes its figures into a separate folder.
date = f"{datetime.now():%Y-%m-%d}"

# Figures are stored in ../figures/analysis_5/<date>, relative to the notebook directory.
figures_dir = os.path.join(notebook_path, "..", "figures", "analysis_5", date)

# Create the folder (including parents) on first use.
if not os.path.exists(figures_dir):
    print(f"Creating directory: {figures_dir}")
    os.makedirs(figures_dir)

In [4]:
# Global seaborn look for every figure in this notebook.
sns.set_style("ticks")

# Default bar/line color: entry at index 6 of a 9-color "husl" palette.
DEFAULT_COLOR = sns.color_palette("husl", n_colors=9)[6]

In [5]:
def add_bar_labels(ax, fmt="{:.0f}", ignore_zero=True):
    """Write each bar's height as a text label just above the bar.

    Parameters
    ----------
    ax
        Axes-like object; its ``patches`` are iterated and ``ax.text`` is
        called once per labelled bar.
    fmt : str
        ``str.format`` template applied to the bar height.
    ignore_zero : bool
        When true, bars whose height equals 0 get no label.
    """
    for patch in ax.patches:
        height = patch.get_height()

        if ignore_zero and height == 0:
            continue

        # Horizontally centre the label on the bar; anchor its bottom edge
        # at the bar's height.
        x_center = patch.get_x() + patch.get_width() / 2
        ax.text(x_center, height, fmt.format(height), ha='center', va='bottom')


def save_current_plot(name: str):
    """Save the current matplotlib figure to ``figures_dir`` as PNG, PDF and SVG.

    ``name`` is the file stem; the extension is appended per format and every
    file is written with ``dpi=300``.
    """
    for extension in ("png", "pdf", "svg"):
        filename = name + '.' + extension
        target_path = os.path.join(figures_dir, filename)
        plt.savefig(target_path, dpi=300)

In [6]:
# read db connection from .env file
# NOTE(review): get_settings is a project-local helper; presumably it is what
# loads the .env file -- confirm in helpers/settings.py.
from helpers.settings import get_settings

# `settings.postgresql_dsn` is used below to build the SQLAlchemy engine.
settings = get_settings()

Loading settings ...


In [7]:
# Build the SQLAlchemy engine from the configured PostgreSQL DSN, converted to
# its string form with the str() builtin.
engine = sa.create_engine(str(settings.postgresql_dsn))

In [8]:
# Headline counts for the dataset, keyed by the label printed for each.
# Every query is an aggregate that returns exactly one row with one column,
# so the value is fetched uniformly with scalar_one(), which raises if that
# assumption is ever violated.
summary_count_queries = {
    "Number of patients TOTAL": """
        SELECT COUNT(individual_id) AS num_patients
        FROM individual
    """,
    "Number of variants": """
        SELECT COUNT(DISTINCT variant_id)
        FROM individual_variant
    """,
    "Number of CPVT patients": """
        SELECT COUNT(DISTINCT individual_id) AS num_cpvt_patients
        FROM cpvt_patients_v
    """,
    "Number of publications": """
        SELECT COUNT(publication_id)
        FROM publication
    """,
}

# One connection for all queries; dict order determines the print order.
with engine.connect() as conn:
    for label, query in summary_count_queries.items():
        count = conn.execute(text(query)).scalar_one()
        print(f"{label}: {count}")

Number of patients TOTAL: 1347
Number of variants: 393
Number of CPVT patients: 968
Number of publications: 227


In [9]:
# Patient counts per recorded sex. The LEFT JOIN keeps individuals without a
# matching individual_sex row; they end up in the group whose sex is NULL.
sex_agg = pd.read_sql(
    sql="""
SELECT s.value AS sex,
    COUNT(individual_id) AS num_patients
FROM individual i 
LEFT JOIN individual_sex s
ON i.individual_sex_id = s.individual_sex_id
GROUP BY s.value 
""",
    con=engine,
)
sex_agg

Unnamed: 0,sex,num_patients
0,,174
1,female,645
2,male,528


In [10]:
# Variant inheritance: number of distinct individuals per inheritance category.
# The inner JOIN drops individual_variant rows that have no matching
# variant_inheritance row.
inheritance_agg = pd.read_sql(
    sql="""
SELECT vi.variant_inheritance AS inheritance,
    COUNT (DISTINCT iv.individual_id) AS num_patients
FROM individual_variant iv
JOIN variant_inheritance vi
ON iv.variant_inheritance_id = vi.variant_inheritance_id
GROUP BY variant_inheritance
""",
    con=engine,
)
inheritance_agg

Unnamed: 0,inheritance,num_patients
0,inherited,251
1,spontaneous,100


In [11]:
# Each category's share (in percent) of the summed per-category patient counts.
inheritance_agg['percentage'] = (
    inheritance_agg['num_patients']
    .div(inheritance_agg['num_patients'].sum())
    .mul(100)
)
inheritance_agg

Unnamed: 0,inheritance,num_patients,percentage
0,inherited,251,71.509972
1,spontaneous,100,28.490028


In [12]:
# Sum of the per-category patient counts in inheritance_agg, i.e. the
# denominator used for its 'percentage' column.
total_inheritance_records = inheritance_agg['num_patients'].sum()
total_inheritance_records

np.int64(351)

In [13]:
# Edit type of the p variant (sv.p_edit_type), counted over distinct variants
# that occur in individual_variant. The LEFT JOINs keep variants without a
# sequence variant or edit type; those are grouped under a NULL edit_type.
p_edit_type = pd.read_sql(
    sql="""
SELECT et.name AS edit_type,
    COUNT(DISTINCT v.variant_id) AS num_variants
FROM individual_variant iv
JOIN variant v
ON iv.variant_id = v.variant_id
LEFT JOIN  sequence_variant sv
ON v.sequence_variant_id = sv.sequence_variant_id
LEFT JOIN edit_type et
ON sv.p_edit_type = et.edit_type_id
GROUP BY name
""",
    con=engine,
)
# Share of each edit type in percent of the summed variant counts.
p_edit_type['percentage'] = (
    p_edit_type['num_variants']
    .div(p_edit_type['num_variants'].sum())
    .mul(100)
)
p_edit_type

Unnamed: 0,edit_type,num_variants,percentage
0,Deletion,5,1.272265
1,Frameshift,1,0.254453
2,Insertion,3,0.763359
3,Substitution,373,94.910941
4,,11,2.798982


In [14]:
# Same edit-type breakdown as for variants, but counting distinct individuals
# per edit type instead of distinct variants.
p_edit_type_by_patient = pd.read_sql(
    sql="""
SELECT et.name AS edit_type,
    COUNT(DISTINCT iv.individual_id) AS num_patients
FROM individual_variant iv
JOIN variant v
ON iv.variant_id = v.variant_id
LEFT JOIN  sequence_variant sv
ON v.sequence_variant_id = sv.sequence_variant_id
LEFT JOIN edit_type et
ON sv.p_edit_type = et.edit_type_id
GROUP BY name
""",
    con=engine,
)
# Share of each edit type in percent of the summed per-type patient counts.
p_edit_type_by_patient['percentage'] = (
    p_edit_type_by_patient['num_patients']
    .div(p_edit_type_by_patient['num_patients'].sum())
    .mul(100)
)
p_edit_type_by_patient

Unnamed: 0,edit_type,num_patients,percentage
0,Deletion,26,1.930215
1,Frameshift,3,0.222717
2,Insertion,4,0.296956
3,Substitution,1282,95.174462
4,,32,2.37565


In [15]:
# Zygosity breakdown over individual_variant rows that have a zygosity set.
# NOTE(review): COUNT(iv.individual_id) is not DISTINCT, so an individual with
# several individual_variant rows is counted once per row even though the
# column is named num_patients -- confirm this is intended.
zygosity_agg = pd.read_sql(
    sql="""
                           SELECT z.zygosity,
                            COUNT(iv.individual_id) AS num_patients
                            FROM individual_variant iv
                            JOIN zygosity z
                            ON iv.zygosity_id = z.zygosity_id
                            GROUP BY z.zygosity
                            """,
    con=engine,
)
# Share of each zygosity in percent of the summed counts.
zygosity_agg['percentage'] = (
    zygosity_agg['num_patients']
    .div(zygosity_agg['num_patients'].sum())
    .mul(100)
)
zygosity_agg

Unnamed: 0,zygosity,num_patients,percentage
0,homozygous,6,0.940439
1,heterozygous,632,99.059561


In [16]:
# Sum of the per-zygosity counts in zygosity_agg, i.e. the denominator used
# for its 'percentage' column.
total_zygosity_records = zygosity_agg['num_patients'].sum()
total_zygosity_records

np.int64(638)

In [17]:
# Number of distinct CPVT patients (members of cpvt_patients_v) that have at
# least one treatment record whose treatment_taken flag is not NULL.
# This single-row frame is the denominator for the treatment ratios below.
num_patients = pd.read_sql(
    sql="""
SELECT COUNT(DISTINCT patient_id) AS num_patients
FROM treatment_record tr
JOIN treatment t
ON tr.treatment_id = t.treatment_id
WHERE tr.patient_id IN (
    SELECT individual_id
    FROM cpvt_patients_v
) AND
tr.treatment_taken IS NOT NULL
""",
    con=engine,
)
num_patients

Unnamed: 0,num_patients
0,605


In [30]:
# Number of distinct CPVT patients who took a beta blocker AND at least one of:
#   - Verapamil
#   - Flecainide
#   - Enalapril
# Both conditions are expressed as IN-subqueries over treatment records with
# treatment_taken = TRUE.

num_patients_poly_pharm = pd.read_sql(
    sql="""
SELECT COUNT(DISTINCT patient_id) AS num_patients
FROM treatment_record tr
JOIN treatment t
ON tr.treatment_id = t.treatment_id
WHERE tr.patient_id IN (
    SELECT individual_id
    FROM cpvt_patients_v
) AND
tr.treatment_taken = TRUE AND 
tr.patient_id IN (
    SELECT tr.patient_id
    FROM treatment_record tr
    JOIN treatment t
    ON tr.treatment_id = t.treatment_id
    WHERE t.treatment_name = 'Beta blocker'
    AND tr.treatment_taken = TRUE
)
AND tr.patient_id IN (
    SELECT tr.patient_id
    FROM treatment_record tr
    JOIN treatment t
    ON tr.treatment_id = t.treatment_id
    WHERE t.treatment_name IN ('Flecainide', 'Verapamil', 'Enalapril')
    AND tr.treatment_taken = TRUE
)
""",
    con=engine,
)

num_patients_poly_pharm

Unnamed: 0,num_patients
0,132


In [31]:
# Fraction (0-1, not multiplied by 100) of CPVT patients with treatment records
# (num_patients) who took a beta blocker plus one of the additional drugs.
num_patients_poly_pharm['num_patients'] / num_patients['num_patients']

0    0.218182
Name: num_patients, dtype: float64

In [28]:
# Number of distinct CPVT patients who received at least one invasive treatment:
#   - Left cardiac sympathetic denervation
#   - Implantable cardioverter-defibrillator
#   - Catheter ablation
# NOTE(review): the query additionally requires a taken 'Beta blocker' record,
# so this is "beta blocker + invasive treatment" -- confirm that is intended.
num_patients_invasive = pd.read_sql(
    sql="""
SELECT COUNT(DISTINCT patient_id) AS num_patients
FROM treatment_record tr
JOIN treatment t
ON tr.treatment_id = t.treatment_id
WHERE tr.patient_id IN (
    SELECT individual_id
    FROM cpvt_patients_v
) AND
tr.treatment_taken = TRUE AND 
tr.patient_id IN (
    SELECT tr.patient_id
    FROM treatment_record tr
    JOIN treatment t
    ON tr.treatment_id = t.treatment_id
    WHERE t.treatment_name = 'Beta blocker'
    AND tr.treatment_taken = TRUE
)
AND tr.patient_id IN (
    SELECT tr.patient_id
    FROM treatment_record tr
    JOIN treatment t
    ON tr.treatment_id = t.treatment_id
    WHERE t.treatment_name IN ('Left cardiac sympathetic denervation', 'Implantable cardioverter-defibrillator', 'Catheter ablation')
    AND tr.treatment_taken = TRUE
)
""",
    con=engine,
)

num_patients_invasive

Unnamed: 0,num_patients
0,130


In [29]:
# Fraction (0-1, not multiplied by 100) of CPVT patients with treatment records
# (num_patients) who are counted in num_patients_invasive.
num_patients_invasive['num_patients'] / num_patients['num_patients']

0    0.214876
Name: num_patients, dtype: float64