In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from analysis.database import get_config

config = get_config()
data_all = pd.read_excel(
    Path.cwd().parent / "data" / f"individual_all_data-{config.version}.xlsx",
    index_col=0)
data_all.head()

Unnamed: 0,individual_id,sex,publication_id,title,first_author,reference,doi,year,publication_database,resource_uri,...,treatment_taken::Implantable cardioverter-defibrillator,treatment_taken::Left cardiac sympathetic denervation,treatment_taken::Verapamil,effective::Beta blocker,effective::Catheter ablation,effective::Enalapril,effective::Flecainide,effective::Implantable cardioverter-defibrillator,effective::Left cardiac sympathetic denervation,effective::Verapamil
0,1,,1,Familial Evaluation in Catecholaminergic Polym...,Van Der Werf C,,10.1161/CIRCEP.112.970517,2012,PubMed,https://pubmed.ncbi.nlm.nih.gov/22787013,...,,,,,,,,,,
1,2,female,2,Flecainide therapy reduces exercise-induced ve...,"van der Werf, C",,10.1016/j.jacc.2011.01.026,2011,PubMed,https://pubmed.ncbi.nlm.nih.gov/21616285,...,0.0,0.0,0.0,0.0,,,1.0,,,
2,3,male,3,Structural abnormalities on cardiac magnetic r...,"Gerber, D",,10.1016/j.jacep.2020.03.006,2020,PubMed,https://pubmed.ncbi.nlm.nih.gov/32553227,...,,,,,,,,,,
3,4,female,4,Genetic Background of Catecholaminergic Polymo...,"Kawamura, M",,10.1253/circj.cj-12-1460,2013,PubMed,https://pubmed.ncbi.nlm.nih.gov/23595086,...,0.0,0.0,0.0,,,,,,,
4,5,male,5,Gender differences in the inheritance mode of ...,"Ohno, S.",,10.1371/journal.pone.0131517,2015,PubMed,https://pubmed.ncbi.nlm.nih.gov/26114861,...,,,,,,,,,,


In [25]:
import os

config = get_config()

# create dir for figures
output_dir = Path(
    os.path.join("..", "figures", config.version_for_dir,
                 "analysis_17_amino_acid_diseases"))

if not os.path.exists(output_dir):
    print(f"Creating directory: {output_dir}")
    os.makedirs(output_dir)

Creating directory: ../figures/0_3_4/analysis_17_amino_acid_diseases


In [13]:
cpvt_df = data_all[
    data_all["Catecholaminergic polymorphic ventricular tachycardia 1"] == 1
    ]
condition_cols = {
    "Polymorphic ventricular tachycardia": "PVT",
    "Syncope": "Syncope",
    "Sudden cardiac arrest": "SCA",
    "Sudden cardiac death": "SCD",
    "Atrial tachycardia": "AT",
}
selected_cols = [
                    "p_hgvs_aa1",
                    "protein_change_type",
                ] + list(condition_cols.keys())
cpvt_df = cpvt_df[selected_cols].copy()
cpvt_df = cpvt_df[
    # at least one of the columns is not null
    cpvt_df[list(condition_cols.keys())].notna().any(axis=1)
    & cpvt_df["p_hgvs_aa1"].notna()
    ]
print(cpvt_df.shape)
cpvt_df.head()

(830, 7)


Unnamed: 0,p_hgvs_aa1,protein_change_type,Polymorphic ventricular tachycardia,Syncope,Sudden cardiac arrest,Sudden cardiac death,Atrial tachycardia
1,Y4962C,Substitution,1.0,0.0,0.0,,
2,Y4725N,Substitution,1.0,1.0,0.0,,0.0
3,Y4725C,Substitution,1.0,0.0,0.0,,1.0
4,Y4725C,Substitution,,1.0,0.0,,
5,Y4725C,Substitution,1.0,,,,


In [22]:
import re


def has_condition(x: pd.Series):
    return np.sum(x == 1)


def not_has_condition(x: pd.Series):
    return np.sum(x == 0)


def total(x: pd.Series):
    return np.sum(x.notna())


result = cpvt_df.groupby(
    "p_hgvs_aa1"
).agg(
    {
        condition: [has_condition, not_has_condition, total]
        for condition in condition_cols
    }
)

result.columns = [
    f"{condition_cols[condition]}_{agg}"
    for condition, agg in result.columns
]
result.reset_index(inplace=True)
result["aa_loc"] = result["p_hgvs_aa1"].apply(
    lambda x: int(re.search(r"\d+", x).group())
)

# reorder columns
col_order = [
                "p_hgvs_aa1",
                "aa_loc",
            ] + [
                f"{condition_cols[condition]}_{agg}"
                for condition in condition_cols
                for agg in ["has_condition", "not_has_condition", "total"]
            ]

result = result[col_order]

result.head()

Unnamed: 0,p_hgvs_aa1,aa_loc,PVT_has_condition,PVT_not_has_condition,PVT_total,Syncope_has_condition,Syncope_not_has_condition,Syncope_total,SCA_has_condition,SCA_not_has_condition,SCA_total,SCD_has_condition,SCD_not_has_condition,SCD_total,AT_has_condition,AT_not_has_condition,AT_total
0,A1136V,1136,0,0,0,0,1,1,1,0,1,0,0,0,0,1,1
1,A165D,165,1,0,1,1,0,1,0,1,1,0,0,0,0,1,1
2,A2254V,2254,3,0,3,0,0,0,2,0,2,0,0,0,0,0,0
3,A2317E,2317,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,A2317T,2317,1,0,1,0,1,1,1,0,1,0,0,0,0,1,1


In [26]:
# find variants with amino acids at same location
same_aa_loc = result[result["aa_loc"].duplicated(keep=False)]

# same_aa_loc.to_excel(
#     output_dir / "same_aa_loc_variants.xlsx",
# )

same_aa_loc.head()

Unnamed: 0,p_hgvs_aa1,aa_loc,PVT_has_condition,PVT_not_has_condition,PVT_total,Syncope_has_condition,Syncope_not_has_condition,Syncope_total,SCA_has_condition,SCA_not_has_condition,SCA_total,SCD_has_condition,SCD_not_has_condition,SCD_total,AT_has_condition,AT_not_has_condition,AT_total
3,A2317E,2317,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,A2317T,2317,1,0,1,0,1,1,1,0,1,0,0,0,0,1,1
5,A2387P,2387,1,0,1,1,1,2,1,1,2,0,0,0,0,1,1
6,A2387T,2387,2,0,2,2,1,3,4,2,6,0,0,0,0,2,2
7,A2387V,2387,2,0,2,2,1,3,1,2,3,0,0,0,0,2,2


In [31]:
# find variants with at least in totals
min_total = 5

bool_res = np.zeros(same_aa_loc.shape[0], dtype=bool)

for condition in condition_cols.values():
    bool_res |= same_aa_loc[f"{condition}_total"] >= min_total

filtered_loc = same_aa_loc[bool_res]
filtered_loc = filtered_loc[
    filtered_loc["aa_loc"].duplicated(keep=False)
]
print(filtered_loc.shape)
filtered_loc.head()

(4, 17)


Unnamed: 0,p_hgvs_aa1,aa_loc,PVT_has_condition,PVT_not_has_condition,PVT_total,Syncope_has_condition,Syncope_not_has_condition,Syncope_total,SCA_has_condition,SCA_not_has_condition,SCA_total,SCD_has_condition,SCD_not_has_condition,SCD_total,AT_has_condition,AT_not_has_condition,AT_total
186,R176L,176,0,7,7,6,2,8,2,6,8,0,0,0,0,7,7
187,R176Q,176,0,1,1,5,0,5,0,3,3,0,0,0,0,1,1
202,R420Q,420,28,11,39,26,14,40,13,29,42,0,0,0,7,25,32
203,R420W,420,18,6,24,8,10,18,7,15,22,0,0,0,0,8,8


In [32]:
with pd.ExcelWriter(
        output_dir / "same_aa_loc_variants.xlsx", ) as writer:
    same_aa_loc.to_excel(writer, sheet_name="all")
    filtered_loc.to_excel(writer, sheet_name="filtered")

In [40]:
filtered_loc[
    filtered_loc["aa_loc"] > 1
    ].set_index("p_hgvs_aa1")[[
    "PVT_has_condition",
    "PVT_not_has_condition",
]]

Unnamed: 0_level_0,PVT_has_condition,PVT_not_has_condition
p_hgvs_aa1,Unnamed: 1_level_1,Unnamed: 2_level_1
R176L,0,7
R176Q,0,1
R420Q,28,11
R420W,18,6


In [42]:
# split into fisher tables
import scipy.stats as stats


def fisher_table(df: pd.DataFrame, condition: str, *, aa_loc: int):
    """
    Create a 2x2 table for Fisher's exact test
    """
    # create the table
    table = df[
        df["aa_loc"] == aa_loc
        ].set_index("p_hgvs_aa1")[[
        f"{condition}_has_condition",
        f"{condition}_not_has_condition",
    ]]

    return table



In [50]:
fisher_table(
    filtered_loc,
    "PVT",
    aa_loc=420
).T.sum()

p_hgvs_aa1
R420Q    39
R420W    24
dtype: int64

In [44]:
stats.fisher_exact(
    fisher_table(
        filtered_loc,
        "PVT",
        aa_loc=420
    )
)

SignificanceResult(statistic=0.8484848484848485, pvalue=1.0)

In [47]:
from typing import NamedTuple

# I know there could be more than 2 aa, but since thats not the case w our data I'll
# not handle it here
results = []


class ResultsInfo(NamedTuple):
    aa_loc: int
    variant1: str
    variant2: str
    total: int
    condition: str
    statistic: float
    p_value: float


for aa_loc in filtered_loc["aa_loc"].unique():
    for condition in condition_cols.values():
        table = fisher_table(
            filtered_loc,
            condition,
            aa_loc=aa_loc
        )

        # skip if any row is < 5
        ta

        odds_ratio, p_value = stats.fisher_exact(table)

        results.append(
            ResultsInfo(
                aa_loc=aa_loc,
                variant1=table.index[0],
                variant2=table.index[1],
                total=table.sum().sum(),
                condition=condition,
                statistic=odds_ratio,
                p_value=p_value
            )
        )

results = pd.DataFrame(results)

results

Unnamed: 0,aa_loc,variant1,variant2,total,condition,statistic,p_value
0,176,R176L,R176Q,8,PVT,,1.0
1,176,R176L,R176Q,13,Syncope,0.0,0.487179
2,176,R176L,R176Q,11,SCA,inf,1.0
3,176,R176L,R176Q,0,SCD,,1.0
4,176,R176L,R176Q,8,AT,,1.0
5,420,R420Q,R420W,63,PVT,0.848485,1.0
6,420,R420Q,R420W,58,Syncope,2.321429,0.161526
7,420,R420Q,R420W,64,SCA,0.960591,1.0
8,420,R420Q,R420W,0,SCD,,1.0
9,420,R420Q,R420W,40,AT,inf,0.308712
