In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from analysis.database import get_config
import re

config = get_config()
data_all = pd.read_excel(
    Path.cwd().parent / "data" / f"individual_all_data-{config.version}.xlsx",
    index_col=0)
data_all.head()

Unnamed: 0,individual_id,sex,publication_id,title,first_author,reference,doi,year,publication_database,resource_uri,...,treatment_taken::Implantable cardioverter-defibrillator,treatment_taken::Left cardiac sympathetic denervation,treatment_taken::Verapamil,effective::Beta blocker,effective::Catheter ablation,effective::Enalapril,effective::Flecainide,effective::Implantable cardioverter-defibrillator,effective::Left cardiac sympathetic denervation,effective::Verapamil
0,1,,1,Familial Evaluation in Catecholaminergic Polym...,Van Der Werf C,,10.1161/CIRCEP.112.970517,2012,PubMed,https://pubmed.ncbi.nlm.nih.gov/22787013,...,,,,,,,,,,
1,2,female,2,Flecainide therapy reduces exercise-induced ve...,"van der Werf, C",,10.1016/j.jacc.2011.01.026,2011,PubMed,https://pubmed.ncbi.nlm.nih.gov/21616285,...,0.0,0.0,0.0,0.0,,,1.0,,,
2,3,male,3,Structural abnormalities on cardiac magnetic r...,"Gerber, D",,10.1016/j.jacep.2020.03.006,2020,PubMed,https://pubmed.ncbi.nlm.nih.gov/32553227,...,,,,,,,,,,
3,4,female,4,Genetic Background of Catecholaminergic Polymo...,"Kawamura, M",,10.1253/circj.cj-12-1460,2013,PubMed,https://pubmed.ncbi.nlm.nih.gov/23595086,...,0.0,0.0,0.0,,,,,,,
4,5,male,5,Gender differences in the inheritance mode of ...,"Ohno, S.",,10.1371/journal.pone.0131517,2015,PubMed,https://pubmed.ncbi.nlm.nih.gov/26114861,...,,,,,,,,,,


In [2]:
import os

config = get_config()

# create dir for figures
output_dir = Path(
    os.path.join("..", "figures", config.version_for_dir,
                 "analysis_17_amino_acid_diseases"))

if not os.path.exists(output_dir):
    print(f"Creating directory: {output_dir}")
    os.makedirs(output_dir)

In [3]:
age_col = "age_of_onset::Catecholaminergic polymorphic ventricular tachycardia 1"

cpvt_df = data_all[
    data_all["Catecholaminergic polymorphic ventricular tachycardia 1"] == 1
    ]
condition_cols = {
    "Polymorphic ventricular tachycardia": "PVT",
    "Syncope": "Syncope",
    "Sudden cardiac arrest": "SCA",
    "Sudden cardiac death": "SCD",
}
selected_cols = [
                    "p_hgvs_aa1",
                    "protein_change_type",
                    age_col
                ] + list(condition_cols.keys())
cpvt_df = cpvt_df[selected_cols].copy()
cpvt_df = cpvt_df[
    # at least one of the columns is not null
    cpvt_df[list(condition_cols.keys())].notna().any(axis=1)
    & cpvt_df["p_hgvs_aa1"].notna()
    ]
print(cpvt_df.shape)
cpvt_df["aa_loc"] = cpvt_df["p_hgvs_aa1"].apply(
    lambda x: int(re.search(r"\d+", x).group())
)
cpvt_df.head()

(828, 7)


Unnamed: 0,p_hgvs_aa1,protein_change_type,age_of_onset::Catecholaminergic polymorphic ventricular tachycardia 1,Polymorphic ventricular tachycardia,Syncope,Sudden cardiac arrest,Sudden cardiac death,aa_loc
1,Y4962C,Substitution,,1.0,0.0,0.0,,4962
2,Y4725N,Substitution,,1.0,1.0,0.0,,4725
3,Y4725C,Substitution,2.0,1.0,0.0,0.0,,4725
4,Y4725C,Substitution,10.0,,1.0,0.0,,4725
5,Y4725C,Substitution,2.0,1.0,,,,4725


In [17]:
age_only = data_all[
    (data_all["Catecholaminergic polymorphic ventricular tachycardia 1"] == 1)
].groupby(
    "p_hgvs_aa1"
).agg(
    {
        age_col: [
            "mean", "median", "count"
        ]
    }
)

age_only.columns = [
    f"{agg}"
    for _, agg in age_only.columns
]
age_only.reset_index(inplace=True)
age_only["aa_loc"] = age_only["p_hgvs_aa1"].apply(
    lambda x: int(re.search(r"\d+", x).group())
)
age_only = age_only[
    age_only["aa_loc"].duplicated(keep=False)
]

print(age_only.shape)

age_only_filtered = age_only[
    age_only["count"] > 0
    ]
age_only_filtered = age_only_filtered[
    age_only_filtered["aa_loc"].duplicated(keep=False)
]
print(age_only_filtered.shape)

age_only_filtered.head()

(65, 5)
(40, 5)


Unnamed: 0,p_hgvs_aa1,mean,median,count,aa_loc
6,A2387T,10.0,10.0,1,2387
7,A2387V,10.0,10.0,3,2387
11,A4091T,4.333333,5.0,3,4091
12,A4091V,11.0,11.0,1,4091
79,G3946A,6.0,6.0,2,3946


In [5]:
import re


def has_condition(x: pd.Series):
    return np.sum(x == 1)


def not_has_condition(x: pd.Series):
    return np.sum(x == 0)


def total(x: pd.Series):
    return np.sum(x.notna())


def q1(x: pd.Series):
    return np.quantile(x, 0.25)


def q3(x: pd.Series):
    return np.quantile(x, 0.75)


result = cpvt_df.groupby(
    "p_hgvs_aa1"
).agg(
    {
        condition: [has_condition, not_has_condition, total]
        for condition in condition_cols
    } | {
        age_col: [
            "mean", "median", q1, q3, "count"
        ],
        "p_hgvs_aa1": "size",
        "aa_loc": "first"
    }
)

col_name_remapper = condition_cols | {
    age_col: "age_of_onset",
    "p_hgvs_aa1": "num_patients",
}

result.columns = [
    f"{col_name_remapper.get(condition, condition)}_{agg}"
    for condition, agg in result.columns
]
result.reset_index(inplace=True)

# rename to aa_loc
result["aa_loc"] = result["aa_loc_first"]
# # reorder columns
col_order = [
    col for col in result.columns if
    col not in {"aa_loc_first", "num_patients_size", "aa_loc"}
]
col_order.insert(1, "aa_loc")
col_order.insert(2, "num_patients_size")

result = result[col_order]

result.head()

Unnamed: 0,p_hgvs_aa1,aa_loc,num_patients_size,PVT_has_condition,PVT_not_has_condition,PVT_total,Syncope_has_condition,Syncope_not_has_condition,Syncope_total,SCA_has_condition,SCA_not_has_condition,SCA_total,SCD_has_condition,SCD_not_has_condition,SCD_total,age_of_onset_mean,age_of_onset_median,age_of_onset_q1,age_of_onset_q3,age_of_onset_count
0,A1136V,1136,1,0,0,0,0,1,1,1,0,1,0,0,0,,,,,0
1,A165D,165,1,1,0,1,1,0,1,0,1,1,0,0,0,8.0,8.0,8.0,8.0,1
2,A2254V,2254,3,3,0,3,0,0,0,2,0,2,0,0,0,8.0,8.0,,,1
3,A2317E,2317,1,1,0,1,0,0,0,0,0,0,0,0,0,,,,,0
4,A2317T,2317,1,1,0,1,0,1,1,1,0,1,0,0,0,4.0,4.0,4.0,4.0,1


In [6]:
# find variants with amino acids at same location
same_aa_loc = result[result["aa_loc"].duplicated(keep=False)]

print(same_aa_loc.shape)

# same_aa_loc.to_excel(
#     output_dir / "same_aa_loc_variants.xlsx",
# )

same_aa_loc.head()

(61, 20)


Unnamed: 0,p_hgvs_aa1,aa_loc,num_patients_size,PVT_has_condition,PVT_not_has_condition,PVT_total,Syncope_has_condition,Syncope_not_has_condition,Syncope_total,SCA_has_condition,SCA_not_has_condition,SCA_total,SCD_has_condition,SCD_not_has_condition,SCD_total,age_of_onset_mean,age_of_onset_median,age_of_onset_q1,age_of_onset_q3,age_of_onset_count
3,A2317E,2317,1,1,0,1,0,0,0,0,0,0,0,0,0,,,,,0
4,A2317T,2317,1,1,0,1,0,1,1,1,0,1,0,0,0,4.0,4.0,4.0,4.0,1
5,A2387P,2387,2,1,0,1,1,1,2,1,1,2,0,0,0,,,,,0
6,A2387T,2387,7,2,0,2,2,1,3,4,2,6,0,0,0,10.0,10.0,,,1
7,A2387V,2387,4,2,0,2,2,1,3,1,2,3,0,0,0,10.0,10.0,,,3


In [6]:
# find variants with at least in totals
min_total = 5

bool_res = np.zeros(same_aa_loc.shape[0], dtype=bool)

for condition in condition_cols.values():
    bool_res |= same_aa_loc[f"{condition}_total"] >= min_total

filtered_loc = same_aa_loc[bool_res]
filtered_loc = filtered_loc[
    filtered_loc["aa_loc"].duplicated(keep=False)
]
print(filtered_loc.shape)
filtered_loc.head()

(4, 20)


Unnamed: 0,p_hgvs_aa1,aa_loc,num_patients_size,PVT_has_condition,PVT_not_has_condition,PVT_total,Syncope_has_condition,Syncope_not_has_condition,Syncope_total,SCA_has_condition,SCA_not_has_condition,SCA_total,SCD_has_condition,SCD_not_has_condition,SCD_total,age_of_onset_mean,age_of_onset_median,age_of_onset_q1,age_of_onset_q3,age_of_onset_count
185,R176L,176,8,0,7,7,6,2,8,2,6,8,0,0,0,24.428571,25.0,,,7
186,R176Q,176,5,0,1,1,5,0,5,0,3,3,0,0,0,16.5,16.5,,,2
201,R420Q,420,51,28,11,39,26,14,40,13,29,42,0,0,0,14.230769,10.0,,,26
202,R420W,420,26,18,6,24,8,10,18,7,15,22,0,0,0,19.125,18.5,,,8


In [7]:
with pd.ExcelWriter(
        output_dir / "same_aa_loc_variants.xlsx", ) as writer:
    same_aa_loc.sort_values(by="aa_loc").to_excel(writer, sheet_name="all")
    filtered_loc.to_excel(writer, sheet_name="filtered")

In [8]:
filtered_loc[
    filtered_loc["aa_loc"] > 1
    ].set_index("p_hgvs_aa1")[[
    "PVT_has_condition",
    "PVT_not_has_condition",
]]

Unnamed: 0_level_0,PVT_has_condition,PVT_not_has_condition
p_hgvs_aa1,Unnamed: 1_level_1,Unnamed: 2_level_1
R176L,0,7
R176Q,0,1
R420Q,28,11
R420W,18,6


In [9]:
# split into fisher tables
import scipy.stats as stats


def fisher_table(df: pd.DataFrame, condition: str, *, aa_loc: int):
    """
    Create a 2x2 table for Fisher's exact test
    """
    # create the table
    table = df[
        df["aa_loc"] == aa_loc
        ].set_index("p_hgvs_aa1")[[
        f"{condition}_has_condition",
        f"{condition}_not_has_condition",
    ]]

    return table



In [10]:
np.any(fisher_table(
    filtered_loc,
    "PVT",
    aa_loc=420
).T.sum() < 5)

False

In [11]:
stats.fisher_exact(
    fisher_table(
        filtered_loc,
        "PVT",
        aa_loc=420
    )
)

SignificanceResult(statistic=0.8484848484848485, pvalue=1.0)

In [12]:
from typing import NamedTuple

# I know there could be more than 2 aa, but since thats not the case w our data I'll
# not handle it here
results = []


class ResultsInfo(NamedTuple):
    aa_loc: int
    variant1: str
    variant2: str
    total: int
    variant_1_has_condition: int
    variant_1_not_has_condition: int
    variant_2_has_condition: int
    variant_2_not_has_condition: int
    condition: str
    statistic: float
    p_value: float


for aa_loc in filtered_loc["aa_loc"].unique():
    for condition in condition_cols.values():
        table = fisher_table(
            filtered_loc,
            condition,
            aa_loc=aa_loc
        )

        # skip if any row is < 5
        if np.any(table.T.sum() < min_total):
            print(f"Skipping {aa_loc} {condition}")
            continue

        odds_ratio, p_value = stats.fisher_exact(table)

        results.append(
            ResultsInfo(
                aa_loc=aa_loc,
                variant1=table.index[0],
                variant2=table.index[1],
                variant_1_has_condition=table.iloc[0, 0],
                variant_1_not_has_condition=table.iloc[0, 1],
                variant_2_has_condition=table.iloc[1, 0],
                variant_2_not_has_condition=table.iloc[1, 1],
                total=table.sum().sum(),
                condition=condition,
                statistic=odds_ratio,
                p_value=p_value
            )
        )

results = pd.DataFrame(results)

results

Skipping 176 PVT
Skipping 176 SCA
Skipping 176 SCD
Skipping 420 SCD


Unnamed: 0,aa_loc,variant1,variant2,total,variant_1_has_condition,variant_1_not_has_condition,variant_2_has_condition,variant_2_not_has_condition,condition,statistic,p_value
0,176,R176L,R176Q,13,6,2,5,0,Syncope,0.0,0.487179
1,420,R420Q,R420W,63,28,11,18,6,PVT,0.848485,1.0
2,420,R420Q,R420W,58,26,14,8,10,Syncope,2.321429,0.161526
3,420,R420Q,R420W,64,13,29,7,15,SCA,0.960591,1.0


In [13]:
results.to_excel(output_dir / "results_variant_loc.xlsx")

In [14]:
stats.fisher_exact(fisher_table(
    filtered_loc,
    "Syncope",
    aa_loc=420
))

SignificanceResult(statistic=2.3214285714285716, pvalue=0.1615255507439563)

## Summary table

In [15]:
summary = same_aa_loc.copy()

for condition in condition_cols.values():
    summary[f"{condition}_percent"] = summary[
                                          f"{condition}_has_condition"
                                      ] / summary[
                                          f"{condition}_total"
                                      ] * 100

    # drop columns
    summary.drop(
        columns=[
            f"{condition}_has_condition",
            f"{condition}_not_has_condition",
            f"{condition}_total"
        ],
        inplace=True
    )

summary.sort_values(
    by="aa_loc",
    inplace=True
)
summary.head()

Unnamed: 0,p_hgvs_aa1,aa_loc,num_patients_size,age_of_onset_mean,age_of_onset_median,age_of_onset_q1,age_of_onset_q3,age_of_onset_count,PVT_percent,Syncope_percent,SCA_percent,SCD_percent
183,R169L,169,2,9.0,9.0,9.0,9.0,2,100.0,100.0,0.0,
184,R169Q,169,11,10.3,8.0,,,10,100.0,100.0,62.5,
185,R176L,176,8,24.428571,25.0,,,7,0.0,75.0,25.0,
186,R176Q,176,5,16.5,16.5,,,2,0.0,100.0,0.0,
196,R414C,414,4,13.666667,15.0,,,3,0.0,75.0,50.0,


In [16]:
# save summary
# summary.to_excel(
#     output_dir / "summary.xlsx",
#     index=False
# )
summary_filtered = summary[
    summary["num_patients_size"] > 1
    ]
summary_filtered = summary_filtered[
    summary_filtered["aa_loc"].duplicated(keep=False)
]

print(summary_filtered.shape)

with pd.ExcelWriter(
        output_dir / "summary.xlsx",
) as writer:
    summary_filtered.to_excel(writer, sheet_name="filtered")
    summary.to_excel(writer, sheet_name="all")

(26, 12)


## Age of onset

In [17]:
min_for_mwu = 3

with_age_data = summary[
    summary[
        "age_of_onset_count"
    ] >= min_for_mwu
    ]

with_age_data = with_age_data[
    with_age_data["aa_loc"].duplicated(keep=False)
]

with_age_data

Unnamed: 0,p_hgvs_aa1,aa_loc,num_patients_size,age_of_onset_mean,age_of_onset_median,age_of_onset_q1,age_of_onset_q3,age_of_onset_count,PVT_percent,Syncope_percent,SCA_percent,SCD_percent
201,R420Q,420,51,14.230769,10.0,,,26,71.794872,65.0,30.952381,
202,R420W,420,26,19.125,18.5,,,8,75.0,44.444444,31.818182,
189,R2474G,2474,5,7.2,10.0,1.0,10.0,5,100.0,75.0,50.0,
190,R2474S,2474,3,7.666667,8.0,7.5,8.0,3,100.0,100.0,50.0,
215,S4124R,4124,3,7.0,7.0,7.0,7.0,3,100.0,100.0,0.0,
213,S4124G,4124,10,26.333333,26.0,,,3,100.0,66.666667,25.0,


In [18]:
patient_age = cpvt_df[
    cpvt_df["p_hgvs_aa1"].isin(
        with_age_data["p_hgvs_aa1"]
    )
    &
    cpvt_df[
        age_col].notna()
    ]
patient_age = patient_age[[
    "p_hgvs_aa1",
    "aa_loc",
    age_col,
]]
patient_age.head()

Unnamed: 0,p_hgvs_aa1,aa_loc,age_of_onset::Catecholaminergic polymorphic ventricular tachycardia 1
140,S4124R,4124,7.0
141,S4124R,4124,7.0
142,S4124R,4124,7.0
152,S4124G,4124,45.0
153,S4124G,4124,26.0


In [19]:
class MannWhitneyResult(NamedTuple):
    aa_loc: int
    variant1: str
    variant2: str
    median1: float
    median2: float
    q1_1: float
    q1_2: float
    q3_1: float
    q3_2: float
    n_1: int
    n_2: int
    p_value: float
    statistic: float


mwu_results = []
for aa_loc in patient_age["aa_loc"].unique():
    print(aa_loc)
    df_aa_loc = patient_age[
        patient_age["aa_loc"] == aa_loc
        ]
    foo = df_aa_loc.groupby(
        "p_hgvs_aa1"
    )[age_col].apply(list).reset_index()

    # check if there are 2 variants
    if len(foo) != 2:
        print(f"Skipping {aa_loc} {foo.shape}")
        continue

    # mwu will always be insignificant if less than 8 total
    if len(foo[age_col].iloc[0]) + len(foo[age_col].iloc[1]) < 8:
        print(f"Skipping {aa_loc} {foo.shape}")
        continue

    statistic, p_value = stats.mannwhitneyu(
        foo[age_col].iloc[0],
        foo[age_col].iloc[1]
    )

    mwu_results.append(
        MannWhitneyResult(
            aa_loc=aa_loc,
            variant1=foo.iloc[0, 0],
            variant2=foo.iloc[1, 0],
            median1=np.median(foo[age_col].iloc[0]),
            median2=np.median(foo[age_col].iloc[1]),
            q1_1=np.quantile(foo[age_col].iloc[0], 0.25),
            q1_2=np.quantile(foo[age_col].iloc[1], 0.25),
            q3_1=np.quantile(foo[age_col].iloc[0], 0.75),
            q3_2=np.quantile(foo[age_col].iloc[1], 0.75),
            p_value=p_value,
            statistic=statistic,
            n_1=len(foo[age_col].iloc[0]),
            n_2=len(foo[age_col].iloc[1])
        )
    )

mwu_results = pd.DataFrame(mwu_results)
mwu_results.to_excel(
    output_dir / "mwu_results_age_onset.xlsx",
    index=False
)
mwu_results

4124
Skipping 4124 (2, 2)
420
2474


Unnamed: 0,aa_loc,variant1,variant2,median1,median2,q1_1,q1_2,q3_1,q3_2,n_1,n_2,p_value,statistic
0,420,R420Q,R420W,10.0,18.5,9.0,13.0,14.75,20.75,26,8,0.019124,46.0
1,2474,R2474G,R2474S,10.0,8.0,1.0,7.5,10.0,8.0,5,3,0.761422,9.0
