# Calculate demographic and clinical characteristics
##### Updated 08/21/2024
##### Selin Kubali

#### Goal:
Calculate median, mean, IQR, standard deviation, and/or counts for the variables used in the Cox regression model.

#### Required inputs

Lifelines files (one `<gene>_with_generated_data.csv` per gene) - stored in */selected_genes/hcm/cox_model_data*.


In [None]:
import pandas as pd
import numpy as np

##### Calculate demographic and clinical characteristics

In [None]:
def demographic_clinical_characteristics(gene, dir_path = 'selected_genes/hcm/cox_model_data', project_root = '/mnt/project'):
    """Load one gene's Cox-model input file and split it by HCM status.

    Demographic and clinical characteristics are the same for all genes, so
    any single gene file can be used to describe the cohort.

    Parameters
    ----------
    gene : str
        Gene symbol; the file read is ``{gene}_with_generated_data.csv``.
    dir_path : str
        Directory, relative to ``project_root``, holding the per-gene files.
    project_root : str
        Root under which ``dir_path`` lives. Defaults to the original
        hard-coded mount point.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(all_rows, rows_with_hcm, rows_without_hcm)``, each restricted to
        the columns ``is_HCM``, ``duration``, ``prs_score``, ``sex``,
        ``is_family_hist``, ``is_AF`` and ``is_HTN``. ``prs_score`` is min-max
        normalized over the full file before splitting.
    """
    lifelines_data = pd.read_csv(
        f'{project_root}/{dir_path}/{gene}_with_generated_data.csv',
        # The column is spelled 'is_HCM'; a mis-cased dtype key is silently
        # ignored by read_csv, so the key must match the header exactly.
        dtype={
            'is_family_hist': 'boolean',
            'is_HCM': 'boolean',
        },
    )

    # normalize PRS to [0, 1] (min-max); yields NaN if every score is identical
    prs = lifelines_data['prs_score']
    lifelines_data['prs_score'] = (prs - prs.min()) / (prs.max() - prs.min())

    lifelines_data = lifelines_data[['is_HCM', 'duration', 'prs_score', 'sex', 'is_family_hist', 'is_AF', 'is_HTN']]
    # Rows with a missing is_HCM value fall into neither subgroup.
    lifelines_data_hcm = lifelines_data[lifelines_data['is_HCM'] == 1]
    lifelines_data_no_hcm = lifelines_data[lifelines_data['is_HCM'] == 0]
    return lifelines_data, lifelines_data_hcm, lifelines_data_no_hcm
# Load the cohort tables from the ACTN2 file; a single gene file is used here
# because the demographic and clinical columns are noted to be the same for all genes.
lifelines_data, lifelines_data_hcm, lifelines_data_no_hcm = demographic_clinical_characteristics('ACTN2')

In [None]:
significant_figures = 2
medians = pd.DataFrame({})
dict_list = []
# Median (25th-75th percentile) of duration and prs_score, reported for the
# full cohort, patients with HCM, and patients without HCM.
for variable in ['duration', 'prs_score']:
    row = {}
    for label, cohort in [
        ('Full Cohort', lifelines_data),
        ('With HCM', lifelines_data_hcm),
        ('Without HCM', lifelines_data_no_hcm),
    ]:
        values = cohort[variable]
        values = values[~np.isnan(values)]  # percentiles are taken over non-missing values only

        # quartiles, each rendered to the requested number of significant figures
        q1, median, q3 = (
            f"{quartile:.{significant_figures}g}"
            for quartile in np.percentile(values, [25, 50, 75])
        )
        row[label] = f"{median} ({q1}-{q3})"
    dict_list.append(row)

df = pd.DataFrame(dict_list, index = ['Duration', 'Normalized PRS score'])
df

In [None]:
sums = pd.DataFrame({})
# find counts and percentages for sex, family history of CAD, presence of atrial fibrillation, and presence of hypertension
# for full cohort, patients with HCM, and patients without HCM
# NOTE(review): the subgroup sizes in the column labels below are hard-coded; confirm they still match the data.
dict_list = []
for cols in ['sex', 'is_family_hist', 'is_AF', 'is_HTN']:
    col_list = {}
    for df, df_name in zip([lifelines_data, lifelines_data_hcm, lifelines_data_no_hcm], ['Full Cohort' ,'With HCM (Num = 751)', 'Without HCM (Num = 501608)']):
        # drop missing values so the percentage is taken over observed rows only
        col = df[cols].dropna()

        count = col.sum()
        # mean() of a 0/1 indicator is a proportion; scale to a percentage so the '%' suffix is correct
        percent = col.mean() * 100
        col_list[df_name] = "{:.{}g}".format(count, 7) + ' (' + "{:.{}g}".format(percent, 2) + '%)'
    dict_list.append(col_list)

df = pd.DataFrame(dict_list, index = ['Number Male', 'Family History of CAD', 'Atrial Fibrillation', 'Hypertension'])
df

##### Calculate variant characteristics

In [None]:
def _min_max_normalize(series):
    """Rescale a numeric Series to [0, 1]; yields NaN if all values are identical."""
    return (series - series.min()) / (series.max() - series.min())


# combine information from all genes to calculate variant characteristics
def combine_all_genes(dir_path = 'selected_genes/hcm/cox_model_data', gene_list = None, project_root = '/mnt/project'):
    """Stack the missense-variant score columns of every gene into one table.

    For each gene the file ``{gene}_with_generated_data.csv`` is read, reduced
    to the variant score columns plus ``is_HCM``, filtered to rows whose
    ``Consequence`` is ``'missense_variant'``, and min-max normalized per gene
    before being appended to the result.

    Parameters
    ----------
    dir_path : str
        Directory, relative to ``project_root``, holding the per-gene files.
    gene_list : list of str, optional
        Genes to combine. Defaults to the nine genes used in the analysis.
    project_root : str
        Root under which ``dir_path`` lives. Defaults to the original
        hard-coded mount point.

    Returns
    -------
    pandas.DataFrame
        Rows from all genes, in ``gene_list`` order, with each gene's original
        row index retained. ``revel_max`` is NaN for genes whose file has no
        such column. Empty if ``gene_list`` is empty.
    """
    if gene_list is None:
        gene_list = ['ACTN2', 'ALPK3','FLNC','MYBPC3','MYH6','MYH7','MYL2','PTPN11','TNNT2']

    per_gene_frames = []
    for gene in gene_list:
        gene_data = pd.read_csv(f'{project_root}/{dir_path}/{gene}_with_generated_data.csv')
        print(gene)

        # copy() so the column assignments below act on an independent frame
        if 'revel_max' not in gene_data.columns:
            gene_data = gene_data[['am_pathogenicity','spliceai_ds_max','cadd_raw_score', 'phylop', 'faf_max','Consequence', 'is_HCM']].copy()
        else:
            gene_data = gene_data[['am_pathogenicity', 'spliceai_ds_max', 'revel_max', 'cadd_raw_score', 'phylop', 'faf_max', 'Consequence', 'is_HCM']].copy()
            # NOTE(review): REVEL is normalized over all consequences, whereas the other
            # scores are normalized after the missense filter - confirm this is intended.
            gene_data['revel_max'] = _min_max_normalize(gene_data['revel_max'])

        gene_data = gene_data[gene_data['Consequence'] == 'missense_variant']
        gene_data = gene_data.drop(['Consequence'], axis = 1)

        # normalize within the gene's missense variants
        for score in ('am_pathogenicity', 'cadd_raw_score', 'phylop'):
            gene_data[score] = _min_max_normalize(gene_data[score])

        per_gene_frames.append(gene_data)

    if not per_gene_frames:
        return pd.DataFrame({})
    # The return must sit outside the loop: returning inside it yields only the first gene.
    return pd.concat(per_gene_frames)
# Build the variant-level table from the default data directory; the summary cell that follows reads it as full_df.
full_df = combine_all_genes()

In [None]:
# Re-point the cohort tables at the variant-level data: all variants, then split by HCM status.
lifelines_data = full_df[['is_HCM',  'am_pathogenicity', 'spliceai_ds_max', 'revel_max', 'cadd_raw_score', 'phylop', 'faf_max',]]
lifelines_data_no_hcm = full_df[full_df['is_HCM'] == 0]
lifelines_data_hcm = full_df[full_df['is_HCM'] == 1]

significant_figures = 2
medians = pd.DataFrame({})
dict_list = []
# Median (25th-75th percentile) of the AlphaMissense, SpliceAI, REVEL, CADD and PhyloP scores and of the
# maximum population allele frequency, reported for all rows, rows without HCM, and rows with HCM.
for score in ['am_pathogenicity', 'spliceai_ds_max', 'revel_max', 'cadd_raw_score', 'phylop', 'faf_max']:
    row = {}
    for label, cohort in [
        ('Full Cohort', lifelines_data),
        ('Without HCM', lifelines_data_no_hcm),
        ('With HCM', lifelines_data_hcm),
    ]:
        values = cohort[score]
        values = values[~np.isnan(values)]  # percentiles are taken over non-missing values only

        # quartiles, each rendered to the requested number of significant figures
        q1, median, q3 = (
            f"{quartile:.{significant_figures}g}"
            for quartile in np.percentile(values, [25, 50, 75])
        )
        row[label] = f"{median} ({q1}-{q3})"
    dict_list.append(row)

df = pd.DataFrame(dict_list, index = ['AlphaMissense', 'SpliceAI', 'REVEL', 'CADD', 'PhyloP', 'Pop freq'])
df