In [None]:
import pandas as pd
import numpy as np

import os
import math
import time
import operator
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.multivariate.manova import MANOVA
from statsmodels.tools.sm_exceptions import PerfectSeparationError
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score, roc_auc_score
from sklearn.metrics import brier_score_loss, precision_score, recall_score, f1_score, roc_auc_score

from ml4h_ccds.data_descriptions.util import download_s3_if_not_exists
from ml4h.explorations import latent_space_dataframe

# IPython imports
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sb 

In [None]:
def confounder_vector(label_header, df, indexes):
    """Fit a ridge regression that predicts one column from a set of other columns.

    Args:
        label_header: name of the column in ``df`` used as the regression target.
        df: DataFrame holding both the predictor columns and the target column.
        indexes: list of column names used as predictors.

    Returns:
        Tuple ``(coef, score)``: the fitted ``coef_`` vector and the value of
        ``score`` evaluated on the same rows that were used for fitting.
    """
    # NOTE(review): `normalize=` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 -- confirm the pinned scikit-learn version before upgrading.
    predictors = df[indexes]
    target = df[label_header]
    model = Ridge(normalize=True, max_iter=50000)
    model.fit(predictors, target)
    return model.coef_, model.score(predictors, target)


def confounder_matrix(adjust_cols, df, indexes):
    """Stack the regression vectors of several target columns into one matrix.

    Args:
        adjust_cols: names of the columns to regress, one fit per column.
        df: DataFrame containing the predictors and the target columns.
        indexes: predictor column names, forwarded to ``confounder_vector``.

    Returns:
        Tuple ``(matrix, scores)``: an array with one coefficient row per entry
        of ``adjust_cols`` and a dict mapping each column name to its fit score.
    """
    fits = [(name, confounder_vector(name, df, indexes)) for name in adjust_cols]
    coef_rows = [coef for _, (coef, _) in fits]
    scores = {name: score for name, (_, score) in fits}
    return np.array(coef_rows), scores


def iterative_subspace_removal(adjust_cols, latent_df, latent_cols, r2_thresh=0.01):
    """Iteratively project the latent space onto the null space of the confounder fits.

    On every iteration a regression vector is fitted for each remaining
    covariate (via ``confounder_matrix``), the latent space is projected onto
    the orthogonal complement of those vectors, and the projected coordinates
    are written back to ``latent_df`` as columns ``new_latent_{iteration}_{i}``.
    Covariates whose fit score is still above ``r2_thresh`` are kept for the
    next iteration.

    Args:
        adjust_cols: covariate column names to remove from the latent space.
        latent_df: DataFrame with latent and covariate columns. It is modified
            in place: the projected columns are added to it.
        latent_cols: names of the starting latent columns.
        r2_thresh: covariates scoring at or below this value are dropped from
            further iterations.

    Returns:
        The list of column names of the final projected space (``latent_cols``
        itself if no iteration ran).
    """
    new_cols = latent_cols
    new_adjust_cols = adjust_cols
    space = latent_df[latent_cols].to_numpy()
    iteration = 0
    while len(new_adjust_cols) > 0 and space.shape[-1] > len(new_adjust_cols):
        n_confounders = len(new_adjust_cols)
        cfm, scores = confounder_matrix(new_adjust_cols, latent_df, new_cols)
        _, _, vt = np.linalg.svd(cfm, full_matrices=True)
        # The right singular vectors are the ROWS of vt. Rows past the first
        # `n_confounders` span the null space of cfm, so they must be transposed
        # into columns before projecting (slicing columns of vt does not give
        # directions orthogonal to the confounder vectors).
        null_basis = vt[n_confounders:].T
        nspace = np.matmul(space, null_basis)
        new_cols = []
        for i in range(nspace.shape[-1]):
            col = f'new_latent_{iteration}_{i}'
            new_cols.append(col)
            latent_df[col] = nspace[:, i]

        iteration += 1
        space = nspace
        new_adjust_cols = [col for col, score in scores.items() if score > r2_thresh]
        print(f'Scores were {scores}, remaining columns are {new_adjust_cols}')
        print(f'After iteration {iteration} Space shape is: {space.shape}')
    return new_cols

def _plot_component_densities(stratify_column, all_dots, all_phenotypes):
    """Plot kernel density estimates of the projected component for cases and controls."""
    fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
    hit_dots = all_dots[all_phenotypes == 1]
    miss_dots = all_dots[all_phenotypes == 0]
    groups = [
        (f'{stratify_column} n={len(hit_dots)}', hit_dots),
        (f'No {stratify_column} n={len(miss_dots)}', miss_dots),
    ]
    for label, dots in groups:
        # NOTE(review): `bw=` is the older seaborn spelling of the bandwidth
        # argument -- confirm against the installed seaborn version.
        sb.kdeplot(np.array(list(dots)), bw=0.5, label=label, ax=ax)
    ax.set_title(f'{stratify_column}')
    ax.set_xlabel(f'Component in direction of {stratify_column} vector')
    ax.set_ylabel('Density')
    ax.legend()
    plt.show()


def stratify_and_project_latent_space(stratify_column, stratify_thresh, stratify_std,
                                      latent_cols, adjust_cols, latent_df, test_df, component_folder,
                                      manova=False, permute=0, save_components=True, histograms=False):
    """Derive a phenotype direction in latent space and test it with logistic regression.

    The direction is the unit vector from the mean latent vector of the "miss"
    rows of ``latent_df`` to the mean of the "hit" rows. Every row of
    ``test_df`` is projected onto it, and the projection (``component``) is
    used as a predictor of ``stratify_column`` in a logit model, together with
    ``adjust_cols``. A second logit model without the component provides the
    baseline AUC.

    Args:
        stratify_column: phenotype column present in both frames.
        stratify_thresh, stratify_std: rows with value ``>= thresh + std`` are
            hits, rows with value ``< thresh - std`` are misses.
        latent_cols: latent column names.
        adjust_cols: covariate column names of ``test_df`` (may be empty).
        latent_df: frame used to derive the phenotype vector.
        test_df: frame on which the regression is evaluated.
        component_folder: not used by this function.
        manova: if True, run a MANOVA on ``latent_df`` instead and return only
            the first ``Pr > F`` value of the stratify term.
        permute: probability with which each label is flipped (``1 - label``)
            before fitting; 0/False disables permutation.
        save_components: not used by this function.
        histograms: if True, plot component densities when
            ``-log10(p) > 10`` for the component.

    Returns:
        ``(coefficient_table, phenotype_vector, auc_with_component,
        auc_without_component)``, or four ``None`` values when the fit raises
        ``LinAlgError`` or ``PerfectSeparationError``. With ``manova=True`` a
        single p-value is returned instead.
    """
    if manova:
        formula = f"{'+'.join(latent_cols)} ~ {stratify_column}"
        maov = MANOVA.from_formula(formula, data=latent_df)
        stat_table = maov.mv_test()[stratify_column]['stat']
        return stat_table['Pr > F'].iloc[0]

    hit = latent_df.loc[latent_df[stratify_column] >= stratify_thresh + stratify_std]
    miss = latent_df.loc[latent_df[stratify_column] < stratify_thresh - stratify_std]
    hit_mean_vector = np.mean(hit[latent_cols].to_numpy(), axis=0)
    miss_mean_vector = np.mean(miss[latent_cols].to_numpy(), axis=0)
    phenotype_vector = unit_vector(hit_mean_vector - miss_mean_vector)

    # Global (not per-column) standardisation; written out-of-place so that
    # integer-typed latent columns do not fail on an in-place float update.
    space = test_df[latent_cols].to_numpy()
    space = (space - np.mean(space)) / np.std(space)
    all_dots = space @ phenotype_vector

    # Copy so that label permutation can never write through into test_df.
    all_phenotypes = test_df[stratify_column].to_numpy().copy()
    if permute > 0:
        flip = np.random.random(len(all_phenotypes)) < permute
        all_phenotypes[flip] = 1 - all_phenotypes[flip]

    if len(adjust_cols) > 0:
        covariates = " + ".join(adjust_cols)
        formula = f'phecode ~ {covariates} + component'
        adjust_formula = f'phecode ~ {covariates}'
    else:
        formula = 'phecode ~ component'
        # Intercept-only baseline; previously this name was left undefined and
        # the baseline fit raised NameError when no covariates were given.
        adjust_formula = 'phecode ~ 1'

    data = {'component': all_dots, 'phecode': all_phenotypes}
    if len(adjust_cols) > 0:
        all_adjustments = test_df[adjust_cols].to_numpy()
        for i, col in enumerate(adjust_cols):
            data[col] = all_adjustments[:, i]
    df = pd.DataFrame.from_dict(data)
    try:
        results = smf.logit(formula, data=df).fit(maxiter=200, disp=False)
        baseline = smf.logit(adjust_formula, data=df).fit(maxiter=200, disp=False)
        auc_w_component = roc_auc_score(all_phenotypes, results.predict(df))
        auc_no_component = roc_auc_score(all_phenotypes, baseline.predict(df))
        coef_table = results.summary2().tables[1]

        if histograms and -np.log10(coef_table['P>|z|']['component']) > 10:
            _plot_component_densities(stratify_column, all_dots, all_phenotypes)

        return coef_table, phenotype_vector, auc_w_component, auc_no_component
    except (np.linalg.LinAlgError, PerfectSeparationError):
        print(f'Phecode {stratify_column} Failed')
        return None, None, None, None
       
def phewas_feature(stratify_column, stratify_thresh, stratify_std, feature_col,
                   adjust_cols, test_df, component_folder):
    """Logistic regression of a phenotype on a single feature plus covariates.

    Args:
        stratify_column: phenotype column of ``test_df`` (regression outcome).
        stratify_thresh, stratify_std: not used by this function.
        feature_col: column of ``test_df`` entered as the ``component`` term.
        adjust_cols: covariate column names of ``test_df`` (may be empty).
        test_df: frame on which the model is fitted.
        component_folder: not used by this function.

    Returns:
        ``(coefficient_table, None, None, None)``; the coefficient table is
        also ``None`` when the fit raises ``LinAlgError`` or
        ``PerfectSeparationError``. The three trailing ``None`` values keep the
        return shape identical to ``stratify_and_project_latent_space``.
    """
    if len(adjust_cols) > 0:
        formula = f'phecode ~ {" + ".join(adjust_cols)} + component'
    else:
        formula = 'phecode ~ component'

    data = {
        'component': test_df[feature_col].to_numpy(),
        'phecode': test_df[stratify_column].to_numpy(),
    }
    if len(adjust_cols) > 0:
        all_adjustments = test_df[adjust_cols].to_numpy()
        for i, col in enumerate(adjust_cols):
            data[col] = all_adjustments[:, i]
    df = pd.DataFrame.from_dict(data)
    try:
        results = smf.logit(formula, data=df).fit(disp=False)
        return results.summary2().tables[1], None, None, None
    except (np.linalg.LinAlgError, PerfectSeparationError):
        print(f'Phecode {stratify_column} Failed')
        return None, None, None, None
    
def merge_and_stratify_phecode_file(latent_df, test_df, latent_cols, adjust_cols,
                                    phecode_file, test_phecode_file, min_cases,
                                    component_folder, permute=False):
    """Run the association test for one phecode file.

    The derivation frame (``latent_df``) and the evaluation frame
    (``test_df``) are each inner-merged with their phecode table on
    ``LINKER_ID`` / ``linker_id``. With a single entry in ``latent_cols`` the
    test is ``phewas_feature``; otherwise it is
    ``stratify_and_project_latent_space``.

    Args:
        latent_df, test_df: frames with a ``LINKER_ID`` column; neither is
            modified.
        latent_cols: latent (or single feature) column names.
        adjust_cols: covariate column names.
        phecode_file: tab-separated phecode table for ``latent_df``; files
            whose path does not contain ``'PheCode'`` are skipped.
        test_phecode_file: tab-separated phecode table for ``test_df``.
        min_cases: values above 1 are an absolute case-count threshold; the
            case ratio is also compared against the same number.
        component_folder: forwarded to the test functions.
        permute: forwarded to ``stratify_and_project_latent_space``.

    Returns:
        ``(results, phecode_name, vector, n_cases, auc_with, auc_without)`` or
        six ``None`` values when the file is skipped.
    """
    no_result = (None, None, None, None, None, None)
    if 'PheCode' not in phecode_file:
        return no_result
    df = pd.read_csv(phecode_file, sep='\t')
    if df.empty:
        return no_result
    phecode_name = f'phe_{df.iloc[0].phenotype}'.replace('.', '_').replace(' ', '')
    n_cases = df.has_disease.sum()
    ratio = n_cases / len(df.has_disease)
    if not ((n_cases > min_cases and min_cases > 1) or ratio > min_cases):
        return no_result

    df = process_phecode_df(df, phecode_name)
    # `assign` returns a new frame, so the caller's DataFrame is not mutated
    # (the callers pass slices of larger frames).
    latent_df = pd.merge(latent_df.assign(LINKER_ID=latent_df["LINKER_ID"].astype(int)), df,
                         left_on='LINKER_ID', right_on='linker_id', how='inner')

    test_phe_df = pd.read_csv(test_phecode_file, sep='\t')
    test_phe_df = process_phecode_df(test_phe_df, phecode_name)
    test_df = pd.merge(test_df.assign(LINKER_ID=test_df["LINKER_ID"].astype(int)), test_phe_df,
                       left_on='LINKER_ID', right_on='linker_id', how='inner')

    labels = latent_df[phecode_name]
    # Counting `== 1` avoids a KeyError when label 1 is absent after the merge.
    if labels.nunique() > 1 and (labels == 1).sum() > min_cases:
        if len(latent_cols) == 1:
            results, vector, auc1, auc2 = phewas_feature(phecode_name, 1, 0, latent_cols[0], adjust_cols,
                                                         test_df, component_folder)
        else:
            results, vector, auc1, auc2 = stratify_and_project_latent_space(
                phecode_name, 1, 0, latent_cols, adjust_cols,
                latent_df, test_df, component_folder, permute=permute,
                save_components=False, histograms=False)
        # NOTE(review): the count is taken from the whole test phecode table,
        # not from the merged test frame -- confirm this is the intended N.
        return results, phecode_name, vector, test_phe_df[phecode_name].sum(), auc1, auc2
    return no_result
         
    
def process_phecode_df(df, phecode_name):
    """Prepare a raw phecode table for merging and regression.

    Args:
        df: phecode table with columns ``has_disease``, ``linker_id``,
            ``partners_ecg_age``, ``white`` and ``sex``.
        phecode_name: new name for the ``has_disease`` column.

    Returns:
        A new DataFrame in which ``has_disease`` is renamed, ``linker_id``,
        ``partners_ecg_age`` and ``white`` are float64, ``age_sqr`` is the
        squared age and ``sex_int`` is 1 where ``sex == 'Male'`` and 0
        otherwise.
    """
    df = df.rename(columns={"has_disease": phecode_name})
    for c in ["linker_id", "partners_ecg_age", "white"]:
        df[c] = df[c].astype(np.float64)
    # Square the age only after the float cast: multiplying the raw column
    # fails when the ages were read as strings.
    df['age_sqr'] = df.partners_ecg_age * df.partners_ecg_age
    df['sex_int'] = (df.sex == 'Male').astype(int)
    return df

def merge_and_stratify_by_code_folder(latent_df, test_df, latent_cols, adjust_cols, component_folder,
                                      phe_folder='./phecodes/', test_phe_folder='./phecodes/',
                                      min_cases=20, permute=False):
    """Run ``merge_and_stratify_phecode_file`` for every file in a folder.

    Args:
        latent_df: derivation frame.
        test_df: evaluation frame.
        latent_cols: latent (or single feature) column names.
        adjust_cols: covariate column names.
        component_folder: forwarded unchanged.
        phe_folder: folder with the phecode tables for ``latent_df``.
        test_phe_folder: folder with the same file names for ``test_df``.
        min_cases: forwarded unchanged.
        permute: forwarded unchanged.

    Returns:
        Seven dicts keyed by phecode name:
        ``(p_vals, betas, ses, counts, vectors, auc1, auc2)``.
    """
    ses = {}
    betas = {}
    p_vals = {}
    counts = {}
    vectors = {}
    auc1 = {}
    auc2 = {}
    for phe_file in sorted(os.listdir(phe_folder)):
        results, name, vector, n, a1, a2 = merge_and_stratify_phecode_file(
            latent_df, test_df, latent_cols, adjust_cols,
            os.path.join(phe_folder, phe_file),
            os.path.join(test_phe_folder, phe_file),
            min_cases, component_folder, permute)
        if results is None:
            continue
        counts[name] = n
        p_vals[name] = results['P>|z|']['component']
        betas[name] = results['Coef.']['component']
        ses[name] = results['Std.Err.']['component']
        vectors[name] = vector
        auc1[name] = a1
        auc2[name] = a2
        print(f'Phe:{name}, N: {n}, P: {p_vals[name]:0.3E} betas {betas[name]:0.3f} std err: {ses[name]:0.3f}')
        # The single-feature path returns None for both AUCs; formatting None
        # with ':0.3f' raises TypeError, so only print when they exist.
        if a1 is not None and a2 is not None:
            print(f'\tPhe:{name}, AUC w ECG component:{a1:0.3f} AUC no ECG:{a2:0.3f}')
    return p_vals, betas, ses, counts, vectors, auc1, auc2



def merge_code_folder(latent_df, phe_folder='/home/sam/select_phecodes/', min_cases=4, max_codes=35):
    """Merge phecode files from a folder into ``latent_df`` until ``max_codes`` are kept.

    Files are visited in sorted name order and handed to ``merge_phecode_file``
    (not defined in the visible part of this notebook -- presumably it returns
    the possibly-merged frame and the phecode column name; verify). A code is
    kept when its column is present in the returned frame.

    Returns:
        Tuple ``(latent_df, phe_codes)`` with the merged frame and the list of
        kept phecode column names.
    """
    kept_codes = []
    for file_name in sorted(os.listdir(phe_folder)):
        latent_df, code = merge_phecode_file(latent_df, phe_folder + file_name, min_cases)
        if code not in latent_df:
            continue
        print(f'{code} has enough prevalence: {latent_df[code].value_counts()[1]}')
        kept_codes.append(code)
        if len(kept_codes) >= max_codes:
            break
    return latent_df, kept_codes

def ttest_feature(feature, snp, df=None):
    """Welch t-tests of a feature between genotype groups of a SNP column.

    Rows are grouped by the value of ``snp``: 0 (reference), 1 (heterozygous),
    2 (homozygous) and 1-or-2 (non-reference). Each non-reference group is
    compared against the reference group with ``equal_var=False`` and the
    results are printed.

    Args:
        feature: name of the numeric column to compare.
        snp: name of the genotype column.
        df: DataFrame to read from. Defaults to the notebook-level
            ``latent_df`` so existing two-argument calls keep working.

    Returns:
        Dict with the single key ``'T-test REF vs VAR ' + snp`` mapping to the
        ``(t_statistic, p_value)`` of non-reference vs. reference.
    """
    if df is None:
        df = latent_df  # notebook global, kept for backward compatibility

    genotype = df[snp]

    def _values(mask):
        return df[mask][feature].dropna().to_numpy(dtype=np.float32)

    ref = _values(genotype == 0)
    het = _values(genotype == 1)
    hom = _values(genotype == 2)
    var = _values((genotype == 1) | (genotype == 2))
    print(f"Reference n={len(ref)}, Heterozygous n={len(het)}, Homozygous n={len(hom)}, Non-reference n={len(var)}")
    t_het_ref, p_het_ref = stats.ttest_ind(het, ref, equal_var=False)
    print(f"Ref v Het {feature}:\t\t T-Statistic = {t_het_ref:0.2f}, P-Value = {p_het_ref}")
    t_hom_ref, p_hom_ref = stats.ttest_ind(hom, ref, equal_var=False)
    print(f"Ref v Hom {feature}:\t\t T-Statistic = {t_hom_ref:0.2f}, P-Value = {p_hom_ref}")
    t_var_ref, p_var_ref = stats.ttest_ind(var, ref, equal_var=False)
    print(f"Ref v Var {feature}:\t\t T-Statistic = {t_var_ref:0.2f}, P-Value = {p_var_ref}\n")
    return {'T-test REF vs VAR ' + snp: (t_var_ref, p_var_ref)}
    
def plot_nested_dictionary(all_scores):
    """Draw one horizontal bar chart per metric, comparing models.

    Args:
        all_scores: nested mapping ``{model: {metric: scores}}`` where
            ``scores[0]`` sets the bar length (absolute value), ``scores[1]``
            is a p-value shown as bar colour via ``-log10``, and ``scores[2]``
            is printed as ``n`` in the subplot title.
    """
    eps = 1e-300  # keeps -log10 finite when a p-value is exactly 0
    n = 4
    for model in all_scores:
        n = max(n, len(all_scores[model]))
    cols = max(2, int(math.ceil(math.sqrt(n))))
    rows = max(2, int(math.ceil(n / cols)))
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 4, rows * 3), sharex=True)

    tstats_by_metric = defaultdict(dict)
    pvals_by_metric = defaultdict(dict)
    lens = {}
    max_pval = 0
    for model in all_scores:
        for metric in all_scores[model]:
            scores = all_scores[model][metric]
            tstats_by_metric[metric][model] = scores[0]
            pvals_by_metric[metric][model] = scores[1]
            lens[metric] = scores[2]
            max_pval = max(-np.log10(scores[1] + eps), max_pval)

    # Guard against division by zero when every p-value equals 1.
    color_scale = max_pval if max_pval > 0 else 1.0
    norm = plt.Normalize(vmin=0, vmax=color_scale)

    for metric, ax in zip(tstats_by_metric, axes.ravel()):
        models = sorted(tstats_by_metric[metric], key=lambda name: name.lower())
        tstats = [abs(tstats_by_metric[metric][m]) for m in models]
        pvalues = [-np.log10(pvals_by_metric[metric][m] + eps) for m in models]
        y_pos = np.arange(len(models))

        # Build the colour bar from a standalone mappable instead of drawing a
        # throw-away image with plt.imshow, which painted over whichever
        # subplot happened to be the current axes.
        mappable = cm.ScalarMappable(norm=norm, cmap=cm.jet)
        mappable.set_array([])
        cb = fig.colorbar(mappable, ax=ax, ticks=[0, color_scale])
        cb.set_label('Negative Log P-Value')
        cb.ax.set_yticklabels(['0', f'{max_pval:0.0f}'])

        ax.barh(y_pos, tstats, color=[cm.jet(p / color_scale) for p in pvalues], align='center')
        ax.set_yticks(y_pos)
        ax.set_yticklabels(models)
        ax.invert_yaxis()  # labels read top-to-bottom
        ax.set_xlabel('T–Statistic')
        ax.xaxis.set_tick_params(which='both', labelbottom=True)
        ax.set_title(f'{metric}\n n={lens[metric]}')

    plt.tight_layout()
    

def unit_vector(vector):
    """Scale ``vector`` by the reciprocal of its norm (``np.linalg.norm``)."""
    magnitude = np.linalg.norm(vector)
    return vector / magnitude

def angle_between(v1, v2):
    """ Returns the angle in degrees between vectors 'v1' and 'v2'::
            angle_between((1, 0, 0), (0, 1, 0))
            90.0
            angle_between((1, 0, 0), (1, 0, 0))
            0.0
            angle_between((1, 0, 0), (-1, 0, 0))
            180.0
    """
    cosine = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
    # Clip to guard arccos against rounding slightly outside [-1, 1];
    # np.degrees uses full-precision pi instead of a truncated constant.
    return np.degrees(np.arccos(np.clip(cosine, -1.0, 1.0)))

def get_phenotype_vector(stratify_column, stratify_thresh, stratify_std, latent_cols, latent_df):
    """Difference between the mean latent vectors of hit and miss rows.

    Args:
        stratify_column: column of ``latent_df`` used to split the rows.
        stratify_thresh, stratify_std: rows with value ``>= thresh + std`` are
            hits, rows with value ``< thresh - std`` are misses.
        latent_cols: latent column names.
        latent_df: DataFrame holding the latent and stratify columns.

    Returns:
        ``mean(hit) - mean(miss)`` over ``latent_cols`` (not normalised).
    """
    values = latent_df[stratify_column]
    hit = latent_df.loc[values >= stratify_thresh + stratify_std][latent_cols].to_numpy()
    miss = latent_df.loc[values < stratify_thresh - stratify_std][latent_cols].to_numpy()
    return np.mean(hit, axis=0) - np.mean(miss, axis=0)

In [None]:
SESSION_DIR = os.path.expanduser("~")  # downloaded data will be stored here
# NOTE(review): the two frames loaded in this cell are rebound by the later
# cell that downloads the `*_c3po_ecgs.pq` files, so apart from SESSION_DIR
# this cell looks redundant -- confirm before removing.
mgh_ecg_meta_path = download_s3_if_not_exists(
    bucket_name='2017P001650',
    bucket_path='csvs/06-25_explore_mgh.pq',
    local_dir=SESSION_DIR,
)
bwh_ecg_meta_path = download_s3_if_not_exists(
    bucket_name='2017P001650',
    bucket_path='csvs/08-16_explore_bwh.pq',
    local_dir=SESSION_DIR,
)
# load the wide file
mgh_ecg_meta = pd.read_parquet(mgh_ecg_meta_path)
bwh_ecg_meta = pd.read_parquet(bwh_ecg_meta_path)

In [None]:
# mgh_ecg_meta.info()
# bwh_ecg_meta.info()

In [None]:
# Fetch the per-site ECG metadata parquet files into SESSION_DIR (the helper's
# name suggests it skips files already present) and load them.
mgh_ecg_meta_path = download_s3_if_not_exists(
    local_dir=SESSION_DIR, bucket_name='2017P001650', bucket_path='csvs/mgh_c3po_ecgs.pq',
)
bwh_ecg_meta_path = download_s3_if_not_exists(
    local_dir=SESSION_DIR, bucket_name='2017P001650', bucket_path='csvs/bwh_c3po_ecgs.pq',
)
mgh_ecg_meta = pd.read_parquet(mgh_ecg_meta_path)
bwh_ecg_meta = pd.read_parquet(bwh_ecg_meta_path)

In [None]:
# Derive a date-only column from the `datetime` column of each metadata frame;
# it is compared against the phecode tables' ECG date further down.
mgh_ecg_meta['ecg_date'] = mgh_ecg_meta['datetime'].dt.date
bwh_ecg_meta['ecg_date'] = bwh_ecg_meta['datetime'].dt.date

In [None]:
# Load one phecode table per site. Later cells pass these frames through
# process_phecode_df and merge their 'linker_id' / 'partners_ecg_datetime'
# columns onto the latent frames.
df = pd.read_csv('./phecodes/PheCode_1016.txt', sep='\t')
bdf = pd.read_csv('./phecodes_bwh/PheCode_1016.txt', sep='\t')
#df.info()

In [None]:
# Load the MGH latent-space table and attach the linker table by MRN.
# presumably `links` supplies the LINKER_ID column used by later cells -- verify.
links = pd.read_csv('mrn_linker.txt', sep='\t')
# latent = pd.read_csv('mgh_drop_fuse_latent_space.csv')
# latent_df = pd.merge(latent, links, left_on='MGH_MRN_0', right_on='MRN', how='inner')

# Alternative model outputs are kept commented out; exactly one `lf` is active.
#lf='/home/samuel.friedman/trained_models/mgh_biosppy_median_autoencoder_256d_v2022_05_19/hidden_median_mgh_biosppy_median_autoencoder_256d_v2022_05_19.tsv'
lf='/home/samuel.friedman/trained_models/mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21/hidden_median_mgh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.tsv'
#lf='/home/samuel.friedman/trained_models/mgh_biosppy_median_60bpm_lead_I_autoencoder_256d_v2022_05_24/hidden_median_mgh_biosppy_median_60bpm_lead_I_autoencoder_256d_v2022_05_24.tsv'

#latent = pd.read_csv('./trained_models/ecg_2500_autoencoder_mgh_c3po_128d_v2021_12_17/mgh_latent_ecg_2500_autoencoder_mgh_c3po_128d_v2021_12_17.tsv', sep='\t')
#latent = pd.read_csv('./trained_models/mgh_ecg_rest_median_raw_10_autoencoder_256d_v2022_04_13/mgh_latent_mgh_ecg_rest_median_raw_10_autoencoder_256d_v2022_04_13.tsv', sep='\t')
#latent = pd.read_csv('./trained_models/mgh_ecg_rest_median_raw_10_lead_I_autoencoder_256d_v2022_04_09/mgh_latent_lead_I_mgh_ecg_rest_median_raw_10_lead_I_autoencoder_256d_v2022_04_09.tsv', sep='\t')

latent = pd.read_csv(lf, sep='\t')

# Inner merge: latent rows whose sample_id has no MRN match in `links` are dropped.
latent_df = pd.merge(latent, links, left_on='sample_id', right_on='MRN', how='inner')
latent_dimension = 256
# Column names latent_0 .. latent_{latent_dimension - 1}.
latent_cols = [f'latent_{i}' for i in range(latent_dimension)]
latent_df.info()

In [None]:
# Load the BWH latent-space table and attach the linker ids.
# Alternative model outputs are kept commented out; exactly one `lf` is active.
lf='/home/samuel.friedman/trained_models/bwh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21/hidden_median_bwh_biosppy_median_60bpm_autoencoder_256d_v2022_05_21.tsv'
#lf='/home/samuel.friedman/trained_models/bwh_biosppy_median_autoencoder_256d_v2022_05_19/hidden_median_bwh_biosppy_median_autoencoder_256d_v2022_05_19.tsv'
#lf='/home/samuel.friedman/trained_models/bwh_biosppy_median_60bpm_lead_I_autoencoder_256d_v2022_05_24/hidden_median_bwh_biosppy_median_60bpm_lead_I_autoencoder_256d_v2022_05_24.tsv'
#bwf_df = pd.read_csv('./trained_models/ecg_2500_autoencoder_mgh_c3po_128d_v2021_12_17/bwh_latent_ecg_2500_autoencoder_mgh_c3po_128d_v2021_12_17.tsv', sep='\t')
#bwf_df = pd.read_csv('./trained_models/mgh_ecg_lead_I_2500_std_autoencoder_v2022_03_28/bwh_latent_lead_I_mgh_ecg_lead_I_2500_std_autoencoder_v2022_03_28.tsv', sep='\t')
#bwf_df = pd.read_csv('./trained_models/mgh_ecg_rest_median_raw_10_autoencoder_256d_v2022_04_13/bwh_latent_mgh_ecg_rest_median_raw_10_autoencoder_256d_v2022_04_13.tsv', sep='\t')
#bwf_df = pd.read_csv('/home/samuel.friedman/trained_models/bwh_biosppy_median_ae_256d/hidden_median_bwh_biosppy_median_ae_256d.tsv', sep='\t')
bwf_df = pd.read_csv(lf, sep='\t')

bwh_links = pd.read_csv('hidden_inference_ecg_2500_hyperoptimized_autoencoder_mish_c3po_bwh_linked_dates.tsv', sep='\t')
bwf_df.info()
# Inner merge on the BWH MRN; the linker table's `id` column becomes LINKER_ID.
bwh_df = pd.merge(bwf_df, bwh_links[['id', 'BWH_MRN_0']], left_on='sample_id', right_on='BWH_MRN_0', how='inner')
bwh_df = bwh_df.rename(columns={"id": 'LINKER_ID'})
# Inspect the merged frame (the raw `bwf_df` was already summarised above).
bwh_df.info()

In [None]:
# Attach the phecode table's ECG datetime to the MGH latent frame.
# Left merge: latent rows without a matching linker_id are kept (with NaN).
df = process_phecode_df(df, 'phecode_name')
latent_df["LINKER_ID"] = latent_df["LINKER_ID"].astype(int)
latent_df = pd.merge(latent_df, df[['linker_id', 'partners_ecg_datetime']],  #, 'white', 'partners_ecg_age', 'sex'
                     left_on='LINKER_ID', right_on='linker_id', how='left')

In [None]:
# Display the BWH column names that do not contain 'latent'.
[c for c in bwh_df if 'latent' not in c]

In [None]:
# Display the MGH column names that do not contain 'latent'.
[c for c in latent_df if 'latent' not in c]

In [None]:
# Attach the phecode table's ECG datetime to the BWH latent frame.
# NOTE(review): this is an inner merge while the MGH counterpart uses
# how='left' -- confirm the asymmetry is intended.
bdf = process_phecode_df(bdf, 'phecode_name')
bwh_df["LINKER_ID"] = bwh_df["LINKER_ID"].astype(int)
bwh_df = pd.merge(bwh_df, bdf[['linker_id', 'partners_ecg_datetime']], #, 'white', 'partners_ecg_age', 'sex'
                     left_on='LINKER_ID', right_on='linker_id', how='inner')

In [None]:
# Summarise the BWH frame after the datetime merge.
bwh_df.info()

In [None]:
# Join the MGH latent frame with the ECG metadata on sample_id (inner join).
new_latent_df = pd.merge(latent_df, mgh_ecg_meta, left_on=['sample_id'], 
                    right_on=['sample_id'], how='inner')
# new_latent_df = pd.merge(latent_df, mgh_ecg_meta, left_on=['MRN'], 
#                      right_on=['sample_id'], how='inner')



In [None]:
# Cast both date columns to strings so that they can be compared for equality
# in the next cell, then preview them.
new_latent_df.ecg_date = new_latent_df.ecg_date.astype(str)
new_latent_df.partners_ecg_datetime = new_latent_df.partners_ecg_datetime.astype(str)
new_latent_df[['sample_id','ecg_date','partners_ecg_datetime', 'num_zeros']].head()

In [None]:
# Keep only rows whose metadata ECG date equals the phecode table's ECG date.
# `.copy()` makes the result an independent frame: a later cell adds a 'time'
# column to it, which would otherwise be an assignment on a filtered slice.
new_latent_df = new_latent_df[new_latent_df.ecg_date == new_latent_df.partners_ecg_datetime].copy()

keep_cols = [c for c in new_latent_df if 'latent' not in c]
new_latent_df[keep_cols].info()

In [None]:
# Numeric time covariate: the ECG date as a pandas Timestamp `.value`
# divided by 10**17.
new_latent_df['time'] = new_latent_df['ecg_date'].apply(lambda x: pd.to_datetime(x).value/10**17)

In [None]:
# Join the BWH latent frame with the ECG metadata on sample_id (inner join).
new_bwh_df = pd.merge(bwh_df, bwh_ecg_meta, left_on=['sample_id'], 
                     right_on=['sample_id'], how='inner')
new_bwh_df.info()


In [None]:
# Cast both date columns to strings and add the numeric time covariate.
# Unlike the MGH frame, the exact date-equality filter is disabled here; the
# BWH rows are instead filtered on `date_diff` in later cells.
new_bwh_df.ecg_date = new_bwh_df.ecg_date.astype(str)
new_bwh_df.partners_ecg_datetime = new_bwh_df.partners_ecg_datetime.astype(str)

#new_bwh_df = new_bwh_df[new_bwh_df.ecg_date == new_bwh_df.partners_ecg_datetime]
new_bwh_df['time'] = new_bwh_df['ecg_date'].apply(lambda x: pd.to_datetime(x).value/10**17)
new_bwh_df[['sample_id','ecg_date','partners_ecg_datetime', 'num_zeros']].head()

In [None]:
# date_diff = phecode-table ECG datetime minus metadata ECG date (a Timedelta).
new_bwh_df['date_diff'] = pd.to_datetime(new_bwh_df.partners_ecg_datetime) - pd.to_datetime(new_bwh_df.ecg_date)
new_bwh_df[['sample_id','ecg_date','partners_ecg_datetime', 'date_diff', 'num_zeros']].head()

In [None]:
# Convert the Timedelta column to whole days.
# Not idempotent: re-running this cell fails once the column is already integer.
new_bwh_df['date_diff'] = new_bwh_df['date_diff'].dt.days

In [None]:
# Summarise the BWH frame before the date_diff filter.
new_bwh_df.info()

In [None]:
# Keep rows where the phecode-table ECG datetime is at least one day after the
# metadata ECG date (date_diff is strictly positive).
# NOTE(review): the MGH frame keeps rows where the two dates are equal instead
# -- confirm that excluding date_diff == 0 here is intended.
new_bwh_df = new_bwh_df[new_bwh_df.date_diff > 0]

In [None]:
# Summarise the BWH frame after the date_diff filter.
new_bwh_df.info()

In [None]:
# One row per sample_id: the row with the smallest date_diff.
# `groupby(...).first()` takes the first NON-NULL value of every column
# separately, so a result row could mix values from different ECGs; selecting
# whole rows with drop_duplicates avoids that. The frame is still indexed and
# sorted by sample_id, as the groupby result was.
new_bwh_df2 = (
    new_bwh_df
    .sort_values('date_diff')
    .drop_duplicates(subset='sample_id', keep='first')
    .set_index('sample_id')
    .sort_index()
)

In [None]:
# Summarise the de-duplicated BWH frame.
new_bwh_df2.info()

In [None]:
# One row per sample_id for MGH: keep the first occurrence in current row order.
new_latent_df = new_latent_df.drop_duplicates(subset = ["sample_id"])

In [None]:
# Summarise both de-duplicated frames and list the BWH columns whose name
# does not contain 'latent'.
new_latent_df.info()
new_bwh_df2.info()
print([c for c in new_bwh_df2.columns if 'latent' not in c])

In [None]:
# Number of MGH rows after de-duplication.
len(new_latent_df)

In [None]:
# Reproducible 50/50 split (fixed random_state); the second half is whatever
# index labels were not sampled into the first half.
train = new_latent_df.sample(frac=0.5, random_state=1234)
test = new_latent_df.drop(train.index)

train_bwh = new_bwh_df2.sample(frac=0.5, random_state=1234)
test_bwh = new_bwh_df2.drop(train_bwh.index)

In [None]:
# List the BWH hold-out columns whose name does not contain 'latent'.
print([c for c in test_bwh.columns if 'latent' not in c])

In [None]:
# For every column whose name contains 'zeros', count the MGH rows whose value
# lies strictly between `low` and `high`, and report the share of all rows.
print(f'MGH Latent Space Zero Table total: {len(new_latent_df)}')
for low, high, label in [(0, 250, '7.5-10s'), (250, 500, '2.5-7.5s'), (500, 17500, '0-2.5s')]:
    for c in new_latent_df:
        if 'zeros' not in c:
            continue
        n = int(((new_latent_df[c] > low) & (new_latent_df[c] < high)).sum())
        print(f'For lead: {c} n: {n}   time no zeros {label}  Ratio: {n/len(new_latent_df):0.2f}')

In [None]:
# For every column whose name contains 'zeros', count the BWH rows whose value
# lies strictly between `low` and `high`, and report the share of all rows.
print(f'BWH Latent Space Zero Table total: {len(new_bwh_df2)}')
for low, high, label in [(0, 250, '7.5-10s'), (250, 500, '2.5-7.5s'), (500, 17500, '0-2.5s')]:
    for c in new_bwh_df2:
        if 'zeros' not in c:
            continue
        n = int(((new_bwh_df2[c] > low) & (new_bwh_df2[c] < high)).sum())
        print(f'For lead: {c} n: {n}   time no zeros {label}  Ratio: {n/len(new_bwh_df2):0.2f}')


In [None]:
# Number of BWH rows after de-duplication.
len(new_bwh_df2)

In [None]:
# keep_cols = [ 'BWH_MRN_0', 'linker_id', 'partners_ecg_datetime', 'white', 'partners_ecg_age', 'sex', 
#              'num_zeros', 'gender', 'patientage', 'ventricularrate_md', 'qrsduration_md', 'printerval_md',
#              'qtinterval_md', 'paxis_md', 'raxis_md', 'taxis_md', 'weightlbs', 'heightin', 'ecg_date', 'time']
# new_bwh_df2[keep_cols].to_csv('bwh_phewas_covariates.tsv', index=False, sep='\t')
# train_bwh[keep_cols].to_csv('bwh_derive_set_phewas_covariates.tsv', index=False, sep='\t')
# test_bwh[keep_cols].to_csv('bwh_validation_set_phewas_covariates.tsv', index=False, sep='\t')

In [None]:
# List the BWH hold-out columns whose name does not contain 'latent'
# (same output as the identical cell above).
print([c for c in test_bwh.columns if 'latent' not in c])

In [None]:
# Write every MGH column whose name does not contain 'latent' to a TSV file
# in the working directory (overwrites the file on every run).
# keep_cols = ['MRN', 'linker_id', 'partners_ecg_datetime', 'white', 'partners_ecg_age', 'sex', 
#              'num_zeros', 'gender', 'patientage', 'ecg_date', 'time']
keep_cols = [c for c in new_latent_df if 'latent' not in c]
new_latent_df[keep_cols].to_csv('mgh_phewas_covariates_v2022_04_04.tsv', index=False, sep='\t')
# train[keep_cols].to_csv('mgh_derive_set_phewas_covariates.tsv', index=False, sep='\t')
# test[keep_cols].to_csv('mgh_validation_set_phewas_covariates.tsv', index=False, sep='\t')

In [None]:
# Latent-space PheWAS within MGH: phenotype vectors are derived on `train`
# and the regressions are evaluated on `test`.
# min_cases=1.1 is above 1, so it acts as an absolute case-count threshold
# inside merge_and_stratify_phecode_file.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
latent_dimension = 256
latent_cols = [f'latent_{i}' for i in range(latent_dimension)]
mgh_pvals, mgh_betas, mgh_ses, mgh_counts, mgh_vectors, mgh_auc1, mgh_auc2 = merge_and_stratify_by_code_folder(
    train, 
    test, 
    latent_cols, 
    adjust_cols, './drop_fuse_phecode_projections2/',
    phe_folder='./phecodes/', 
    test_phe_folder='./phecodes/', 
    min_cases=1.1,
    permute=False)

In [None]:
# Cross-site PheWAS: phenotype vectors are derived on the MGH `train` half and
# the regressions are evaluated on the full de-duplicated BWH frame, using the
# BWH phecode folder for the evaluation labels.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
bwh_pvals, bwh_betas, bwh_ses, bwh_counts, bwh_vectors, bwh_auc1, bwh_auc2 = merge_and_stratify_by_code_folder(
    train, 
    new_bwh_df2,
    latent_cols, 
    adjust_cols,
    './drop_fuse_phecode_projections2/',
    phe_folder='./phecodes/',
    test_phe_folder='./phecodes_bwh/',
    min_cases=1.1, 
    permute=False)

In [None]:
# Load the MGH ECG table and parse its datetime column so that it can be
# used as a merge key in a later cell.
mgh_intervals = pd.read_csv('all_ecgs_xin_20211217.csv')
mgh_intervals.partners_ecg_datetime = pd.to_datetime(mgh_intervals.partners_ecg_datetime)
mgh_intervals.head()

In [None]:
# Display all column names of the interval table.
[c for c in mgh_intervals]

In [None]:
# Attach the four `partners_ecg_*_md` columns to the MGH hold-out rows,
# matching on LINKER_ID and the exact ECG datetime.
# NOTE(review): this rebinds `test`, so re-running the cell merges a second
# time; and both sides carry a 'partners_ecg_datetime' column, so pandas will
# suffix them -- confirm downstream cells use the intended one.
test = pd.merge(test, mgh_intervals[['LINKER_ID', 'partners_ecg_datetime', 'partners_ecg_qrs_md',
                                     'partners_ecg_qt_md', 'partners_ecg_rate_md', 'partners_ecg_pr_md',
                                    ]], 
                right_on =['LINKER_ID', 'partners_ecg_datetime'],
                left_on =['LINKER_ID', 'datetime']
               )

In [None]:
# Display the hold-out column names that do not contain 'latent'.
[c for c in test if 'latent' not in c]

In [None]:
# Single-feature PheWAS on MGH using the `partners_ecg_qrs_md` column.
latent_cols = ['partners_ecg_qrs_md']
test[latent_cols] = test[latent_cols].astype(float)
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
# merge_and_stratify_by_code_folder returns seven dicts
# (p_vals, betas, ses, counts, vectors, auc1, auc2); unpacking into five names
# raised ValueError, so the unused trailing ones are collected into `_`.
mgh_qrs_pvals, mgh_qrs_betas, mgh_qrs_ses, mgh_qrs_counts, *_ = merge_and_stratify_by_code_folder(
    train, test, latent_cols, adjust_cols,
    './drop_fuse_phecode_projections2/',
    phe_folder='./phecodes/',
    test_phe_folder='./phecodes/',
    min_cases=1.1, permute=False)

In [None]:
# Single-feature PheWAS on MGH using the `partners_ecg_qt_md` column.
latent_cols = ['partners_ecg_qt_md']
test[latent_cols] = test[latent_cols].astype(float)
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
# merge_and_stratify_by_code_folder returns seven dicts
# (p_vals, betas, ses, counts, vectors, auc1, auc2); unpacking into five names
# raised ValueError, so the unused trailing ones are collected into `_`.
mgh_qt_pvals, mgh_qt_betas, mgh_qt_ses, mgh_qt_counts, *_ = merge_and_stratify_by_code_folder(
    train, test, latent_cols, adjust_cols,
    './drop_fuse_phecode_projections2/',
    phe_folder='./phecodes/',
    test_phe_folder='./phecodes/',
    min_cases=1.1, permute=False)

In [None]:
# Single-feature PheWAS on MGH using the `partners_ecg_pr_md` column.
latent_cols = ['partners_ecg_pr_md']
test[latent_cols] = test[latent_cols].astype(float)
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
# merge_and_stratify_by_code_folder returns seven dicts
# (p_vals, betas, ses, counts, vectors, auc1, auc2); unpacking into five names
# raised ValueError, so the unused trailing ones are collected into `_`.
mgh_pr_pvals, mgh_pr_betas, mgh_pr_ses, mgh_pr_counts, *_ = merge_and_stratify_by_code_folder(
    train, test, latent_cols, adjust_cols,
    './drop_fuse_phecode_projections2/',
    phe_folder='./phecodes/',
    test_phe_folder='./phecodes/',
    min_cases=1.1, permute=False)

In [None]:
# Single-feature PheWAS on BWH using the `qrsduration_md` column.
latent_cols = ['qrsduration_md']
new_bwh_df2[latent_cols] = new_bwh_df2[latent_cols].astype(float)
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
# merge_and_stratify_by_code_folder returns seven dicts
# (p_vals, betas, ses, counts, vectors, auc1, auc2); unpacking into five names
# raised ValueError, so the unused trailing ones are collected into `_`.
bwh_qrs_pvals, bwh_qrs_betas, bwh_qrs_ses, bwh_qrs_counts, *_ = merge_and_stratify_by_code_folder(
    train, new_bwh_df2, latent_cols, adjust_cols,
    './drop_fuse_phecode_projections2/',
    phe_folder='./phecodes/',
    test_phe_folder='./phecodes_bwh/',
    min_cases=1.1, permute=False)

In [None]:
# Single-feature PheWAS on BWH using the `qtinterval_md` column.
latent_cols = ['qtinterval_md']
new_bwh_df2[latent_cols] = new_bwh_df2[latent_cols].astype(float)
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
# merge_and_stratify_by_code_folder returns seven dicts
# (p_vals, betas, ses, counts, vectors, auc1, auc2); unpacking into five names
# raised ValueError, so the unused trailing ones are collected into `_`.
bwh_qt_pvals, bwh_qt_betas, bwh_qt_ses, bwh_qt_counts, *_ = merge_and_stratify_by_code_folder(
    train, new_bwh_df2, latent_cols, adjust_cols,
    './drop_fuse_phecode_projections2/',
    phe_folder='./phecodes/',
    test_phe_folder='./phecodes_bwh/',
    min_cases=1.1, permute=False)

In [None]:
# Single-feature PheWAS on BWH using the `printerval_md` column.
latent_cols = ['printerval_md']
new_bwh_df2[latent_cols] = new_bwh_df2[latent_cols].astype(float)
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
# merge_and_stratify_by_code_folder returns seven dicts
# (p_vals, betas, ses, counts, vectors, auc1, auc2); unpacking into five names
# raised ValueError, so the unused trailing ones are collected into `_`.
bwh_pr_pvals, bwh_pr_betas, bwh_pr_ses, bwh_pr_counts, *_ = merge_and_stratify_by_code_folder(
    train, new_bwh_df2, latent_cols, adjust_cols,
    './drop_fuse_phecode_projections2/',
    phe_folder='./phecodes/',
    test_phe_folder='./phecodes_bwh/',
    min_cases=1.1, permute=False)

In [None]:
# PheWAS over a 128-dimensional latent space, with `train` and `test` as the two frames
# and the same PheCode folder ('./phecodes/') for both phe_folder and test_phe_folder.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
latent_dimension = 128
# Column names latent_0 .. latent_127; assumes both frames carry these columns — TODO confirm.
latent_cols = [f'latent_{i}' for i in range(latent_dimension)]
# All five return values are kept, including the per-phecode vectors.
mgh_10s_pvals, mgh_10s_betas, mgh_10s_ses, mgh_10s_counts, mgh_10s_vectors = merge_and_stratify_by_code_folder(train, test, 
                                                                                           latent_cols, 
                                                 adjust_cols, './drop_fuse_phecode_projections2/',
                                                 phe_folder='./phecodes/', 
                                                 test_phe_folder='./phecodes/', 
                                                 min_cases=100,
                                                 permute=False)

In [None]:
# Lead-I PheWAS over a 256-dimensional latent space, with `train` and `test` as the two frames.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
latent_dimension = 256
# Rebuild the column list for this dimensionality. Without this line the call silently
# reuses whatever `latent_cols` an earlier cell left in the kernel, and the
# `latent_dimension` assignment above has no effect.
# Assumes `train` and `test` carry latent_0 .. latent_255 — TODO confirm.
latent_cols = [f'latent_{i}' for i in range(latent_dimension)]
mgh_lead_I_pvals, mgh_lead_I_betas, mgh_lead_I_ses, mgh_lead_I_counts, mgh_lead_I_vectors = merge_and_stratify_by_code_folder(train, test, 
                                                                                           latent_cols, 
                                                 adjust_cols, './drop_fuse_phecode_projections2/',
                                                 phe_folder='./phecodes/', 
                                                 test_phe_folder='./phecodes/', 
                                                 min_cases=1.1,
                                                 permute=False)

In [None]:
# Lead-I PheWAS over a 256-dimensional latent space, with `train` as the first frame and
# `new_bwh_df2` as the second; the two frames read PheCodes from different folders.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
latent_dimension = 256
# Column names latent_0 .. latent_255; assumes both frames carry these columns — TODO confirm.
latent_cols = [f'latent_{i}' for i in range(latent_dimension)]
bwh_lead_I_pvals, bwh_lead_I_betas, bwh_lead_I_ses, bwh_lead_I_counts, bwh_lead_I_vectors = merge_and_stratify_by_code_folder(train, new_bwh_df2, 
                                                                                           latent_cols, adjust_cols, 
                                                          './drop_fuse_phecode_projections2/',
                                                          phe_folder='./phecodes/', 
                                                          test_phe_folder='./phecodes_bwh/', 
                                                          min_cases=1.1, permute=False)

In [None]:
# adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
# bwh_wmgh_pvals, bwh_wmgh_betas, bwh_wmgh_ses, bwh_wmgh_counts, bwh_wmgh_vectors= merge_and_stratify_by_code_folder(train_bwh, test, latent_cols, adjust_cols, 
#                                                           './drop_fuse_phecode_projections2/',
#                                                           phe_folder='./phecodes_bwh/', 
#                                                           test_phe_folder='./phecodes/', 
#                                                           min_cases=100, permute=False)



In [None]:
# BWH-only PheWAS: both frames (train_bwh, test_bwh) and both PheCode folders are BWH.
# NOTE(review): `latent_cols` is not assigned in this cell — it is whatever an earlier
# cell last left in the kernel. Set it explicitly here so the run is reproducible.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
bwh_pvals, bwh_betas, bwh_ses, bwh_counts, bwh_vectors = merge_and_stratify_by_code_folder(train_bwh, test_bwh, 
                                                                                           latent_cols, adjust_cols, 
                                                          './drop_fuse_phecode_projections2/',
                                                          phe_folder='./phecodes_bwh/', 
                                                          test_phe_folder='./phecodes_bwh/', 
                                                          min_cases=100, permute=False)

In [None]:
# Label-noise experiment with permute=0.1; the whole return tuple is kept unpacked-later
# (a later cell reads element [0] as the p-value dict).
# NOTE(review): presumably `permute` is the fraction of labels shuffled — confirm against
# merge_and_stratify_by_code_folder. `latent_cols` is inherited from an earlier cell.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
permuted_ten = merge_and_stratify_by_code_folder(train, test, latent_cols, adjust_cols,
                                             './drop_fuse_phecode_projections2/',
                                                          phe_folder='./phecodes/', 
                                                          test_phe_folder='./phecodes/', 
                                                          min_cases=100, permute=0.1)

In [None]:
# Label-noise experiment with permute=0.2; the whole return tuple is stored.
# NOTE(review): presumably `permute` is the fraction of labels shuffled — confirm against
# merge_and_stratify_by_code_folder. `latent_cols` is inherited from an earlier cell.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
permuted_twenty = merge_and_stratify_by_code_folder(train, test, latent_cols, adjust_cols,
                                             './drop_fuse_phecode_projections2/',
                                                          phe_folder='./phecodes/', 
                                                          test_phe_folder='./phecodes/', 
                                                          min_cases=100, permute=0.2)

In [None]:
# Baseline for the label-noise experiment: permute=0.0; the whole return tuple is stored.
# NOTE(review): `latent_cols` is inherited from an earlier cell rather than set here.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
permuted_none = merge_and_stratify_by_code_folder(train, test, latent_cols, adjust_cols,
                                             './drop_fuse_phecode_projections2/',
                                                          phe_folder='./phecodes/', 
                                                          test_phe_folder='./phecodes/', 
                                                          min_cases=100, permute=0.0)

In [None]:
# Label-noise experiment with permute=0.5; the whole return tuple is stored.
# NOTE(review): presumably `permute` is the fraction of labels shuffled — confirm against
# merge_and_stratify_by_code_folder. `latent_cols` is inherited from an earlier cell.
adjust_cols = ['partners_ecg_age', 'age_sqr', 'sex_int', 'white', 'num_zeros', 'time']
permuted_fifty = merge_and_stratify_by_code_folder(train, test, latent_cols, adjust_cols,
                                             './drop_fuse_phecode_projections2/',
                                                          phe_folder='./phecodes/', 
                                                          test_phe_folder='./phecodes/', 
                                                          min_cases=100, permute=0.5)

In [None]:
# PheCode definition table; other cells read its `phecode`, `phenotype` and `category` columns.
phecode_meta = pd.read_csv('phecode_definitions.csv')
# Give every category an integer id following value_counts() order
# (pandas sorts counts descending by default, so the most frequent category gets 0).
cat_colors = dict()
for i, k in enumerate(phecode_meta['category'].value_counts().index):
    cat_colors.update({k: i})

In [None]:
def phecode_dicts(pval_dict):
    """Group PheWAS p-values and phenotype names by PheCode category.

    Keys of ``pval_dict`` look like ``'phe_401_1'``; the ``'phe_'`` prefix is dropped and
    underscores become decimal points to obtain the numeric PheCode, which is looked up
    in the module-level ``phecode_meta`` table. Entries are visited in ascending p-value
    order, so each per-category list comes out sorted from most to least significant.
    Only rows whose category is a key of the module-level ``cat_colors`` are kept.

    Progress is printed for every PheCode visited.

    Args:
        pval_dict: mapping of PheCode key (``'phe_<code>'``) to p-value.

    Returns:
        ``(categories, categories_text)``: two ``defaultdict(list)`` objects keyed by
        category name, holding the p-values and the matching phenotype names.

    Raises:
        IndexError: if a PheCode has no row in ``phecode_meta``.
    """
    pvals_by_category = defaultdict(list)
    names_by_category = defaultdict(list)

    ranked = sorted(pval_dict.items(), key=lambda item: item[1])
    for phe, pval in ranked:
        code_text = phe.replace('phe_', '').replace('_', '.')
        print(f"phe is {code_text}")
        matches = phecode_meta[phecode_meta.phecode == float(code_text)]
        row = matches.iloc[0]
        if row.category not in cat_colors:
            continue
        pvals_by_category[row.category].append(pval)
        names_by_category[row.category].append(row.phenotype)
        print(f'category: {row.category}\n phenotype: {row.phenotype}  phecode is {phe} and pvalue {pval:0.4E}\n' )
    return pvals_by_category, names_by_category

In [None]:
def write_phewas(pvals, betas, ses, counts, vectors, auc1, auc2, file_name, latent_dim=0):
    """Write PheWAS results to a CSV file, one row per PheCode, most significant first.

    Each key of ``pvals`` (``'phe_<code>'`` with underscores standing in for the decimal
    point) is looked up in the module-level ``phecode_meta`` table; rows whose category
    is not a key of the module-level ``cat_colors`` are left out of the file.

    Args:
        pvals: mapping PheCode key -> p-value; determines row set and order.
        betas, ses, counts: mappings PheCode key -> value, written as ``beta``, ``se``, ``n``.
        vectors: mapping PheCode key -> indexable vector; only read when ``latent_dim > 0``.
        auc1, auc2: mappings PheCode key -> value, written as ``AUC_with_ECG`` and
            ``AUC_no_ECG`` respectively.
        file_name: destination path handed to ``DataFrame.to_csv``.
        latent_dim: number of leading vector components to append as ``pv_0 .. pv_{n-1}``.

    Returns:
        None. The CSV is written without the index column.

    Raises:
        IndexError: if a PheCode has no row in ``phecode_meta``.
        KeyError: if a kept PheCode is missing from one of the per-phecode mappings.
    """
    records = []
    for phe, pval in sorted(pvals.items(), key=lambda item: item[1]):
        code = float(phe.replace('phe_', '').replace('_', '.'))
        meta = phecode_meta[phecode_meta.phecode == code].iloc[0]
        if meta.category not in cat_colors:
            continue
        record = [meta.phecode, phe, pval, meta.phenotype, meta.category,
                  counts[phe], betas[phe], ses[phe], auc1[phe], auc2[phe]]
        # Indexed one component at a time so `vectors` is never touched when latent_dim == 0.
        for i in range(latent_dim):
            record.append(vectors[phe][i])
        records.append(tuple(record))

    column_names = ['phecode', 'phecode_text', 'p_value', 'phenotype', 'category',
                    'n', 'beta', 'se', 'AUC_with_ECG', 'AUC_no_ECG']
    column_names += [f'pv_{i}' for i in range(latent_dim)]
    pd.DataFrame(records, columns=column_names).to_csv(file_name, index=False)

In [None]:
# Export the MGH and BWH result dicts to dated CSV files (second call also writes 256 vector columns).
# NOTE(review): write_phewas as defined in this notebook takes `auc1` and `auc2` before
# `file_name`; these calls pass neither, so the path lands in the `auc1` slot and the call
# fails with a missing-argument TypeError unless an older definition is active in the
# kernel. Update the calls or delete the cell.
write_phewas(mgh_pvals, mgh_betas, mgh_ses, mgh_counts, mgh_vectors, 
             './phewas_lead_I_mgh_with_mgh_v2022_05_25.csv')
write_phewas(mgh_pvals, mgh_betas, mgh_ses, mgh_counts, mgh_vectors,
             './phewas_lead_I_mgh_with_mgh_plus_vectors_v2022_05_25.csv', 
             latent_dim=256)
write_phewas(bwh_pvals, bwh_betas, bwh_ses, bwh_counts, bwh_vectors, 
             './phewas_lead_I_bwh_with_mgh_v2022_05_25.csv')

In [None]:
# Export the MGH results including both AUC dicts; latent_dim is left at 0, so no
# vector columns are written and `mgh_vectors` is not read.
# NOTE(review): the mgh_* inputs are not assigned anywhere in this part of the notebook.
write_phewas(mgh_pvals, mgh_betas, mgh_ses, mgh_counts, mgh_vectors, mgh_auc1, mgh_auc2,
             './phewas_mgh_with_mgh_v2022_11_30.csv')

In [None]:
# Export the BWH results including both AUC dicts; latent_dim is left at 0, so no
# vector columns are written and `bwh_vectors` is not read.
# NOTE(review): bwh_auc1 / bwh_auc2 are not assigned anywhere in this part of the notebook.
write_phewas(bwh_pvals, bwh_betas, bwh_ses, bwh_counts, bwh_vectors, bwh_auc1, bwh_auc2, 
             './phewas_bwh_with_mgh_v2022_11_30.csv')

In [None]:
# Export the three BWH ECG-interval PheWAS results (QRS, QT, PR).
# `_` is passed as the vectors argument: it is whatever the last `..., _ = ...` unpacking
# left behind, and is never read because latent_dim stays 0.
# NOTE(review): write_phewas as defined in this notebook takes `auc1` and `auc2` before
# `file_name`; these calls omit both and would raise a missing-argument TypeError.
# NOTE(review): the PR-interval results are written to a file named '..._pq.csv'.
write_phewas(bwh_qrs_pvals, bwh_qrs_betas, bwh_qrs_ses, bwh_qrs_counts, _, 
             './phewas_bwh_qrs.csv')
write_phewas(bwh_qt_pvals, bwh_qt_betas, bwh_qt_ses, bwh_qt_counts, _, 
             './phewas_bwh_qt.csv')
write_phewas(bwh_pr_pvals, bwh_pr_betas, bwh_pr_ses, bwh_pr_counts, _, 
             './phewas_bwh_pq.csv')

In [None]:
# Export the MGH QT- and PR-interval PheWAS results.
# NOTE(review): the mgh_qt_* / mgh_pr_* inputs are not assigned anywhere in this part of
# the notebook, and `_` depends on whichever unpacking ran last.
# NOTE(review): write_phewas as defined in this notebook takes `auc1` and `auc2` before
# `file_name`; these calls omit both and would raise a missing-argument TypeError.
write_phewas(mgh_qt_pvals, mgh_qt_betas, mgh_qt_ses, mgh_qt_counts, _, 
             './phewas_mgh_qt.csv')
write_phewas(mgh_pr_pvals, mgh_pr_betas, mgh_pr_ses, mgh_pr_counts, _, 
             './phewas_mgh_pq.csv')

In [None]:
# Export the lead-I MGH results without vector columns.
# NOTE(review): write_phewas as defined in this notebook takes `auc1` and `auc2` before
# `file_name`; this call omits both and would raise a missing-argument TypeError.
write_phewas(mgh_lead_I_pvals, mgh_lead_I_betas, mgh_lead_I_ses, mgh_lead_I_counts, mgh_lead_I_vectors, 
             './phewas_mgh_with_mgh_lead_I_v2022_04_12.csv')

In [None]:
# Export the lead-I MGH results plus the first 256 components of each phecode vector.
# NOTE(review): write_phewas as defined in this notebook takes `auc1` and `auc2` before
# `file_name`; this call omits both and would raise a missing-argument TypeError.
write_phewas(mgh_lead_I_pvals, mgh_lead_I_betas, mgh_lead_I_ses, mgh_lead_I_counts, mgh_lead_I_vectors,
             './phewas_mgh_with_mgh_lead_I_plus_vectors_v2022_04_12.csv', 
             latent_dim=256)

In [None]:
# Export the lead-I BWH results without vector columns.
# NOTE(review): write_phewas as defined in this notebook takes `auc1` and `auc2` before
# `file_name`; this call omits both and would raise a missing-argument TypeError.
write_phewas(bwh_lead_I_pvals, bwh_lead_I_betas, bwh_lead_I_ses, bwh_lead_I_counts, bwh_lead_I_vectors, 
             './phewas_bwh_with_mgh_lead_I_v2022_04_12.csv')

In [None]:
# Build (categories, categories_text) pairs for the four label-noise runs.
# Order matters: later cells index permed[0..3] as none / ten / twenty / fifty.
permed = []
for pvals in [permuted_none, permuted_ten, permuted_twenty, permuted_fifty]:
    # Each permuted_* is the full return tuple; element 0 is passed as the p-value dict.
    permed.append(phecode_dicts(pvals[0]))

In [None]:
# Per-category p-values and phenotype names for the BWH and MGH runs (inputs to the QQ plots).
bwh_cat, bwh_text = phecode_dicts(bwh_pvals)
mgh_cat, mgh_text = phecode_dicts(mgh_pvals)
# NOTE(review): the names below are used by QQ-plot cells further down but are only
# assigned in these commented-out lines; restore them or drop the cells that use them.
#bwh_lead_I_cat, bwh_lead_I_text = phecode_dicts(bwh_lead_I_pvals)
#mgh_lead_I_cat, mgh_lead_I_text = phecode_dicts(mgh_lead_I_pvals)

#mgh_10s_cat, mgh_10s_text = phecode_dicts(mgh_10s_pvals)
#val_bwh_cat, val_bwh_text = phecode_dicts(adjusted_bwh_from_mgh)
#less_val_bwh_cat, less_val_bwh_text = phecode_dicts(less_adjusted_bwh_from_mgh)

In [None]:
def qq_plot_theoretical(categories, text, title='QQ Plot', p_thresh=4):
    """Draw per-category QQ plots of PheWAS p-values side by side on one axis.

    Every category with at least three p-values gets its own segment of the x axis,
    placed to the right of the previous one. Within a segment, the i-th smallest of
    ``n`` p-values is plotted at x = -log10(i / n) (the expectation under a uniform
    null) against y = -log10(p), together with the identity reference line. Points
    whose observed -log10(p) exceeds ``p_thresh`` are labelled; a label whose bounding
    box overlaps one placed earlier is hidden.

    Args:
        categories: mapping category name -> sequence of p-values. The values need not
            be pre-sorted; they are ranked here together with their labels.
        text: mapping category name -> sequence of labels parallel to ``categories[name]``.
        title: axes title.
        p_thresh: annotate points with observed -log10(p) strictly above this value.

    Returns:
        None. The figure is created through pyplot and left for the active backend to show.
    """
    fig, ax = plt.subplots(figsize=(18, 10), dpi=300)

    annotations = []
    x_offsets = [0]
    x_labels = []
    for k in sorted(categories):
        pvals = np.asarray(categories[k], dtype=float)
        n = len(pvals)
        if n < 3:
            continue
        # Rank the p-values (stable, ascending) so that point `rank` and its label stay
        # paired even when the caller did not pre-sort the inputs.
        order = np.argsort(pvals, kind='stable')
        # 1e-300 keeps -log10 finite for p-values that underflowed to 0.
        neglog10p = -np.log10(pvals[order] + 1e-300)
        # Integer ranks give exactly n quantiles; a float-step arange can be off by one.
        expected = -np.log10(np.arange(1, n + 1) / n) + x_offsets[-1]
        ax.scatter(expected, neglog10p, label=k)
        # Identity line for this segment: from (offset, 0) up to (offset + log10(n), log10(n)).
        ax.plot([x_offsets[-1], expected[0]], [0, expected[0] - x_offsets[-1]])
        labels = text[k]
        for rank, src in enumerate(order):
            if src < len(labels) and neglog10p[rank] > p_thresh:
                annotations.append(ax.annotate(labels[src], xy=(expected[rank], neglog10p[rank])))
        x_offsets.append(expected[0] + 0.2)
        x_labels.append(k)
    print(f'Total hits: {len(annotations)} at thresh: {p_thresh}')
    ax.set_xticks(x_offsets[:-1])
    ax.set_xticklabels(x_labels, rotation=30, ha='right')
    ax.legend()
    ax.set_title(title)
    ax.set_xlabel('Expected -log10(P_value) per PheCode category')
    ax.set_ylabel('Observed -log10(P_value) per PheCode category')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Annotation extents are only known after a draw.
    fig.canvas.draw()
    width, height = fig.canvas.get_width_height()
    occupied = np.zeros((width, height), bool)  # pixel mask of already-placed labels
    renderer = fig.canvas.get_renderer()
    for a in annotations:
        bbox = a.get_window_extent(renderer=renderer)
        if not all(math.isfinite(edge) for edge in (bbox.x0, bbox.x1, bbox.y0, bbox.y1)):
            continue
        # Clamp to the canvas: a negative start index would otherwise wrap around in
        # numpy slicing and the label would never be registered in the mask.
        x0 = max(int(math.floor(bbox.x0)), 0)
        x1 = min(int(math.ceil(bbox.x1)), width - 1)
        y0 = max(int(math.floor(bbox.y0)), 0)
        y1 = min(int(math.ceil(bbox.y1)), height - 1)
        if x0 > x1 or y0 > y1:
            continue  # entirely off-canvas; nothing to collide with

        region = np.s_[x0:x1 + 1, y0:y1 + 1]
        if np.any(occupied[region]):
            a.set_visible(False)
        else:
            occupied[region] = True

In [None]:
# QQ plots for the MGH and BWH runs (Biosppy median ECG autoencoder titles).
# NOTE(review): the next cell in the notebook is an exact copy of this one.
qq_plot_theoretical(mgh_cat, mgh_text,  'MGH Phenotype Vectors PheWAS in MGH, C3PO MGH Trained Biosppy Median ECG Autoencoder Latent Space')
qq_plot_theoretical(bwh_cat, bwh_text,  'MGH Phenotype Vectors PheWAS in BWH, C3PO MGH Trained Biosppy Median ECG Autoencoder Latent Space')


In [None]:
# QQ plots for the MGH and BWH runs (Biosppy median ECG autoencoder titles).
# NOTE(review): exact copy of the preceding cell — one of the two can be deleted.
qq_plot_theoretical(mgh_cat, mgh_text,  'MGH Phenotype Vectors PheWAS in MGH, C3PO MGH Trained Biosppy Median ECG Autoencoder Latent Space')
qq_plot_theoretical(bwh_cat, bwh_text,  'MGH Phenotype Vectors PheWAS in BWH, C3PO MGH Trained Biosppy Median ECG Autoencoder Latent Space')


In [None]:
# Same two plots with titles that omit "Biosppy"; the data arguments are unchanged,
# so only the title differs from the cells above.
qq_plot_theoretical(mgh_cat, mgh_text,  'MGH Phenotype Vectors PheWAS in MGH, C3PO MGH Trained Median ECG Autoencoder Latent Space')
qq_plot_theoretical(bwh_cat, bwh_text,  'MGH Phenotype Vectors PheWAS in BWH, C3PO MGH Trained Median ECG Autoencoder Latent Space')



In [None]:
# QQ plots for the lead-I runs.
# NOTE(review): the only visible assignments of mgh_lead_I_cat / bwh_lead_I_cat (and their
# *_text partners) are commented out in the phecode_dicts cell, so a fresh Run All
# would raise NameError here.
qq_plot_theoretical(mgh_lead_I_cat, mgh_lead_I_text, 
                    'Lead I MGH Phenotype Vectors PheWAS in MGH, MGH Lead I Median Trained ECG Autoencoder Latent Space')
qq_plot_theoretical(bwh_lead_I_cat, bwh_lead_I_text, 
                    'Lead I MGH Phenotype Vectors PheWAS in BWH, MGH Lead I Median Trained ECG Autoencoder Latent Space')

In [None]:
# QQ plot of the BWH run under a "BWH Phenotype Vectors" title.
# NOTE(review): the next cell in the notebook is an exact copy of this one.
qq_plot_theoretical(bwh_cat, bwh_text, 
                    'BWH Phenotype Vectors PheWAS in BWH, MGH Biosppy Median Trained ECG Autoencoder Latent Space')

In [None]:
# QQ plot of the BWH run under a "BWH Phenotype Vectors" title.
# NOTE(review): exact copy of the preceding cell — one of the two can be deleted.
qq_plot_theoretical(bwh_cat, bwh_text, 
                    'BWH Phenotype Vectors PheWAS in BWH, MGH Biosppy Median Trained ECG Autoencoder Latent Space')

In [None]:
# QQ plot of the MGH run; same data and title as the MGH call in the two-plot "Median" cell above.
qq_plot_theoretical(mgh_cat, mgh_text,  'MGH Phenotype Vectors PheWAS in MGH, C3PO MGH Trained Median ECG Autoencoder Latent Space')


In [None]:
# QQ plot for the 128-d MGH run.
# NOTE(review): the only visible assignment of mgh_10s_cat / mgh_10s_text is commented
# out in the phecode_dicts cell, so a fresh Run All would raise NameError here.
qq_plot_theoretical(mgh_10s_cat, mgh_10s_text, 'MGH -> MGH ECG Autoencoder Phenotype Vecter PheWAS Cases > 100 prevalent')

In [None]:
# QQ plot of the MGH run under a title without "Median".
qq_plot_theoretical(mgh_cat, mgh_text,  'MGH Phenotype Vectors PheWAS in MGH, C3PO MGH Trained ECG Autoencoder Latent Space')

In [None]:
# QQ plot for the MGH -> BWH validation run.
# NOTE(review): the only visible assignment of val_bwh_cat / val_bwh_text is commented
# out in the phecode_dicts cell, so a fresh Run All would raise NameError here.
qq_plot_theoretical(val_bwh_cat, val_bwh_text, 'MGH -> BWH ECG Autoencoder Phenotype Vector PheWAS Cases > 1% prevalent')

In [None]:
# QQ plot for a permuted-label run.
# NOTE(review): permuted_cat / permuted_text are not assigned anywhere in this part of
# the notebook; the cells below use permed[i] instead — confirm this cell is still needed.
qq_plot_theoretical(permuted_cat, permuted_text, 'Permuted Phecode Labels')


In [None]:
# permed[0] was built from permuted_none (permute=0.0).
qq_plot_theoretical(permed[0][0], permed[0][1], 'Baseline Phewas')

In [None]:
# permed[1] was built from permuted_ten (permute=0.1).
qq_plot_theoretical(permed[1][0], permed[1][1], 'PheWAS with 10% Noisy Labels')

In [None]:
# permed[2] was built from permuted_twenty (permute=0.2).
qq_plot_theoretical(permed[2][0], permed[2][1], 'PheWAS with 20% Noisy Labels')

In [None]:
# permed[3] was built from permuted_fifty (permute=0.5).
# NOTE(review): the title says "100%" while the run used permute=0.5 and the sibling
# titles quote the permute value directly (10%, 20%) — confirm whether "50%" was meant.
qq_plot_theoretical(permed[3][0], permed[3][1], 'PheWAS with 100% Noisy Labels')

In [None]:
# links = pd.read_csv('mrn_linker.txt', sep='\t')
# latent = pd.read_csv('mgh_drop_fuse_latent_space_mv.csv')
# latent_df = pd.merge(latent, links, left_on='MGH_MRN_0', right_on='MRN', how='inner')
# latent_dimension = 256
# latent_cols = [f'latent_{i}' for i in range(latent_dimension)]
# merge_and_stratify_by_code_folder(latent_df, latent_cols, './drop_fuse_phecode_projections_mv/')

In [None]:
# links = pd.read_csv('mrn_linker.txt', sep='\t')
# latent = pd.read_csv('mgh_drop_fuse_latent_space_uni.csv')
# latent_df = pd.merge(latent, links, left_on='MGH_MRN_0', right_on='MRN', how='inner')
# latent_dimension = 256
# latent_cols = [f'latent_{i}' for i in range(latent_dimension)]
# merge_and_stratify_by_code_folder(latent_df, latent_cols, './drop_fuse_phecode_projections_uni/')

In [None]:
# Load the latent-space table from CSV and display its column index as the cell output.
latent = pd.read_csv('mgh_drop_fuse_latent_space.csv')
latent.columns

In [None]:
# Inner-join the latent table to the linker table on latent.MGH_MRN_0 == links.MRN,
# then print a summary of the merged frame.
# NOTE(review): within this part of the notebook `links` is only assigned in
# commented-out cells (pd.read_csv('mrn_linker.txt', sep='\t')) — make sure it is
# loaded before this cell on a fresh run.
df = pd.merge(latent, links, left_on='MGH_MRN_0', right_on='MRN', how='inner')
df.info()

In [None]:
#  ECG + MRI union GWAS