In [None]:
import os
import math
import time
import operator
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.metrics import brier_score_loss, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.multivariate.manova import MANOVA
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score, roc_auc_score
from ml4h.explorations import latent_space_dataframe

# IPython imports
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sb 

In [None]:
def merge_phecode_file(latent_df, phecode_file, phecode_name, min_cases):
    """Attach one phecode's case/control column to ``latent_df`` when it is prevalent enough.

    Args:
        latent_df: DataFrame with an ``fpath`` column used as the join key.
        phecode_file: Path to a tab-separated file with ``sample_id`` and ``has_disease`` columns.
        phecode_name: Column name given to ``has_disease`` in the merged frame.
        min_cases: The case count must be strictly greater than this, both in the
            raw file and after the inner join.

    Returns:
        The inner-joined DataFrame when both prevalence checks pass, otherwise
        the ``latent_df`` that was passed in (unchanged).
    """
    phecode_table = pd.read_csv(phecode_file, sep='\t')
    if not (phecode_table.has_disease.sum() > min_cases):
        return latent_df

    renamed = phecode_table.rename(columns={"has_disease": phecode_name})
    merged = pd.merge(latent_df, renamed, left_on='fpath', right_on='sample_id', how='inner')
    # After the join some cases may have dropped out, so re-check the counts.
    tally = merged[phecode_name].value_counts()
    if len(tally) > 1 and tally[1] > min_cases:
        return merged
    return latent_df


def merge_code_folder(latent_df, phe_folder='/home/sam/select_phecodes/', min_cases=4, max_codes=35):
    """Merge every sufficiently prevalent phecode file in a folder into ``latent_df``.

    Files are visited in sorted order and merged one at a time with
    ``merge_phecode_file``; a phecode counts as accepted when its column shows
    up in the merged frame.

    Args:
        latent_df: DataFrame to enrich with phecode columns.
        phe_folder: Directory of phecode files. A trailing slash is optional.
        min_cases: Prevalence threshold forwarded to ``merge_phecode_file``.
        max_codes: Stop once this many phecodes have been accepted.

    Returns:
        Tuple ``(latent_df, phe_codes)``: the merged frame and the list of
        accepted phecode column names.
    """
    phe_codes = []
    for phe_file in sorted(os.listdir(phe_folder)):
        phe_code = phe_file.replace('.txt', '')
        print(f'try phecode: {phe_code}')
        # os.path.join keeps this working when phe_folder has no trailing slash.
        latent_df = merge_phecode_file(latent_df, os.path.join(phe_folder, phe_file), phe_code, min_cases)
        if phe_code in latent_df:
            print(f'{phe_code} has enough prevalence: {latent_df[phe_code].value_counts()[1]}')
            phe_codes.append(phe_code)
            if len(phe_codes) >= max_codes:
                break
    return latent_df, phe_codes

def ttest_feature(feature, snp):
    """Compare ``feature`` between genotype groups of ``snp`` with unequal-variance t-tests.

    Reads the module-level ``latent_df`` (it is not a parameter). Genotypes are
    taken from column ``snp``: 0 is reference, 1 heterozygous, 2 homozygous, and
    "non-reference" is 1 or 2. Group sizes and all three comparisons are
    printed; only the reference-vs-non-reference result is returned.

    Args:
        feature: Column of ``latent_df`` holding the values to compare.
        snp: Column of ``latent_df`` holding the genotype codes.

    Returns:
        Dict with the single key ``'T-test REF vs VAR ' + snp`` mapped to the
        ``(t_statistic, p_value)`` tuple.
    """
    genotype = latent_df[snp]

    def _group(mask):
        # NaNs are dropped per group before testing.
        return latent_df[mask][feature].dropna().to_numpy(dtype=np.float32)

    ref = _group(genotype == 0)
    het = _group(genotype == 1)
    hom = _group(genotype == 2)
    var = _group((genotype == 1) | (genotype == 2))
    print(f"Reference n={len(ref)}, Heterozygous n={len(het)}, Homozygous n={len(hom)}, Non-reference n={len(var)}")
    t_het_ref, p_het_ref = stats.ttest_ind(het, ref, equal_var = False)
    print(f"Ref v Het {feature}:\t\t T-Statistic = {t_het_ref:0.2f}, P-Value = {p_het_ref}")
    t_hom_ref, p_hom_ref = stats.ttest_ind(hom, ref, equal_var = False)
    print(f"Ref v Hom {feature}:\t\t T-Statistic = {t_hom_ref:0.2f}, P-Value = {p_hom_ref}")
    t_var_ref, p_var_ref = stats.ttest_ind(var, ref, equal_var = False)
    print(f"Ref v Var {feature}:\t\t T-Statistic = {t_var_ref:0.2f}, P-Value = {p_var_ref}\n")
    return {'T-test REF vs VAR '+snp: (t_var_ref, p_var_ref)}
    
def plot_nested_dictionary(all_scores):
    """Draw one horizontal bar chart per metric, with one bar per model.

    ``all_scores`` is indexed as ``all_scores[model][metric]`` and each entry is
    indexed at positions 0, 1 and 2. Position 0 is plotted as the bar length
    (absolute value, x-axis labelled as a t-statistic), position 1 is
    -log10-transformed and used for the bar colour (treated as a p-value), and
    position 2 is shown as ``n`` in the subplot title.

    The function creates its own figure and returns nothing.
    """
    n = 4
    eps = 1e-300  # keeps -log10 finite when a p-value is exactly 0
    for model in all_scores:
        n = max(n, len(all_scores[model]))
    # Roughly square grid, never smaller than 2x2 so `axes` is always 2-D.
    cols = max(2, int(math.ceil(math.sqrt(n))))
    rows = max(2, int(math.ceil(n / cols)))
    fig, axes = plt.subplots(rows, cols, figsize=(cols * 4, rows * 3), sharex=True)
    # Re-nest from [model][metric] to [metric][model] so each subplot is one metric.
    renest = defaultdict(dict)
    errors = defaultdict(dict)
    lens = {}
    max_tstat = 0
    max_pval = 0
    for model in all_scores:
        for metric in all_scores[model]:
            renest[metric][model] = all_scores[model][metric][0]
            errors[metric][model] = all_scores[model][metric][1]
            lens[metric] = all_scores[model][metric][2]
            max_tstat = max(abs(all_scores[model][metric][0]), max_tstat)
            max_pval = max(-np.log10(all_scores[model][metric][1]+eps), max_pval)
    # zip stops at the shorter input, so surplus axes stay empty and surplus metrics are not drawn.
    for metric, ax in zip(renest, axes.ravel()):
         
        # All three lists are ordered by case-insensitive model name so they stay aligned.
        models = [k for k,v in sorted(renest[metric].items(), key=lambda x: x[0].lower())]
        tstats = [abs(v) for k,v in sorted(renest[metric].items(), key=lambda x: x[0].lower())]
        # The literal 1e-4800 underflows to 0.0, so this guard is effectively `v > 0`;
        # a p-value of exactly 0 is mapped to 500.
        pvalues = [-np.log10(v) if v > 1e-4800 else 500 for k,v in sorted(errors[metric].items(), key=lambda x: x[0].lower())]
        y_pos = np.arange(len(models))
        x = np.linspace(0, 1, int(max_pval))
        # NOTE(review): plt.imshow draws on pyplot's *current* axes, which is not
        # necessarily `ax`; it looks like it exists only to give plt.colorbar a
        # mappable. Confirm the image does not overwrite one of the subplots.
        plt.imshow(x[:, np.newaxis], cmap=cm.jet)
        cb = plt.colorbar(ax=ax, ticks=[0, 1.0])
        cb.set_label('Negative Log P-Value')
        cb.ax.set_yticklabels(['0', f'{max_pval:0.0f}'])
        # NOTE(review): max_pval stays 0 when every p-value is >= 1, which would
        # make p/max_pval a division by zero - TODO confirm inputs rule this out.
        ax.barh(y_pos, tstats, color=[cm.jet(p/max_pval) for p in pvalues], align='center')
        ax.set_yticks(y_pos)
        ax.set_yticklabels(models)
        ax.invert_yaxis()  # labels read top-to-bottom
        ax.set_xlabel('T–Statistic')
        # sharex=True hides inner tick labels by default; turn them back on.
        ax.xaxis.set_tick_params(which='both', labelbottom=True)
        ax.set_title(f'{metric}\n n={lens[metric]}')
            
    plt.tight_layout()    
    

def unit_vector(vector):
    """Scale ``vector`` to unit Euclidean length and return the result."""
    length = np.linalg.norm(vector)
    return vector / length

def angle_between(v1, v2):
    """ Returns the angle in degrees between vectors 'v1' and 'v2'::
            angle_between((1, 0, 0), (0, 1, 0))
            90.0
            angle_between((1, 0, 0), (1, 0, 0))
            0.0
            angle_between((1, 0, 0), (-1, 0, 0))
            180.0
    """
    v1_u = unit_vector(v1)
    v2_u = unit_vector(v2)
    # Clip guards arccos against dot products that drift just outside [-1, 1].
    cosine = np.clip(np.dot(v1_u, v2_u), -1.0, 1.0)
    # np.degrees uses full-precision pi instead of a truncated constant.
    return np.degrees(np.arccos(cosine))

def get_phenotype_vector(stratify_column, stratify_thresh, stratify_std, latent_cols, latent_df):
    """Return the difference between the mean latent vectors of "hit" and "miss" rows.

    Args:
        stratify_column: Column of ``latent_df`` used to split the rows.
        stratify_thresh: Centre of the split.
        stratify_std: Margin around the threshold. Rows with a value
            ``>= stratify_thresh + stratify_std`` are hits; rows with a value
            ``< stratify_thresh - stratify_std`` are misses; rows in between
            are ignored.
        latent_cols: Latent-space column names to average.
        latent_df: DataFrame holding ``stratify_column`` and ``latent_cols``.

    Returns:
        1-D array ``mean(hit) - mean(miss)`` with one entry per latent column.
    """
    values = latent_df[stratify_column]
    hit = latent_df.loc[values >= stratify_thresh+stratify_std][latent_cols].to_numpy()
    miss = latent_df.loc[values < stratify_thresh-stratify_std][latent_cols].to_numpy()
    return np.mean(hit, axis=0) - np.mean(miss, axis=0)

def check_snp_angles(snps, stratify_column, stratify_thresh, stratify_std, latent_cols, latent_df):
    """Print the angle between a phenotype vector and each SNP's vector.

    The phenotype vector comes from ``get_phenotype_vector`` on
    ``stratify_column``; each SNP vector is built the same way with threshold 1
    and margin 0. After the per-SNP lines, the mean absolute deviation from 90
    degrees is printed.

    Args:
        snps: Sequence of SNP column names in ``latent_df``. May be empty.
        stratify_column, stratify_thresh, stratify_std: Passed through to
            ``get_phenotype_vector`` for the phenotype.
        latent_cols: Latent-space column names.
        latent_df: DataFrame holding all referenced columns.

    Returns:
        None. All results are printed.
    """
    pheno_vec = get_phenotype_vector(stratify_column, stratify_thresh, stratify_std, latent_cols, latent_df)
    if len(snps) == 0:
        # Nothing to average; previously this path divided by zero.
        print(f'{stratify_column} No SNPs given, nothing to compare\n')
        return
    sum_angles = 0
    for snp in snps:
        snp_vec = get_phenotype_vector(snp, 1, 0, latent_cols, latent_df)
        angle = angle_between(pheno_vec, snp_vec)
        sum_angles += abs(90-angle)
        print(f'Phenotype vector: {stratify_column} SNP:{snp} angle:{angle:0.1f}')
    print(f'{stratify_column} Average Difference from perpendicular: {sum_angles/len(snps):0.1f}\n')
    
    
def stratify_and_project_latent_space(stratify_column, stratify_thresh, stratify_std, 
                                      latent_cols, adjust_cols, latent_df, test_df, component_folder,
                                      manova=False, permute=False, save_components=False, histograms=False):   
    """Project held-out latent vectors onto a phenotype direction and test it with logistic regression.

    The phenotype direction is the unit vector of ``get_phenotype_vector``
    computed on ``latent_df``. Each row of ``test_df[latent_cols]`` is
    standardised with the global mean and standard deviation of the whole
    matrix and projected onto that direction; the projection ("component") is
    then used as a predictor of ``test_df[stratify_column]`` in
    ``phecode ~ <adjust_cols> + component``.

    Args:
        stratify_column: Phenotype column present in both frames.
        stratify_thresh, stratify_std: Split parameters for ``get_phenotype_vector``.
        latent_cols: Latent-space column names.
        adjust_cols: Covariate column names of ``test_df`` (may be empty).
        latent_df: Frame used to derive the phenotype direction.
        test_df: Frame that is projected and regressed. It is not modified.
        component_folder: Output directory, used only when ``save_components`` is true.
        manova: Accepted for interface compatibility; not used.
        permute: If true, shuffle the phenotype labels before fitting.
        save_components: If true, write ``test_df`` plus a ``component`` column
            to ``<component_folder>/<stratify_column>_component.tsv`` after a
            successful fit.
        histograms: Accepted for interface compatibility; not used.

    Returns:
        The coefficient table (``summary2().tables[1]``) of the fitted model,
        or ``None`` when fitting raises ``numpy.linalg.LinAlgError``.
    """
    phenotype_vector = unit_vector(
        get_phenotype_vector(stratify_column, stratify_thresh, stratify_std, latent_cols, latent_df)
    )
    space = test_df[latent_cols].to_numpy()
    # Out-of-place so this also works when to_numpy() hands back a read-only view.
    space = (space - np.mean(space)) / np.std(space)
    all_dots = space @ phenotype_vector
    all_phecodes = test_df[stratify_column].to_numpy()
    if permute:
        all_phecodes = np.random.permutation(all_phecodes)

    data = {'component': all_dots, 'phecode': all_phecodes}
    for col in adjust_cols:
        # Per-column extraction keeps each covariate's own dtype.
        data[col] = test_df[col].to_numpy()
    formula = 'phecode ~ ' + ' + '.join([*adjust_cols, 'component'])
    df = pd.DataFrame.from_dict(data)
    try:
        results = smf.logit(formula, data=df).fit(disp=False)
        if save_components:
            # Create the folder itself; dirname() would create only its parent
            # when the path has no trailing slash.
            os.makedirs(component_folder, exist_ok=True)
            tsv = os.path.join(component_folder, f'{stratify_column}_component.tsv')
            test_df.assign(component=all_dots).to_csv(tsv, index=False, sep='\t')
        return results.summary2().tables[1]
    except np.linalg.LinAlgError:
        print(f'Phecode {stratify_column} Failed')
        return None

    
def phewas_feature(stratify_column, stratify_thresh, stratify_std, feature_col, 
                   adjust_cols, latent_df, test_df, component_folder):   
    """Test a single feature column against a phenotype with logistic regression.

    Fits ``phecode ~ <adjust_cols> + component`` on ``test_df``, where
    ``component`` is ``test_df[feature_col]`` and ``phecode`` is
    ``test_df[stratify_column]``.

    Args:
        stratify_column: Phenotype column of ``test_df``.
        stratify_thresh, stratify_std, latent_df, component_folder: Accepted so
            the signature parallels ``stratify_and_project_latent_space``; not
            used by the fit.
        feature_col: Column of ``test_df`` used as the predictor of interest.
        adjust_cols: Covariate column names of ``test_df`` (may be empty).
        test_df: Frame the model is fitted on.

    Returns:
        The coefficient table (``summary2().tables[1]``) of the fitted model,
        or ``None`` when fitting raises ``numpy.linalg.LinAlgError``.
    """
    data = {'component': test_df[feature_col].to_numpy(),
            'phecode': test_df[stratify_column].to_numpy()}
    for col in adjust_cols:
        # Per-column extraction keeps each covariate's own dtype.
        data[col] = test_df[col].to_numpy()
    formula = 'phecode ~ ' + ' + '.join([*adjust_cols, 'component'])
    df = pd.DataFrame.from_dict(data)
    try:
        results = smf.logit(formula, data=df).fit(disp=False)
        return results.summary2().tables[1]
    except np.linalg.LinAlgError:
        print(f'Phecode {stratify_column} Failed')
        return None
    
    
def merge_and_stratify_phecode_file(latent_df, test_df, latent_cols, adjust_cols, 
                                    phecode_file, test_phecode_file, min_cases, 
                                    component_folder, permute=False):
    """Run the PheWAS test for one phecode table, if it has enough cases.

    The table at ``phecode_file`` (tab-separated, with ``sample_id``,
    ``disease`` and ``has_disease`` columns) is inner-joined onto both frames
    by ``sample_id``. With a single latent column the test is delegated to
    ``phewas_feature``; otherwise to ``stratify_and_project_latent_space``.

    Args:
        latent_df: Frame used to derive the phenotype direction.
        test_df: Frame the regression is fitted on.
        latent_cols: Latent-space column names.
        adjust_cols: Covariate column names.
        phecode_file: Path of the phecode table; must contain ``'phecode'``.
        test_phecode_file: Not used by this function.
        min_cases: Case counts must be strictly greater than this, in the raw
            table and in the merged ``latent_df``.
        component_folder: Forwarded to the delegate.
        permute: Forwarded to ``stratify_and_project_latent_space``.

    Returns:
        ``(results, phecode_name)`` when the test was run, otherwise ``None``.
    """
    if 'phecode' not in phecode_file:
        print(f'No phecode {phecode_file}')
        return None

    phecode_table = pd.read_csv(phecode_file, sep='\t')
    # The phecode's column name is taken from the first row's `disease` value.
    phecode_name = phecode_table.iloc[0].disease
    if not (phecode_table.has_disease.sum() > min_cases):
        return None

    phecode_table = phecode_table.rename(columns={'has_disease': phecode_name})
    latent_df = pd.merge(latent_df, phecode_table, left_on='sample_id', right_on='sample_id', how='inner')
    test_df = pd.merge(test_df, phecode_table, left_on='sample_id', right_on='sample_id', how='inner')

    train_counts = latent_df[phecode_name].value_counts()
    if not (len(train_counts) > 1 and train_counts[1] > min_cases):
        return None

    if len(latent_cols) == 1:
        results = phewas_feature(phecode_name, 1, 0, latent_cols[0], adjust_cols,
                                 latent_df, test_df, component_folder)
    else:
        results = stratify_and_project_latent_space(phecode_name, 1, 0, latent_cols, adjust_cols,
                                                    latent_df, test_df, component_folder, permute=permute,
                                                    save_components=False, histograms=False)
    return results, phecode_name
               


            

def merge_and_stratify_by_code_folder(latent_df, test_df, latent_cols, adjust_cols, component_folder,
                                      phe_folder='./phecodes/', test_phe_folder='./phecodes/', 
                                      min_cases=20, permute=False):
    """Run ``merge_and_stratify_phecode_file`` on every file in a folder and collect the results.

    Args:
        latent_df: Frame used to derive phenotype directions.
        test_df: Frame the regressions are fitted on.
        latent_cols: Latent-space column names.
        adjust_cols: Covariate column names.
        component_folder: Forwarded to the per-file function.
        phe_folder: Directory listed for phecode tables. A trailing slash is optional.
        test_phe_folder: Directory holding the same file names; used for the
            reported case counts. A trailing slash is optional.
        min_cases: Prevalence threshold forwarded to the per-file function.
        permute: Forwarded to the per-file function.

    Returns:
        Tuple ``(p_vals, betas, ses, counts)`` of dicts keyed by phecode name.
        The first three hold the ``component`` row of the coefficient table;
        ``counts`` is the sum of ``has_disease`` over the whole test table file.
    """
    counts = {}
    p_vals = {}
    betas = {}
    ses = {}

    for phe_file in sorted(os.listdir(phe_folder)):
        # os.path.join keeps this working when the folders have no trailing slash.
        train_path = os.path.join(phe_folder, phe_file)
        test_path = os.path.join(test_phe_folder, phe_file)
        p = merge_and_stratify_phecode_file(latent_df, test_df, latent_cols, adjust_cols, 
                                            train_path, test_path, 
                                            min_cases, component_folder, permute)
        if p is None:
            continue
        results, name = p
        if results is None:
            continue
        pdf = pd.read_csv(test_path, sep='\t')
        counts[name] = pdf.has_disease.sum()
        p_vals[name] = results['P>|z|']['component']
        betas[name] = results['Coef.']['component']
        ses[name] = results['Std.Err.']['component']
        print(f'Phecode: {name}, Pvalue: {p_vals[name]}')
    return p_vals, betas, ses, counts
    
   
def project_latent_space_on_phecode(stratify_column, stratify_thresh, stratify_std, 
                                      latent_cols, adjust_cols, phenotype_vector, test_df, component_folder,
                                      permute=False, save_components=False, histograms=False):
    """Project latent vectors onto a given phenotype vector and test the projection with logistic regression.

    Each row of ``test_df[latent_cols]`` is standardised with the global mean
    and standard deviation of the whole matrix and projected onto
    ``phenotype_vector``. Two models are fitted on the result:
    ``phecode ~ <adjust_cols> + component`` and a covariates-only baseline
    (intercept-only when ``adjust_cols`` is empty), and the ROC AUC of each
    model's in-sample predictions is reported.

    Args:
        stratify_column: Phenotype column of ``test_df``.
        stratify_thresh, stratify_std: Accepted for interface compatibility; not used.
        latent_cols: Latent-space column names.
        adjust_cols: Covariate column names of ``test_df`` (may be empty).
        phenotype_vector: 1-D direction with one entry per latent column.
        test_df: Frame that is projected and regressed. It is not modified.
        component_folder: Output directory, used only when ``save_components`` is true.
        permute: If true, shuffle the phenotype labels before fitting.
        save_components: If true, write ``test_df`` plus a ``component`` column
            to ``<component_folder>/<stratify_column>_component.tsv`` before fitting.
        histograms: If true, show kernel-density plots of the component for
            rows with phenotype 1 and rows with phenotype 0.

    Returns:
        ``(coefficient_table, auc_with_component, auc_without_component)``, or
        ``(None, None, None)`` when fitting raises ``numpy.linalg.LinAlgError``.
    """
    space = test_df[latent_cols].to_numpy()
    # Out-of-place so this also works when to_numpy() hands back a read-only view.
    space = (space - np.mean(space)) / np.std(space)
    all_dots = space @ np.asarray(phenotype_vector)
    all_phenotypes = test_df[stratify_column].to_numpy()
    if permute:
        all_phenotypes = np.random.permutation(all_phenotypes)

    formula = 'phecode ~ ' + ' + '.join([*adjust_cols, 'component'])
    if len(adjust_cols) > 0:
        adjust_formula = f'phecode ~ {" + ".join(adjust_cols)}'
    else:
        # Without covariates the baseline is an intercept-only model; before,
        # adjust_formula was left undefined on this path.
        adjust_formula = 'phecode ~ 1'

    data = {'component': all_dots, 'phecode': all_phenotypes}
    for col in adjust_cols:
        # Per-column extraction keeps each covariate's own dtype.
        data[col] = test_df[col].to_numpy()
    df = pd.DataFrame.from_dict(data)
    if save_components:
        # Create the folder itself; dirname() would create only its parent
        # when the path has no trailing slash.
        os.makedirs(component_folder, exist_ok=True)
        tsv = os.path.join(component_folder, f'{stratify_column}_component.tsv')
        test_df.assign(component=all_dots).to_csv(tsv, index=False, sep='\t')
    try:
        results = smf.logit(formula, data=df).fit(disp=False)
        preds = results.predict(df)
        results2 = smf.logit(adjust_formula, data=df).fit(maxiter=200, disp=False)
        preds2 = results2.predict(df)
        auc_w_component = roc_auc_score(all_phenotypes, preds)
        auc_no_component = roc_auc_score(all_phenotypes, preds2)
        if histograms:
            hit_dots = all_dots[all_phenotypes == 1]
            miss_dots = all_dots[all_phenotypes == 0]
            for dots in (hit_dots, miss_dots):
                # NOTE(review): `bw` is deprecated in newer seaborn releases in
                # favour of bw_method/bw_adjust; kept as-is because the
                # installed version is unknown.
                sb.kdeplot(np.array(list(dots)), bw=0.5)
            plt.show()

        return results.summary2().tables[1], auc_w_component, auc_no_component
    except np.linalg.LinAlgError:
        print(f'Phecode {stratify_column} Failed')
        return None, None, None
    

def stratify_phecode_file_on_phecode(phenotype_vectors, test_df, latent_cols, adjust_cols, 
                                    test_phecode_file, min_cases, 
                                    component_folder, permute, histograms):
    """Test one phecode table against its precomputed phenotype vector.

    The vector is the first row of ``phenotype_vectors`` whose ``phecode_text``
    equals the table's ``disease`` name with ``'phecode'`` replaced by
    ``'phe'``; its components are read from columns ``pv_0 .. pv_{d-1}`` where
    ``d = len(latent_cols)``.

    Args:
        phenotype_vectors: DataFrame with ``phecode_text`` and ``pv_*`` columns.
        test_df: Frame with ``sample_id``, latent and covariate columns.
        latent_cols: Latent-space column names.
        adjust_cols: Covariate column names.
        test_phecode_file: Path of the tab-separated phecode table; must
            contain ``'phecode'``.
        min_cases: Case counts must be strictly greater than this, in the raw
            table and in the merged ``test_df``.
        component_folder, permute, histograms: Forwarded to
            ``project_latent_space_on_phecode``.

    Returns:
        ``(results, phecode_name, auc_with, auc_without)`` when the test was
        run, otherwise ``(None, None, None, None)``.
    """
    skipped = (None, None, None, None)
    if 'phecode' not in test_phecode_file:
        return skipped

    phecode_table = pd.read_csv(test_phecode_file, sep='\t')
    phecode_name = phecode_table.iloc[0].disease
    phekey = phecode_name.replace('phecode', 'phe')
    matches = phenotype_vectors[phenotype_vectors.phecode_text == phekey]
    if not (len(matches) > 0 and phecode_table.has_disease.sum() > min_cases):
        return skipped

    phecode_table = phecode_table.rename(columns={'has_disease': phecode_name})
    test_df = pd.merge(test_df, phecode_table, left_on='sample_id', right_on='sample_id', how='inner')
    vector_cols = [f'pv_{i}' for i in range(len(latent_cols))]
    phenotype_vector = matches[vector_cols].to_numpy()[0, :]

    test_counts = test_df[phecode_name].value_counts()
    if not (len(test_counts) > 1 and test_counts[1] > min_cases):
        return skipped

    results, a1, a2 = project_latent_space_on_phecode(phecode_name, 1, 0, latent_cols, adjust_cols,
                                                      phenotype_vector, test_df, component_folder,
                                                      permute=permute, save_components=False,
                                                      histograms=histograms)
    return results, phecode_name, a1, a2

def stratify_code_folder_on_phecode(phenotype_vectors, test_df, latent_cols, adjust_cols, component_folder,
                                    test_phe_folder='./phecodes/', min_cases=20, permute=False, histograms=False):
    """Run ``stratify_phecode_file_on_phecode`` on every file in a folder and collect the results.

    Args:
        phenotype_vectors: DataFrame of precomputed phenotype vectors.
        test_df: Frame the regressions are fitted on.
        latent_cols: Latent-space column names.
        adjust_cols: Covariate column names.
        component_folder: Forwarded to the per-file function.
        test_phe_folder: Directory of phecode tables. A trailing slash is optional.
        min_cases: Prevalence threshold forwarded to the per-file function.
        permute, histograms: Forwarded to the per-file function.

    Returns:
        Tuple ``(p_vals, betas, ses, counts, auc1, auc2)`` of dicts keyed by
        phecode name. ``p_vals``/``betas``/``ses`` hold the ``component`` row of
        the coefficient table, ``counts`` is the sum of ``has_disease`` over the
        whole table file, and ``auc1``/``auc2`` are the AUCs with and without
        the component.
    """
    p_vals = {}
    counts = {}
    betas = {}
    ses = {}
    auc1 = {}
    auc2 = {}
    for phe_file in sorted(os.listdir(test_phe_folder)):
        # os.path.join keeps this working when the folder has no trailing slash.
        phe_path = os.path.join(test_phe_folder, phe_file)
        results, name, a1, a2 = stratify_phecode_file_on_phecode(phenotype_vectors, test_df, latent_cols, adjust_cols,
                                                                 phe_path, min_cases, component_folder,
                                                                 permute, histograms)
        if results is None:
            continue
        pdf = pd.read_csv(phe_path, sep='\t')
        counts[name] = pdf.has_disease.sum()
        p_vals[name] = results['P>|z|']['component']
        betas[name] = results['Coef.']['component']
        ses[name] = results['Std.Err.']['component']
        auc1[name] = a1
        auc2[name] = a2
        print(f'Phe: {name}, N: {counts[name]}, P: {p_vals[name]:0.2E}, betas {betas[name]:0.3f} err: {ses[name]:0.4f}')
    return p_vals, betas, ses, counts, auc1, auc2




In [None]:
# Load the label table and the phecode definitions used by the cells below.
# NOTE(review): both paths are absolute and machine-specific.
label_file = '/home/sam/trained_models/explore_phenotypes/tensors_all_union.csv'
labels = pd.read_csv(label_file)
all_scores = defaultdict(dict)
# Placeholder only; a later cell reassigns adjust_cols before it is used.
adjust_cols = []

phecode_meta = pd.read_csv('/home/sam/csvs/phecode_definitions.csv') 
# Map each phecode category to an integer, numbered in value_counts() order.
# phecode_dicts and write_phewas use the keys of this dict as a category whitelist.
cat_colors= {}
for i,k in enumerate(phecode_meta.category.value_counts().keys()):
    cat_colors[k] = i
    #print(f'{i} {k}')

phecode_meta.head()

In [None]:
# Load the autoencoder latent space, split it 50/50, and run the PheWAS:
# phenotype directions are derived on `train` and tested on `test`.
latent_space_file = '/home/sam/trained_models/hypertuned_48m_16e_ecg_median_raw_10_autoencoder_256d/hidden_embed_hypertuned_48m_16e_ecg_median_raw_10_autoencoder_256d.tsv'
latent_df = latent_space_dataframe(latent_space_file, label_file)
latent_dimension = 256
latent_cols = [f'latent_{i}' for i in range(latent_dimension)]
adjust_cols = ['ecg_age', 'sex', 'white']
# Fixed seed so the train/test split, and every p-value derived from it, is
# reproducible across kernel restarts.
SPLIT_SEED = 42
train = latent_df.sample(frac=0.5, random_state=SPLIT_SEED)
test = latent_df.drop(train.index)
# Result is the tuple (p_vals, betas, ses, counts) keyed by phecode name.
adjusted_ukb_ae_ecg = merge_and_stratify_by_code_folder(train, test, latent_cols, 
                                                 adjust_cols, './ukb_ecg_ae_phecode_projections/',
                                                 phe_folder='/home/sam/phecodes_tables_prevalent/', 
                                                 test_phe_folder='/home/sam/phecodes_tables_prevalent/', 
                                                 min_cases=100,
                                                 permute=False)


In [None]:
def phecode_dicts(pval_dict):
    """Group PheWAS p-values and phenotype names by phecode category.

    Reads the module-level ``phecode_meta`` and ``cat_colors``. Keys of
    ``pval_dict`` look like ``'phecode_427_2'``; the prefix is stripped and
    underscores become dots to obtain the numeric phecode that is looked up in
    ``phecode_meta``. Entries are visited in ascending p-value order, so each
    per-category list is sorted the same way. Categories that are not keys of
    ``cat_colors`` are skipped.

    Args:
        pval_dict: Mapping of phecode column name to p-value.

    Returns:
        Tuple ``(categories, categories_text)`` of ``defaultdict(list)`` keyed
        by category: the p-values and the matching phenotype names.
    """
    categories = defaultdict(list)
    categories_text = defaultdict(list)

    ranked = sorted(pval_dict.items(), key=operator.itemgetter(1))
    for phe, pval in ranked:
        code_text = phe.replace('phecode_', '').replace('_', '.')
        print(f"phe is {code_text}")
        row = phecode_meta[phecode_meta.phecode == float(code_text)].iloc[0]
        if row.category not in cat_colors:
            continue
        categories[row.category].append(pval)
        categories_text[row.category].append(row.phenotype)
        print(f'category: {row.category}\n phenotype: {row.phenotype}  phecode is {phe} and pvalue {pval:0.4E}\n' )
    return categories, categories_text

In [None]:
# Element 0 of the PheWAS result tuple is the p-value dict; group it by phecode category.
ukb_cat, ukb_text = phecode_dicts(adjusted_ukb_ae_ecg[0])

In [None]:
def qq_plot_theoretical(categories, text, title='QQ Plot', p_thresh=0.95):
    """Draw side-by-side QQ plots of -log10(p), one per phecode category.

    Categories are laid out left to right in alphabetical order, each shifted
    along the x-axis by a running offset. Categories with fewer than three
    p-values are skipped. Points whose observed -log10(p) exceeds ``p_thresh``
    are annotated, and annotations that would overlap an earlier one on the
    rendered canvas are hidden.

    Args:
        categories: Mapping of category name to a list of p-values.
        text: Mapping of category name to annotation strings, index-aligned
            with ``categories``.
        title: Figure title.
        p_thresh: Annotation threshold on the -log10(p) scale (the default 0.95
            corresponds to p below roughly 0.11).

    Returns:
        None. The figure is created as a side effect.
    """
    fig, ax = plt.subplots(figsize=(16, 12), dpi=400)
    sort_cat = sorted(categories.items(), key=operator.itemgetter(0))
    
    annotations = []
    x_offsets = [0]
    x_labels = []
    for i, (k,v) in enumerate(sort_cat):
        if len(categories[k]) < 3:
            continue
        # 1e-300 keeps the log finite for p-values of exactly 0.
        neglog10p = -np.log10(np.array(categories[k])+1e-300)
        # Expected quantiles 1/n, 2/n, ..., 1 on the -log10 scale, shifted to this category's slot.
        expected = -np.log10(np.arange(1.0/len(categories[k]), 1+1e-8, 1.0/len(categories[k]))) + x_offsets[-1]
        #print(f'{x_offset} expected {len(expected)}  neglog10p {len(neglog10p)} \n EXpected {expected[0]}')
        # The slicing guards against the two arrays differing in length.
        ax.scatter(sorted(expected[:len(neglog10p)], reverse=True), sorted(neglog10p[:len(expected)], reverse=True), 
                   label=k.capitalize())
        ax.plot([x_offsets[-1], expected[0]],[0, 1]) 
        x_offsets.append(expected[0] + 0.2)
        x_labels.append(k.capitalize())
        for j, txt in enumerate(text[k]):
            if neglog10p[j] > p_thresh:
                # NOTE(review): the scatter plots sorted values while this uses index j
                # directly; the two agree only if the p-values arrive in ascending
                # order - presumably guaranteed by phecode_dicts, confirm for other callers.
                annotations.append(ax.annotate(txt, xy=(expected[j], neglog10p[j]), fontsize=12))
                #annotations.append(ax.annotate(txt.replace(' ',"\n"), xy=(expected[j], neglog10p[j]), fontsize=8))
                
    ax.set_xticks(x_offsets[:-1])
    ax.set_xticklabels(x_labels, rotation=30, ha='right')
    #plt.ylim(0, 15)
    ax.legend(ncol=2, fontsize=14)
    ax.set_title(title, fontsize=18)
    ax.set_xlabel('Expected -log10(P_value) per PheCode category')
    ax.set_ylabel('Observed -log10(P_value) per PheCode category')
    # Occupancy mask shaped (width, height) in canvas pixels, used to detect overlapping labels.
    mask = np.zeros(fig.canvas.get_width_height(), bool)

    # Draw once so the text extents queried below are available.
    fig.canvas.draw()
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    for a in annotations:
        bbox = a.get_window_extent(renderer=fig.canvas.get_renderer())
        if not np.isinf(bbox.x0): 
            x0 = int(bbox.x0)
            x1 = int(math.ceil(bbox.x1))
            y0 = int(bbox.y0)
            y1 = int(math.ceil(bbox.y1))

            # First come, first served: hide this label if any of its pixels is already taken.
            s = np.s_[x0:x1+1, y0:y1+1]
            if np.any(mask[s]):
                a.set_visible(False)
            else:
                mask[s] = True

In [None]:
# NOTE(review): the title says "Cases > 500" but the PheWAS cell above was run
# with min_cases=100 - confirm which one is intended.
qq_plot_theoretical(ukb_cat, ukb_text, 'UKB Phecode Vectors PheWAS in UKB from ECG Median Autoencoder Latent Space, Cases > 500')

In [None]:
def write_phewas(pvals, betas, ses, counts, auc1, auc2, file_name):
    """Write PheWAS results to a CSV file, one row per phecode, ordered by ascending p-value.

    Reads the module-level ``phecode_meta`` and ``cat_colors``. Keys of
    ``pvals`` look like ``'phecode_427_2'``; the prefix is stripped and
    underscores become dots to obtain the numeric phecode looked up in
    ``phecode_meta``. Phecodes whose category is not a key of ``cat_colors``
    are left out.

    Args:
        pvals, betas, ses, counts, auc1, auc2: Dicts keyed by phecode name.
        file_name: Destination CSV path.

    Returns:
        None. The file is written as a side effect.
    """
    headers = ['phecode', 'phecode_text', 'p_value', 'phenotype', 'category',
               'n', 'beta', 'se', 'AUC_with_ECG', 'AUC_no_ECG']
    records = []
    ranked = sorted(pvals.items(), key=operator.itemgetter(1))
    for phe, pval in ranked:
        code = float(phe.replace('phecode_', '').replace('_', '.'))
        row = phecode_meta[phecode_meta.phecode == code].iloc[0]
        if row.category not in cat_colors:
            continue
        records.append((row.phecode, phe, pval, row.phenotype, row.category,
                        counts[phe], betas[phe], ses[phe], auc1[phe], auc2[phe]))
    pd.DataFrame(records, columns=headers).to_csv(file_name, index=False)

In [None]:
# NOTE(review): p_vals, betas, ses, counts, auc1 and auc2 are not assigned in any
# visible cell - presumably the unpacked result of stratify_code_folder_on_phecode
# from a cell that is no longer here. This fails on a fresh kernel; TODO confirm
# and restore the producing call.
write_phewas(p_vals, betas, ses, counts, auc1, auc2, '/home/sam/csvs/phewas_ukb_with_mgh_vectors_2022_11_30.csv')

In [None]:
!pip install adjustText
from adjustText import adjust_text

In [None]:
# QQ plot of the fixed-effects meta-analysis p-values, one panel slot per phecode category.
#df = pd.read_csv('/home/sam/csvs/meta_analysis_components_fixed_apr21.csv')
df = pd.read_csv('/home/sam/csvs/meta_analysis_components_fixed_may26.csv')

# Bonferroni-corrected 0.05 threshold on the -log10 scale.
p_thresh = -np.log10(0.05/len(df))
title = 'QQ Plot for ECG Autoencoder PheWAS Fixed Effects Meta Analysis'
fig, ax = plt.subplots(figsize=(18,10), dpi=600)
ax.axhline(y=p_thresh, color='darkslategrey', linestyle='solid')
print(p_thresh)
annotations = []
x_offsets = [0]
x_labels = []
cmap = plt.get_cmap('hsv')
colors = cmap(np.linspace(0, 0.8, len(df.category.unique())))
for category, color in zip(sorted(df.category.unique(), reverse=True), colors):
    dfc = df[df.category == category]
    # Largest p first, so neglog10p is ascending while `expected` is descending.
    dfc = dfc.sort_values('meta_p', ascending=False)
    neglog10p = -np.log10(dfc.meta_p.to_numpy()+1e-300)
    expected = -np.log10(np.arange(1.0/len(dfc), 1+1e-8, 1.0/len(dfc))) + x_offsets[-1]
    ccategory = ' '.join(map(str.capitalize, category.split(' ')))
    ax.scatter(sorted(expected, reverse=True), sorted(neglog10p, reverse=True), 
               label=ccategory, color=color)    
    ax.plot([x_offsets[-1], expected[0]],[0, 1], color=color) 
    x_labels.append(ccategory)
    x_offsets.append(expected[0] + 1)
    for j, txt in enumerate(dfc.phenotype.to_numpy()):
        if neglog10p[j] > p_thresh:
            ctxt = ' '.join(map(str.capitalize, txt.split(' ')))
            # The j-th smallest observed value is plotted against the j-th smallest
            # expected value, i.e. expected[-(j + 1)]; expected[-j] was off by one
            # (and picked the largest x for j == 0).
            annotations.append(ax.text(expected[-(j + 1)]+0.4, neglog10p[j]+4, ctxt))

# Axes decoration only needs to happen once, after every category is drawn.
ax.set_xticks(x_offsets[:-1])
ax.set_xticklabels(x_labels, rotation=30, ha='right')

# `legend_labels` rather than `labels`, so the labels DataFrame loaded in an
# earlier cell is not overwritten.
handles, legend_labels = ax.get_legend_handles_labels()
if handles:
    # sort both labels and handles by labels
    legend_labels, handles = zip(*sorted(zip(legend_labels, handles), key=lambda t: t[0]))
    ax.legend(handles, legend_labels, ncol=4)
#ax.set_title(title)
ax.set_xlabel('Phecode Category')
ax.set_ylabel('Observed -log10(p)')

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Hide annotations that overlap one placed earlier. The mask is shaped
# (width, height) in canvas pixels; later-added annotations get priority.
mask = np.zeros(fig.canvas.get_width_height(), bool)
fig.canvas.draw()
for a in reversed(annotations):
    bbox = a.get_window_extent(renderer=fig.canvas.get_renderer())
    if not np.isinf(bbox.x0): 
        x0 = int(bbox.x0)
        x1 = int(math.ceil(bbox.x1))
        y0 = int(bbox.y0)
        y1 = int(math.ceil(bbox.y1))

        s = np.s_[x0:x1, y0:y1]
        if np.any(mask[s]):
            a.set_visible(False)
        else:
            mask[s] = True

In [None]:
from scipy.stats import pearsonr
import seaborn as sb
%matplotlib inline
import matplotlib.pyplot as plt
def cross_correlate_space(df, num_features):
    """Plot a heatmap of pairwise Pearson correlations between phenotype vectors.

    Each row of ``df`` is one vector with components in columns
    ``pv_0 .. pv_{num_features-1}`` (NaNs are replaced by 0). Axis ticks mark
    the first row of each run of identical ``category`` values, so ``df``
    should be sorted by category for the labels to be meaningful.

    Args:
        df: DataFrame with ``pv_*`` columns and a ``category`` column.
        num_features: Number of ``pv_*`` columns to use.

    Returns:
        None. The figure is created as a side effect.
    """
    index = [f'pv_{i}' for i in range(num_features)]
    space = np.nan_to_num(df[index].to_numpy())
    print(len(df), space.shape)
    cats = df.category.to_numpy()
    labels = {}
    for i in range(len(df)):
        if i == 0 or cats[i-1] != cats[i]:
            labels[' '.join(map(str.capitalize, cats[i].split(' ')))] = i
            print(f'{i} new cats {cats[i]}')
    # One vectorised call instead of len(df)**2 separate pearsonr calls;
    # atleast_2d keeps a single-row input as a 1x1 matrix.
    corrs = np.atleast_2d(np.corrcoef(space))
    fig, ax = plt.subplots(figsize=(8, 8), dpi=300)
    sb.heatmap(corrs, cmap='twilight', square=True, ax=ax, vmax=1,vmin=-1, 
               cbar_kws={"label": f"Phecode Vector\n Pearson Correlation", "shrink": .5})
    
    ax.set_xticks(list(labels.values()))
    ax.set_xticklabels(list(labels.keys()))
    ax.set_yticks(list(labels.values()))
    ax.set_yticklabels(list(labels.keys()))
    ax.set_xlabel('Phecode Categories')
    ax.set_ylabel('Phecode Categories')
    
# Sort by category so that rows of the same category are contiguous in the heatmap.
# NOTE(review): phenotype_vectors is not assigned in any visible cell - presumably
# loaded in a cell that is no longer here; this fails on a fresh kernel. TODO confirm.
phenotype_vectors = phenotype_vectors.sort_values('category')

In [None]:
# 256 is the number of pv_* columns read per vector; it equals latent_dimension set in an earlier cell.
cross_correlate_space(phenotype_vectors, 256)