In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy.stats import pearsonr

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from pymir import mpl_stylesheet
from pymir import mpl_utils
mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120, colors = 'kelly')

from nnwmf.optimize import IALM
from nnwmf.optimize import FrankWolfe, FrankWolfe_CV
from nnwmf.utils import model_errors as merr

import sys
sys.path.append("../utils/")
import histogram as mpy_histogram
import simulate as mpy_simulate
import plot_functions as mpy_plotfn

In [17]:
# Directory holding all input files, relative to the notebook's working directory.
data_dir = "../data"
beta_df_filename   = f"{data_dir}/beta_df.pkl"
prec_df_filename   = f"{data_dir}/prec_df.pkl"
se_df_filename     = f"{data_dir}/se_df.pkl"
zscore_df_filename = f"{data_dir}/zscore_df.pkl"
snp_info_filename  = f"{data_dir}/snp_info.pkl"

'''
Data Frames for beta, precision, standard error and zscore.
'''

# NOTE(review): pd.read_pickle unpickles the file contents, which can execute
# arbitrary code -- only point these paths at trusted files.
beta_df   = pd.read_pickle(beta_df_filename)
prec_df   = pd.read_pickle(prec_df_filename)
se_df     = pd.read_pickle(se_df_filename)
zscore_df = pd.read_pickle(zscore_df_filename)
snp_info  = pd.read_pickle(snp_info_filename)

# Trait metadata; phenotype_dict maps each value of the 'ID' column to the
# value of the 'Broad' column in the same row.
trait_df = pd.read_csv(f"{data_dir}/trait_meta.csv")
phenotype_dict = trait_df.set_index('ID')['Broad'].to_dict()

In [3]:
# Transposed copy of the z-score table: rows of X_nan are the columns of
# zscore_df and columns of X_nan are its rows.
X_nan = np.array(zscore_df).T
X_nan_mask = np.isnan(X_nan)

# Center every column on its mean over the non-NaN entries, then replace the
# remaining NaN entries by 0.0 (i.e. by the column mean after centering).
X_nan_colmeans = np.nanmean(X_nan, axis = 0, keepdims = True)
X_nan_cent = X_nan - X_nan_colmeans
X_cent = np.nan_to_num(X_nan_cent, copy = True, nan = 0.0)

X_nan_fraction = np.sum(X_nan_mask) / np.prod(X_cent.shape)
print (f"We have {X_cent.shape[0]} samples (phenotypes) and {X_cent.shape[1]} features (variants)")
print (f"Fraction of Nan entries: {X_nan_fraction:.3f}")

We have 69 samples (phenotypes) and 10068 features (variants)
Fraction of Nan entries: 0.193


In [4]:
# One label per column of zscore_df, looked up in phenotype_dict.
select_ids = zscore_df.columns
labels = [phenotype_dict[x] for x in select_ids]

# De-duplicate while keeping first-appearance order. The previous
# list(set(labels)) had an iteration order that depends on string hashing,
# so the order of trait_indices and the trait -> color assignment could
# change from one kernel session to the next.
unique_labels = list(dict.fromkeys(labels))
nsample = X_cent.shape[0]
ntrait  = len(unique_labels)

# trait_indices[k] holds the positions in `labels` that carry unique_labels[k].
trait_indices = [np.array([i for i, x in enumerate(labels) if x == label]) for label in unique_labels]

# zip stops at the shorter input: if the palette has fewer than ntrait colors,
# the trailing traits get no entry in trait_colors.
trait_colors  = dict(zip(unique_labels, (mpl_stylesheet.kelly_colors())[:ntrait]))

In [5]:
# Keys of the matrix-factorization results stored on disk; each key selects
# the file "{data_dir}/lowrank_X_{key}.pkl".
mf_methods = ['ialm', 'nnm', 'nnm_sparse']
lowrank_X = {}

# NOTE(review): pickle.load can execute arbitrary code from the file --
# only load files from a trusted source.
for method in mf_methods:
    lowrank_X_path = f"{data_dir}/lowrank_X_{method}.pkl"
    with open (lowrank_X_path, 'rb') as handle:
        lowrank_X[method] = pickle.load(handle)

In [6]:
def get_principal_components(X):
    """
    Principal components of X from a thin SVD.

    X is passed through `mpy_simulate.do_standardize(X, scale = False)`
    (presumably centering without scaling -- confirm in ../utils/simulate.py),
    divided by sqrt(number of elements), and decomposed as U @ diag(S) @ Vt.

    Parameters
    ----------
    X : 2D array
        Input matrix.

    Returns
    -------
    loadings : 2D array
        Vt.T with column k multiplied by S[k].
    pcomps : 2D array
        U with column k multiplied by S[k].
    S : 1D array
        Singular values of the standardized, rescaled matrix.
    """
    # Local name differs from the notebook-level `X_cent` to avoid shadowing it.
    X_scaled = mpy_simulate.do_standardize(X, scale = False)
    # In-place division, as before. NOTE(review): this mutates whatever
    # do_standardize returned -- confirm it is a fresh array, not X itself.
    X_scaled /= np.sqrt(np.prod(X_scaled.shape))
    U, S, Vt = np.linalg.svd(X_scaled, full_matrices = False)
    # Broadcasting S across the last axis scales column k by S[k]. This is
    # U @ diag(S) and Vt.T @ diag(S) without building the dense diagonal
    # matrix or paying for a full matrix product.
    pcomps = U * S
    loadings = Vt.T * S
    return loadings, pcomps, S

# Per-method results of get_principal_components, keyed by method name.
# `eigenvals` stores the third return value of that function.
loadings  = {}
pcomps    = {}
eigenvals = {}

# 'tsvd' is computed from X_cent directly; every other key is computed from
# the corresponding matrix in lowrank_X.
loadings['tsvd'], pcomps['tsvd'], eigenvals['tsvd'] = get_principal_components(X_cent)
for m in mf_methods:
    pca_result = get_principal_components(lowrank_X[m])
    loadings[m], pcomps[m], eigenvals[m] = pca_result

In [7]:
# Lookup table from study / dataset identifier to a shorter display name.
# Several identifiers map to the same display name (for example both
# 'anxiety.meta.full.*' entries map to 'anxiety'), so the display names are
# not unique. Keys and values are used as-is; do not normalise spelling here
# without checking every consumer of these strings.
phenotype_dict_readable = {
    'AD_sumstats_Jansenetal_2019sept.txt.gz' : 'AD_Jansen_2019',
    'anxiety.meta.full.cc.txt.gz' : 'anxiety',
    'anxiety.meta.full.fs.txt.gz' : 'anxiety',
    'CNCR_Insomnia_all' : 'Insomnia',
    'daner_adhd_meta_filtered_NA_iPSYCH23_PGC11_sigPCs_woSEX_2ell6sd_EUR_Neff_70.txt.gz' : 'ADHD_Daner',
    'daner_PGC_BIP32b_mds7a_0416a.txt.gz' : 'BD_Daner_PGC',
    'daner_PGC_BIP32b_mds7a_mds7a_BD1.0416a_INFO6_A5_NTOT.txt.gz' : 'BD1_Daner_PGC',
    'daner_PGC_BIP32b_mds7a_mds7a_BD2.0416a_INFO6_A5_NTOT.txt.gz' : 'BD2_Daner_PGC',
    'ENIGMA_Intracraneal_Volume' : 'Intracraneal_Volume',
    'ieu-a-1000' : 'Neuroticism',
    'ieu-a-1041' : 'Intracranial volume',
    'ieu-a-1042' : 'Nucleus accumbens volume',
    'ieu-a-1043' : 'Amygdala volume',
    'ieu-a-1044' : 'Caudate volume',
    'ieu-a-1045' : 'Hippocampus volume',
    'ieu-a-1046' : 'Pallidum volume',
    'ieu-a-1047' : 'Putamen volume',
    'ieu-a-1048' : 'Thalamus volume',
    'ieu-a-1085' : 'Amyotrophic lateral sclerosis',
    'ieu-a-118' : 'Neuroticism',
    'ieu-a-1183' : 'ADHD',
    'ieu-a-1184' : 'Autism Spectrum Disorder',
    'ieu-a-1185' : 'Autism Spectrum Disorder',
    'ieu-a-1186' : 'Anorexia Nervosa',
    'ieu-a-1188' : 'Major Depressive Disorder',
    'ieu-a-1189' : 'Obsessive Compulsive Disorder',
    'ieu-a-22' : 'Schizophrenia',
    'ieu-a-297' : 'Alzheimers disease',
    'ieu-a-806' : 'Autism',
    'ieu-a-990' : 'Bulimia nervosa',
    'ieu-b-10' : 'Focal epilepsy',
    'ieu-b-11' : 'Focal epilepsy',
    'ieu-b-12' : 'Juvenile absence epilepsy',
    'ieu-b-13' : 'Childhood absence epilepsy',
    'ieu-b-14' : 'Focal epilepsy',
    'ieu-b-15' : 'Focal epilepsy',
    'ieu-b-16' : 'Generalized epilepsy',
    'ieu-b-17' : 'Juvenile myoclonic epilepsy',
    'ieu-b-18' : 'Multiple sclerosis',
    'ieu-b-2' : 'Alzheimers disease',
    'ieu-b-41' : 'Bipolar Disorder',
    'ieu-b-42' : 'Schizophrenia',
    'ieu-b-7' : 'Parkinsons',
    'ieu-b-8' : 'Epilepsy',
    'ieu-b-9' : 'Generalized epilepsy',
    'IGAP_Alzheimer' : 'IGAP_Alzheimer',
    'iPSYCH-PGC_ASD_Nov2017.txt.gz' : 'ASD_PGC_Nov2017',
    'Jones_et_al_2016_Chronotype' : 'Chronotype_Jones_2016',
    'Jones_et_al_2016_SleepDuration' : 'Sleep_duration_Jones_2016',
    'MDD_MHQ_BIP_METACARPA_INFO6_A5_NTOT_no23andMe_noUKBB.txt.gz' : 'MDD_BIP_no23andMe_noUKBB',
    'MDD_MHQ_METACARPA_INFO6_A5_NTOT_no23andMe_noUKBB.txt.gz' : 'MDD_METACARPA_no23andMe_noUKBB',
    'MHQ_Depression_WG_MAF1_INFO4_HRC_Only_Filtered_Dups_FOR_METACARPA_INFO6_A5_NTOT.txt.gz' : 'Depression',
    'MHQ_Recurrent_Depression_WG_MAF1_INFO4_HRC_Only_Filtered_Dups_FOR_METACARPA_INFO6_A5_NTOT.txt.gz' : 'Recurrent_Depression',
    'MHQ_Single_Depression_WG_MAF1_INFO4_HRC_Only_Filtered_Dups_FOR_METACARPA_INFO6_A5_NTOT.txt.gz' : 'Single_Depression',
    'MHQ_Subthreshold_WG_MAF1_INFO4_HRC_Only_Filtered_Dups_FOR_METACARPA_INFO6_A5_NTOT.txt.gz' : 'Subthreshold_WG',
    'ocd_aug2017.txt.gz' : 'OCD_aug2017',
    'PGC_ADHD_EUR_2017' : 'ADHD_PGC_EUR_2017',
    'PGC_ASD_2017_CEU' : 'ASD_PGC_2017_CEU',
    'pgc-bip2021-all.vcf.txt.gz' : 'BD_PGC_all_2021',
    'pgc-bip2021-BDI.vcf.txt.gz' : 'BDI_PGC_2021',
    'pgc-bip2021-BDII.vcf.txt.gz' : 'BDII_PGC_2021',
    'pgc.scz2' : 'Schizophrenia_PGC_2',
    'PGC3_SCZ_wave3_public.v2.txt.gz' : 'Schizophrenia_PGC_3',
    'pgcAN2.2019-07.vcf.txt.gz' : 'pgcAN2.2019-07.vcf.txt.gz',
    'pts_all_freeze2_overall.txt.gz' : 'pts_all_freeze2',
    'SSGAC_Depressive_Symptoms' : 'SSGAC_Depressive_Symptoms',
    'SSGAC_Education_Years_Pooled' : 'SSGAC_Education_Years_Pooled',
    'UKB_1160_Sleep_duration' : 'UKB_1160_Sleep_duration',
    'UKB_1180_Morning_or_evening_person_chronotype' : 'UKB_1180_Morning_or_evening_person_chronotype',
    'UKB_1200_Sleeplessness_or_insomnia' : 'UKB_1200_Sleeplessness_or_insomnia',
    'UKB_20002_1243_self_reported_psychological_or_psychiatric_problem' : 'UKB_20002_1243_self_reported_psychological_or_psychiatric_problem',
    'UKB_20002_1262_self_reported_parkinsons_disease' : 'UKB_20002_1262_self_reported_parkinsons_disease',
    'UKB_20002_1265_self_reported_migraine' : 'UKB_20002_1265_self_reported_migraine',
    'UKB_20002_1289_self_reported_schizophrenia' : 'UKB_20002_1289_self_reported_schizophrenia',
    'UKB_20002_1616_self_reported_insomnia' : 'UKB_20002_1616_self_reported_insomnia',
    'UKB_20016_Fluid_intelligence_score' : 'UKB_20016_Fluid_intelligence_score',
    'UKB_20127_Neuroticism_score' : 'UKB_20127_Neuroticism_score',
    'UKB_G40_Diagnoses_main_ICD10_G40_Epilepsy' : 'UKB_G40_Diagnoses_main_ICD10_G40_Epilepsy',
    'UKB_G43_Diagnoses_main_ICD10_G43_Migraine' : 'UKB_G43_Diagnoses_main_ICD10_G43_Migraine',
    'ieu-b-5070' : 'Schizophrenia',
    'GPC-NEO-NEUROTICISM' : 'GPC-NEO-NEUROTICISM',
    'ieu-a-1009' : 'Subjective well being',
    'ieu-a-1018' : 'Subjective well being',
    'ieu-a-1019' : 'Migraine in bipolar disorder',
    'ieu-a-1029' : 'Internalizing problems',
    'ieu-a-1061' : 'G speed factor',
    'ieu-a-1062' : 'Symbol search',
    'ieu-a-1063' : '8-choice reaction time',
    'ieu-a-1064' : '2-choice reaction time',
    'ieu-a-1065' : 'Inspection time',
    'ieu-a-1066' : 'Simple reaction time',
    'ieu-a-1067' : 'Digit symbol',
    'ieu-a-1068' : '4-choice reaction time',
    'ieu-a-45' : 'Anorexia nervosa',
    'ieu-a-298' : 'Alzheimers Disease',
    'ieu-a-808' : 'Bipolar Disorder',
    'ieu-a-810' : 'Schizophrenia',
    'ieu-a-812' : 'Parkinsons',
    'ieu-a-818' : 'Parkinsons',
    'ieu-a-824' : 'Alzheimers Disease',
    'ieu-b-43' : 'frontotemporal dementia',
    'ILAE_Genetic_generalised_epilepsy' : 'ILAE_Genetic_generalised_epilepsy'
}

# Display name for every entry of select_ids, in the same order.
# Raises KeyError if an identifier is missing from the table above.
labels_readable = [phenotype_dict_readable[x] for x in select_ids]

In [8]:
# Methods to show: the raw-data decomposition first, then every entry of
# mf_methods in its existing order.
plot_methods = ['tsvd', *mf_methods]

# Display name for each method key.
plot_methods_names = dict(
    tsvd = 'Raw Data',
    ialm = 'RPCA-IALM',
    nnm = 'NNM-FW',
    nnm_sparse = 'NNM-Sparse-FW',
)

In [9]:
# Per-method objects loaded from disk, keyed by method name.
corrs, pvals = {}, {}

# Disabled code that writes the same files the loop below reads.
# NOTE(review): get_corrmat is not defined in the visible part of this notebook.
# for m in plot_methods:
#     corrs[m], pvals[m] = get_corrmat(pcomps[m], X_cent, ncomp = 20)
#     with open (f"{data_dir}/loading_corr_{m}.pkl", 'wb') as handle:
#         pickle.dump(corrs[m], handle, protocol=pickle.HIGHEST_PROTOCOL)
#     with open (f"{data_dir}/loading_corr_pval_{m}.pkl", 'wb') as handle:
#         pickle.dump(pvals[m], handle, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE(review): pickle.load can execute arbitrary code from the file --
# only load files from a trusted source.
for m in plot_methods:
    corr_path = f"{data_dir}/loading_corr_{m}.pkl"
    corr_pval_path = f"{data_dir}/loading_corr_pval_{m}.pkl"
    with open (corr_path, 'rb') as handle:
        corrs[m] = pickle.load(handle)
    with open (corr_pval_path, 'rb') as handle:
        pvals[m] = pickle.load(handle)

In [10]:
def pval_to_manhattan_data(pval_data, pcidx):
    """
    Arrange one column of p-values by chromosome and base-pair position.

    Relies on two module-level names: `rsid_list` (row i of `pval_data` is
    identified by `rsid_list[i]`) and `snp_info_dict` (maps that identifier
    to a record with 'CHR' and 'BP' entries).

    Parameters
    ----------
    pval_data : 2D array
        p-values; only column `pcidx` is read.
    pcidx : int
        Column of `pval_data` to convert.

    Returns
    -------
    dict
        {chromosome: {BP: -log10(p-value)}} with keys 1..22 always present.
        A chromosome outside 1..22 raises KeyError; a repeated (chromosome,
        BP) pair keeps the last value seen.
    """
    manhattan = {chrom: {} for chrom in range(1, 23)}
    for row, pvalue in enumerate(pval_data[:, pcidx]):
        record = snp_info_dict[rsid_list[row]]
        chrom = int(record['CHR'])
        position = record['BP']
        manhattan[chrom][position] = - np.log10(pvalue)
    return manhattan

In [19]:
def get_total_snps(sdict):
    """
    Largest 'BP' value recorded for each chromosome.

    Parameters
    ----------
    sdict : dict
        Maps an identifier to a record with 'CHR' and 'BP' entries. The
        identifiers themselves are not used.

    Returns
    -------
    dict
        {chromosome: largest BP seen} for chromosomes 1..22; a chromosome
        with no records keeps the initial value 0. A 'CHR' outside 1..22
        raises KeyError.
    """
    largest_bp = dict.fromkeys(range(1, 23), 0)
    for record in sdict.values():
        chrom = int(record['CHR'])
        largest_bp[chrom] = max(largest_bp[chrom], record['BP'])
    return largest_bp

# Row labels of the z-score table, used as the variant identifiers.
rsid_list = zscore_df.index

# Per-variant lookup: value of the 'SNP' column -> dict of the remaining
# columns of that row.
snp_info_by_snp = snp_info.set_index('SNP')
snp_info_dict = snp_info_by_snp.to_dict(orient = 'index')

# Largest base-pair position per chromosome.
snp_tot = get_total_snps(snp_info_dict)

In [12]:
# Convert the p-values of the first `npcomp` components of one method.
# NOTE(review): pval_data is rebound on every iteration, so after the loop it
# only holds the result for the last component (icomp == npcomp - 1).
whichmethod = 'nnm_sparse'
npcomp = 10
for icomp in range(npcomp):
    pval_data = pval_to_manhattan_data(pvals[whichmethod], icomp)

In [20]:
# Shape check of the p-value array for 'nnm_sparse'.
pvals['nnm_sparse'].shape

(10068, 20)

In [21]:
# p-values for 'nnm_sparse' as a table: one row per entry of rsid_list, one
# column per component, named PC01, PC02, ...
# The number of columns is taken from the array instead of a hard-coded 20,
# so the cell keeps working if a different number of components was stored.
n_pval_components = pvals['nnm_sparse'].shape[1]
df = pd.DataFrame(pvals['nnm_sparse'], columns=[f"PC{(i + 1):02d}" for i in range(n_pval_components)], index=rsid_list)

In [23]:
# Look up each variant's record once, then split it into one list per field,
# all in the order of rsid_list.
snp_records = [snp_info_dict[rsid] for rsid in rsid_list]
chrm_list = [record['CHR'] for record in snp_records]
bppos_list = [record['BP'] for record in snp_records]
a1_list = [record['A1'] for record in snp_records]
a2_list = [record['A2'] for record in snp_records]

In [24]:
# Put the annotation columns in front of the component columns, in this order.
# DataFrame.insert raises ValueError when the column already exists, so the
# plain inserts failed if this cell was executed a second time. Dropping an
# existing column first makes the cell safe to re-run; on a first run nothing
# is dropped and the result is unchanged.
annotation_columns = [
    ('CHR', chrm_list),
    ('BP', bppos_list),
    ('A1', a1_list),
    ('A2', a2_list),
]
for position, (column, values) in enumerate(annotation_columns):
    if column in df.columns:
        del df[column]
    df.insert(position, column, values)

In [25]:
# Show the annotated p-value table (bare last expression -> rich display).
df

Unnamed: 0,CHR,BP,A1,A2,PC01,PC02,PC03,PC04,PC05,PC06,...,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
rs1000031,18,48835070,G,A,7.422631e-02,0.097972,0.247711,0.476767,0.082436,5.433708e-01,...,0.935240,0.941811,0.914811,0.581951,0.920382,0.403505,0.249194,0.979514,0.752308,0.430816
rs1000269,20,20317839,G,A,5.375219e-01,0.502609,0.022620,0.000109,0.293267,3.003434e-01,...,0.921333,0.634377,0.572828,0.158633,0.349089,0.254829,0.424526,0.668366,0.954586,0.964721
rs10003281,4,135966929,T,C,3.098978e-01,0.954710,0.211275,0.893236,0.068240,1.853400e-08,...,0.676377,0.720226,0.469461,0.947054,0.722806,0.138866,0.822534,0.430424,0.562946,0.444230
rs10004866,4,65474274,T,G,3.068253e-03,0.918855,0.297918,0.265726,0.015150,3.711352e-02,...,0.431861,0.369428,0.166726,0.002138,0.295246,0.756696,0.433845,0.100954,0.075662,0.507525
rs10005235,4,74772610,T,C,2.457373e-01,0.095638,0.288727,0.322219,0.561365,8.401277e-01,...,0.057075,0.597793,0.973229,0.195882,0.379469,0.414009,0.968203,0.993873,0.748008,0.305201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
rs9989571,18,74972059,C,T,6.318590e-06,0.056867,0.543619,0.245213,0.143151,1.071744e-02,...,0.316641,0.895875,0.489673,0.387509,0.675797,0.784278,0.322042,0.743365,0.878251,0.882301
rs9991694,4,83078972,T,C,3.752557e-07,0.006464,0.579804,0.056797,0.010551,1.643510e-02,...,0.743803,0.927547,0.613604,0.883619,0.452764,0.525951,0.557713,0.398007,0.964597,0.439896
rs9992763,4,108137562,G,T,9.895024e-01,0.828606,0.754621,0.486959,0.678962,3.847490e-01,...,0.926779,0.998543,0.216710,0.802382,0.745869,0.095897,0.997941,0.025970,0.071930,0.946161
rs9993607,4,58503142,C,T,1.660853e-06,0.000227,0.982201,0.669206,0.000183,9.630507e-01,...,0.478628,0.478179,0.808850,0.164558,0.878914,0.655345,0.416203,0.511181,0.896615,0.757639


In [26]:
# Export the annotated table. The path is relative to the notebook's working
# directory (not data_dir), and an existing file of that name is overwritten.
df.to_csv("variant_contribution_to_PC.csv")