In [1]:
import numpy as np
import pandas as pd
import pickle

import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils
mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120, colors = 'kelly')
from matplotlib.gridspec import GridSpec

from nnwmf.optimize import IALM
from nnwmf.optimize import FrankWolfe, FrankWolfe_CV
from nnwmf.utils import model_errors as merr

import sys
sys.path.append("../utils/")
import histogram as mpy_histogram
import simulate as mpy_simulate
import plot_functions as mpy_plotfn

In [2]:
# --- Input locations -------------------------------------------------------
# Summary-statistic tables are pickled DataFrames stored under `data_dir`.
# NOTE: pd.read_pickle can execute arbitrary code on load; only point these
# paths at files you trust.
data_dir = "../data"
zscore_df_filename = f"{data_dir}/ukbb_zscore_df2.pkl"
beta_df_filename = f"{data_dir}/ukbb_beta_df2.pkl"
se_df_filename = f"{data_dir}/ukbb_se_df2.pkl"

# NOTE(review): the two metadata files below are absolute, machine-specific
# paths, so this cell only runs where that filesystem is mounted.
phenotype_metafile = "/gpfs/commons/home/sbanerjee/work/npd/UKBB/npd_phenotypes_broad_categories.tsv"
n_signif_metafile = "/gpfs/commons/home/sbanerjee/work/npd/UKBB/npd_n_signif.tsv"

# --- Load the three summary-statistic tables -------------------------------
zscore_df = pd.read_pickle(zscore_df_filename)
beta_df = pd.read_pickle(beta_df_filename)
se_df = pd.read_pickle(se_df_filename)

# --- Load phenotype metadata -----------------------------------------------
# Tab-separated table; its first row is used as the header.
phenotype_df = pd.read_csv(phenotype_metafile, sep="\t")

# Tab-separated table without a header row; the two columns are named here.
n_signif_df = pd.read_csv(
    n_signif_metafile,
    sep="\t",
    header=None,
    names=['phenotype', 'n_signif'],
)

In [3]:
# Restrict all three tables to the phenotypes whose `n_signif` value is at
# least 4. The selection is computed once and reused so that the three frames
# are guaranteed to keep the same columns in the same order.
min_n_signif = 4
keep_phenotypes = n_signif_df.loc[n_signif_df['n_signif'] >= min_n_signif, 'phenotype']

zscore_df = zscore_df.loc[:, keep_phenotypes]
beta_df = beta_df.loc[:, keep_phenotypes]
se_df = se_df.loc[:, keep_phenotypes]

In [4]:
# Phenotype codes, in the column order of the filtered z-score table.
phenotype_ids = list(zscore_df.columns)

# Look up the display name and class of every phenotype code.
# `.item()` raises unless exactly one metadata row matches the code, so a
# missing or duplicated 'Phenotype Code' fails loudly here.
phenotype_names = [
    phenotype_df.loc[phenotype_df['Phenotype Code'] == x, 'Phenotype Name'].item()
    for x in phenotype_ids
]
phenotype_categories = [
    phenotype_df.loc[phenotype_df['Phenotype Code'] == x, 'Phenotype Class'].item()
    for x in phenotype_ids
]

# De-duplicate while preserving first-appearance order. The previous
# `list(set(...))` produced an order that depends on string hash
# randomisation, so the category order (and therefore the colour given to each
# category) could change every time the kernel was restarted.
unique_categories = list(dict.fromkeys(phenotype_categories))

# For each category, the positions (into `phenotype_ids`) of its phenotypes.
trait_indices = [
    np.array([i for i, x in enumerate(phenotype_categories) if x == catg])
    for catg in unique_categories
]

# One colour per category, taken in palette order. `zip` stops at the shorter
# input, so categories beyond the palette length would get no entry.
# NOTE(review): confirm kelly_colors() provides at least len(unique_categories) colours.
trait_colors = dict(zip(unique_categories, mpl_stylesheet.kelly_colors()))

In [22]:
# Effect-size matrix, transposed so that rows are phenotypes and columns are
# variants (matching the shape message printed below).
X_nan = np.array(beta_df).T
X_nan_mask = np.isnan(X_nan)

# Centre every column on its mean over the observed (non-NaN) entries, then
# fill the missing entries with 0.0, i.e. with the column mean after centring.
X_nan_cent = X_nan - np.nanmean(X_nan, axis = 0, keepdims = True)
X_cent = np.nan_to_num(X_nan_cent, copy = True, nan = 0.0)

# Inverse-variance weights, 1 / se^2, laid out like X_cent.
# Missing effect sizes were imputed above, so they must not carry weight, and
# a NaN or zero standard error would otherwise yield NaN / inf weights.
# Those entries get weight 0. With complete data and positive standard errors
# this is identical to a plain 1 / se^2.
with np.errstate(divide = 'ignore', invalid = 'ignore'):
    X_weights = 1 / np.square(np.array(se_df)).T
X_weights[X_nan_mask | ~np.isfinite(X_weights)] = 0.0

print(f"We have {X_cent.shape[0]} samples (phenotypes) and {X_cent.shape[1]} features (variants)")
print(f"Fraction of Nan entries: {np.sum(X_nan_mask) / np.prod(X_cent.shape):.3f}")

We have 81 samples (phenotypes) and 3387 features (variants)
Fraction of Nan entries: 0.000


In [25]:
# Solver configuration for the weighted Frank-Wolfe fit, collected in one
# place so the settings are easy to scan and tweak.
fw_options = dict(
    model = 'nnm',
    max_iter = 1000,
    svd_max_iter = 50,
    tol = 1e-8,
    step_tol = 1e-8,
    simplex_method = 'sort',
    show_progress = True,
    debug = True,
    print_skip = 100,
)
nnm_weighted = FrankWolfe(**fw_options)

# Second positional argument of fit(): presumably the norm constraint used by
# the 'nnm' model -- TODO confirm against the nnwmf documentation.
fit_constraint = 1024.0

# NOTE(review): the recorded log for this cell reports "Step size 0.000" with
# duality gaps of 1e16-1e19, which may indicate badly scaled weights; check
# that the optimiser actually makes progress.
nnm_weighted.fit(X_cent, fit_constraint, weight = X_weights)

2023-11-27 14:52:57,121 | nnwmf.optimize.frankwolfe                | INFO    | Iteration 0. Step size 0.000. Duality Gap 2.15562e+19
2023-11-27 14:53:00,854 | nnwmf.optimize.frankwolfe                | INFO    | Iteration 100. Step size 0.000. Duality Gap 1.55525e+17
2023-11-27 14:53:04,587 | nnwmf.optimize.frankwolfe                | INFO    | Iteration 200. Step size 0.000. Duality Gap 4.3933e+16
2023-11-27 14:53:08,351 | nnwmf.optimize.frankwolfe                | INFO    | Iteration 300. Step size 0.000. Duality Gap 6.48602e+16
2023-11-27 14:53:12,071 | nnwmf.optimize.frankwolfe                | INFO    | Iteration 400. Step size 0.000. Duality Gap 3.2847e+16
2023-11-27 14:53:15,751 | nnwmf.optimize.frankwolfe                | INFO    | Iteration 500. Step size 0.000. Duality Gap 1.73102e+16
2023-11-27 14:53:19,494 | nnwmf.optimize.frankwolfe                | INFO    | Iteration 600. Step size 0.000. Duality Gap 1.35282e+16
2023-11-27 14:53:23,159 | nnwmf.optimize.frankwolfe        

In [27]:
# Persist the fitted model's `X_` attribute so later analyses can reload it
# without re-running the optimisation.
lowrank_X_filename = f"{data_dir}/ukbb_npd_lowrank_X_nnm_weighted.pkl"
with open(lowrank_X_filename, 'wb') as handle:
    pickle.dump(nnm_weighted.X_, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Saving the `M_` attribute is currently disabled:
#with open (f"{data_dir}/ukbb_npd_lowrank_E_nnm_weighted.pkl", 'wb') as handle:
#    pickle.dump(nnm_weighted.M_, handle, protocol=pickle.HIGHEST_PROTOCOL)