In [60]:
import os
import numpy as np
import pandas as pd

from nnwmf.optimize import IALM

import matplotlib.pyplot as plt
from pymir import mpl_stylesheet
from pymir import mpl_utils
# Apply the pymir presentation stylesheet with black splines, 120 dpi and the
# 'kelly' color set (presumably sets matplotlib defaults for every figure
# created afterwards -- confirm against pymir).
mpl_stylesheet.banskt_presentation(splinecolor = 'black', dpi = 120, colors = 'kelly')

In [2]:
# Data location and the two thresholds that identify the pre-selected input
# files (both values are embedded in the file names built below).
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine where this directory exists.
data_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data"
h2_cut = 0.1
pval_cut = 5e-8

# The thresholds are interpolated with default f-string formatting, so
# pval_cut = 5e-8 appears in the file name as "5e-08".
zscore_df_selected_filename = os.path.join(data_dir, f"modselect/zscore_h2{h2_cut}_pval{pval_cut}.pkl")
trait_df_selected_filename  = os.path.join(data_dir, f"modselect/traits_h2{h2_cut}.pkl")

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
zscore_df = pd.read_pickle(zscore_df_selected_filename)
trait_df  = pd.read_pickle(trait_df_selected_filename)

In [9]:
# Echo the resolved path of the z-score pickle to check the formatted name.
zscore_df_selected_filename

'/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data/modselect/zscore_h20.1_pval5e-08.pkl'

In [5]:
# Transpose the z-score table so that rows are phenotypes and columns are
# variants (this is how the summary printed below labels the two axes).
X = np.array(zscore_df.values.T)
# Subtract the mean of every column, computed over all rows.
X_cent = X - np.mean(X, axis = 0, keepdims = True)

print (f"We have {X_cent.shape[0]} samples (phenotypes) and {X_cent.shape[1]} features (variants)")
# Number of NaN entries in X divided by the total number of entries.
print (f"Fraction of Nan entries: {np.sum(np.isnan(X)) / np.prod(X_cent.shape):.3f}")

We have 216 samples (phenotypes) and 48212 features (variants)
Fraction of Nan entries: 0.000


In [6]:
def generate_masked_input(Y, mask):
    """
    Hide the entries of `Y` selected by `mask` and center the remaining ones.

    The masked entries are treated as missing: every column is centered
    with the mean of its non-missing entries only, and the masked entries
    are then filled with 0.0. `Y` itself is not modified.

    Parameters
    ----------
    Y : array_like
        Input matrix. Non-floating input (e.g. integers) is converted to
        float so that missing entries can be marked with NaN.
    mask : array_like
        Same shape as `Y`; truthy entries are hidden. The mask is coerced
        to boolean, so a 0/1 integer array is accepted as well.

    Returns
    -------
    numpy.ndarray
        Column-centered copy of `Y` with 0.0 at the masked positions.
    """
    import warnings

    # Coerce to bool: indexing with an integer 0/1 array would otherwise be
    # interpreted as fancy (row) indexing and hide the wrong entries.
    mask = np.asarray(mask, dtype = bool)
    Ymiss_nan = np.array(Y, copy = True)
    # NaN cannot be stored in integer / boolean arrays.
    if not np.issubdtype(Ymiss_nan.dtype, np.floating):
        Ymiss_nan = Ymiss_nan.astype(float)
    Ymiss_nan[mask] = np.nan
    with warnings.catch_warnings():
        # A column without any observed entry has a NaN mean and triggers a
        # "Mean of empty slice" RuntimeWarning. This is harmless when the
        # whole column is masked, because those entries are reset to 0.0.
        warnings.simplefilter("ignore", category = RuntimeWarning)
        col_mean = np.nanmean(Ymiss_nan, axis = 0, keepdims = True)
    Ymiss_nan_cent = Ymiss_nan - col_mean
    Ymiss_nan_cent[mask] = 0.0
    return Ymiss_nan_cent

def generate_mask(n, p, ratio, rng = None):
    """
    Draw a random boolean mask of shape (n, p).

    Exactly int(ratio * n * p) entries are True (these are the entries to
    be hidden); their positions are chosen uniformly at random.

    Parameters
    ----------
    n, p : int
        Number of rows and columns of the mask.
    ratio : float
        Fraction of entries to mask, between 0 and 1 inclusive.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. With the default None the global NumPy
        random state is used (as controlled by `np.random.seed`). An
        integer seed or a Generator makes the mask reproducible.

    Returns
    -------
    numpy.ndarray
        Boolean array of shape (n, p).

    Raises
    ------
    ValueError
        If `ratio` is outside [0, 1]. A negative ratio would otherwise be
        turned into a negative slice index and mask almost every entry.
    """
    if not 0.0 <= ratio <= 1.0:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio}")
    nmask = int(ratio * n * p)
    mask = np.zeros(n * p, dtype = bool)
    mask[:nmask] = True
    if rng is None:
        np.random.shuffle(mask)
    else:
        # default_rng accepts an integer seed and passes a Generator through.
        np.random.default_rng(rng).shuffle(mask)
    return mask.reshape(n, p)

# Hide 20% of the entries at random and build the zero-filled input matrix.
# NOTE(review): the mask is drawn without a fixed seed, so it differs
# between runs.
# NOTE(review): the already-centered X_cent is passed in, and
# generate_masked_input subtracts the observed-entry column means once more.
Z_mask = generate_mask(X.shape[0], X.shape[1], 0.2)
Z_cent = generate_masked_input(X_cent, Z_mask)

In [57]:
def get_masked_rmse(original, recovered, mask = None):
    """
    Root-mean-squared error between `original` and `recovered` over the
    entries selected by `mask`.

    Parameters
    ----------
    original, recovered : array_like
        Arrays of the same (or broadcastable) shape.
    mask : array_like, optional
        Boolean or 0/1 array; only entries with a non-zero mask contribute.
        Non-binary values act as multiplicative weights on the difference,
        and the squared sum is divided by the sum of the mask. With the
        default None every entry is used.

    Returns
    -------
    float
        The RMSE over the selected entries, or NaN when the mask selects
        nothing.
    """
    diff = np.asarray(original) - np.asarray(recovered)
    if mask is None:
        weights = np.ones_like(diff, dtype = float)
    else:
        weights = np.asarray(mask, dtype = float)
    n = np.sum(weights)
    if n == 0:
        # Nothing selected: the error is undefined (avoids a 0 / 0 warning).
        return np.nan
    diff, weights = np.broadcast_arrays(diff, weights)
    # Drop the excluded entries before multiplying. A NaN or inf at an
    # excluded position would otherwise survive the multiplication by zero
    # (NaN * 0 = NaN) and turn the whole result into NaN.
    keep = weights != 0
    mse = np.sum(np.square(diff[keep] * weights[keep])) / n
    return np.sqrt(mse)

def rpca_lambda_cv(Y, Ymask, Ytrue, max_iter = 1000, ncvseq = 20):
    """
    Scan a grid of `lmb` values for IALM and score each fit on the masked entries.

    The grid holds `ncvseq` log-spaced values between
    0.5 / sqrt(max(Y.shape)) and 1.0 / sqrt(min(Y.shape)). For every value
    a new IALM model is fitted to `Y` with `mask = Ymask`, and the fitted
    `L_` attribute is compared with `Ytrue` on the entries selected by
    `Ymask` using `get_masked_rmse`.

    Parameters
    ----------
    Y : numpy.ndarray
        Input matrix handed to `IALM.fit`.
    Ymask : numpy.ndarray
        Mask passed to `IALM.fit` and used to select the scored entries.
    Ytrue : numpy.ndarray
        Reference matrix the fitted `L_` is compared against.
    max_iter : int
        Forwarded to the IALM constructor.
    ncvseq : int
        Number of grid points.

    Returns
    -------
    lmb_seq : numpy.ndarray
        The grid of `lmb` values.
    rmse_seq : list
        Masked RMSE of each fit, in the same order as `lmb_seq`.
    """
    smaller_dim = np.min(Y.shape)
    larger_dim = np.max(Y.shape)
    log_lower = np.log10(0.5 / np.sqrt(larger_dim))
    log_upper = np.log10(1.0 / np.sqrt(smaller_dim))
    lmb_seq = np.logspace(log_lower, log_upper, ncvseq)

    rmse_seq = []
    for lmb in lmb_seq:
        # A fresh model per grid point, so no state carries over between fits.
        model = IALM(max_iter = max_iter, mu_update_method='admm', show_progress = True, print_skip = 100)
        model.fit(Y, mask = Ymask, lmb = lmb)
        rmse_seq.append(get_masked_rmse(Ytrue, model.L_, mask = Ymask))
    return lmb_seq, rmse_seq

In [63]:
# NOTE(review): `rpca_cv_lmb` is not assigned in any visible cell; presumably
# it is the first return value of an rpca_lambda_cv(...) call with
# ncvseq = 10 whose cell was removed -- confirm, since a fresh run fails here.
rpca_cv_lmb

array([0.00227715, 0.00332142, 0.00484457, 0.0070662 , 0.01030664,
       0.01503309, 0.02192702, 0.03198238, 0.04664896, 0.06804138])

In [53]:
# Recompute the lmb grid bounds with the same formulas as rpca_lambda_cv,
# here from the shape of X, and display the 20-point log-spaced grid.
lmb_max = 1. / np.sqrt(np.min(X.shape))
lmb_min = 0.5 / np.sqrt(np.max(X.shape))
np.logspace(np.log10(lmb_min), np.log10(lmb_max), 20)

array([0.00227715, 0.00272298, 0.00325608, 0.00389356, 0.00465585,
       0.00556738, 0.00665736, 0.00796075, 0.00951931, 0.01138301,
       0.01361158, 0.01627647, 0.01946309, 0.02327359, 0.02783012,
       0.03327872, 0.03979406, 0.04758498, 0.05690121, 0.06804138])

In [40]:
# Single IALM fit on the masked, centered matrix with a hand-picked lmb.
# NOTE(review): 0.006 is a magic number; it lies inside the grid shown in
# the previous cell, but how it was chosen is not recorded here.
rpca_model = IALM(max_iter = 10000, mu_update_method='admm', show_progress = True, print_skip = 100)
rpca_model.fit(Z_cent, mask = Z_mask, lmb = 0.006)

2024-02-27 14:47:01,544 | nnwmf.optimize.inexact_alm               | DEBUG   | Fit RPCA using IALM (mu update admm, lamba = 0.0060)
2024-02-27 14:47:05,207 | nnwmf.optimize.inexact_alm               | INFO    | Iteration 0. Primal residual 0.920551. Dual residual 0.00010833
2024-02-27 14:50:19,644 | nnwmf.optimize.inexact_alm               | INFO    | Iteration 100. Primal residual 1.01673e-05. Dual residual 5.46139e-06
2024-02-27 14:53:33,140 | nnwmf.optimize.inexact_alm               | INFO    | Iteration 200. Primal residual 3.87867e-07. Dual residual 2.3772e-07


In [42]:
# RMSE between the centered data and the fitted L_ on the hidden entries.
# NOTE(review): `root_mean_squared_error` is defined in a cell further down
# (In [25]); on a fresh top-to-bottom run this call raises NameError.
root_mean_squared_error(X_cent, rpca_model.L_, mask = Z_mask)

2.0022873683090268

In [25]:
def root_mean_squared_error(original, recovered, mask = None):
    """
    Root-mean-squared error between `original` and `recovered` over the
    entries selected by `mask`.

    Parameters
    ----------
    original, recovered : array_like
        Arrays of the same (or broadcastable) shape.
    mask : array_like, optional
        Boolean or 0/1 array; only entries with a non-zero mask contribute.
        Non-binary values act as multiplicative weights on the difference,
        and the squared sum is divided by the sum of the mask. With the
        default None every entry is used.

    Returns
    -------
    float
        The RMSE over the selected entries, or NaN when the mask selects
        nothing.
    """
    diff = np.asarray(original) - np.asarray(recovered)
    if mask is None:
        weights = np.ones_like(diff, dtype = float)
    else:
        weights = np.asarray(mask, dtype = float)
    n = np.sum(weights)
    if n == 0:
        # Nothing selected: the error is undefined (avoids a 0 / 0 warning).
        return np.nan
    diff, weights = np.broadcast_arrays(diff, weights)
    # Drop the excluded entries before multiplying. A NaN or inf at an
    # excluded position would otherwise survive the multiplication by zero
    # (NaN * 0 = NaN) and turn the whole result into NaN.
    keep = weights != 0
    mse = np.sum(np.square(diff[keep] * weights[keep])) / n
    return np.sqrt(mse)

In [26]:
# Baseline: the same error for the zero-filled input itself (the entries under
# Z_mask were set to 0.0 by generate_masked_input).
# NOTE(review): the value shown below is almost identical to the one obtained
# for rpca_model.L_, which suggests the fit left the hidden entries close to
# zero -- check how IALM interprets `mask`.
root_mean_squared_error(X_cent, Z_cent, mask = Z_mask)

2.0022873865886495

In [43]:
# Compare nuclear norms (ord = 'nuc' in np.linalg.norm) of the raw,
# centered, masked and recovered matrices. Each call works on the full
# matrix.
print ("Nuclear Norms")
print (f"Original matrix: {np.linalg.norm(X, 'nuc')}")
print (f"Original matrix centered: {np.linalg.norm(X_cent, 'nuc')}")
print (f"Masked matrix: {np.linalg.norm(Z_cent, 'nuc')}")
print (f"Recovered matrix: {np.linalg.norm(rpca_model.L_, 'nuc')}")

Nuclear Norms
Original matrix: 56014.783744890694
Original matrix centered: 55147.50855870614
Masked matrix: 64141.473914118724
Recovered matrix: 39632.63858664293


In [41]:
# Scratch cell: ten log-spaced values between 1e-3 and 1e-2. The result is
# only displayed, not assigned.
np.logspace(-3, -2, 10)

array([0.001     , 0.00129155, 0.0016681 , 0.00215443, 0.00278256,
       0.00359381, 0.00464159, 0.00599484, 0.00774264, 0.01      ])