In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
import sys
sys.path.append("/gpfs/commons/home/sbanerjee/work/npd/lrma-dsc/dsc/functions")
import simulate as dsc_simulate

In [3]:
# Root directory of the PanUKB data on the cluster file system.
data_dir = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/data"
# Load the pickled z-score table; its values are transposed into the data
# matrix in the next cell.
zscore_df = pd.read_pickle(
    os.path.join(data_dir, "modselect/zscore_noRx.pkl")
)

In [4]:
# Rows of X are the columns of zscore_df (the table is transposed here).
X = np.array(zscore_df.values.T)
# Subtract the per-column mean; keepdims keeps a (1, n_columns) row so the
# subtraction broadcasts over every row of X.
Ztrue = X - X.mean(axis = 0, keepdims = True)

In [5]:
# Fraction of entries to hide from the model for held-out evaluation.
mask_ratio = 0.2
# Build a mask with the same (rows, columns) shape as X. Later cells show it
# as a boolean array.
# NOTE(review): no random seed is set in this notebook. If generate_mask draws
# from NumPy's global RNG, the mask differs on every run -- TODO confirm and
# seed explicitly.
Zmask = dsc_simulate.generate_mask(X.shape[0], X.shape[1], mask_ratio)
# Masked version of the centered matrix that is fed to the models.
# presumably masked entries become NaN (later cells compare Z with
# equal_nan = True) -- verify against dsc_simulate.generate_masked_input.
Z = dsc_simulate.generate_masked_input(Ztrue, Zmask)

In [6]:
Z.shape

(2110, 51368)

In [9]:
Ztrue

array([[-0.85061846,  0.39508954, -1.77324203, ...,  0.64890079,
         1.59639086,  0.69026144],
       [ 0.61475513,  1.1604626 , -0.20645511, ...,  0.49504822,
         1.306907  , -0.28729541],
       [ 0.76398184,  0.50904104,  0.3150178 , ...,  0.62372307,
        -1.34546056, -0.79042181],
       ...,
       [-4.24003393, -5.31716186, -2.22136784, ..., -1.44154209,
         1.68784194, -0.00811659],
       [-4.64879786, -6.61521401, -3.25765615, ..., -1.30312517,
         1.81307603,  0.09153624],
       [ 3.00581218,  3.81444145,  4.47445101, ...,  2.30903555,
         2.37319293, -0.78436375]])

In [20]:
# One .npy file holding three arrays written back-to-back. They must be read
# with three successive np.load calls in the same order: Ztrue, Z, Zmask.
cv_input_data_file = os.path.join(data_dir, "cv_input/zscore_noRx.npy")
with open(cv_input_data_file, 'wb') as f:
    for _arr in (Ztrue, Z, Zmask):
        np.save(f, _arr, allow_pickle=True)

In [21]:
# Read the three arrays back in the order they were written
# (Ztrue, Z, Zmask) to check the round trip.
with open(cv_input_data_file, 'rb') as f:
    Atrue, A, Amask = [np.load(f) for _ in range(3)]

In [24]:
np.allclose(Z, A, equal_nan = True)

True

In [26]:
from nnwmf.optimize import IALM
from nnwmf.optimize import FrankWolfe, FrankWolfe_CV
# Cross-validation driver; it is used here only to build the sequence of
# candidate constraint values, not to fit anything.
nnmcv = FrankWolfe_CV(kfolds = 2, model = 'nnm', max_iter = 100)
# Nuclear norm (sum of singular values) of the fully observed centered matrix.
nucnormY = np.linalg.norm(Ztrue, 'nuc')
# NOTE(review): _generate_rseq is a private method of FrankWolfe_CV; the
# sequence it returns is saved to disk in a later cell. Confirm that it is
# stable across nnwmf versions.
rseq = nnmcv._generate_rseq(nucnormY)

In [29]:
# Write the candidate sequence as plain text, one value per line, using the
# compact "%g" format. The path points into a cross-validation jobs directory.
rseq_file = "/gpfs/commons/home/sbanerjee/work/npd/PanUKB/jobs/cross_validation/rseq.txt"
np.savetxt(rseq_file, rseq, fmt = "%g")

In [72]:
import pickle
# Result file for one fitted model, produced outside this notebook.
# Later cells index the loaded object with the keys 'mask_' and 'X_'.
res_file = "/gpfs/commons/home/sbanerjee/npddata/panukb/cross_validation/noRx/nnm_model_r4096.pkl"
# SECURITY: pickle.load executes arbitrary code embedded in the file; only
# load result files that were written by trusted jobs.
with open(res_file, "rb") as mfile:
    model = pickle.load(mfile)

In [64]:
model.keys()



In [65]:
model['mask_']

array([[False, False, False, ...,  True, False,  True],
       [ True, False,  True, ..., False, False, False],
       [ True,  True, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [73]:
model['X_'][model['mask_']]

array([-0.01863855,  0.01805154, -0.01264652, ..., -0.03469477,
       -0.1126403 ,  0.33159743])

In [67]:
def get_masked_rmse(original, recovered, mask = None):
    '''
    Root-mean-square error between `original` and `recovered`,
    restricted to the entries selected by `mask`.

    Parameters
    ----------
    original, recovered : array_like
        Arrays of the same (or broadcastable) shape.
    mask : array_like, optional
        Boolean (or 0/1) array marking the entries to score.
        If None, every entry is scored.

    Returns
    -------
    float
        sqrt( sum of squared errors over scored entries / number of scored entries ).
        Entries whose error is NaN are left out of both the sum and the count.
        The result is NaN if no entry is scored.
    '''
    diff = np.asarray(original) - np.asarray(recovered)
    if mask is None: mask = np.ones_like(diff)
    # np.nansum drops NaN errors from the numerator, so they must also be
    # dropped from the denominator; otherwise the RMSE is biased towards zero.
    valid = ~np.isnan(diff)
    n = np.sum(np.where(valid, mask, 0))
    mse = np.nansum(np.square(diff * mask)) / n
    return np.sqrt(mse)

In [77]:
get_masked_rmse(Ztrue, np.ones_like(Ztrue), Zmask)

1.5584433818025651

In [78]:
get_masked_rmse(Ztrue, np.zeros_like(Ztrue), Zmask)

1.195272107881818

In [74]:
get_masked_rmse(Ztrue, model['X_'], Zmask)

1.1401398084902823

In [75]:
get_masked_rmse(Ztrue, model['X_'], ~Zmask)

1.1384288245272913

In [76]:
# Baseline: compare Ztrue against a copy of itself with shuffled rows.
Zshuffle = Ztrue.copy()
# np.random.shuffle permutes in place along the first axis only (whole rows).
# NOTE(review): the global RNG is not seeded anywhere in this notebook, so
# this baseline value changes from run to run.
np.random.shuffle(Zshuffle)
# No mask is passed, so the error is computed over every entry.
get_masked_rmse(Ztrue, Zshuffle)

1.6929665463055956

In [71]:
model['X_'] * Zmask

array([[-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, ...,
        -1.78566264e-06,  0.00000000e+00,  1.18711646e-06],
       [-4.65860387e-06, -0.00000000e+00, -6.04423556e-06, ...,
        -0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.99896368e-06, -1.85699379e-06, -0.00000000e+00, ...,
        -0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, ...,
        -0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-0.00000000e+00, -0.00000000e+00, -0.00000000e+00, ...,
        -0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -0.00000000e+00, -0.00000000e+00]])

In [49]:
Zmask

array([[False, False, False, ...,  True, False,  True],
       [ True, False,  True, ..., False, False, False],
       [ True,  True, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [6]:
from nnwmf.optimize import IALM
from nnwmf.optimize import FrankWolfe, FrankWolfe_CV

def do_center_nan(X, mask = None):
    '''
    Column-center X while ignoring missing entries.

    X may already contain NaN values. If mask is given, the entries
    selected by it are additionally set to NaN (on a copy; X itself
    is not modified) before the column means are computed.

    Returns a tuple (centered, missing):
      centered -- the NaN-filled copy minus its per-column NaN-ignoring mean;
                  NaN entries remain NaN.
      missing  -- boolean array, True wherever the NaN-filled copy is NaN.
    '''
    working = X.copy()
    if mask is not None:
        working[mask] = np.nan
    column_means = np.nanmean(working, axis = 0, keepdims = True)
    # NaNs are deliberately left in the centered matrix (not replaced by 0).
    return working - column_means, np.isnan(working)

def nnm(Y, max_iter = 1000, mask = None):
    '''
    Fit a nuclear-norm-constrained model to Y, choosing the constraint
    by 2-fold cross-validation.

    Parameters
    ----------
    Y : ndarray
        Input matrix; may contain NaN values.
    max_iter : int
        Maximum number of iterations for the final FrankWolfe fit.
        (It is not passed to the cross-validation step.)
    mask : boolean ndarray, optional
        Entries to hide; they are set to NaN before centering and the
        same mask is passed to the final fit.

    Returns
    -------
    (X, nnm_dict) : the fitted matrix (attribute `X` of the FrankWolfe
        object) and a dict of model properties, extended with the
        cross-validation "train_error" and "test_error".

    NOTE(review): class_to_dict is not defined in the visible part of this
    notebook; it must be defined or imported before calling this function.
    '''
    # NOTE(review): Y_mask (all NaN positions, including those already in Y)
    # is currently unused; the final fit receives only the caller's `mask`.
    # Confirm whether FrankWolfe.fit should get Y_mask instead.
    Y_cent, Y_mask  = do_center_nan(Y, mask = mask)
    nnmcv = FrankWolfe_CV(kfolds = 2, model = 'nnm')
    # Fix: the original called nnmcv.fit(Y_nan), but Y_nan is not defined
    # anywhere, which raised NameError. Cross-validate on the centered,
    # NaN-masked matrix.
    nnmcv.fit(Y_cent)
    rank = nnmcv._optimized_rank()
    # Local name differs from the function name to avoid shadowing `nnm`.
    nnm_model = FrankWolfe(model = 'nnm', max_iter = max_iter, svd_max_iter = 50, show_progress = False, debug = False)
    nnm_model.fit(Y_cent, rank, mask = mask)
    nnm_dict = class_to_dict(nnm_model)
    nnm_dict["train_error"] = nnmcv.train_error_
    nnm_dict["test_error"] = nnmcv.test_error_
    return nnm_model.X, nnm_dict

In [7]:
Y_cent, Y_mask  = do_center_nan(Z, mask = Zmask)

We do not want the CV to know the masked values. After training, we will compare the accuracy of predicting the masked values.

In [8]:
np.sum(Y_mask) / np.prod(Y_mask.shape)

0.2

In [9]:
np.allclose(Y_cent, Z, equal_nan = True)

True

In [10]:
nnmcv = FrankWolfe_CV(kfolds = 2, model = 'nnm', max_iter = 100)

In [11]:
Y_cent_zero = np.nan_to_num(Y_cent, nan = 0.0)
nucnormY = np.linalg.norm(Y_cent_zero, 'nuc')

In [12]:
nucnormY

432478.86445869633

In [13]:
rseq = nnmcv._generate_rseq(nucnormY)

In [20]:
nnmcv.fit(Y_cent[:100, :2000], rseq = rseq)

In [21]:
nnmcv

<nnwmf.optimize.frankwolfe_cv.FrankWolfe_CV at 0x7faa8633e090>

In [22]:
property_list = [ x for x in vars(nnmcv).keys() if x not in ["logger_", "nnm_"] ]

In [23]:
property_list

['kfolds_',
 'do_shuffle_',
 'test_size_',
 'return_fits_',
 'do_chain_initialize_',
 'do_reverse_path_',
 'kwargs_',
 'is_debug_',
 'train_error_',
 'test_error_',
 'fold_labels_']

In [24]:
nnmcv.train_error_

{1.0: [1.0230127342605657, 1.022351730725972],
 2.0: [1.0224644841725752, 1.0218000255475013],
 4.0: [1.0213982836003426, 1.0207255578515169],
 8.0: [1.0193410384261383, 1.0186485976839488],
 16.0: [1.0154624569574233, 1.0147930642536858],
 32.0: [1.0081555538994766, 1.0075015574928594],
 64.0: [0.9946049589353289, 0.9940010292114604],
 128.0: [0.9693370402552203, 0.9686281683766891],
 256.0: [0.923229476081756, 0.9223724136119501],
 512.0: [0.8431255196164988, 0.8421861988768187],
 1024.0: [0.7158642569610191, 0.714864360139593],
 2048.0: [0.5401219187821557, 0.5392288475975265],
 4096.0: [0.3396963457635498, 0.3388246789288757],
 8192.0: [0.3182982560701147, 0.31960697103158603],
 16384.0: [0.31565975520059864, 0.3170197509845071],
 32768.0: [0.3128111690691166, 0.3141097106116839],
 65536.0: [0.3097279195714216, 0.31107523201378673],
 131072.0: [0.30673700154099715, 0.3080209891377426],
 262144.0: [0.303657905385584, 0.30488513264113276],
 524288.0: [0.3005838445204732, 0.3017351436

In [25]:
nnmcv.test_error_

{1.0: [1.0226770367804123, 1.0233049898362747],
 2.0: [1.0224379488173183, 1.02305329092544],
 4.0: [1.0219864187996635, 1.0225529371158433],
 8.0: [1.021098123143363, 1.0216468909517626],
 16.0: [1.019688812440919, 1.0202453123890118],
 32.0: [1.017565762894877, 1.0179647472164948],
 64.0: [1.0148020028924771, 1.0151284948688857],
 128.0: [1.0120328166082362, 1.0118761635993958],
 256.0: [1.0087210277901801, 1.0080821880791744],
 512.0: [1.006427288353525, 1.005055054008481],
 1024.0: [1.0059670149417996, 1.003855751358362],
 2048.0: [1.0065642399885821, 1.0047850631391877],
 4096.0: [1.0059385586090575, 1.0042638948731373],
 8192.0: [1.0068334283194753, 1.0045704902134889],
 16384.0: [1.00700284341479, 1.004738126749744],
 32768.0: [1.0070721683509762, 1.0048512798559956],
 65536.0: [1.0071411345796781, 1.0049030793148739],
 131072.0: [1.0072017065661, 1.0050226132923779],
 262144.0: [1.007458652805853, 1.0052434076570862],
 524288.0: [1.0077489647384095, 1.005297710649436]}

In [34]:
nnmcv._optimized_rank()

1024.0

In [27]:
nnmcv.fold_labels_.shape

(100, 2000)

In [28]:
# Rebuild the train/test inputs of one CV fold by hand, for inspection.
k = 0
# Select the entries whose fold label equals k + 1.
# presumably fold labels are 1-based -- verify against FrankWolfe_CV.
test_mask = nnmcv.fold_labels_ == k + 1
# Same 100 x 2000 sub-matrix that was passed to nnmcv.fit.
# NOTE(review): _generate_masked_input is a private method; which side of the
# mask it blanks out is not visible here (the next cells only count NaNs).
Y_test  = nnmcv._generate_masked_input(Y_cent[:100, :2000], test_mask)
Y_train = nnmcv._generate_masked_input(Y_cent[:100, :2000], ~test_mask)

In [29]:
np.sum(np.isnan(Y_test))

120205

In [30]:
np.sum(np.isnan(Y_train))

119810

In [31]:
nnmcv.nnm_[1.0][0].X

array([[ 5.56978992e-05,  5.33634087e-04,  4.70633870e-04, ...,
        -7.20409447e-05, -8.84159877e-05, -3.40204302e-04],
       [ 3.21970668e-06,  3.08883050e-05,  2.73383332e-05, ...,
        -4.25033691e-06, -5.24284694e-06, -1.97329412e-05],
       [ 1.47032604e-05,  1.40785130e-04,  1.23962861e-04, ...,
        -1.88386291e-05, -2.30656965e-05, -8.96686082e-05],
       ...,
       [-3.71035600e-05, -3.56364206e-04, -3.16380857e-04, ...,
         4.98464624e-05,  6.17470754e-05,  2.28074692e-04],
       [ 8.25397356e-05,  7.89707107e-04,  6.93876276e-04, ...,
        -1.04449455e-04, -1.27481216e-04, -5.02356739e-04],
       [-2.15238132e-04, -2.05968833e-03, -1.81063948e-03, ...,
         2.73165200e-04,  3.33649058e-04,  1.31060891e-03]])

In [32]:
from nnwmf.utils import model_errors as merr
merr.get(Y_test, nnmcv.nnm_[1.0][0].X, method = 'psnr')

25.251514871592754

In [None]:
# NOTE(review): this cell was never executed (In [None]). rpca_lambda_cv and
# rpca are not defined or imported in the visible notebook, and max_iter is
# only assigned in a later cell, so running it as-is raises NameError. It
# looks like a sketch of the lambda-selection workflow that the cells below
# carry out by hand.
cvlmb, cvrmse = rpca_lambda_cv(Z, Zmask, Ztrue, max_iter = max_iter)
# Pick the lambda with the smallest cross-validation RMSE.
_lmbopt = cvlmb[np.argmin(cvrmse)]
X, M, model = rpca(Z, mask = Zmask, max_iter = max_iter, lmb = _lmbopt)

In [43]:
# Number of lambda values to try.
ncvseq = 2
# Bounds of the lambda grid: 0.5 / sqrt(larger dimension) up to
# 1 / sqrt(smaller dimension) of the data matrix.
lmb_min = 0.5 / np.sqrt(max(Y_cent.shape))
lmb_max = 1.0 / np.sqrt(min(Y_cent.shape))
# Log-spaced grid between the two bounds (both endpoints included).
lmb_seq = np.logspace(np.log10(lmb_min), np.log10(lmb_max), num = ncvseq)
# One RMSE slot per lambda, filled in by the fitting loop.
rmse_seq = [0.] * ncvseq

In [44]:
lmb_seq

array([0.00220609, 0.02177002])

In [46]:
def get_masked_rmse(original, recovered, mask = None):
    '''
    Root-mean-square error between `original` and `recovered`,
    restricted to the entries selected by `mask`.

    Parameters
    ----------
    original, recovered : array_like
        Arrays of the same (or broadcastable) shape.
    mask : array_like, optional
        Boolean (or 0/1) array marking the entries to score.
        If None, every entry is scored.

    Returns
    -------
    float
        sqrt( sum of squared errors over scored entries / number of scored entries ).
        Entries whose error is NaN are left out of both the sum and the count.
        The result is NaN if no entry is scored.
    '''
    diff = np.asarray(original) - np.asarray(recovered)
    if mask is None: mask = np.ones_like(diff)
    # np.nansum drops NaN errors from the numerator, so they must also be
    # dropped from the denominator; otherwise the RMSE is biased towards zero.
    valid = ~np.isnan(diff)
    n = np.sum(np.where(valid, mask, 0))
    mse = np.nansum(np.square(diff * mask)) / n
    return np.sqrt(mse)

# Small iteration budget: this is a quick trial run of the lambda loop.
max_iter = 5
# IALM is given a NaN-free matrix: masked (NaN) entries are replaced by 0 and
# the mask is passed separately to fit().
Z_nan_zero = np.nan_to_num(Z, copy = True, nan = 0.0) 
for i in range(ncvseq):
    # NOTE(review): this rebinds the notebook-global `model`, which an earlier
    # cell set to the unpickled result dict; cells that index model['X_'] or
    # model['mask_'] fail if they are re-run after this loop.
    model = IALM(max_iter = max_iter, mu_update_method='admm', show_progress = True, print_skip = 100)
    model.fit(Z_nan_zero, mask = Zmask, lmb = lmb_seq[i])
    # Score the fitted L_ attribute on the held-out (masked) entries against
    # the fully observed matrix.
    rmse_seq[i] = get_masked_rmse(Ztrue, model.L_, mask = Zmask)

2024-09-12 14:55:22,986 | nnwmf.optimize.inexact_alm               | DEBUG   | Fit RPCA using IALM (mu update admm, lamba = 0.0022)
2024-09-12 14:55:53,004 | nnwmf.optimize.inexact_alm               | INFO    | Iteration 0. Primal residual 0.944597. Dual residual 0.000108596
2024-09-12 14:57:03,872 | nnwmf.optimize.inexact_alm               | DEBUG   | Fit RPCA using IALM (mu update admm, lamba = 0.0218)
2024-09-12 14:57:34,133 | nnwmf.optimize.inexact_alm               | INFO    | Iteration 0. Primal residual 0.975676. Dual residual 4.48568e-05


In [47]:
rmse_seq

[1.1688295974564777, 1.0243196057425614]