In [7]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append('../smc')

import numpy as np   
import pandas as pd
from tqdm import tqdm
from time import time
import sys


from utils import *     # contains some useful helper functions 
from models import *    # toy models
from solvers import *   # matrix completion solvers
from methods import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# ---- Query budgets and reproducibility ----
max_calib_queries = 1000          # upper bound applied to the number of calibration queries
max_test_queries = 100            # upper bound applied to the number of test queries
matrix_generation_seed = 2024     # seed handed to the model's sample_noiseless(); the data matrix is fixed

# ---- Matrix completion solvers compared in each experiment ----
solvers = ["pmf", "nnm", "svt"]

# ---- Ground-truth model and matrix dimensions ----
model = "RFM"                     # "ROM" selects RandomOrthogonalModel; otherwise RandomFactorizationModel
n1, n2 = 200, 200

# ---- Noise / sampling settings (forwarded to NoiseModel and QuerySampling) ----
noise_model = "step"              # passed as `model=` to NoiseModel.get_noisy_matrix
mu = 15
gamma_n = 0.5
gamma_m = 0.9
prop_obs = 0.2                    # passed as `sub_size` when sampling the observation mask

# ---- Inference settings ----
alpha = 0.1                       # level passed to SimulCI.get_CI (presumably the target miscoverage; confirm)
allow_inf = False                 # forwarded to SimulCI.get_CI
verbose = True                    # enables progress prints and solver logging

# ---- Experiment sizes ----
r = 5                             # rank guess given to pmf_solve; stored as 'r_guess' in the results
k = 2                             # query-size argument for sampling, SimulCI and evaluate_SCI
repetition = 1                    # number of repetitions of the experiment loop

In [9]:
#################
# Generate Data #
#################
# Only "ROM" switches to the orthogonal model; "RFM" and every other value of
# `model` use the random factorization model. The third constructor argument is
# fixed at 5 (presumably the true rank, matching 'r_true' in the results header).
mm = (RandomOrthogonalModel if model == "ROM" else RandomFactorizationModel)(n1, n2, 5)

if verbose:
    print('Fixing the ground truth matrix generated from the {} model.\n'.format(model))
    sys.stdout.flush()

# Draw the noiseless ground truth once, with a dedicated seed, so it is shared
# by every repetition below.
U, V, M_true = mm.sample_noiseless(matrix_generation_seed)

Fixing the ground truth matrix generated from the RFM model.



In [10]:
# Header for results file
def add_header(df):
    """Attach the experiment-level settings to ``df`` as constant columns.

    The values are read from the notebook-level configuration (``n1``, ``n2``,
    ``alpha``, ``r``, ``gamma_n``, ``gamma_m``, ``mu``); ``r_true`` is fixed at 5.
    ``df`` is modified in place and also returned for convenience.
    """
    header_columns = {
        "n1": n1,
        "n2": n2,
        'alpha': alpha,
        'r_true': 5,
        'r_guess': r,
        'gamma_n': gamma_n,
        'gamma_m': gamma_m,
        'mu': mu,
    }
    # Insertion order of the mapping fixes the order of the new columns.
    for column_name, column_value in header_columns.items():
        df[column_name] = column_value
    return df
    

def run_single_experiment(M_true, k, alpha, prop_obs, max_test_queries, max_calib_queries,
                          r, gamma_n=0, gamma_m=0, mu=1, random_state=0):
    """Run one repetition of the experiment for every solver in ``solvers``.

    Steps: sample observation/test masks, sample test queries, add noise to
    ``M_true``, then for each solver split the observed entries into train and
    calibration sets, complete the matrix, and evaluate the simultaneous
    confidence intervals produced by ``SimulCI`` on the test queries.

    Besides its arguments, this function reads the notebook-level globals
    ``solvers``, ``noise_model``, ``verbose`` and ``allow_inf``.

    Parameters
    ----------
    M_true : 2-D array
        Noiseless matrix; its ``shape`` determines the sampler dimensions.
    k : int
        Query-size argument forwarded to the sampler, ``SimulCI`` and
        ``evaluate_SCI`` (each calibration query accounts for ``k`` entries
        in the reported training size).
    alpha : float
        Level forwarded to ``NoiseModel.get_noisy_matrix`` and ``SimulCI.get_CI``.
    prop_obs : float
        Passed as ``sub_size`` to ``QuerySampling.sample_submask``.
    max_test_queries, max_calib_queries : int
        Upper bounds on the number of test / calibration queries.
    r : int
        Rank guess, passed to ``pmf_solve`` as ``k=r``.
    gamma_n, gamma_m, mu :
        Noise settings forwarded to ``NoiseModel.get_noisy_matrix``.
    random_state : int
        Seed passed to the sampler, the noise model and the solvers.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the ``evaluate_SCI`` results of every solver, extended
        with the columns ``solver``, ``MAE``, ``RMSE``, ``Frobenius_error``,
        ``solver_runtime``, ``k``, ``Calib_queries``, ``Train_entries``,
        ``Test_queries`` and ``random_state``.

    Raises
    ------
    ValueError
        If ``solvers`` is empty or contains a name other than
        ``"pmf"``, ``"svt"`` or ``"nnm"``.
    """
    # Fail fast, before any expensive work. Previously an unknown solver name
    # fell through the if/elif chain and either raised NameError on `Mhat` or
    # silently reused the previous solver's estimate under the wrong label.
    known_solvers = ("pmf", "svt", "nnm")
    if len(solvers) == 0:
        raise ValueError("solvers must contain at least one solver name")
    unknown_solvers = [name for name in solvers if name not in known_solvers]
    if unknown_solvers:
        raise ValueError(
            f"Unknown solver(s) {unknown_solvers}; expected a subset of {list(known_solvers)}"
        )

    #-------Generate masks----------#
    #-------------------------------#
    n1, n2 = M_true.shape
    sampler = QuerySampling(n1,n2)
    mask_obs, mask_test = sampler.sample_submask(sub_size=prop_obs, random_state=random_state)
    # Use at most half of the available size-k groups (counted row by row) for calibration.
    n_calib_queries = min(int(0.5 * np.sum(np.sum(mask_obs, axis=1) // k)), max_calib_queries)


    #------Sample test queries------#
    #-------------------------------#
    n_test_queries = min(int(0.99 * np.sum(np.sum(mask_test, axis=1) // k)), max_test_queries)
    _, idxs_test, _ = sampler.sample_train_calib(mask_test, k, calib_size=n_test_queries, random_state=random_state)
    if verbose:
        print("Training size:{}, calib size: {}, test size: {}\n".format(np.sum(mask_obs)-n_calib_queries*k, n_calib_queries, n_test_queries))
        sys.stdout.flush()


    #--------Generate noise---------#
    #-------------------------------#
    nm = NoiseModel(random_state)
    M = nm.get_noisy_matrix(M_true, gamma_n=gamma_n, gamma_m=gamma_m, model=noise_model,
                            mu=mu, alpha=alpha, normalize=False)


    # Collect one result frame per solver and concatenate once at the end,
    # instead of growing a DataFrame with pd.concat inside the loop.
    solver_results = []
    n_train_entries = None
    for solver in solvers:
        #------Split train calib--------#
        #-------------------------------#
        # Called with identical arguments for every solver. It is kept inside the
        # loop so the sequence of sampler/solver calls matches the original run
        # (NOTE(review): could be hoisted if sample_train_calib is confirmed to be
        # a pure function of its arguments).
        mask_train, idxs_calib, _ = sampler.sample_train_calib(mask_obs, k,
                                    calib_size=n_calib_queries, random_state=random_state)
        n_train_entries = np.sum(mask_train)

        #--------Model Training---------#
        #-------------------------------#
        print("Running matrix completion algorithm on the training set...")
        sys.stdout.flush()
        tik = time()
        if solver == "pmf":
            Mhat, _, _ = pmf_solve(M, mask_train, k=r, verbose=verbose, random_state=random_state)
        elif solver == "svt":
            Mhat = svt_solve(M, mask_train, tau=5 * np.sum(M.shape) / 2, delta=2,verbose = verbose, random_state = random_state)
        else:  # "nnm" -- guaranteed by the validation at the top of the function
            Mhat = nnm_solve(M, mask_train, verbose=verbose, random_state=random_state)

        tok=time()
        print(f"run time for {solver} is {tok-tik}.")
        # Errors are measured on every entry outside the training mask.
        mae, rmse, relative_error = compute_error(M, Mhat, np.ones_like(M)-mask_train)
        print(f"Done training with {solver}! Frobenius_error: {relative_error}\n")
        sys.stdout.flush()


        #------Compute intervals--------#
        #-------------------------------#
        ci_method = SimulCI(M, Mhat, mask_obs, idxs_calib, k)
        df = ci_method.get_CI(idxs_test, alpha, allow_inf=allow_inf)
        lower, upper, is_inf= df.loc[0].lower, df.loc[0].upper, df.loc[0].is_inf
        tmp_res = evaluate_SCI(lower, upper, k, M, idxs_test, is_inf=is_inf, method="conformal")
        tmp_res['solver'] = solver
        tmp_res['MAE'] = mae
        tmp_res['RMSE'] = rmse
        tmp_res['Frobenius_error'] = relative_error
        tmp_res['solver_runtime'] = tok-tik
        solver_results.append(tmp_res)

    res = pd.concat(solver_results)

    res['k'] = k
    res['Calib_queries'] = n_calib_queries
    res['Train_entries'] = n_train_entries
    res['Test_queries'] = n_test_queries
    res['random_state'] = random_state
    return res

In [11]:
seed = 1 

#####################
#  Run Experiments  #
#####################
# Gather the per-repetition frames in a list and concatenate once afterwards;
# growing a DataFrame with pd.concat inside the loop re-copies all earlier rows
# on every iteration and starts from an empty frame.
repetition_results = []

for i in tqdm(range(1, repetition+1), desc="Repetitions", leave=True, position=0):
    # Distinct seed per (seed, repetition index) pair: seed=1 gives 1..repetition,
    # seed=2 gives repetition+1..2*repetition, and so on.
    random_state = repetition * (seed-1) + i
    
    res = run_single_experiment(M_true, k, alpha, prop_obs, max_test_queries, max_calib_queries,
                        r, gamma_n=gamma_n, gamma_m=gamma_m, mu=mu, random_state=random_state)
    
    repetition_results.append(res)

# pd.concat rejects an empty list, so fall back to an empty frame when repetition == 0.
results = pd.concat(repetition_results) if repetition_results else pd.DataFrame({})

# add_header fills the settings columns in place and returns the frame, which the
# notebook displays as the cell output.
add_header(results)

Repetitions:   0%|                                                                               | 0/1 [00:00<?, ?it/s]

Training size:6000, calib size: 1000, test size: 100

Running matrix completion algorithm on the training set...
Iteration: 1; Mean diff: 0.0140
Iteration: 2; Mean diff: 0.0060
Iteration: 3; Mean diff: 0.0038
Iteration: 4; Mean diff: 0.0031
Iteration: 5; Mean diff: 0.0028
Iteration: 6; Mean diff: 0.0025
Iteration: 7; Mean diff: 0.0017
Iteration: 8; Mean diff: 0.0012
Iteration: 9; Mean diff: 0.0011
Iteration: 10; Mean diff: 0.0010
Iteration: 11; Mean diff: 0.0009
Iteration: 12; Mean diff: 0.0008
Iteration: 13; Mean diff: 0.0008
Iteration: 14; Mean diff: 0.0008
Iteration: 15; Mean diff: 0.0007
Iteration: 16; Mean diff: 0.0007
Iteration: 17; Mean diff: 0.0007
Iteration: 18; Mean diff: 0.0007
Iteration: 19; Mean diff: 0.0007
Iteration: 20; Mean diff: 0.0006
Iteration: 21; Mean diff: 0.0005
Iteration: 22; Mean diff: 0.0004
Iteration: 23; Mean diff: 0.0003
Iteration: 24; Mean diff: 0.0003
Iteration: 25; Mean diff: 0.0003
Iteration: 26; Mean diff: 0.0002
Iteration: 27; Mean diff: 0.0002
Itera

CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 26.11it/s]

Done!





Running matrix completion algorithm on the training set...
                                     CVXPY                                     
                                     v1.3.0                                    
(CVXPY) Oct 25 02:43:40 PM: Your problem has 40000 variables, 1 constraints, and 0 parameters.
(CVXPY) Oct 25 02:43:40 PM: It is compliant with the following grammars: DCP, DQCP
(CVXPY) Oct 25 02:43:40 PM: (If you need to solve this problem multiple times, but with different data, consider using parameters.)
(CVXPY) Oct 25 02:43:40 PM: CVXPY will first compile your problem; then, it will invoke a numerical solver to obtain a solution.
-------------------------------------------------------------------------------
                                  Compilation                                  
-------------------------------------------------------------------------------
(CVXPY) Oct 25 02:43:40 PM: Compiling problem (target solver=SCS).
(CVXPY) Oct 25 02:43:40 PM: Reducti

CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 29.91it/s]

Done!





Running matrix completion algorithm on the training set...
Iteration: 1; Rel error: 1.0000
Iteration: 11; Rel error: 0.5982
Iteration: 21; Rel error: 0.5723
Iteration: 31; Rel error: 0.5314
Iteration: 41; Rel error: 0.6529
Iteration: 51; Rel error: 0.7027
Iteration: 61; Rel error: 0.5502
Iteration: 71; Rel error: 0.5020
Iteration: 81; Rel error: 0.4845
Iteration: 91; Rel error: 0.4787
Iteration: 101; Rel error: 0.6052
Iteration: 111; Rel error: 0.5745
Iteration: 121; Rel error: 0.6142
Iteration: 131; Rel error: 0.4319
Iteration: 141; Rel error: 0.5722
Iteration: 151; Rel error: 0.6216
Iteration: 161; Rel error: 0.4554
Iteration: 171; Rel error: 0.4927
Iteration: 181; Rel error: 0.5648
Iteration: 191; Rel error: 0.6671
run time for svt is 26.361884593963623.
Done training with svt! Frobenius_error: 0.48263354974012873

Computing conformal prediction intervals for 100 test queries...


CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 29.03it/s]

Done!



Repetitions: 100%|██████████████████████████████████████████████████████████████████████| 1/1 [03:06<00:00, 186.79s/it]


Unnamed: 0,Query_coverage,Coverage,Size,Inf_prop,Method,solver,MAE,RMSE,Frobenius_error,solver_runtime,...,Test_queries,random_state,n1,n2,alpha,r_true,r_guess,gamma_n,gamma_m,mu
0,0.91,0.95,2.576986,0.0,conformal,pmf,0.399003,0.609093,0.312573,32.776103,...,100,1,200,200,0.1,5,5,0.5,0.9,15
0,0.84,0.9,1.131428,0.0,conformal,nnm,0.226677,0.485283,0.249036,110.721013,...,100,1,200,200,0.1,5,5,0.5,0.9,15
0,0.85,0.885,2.196006,0.0,conformal,svt,0.478635,0.940481,0.482634,26.361885,...,100,1,200,200,0.1,5,5,0.5,0.9,15
