In [7]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import sys, os
sys.path.append('../smc')
sys.path.append('../third_party')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import numpy as np   
import pandas as pd
import scipy.stats as stats
import sys
from tqdm import tqdm

from utils import *     # contains some useful helper functions 
from models import *    # toy models
from solvers import *   # matrix completion solvers
from methods import *
from missingness_estimation import *

In [9]:
# Settings forwarded to run_single_experiment by the driver loop.
r = 5          # rank handed to the missingness estimator and to the PMF solver
scale = 1      # forwarded to SamplingBias.inc_weights
seed = 1       # offsets the per-repetition random_state

# Fixed data parameters: caps on the number of queries, and the seed that
# pins the ground-truth matrix so that it is identical across runs.
max_test_queries, max_calib_queries = 100, 2000
matrix_generation_seed = 2024

# Dimensions of the (square) ground-truth matrix.
n1, n2 = 400, 400

# Data-generating model, completion solver and observed-entry proportion.
model, solver = "RFM", "pmf"
prop_obs = 0.3

# Other parameters
verbose, allow_inf = True, False
alpha = 0.1

# Query sizes to sweep over, and number of repetitions per size.
k_list = [2, 5, 8]
repetition = 1

In [10]:
#################
# Generate Data #
#################
# Choose the generator for the ground-truth matrix. Only "ROM" selects the
# orthogonal model; "RFM" and any other value use the random factorization
# model. The third constructor argument is fixed at 8 for both models.
if model == "ROM":
    mm = RandomOrthogonalModel(n1, n2, 8)
else:
    mm = RandomFactorizationModel(n1, n2, 8)

if verbose:
    # flush=True pushes the message out immediately, before sampling starts.
    print('Fixing the ground truth matrix generated from the {} model.\n'.format(model), flush=True)

# Sampling with a fixed seed keeps the data matrix identical across runs.
U, V, M = mm.sample_noiseless(matrix_generation_seed)

Fixing the ground truth matrix generated from the RFM model.



In [11]:
#####################
# Define Experiment #
#####################
def _simul_ci_scores(M_true, Mhat, mask_obs, idxs_calib, idxs_test, k, alpha, w_obs, method):
    """Build simultaneous confidence intervals with the given sampling weights and score them.

    Returns the ``SimulCI`` object (so the caller can read the stored
    ``weights_list``) together with the frame produced by ``evaluate_SCI``.
    Reads the notebook-level ``allow_inf`` flag.
    """
    ci = SimulCI(M_true, Mhat, mask_obs, idxs_calib, k, w_obs=w_obs)
    df = ci.get_CI(idxs_test, alpha, allow_inf=allow_inf, store_weights=True)
    first = df.loc[0]
    scores = evaluate_SCI(first.lower, first.upper, k, M_true, idxs_test,
                          is_inf=first.is_inf, method=method)
    return ci, scores


def run_single_experiment(M_true, k, alpha, prop_obs, max_test_queries, max_calib_queries,
                          r, scale, random_state=0):
    """Run one conformal matrix-completion experiment for a single query size ``k``.

    The pipeline is: sample a biased observation mask, estimate the
    observation probabilities, draw test and calibration queries, fit the
    completion model on the remaining training entries, and build intervals
    twice: once with the oracle sampling weights (method ``"conformal"``)
    and once with the estimated weights (method ``"est"``).

    Parameters
    ----------
    M_true : 2-D array
        Ground-truth matrix. It is used for training, interval construction
        and evaluation; the notebook-global ``M`` is no longer consulted.
    k : int
        Query size passed to the samplers and to ``SimulCI``; each
        calibration query accounts for ``k`` observed entries.
    alpha : float
        Level passed to ``SimulCI.get_CI``.
    prop_obs : float
        Passed as ``sub_size`` to ``QuerySampling.sample_submask``.
    max_test_queries, max_calib_queries : int
        Upper bounds on the number of test / calibration queries.
    r : int
        Guessed rank, used for the missingness estimate and as the PMF rank.
    scale : float
        Forwarded to ``SamplingBias.inc_weights``.
    random_state : int, default 0
        Seed forwarded to every sampler and to the solver.

    Returns
    -------
    pandas.DataFrame
        The ``evaluate_SCI`` rows for both methods, with the extra columns
        ``k``, ``avg_gap``, ``Calib_queries``, ``Train_entries``,
        ``Test_queries`` and ``random_state``.

    Raises
    ------
    ValueError
        If the notebook-level ``solver`` is neither ``"pmf"`` nor ``"svt"``.

    Notes
    -----
    Reads the notebook-level settings ``solver``, ``verbose`` and
    ``allow_inf``.
    """
    # Fail fast: previously an unknown solver left ``Mhat`` undefined and only
    # surfaced as a NameError after the expensive estimation step.
    if solver not in ("pmf", "svt"):
        raise ValueError(f"Unknown solver {solver!r}; expected 'pmf' or 'svt'.")

    #--------Observation bias-------#
    #-------------------------------#
    n1, n2 = M_true.shape
    bm = SamplingBias(n1, n2)
    w_obs = bm.inc_weights(scale=scale)

    #-------Generate masks----------#
    #-------------------------------#
    sampler = QuerySampling(n1, n2)
    mask_obs, mask_test = sampler.sample_submask(sub_size=prop_obs, w=w_obs, random_state=random_state)
    # Half of the size-k groups available per row (summed over rows), capped.
    n_calib_queries = min(int(0.5 * np.sum(np.sum(mask_obs, axis=1) // k)), max_calib_queries)

    print(f"Estimating missingness with guessed rank {r}...")
    w_obs_est = estimate_P(mask_obs, 1, r=r)
    print("Done estimating!\n")
    sys.stdout.flush()

    #------Sample test queries------#
    #-------------------------------#
    n_test_queries = min(int(0.99 * np.sum(np.sum(mask_test, axis=1) // k)), max_test_queries)
    _, idxs_test, _ = sampler.sample_train_calib(mask_test, k, calib_size=n_test_queries, random_state=random_state)
    if verbose:
        print("Training size:{}, calib size: {}, test size: {}\n".format(np.sum(mask_obs)-n_calib_queries*k, n_calib_queries, n_test_queries))
        sys.stdout.flush()

    #------Split train calib--------#
    #-------------------------------#
    mask_train, idxs_calib, _ = sampler.sample_train_calib(mask_obs, k,
                                calib_size=n_calib_queries, random_state=random_state)

    #--------Model Training---------#
    #-------------------------------#
    print("Running matrix completion algorithm on the splitted training set...")
    sys.stdout.flush()
    if solver == "pmf":
        Mhat, _, _ = pmf_solve(M_true, mask_train, k=r, verbose=verbose, random_state=random_state)
    else:  # solver == "svt", guaranteed by the check at the top
        Mhat = svt_solve(M_true, mask_train, verbose=verbose, random_state=random_state)
    print("Done training!\n")
    sys.stdout.flush()

    #------Compute intervals--------#
    #-------------------------------#

    # Oracle observation sampling weights.
    ci_method, res_oracle = _simul_ci_scores(M_true, Mhat, mask_obs, idxs_calib, idxs_test,
                                             k, alpha, w_obs, "conformal")

    # Estimated observation sampling weights.
    ci_est, res_est = _simul_ci_scores(M_true, Mhat, mask_obs, idxs_calib, idxs_test,
                                       k, alpha, w_obs_est, "est")

    # One concat of the two result frames instead of growing an empty frame.
    res = pd.concat([res_oracle, res_est])

    # Estimation gap: half the mean absolute difference between the oracle and
    # estimated weight vectors, averaged over the stored weight vectors.
    est_gaps = [0.5 * np.mean(np.abs(w_true - w_hat))
                for w_true, w_hat in zip(ci_method.weights_list, ci_est.weights_list)]
    avg_gap = np.mean(est_gaps)

    res['k'] = k
    res['avg_gap'] = avg_gap
    res['Calib_queries'] = n_calib_queries
    res['Train_entries'] = np.sum(mask_train)
    res['Test_queries'] = n_test_queries
    res['random_state'] = random_state
    return res

In [12]:
#####################
#  Run Experiments  #
#####################
# Per-run result frames are collected in a list and concatenated once at the
# end: this avoids re-copying the accumulated frame on every iteration and
# avoids concatenating onto an empty DataFrame. If the (long) loop is
# interrupted, the frames finished so far remain available in `result_frames`.
results = pd.DataFrame({})
result_frames = []

for i in tqdm(range(1, repetition+1), desc="Repetitions", leave=True, position=0):
    # Distinct random_state per (seed, repetition) pair.
    random_state = repetition * (seed-1) + i

    for k in tqdm(k_list, desc="k", leave=True, position=0):

        res = run_single_experiment(M, k, alpha, prop_obs, max_test_queries, max_calib_queries,
                            r, scale=scale, random_state=random_state)

        result_frames.append(res)

# pd.concat raises on an empty list, so keep the empty frame when nothing ran.
if result_frames:
    results = pd.concat(result_frames)

k:   0%|                                                                                         | 0/3 [00:00<?, ?it/s]

Estimating missingness with guessed rank 5...
iter: 1
iter: 2
iter: 3
iter: 4
iter: 5
iter: 6
iter: 7
iter: 8
iter: 9
iter: 10
iter: 11
iter: 12
iter: 13
iter: 14
Function value changing by less than progTol
Done estimating!

Training size:44000, calib size: 2000, test size: 100

Running matrix completion algorithm on the splitted training set...
Iteration: 1; Mean diff: 0.0080
Iteration: 2; Mean diff: 0.0058
Iteration: 3; Mean diff: 0.0035
Iteration: 4; Mean diff: 0.0010
Iteration: 5; Mean diff: 0.0006
Iteration: 6; Mean diff: 0.0005
Iteration: 7; Mean diff: 0.0004
Iteration: 8; Mean diff: 0.0004
Iteration: 9; Mean diff: 0.0003
Iteration: 10; Mean diff: 0.0003
Iteration: 11; Mean diff: 0.0003
Iteration: 12; Mean diff: 0.0003
Iteration: 13; Mean diff: 0.0002
Iteration: 14; Mean diff: 0.0002
Iteration: 15; Mean diff: 0.0002
Iteration: 16; Mean diff: 0.0002
Iteration: 17; Mean diff: 0.0003
Iteration: 18; Mean diff: 0.0003
Iteration: 19; Mean diff: 0.0003
Iteration: 20; Mean diff: 0.0003


CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 14.13it/s]

Done!
Computing conformal prediction intervals for 100 test queries...



CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:06<00:00, 14.44it/s]

Done!



k:  33%|██████████████████████████▋                                                     | 1/3 [03:11<06:22, 191.19s/it]

Estimating missingness with guessed rank 5...
iter: 1
iter: 2
iter: 3
iter: 4
iter: 5
iter: 6
iter: 7
iter: 8
iter: 9
iter: 10
iter: 11
iter: 12
iter: 13
iter: 14
Function value changing by less than progTol
Done estimating!

Training size:38000, calib size: 2000, test size: 100

Running matrix completion algorithm on the splitted training set...
Iteration: 1; Mean diff: 0.0075
Iteration: 2; Mean diff: 0.0056
Iteration: 3; Mean diff: 0.0033
Iteration: 4; Mean diff: 0.0011
Iteration: 5; Mean diff: 0.0006
Iteration: 6; Mean diff: 0.0005
Iteration: 7; Mean diff: 0.0004
Iteration: 8; Mean diff: 0.0004
Iteration: 9; Mean diff: 0.0003
Iteration: 10; Mean diff: 0.0002
Iteration: 11; Mean diff: 0.0002
Iteration: 12; Mean diff: 0.0002
Iteration: 13; Mean diff: 0.0001
Iteration: 14; Mean diff: 0.0001
Iteration: 15; Mean diff: 0.0001
Iteration: 16; Mean diff: 0.0001
Iteration: 17; Mean diff: 0.0001
Iteration: 18; Mean diff: 0.0001
Iteration: 19; Mean diff: 0.0001
Iteration: 20; Mean diff: 0.0001


CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:09<00:00, 10.37it/s]

Done!





Computing conformal prediction intervals for 100 test queries...


CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:09<00:00, 10.48it/s]

Done!



k:  67%|█████████████████████████████████████████████████████▎                          | 2/3 [07:05<03:36, 216.39s/it]

Estimating missingness with guessed rank 5...
iter: 1
iter: 2
iter: 3
iter: 4
iter: 5
iter: 6
iter: 7
iter: 8
iter: 9
iter: 10
iter: 11
iter: 12
iter: 13
iter: 14
Function value changing by less than progTol
Done estimating!

Training size:32000, calib size: 2000, test size: 100

Running matrix completion algorithm on the splitted training set...
Iteration: 1; Mean diff: 0.0074
Iteration: 2; Mean diff: 0.0055
Iteration: 3; Mean diff: 0.0039
Iteration: 4; Mean diff: 0.0014
Iteration: 5; Mean diff: 0.0009
Iteration: 6; Mean diff: 0.0005
Iteration: 7; Mean diff: 0.0004
Iteration: 8; Mean diff: 0.0003
Iteration: 9; Mean diff: 0.0003
Iteration: 10; Mean diff: 0.0002
Iteration: 11; Mean diff: 0.0002
Iteration: 12; Mean diff: 0.0002
Iteration: 13; Mean diff: 0.0002
Iteration: 14; Mean diff: 0.0001
Iteration: 15; Mean diff: 0.0001
Iteration: 16; Mean diff: 0.0001
Iteration: 17; Mean diff: 0.0001
Iteration: 18; Mean diff: 0.0001
Iteration: 19; Mean diff: 0.0001
Iteration: 20; Mean diff: 0.0001


CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.38it/s]

Done!
Computing conformal prediction intervals for 100 test queries...



CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:11<00:00,  8.53it/s]

Done!



k: 100%|████████████████████████████████████████████████████████████████████████████████| 3/3 [10:15<00:00, 205.24s/it]
Repetitions: 100%|██████████████████████████████████████████████████████████████████████| 1/1 [10:15<00:00, 615.74s/it]


In [13]:
# Show the metrics gathered by the experiment loop (rows from every run_single_experiment call).
results

Unnamed: 0,Query_coverage,Coverage,Size,Inf_prop,Method,k,avg_gap,Calib_queries,Train_entries,Test_queries,random_state
0,0.88,0.94,7.197203,0.01,conformal,2,0.00013,2000,44000,100,1
0,0.88,0.94,6.980278,0.0,est,2,0.00013,2000,44000,100,1
0,0.86,0.968,8.753994,0.02,conformal,5,0.000225,2000,38000,100,1
0,0.85,0.964,8.398095,0.0,est,5,0.000225,2000,38000,100,1
0,0.84,0.9775,10.555317,0.06,conformal,8,0.000293,2000,32000,100,1
0,0.78,0.9675,9.474108,0.0,est,8,0.000293,2000,32000,100,1
