In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload imported modules before each cell
# runs, so edits to the project sources are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

import sys, os
# Extend the import search path with two sibling directories (relative to the
# notebook's working directory). Presumably the project modules imported in
# the next cell live there -- TODO confirm.
sys.path.append('../smc')
sys.path.append('../third_party')

In [45]:
import numpy as np   
import pandas as pd
import pdb
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
from tqdm import tqdm
import sys
import time

from utils import *     # contains some useful helper functions 
from utils_data import *
from models import *    # toy models
from solvers import *   # matrix completion solvers
from methods import *
from wsc import *
from missingness_estimation import *

In [28]:
# Load the "books" dataset from ../data. The call returns the data matrix M and
# two masks (names suggest available vs. missing entries -- TODO confirm).
# NOTE(review): the meaning of the positional arguments -1 and 2500 is defined
# inside load_data and is not visible here; 2500 presumably caps one matrix
# dimension -- confirm against load_data.
M, mask_avail, mask_miss = load_data(
    "../data",
    "books",
    -1,
    2500,
)

In [40]:
# Dimensions of the loaded matrix
n1, n2 = M.shape

# k: number of entries per query (query counts below are computed as counts // k)
# r: rank-like size handed to pmf_solve as its `k` argument
k, r = 4, 5

# Fraction passed to sample_submask as sub_size when splitting the available entries
prop_train = 0.90

# Level and infinite-interval flag forwarded to get_CI
alpha = 0.1
allow_inf = False

# Seed reused by every sampling / fitting call in this notebook
random_state = 0

In [41]:
# Number of matrix entries relative to a 1200 x 800 matrix.
# NOTE(review): 1200 and 800 are presumably a reference matrix size from
# another experiment -- confirm and name them if so.
(n1 * n2) / 1200 / 800

1.7057291666666665

In [42]:
# Cap on the number of test queries that get evaluated
max_test_queries = 100
sampler = QuerySampling(n1, n2)

# Randomly split the available entries into an observed part and a held-out
# test part; prop_train is forwarded as the sub-mask size
mask_obs, mask_test = sampler.sample_submask(
    mask=mask_avail,
    sub_size=prop_train,
    random_state=random_state,
)

# A row with c held-out entries can supply c // k queries of k entries each;
# take as many as available, up to the cap
test_queries_available = np.sum(np.sum(mask_test, axis=1) // k)
n_test_queries = min(test_queries_available, max_test_queries)
_, idxs_test, _ = sampler.sample_train_calib(
    mask_test,
    k,
    calib_size=n_test_queries,
    random_state=random_state,
)

In [43]:
# Cap on the number of calibration queries
max_calib_queries = 800

# Use at most half of the k-entry queries the observed mask can supply
calib_queries_available = np.sum(np.sum(mask_obs, axis=1) // k)
n_calib_queries = min(int(0.5 * calib_queries_available), max_calib_queries)

# Carve the calibration queries out of the observed entries; the call also
# returns the mask used for training
mask_train, idxs_calib, mask_calib = sampler.sample_train_calib(
    mask_obs,
    k,
    calib_size=n_calib_queries,
    random_state=random_state,
)

# Report the sizes of each split (query counts are entry counts // k)
summary_template = "Observation size: {}, training size: {},  calib queries: {}, test queries: {}"
print(summary_template.format(
    np.sum(mask_obs),
    np.sum(mask_train),
    int(np.sum(mask_calib)//k),
    len(idxs_test[0])//k,
))

Observation size: 26181, training size: 22981,  calib queries: 800, test queries: 100


In [44]:
# Fit the matrix-completion model using only the entries selected by mask_train;
# only the first returned value (the completed matrix) is kept
Mhat, _, _ = pmf_solve(
    M,
    mask_train,
    k=r,
    max_iteration=10,
    random_state=random_state,
)

Iteration: 1; Mean diff: 0.0089
Iteration: 2; Mean diff: 0.0062
Iteration: 3; Mean diff: 0.0053
Iteration: 4; Mean diff: 0.0032
Iteration: 5; Mean diff: 0.0015
Iteration: 6; Mean diff: 0.0007
Iteration: 7; Mean diff: 0.0005
Iteration: 8; Mean diff: 0.0004
Iteration: 9; Mean diff: 0.0003
Iteration: 10; Mean diff: 0.0003


In [46]:
# Estimate w_obs from the training mask; it is passed as the w_obs argument of
# both interval methods below.
# NOTE(review): r=5 is hard-coded and happens to equal the config value `r`;
# confirm whether it is meant to track `r` or is an independent setting.
w_obs = estimate_P(
    mask_train,
    prop_train,
    r=5,
)

iter: 1
iter: 2
iter: 3
iter: 4
iter: 5
iter: 6
iter: 7
iter: 8
iter: 9
iter: 10
iter: 11
Function value changing by less than progTol


In [47]:
# Simultaneous conformal intervals for the k-entry test queries
ci_method = SimulCI(M, Mhat, mask_obs, idxs_calib, k, w_obs=w_obs)
df = ci_method.get_CI(idxs_test, alpha, allow_inf=allow_inf)

# Row 0 carries the bounds for the single level `alpha` requested above
smc_row = df.loc[0]
lower, upper, is_inf = smc_row.lower, smc_row.upper, smc_row.is_inf

# Truncate the bounds in place to [0, 5] (presumably the rating scale -- confirm)
lower[lower <= 0] = 0
upper[upper >= 5] = 5

# Start the results table with the SMC row
res = evaluate_SCI(lower, upper, k, M, idxs_test, is_inf=is_inf, method="SMC")

Computing conformal prediction intervals for 100 test queries...


CI: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 26.82it/s]

Done!





In [48]:
# Re-draw the calibration set with query size 1 (single entries), using
# n_calib_queries * k entries in total.
# Note: this rebinds mask_train, idxs_calib and mask_calib, replacing the
# k-entry query versions created earlier in the notebook.
mask_train, idxs_calib, mask_calib = sampler.sample_train_calib(
    mask_obs,
    1,
    calib_size=n_calib_queries * k,
    random_state=random_state,
)

# Report the sizes of the new split
entrywise_template = "Observation size: {}, training size: {}, calib size: {}, test queries: {}"
print(entrywise_template.format(
    np.sum(mask_obs),
    np.sum(mask_train),
    np.sum(mask_calib),
    len(idxs_test[0])//k,
))

Observation size: 26181, training size: 22981, calib size: 3200, test queries: 100


In [49]:
# Retrain the matrix-completion model on the re-drawn training mask and report
# how long the fit takes (seconds).
# perf_counter is a monotonic, high-resolution clock intended for measuring
# durations; time.time() is wall-clock time and can jump if the system clock
# is adjusted during a long fit.
start = time.perf_counter()
Mhat, _, _ = pmf_solve(M, mask_train, k=r, max_iteration=10, random_state=random_state)
print(time.perf_counter() - start)

Iteration: 1; Mean diff: 0.0077
Iteration: 2; Mean diff: 0.0053
Iteration: 3; Mean diff: 0.0055
Iteration: 4; Mean diff: 0.0037
Iteration: 5; Mean diff: 0.0017
Iteration: 6; Mean diff: 0.0008
Iteration: 7; Mean diff: 0.0005
Iteration: 8; Mean diff: 0.0004
Iteration: 9; Mean diff: 0.0003
Iteration: 10; Mean diff: 0.0003
706.5517158508301


In [50]:
# Levels requested from the benchmark, one per method label below.
# NOTE(review): alpha * k is labelled "Uncorrected", which suggests
# Bonf_benchmark divides each level by k internally -- confirm.
a_list = [alpha, alpha * k]
method_names = ["Bonferroni", "Uncorrected"]

ci_method = Bonf_benchmark(M, Mhat, mask_obs, idxs_calib, k, w_obs=w_obs)
df = ci_method.get_CI(idxs_test, a_list, allow_inf=allow_inf)

# Evaluate each level; row i of df corresponds to a_list[i]
benchmark_results = []
for i, m in enumerate(method_names):
    lower, upper, is_inf= df.loc[i].lower, df.loc[i].upper, df.loc[i].is_inf
    # Truncate the bounds in place to [0, 5] (presumably the rating scale -- confirm)
    lower[lower <= 0] = 0
    upper[upper >= 5] = 5
    benchmark_results.append(
        evaluate_SCI(lower, upper, k, M, idxs_test, is_inf=is_inf, method=m)
    )

# Append the benchmark rows to the existing results in a single concat.
# Rows for these methods left by a previous run of this cell are dropped first,
# so re-running the cell does not duplicate them.
res = pd.concat([res[~res["Method"].isin(method_names)], *benchmark_results])

Computing Bonferroni-style intervals for 100 test queries...


CI: 100%|████████████████████████████████████████████████████████████████████████████| 400/400 [00:37<00:00, 10.70it/s]

Done!





In [51]:
# Display the combined evaluation table (one row per method).
res

Unnamed: 0,Query_coverage,Coverage,Size,Inf_prop,Method
0,0.83,0.9425,4.008431,0.0,SMC
0,0.97,0.99,4.891472,0.0,Bonferroni
0,0.81,0.9375,3.684268,0.0,Uncorrected
