In [None]:
import numpy as np

In [None]:
# Seed the global NumPy RNG so that "Restart Kernel -> Run All" reproduces the
# same data (and therefore comparable error figures) on every run.
# The legacy np.random.seed API is used on purpose: this notebook asserts
# NumPy 1.16.5 in a later cell, and np.random.default_rng needs NumPy >= 1.17.
SEED = 0
np.random.seed(SEED)

N = 10**3              # number of rows (data points)
d = 4                  # number of columns (features)
inner_rank = N // 2    # inner dimension of the two Gaussian factors

# X is the product of an (N x inner_rank) and an (inner_rank x d) standard
# normal matrix, giving an (N x d) data matrix.
X = np.dot(np.random.randn(N, inner_rank), np.random.randn(inner_rank, d))
print("Mean squared element: %0.4f" % (X ** 2).mean())

In [None]:
X_incomplete = X.copy()
# missing entries indicated with NaN:
# draw one column index per row and blank out exactly that entry, using a
# single vectorised assignment instead of a Python-level loop over the rows.
missing_cols = np.random.randint(d, size=N)
X_incomplete[np.arange(N), missing_cols] = np.nan

## matrix completion using sklearn

In [None]:
from sklearn.impute import SimpleImputer, KNNImputer

# Alternative imputer, kept for reference (swap it in to compare):
# imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Active imputer: k-nearest-neighbours with 2 neighbours and uniform weights.
imp = KNNImputer(
    n_neighbors=2,
    weights="uniform",
)

In [None]:
# Mean squared difference between the original X and the imputed matrix
# (displayed as the cell output).
np.mean((X - imp.fit_transform(X_incomplete)) ** 2)

In [None]:
# Idea: construct lines from the incomplete rows, find k centers, find the
# point on each line that is closest to those k centers, and then measure the
# difference from the original X.

## setup coreset streamer

In [None]:
import sys
sys.path.insert(1, "./KLines")

import numpy as np
import copy
import math
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from klines import SetOfLines, SetOfPoints, CorsetForKMeansForLines, CoresetStreamer

# Later NumPy revisions have slower array lookups, so the version is pinned.
assert np.version.full_version == '1.16.5', (
    "this notebook expects NumPy 1.16.5, found %s" % np.version.full_version
)


# displacements: the observed data with every NaN replaced by 0.
displacements = np.nan_to_num(X_incomplete)

# spans: 1.0 exactly where an entry is missing and 0.0 everywhere else.
# The mask is taken from the NaN positions directly; deriving it from the
# zero-filled values (== 0 / != 1) would also flag observed entries that
# happen to equal exactly 0.0 or 1.0 as missing.
spans = np.isnan(X_incomplete).astype(displacements.dtype)

# One line per row of X_incomplete.
# NOTE(review): the two np.ones(N) vectors are presumably per-line weights and
# sensitivities -- confirm against the SetOfLines constructor in klines.
L = SetOfLines(spans, displacements, np.ones(N), np.ones(N))

class ParameterConfig:
    """Empty attribute container.

    Instances start with no attributes; the coreset parameters are attached
    to an instance by plain attribute assignment after it is created.
    """

    def __init__(self):
        """Create a config object with no attributes set."""


config = ParameterConfig()

In [None]:
## data
k = d // 2           # `k` handed to the coreset routines in the cells below
m = int(N * 0.01)    # coreset size ~ reduction ratio
tau = 1e-3

# algo 2 (BI-CRITERIA), line 2: constant 100
config.a_b_approx_minimum_number_of_lines = 100

# algo 2, line 3: |S| >= m
# note: computing algo 1 then costs O(|S|^2)
config.sample_size_for_a_b_approx = int(m * 1.01)

# algo 2 (BI-CRITERIA), line 6: the complement of 7/11
config.farthest_to_centers_rate_in_a_b_approx = 4 / 11

# this is `b` in algo 2 of the other paper; set arbitrarily here -
# how should it be calculated?
config.number_of_remains_multiply_factor = int(math.log(N)) // k

# refer to line 4, algo 1, other paper
config.closest_to_median_rate = (1 - tau) / (2 * k)

# size of q_i: line 3, algo 2, other paper
config.median_sample_size = int(N * 0.05)

# for outliers in coresets
config.max_sensitivity_multiply_factor = 100

config.number_of_remains = 50

# coreset size grows linearly with SAMPLE_SIZE
SAMPLE_SIZE = 100

In [None]:
ITER = 4     # number of independent repetitions
vals = []    # one mean squared error per repetition

for _ in range(ITER):
    # build a fresh streamer for every repetition and stream all lines through it
    streamer = CoresetStreamer(SAMPLE_SIZE, k, config)
    coreset = streamer.stream(L)
    L1 = coreset[0]

    # only the middle element of the returned triple (B) is used
    _, B, _ = CorsetForKMeansForLines(config).coreset(
        L1,
        k,
        int(L1.get_size()*0.25),
        True,
    )

    # mean squared difference between X and the centers projected via L
    residual = X - L.get_projected_centers(B)
    vals.append((residual ** 2).mean())

print(f"mean: {np.mean(vals)} var: {np.var(vals)}")