In [3]:
import math
import random
from tqdm import tqdm

In [4]:
import numpy as np
data = np.loadtxt("np.csv", delimiter=",")
print(data.shape)

(50000, 1000)


In [3]:
data_list = range(data.shape[0])

In [2]:
def stochastic_greedy(D, k, objFL, ϵ=1e-2, n=10):
    """Samples n subsets using stochastic-greedy.
        Args:
        D: number of training example
        k: The subset size.
        objFL: submodular function for conditional gain calculation
        ϵ: The error tolerance.
        n: The number of subsets to sample.

        Returns:
        A list of n subsets.
    """
    
    # Initialize empty subsets
    S = [set() for _ in range(n)]
    # Set random subset size for the stochastic-greedy algorithm
    s = int(D * math.log(1 / ϵ) / k)
    for i in range(n):
        for j in range(k):
            # Sample a random subset by sampling s elements from D \ Si
            R = random.choices(list(set(range(D)) - S[i]), k=s)
            # Use map to calculate marginal gains for all values in R with the custom set
            marginal_gains = list(map(lambda r: objFL.marginalGain(S[i], r), R))
            max_index = np.argmax(marginal_gains)
            max_r = R[max_index]
            S[i].add(max_r)
    return S


In [13]:
from submodlib import FacilityLocationFunction

In [6]:
objFL = FacilityLocationFunction(n=data.shape[0], data=data, separate_rep=False, mode="sparse", metric="euclidean")


KeyboardInterrupt



In [21]:
data_small = data[:100]
D = data_small.shape[0]
k = 50
n = 10
e = 1e-2

In [26]:
print(data_small.shape)

(100, 1000)


In [28]:
objFL = FacilityLocationFunction(n=D, data=data_small, separate_rep=False, mode="dense", metric="euclidean")

In [15]:
S = [set() for _ in range(n)]
# Set random subset size for the stochastic-greedy algorithm
s = int(D * math.log(1 / e) / k)
print(s)
for i in range(n):
    for j in tqdm(range(k)):
        # Sample a random subset by sampling s elements from D \ Si
        R = random.choices(list(set(range(D)) - S[i]), k=s)
        # Use map to calculate marginal gains for all values in R with the custom set
        # marginal_gains = list(map(lambda r: objFL.marginalGain(S[i], r), R))
        # max_index = np.argmax(marginal_gains)
        # max_r = S[max_index]
        # S[i].add(max_r)

18


100%|██████████████████████████████████████| 500/500 [00:00<00:00, 14264.21it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 18821.03it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 23038.28it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 26307.46it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 26365.67it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 26699.33it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 26208.18it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 26541.86it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 26625.43it/s]
100%|██████████████████████████████████████| 500/500 [00:00<00:00, 26569.10it/s]


In [10]:
print(data[0])

[-5.87178900e-01 -4.67061880e-01 -5.81903600e-01 -1.54780530e+00
 -5.95911740e-01 -1.35292500e-01 -4.40153270e-01  4.25905470e-01
  3.46328080e-01 -8.88410700e-01 -1.04910860e+00 -8.02275100e-01
 -2.71545650e-01 -9.71809570e-01 -1.13495610e+00 -7.24881100e-01
 -8.84935300e-01 -3.31837420e-01 -6.01179200e-01 -5.31691800e-01
 -1.55888340e+00 -7.49258400e-01 -1.48275960e+00  2.30334670e-01
 -1.08447090e+00 -1.27548650e+00 -9.36993200e-01 -1.25158990e+00
 -9.88694800e-01 -4.65047900e-01 -9.08616700e-01 -8.82250400e-01
 -5.60555200e-01 -5.09361270e-01 -4.88457800e-01 -4.87652660e-01
  5.90549950e-01 -7.51944500e-01 -5.32677300e-01 -1.66356680e-01
 -8.33782800e-01 -1.03180000e+00 -1.18988800e+00 -5.00095600e-01
 -8.25575350e-01 -5.81822300e-01 -9.41436900e-01 -6.55483540e-01
 -1.48487470e+00 -1.29148950e+00 -6.24501500e-01  3.63795640e-01
 -3.47354050e-01 -7.20277850e-01 -3.76805900e-01 -1.35293290e+00
 -4.28732630e-01 -1.51605620e+00 -6.09010340e-01 -5.99694200e-01
  6.36573700e-01  9.43388

In [7]:
temp = np.array([(1,2,3), (2,4,5), (5,6,7)])
print(type(temp), type(temp[0]))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [None]:
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def euclidean_distance(data, i, j):
    return np.linalg.norm(data[i] - data[j])

def euclidean_kernel_threaded(data, n_threads=8):
    n_samples = data.shape[0]
    kernel = np.zeros((n_samples, n_samples))

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        tasks = []
        for i in range(n_samples):
            for j in range(i + 1, n_samples):
                tasks.append(executor.submit(euclidean_distance, data, i, j))

        for i, j, future in zip(*[iter(tasks)] * 3):
            distance = future.result()
            kernel[i, j] = 1 / (distance**2 + 1e-8)
            kernel[j, i] = kernel[i, j]

    return kernel

In [None]:
# Example usage:
kernel = euclidean_kernel_threaded(data)
print(kernel.shape)