In [2]:
import math
import random
from tqdm import tqdm
import math
import numpy as np
from submodlib import FacilityLocationFunction


In [2]:
import numpy as np
# Read the comma-separated file "np.csv" into a NumPy array.
data = np.loadtxt("np.csv", delimiter=",")
# Show the array dimensions as a quick check on what was loaded.
print(data.shape)

(50000, 1000)


In [34]:
# One index per row of `data`.
# NOTE(review): despite the name this is a lazy `range`, not a list, and no
# cell visible in this notebook reads it — confirm it is still needed.
data_list = range(data.shape[0])

In [35]:
def stochastic_greedy(D, k, objFL, ϵ=1e-2, n=10, test=False):
    """Sample ``n`` subsets of size ``k`` with the stochastic-greedy heuristic.

    Each subset is grown one element at a time: a random candidate pool is
    drawn from the elements not yet selected, ``objFL.marginalGain`` is
    evaluated for every candidate, and the candidate with the largest gain is
    added to the subset.

    Args:
        D: Size of the ground set; elements are the integers ``0 .. D-1``.
        k: Number of elements in each returned subset.
        objFL: Object exposing ``marginalGain(current_set, element)``.
        ϵ: Error tolerance. The candidate pool holds
            ``int(D * log(1 / ϵ) / k)`` elements, but never fewer than one
            and never more than the number of unselected elements.
        n: Number of subsets to sample.
        test: Debug switch. When true, the first pick (pool, gains, winner)
            is printed and the function returns right after it.

    Returns:
        A list of ``n`` sets of element indices.

    Raises:
        ValueError: If ``k`` is larger than ``D``.
    """
    S = [set() for _ in range(n)]
    if k <= 0:
        # Nothing to select; also avoids dividing by zero when sizing the pool.
        return S
    if k > D:
        raise ValueError(f"subset size k={k} exceeds ground-set size D={D}")

    # Candidate-pool size. Clamp to at least one so a loose tolerance
    # (ϵ >= 1) cannot produce an empty pool, which np.argmax rejects.
    s = max(1, int(D * math.log(1 / ϵ) / k))
    for i in tqdm(range(n)):
        # Elements still available for subset i; kept incrementally instead of
        # rebuilding set(range(D)) - S[i] on every step.
        remaining = list(range(D))
        for _ in range(k):
            # Sample WITHOUT replacement from D \ S[i]: duplicates would waste
            # gain evaluations and shrink the effective pool.
            R = random.sample(remaining, min(s, len(remaining)))
            marginal_gains = [objFL.marginalGain(S[i], r) for r in R]
            max_index = int(np.argmax(marginal_gains))
            max_r = R[max_index]
            S[i].add(max_r)
            remaining.remove(max_r)
            if test:
                print(R)
                print(marginal_gains)
                print(max_index, max_r)
                return S
    return S

In [3]:
def SGE_optimised(D, k, objFL, ϵ=1e-2, n=10, test=False):
    """Sample ``n`` subsets of size ``k`` with memoized stochastic greedy.

    Same selection rule as plain stochastic greedy, but gains are evaluated
    through ``objFL.marginalGainWithMemoization`` and the object's memoized
    state is advanced with ``objFL.updateMemoization`` after every pick.

    Args:
        D: Size of the ground set; elements are the integers ``0 .. D-1``.
        k: Number of elements in each returned subset.
        objFL: Object exposing ``marginalGainWithMemoization(set, element)``,
            ``updateMemoization(set, element)`` and ``clearMemoization()``.
        ϵ: Error tolerance. The candidate pool holds
            ``int(D * log(1 / ϵ) / k)`` elements, but never fewer than one
            and never more than the number of unselected elements.
        n: Number of subsets to sample.
        test: Debug switch. When true, the first pick (pool, gains, winner)
            is printed and the function returns right after it.

    Returns:
        A list of ``n`` sets of element indices. ``objFL``'s memoization is
        cleared when the function returns, including on the ``test`` path.

    Raises:
        ValueError: If ``k`` is larger than ``D``.
    """
    S = [set() for _ in range(n)]
    if k <= 0:
        # Nothing to select; also avoids dividing by zero when sizing the pool.
        return S
    if k > D:
        raise ValueError(f"subset size k={k} exceeds ground-set size D={D}")

    # Candidate-pool size. Clamp to at least one so a loose tolerance
    # (ϵ >= 1) cannot produce an empty pool, which np.argmax rejects.
    s = max(1, int(D * math.log(1 / ϵ) / k))
    for i in tqdm(range(n)):
        # Every subset starts empty, so the memo must start empty too; state
        # left behind by an earlier user of objFL would skew the gains.
        objFL.clearMemoization()
        remaining = list(range(D))
        try:
            for _ in range(k):
                # Sample WITHOUT replacement from D \ S[i].
                R = random.sample(remaining, min(s, len(remaining)))
                marginal_gains = [
                    objFL.marginalGainWithMemoization(S[i], r) for r in R
                ]
                max_index = int(np.argmax(marginal_gains))
                max_r = R[max_index]
                # Advance the memo before growing the set, as the memo update
                # is given the set *without* the new element.
                objFL.updateMemoization(S[i], max_r)
                S[i].add(max_r)
                remaining.remove(max_r)
                if test:
                    print(R)
                    print(marginal_gains)
                    print(max_index, max_r)
                    return S
        finally:
            # Runs on normal completion, on the test early-return and on
            # errors, so objFL is never handed back with a dirty memo.
            objFL.clearMemoization()
    return S


In [23]:
def WRE(D, k, obj, n=10):
    """Sample ``n`` subsets of size ``k`` by weighted random exploration.

    A full greedy pass over the ground set records, for every element, the
    marginal gain it had at the moment it was picked. Those gains ``G`` are
    turned into sampling weights ``1 + G + 0.5 * G**2`` (normalised to sum to
    one), and each subset is drawn without replacement under those weights.

    Args:
        D: Size of the ground set; elements are the integers ``0 .. D-1``.
        k: Number of elements in each returned subset.
        obj: Object exposing ``marginalGain(current_set, element)``.
        n: Number of subsets to sample.

    Returns:
        A list of ``n`` sets of Python ``int`` element indices.
    """
    A = set()
    G = np.zeros(D)
    # Unselected elements, maintained incrementally rather than rebuilt from
    # set(range(D)) - A on every step.
    remaining = list(range(D))
    for _ in range(D):
        marginal_gains = [obj.marginalGain(A, r) for r in remaining]
        max_index = int(np.argmax(marginal_gains))
        max_r = remaining.pop(max_index)
        G[max_r] = marginal_gains[max_index]
        A.add(max_r)

    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)

    # Convert each draw to a set of plain ints so the result matches the
    # documented return type and supports set operations downstream.
    return [
        set(np.random.choice(len(ts), size=k, p=ts, replace=False).tolist())
        for _ in range(n)
    ]

In [4]:
def WRE_optimised(D, k, obj, n=10):
    """Sample ``n`` subsets by weighted random exploration, using memoization.

    A full greedy pass over the ground set records, for every element, the
    marginal gain it had at the moment it was picked; gains are evaluated
    with ``obj.marginalGainWithMemoization`` and the memo is advanced with
    ``obj.updateMemoization`` after every pick. The gains ``G`` are turned
    into sampling weights ``1 + G + 0.5 * G**2`` (normalised to sum to one),
    and each subset is drawn without replacement under those weights.

    Args:
        D: Size of the ground set; elements are the integers ``0 .. D-1``.
        k: Number of elements in each returned subset.
        obj: Object exposing ``marginalGainWithMemoization(set, element)``,
            ``updateMemoization(set, element)`` and ``clearMemoization()``.
        n: Number of subsets to sample.

    Returns:
        A list of ``n`` sets of Python ``int`` element indices. ``obj``'s
        memoization is cleared before the greedy pass and again after it.
    """
    A = set()
    G = np.zeros(D)
    # Unselected elements, maintained incrementally rather than rebuilt from
    # set(range(D)) - A on every step.
    remaining = list(range(D))

    # The greedy pass starts from the empty set, so the memo must too.
    obj.clearMemoization()
    try:
        for _ in tqdm(range(D)):
            marginal_gains = [
                obj.marginalGainWithMemoization(A, r) for r in remaining
            ]
            max_index = int(np.argmax(marginal_gains))
            max_r = remaining.pop(max_index)
            G[max_r] = marginal_gains[max_index]
            # Advance the memo before growing the set, as the memo update is
            # given the set *without* the new element.
            obj.updateMemoization(A, max_r)
            A.add(max_r)
    finally:
        # Do not hand obj back holding the memo of the full ground set.
        obj.clearMemoization()

    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)

    S = []
    for _ in tqdm(range(n)):
        draw = np.random.choice(len(ts), size=k, p=ts, replace=False)
        # Plain-int sets match the documented return type and support set
        # operations downstream.
        S.append(set(draw.tolist()))
    return S

In [None]:
def WRE_max(D, k, obj, n=10):
    """Sample ``n`` subsets by weighted random exploration, using memoization.

    A full greedy pass over the ground set records, for every element, the
    marginal gain it had at the moment it was picked; gains are evaluated
    with ``obj.marginalGainWithMemoization`` and the memo is advanced with
    ``obj.updateMemoization`` after every pick. The gains ``G`` are turned
    into sampling weights ``1 + G + 0.5 * G**2`` (normalised to sum to one),
    and each subset is drawn without replacement under those weights.

    NOTE(review): the name suggests a distinct variant, but no behaviour
    specific to "max" is implemented here — confirm the intended difference.

    Args:
        D: Size of the ground set; elements are the integers ``0 .. D-1``.
        k: Number of elements in each returned subset.
        obj: Object exposing ``marginalGainWithMemoization(set, element)``,
            ``updateMemoization(set, element)`` and ``clearMemoization()``.
        n: Number of subsets to sample.

    Returns:
        A list of ``n`` sets of Python ``int`` element indices. ``obj``'s
        memoization is cleared before the greedy pass and again after it.
    """
    A = set()
    G = np.zeros(D)
    # Unselected elements, maintained incrementally rather than rebuilt from
    # set(range(D)) - A on every step.
    remaining = list(range(D))

    # The greedy pass starts from the empty set, so the memo must too.
    obj.clearMemoization()
    try:
        for _ in tqdm(range(D)):
            marginal_gains = [
                obj.marginalGainWithMemoization(A, r) for r in remaining
            ]
            max_index = int(np.argmax(marginal_gains))
            max_r = remaining.pop(max_index)
            G[max_r] = marginal_gains[max_index]
            # Advance the memo before growing the set, as the memo update is
            # given the set *without* the new element.
            obj.updateMemoization(A, max_r)
            A.add(max_r)
    finally:
        # Do not hand obj back holding the memo of the full ground set.
        obj.clearMemoization()

    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)

    S = []
    for _ in tqdm(range(n)):
        draw = np.random.choice(len(ts), size=k, p=ts, replace=False)
        # Plain-int sets match the documented return type and support set
        # operations downstream.
        S.append(set(draw.tolist()))
    return S

In [None]:
# Build a facility-location objective over every row of `data`
# (dense mode, cosine metric, no separate representation set).
# NOTE(review): dense mode presumably materialises an n x n similarity
# kernel — confirm memory is sufficient for the full dataset.
objFL = FacilityLocationFunction(n=data.shape[0], data=data, separate_rep=False, mode="dense", metric="cosine")

In [None]:
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def euclidean_distance(data, i, j):
    """Return the norm of the difference between entries ``i`` and ``j`` of ``data``.

    The norm is ``np.linalg.norm`` with its default settings.
    """
    difference = data[i] - data[j]
    return np.linalg.norm(difference)

def euclidean_kernel_threaded(data, n_threads=8):
    """Build a symmetric inverse-squared-distance kernel between samples.

    For every pair of distinct samples ``i != j`` the entry is
    ``1 / (d(i, j)**2 + 1e-8)``, where ``d`` is the Euclidean distance
    between the two (flattened) samples. Diagonal entries are left at zero.

    Work is split into one task per row: each task computes the distances
    from sample ``i`` to all later samples in a single vectorised step and
    writes both the row and the mirrored column. Tasks touch disjoint kernel
    entries, so they can run concurrently on the thread pool.

    Args:
        data: Array-like whose first axis indexes samples.
        n_threads: Maximum number of worker threads.

    Returns:
        A ``(n_samples, n_samples)`` float array. Note that it is dense, so
        memory grows quadratically with the number of samples.
    """
    data = np.asarray(data, dtype=float)
    n_samples = data.shape[0]
    kernel = np.zeros((n_samples, n_samples))
    if n_samples < 2:
        # No distinct pairs exist; everything stays zero.
        return kernel

    # Treat every sample as a flat feature vector.
    samples = data.reshape(n_samples, -1)

    def fill_row(i):
        # Squared distances from sample i to samples i+1 .. n-1.
        diff = samples[i + 1:] - samples[i]
        sq_dist = np.einsum("ij,ij->i", diff, diff)
        # The small constant keeps coincident samples from dividing by zero.
        values = 1 / (sq_dist + 1e-8)
        kernel[i, i + 1:] = values
        kernel[i + 1:, i] = values

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        # Draining the iterator re-raises any exception from a worker.
        list(executor.map(fill_row, range(n_samples - 1)))

    return kernel

In [None]:
# Compute the pairwise kernel over every row of `data` with the default
# thread count, then show its dimensions.
kernel = euclidean_kernel_threaded(data)
print(kernel.shape)

In [6]:
import pickle
# Deserialise the object stored in "dataframe.pkl" into `df`.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open pickle files that come from a trusted source.
with open("dataframe.pkl", "rb") as f:
    df = pickle.load(f)

In [10]:
# Partition `df` by the value of its 'Label' column and keep one sub-frame
# per label, in the order the group-by yields them.
groups = df.groupby('Label')
dataframes = [label_frame for _label, label_frame in groups]


In [9]:
# Pull the "Features" column of the first label group out as a NumPy array.
# NOTE(review): the following cell reports that each entry is itself an
# ndarray, so this is presumably a 1-D object array of per-row vectors rather
# than a 2-D matrix — confirm downstream consumers accept that layout.
features = dataframes[0]["Features"].to_numpy()

KeyboardInterrupt: 

In [None]:
# Inspect the type of a single entry of `features`.
print(type(features[0]))

<class 'numpy.ndarray'>


In [None]:
from submodlib import FacilityLocationFunction
# Facility-location objective over the first label group's features
# (dense mode, cosine metric). This rebinds `objFL`, replacing the objective
# that an earlier cell built from `data`.
objFL = FacilityLocationFunction(n=features.shape[0], data=features, separate_rep=False, mode="dense", metric="cosine")

In [None]:
# Sample subsets with the non-memoized sampler, using the default tolerance
# and subset count; each subset holds one tenth of the group's examples.
S = stochastic_greedy(features.shape[0], features.shape[0]//10, objFL)

100%|██████████| 10/10 [1:24:16<00:00, 505.68s/it]


In [None]:
# Debug run: with test=True the sampler prints its first candidate pool,
# the gains and the winning element, then returns after that single pick.
S1 = stochastic_greedy(features.shape[0], features.shape[0]//10, objFL, test=True)

  0%|          | 0/10 [00:00<?, ?it/s]

[4305, 1839, 519, 4811, 4717, 1546, 4489, 2645, 4744, 335, 868, 748, 480, 2132, 4316, 2513, 3706, 646, 4988, 3376, 1770, 775, 2791, 4387, 3496, 3154, 3733, 4773, 3856, 2064, 1920, 2020, 2599, 643, 279, 999, 4439, 2936, 822, 1335, 4776, 184, 4373, 2903, 763, 1894]
[4965.339718222618, 4969.378837883472, 4968.610053837299, 4966.099387228489, 4970.711409151554, 4970.752628684044, 4965.456205308437, 4972.216241359711, 4969.260010838509, 4971.767273426056, 4963.7573165893555, 4970.705480277538, 4971.125554680824, 4970.412975728512, 4966.752062022686, 4970.4145275354385, 4969.5245279073715, 4969.404681861401, 4968.852150857449, 4972.110513925552, 4967.63619852066, 4971.440750896931, 4971.409717321396, 4970.033610463142, 4970.585119605064, 4972.232334852219, 4972.94669085741, 4969.715833425522, 4972.2846692204475, 4969.732272148132, 4970.620934188366, 4971.830148756504, 4969.935484468937, 4975.754317104816, 4971.0590970516205, 4973.4421036839485, 4970.144440174103, 4972.805777668953, 4971.3969




In [12]:
# Same preparation for the second label group: extract its "Features" column
# and build a facility-location objective over it (dense mode, cosine metric).
features2 = dataframes[1]["Features"].to_numpy()
objFL2 = FacilityLocationFunction(n=features2.shape[0], data=features2, separate_rep=False, mode="dense", metric="cosine")

In [15]:
# Sample subsets for the second group with the memoized sampler, using the
# default tolerance and subset count; each subset holds one tenth of the
# group's examples.
S2 = SGE_optimised(features2.shape[0], features2.shape[0]//10, objFL2)

100%|██████████| 10/10 [00:23<00:00,  2.35s/it]


In [None]:
# Run the library's own greedy maximisation for comparison, with the same
# budget (one tenth of the group) as the samplers above. `maximize` is a
# method and has to be *called*; assigning it bare only stored the bound
# method object in S4.
# NOTE(review): per the submodlib documentation this should return a list of
# (element, gain) pairs — confirm before comparing with the set-valued
# sampler outputs.
S4 = objFL2.maximize(budget=features2.shape[0] // 10, optimizer="NaiveGreedy")

In [13]:
# Weighted-random-exploration sampling on the second group, with subsets one
# tenth of the group's size.
# NOTE(review): the recorded run below was interrupted during the greedy
# pass, so S3 was not assigned in that session.
S3 = WRE_optimised(features2.shape[0], features2.shape[0]//10, objFL2)

 36%|███▋      | 1819/5000 [6:39:55<11:39:22, 13.19s/it]  


KeyboardInterrupt: 

In [17]:
import itertools
import numpy as np

def calculate_iou(set1, set2):
    """Return the intersection-over-union (Jaccard index) of two collections.

    Args:
        set1: First collection of hashable items (set, list, array, ...).
        set2: Second collection of hashable items.

    Returns:
        ``len(set1 ∩ set2) / len(set1 ∪ set2)`` as a float. Two empty
        collections are considered identical and yield ``1.0``.
    """
    # Accept any iterable, not only objects that have set methods.
    first, second = set(set1), set(set2)
    union = first | second
    if not union:
        # Both inputs are empty: avoid 0/0 and treat them as a perfect match,
        # consistent with any collection compared against itself.
        return 1.0
    return len(first & second) / len(union)


def calculate_iou_matrix(sets_of_integers):
    """Return the symmetric matrix of pairwise IoU scores.

    Entry ``[a, b]`` (and its mirror ``[b, a]``) holds
    ``calculate_iou(sets_of_integers[a], sets_of_integers[b])`` for every
    pair ``a < b``. Diagonal entries are never written and remain ``0``.

    Args:
        sets_of_integers: Indexable sequence of sets.

    Returns:
        A square float array with one row and one column per input set.
    """
    count = len(sets_of_integers)
    matrix = np.zeros((count, count))

    for row in range(count):
        for col in range(row + 1, count):
            score = calculate_iou(sets_of_integers[row], sets_of_integers[col])
            # Mirror the score so the matrix stays symmetric.
            matrix[row, col] = score
            matrix[col, row] = score

    return matrix


In [20]:
# Sanity check: compare the first sampled subset with itself.
calculate_iou(S2[0], S2[0])

1.0

In [18]:
# Pairwise overlap between all subsets sampled by the memoized sampler.
iou_matrix = calculate_iou_matrix(S2)

In [19]:
# Display the pairwise IoU scores.
print(iou_matrix)

[[0.         0.49476831 0.51515152 0.49700599 0.52207002 0.51515152
  0.48809524 0.5060241  0.5015015  0.51975684]
 [0.49476831 0.         0.4858841  0.5015015  0.5015015  0.52439024
  0.53374233 0.51515152 0.50829563 0.53139357]
 [0.51515152 0.4858841  0.         0.5128593  0.52671756 0.5037594
  0.51057402 0.5128593  0.51745068 0.51057402]
 [0.49700599 0.5015015  0.5128593  0.         0.53374233 0.5128593
  0.53374233 0.53846154 0.51057402 0.5037594 ]
 [0.52207002 0.5015015  0.52671756 0.53374233 0.         0.52905199
  0.51745068 0.52439024 0.51745068 0.51057402]
 [0.51515152 0.52439024 0.5037594  0.5128593  0.52905199 0.
  0.52207002 0.51515152 0.53374233 0.5060241 ]
 [0.48809524 0.53374233 0.51057402 0.53374233 0.51745068 0.52207002
  0.         0.53846154 0.53374233 0.52439024]
 [0.5060241  0.51515152 0.5128593  0.53846154 0.52439024 0.51515152
  0.53846154 0.         0.51745068 0.5060241 ]
 [0.5015015  0.50829563 0.51745068 0.51057402 0.51745068 0.53374233
  0.53374233 0.5174506