In [1]:
import math
import os
import pickle
import random
import time

import numpy as np
from submodlib import FacilityLocationFunction, GraphCutFunction
from tqdm import tqdm

In [2]:
# Load the comma-separated matrix stored in "np.csv" and report its shape.
import numpy as np
data = np.loadtxt("np.csv", delimiter=",")
print(data.shape)

(50000, 1000)


In [2]:
def stochastic_greedy(D, k, objFL, ϵ=1e-2, n=10, test=False):
    """Sample ``n`` subsets of size ``k`` with the stochastic-greedy algorithm.

    Each subset is grown one element at a time: at every step a random pool
    of candidates is drawn *without replacement* from the elements that are
    not yet in the subset, and the candidate with the largest marginal gain
    is added.

    Args:
        D: Number of training examples; elements are the integers ``0..D-1``.
        k: The subset size.
        objFL: Submodular function object exposing ``marginalGain(X, element)``.
        ϵ: The error tolerance; it sets the size of the random candidate pool.
        n: The number of subsets to sample.
        test: Debug switch. When true, the candidates, gains and pick of the
            very first greedy step are printed and the function returns
            immediately.

    Returns:
        A list of ``n`` sets of element indices. All sets are empty when
        ``k <= 0``.

    Raises:
        ValueError: If ``k`` is larger than ``D``.
    """
    # Initialize empty subsets
    S = [set() for _ in range(n)]
    if k <= 0:
        # Nothing to select; also avoids dividing by zero below.
        return S
    if k > D:
        raise ValueError(f"subset size k={k} exceeds ground-set size D={D}")

    # Candidate-pool size per greedy step: (D / k) * ln(1 / ϵ), at least one
    # candidate so that argmax always has something to choose from.
    s = max(1, int(D * math.log(1 / ϵ) / k))
    for i in tqdm(range(n)):
        # Elements of D \ S[i], maintained incrementally instead of being
        # rebuilt from scratch on every greedy step.
        remaining = list(range(D))
        for _ in range(k):
            # Sample distinct positions so the pool holds no duplicates and
            # never asks for more candidates than are left.
            positions = random.sample(range(len(remaining)), min(s, len(remaining)))
            R = [remaining[p] for p in positions]
            marginal_gains = [objFL.marginalGain(S[i], r) for r in R]
            max_index = int(np.argmax(marginal_gains))
            max_r = R[max_index]
            S[i].add(max_r)
            # O(1) removal of the chosen element: overwrite it with the last
            # entry, then drop the last entry.
            remaining[positions[max_index]] = remaining[-1]
            remaining.pop()
            if test:
                print(R)
                print(marginal_gains)
                print(max_index, max_r)
                return S
    return S

In [3]:
def SGE_optimised(D, k, objFL, ϵ=1e-2, n=10, test=False):
    """Sample ``n`` subsets with stochastic-greedy, using memoized gains.

    Same procedure as plain stochastic-greedy, but marginal gains are
    evaluated through the objective's memoization interface, which is kept in
    sync with the subset currently being grown.

    Args:
        D: Number of training examples; elements are the integers ``0..D-1``.
        k: The subset size.
        objFL: Submodular function object exposing
            ``marginalGainWithMemoization(X, element)``,
            ``updateMemoization(X, element)`` and ``clearMemoization()``.
        ϵ: The error tolerance; it sets the size of the random candidate pool.
        n: The number of subsets to sample.
        test: Debug switch. When true, the candidates, gains and pick of the
            very first greedy step are printed and the function returns
            immediately.

    Returns:
        A list of ``n`` sets of element indices. All sets are empty when
        ``k <= 0``.

    Raises:
        ValueError: If ``k`` is larger than ``D``.
    """
    # Initialize empty subsets
    S = [set() for _ in range(n)]
    if k <= 0:
        # Nothing to select; also avoids dividing by zero below.
        return S
    if k > D:
        raise ValueError(f"subset size k={k} exceeds ground-set size D={D}")

    # Candidate-pool size per greedy step: (D / k) * ln(1 / ϵ), at least one.
    s = max(1, int(D * math.log(1 / ϵ) / k))
    # The objective may carry memoized state from an earlier use; start from
    # a clean slate so the first subset is evaluated against the empty set.
    objFL.clearMemoization()
    for i in tqdm(range(n)):
        # Elements of D \ S[i], maintained incrementally.
        remaining = list(range(D))
        for _ in range(k):
            # Distinct candidate positions: no duplicates, never more than left.
            positions = random.sample(range(len(remaining)), min(s, len(remaining)))
            R = [remaining[p] for p in positions]
            marginal_gains = [objFL.marginalGainWithMemoization(S[i], r) for r in R]
            max_index = int(np.argmax(marginal_gains))
            max_r = R[max_index]
            # The memo must be updated with the set *before* the element is added.
            objFL.updateMemoization(S[i], max_r)
            S[i].add(max_r)
            # O(1) removal of the chosen element from the candidate list.
            remaining[positions[max_index]] = remaining[-1]
            remaining.pop()
            if test:
                print(R)
                print(marginal_gains)
                print(max_index, max_r)
                # Do not leave a partially built memo behind on early exit.
                objFL.clearMemoization()
                return S
        # Reset the memo so the next subset starts from the empty set.
        objFL.clearMemoization()
    return S


In [None]:
def WRE(D, k, obj, n=10):
    """Sample ``n`` subsets using weighted random exploration.

    A full greedy pass over all ``D`` elements records, for each element, the
    marginal gain it had at the moment it was picked. Those gains are turned
    into a sampling distribution from which ``n`` subsets of size ``k`` are
    drawn without replacement.

    Args:
        D: Number of training examples; elements are the integers ``0..D-1``.
        k: The subset size.
        obj: Submodular function object exposing ``marginalGain(X, element)``.
        n: The number of subsets to sample.

    Returns:
        A list of ``n`` sets of element indices.
    """
    A = set()
    G = np.zeros(D)
    # Elements not yet in A, kept in ascending order and shrunk in place
    # instead of recomputing set(range(D)) - A on every step.
    remaining = list(range(D))
    for _ in range(D):
        marginal_gains = [obj.marginalGain(A, r) for r in remaining]
        max_index = int(np.argmax(marginal_gains))
        max_r = remaining.pop(max_index)
        G[max_r] = marginal_gains[max_index]
        A.add(max_r)

    # Sampling weights 1 + G + G^2 / 2 (the second-order Taylor polynomial of
    # exp(G)), normalised to a probability distribution.
    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)
    # Return real sets so the result supports set operations such as
    # intersection/union, as the "list of subsets" contract implies.
    return [
        set(np.random.choice(len(ts), size=k, p=ts, replace=False).tolist())
        for _ in range(n)
    ]

In [None]:
def WRE_optimised(D, k, obj, n=10):
    """Sample ``n`` subsets using weighted random exploration (memoized).

    A full greedy pass over all ``D`` elements records, for each element, the
    marginal gain it had at the moment it was picked; gains are evaluated
    through the objective's memoization interface. The gains are then turned
    into a sampling distribution from which ``n`` subsets of size ``k`` are
    drawn without replacement.

    Args:
        D: Number of training examples; elements are the integers ``0..D-1``.
        k: The subset size.
        obj: Submodular function object exposing
            ``marginalGainWithMemoization(X, element)``,
            ``updateMemoization(X, element)`` and ``clearMemoization()``.
        n: The number of subsets to sample.

    Returns:
        A list of ``n`` sets of element indices.
    """
    # The objective may carry memoized state from an earlier use; the greedy
    # pass below must start from the empty set.
    obj.clearMemoization()
    A = set()
    G = np.zeros(D)
    # Elements not yet in A, kept in ascending order and shrunk in place
    # instead of recomputing set(range(D)) - A on every step.
    remaining = list(range(D))
    for _ in tqdm(range(D)):
        marginal_gains = [obj.marginalGainWithMemoization(A, r) for r in remaining]
        max_index = int(np.argmax(marginal_gains))
        max_r = remaining.pop(max_index)
        G[max_r] = marginal_gains[max_index]
        # The memo must be updated with the set *before* the element is added.
        obj.updateMemoization(A, max_r)
        A.add(max_r)
    # Leave the shared objective clean for whoever uses it next.
    obj.clearMemoization()

    # Sampling weights 1 + G + G^2 / 2 (the second-order Taylor polynomial of
    # exp(G)), normalised to a probability distribution.
    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)
    S = []
    for _ in tqdm(range(n)):
        picked = np.random.choice(len(ts), size=k, p=ts, replace=False)
        # Store real sets so the result supports intersection/union.
        S.append(set(picked.tolist()))
    return S

In [None]:
def WRE_max(D, k, obj, n=10):
    """Sample ``n`` subsets using weighted random exploration on shifted gains.

    Identical to the memoized weighted-random-exploration procedure, except
    that the recorded greedy gains are shifted so that the smallest gain is
    zero before they are converted into sampling weights.

    Args:
        D: Number of training examples; elements are the integers ``0..D-1``.
        k: The subset size.
        obj: Submodular function object exposing
            ``marginalGainWithMemoization(X, element)``,
            ``updateMemoization(X, element)`` and ``clearMemoization()``.
        n: The number of subsets to sample.

    Returns:
        A list of ``n`` sets of element indices.
    """
    # The objective may carry memoized state from an earlier use; the greedy
    # pass below must start from the empty set.
    obj.clearMemoization()
    A = set()
    G = np.zeros(D)
    # Elements not yet in A, kept in ascending order and shrunk in place
    # instead of recomputing set(range(D)) - A on every step.
    remaining = list(range(D))
    for _ in tqdm(range(D)):
        marginal_gains = [obj.marginalGainWithMemoization(A, r) for r in remaining]
        max_index = int(np.argmax(marginal_gains))
        max_r = remaining.pop(max_index)
        G[max_r] = marginal_gains[max_index]
        # The memo must be updated with the set *before* the element is added.
        obj.updateMemoization(A, max_r)
        A.add(max_r)
    # Leave the shared objective clean for whoever uses it next.
    obj.clearMemoization()

    # Shift the gains so the minimum is exactly zero.
    G = G - G.min()

    # Sampling weights 1 + G + G^2 / 2 (the second-order Taylor polynomial of
    # exp(G)), normalised to a probability distribution.
    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)
    S = []
    for _ in tqdm(range(n)):
        picked = np.random.choice(len(ts), size=k, p=ts, replace=False)
        # Store real sets so the result supports intersection/union.
        S.append(set(picked.tolist()))
    return S

In [None]:
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def euclidean_distance(data, i, j):
    """Return the Euclidean (L2) distance between entries ``i`` and ``j`` of ``data``."""
    difference = data[i] - data[j]
    return np.linalg.norm(difference)

def euclidean_kernel_threaded(data, n_threads=8):
    """Build a symmetric inverse-squared-distance similarity matrix.

    For every pair of distinct samples ``i != j`` the entry is
    ``1 / (||data[i] - data[j]||**2 + 1e-8)``. The diagonal is left at zero.

    Work is split into one task per row (similarities from sample ``i`` to
    all later samples) rather than one task per pair, and each result is
    written back together with its row index.

    Args:
        data: Array whose first axis indexes the samples.
        n_threads: Maximum number of worker threads.

    Returns:
        An ``(n_samples, n_samples)`` float array of pairwise similarities.
    """
    n_samples = data.shape[0]
    kernel = np.zeros((n_samples, n_samples))

    def _row_similarities(i):
        # Similarities between sample i and samples i+1 .. n_samples-1.
        diffs = data[i + 1:] - data[i]
        # Sum squares over every axis except the first (the sample axis).
        squared = np.sum(diffs * diffs, axis=tuple(range(1, diffs.ndim)))
        # The small constant keeps coincident samples from dividing by zero.
        return 1 / (squared + 1e-8)

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        # The last row has no later samples, hence n_samples - 1 tasks;
        # Executor.map yields results in submission order, so enumerate()
        # recovers the row index of each result.
        rows = executor.map(_row_similarities, range(n_samples - 1))
        for i, similarities in enumerate(rows):
            kernel[i, i + 1:] = similarities
            kernel[i + 1:, i] = similarities  # mirror into the lower triangle

    return kernel

In [None]:
# kernel = euclidean_kernel_threaded(data)
# print(kernel.shape)

In [6]:
# Fraction of each class to keep per subset (the sampling cell multiplies the class size by it).
subset_size_fraction = 0.3

In [4]:
# Load the pickled object from "dataframe.pkl" into `df` (later cells use it as a DataFrame).
# NOTE(review): pickle.load executes arbitrary code from the file — only load trusted files.
with open("dataframe.pkl", "rb") as f:
    df = pickle.load(f)

In [5]:
# Split the frame into one sub-frame per distinct value of the 'Label' column.
groups = df.groupby('Label')
dataframes = [group for _, group in groups]

In [None]:
# Candidate subset-size fractions.
# NOTE(review): not referenced by any other cell visible in this notebook.
fraction_list = [0.05, 0.10, 0.15, 0.3, 0.5]

In [15]:
# For every class: build a facility-location objective over its features,
# sample `num_sets` stochastic-greedy subsets, translate the positional picks
# into the frame's `Index` values, and pickle the result per class.
time_to_get_subsets = 0  # accumulated seconds spent inside SGE_optimised
list_of_class_wise_subsets = []
subset_size_fraction = 0.3
num_sets = 20
output_dir = f"./class-data-{subset_size_fraction}"
# Create the target directory up front so a fresh run does not fail on open().
os.makedirs(output_dir, exist_ok=True)
run_start_time = time.time()
# The loop variable is deliberately not called `df`, so the full frame loaded
# earlier is not clobbered by the last class's sub-frame.
for i, class_df in enumerate(dataframes):
    features = class_df["Features"].to_numpy()
    objFL = FacilityLocationFunction(n=features.shape[0], data=features, separate_rep=False, mode="dense", metric="cosine")
    start_time = time.time()
    S = SGE_optimised(features.shape[0], int(features.shape[0]*subset_size_fraction), objFL, n=num_sets)
    # Accumulate across classes instead of keeping only the last class's time.
    time_to_get_subsets += time.time() - start_time
    # S holds positions within this class; map each sampled subset S[j]
    # (not just S[0]) to the corresponding `Index` values.
    set_indexes = [
        set(class_df.iloc[list(S[j])].Index.tolist())
        for j in range(num_sets)
    ]
    with open(f"{output_dir}/class_{i}.pkl", "wb") as f:
        pickle.dump(set_indexes, f)
    list_of_class_wise_subsets.append(set_indexes)
# Total wall-clock time for the whole run, not just the final class.
print("--- %s seconds ---" % (time.time() - run_start_time))

100%|██████████| 20/20 [01:48<00:00,  5.42s/it]
100%|██████████| 20/20 [01:42<00:00,  5.11s/it]
100%|██████████| 20/20 [01:44<00:00,  5.23s/it]
100%|██████████| 20/20 [01:42<00:00,  5.13s/it]
100%|██████████| 20/20 [01:41<00:00,  5.08s/it]
100%|██████████| 20/20 [01:44<00:00,  5.24s/it]
100%|██████████| 20/20 [01:48<00:00,  5.41s/it]
100%|██████████| 20/20 [01:51<00:00,  5.60s/it]
100%|██████████| 20/20 [01:47<00:00,  5.37s/it]
100%|██████████| 20/20 [01:47<00:00,  5.37s/it]

--- 1277.7257900238037 seconds ---





In [None]:
# Load a pickled object from "./class-data/class_0.pkl" (presumably a saved per-class subset list).
# NOTE(review): the sampling cell writes to "./class-data-{subset_size_fraction}/", not "./class-data/" —
# confirm the intended directory. This also rebinds `data`, which earlier held the array from "np.csv".
with open("./class-data/class_0.pkl", "rb") as f:
    data = pickle.load(f)

In [8]:
# Pull the "Features" column of the first per-label frame out as a NumPy array.
features = dataframes[0]["Features"].to_numpy()

In [9]:
# Check the type of a single entry of `features`.
print(type(features[0]))

<class 'numpy.ndarray'>


In [11]:
# Build a facility-location objective over `features` (dense mode, cosine metric).
from submodlib import FacilityLocationFunction
objFL = FacilityLocationFunction(n=features.shape[0], data=features, separate_rep=False, mode="dense", metric="cosine")

In [12]:
# Sample the default number of subsets (n=10), each one tenth of the class size, with memoized stochastic greedy.
S = SGE_optimised(features.shape[0], features.shape[0]//10, objFL)

100%|██████████| 10/10 [00:23<00:00,  2.39s/it]


In [16]:
# Inspect the type of the "Index" column and the first sampled subset.
print(type(dataframes[0]["Index"]))
print(S[0])

<class 'pandas.core.series.Series'>
{2056, 2063, 2068, 2076, 2081, 4130, 4131, 4132, 41, 4137, 2092, 46, 2095, 51, 53, 4151, 58, 2112, 2115, 2121, 74, 4171, 80, 83, 4181, 2134, 2133, 4186, 2142, 96, 4194, 108, 4205, 2157, 4208, 2162, 2163, 2169, 4219, 2171, 4221, 4223, 2177, 4227, 4229, 137, 4246, 2200, 153, 2206, 4255, 4257, 2215, 2216, 4266, 2220, 2222, 182, 2241, 2242, 2251, 4302, 206, 209, 215, 2267, 231, 2281, 234, 4336, 242, 246, 249, 4349, 2305, 4354, 2316, 2318, 4367, 4375, 281, 2330, 2339, 2347, 2351, 307, 4414, 4419, 4422, 329, 2382, 334, 4432, 2383, 337, 2387, 4436, 2408, 4458, 364, 365, 2420, 2422, 2423, 376, 4475, 2428, 386, 391, 2440, 4491, 405, 4506, 4507, 4509, 420, 4522, 2477, 2479, 433, 4540, 4545, 2505, 457, 2508, 2519, 4572, 481, 2534, 487, 4589, 4590, 506, 4611, 2565, 4618, 523, 2576, 4625, 2578, 4629, 538, 4635, 4642, 4643, 552, 4649, 4656, 2609, 4657, 564, 2612, 4663, 4671, 589, 591, 4688, 592, 2639, 2644, 597, 4693, 4695, 2646, 2652, 4702, 2660, 614, 616, 2668, 

In [19]:
# Treat the first subset's members as row positions and look up their "Index" values.
index_values = dataframes[0].iloc[list(S[0])].Index
print(type(index_values))


<class 'pandas.core.series.Series'>


In [None]:
# Build a graph-cut objective over `features` with the same settings as the facility-location one.
# NOTE(review): `objGC` is not used by any other cell visible in this notebook.
objGC = GraphCutFunction(n=features.shape[0], data=features, separate_rep=False, mode="dense", metric="cosine")

In [None]:
# Time a NaiveGreedy maximisation of objFL; the first argument is features.shape[0]-1
# (presumably the selection budget, i.e. all but one element — confirm against submodlib).
import time
start_time = time.time()
S = objFL.maximize(features.shape[0]-1, optimizer='NaiveGreedy', stopIfZeroGain=False, stopIfNegativeGain=False, epsilon=0.1, verbose=False, show_progress=True, costs=None, costSensitiveGreedy=False)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Same maximisation call with the library's default options, showing progress.
S = objFL.maximize(features.shape[0]-1, show_progress=True)

In [None]:
# NOTE(review): leftover debugging marker; consider removing this cell.
print("Why is this running slow")

In [None]:
# Draw 20 weighted-random-exploration subsets, each one tenth of the class size.
S = WRE_optimised(features.shape[0], features.shape[0]//10, objFL, n=20)

In [None]:
# Non-memoized stochastic greedy with the default n=10 subsets, for comparison.
S = stochastic_greedy(features.shape[0], features.shape[0]//10, objFL)

In [None]:
# Debug run: test=True prints the first step's candidates, gains and pick, then returns early.
S1 = stochastic_greedy(features.shape[0], features.shape[0]//10, objFL, test=True)

In [None]:
# Repeat the setup for the second per-label frame: extract features and build its objective.
features2 = dataframes[1]["Features"].to_numpy()
objFL2 = FacilityLocationFunction(n=features2.shape[0], data=features2, separate_rep=False, mode="dense", metric="cosine")

In [None]:
# Memoized stochastic-greedy subsets (default n=10) for the second frame.
S2 = SGE_optimised(features2.shape[0], features2.shape[0]//10, objFL2)

In [None]:
# Weighted-random-exploration subsets (default n=10) for the second frame, reusing objFL2.
S3 = WRE_optimised(features2.shape[0], features2.shape[0]//10, objFL2)

In [None]:
import itertools
import numpy as np

def calculate_iou(set1, set2):
    """Return the intersection-over-union (Jaccard index) of two collections.

    Args:
        set1: First collection of hashable items (a set, list, array, ...).
        set2: Second collection of hashable items.

    Returns:
        ``|set1 ∩ set2| / |set1 ∪ set2|`` as a float. Two empty collections
        are treated as identical and yield ``1.0`` instead of dividing by zero.
    """
    # Coerce to sets so non-set iterables (e.g. index arrays) are accepted too.
    first, second = set(set1), set(set2)
    union = first | second
    if not union:
        return 1.0
    return len(first & second) / len(union)


def calculate_iou_matrix(sets_of_integers):
    """Compute the pairwise IoU between every two distinct sets in a sequence.

    Args:
        sets_of_integers: Sequence of sets to compare.

    Returns:
        A square symmetric array whose ``[a, b]`` entry is the IoU of sets
        ``a`` and ``b``. Diagonal entries are never written and stay ``0``.
    """
    count = len(sets_of_integers)
    result = np.zeros((count, count))

    for row in range(count):
        # Visit each unordered pair once (col > row) and mirror the score.
        for col in range(row + 1, count):
            score = calculate_iou(sets_of_integers[row], sets_of_integers[col])
            result[row, col] = score
            result[col, row] = score

    return result


In [None]:
# Sanity check: the IoU of a non-empty subset with itself should be 1.0.
calculate_iou(S2[0], S2[0])

In [None]:
# Pairwise overlap between the stochastic-greedy subsets sampled for the second frame.
iou_matrix = calculate_iou_matrix(S2)

In [None]:
# Show the pairwise IoU matrix.
print(iou_matrix)