In [None]:
import math
import random
from tqdm import tqdm
import math
import numpy as np
from submodlib import FacilityLocationFunction, GraphCutFunction
import pickle
import time

In [None]:
import numpy as np
# Load the comma-delimited numeric table stored in np.csv and report its dimensions.
# NOTE(review): the name `data` is rebound further down in this notebook to the
# unpickled contents of ./class-data/class_0.pkl, so this array is only reachable
# until that cell runs.
data = np.loadtxt("np.csv", delimiter=",")
print(data.shape)

In [None]:
def stochastic_greedy(D, k, objFL, ϵ=1e-2, n=10, test=False):
    """Build n subsets of size k with the stochastic-greedy heuristic.

    At every step a random pool of candidates is drawn (without replacement)
    from the elements that are not yet in the subset, and the candidate with
    the largest ``objFL.marginalGain`` is added to the subset.

    Args:
        D: number of training examples; elements are the integers 0..D-1.
        k: The subset size (0 <= k <= D).
        objFL: submodular function object exposing
            ``marginalGain(subset, element)``.
        ϵ: The error tolerance. The pool size is int(D * ln(1/ϵ) / k),
            clamped to at least 1 and at most the number of remaining elements.
        n: The number of subsets to sample.
        test: debug switch. When true, the first greedy step prints the pool,
            its marginal gains and the winning index/element, and the function
            returns immediately with the partially built subsets.

    Returns:
        A list of n sets of element indices.

    Raises:
        ValueError: if k is negative or larger than D.
    """
    if k < 0 or k > D:
        raise ValueError(f"subset size k={k} must satisfy 0 <= k <= D={D}")

    # Initialize empty subsets
    subsets = [set() for _ in range(n)]
    if k == 0:
        # Nothing to select; also avoids dividing by k below.
        return subsets

    universe = set(range(D))
    # Pool size of the stochastic-greedy algorithm. int() truncates, so clamp
    # to 1: an empty pool would leave nothing to take the argmax over.
    s = max(1, int(D * math.log(1 / ϵ) / k))

    for i in tqdm(range(n)):
        chosen = subsets[i]
        for _ in range(k):
            # Sample the pool from D \ Si WITHOUT replacement: duplicates would
            # only repeat gain evaluations and shrink the effective pool.
            candidates = list(universe - chosen)
            R = random.sample(candidates, min(s, len(candidates)))
            marginal_gains = [objFL.marginalGain(chosen, r) for r in R]
            max_index = int(np.argmax(marginal_gains))
            max_r = R[max_index]
            chosen.add(max_r)
            if test:
                print(R)
                print(marginal_gains)
                print(max_index, max_r)
                return subsets
    return subsets

In [None]:
def SGE_optimised(D, k, objFL, ϵ=1e-2, n=10, test=False):
    """Build n subsets of size k with stochastic-greedy, using memoized gains.

    Same selection rule as plain stochastic-greedy, but marginal gains are
    obtained through ``objFL.marginalGainWithMemoization`` and the function's
    memoized statistics are advanced with ``objFL.updateMemoization`` after
    every pick. The memoization is cleared before and after every subset so
    that each subset starts from the empty set and no state leaks to the
    caller.

    Args:
        D: number of training examples; elements are the integers 0..D-1.
        k: The subset size (0 <= k <= D).
        objFL: submodular function object exposing
            ``marginalGainWithMemoization``, ``updateMemoization`` and
            ``clearMemoization``.
        ϵ: The error tolerance. The pool size is int(D * ln(1/ϵ) / k),
            clamped to at least 1 and at most the number of remaining elements.
        n: The number of subsets to sample.
        test: debug switch. When true, the first greedy step prints the pool,
            its marginal gains and the winning index/element, and the function
            returns immediately with the partially built subsets.

    Returns:
        A list of n sets of element indices.

    Raises:
        ValueError: if k is negative or larger than D.
    """
    if k < 0 or k > D:
        raise ValueError(f"subset size k={k} must satisfy 0 <= k <= D={D}")

    # Initialize empty subsets
    subsets = [set() for _ in range(n)]
    if k == 0:
        # Nothing to select; also avoids dividing by k below.
        return subsets

    universe = set(range(D))
    # Pool size of the stochastic-greedy algorithm. int() truncates, so clamp
    # to 1: an empty pool would leave nothing to take the argmax over.
    s = max(1, int(D * math.log(1 / ϵ) / k))

    for i in tqdm(range(n)):
        chosen = subsets[i]
        # The memoized statistics must describe the (empty) subset we start
        # from, whatever the object was used for before this call.
        objFL.clearMemoization()
        for _ in range(k):
            # Sample the pool from D \ Si WITHOUT replacement: duplicates would
            # only repeat gain evaluations and shrink the effective pool.
            candidates = list(universe - chosen)
            R = random.sample(candidates, min(s, len(candidates)))
            marginal_gains = [
                objFL.marginalGainWithMemoization(chosen, r) for r in R
            ]
            max_index = int(np.argmax(marginal_gains))
            max_r = R[max_index]
            # Memoization is updated with the subset as it was BEFORE the
            # element is inserted, then the element is added.
            objFL.updateMemoization(chosen, max_r)
            chosen.add(max_r)
            if test:
                print(R)
                print(marginal_gains)
                print(max_index, max_r)
                objFL.clearMemoization()
                return subsets
        objFL.clearMemoization()
    return subsets


In [None]:
def WRE(D, k, obj, n=10):
    """Samples n subsets using weighted random exploration.

    A full greedy ordering of all D elements is computed first; the marginal
    gain each element had at the moment it was picked becomes its score g.
    Elements are then sampled with probability proportional to
    1 + g + g**2 / 2, without replacement within a subset.

    Args:
        D: number of training examples; elements are the integers 0..D-1.
        k: The subset size (0 <= k <= D).
        obj: submodular function object exposing
            ``marginalGain(subset, element)``.
        n: The number of subsets to sample.

    Returns:
        A list of n sets of element indices (plain Python ints).

    Raises:
        ValueError: if k is negative or larger than D.
    """
    # Validate before the O(D^2) greedy pass rather than after it.
    if k < 0 or k > D:
        raise ValueError(f"subset size k={k} must satisfy 0 <= k <= D={D}")
    if k == 0:
        return [set() for _ in range(n)]

    selected = set()
    remaining = list(range(D))
    G = np.zeros(D)
    for _ in range(D):
        gains = [obj.marginalGain(selected, r) for r in remaining]
        best = int(np.argmax(gains))
        winner = remaining.pop(best)
        G[winner] = gains[best]
        selected.add(winner)

    # 1 + g + g^2/2 is strictly positive for every real g, so the weights
    # always form a valid probability vector after normalisation.
    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)

    # Convert every draw to a set so the result matches the documented return
    # type and supports set operations such as intersection/union.
    return [
        set(np.random.choice(len(ts), size=k, p=ts, replace=False).tolist())
        for _ in range(n)
    ]

In [None]:
def WRE_optimised(D, k, obj, n=10):
    """Samples n subsets using weighted random exploration (memoized gains).

    A full greedy ordering of all D elements is computed first, using
    ``obj.marginalGainWithMemoization`` / ``obj.updateMemoization``; the
    marginal gain each element had at the moment it was picked becomes its
    score g. Elements are then sampled with probability proportional to
    1 + g + g**2 / 2, without replacement within a subset. The memoization of
    ``obj`` is cleared before the greedy pass and again once it is finished.

    Args:
        D: number of training examples; elements are the integers 0..D-1.
        k: The subset size (0 <= k <= D).
        obj: submodular function object exposing
            ``marginalGainWithMemoization``, ``updateMemoization`` and
            ``clearMemoization``.
        n: The number of subsets to sample.

    Returns:
        A list of n sets of element indices (plain Python ints).

    Raises:
        ValueError: if k is negative or larger than D.
    """
    # Validate before the O(D^2) greedy pass rather than after it.
    if k < 0 or k > D:
        raise ValueError(f"subset size k={k} must satisfy 0 <= k <= D={D}")
    if k == 0:
        return [set() for _ in range(n)]

    selected = set()
    remaining = list(range(D))
    G = np.zeros(D)
    # The greedy pass starts from the empty set, so the memoized statistics
    # must too, whatever the object was used for before this call.
    obj.clearMemoization()
    for _ in tqdm(range(D)):
        gains = [obj.marginalGainWithMemoization(selected, r) for r in remaining]
        best = int(np.argmax(gains))
        winner = remaining.pop(best)
        G[winner] = gains[best]
        # Memoization is updated with the set as it was BEFORE the element
        # is inserted, then the element is added.
        obj.updateMemoization(selected, winner)
        selected.add(winner)
    # Do not leave the full-set memoization behind for the next user of obj.
    obj.clearMemoization()

    # 1 + g + g^2/2 is strictly positive for every real g, so the weights
    # always form a valid probability vector after normalisation.
    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)

    # Convert every draw to a set so the result matches the documented return
    # type and supports set operations such as intersection/union.
    S = []
    for _ in tqdm(range(n)):
        draw = np.random.choice(len(ts), size=k, p=ts, replace=False)
        S.append(set(draw.tolist()))
    return S

In [None]:
def WRE_max(D, k, obj, n=10):
    """Samples n subsets using weighted random exploration (memoized gains).

    A full greedy ordering of all D elements is computed first, using
    ``obj.marginalGainWithMemoization`` / ``obj.updateMemoization``; the
    marginal gain each element had at the moment it was picked becomes its
    score g. Elements are then sampled with probability proportional to
    1 + g + g**2 / 2, without replacement within a subset. The memoization of
    ``obj`` is cleared before the greedy pass and again once it is finished.

    NOTE(review): in this notebook the algorithm is the same as WRE_optimised;
    whatever "max" variant was intended is not evident from the code.

    Args:
        D: number of training examples; elements are the integers 0..D-1.
        k: The subset size (0 <= k <= D).
        obj: submodular function object exposing
            ``marginalGainWithMemoization``, ``updateMemoization`` and
            ``clearMemoization``.
        n: The number of subsets to sample.

    Returns:
        A list of n sets of element indices (plain Python ints).

    Raises:
        ValueError: if k is negative or larger than D.
    """
    # Validate before the O(D^2) greedy pass rather than after it.
    if k < 0 or k > D:
        raise ValueError(f"subset size k={k} must satisfy 0 <= k <= D={D}")
    if k == 0:
        return [set() for _ in range(n)]

    picked = set()
    pending = list(range(D))
    G = np.zeros(D)
    # The greedy pass starts from the empty set, so the memoized statistics
    # must too, whatever the object was used for before this call.
    obj.clearMemoization()
    for _ in tqdm(range(D)):
        marginal_gains = [
            obj.marginalGainWithMemoization(picked, r) for r in pending
        ]
        max_index = int(np.argmax(marginal_gains))
        max_r = pending.pop(max_index)
        G[max_r] = marginal_gains[max_index]
        # Memoization is updated with the set as it was BEFORE the element
        # is inserted, then the element is added.
        obj.updateMemoization(picked, max_r)
        picked.add(max_r)
    # Do not leave the full-set memoization behind for the next user of obj.
    obj.clearMemoization()

    # 1 + g + g^2/2 is strictly positive for every real g, so the weights
    # always form a valid probability vector after normalisation.
    ts = 1 + G + 0.5 * G**2
    ts = ts / np.sum(ts)

    # Convert every draw to a set so the result matches the documented return
    # type and supports set operations such as intersection/union.
    S = []
    for _ in tqdm(range(n)):
        draw = np.random.choice(len(ts), size=k, p=ts, replace=False)
        S.append(set(draw.tolist()))
    return S

In [None]:
import numpy as np
from concurrent.futures import ThreadPoolExecutor

def euclidean_distance(data, i, j):
    """Return ``np.linalg.norm`` of the difference between rows i and j of data."""
    return np.linalg.norm(data[i] - data[j])


def euclidean_kernel_threaded(data, n_threads=8):
    """Build a symmetric inverse-squared-distance similarity matrix.

    For every pair of distinct rows (i, j) the entry is
    ``1 / (distance(i, j)**2 + 1e-8)``; the small constant keeps the value
    finite for identical rows. Diagonal entries are left at 0.

    Args:
        data: array whose first axis indexes the samples.
        n_threads: number of worker threads handed to ThreadPoolExecutor.

    Returns:
        An (n_samples, n_samples) float array.
    """
    n_samples = data.shape[0]
    kernel = np.zeros((n_samples, n_samples))

    def similarities_after(i):
        # Similarities between sample i and every later sample j > i.
        return [
            1 / (euclidean_distance(data, i, j) ** 2 + 1e-8)
            for j in range(i + 1, n_samples)
        ]

    with ThreadPoolExecutor(max_workers=n_threads) as executor:
        # One task per row instead of one per pair; Executor.map yields the
        # results in submission order, so enumerate() recovers the row index.
        # The last row has no later samples and needs no task.
        row_results = executor.map(similarities_after, range(n_samples - 1))
        for i, row in enumerate(row_results):
            kernel[i, i + 1:] = row
            kernel[i + 1:, i] = row

    return kernel

In [None]:
# kernel = euclidean_kernel_threaded(data)
# print(kernel.shape)

In [None]:
# Load the pickled object stored in dataframe.pkl into `df`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only run this against a file from a trusted source.
with open("dataframe.pkl", "rb") as f:
    df = pickle.load(f)

In [None]:
# Group `df` by its 'Label' column and keep one sub-frame per label value,
# discarding the label keys themselves.
# NOTE(review): assumes `df` is a pandas DataFrame — the pickle contents are not
# visible in this notebook.
groups = df.groupby('Label')
dataframes = [group for _, group in groups]

In [None]:
# For every per-class frame: build a FacilityLocation function over its
# 'Features' column, draw 20 stochastic-greedy subsets of one tenth of the
# class size (integer division), pickle them to ./class-data/class_<i>.pkl and
# collect them in list_of_class_wise_subsets. Wall-clock time is printed at the end.
# NOTE(review): open(..., "wb") does not create ./class-data/ — the directory
# must already exist.
start_time = time.time()
list_of_class_wise_subsets = []
# The loop variable is deliberately NOT called `df`: rebinding the notebook-level
# `df` here would make a re-run of the grouping cell operate on the last class only.
for i, class_df in enumerate(dataframes):
    features = class_df["Features"].to_numpy()
    objFL = FacilityLocationFunction(n=features.shape[0], data=features, separate_rep=False, mode="dense", metric="cosine")
    S = SGE_optimised(features.shape[0], features.shape[0]//10, objFL, n=20)
    with open(f"./class-data/class_{i}.pkl", "wb") as f:
        pickle.dump(S, f)
    list_of_class_wise_subsets.append(S)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Read back the subsets pickled for class 0.
# NOTE(review): this rebinds `data`, which earlier held the array loaded from np.csv.
with open("./class-data/class_0.pkl", "rb") as f:
    data = pickle.load(f)

In [None]:
# Take the 'Features' column of the first per-class frame as a NumPy array.
features = dataframes[0]["Features"].to_numpy()

In [None]:
# Inspect what kind of object a single entry of `features` is.
print(type(features[0]))

In [None]:
from submodlib import FacilityLocationFunction
# Facility-location function over the first class, built in "sparse" mode with
# the cosine metric (the per-class loop above uses mode="dense" instead).
# NOTE(review): no num_neighbors is passed here — check the submodlib docs for
# whether sparse mode requires it.
objFL = FacilityLocationFunction(n=features.shape[0], data=features, separate_rep=False, mode="sparse", metric="cosine")

In [None]:
# Graph-cut function over the same `features`, dense mode, cosine metric.
# NOTE(review): `objGC` is not used by any later cell visible in this notebook.
objGC = GraphCutFunction(n=features.shape[0], data=features, separate_rep=False, mode="dense", metric="cosine")

In [None]:
import time
# Time a NaiveGreedy maximization of objFL with a budget of all-but-one element;
# early stopping on zero/negative gain is switched off.
start_time = time.time()
S = objFL.maximize(features.shape[0]-1, optimizer='NaiveGreedy', stopIfZeroGain=False, stopIfNegativeGain=False, epsilon=0.1, verbose=False, show_progress=True, costs=None, costSensitiveGreedy=False)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Same budget as the timed run, but only budget and show_progress are passed;
# every other maximize() argument is left at the library default. Overwrites `S`.
S = objFL.maximize(features.shape[0]-1, show_progress=True)

In [None]:
# Scratch note left while investigating run time; prints a fixed message only.
print("Why is this running slow")

In [None]:
# Draw 20 weighted-random-exploration subsets from objFL, each one tenth of the
# class size (integer division). Overwrites `S`.
S = WRE_optimised(features.shape[0], features.shape[0]//10, objFL, n=20)

In [None]:
# Non-memoized stochastic-greedy on the same function, with the default ϵ and
# the default number of subsets (n=10). Overwrites `S`.
S = stochastic_greedy(features.shape[0], features.shape[0]//10, objFL)

In [None]:
# Debug run: with test=True the function prints the sampled candidates, their
# gains and the chosen element for the first greedy step, then returns at once.
S1 = stochastic_greedy(features.shape[0], features.shape[0]//10, objFL, test=True)

In [None]:
# Same set-up for the second per-class frame: feature array plus a dense,
# cosine-metric facility-location function.
features2 = dataframes[1]["Features"].to_numpy()
objFL2 = FacilityLocationFunction(n=features2.shape[0], data=features2, separate_rep=False, mode="dense", metric="cosine")

In [None]:
# Memoized stochastic-greedy subsets for the second class (defaults: n=10 subsets).
S2 = SGE_optimised(features2.shape[0], features2.shape[0]//10, objFL2)

In [None]:
# Weighted-random-exploration subsets for the second class (defaults: n=10 subsets).
S3 = WRE_optimised(features2.shape[0], features2.shape[0]//10, objFL2)

In [None]:
import itertools
import numpy as np

def calculate_iou(set1, set2):
    """Return the intersection-over-union (Jaccard index) of two collections.

    Args:
        set1: first collection of hashable items; any iterable is accepted
            (set, list, NumPy array, ...) and duplicates are ignored.
        set2: second collection, same conventions as set1.

    Returns:
        len(intersection) / len(union) as a float in [0, 1]. Two empty
        collections are treated as identical and yield 1.0.
    """
    first, second = set(set1), set(set2)
    union = first | second
    if not union:
        # Both inputs are empty: avoid 0/0 and report them as identical.
        return 1.0
    return len(first & second) / len(union)


def calculate_iou_matrix(sets_of_integers):
    """Return the symmetric matrix of pairwise IoU values.

    Args:
        sets_of_integers: sequence of collections (sets, lists, arrays, ...).

    Returns:
        A (num_sets, num_sets) float array whose entry [i, j] is the IoU of
        collections i and j. The diagonal is 1.0, the IoU of a collection
        with itself.
    """
    # Convert once up front instead of once per pair.
    as_sets = [set(members) for members in sets_of_integers]
    num_sets = len(as_sets)
    # Start from the identity: every collection fully overlaps with itself.
    iou_matrix = np.eye(num_sets)

    for i, j in itertools.combinations(range(num_sets), 2):
        iou = calculate_iou(as_sets[i], as_sets[j])
        iou_matrix[i, j] = iou
        iou_matrix[j, i] = iou  # Fill the symmetric part

    return iou_matrix


In [None]:
# Sanity check: IoU of the first subset with itself.
calculate_iou(S2[0], S2[0])

In [None]:
# Pairwise overlap between the stochastic-greedy subsets of the second class.
iou_matrix = calculate_iou_matrix(S2)

In [None]:
# Show the full pairwise IoU matrix.
print(iou_matrix)