In [1]:
import functools
import time

def time_function(func):
    """Decorate ``func`` so that every call prints its wall-clock duration.

    The printed line has the form ``"<name> executed in <seconds> seconds"``.
    When the call receives ``instances`` and/or ``papers`` as keyword
    arguments, their lengths are appended in parentheses, e.g.
    ``" (3 instances, 2 papers)"``. Positional arguments are not inspected.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func``, prints the timing
        line and returns ``func``'s result unchanged.
    """
    @functools.wraps(func)  # keep __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # Describe the workload size from the sized keyword arguments, if any.
        sizes = [
            f"{len(kwargs[name])} {name}"
            for name in ("instances", "papers")
            if name in kwargs
        ]
        appendix = f" ({', '.join(sizes)})" if sizes else ""

        # perf_counter is monotonic and uses the highest-resolution clock
        # available; time.time() can tick too coarsely for short benchmarks.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"{func.__name__} executed in {elapsed} seconds{appendix}")
        return result
    return wrapper

In [2]:
# Compare dict insert/lookup times for frozenset, string, integer and concatenated-string keys
@time_function
def frozen_time_test():
    """Fill a dict with 100000 single-element frozenset keys, then probe each one.

    Every key is rebuilt from scratch for both the insert and the membership
    check, so key construction is part of the timed work.
    """
    lookup = {}
    for number in range(100_000):
        lookup[frozenset([str(number)])] = number
    for number in range(100_000):
        if frozenset([str(number)]) in lookup:
            pass

# Pre-built fixtures for frozen_pre_test. The list entries and the dict keys
# are constructed separately, so they are equal but distinct objects.
frozen_list = [frozenset([str(number)]) for number in range(100_000)]

frozen_dict = {frozenset([str(number)]): number for number in range(100_000)}

@time_function
def frozen_pre_test():
    """Probe ``frozen_dict`` with the first 100000 entries of ``frozen_list``.

    Keys and dict are built outside the function, so only the list indexing
    and the membership check are timed.
    """
    for position in range(100_000):
        if frozen_list[position] in frozen_dict:
            pass

@time_function
def string_time_test():
    """Fill a dict with the string keys "0".."99999", then probe each key.

    The key string is rebuilt with ``str`` for both the insert and the
    membership check, so the conversion is part of the timed work.
    """
    lookup = {}
    for number in range(100_000):
        lookup[str(number)] = number
    for number in range(100_000):
        if str(number) in lookup:
            pass

@time_function
def integer_time_test():
    """Fill a dict with the integer keys 0..99999, then probe each key once."""
    lookup = {}
    for number in range(100_000):
        lookup[number] = number
    for number in range(100_000):
        if number in lookup:
            pass

@time_function
def join_time_test():
    """Fill a dict with keys of the form "<n> num", then probe each key.

    Each key is produced by string concatenation for both the insert and the
    membership check, so the concatenation is part of the timed work.
    """
    lookup = {}
    for number in range(100_000):
        lookup[str(number) + " num"] = number
    for number in range(100_000):
        if str(number) + " num" in lookup:
            pass


# Pre-built string keys and dict for string_pre_test. The list entries and the
# dict keys come from separate str() calls.
string_list = list(map(str, range(100_000)))
string_dict = dict(zip(map(str, range(100_000)), range(100_000)))

@time_function
def string_pre_test():
    """Probe ``string_dict`` with the first 100000 entries of ``string_list``.

    Keys and dict are built outside the function, so only the list indexing
    and the membership check are timed.
    """
    for position in range(100_000):
        if string_list[position] in string_dict:
            pass

# Pre-built integer keys and dict for integer_pre_test.
integer_list = [*range(100_000)]
integer_dict = dict(zip(range(100_000), range(100_000)))

@time_function
def integer_pre_test():
    """Probe ``integer_dict`` with the first 100000 entries of ``integer_list``.

    Keys and dict are built outside the function, so only the list indexing
    and the membership check are timed.
    """
    for position in range(100_000):
        if integer_list[position] in integer_dict:
            pass

# Pre-built "<n> num" keys and dict for join_pre_test. The list entries and the
# dict keys are formatted separately.
join_list = [f"{number} num" for number in range(100_000)]
join_dict = {f"{number} num": number for number in range(100_000)}

@time_function
def join_pre_test():
    """Probe ``join_dict`` with the first 100000 entries of ``join_list``.

    Keys and dict are built outside the function, so only the list indexing
    and the membership check are timed.
    """
    for position in range(100_000):
        if join_list[position] in join_dict:
            pass

In [3]:
# Run the lookup-only benchmarks; their keys and dicts were built beforehand.
for benchmark in (frozen_pre_test, string_pre_test, integer_pre_test, join_pre_test):
    benchmark()

frozen_pre_test executed in 0.017003536224365234 seconds
string_pre_test executed in 0.010003089904785156 seconds
integer_pre_test executed in 0.005000114440917969 seconds
join_pre_test executed in 0.012002706527709961 seconds


In [4]:
# Run the benchmarks that build their keys and dict inside the timed function.
for benchmark in (frozen_time_test, string_time_test, integer_time_test, join_time_test):
    benchmark()

frozen_time_test executed in 0.17003798484802246 seconds
string_time_test executed in 0.03400826454162598 seconds
integer_time_test executed in 0.012002706527709961 seconds
join_time_test executed in 0.04801154136657715 seconds


In [5]:
def split_string(input_string, delimiters=(" ", "-", "_")):
    """Split ``input_string`` into tokens on whitespace and on each delimiter.

    Every delimiter occurrence is turned into a single space and the result is
    split on runs of whitespace, so consecutive delimiters never produce empty
    tokens.

    Args:
        input_string: The text to tokenize.
        delimiters: Iterable of separator strings. The default is an immutable
            tuple so it cannot be altered between calls.

    Returns:
        list[str]: The tokens in order of appearance; empty for blank input.

    Raises:
        ValueError: If a delimiter is the empty string (raised by ``str.split``).
    """
    for delimiter in delimiters:
        # Normalise this delimiter to a space; the final split() handles the rest.
        input_string = " ".join(input_string.split(delimiter))
    return input_string.split()

In [6]:
# Setup: read the instances, give every distinct word an integer id, and
# precompute the list of word ids for every unordered pair of instances.
# NOTE(review): no encoding is passed to open(), so the platform default
# applies — confirm the encoding of the instances file.
with open("MVP/instances.txt", "r") as file:
    instances = file.readlines()

# Tokenize each instance a single time instead of once per pair below.
instance_tokens = [split_string(instance) for instance in instances]

# word -> id, numbered in order of first appearance.
words = {}
for tokens in instance_tokens:
    for word in tokens:
        if word not in words:
            words[word] = len(words)

# frozenset of words -> list of word ids; equal word sets share one list.
word_combinations = {}

# Square matrix of empty lists, sized from the data so that any number of
# instances fits. Cells on the diagonal stay empty.
instance_count = len(instances)
word_combination_index_literal_literal = [
    [[] for _ in range(instance_count)] for _ in range(instance_count)
]

for id1 in range(instance_count):
    for id2 in range(id1 + 1, instance_count):
        # A frozenset is hashable and order-independent, so it can key the cache.
        froz = frozenset(instance_tokens[id1] + instance_tokens[id2])
        # Reuse the id list if this word set was already seen.
        word_indizes = word_combinations.get(froz)
        if word_indizes is None:
            word_indizes = [words[word] for word in froz]
            word_combinations[froz] = word_indizes
        # The pair is unordered: store the same list at both positions.
        word_combination_index_literal_literal[id1][id2] = word_indizes
        word_combination_index_literal_literal[id2][id1] = word_indizes

In [7]:
def find_via_frozen_sets(literal1, literal2):
    """Look up the stored entry for the combined word set of two literals.

    Both literals are tokenized with ``split_string`` and their tokens merged
    into a frozenset, which is looked up in ``word_combinations``. A new empty
    list is returned when that word set has no entry.
    """
    combined_tokens = split_string(literal1) + split_string(literal2)
    return word_combinations.get(frozenset(combined_tokens), [])

def find_via_index(l1, l2):
    """Fetch the entry stored at row ``l1``, column ``l2`` of the pair matrix."""
    row = word_combination_index_literal_literal[l1]
    return row[l2]

In [8]:
# Benchmark both lookup strategies over every unordered pair of instances, repeated `run` times, and compare the timings
@time_function
def test_frozen(run = 100):
    """Time ``find_via_frozen_sets`` over every unordered pair of instances.

    Args:
        run: How many times the full sweep over all pairs is repeated.
    """
    # The repetition counter is never read; naming it `_` keeps it distinct
    # from the pair index `i` used by the inner loops.
    for _ in range(run):
        for i, instance in enumerate(instances):
            for j in range(i + 1, len(instances)):
                find_via_frozen_sets(instance, instances[j])

@time_function
def test_index(run = 100):
    """Time ``find_via_index`` over every unordered pair of instance indices.

    Args:
        run: How many times the full sweep over all pairs is repeated.
    """
    # The repetition counter is never read; naming it `_` keeps it distinct
    # from the pair index `i` used by the inner loops.
    for _ in range(run):
        for i in range(len(instances)):
            for j in range(i + 1, len(instances)):
                find_via_index(i, j)

In [9]:
# test = [1, 2, 4, 8, 16, 32, 64]

# for id, run in enumerate(test):
#     print(f"Run {run}")
#     if id % 2 == 0:
#         test_index(run)
#         test_frozen(run)
#     else:  
#         test_frozen(run)
#         test_index(run)

# Take-away
## Recomputing the frozensets on every lookup takes longer than storing the combinations once.

In [13]:
# Shared fixtures for the access benchmarks: the same 100000 integers stored
# once as a list and once as an identity-mapped dict.
test_list = [*range(100_000)]
test_dict = {number: number for number in range(100_000)}

@time_function
def dict_test(run=100):
    """Read the keys ``0 .. run-1`` from ``test_dict`` by direct subscript."""
    for key in range(run):
        value = test_dict[key]

@time_function
def value_test(run=100):
    """Read positions ``0 .. run-1`` from a fresh ``list(test_dict.values())``.

    The values view is copied into a new list on every iteration, so each
    access pays for materializing all of the dict's values.
    """
    for position in range(run):
        value = list(test_dict.values())[position]

from itertools import islice

@time_function
def slice_test(run=100):
    """Read positions ``0 .. run-1`` of ``test_dict.values()`` through ``islice``.

    For each position a new ``islice`` skips ahead to it and ``next`` takes the
    first remaining value, without copying the values into a list.
    """
    for position in range(run):
        value = next(islice(test_dict.values(), position, None))

@time_function
def list_test(run=100):
    """Read positions ``0 .. run-1`` from ``test_list`` by direct subscript."""
    for position in range(run):
        value = test_list[position]

# Run all four access strategies with the same repetition count.
run = 10_000
for benchmark in (dict_test, value_test, slice_test, list_test):
    benchmark(run)

dict_test executed in 0.005001068115234375 seconds
value_test executed in 44.96441173553467 seconds
list_test executed in 0.003002166748046875 seconds
