**Author:** J. Žovák, `482857@mail.muni.cz`

In [None]:
import matplotlib.pyplot as plt
import json
import numpy as np

In [None]:
# Dataset selection and the JSON Lines files that belong to it.
dataset_name = "hnm"
dir_path = "./data/" + dataset_name + "/"
payloads_path = f"{dir_path}payloads.jsonl"
tests_path = f"{dir_path}tests.jsonl"

In [None]:
# Read payloads.jsonl into a list with one parsed JSON object per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform default, and blank lines (e.g. an empty line at the end of the
# file) are skipped instead of raising json.JSONDecodeError.
with open(payloads_path, 'r', encoding='utf-8') as file:
    payloads = [json.loads(line) for line in file if line.strip()]

In [None]:
# def preprocess_payloads(payloads):
#     """
#     Preprocess payloads replacing None values with the string 'None'.
#     :param payloads: A list of payload entries
#     :return: The preprocessed list of payloads
#     """
#     for payload in payloads:
#         for key, value in payload.items():
#             if value is None:
#                 payload[key] = 'None'
# 
# preprocess_payloads(payloads)

In [None]:
# # Store the preprocessed payloads back in the JSON Lines format
# with open(payloads_path, 'w') as file:
#     for payload in payloads:
#         json_line = json.dumps(payload)
#         file.write(json_line + '\n')

In [None]:
# Peek at the first payload record to inspect its fields.
payloads[0]

In [None]:
# Collect the distinct 'product_group_name' values across all payloads.
unique_product_groups = {product['product_group_name'] for product in payloads}

unique_count = len(unique_product_groups)
print("Number of unique product group names:", unique_count)

# Load tests.jsonl as a Python list

In [None]:
# Read tests.jsonl into a list with one parsed JSON object per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform default, and blank lines (e.g. an empty line at the end of the
# file) are skipped instead of raising json.JSONDecodeError.
with open(tests_path, 'r', encoding='utf-8') as file:
    tests = [json.loads(line) for line in file if line.strip()]

In [None]:
# Show how many test entries were loaded.
print(len(tests))

In [None]:
# Inspect the 'closest_scores' entry of the first test.
print(tests[0]['closest_scores'])

In [None]:
# Print the first test whose filter is not a single-clause "and" condition
# (no "and" key, or more than one clause under it), then stop.
for test in tests:
    test_condition = test.get('conditions')
    # .get()'s default only covers a missing key. A test stored with
    # "conditions": null yields None, on which the `in` check below would
    # raise TypeError, so treat it as an empty condition instead.
    if test_condition is None:
        test_condition = {}
    if "and" not in test_condition or len(test_condition["and"]) > 1:
        print(test_condition)
        break

In [None]:
# Inspect the 'closest_ids' entry of the first test.
print(tests[0]['closest_ids'])

Check whether the tests contain `None` values; if they do, they need to be processed to replace `None` with `'None'`.

In [None]:
def contains_none(value):
    """
    Tell whether None occurs anywhere inside the given value.

    The value itself is checked first; dictionaries are searched through
    their values (keys are not inspected) and lists through their items,
    at any nesting depth. Every other type counts as "no None here".
    """
    # Explicit work list instead of recursion: each entry is a value that
    # still has to be examined.
    pending = [value]
    while pending:
        current = pending.pop()
        if current is None:
            return True
        if isinstance(current, dict):
            pending.extend(current.values())
        elif isinstance(current, list):
            pending.extend(current)
    return False

def check_tests_for_none(tests):
    """
    Report whether any test's 'conditions' value is, or contains, None.

    A test without a 'conditions' key is treated as having empty conditions.
    Evaluation stops at the first test for which contains_none() is True.
    """
    return any(contains_none(test.get('conditions', {})) for test in tests)


# True means at least one test has a None somewhere in its 'conditions'.
print(check_tests_for_none(tests))

## Create a no-filters dataset

In [None]:
# Opt-in switch: when True, every test's 'conditions' value is replaced with
# None (serialised as JSON null) and tests.jsonl is overwritten in place.
remove_conditions = False
if remove_conditions:
    for test in tests:
        # The key is kept; only its value is cleared.
        test['conditions'] = None

    with open(tests_path, 'w') as file:
        for test in tests:
            file.write(json.dumps(test) + '\n')

    # The message states what actually happened: the key still exists, its
    # value is None.
    print("Modification complete. 'conditions' set to None for every test and original file overwritten.")

## Remove redundant queries with low selectivity

In [None]:
# Show how many tests are currently loaded.
len(tests)

In [None]:
# Load precomputed selectivity ratios for hnm.
# The path is a single plain literal (the former f-string had no
# placeholders) and the encoding is pinned so the read does not depend on
# the platform default.
with open("./results/hnm/10_2_2024/benchmark_hnm_ratios.json", 'r', encoding='utf-8') as file:
    hnm_ratios = json.load(file)

In [None]:
def visualize_ratios(ratios, save_path='images/hnm_query_selectivity.pdf'):
    """
    Plot a histogram of query selectivity ratios and optionally save it.

    The ratios are counted into 10 equal-width bins covering [0.0, 0.5];
    values outside that range are not counted by np.histogram and therefore
    do not appear in the plot.

    :param ratios: A sequence of numeric ratios to plot
    :param save_path: Where to write the figure, always in PDF format.
        Defaults to the path previously hard-coded here, so existing calls
        behave as before (and keep overwriting that one file). Pass a
        different path to keep several plots, or None to only display the
        figure.
    :return: None
    """
    import os  # local import: only needed to create the output directory

    bins = np.linspace(0.0, 0.5, num=11)  # 11 edges for 10 bins

    hist, _ = np.histogram(ratios, bins)

    plt.figure(figsize=(10, 6))
    # One bar per bin, anchored at the bin's left edge; 0.05 is the bin width.
    plt.bar(bins[:-1], hist, width=0.05, align='edge', edgecolor='black')

    plt.xlim(0, 0.5)

    plt.xlabel('% of Data Left', fontsize=16)
    plt.ylabel('Query Count', fontsize=16)
    plt.title('Number of Queries With Given Selectivity', fontsize=16)

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.tight_layout()

    if save_path is not None:
        # savefig does not create missing directories, so make sure the
        # target directory exists before writing.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        plt.savefig(save_path, format='pdf')
    plt.show()

In [None]:
# Plot the precomputed ratios; the call also saves the figure to
# images/hnm_query_selectivity.pdf.
visualize_ratios(hnm_ratios)

In [None]:
# Opt-in switch: when True, choose the test indexes for a reduced set of
# about 2000 queries: every query with ratio > 0.13, plus the first queries
# with ratio <= 0.13 (in their original order) needed to fill the remainder.
reduce_hnm_dataset = False
if reduce_hnm_dataset:
    np_hnm_ratios = np.array(hnm_ratios)

    hnm_ratios_above = np.where(np_hnm_ratios > 0.13)[0]
    hnm_ratios_bellow = np.where(np_hnm_ratios <= 0.13)[0]

    # Clamp at 0: if more than 2000 queries are above the threshold, a
    # negative count would turn the slice below into "all but the last k"
    # instead of "take none".
    take_bellow = max(0, 2000 - hnm_ratios_above.size)
    hnm_ratios_bellow_to_2k = hnm_ratios_bellow[:take_bellow]
    # Resulting order: the selected low-ratio indexes first, then the
    # high-ratio ones.
    hnm2k_ratios_indexes = np.concatenate((hnm_ratios_bellow_to_2k, hnm_ratios_above), axis=0)


In [None]:
if reduce_hnm_dataset:
    # Pick the selected tests by index through a NumPy array, then turn the
    # selection back into a plain Python list.
    np_tests = np.array(tests)
    np_tests_2k = np_tests[hnm2k_ratios_indexes]
    tests_2k = np_tests_2k.tolist()

    # Overwrite tests.jsonl with the reduced set, one JSON object per line.
    with open(tests_path, 'w') as file:
        file.writelines(json.dumps(test) + '\n' for test in tests_2k)

In [None]:
# visualize_ratios(np_hnm_ratios[hnm2k_ratios_indexes])

In [None]:
# Use the loaded tests as the working set for the cells below.
# NOTE(review): this rebinds tests_2k unconditionally, also when
# reduce_hnm_dataset is True - presumably tests.jsonl on disk is already
# the reduced set at this point; confirm.
tests_2k = tests

In [None]:
def apply_condition(payloads, condition):
    """
    Filter payloads by an 'and' / 'or' condition of exact-match clauses.

    Each clause is a dict mapping a payload key to {'match': {'value': ...}}.
    Under 'and' every clause entry must equal the payload's value for that
    key; under 'or' at least one must. A key missing from the payload is
    compared as None. If both 'and' and 'or' are present only 'and' is used;
    if neither is present nothing matches.

    :param payloads: A list of payload dicts
    :param condition: A dict with an 'and' or an 'or' list of clauses
    :return: A tuple (matching payloads, their list positions as strings)
    """
    if 'and' in condition:
        combine, clauses = all, condition['and']
    elif 'or' in condition:
        combine, clauses = any, condition['or']
    else:
        return [], []

    matched = []
    matched_ids = []
    for index, payload in enumerate(payloads):
        # Lazy generator so all()/any() can stop at the first deciding clause.
        checks = (
            payload.get(field) == spec['match']['value']
            for clause in clauses
            for field, spec in clause.items()
        )
        if combine(checks):
            matched.append(payload)
            matched_ids.append(str(index))

    return matched, matched_ids

In [None]:
# For every test, compute the fraction of payloads that pass its condition.
ratios_2k = []
total_payloads = len(payloads)

for test in tests_2k:
    filtered_payloads, _ = apply_condition(payloads, test['conditions'])
    ratios_2k.append(len(filtered_payloads) / total_payloads)

In [None]:
# Plot the recomputed ratios. The call saves to the same
# images/hnm_query_selectivity.pdf as the earlier plot, replacing it.
visualize_ratios(ratios_2k)

In [None]:
# Count the tests whose condition keeps more than 10% of the payloads.
above_10_percent = np.array(ratios_2k) > 0.10
above_10_percent.sum()