# Similarity Comparison Function Section

### Generic Similarity Comparison Function + Lock Version

In [None]:
# Generic Similarity Comparison Function (comparison tuples in a list for comparison!)

def generic_similarity_comparison_function(embedding_model_function, comparison_tuple_in_list, sort_output = 0):
    """Score every (text_a, text_b) pair by the cosine similarity of its embeddings.

    Args:
        embedding_model_function: Callable that receives the two-item list
            ``[text_a, text_b]`` and returns something indexable, where index 0
            and index 1 are the embeddings of the two texts.
        comparison_tuple_in_list: Iterable of pairs to compare. Each pair is
            used unchanged as a key of the result, so it must be hashable.
        sort_output: -1 orders the result by score descending, 1 ascending;
            any other value (default 0) keeps the input order.

    Returns:
        dict mapping each pair to whatever ``cosine_sim`` returns for it.
    """
    scores = {}
    for pair in comparison_tuple_in_list:
        # Both texts go through the model in one call. A model that only takes
        # a single string would instead need:
        # [embedding_model_function(pair[0]), embedding_model_function(pair[1])]
        embeddings = embedding_model_function([pair[0], pair[1]])
        scores[pair] = cosine_sim(embeddings[0], embeddings[1])

    # -1 is descending, 1 is ascending, anything else leaves the order alone.
    if sort_output == -1 or sort_output == 1:
        ranked = sorted(scores.items(), key=lambda entry: entry[1], reverse=(sort_output == -1))
        scores = dict(ranked)
    return scores
        

In [None]:
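# functools.partial with embedding_model_function bound by keyword breaks positional calls: the first positional argument collides with the already-bound embedding_model_function (TypeError: multiple values for the argument), so partial is not the preferred approach here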

## Error is like:
# generic_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
# generic_similarity_comparison_locked_model_MiniLM_L6_v2([("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=1)

## Fix: either pass the remaining arguments by keyword, or change the position of the embedding callback parameter. Passing by keyword looks like:
# generic_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
# generic_similarity_comparison_locked_model_MiniLM_L6_v2(comparison_tuple_in_list=[("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=1)
## see the "comparison_tuple_in_list=" specified, for example! ^

"""
from functools import partial

def lock_generic_similarity_comparison_function(embedding_model_function):
    return partial(generic_similarity_comparison_function, embedding_model_function=embedding_model_function)
"""

## closure approach instead of functools.partial, so the pairs can still be passed positionally!

def lock_generic_similarity_comparison_function(embedding_model_function):
    """Return a pair comparer with ``embedding_model_function`` already bound.

    The returned callable takes ``(comparison_tuple_in_list, sort_output=0)``
    and forwards both, together with the bound embedding function, to
    ``generic_similarity_comparison_function``.
    """
    def locked_generic_similarity_comparison(comparison_tuple_in_list, sort_output = 0):
        return generic_similarity_comparison_function(
            embedding_model_function=embedding_model_function,
            comparison_tuple_in_list=comparison_tuple_in_list,
            sort_output=sort_output,
        )

    return locked_generic_similarity_comparison

### ONE Category Similarity Comparison Function + Lock Version

In [None]:
# ONE Category Similarity Comparison Function (compare to each string in a list!)
def single_category_similarity_comparison_function(embedding_model_function, category_single, texts, sort_output = 0):
    """Compare every text against one category.

    Args:
        embedding_model_function: Embedding callable, forwarded unchanged to
            ``generic_similarity_comparison_function``.
        category_single: The category every text is compared with.
        texts: A list of texts; any value that is not exactly a ``list`` is
            wrapped into a one-element list.
        sort_output: Forwarded unchanged (-1 descending, 0 no sort, 1 ascending).

    Returns:
        The dict produced by ``generic_similarity_comparison_function`` for
        the pairs ``(text, category_single)``.
    """
    if type(texts) is not list:
        texts = [texts]
    # The text comes first and the category second in every pair.
    text_category_pairs = [(text, category_single) for text in texts]
    return generic_similarity_comparison_function(
        embedding_model_function=embedding_model_function,
        comparison_tuple_in_list=text_category_pairs,
        sort_output=sort_output,
    )

In [None]:
def lock_single_category_similarity_comparison_function(embedding_model_function):
    """Return a single-category comparer with ``embedding_model_function`` already bound.

    The returned callable takes ``(category_single, texts, sort_output=0)`` and
    forwards everything to ``single_category_similarity_comparison_function``.
    """
    # sort_output keeps its default of 0 so the locked function can be called without it.
    return lambda category_single, texts, sort_output=0: single_category_similarity_comparison_function(
        embedding_model_function=embedding_model_function,
        category_single=category_single,
        texts=texts,
        sort_output=sort_output,
    )

### Categories Similarity Comparison Function + Lock Version

In [None]:
# Categories Similarity Comparison Function (compare to each string in a list!)

# sort by -1 is descending, 0 is no sort, 1 is ascending!
# default is no sort, 0
    
def categories_similarity_comparison_function(embedding_model_function, categories, texts, sort_output = 0):
    """Compare every text against each of several categories.

    Args:
        embedding_model_function: Embedding callable, forwarded unchanged.
        categories: A list of categories; any value that is not exactly a
            ``list`` is wrapped into a one-element list.
        texts: Forwarded to ``single_category_similarity_comparison_function``.
        sort_output: Forwarded unchanged (-1 descending, 0 no sort, 1 ascending);
            it orders the entries inside each category, not the categories.

    Returns:
        dict mapping each category to the result dict that
        ``single_category_similarity_comparison_function`` returns for it.
    """
    if type(categories) is not list:
        categories = [categories]
    return {
        category: single_category_similarity_comparison_function(
            embedding_model_function=embedding_model_function,
            category_single=category,
            texts=texts,
            sort_output=sort_output,
        )
        for category in categories
    }

In [None]:
"""
from functools import partial

def lock_categories_similarity_comparison_function(embedding_model_function):
    return partial(categories_similarity_comparison_function, embedding_model_function=embedding_model_function)
"""

# Closure approach: the inner call is purely positional, but sort_output=0 must stay as a default so that calling the locked function without it does not error!
def lock_categories_similarity_comparison_function(embedding_model_function):
    """Return a multi-category comparer with ``embedding_model_function`` already bound.

    The returned callable takes ``(categories, texts, sort_output=0)`` and
    forwards everything to ``categories_similarity_comparison_function``.
    """
    def locked_categories_similarity_comparison(categories, texts, sort_output=0):
        return categories_similarity_comparison_function(embedding_model_function, categories, texts, sort_output)

    return locked_categories_similarity_comparison

#### Comparison Result Display

In [None]:
# Categories Similarity Result Display

# Very specific use case only for "single_category_similarity_comparison_function" which returns a dict of compare_key and result_value
# Not usable on "categories_similarity_comparison_function", since that one returns a dict of dicts!

def category_similarity_result_display(category_result_dict, sort_display = 0):
    """Print the scores of a single-category comparison result.

    Only meant for the flat ``{(text, category): score}`` dict returned by
    ``single_category_similarity_comparison_function``; it does not work on
    the dict-of-dicts returned by ``categories_similarity_comparison_function``.

    Args:
        category_result_dict: Mapping of ``(text, category)`` to a score.
            The category shown in the header is taken from the first key.
        sort_display: -1 prints highest score first, 1 prints lowest score
            first, any other value (default 0) keeps the dict order.

    Returns:
        None. Output goes to stdout; an empty dict prints nothing.
    """
    if not category_result_dict:
        # No key to read the category name from, so there is nothing to show.
        return

    first_key = next(iter(category_result_dict))
    print(f"Category: {first_key[1]}")
    print("Similarity Level:")

    entries = category_result_dict.items()
    if sort_display == -1 or sort_display == 1:
        # Ascending (1) must not be reversed; only descending (-1) is.
        entries = sorted(entries, key=lambda entry: entry[1], reverse=(sort_display == -1))

    for comparison_items_tuple, comparison_result in entries:
        print(f"{comparison_items_tuple[0]:30.30} /-/ {comparison_items_tuple[1]:30.30} : {comparison_result:.5}")

In [None]:
# Categories Similarity Result Display
def categories_similarity_result_display(categories_result_dict, sort_display = 0):
    """Print the scores of a multi-category comparison result, one section per category.

    Args:
        categories_result_dict: Mapping of category to a
            ``{(text, category): score}`` dict, as returned by
            ``categories_similarity_comparison_function``.
        sort_display: -1 prints highest score first, 1 prints lowest score
            first, any other value (default 0) keeps each inner dict's order.
            Ordering is applied inside each category; the categories
            themselves are printed in dict order.

    Returns:
        None. Output goes to stdout.
    """
    for category, category_similarity_results_dict in categories_result_dict.items():
        print(f"Category: {category}")
        print("Similarity Level:")

        entries = category_similarity_results_dict.items()
        if sort_display == -1 or sort_display == 1:
            # Ascending (1) must not be reversed; only descending (-1) is.
            entries = sorted(entries, key=lambda entry: entry[1], reverse=(sort_display == -1))

        for comparison_items_tuple, comparison_result in entries:
            print(f"{comparison_items_tuple[0]:30.30} /-/ {comparison_items_tuple[1]:30.30} : {comparison_result:.5}")

##### Example Demo

In [None]:
# Sort -1, descending
generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, [("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=-1)

{('i like people', 'people love me'): 0.6048486815689496,
 ('the world is green', 'the ocean is blue'): 0.4271523653033571,
 ('hi there', 'the world is bad'): 0.1287177563239086}

In [None]:
# Sort 0, no sort
generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, [("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=0)

{('hi there', 'the world is bad'): 0.1287177563239086,
 ('i like people', 'people love me'): 0.6048486815689496,
 ('the world is green', 'the ocean is blue'): 0.4271523653033571}

In [None]:
# Sort 1, ascending
generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, [("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=1)

{('hi there', 'the world is bad'): 0.1287177563239086,
 ('the world is green', 'the ocean is blue'): 0.4271523653033571,
 ('i like people', 'people love me'): 0.6048486815689496}

In [None]:
# Bind the embedding function once; the locked comparer then only needs the pairs.
# sort_output is omitted here, so it falls back to its default 0 (no sorting).
# get_sentence_embedding_MiniLM_L6_v2 is not defined in this section - presumably an earlier cell provides it.
generic_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
generic_similarity_comparison_locked_model_MiniLM_L6_v2([("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")])

{('hi there', 'the world is bad'): 0.1287177563239086,
 ('i like people', 'people love me'): 0.6048486815689496,
 ('the world is green', 'the ocean is blue'): 0.4271523653033571}

In [None]:
# Eg. category food, and texts are foods
single_category_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, "food", ["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"], sort_output=-1)

{('steak', 'food'): 0.5003130997095016,
 ('sirloin steak with mushroom sauce', 'food'): 0.2937494543044752,
 ('laksa bowl', 'food'): 0.259120507793398,
 ('curry prata', 'food'): 0.2585698555536739,
 ('laksa', 'food'): 0.21327309103382694,
 ('prata', 'food'): 0.21317178691606403}

In [None]:
# Same comparison as the unlocked call above, but through the locked version:
# the embedding function is bound once, then category, texts and sort_output (-1 = descending) are passed per call.
single_category_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_single_category_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
single_category_similarity_comparison_locked_model_MiniLM_L6_v2("food", ["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"], sort_output=-1)

{('steak', 'food'): 0.5003130997095016,
 ('sirloin steak with mushroom sauce', 'food'): 0.2937494543044752,
 ('laksa bowl', 'food'): 0.259120507793398,
 ('curry prata', 'food'): 0.2585698555536739,
 ('laksa', 'food'): 0.21327309103382694,
 ('prata', 'food'): 0.21317178691606403}

In [None]:
# Eg. Testing multi-categories
categories_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, ["food", "asian food", "western food"], ["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"], sort_output=-1)

{'food': {('steak', 'food'): 0.5003130997095016,
  ('sirloin steak with mushroom sauce', 'food'): 0.2937494543044752,
  ('laksa bowl', 'food'): 0.259120507793398,
  ('curry prata', 'food'): 0.2585698555536739,
  ('laksa', 'food'): 0.21327309103382694,
  ('prata', 'food'): 0.21317178691606403},
 'asian food': {('steak', 'asian food'): 0.30798892917323445,
  ('curry prata', 'asian food'): 0.30237662711869023,
  ('sirloin steak with mushroom sauce', 'asian food'): 0.3007048767274798,
  ('laksa bowl', 'asian food'): 0.22997642560937964,
  ('laksa', 'asian food'): 0.20722211920235642,
  ('prata', 'asian food'): 0.18665660738247442},
 'western food': {('steak', 'western food'): 0.39804313466154906,
  ('sirloin steak with mushroom sauce', 'western food'): 0.3040080630479595,
  ('curry prata', 'western food'): 0.288779802737338,
  ('laksa', 'western food'): 0.24079780320414573,
  ('laksa bowl', 'western food'): 0.23389015706789013,
  ('prata', 'western food'): 0.15397132982047892}}

In [None]:
# Locked multi-category comparison: categories are passed positionally, texts by keyword.
# sort_output is left out, so the default 0 applies and each category keeps the input text order.
categories_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_categories_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
categories_similarity_comparison_locked_model_MiniLM_L6_v2(["food", "asian food", "western food"], texts=["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"])

{'food': {('laksa', 'food'): 0.21327309103382694,
  ('laksa bowl', 'food'): 0.259120507793398,
  ('prata', 'food'): 0.21317178691606403,
  ('curry prata', 'food'): 0.2585698555536739,
  ('steak', 'food'): 0.5003130997095016,
  ('sirloin steak with mushroom sauce', 'food'): 0.2937494543044752},
 'asian food': {('laksa', 'asian food'): 0.20722211920235642,
  ('laksa bowl', 'asian food'): 0.22997642560937964,
  ('prata', 'asian food'): 0.18665660738247442,
  ('curry prata', 'asian food'): 0.30237662711869023,
  ('steak', 'asian food'): 0.30798892917323445,
  ('sirloin steak with mushroom sauce', 'asian food'): 0.3007048767274798},
 'western food': {('laksa', 'western food'): 0.24079780320414573,
  ('laksa bowl', 'western food'): 0.23389015706789013,
  ('prata', 'western food'): 0.15397132982047892,
  ('curry prata', 'western food'): 0.288779802737338,
  ('steak', 'western food'): 0.39804313466154906,
  ('sirloin steak with mushroom sauce', 'western food'): 0.3040080630479595}}

In [None]:
# Very specific use case only for "single_category_similarity_comparison_function" which returns a dict of compare_key and result_value
# Not usable on "categories_similarity_comparison_function", since that one returns a dict of dicts!
category_similarity_result_display(single_category_similarity_comparison_locked_model_MiniLM_L6_v2("food", ["super prata", "caifan", "ramen"]))

Category: food
Similarity Level:
super prata                    /-/ food                           : 0.16437
caifan                         /-/ food                           : 0.21156
ramen                          /-/ food                           : 0.43923


In [None]:
# Eg. Testing multi-categories DISPLAY
categories_similarity_result_display(categories_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, ["food", "asian food", "western food"], ["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"]), sort_display=-1)

Category: food
Similarity Level:
steak                          /-/ food                           : 0.50031
sirloin steak with mushroom sa /-/ food                           : 0.29375
laksa bowl                     /-/ food                           : 0.25912
curry prata                    /-/ food                           : 0.25857
laksa                          /-/ food                           : 0.21327
prata                          /-/ food                           : 0.21317
Category: asian food
Similarity Level:
steak                          /-/ asian food                     : 0.30799
curry prata                    /-/ asian food                     : 0.30238
sirloin steak with mushroom sa /-/ asian food                     : 0.3007
laksa bowl                     /-/ asian food                     : 0.22998
laksa                          /-/ asian food                     : 0.20722
prata                          /-/ asian food                     : 0.18666
Category: western

### Categories with "sub-categories"/"synonyms"

#### Both Comparisons and Display (General Category with "sub-categories"/"synonyms")

##### Comparisons (General Category with "sub-categories"/"synonyms" variation)

In [None]:
## Categories with sub-categories is in format of dictionary where general_category-key:sub-"categories"_in_list(actually more like "synonyms" of the general categories)-value
## returns a dictionary of general_category-key:{sub-"category"(general category "synonyms")-key:{(xyz_comparison, sub-"category"/"synonym"):cosine_similarity}}

def categories_wsub_similarity_comparison_function(embedding_model_function, categories_wsub_dict, texts, sort_output=0):
    """Compare texts against general categories and their sub-categories/"synonyms".

    Args:
        embedding_model_function: Embedding callable, forwarded unchanged.
        categories_wsub_dict: Mapping of general category to a list of its
            sub-categories. The general category itself is prepended to that
            list, so it is compared as well.
        texts: Forwarded to ``categories_similarity_comparison_function``.
        sort_output: Forwarded unchanged (-1 descending, 0 no sort, 1 ascending).

    Returns:
        dict mapping each general category to the dict that
        ``categories_similarity_comparison_function`` returns for
        ``[general_category] + sub_categories``.
    """
    return {
        general_category: categories_similarity_comparison_function(
            embedding_model_function=embedding_model_function,
            categories=([general_category] + synonyms),
            texts=texts,
            sort_output=sort_output,
        )
        for general_category, synonyms in categories_wsub_dict.items()
    }

##### Comparisons - Lock (General Category with "sub-categories"/"synonyms" variation)

In [None]:
def lock_categories_wsub_similarity_comparison_function(embedding_model_function):
    """Return a category-with-sub-categories comparer with ``embedding_model_function`` already bound.

    The returned callable takes ``(categories_wsub_dict, texts, sort_output=0)``
    and forwards everything to ``categories_wsub_similarity_comparison_function``.
    """
    def locked_categories_wsub_similarity_comparison(categories_wsub_dict, texts, sort_output=0):
        return categories_wsub_similarity_comparison_function(
            embedding_model_function=embedding_model_function,
            categories_wsub_dict=categories_wsub_dict,
            texts=texts,
            sort_output=sort_output,
        )

    return locked_categories_wsub_similarity_comparison

##### Display (General Category with "sub-categories"/"synonyms" variation)

In [None]:
def categories_wsub_similarity_result_display(categories_wsub_result_dict, sort_display = 0):
    """Print a category-with-sub-categories result, one section per general category.

    For every general category a ``General Category:`` header is printed, the
    nested per-sub-category result is handed to
    ``categories_similarity_result_display`` with the same ``sort_display``,
    and a blank line closes the section.
    """
    for general_name in categories_wsub_result_dict:
        print(f"General Category: {general_name}")
        per_subcategory_results = categories_wsub_result_dict[general_name]
        categories_similarity_result_display(per_subcategory_results, sort_display=sort_display)
        print()

#### Display, similar to but variation [Top xxx (Count) and Limit yyy (Value)]

In [None]:
## Top xxx and Limit yyy, display function different mainly

# Prep
def categories_similarity_result_display_top_limit(categories_result_dict, top_many = 5, limit_value = 0.5):
    """Print, per key, the best-scoring entries up to a count and a score floor.

    Args:
        categories_result_dict: Mapping of a label to a
            ``{(text, category): score}`` dict.
        top_many: Printing stops once this many entries were printed.
        limit_value: Printing stops at the first entry scoring below this.

    Returns:
        None. Output goes to stdout.
    """
    for category, scores_by_pair in categories_result_dict.items():
        print(f"Sub-Categories: {category}")
        print(f"Similarity Level Of Top {top_many} (Limit={limit_value}):")
        # Always best-first; ascending and unsorted variants are not offered here.
        best_first = sorted(scores_by_pair.items(), key=lambda entry: entry[1], reverse=True)
        for already_printed, (pair, score) in enumerate(best_first):
            if already_printed == top_many or score < limit_value:
                break
            print(f"{pair[0]:30.30} /-/ {pair[1]:30.30} : {score:.5}")
# Actual using function
def categories_wsub_similarity_result_display_top_limit(categories_wsub_result_dict, top_many = 5, limit_value = 0.5):
    """Print the best entries per general category, pooled across its sub-categories.

    For every general category the per-sub-category score dicts are merged
    into one dict, keyed in the display by the tuple of all sub-category names,
    and passed to ``categories_similarity_result_display_top_limit`` with the
    same ``top_many`` and ``limit_value``.

    Returns:
        None. Output goes to stdout.
    """
    for general_name, per_subcategory_results in categories_wsub_result_dict.items():
        print(f"General Category: {general_name}")
        subcategory_names = tuple(per_subcategory_results.keys())
        # Pool all (text, sub-category) scores of this general category; on a
        # repeated key the later sub-category's score wins.
        pooled_scores = {}
        for sub_scores in per_subcategory_results.values():
            pooled_scores.update(sub_scores)
        categories_similarity_result_display_top_limit({subcategory_names: pooled_scores}, top_many=top_many, limit_value=limit_value)
        print()

##### Example Demo

In [None]:
# Raw nested result: general category -> sub-category -> {(text, sub-category): score}.
# cat_wsub_demo and hlp_biz_demo_cut_third are not defined in this section - presumably earlier cells provide them.
categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo_cut_third)

{'Passage provided for business purpose': {'Passage provided for business purpose': {('Employees receive free flight benefits ',
    'Passage provided for business purpose'): 0.14779233235003653,
   ('to return to home country once a year and',
    'Passage provided for business purpose'): 0.23613265901235564,
   (' free flight benefits for business trips',
    'Passage provided for business purpose'): 0.29568490649739965},
  'Air tickets for business trip': {('Employees receive free flight benefits ',
    'Air tickets for business trip'): 0.4639808954117372,
   ('to return to home country once a year and',
    'Air tickets for business trip'): 0.14540735152911588,
   (' free flight benefits for business trips',
    'Air tickets for business trip'): 0.6598189842771236},
  'Air tickets for work purposes': {('Employees receive free flight benefits ',
    'Air tickets for work purposes'): 0.5376104533584114,
   ('to return to home country once a year and',
    'Air tickets for work purpos

In [None]:
# Same comparison as above, printed per general category and sub-category with sort_display=-1.
categories_wsub_similarity_result_display(categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo_cut_third), sort_display=-1)

General Category: Passage provided for business purpose
Category: Passage provided for business purpose
Similarity Level:
 free flight benefits for busi /-/ Passage provided for business  : 0.29568
to return to home country once /-/ Passage provided for business  : 0.23613
Employees receive free flight  /-/ Passage provided for business  : 0.14779
Category: Air tickets for business trip
Similarity Level:
 free flight benefits for busi /-/ Air tickets for business trip  : 0.65982
Employees receive free flight  /-/ Air tickets for business trip  : 0.46398
to return to home country once /-/ Air tickets for business trip  : 0.14541
Category: Air tickets for work purposes
Similarity Level:
Employees receive free flight  /-/ Air tickets for work purposes  : 0.53761
 free flight benefits for busi /-/ Air tickets for work purposes  : 0.49501
to return to home country once /-/ Air tickets for work purposes  : 0.17532
Category: Flight for business trip
Similarity Level:
 free flight benefits for

In [None]:
# Pooled display: per general category, at most 3 entries, stopping at the first score below 0.55.
categories_wsub_similarity_result_display_top_limit(categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo_cut_third), top_many=3, limit_value=0.55)

General Category: Passage provided for business purpose
Sub-Categories: ('Passage provided for business purpose', 'Air tickets for business trip', 'Air tickets for work purposes', 'Flight for business trip', 'Flight for work purposes', 'Travel incidental to employment')
Similarity Level Of Top 3 (Limit=0.55):
 free flight benefits for busi /-/ Flight for business trip       : 0.75365
 free flight benefits for busi /-/ Air tickets for business trip  : 0.65982
 free flight benefits for busi /-/ Flight for work purposes       : 0.60962

General Category: Passage provided when taking up employment and upon termination
Sub-Categories: ('Passage provided when taking up employment and upon termination', 'Air tickets back to home country after contract end', 'Air tickets back to home country after termination of employment', 'Air tickets to work in Singapore', 'Flight back to home country after contract end', 'Flight back to home country after termination of employment', 'Flight to work in Sin

### Categories Resorting (To Different Format) Function

#### Sorting and Organising Within a Category (Refers To "sub-categories"/"synonyms")

In [None]:
def categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict, get_inner_list = False, sort_within_cat=0, top_many_wsub = 3, limit_value = 0.5):
    """Flatten each general category's sub-category results, then filter, sort and trim them.

    Args:
        categories_wsub_similarity_comparison_result_dict: Mapping of general
            category -> sub-category -> ``{(text, sub-category): score}``.
        get_inner_list: If truthy, each category maps to a list of
            ``((text, sub-category), score)`` tuples; otherwise to a dict
            ``{(text, sub-category): score}``.
        sort_within_cat: -1 keeps scores ``>= limit_value`` and sorts
            descending; 1 keeps scores ``<= limit_value`` and sorts ascending;
            0 keeps everything in encounter order. Any other value keeps
            nothing.
        top_many_wsub: Maximum number of entries kept per general category.
        limit_value: Score threshold. A negative value is replaced by 0 for
            descending, 1 for ascending and ``None`` for no sort.

    Returns:
        dict mapping each general category to its trimmed list or dict.
    """
    # A negative limit is swapped for a fixed bound that depends on the sort direction.
    if limit_value < 0:
        if sort_within_cat == -1:
            limit_value = 0
        elif sort_within_cat == 0:
            limit_value = None
        elif sort_within_cat == 1:
            limit_value = 1

    def passes_limit(pred):
        # No sort keeps everything; the sorted modes filter against the limit.
        if sort_within_cat == 0:
            return True
        if sort_within_cat == -1:
            return pred >= limit_value
        if sort_within_cat == 1:
            return pred <= limit_value
        return False

    resorted = {}
    for category, sub_syn_cat_dict in categories_wsub_similarity_comparison_result_dict.items():
        kept = [
            (pair, pred)
            for pair_scores in sub_syn_cat_dict.values()
            for pair, pred in pair_scores.items()
            if passes_limit(pred)
        ]
        if not get_inner_list:
            # Dict mode: a repeated pair keeps its first position and its last score.
            kept = list(dict(kept).items())

        # Sorting and trimming happen within a category only.
        if sort_within_cat == -1 or sort_within_cat == 1:
            kept = sorted(kept, key=lambda item: item[1], reverse=(sort_within_cat == -1))
        if sort_within_cat == -1 or sort_within_cat == 0 or sort_within_cat == 1:
            kept = kept[:top_many_wsub]

        resorted[category] = kept if get_inner_list else dict(kept)
    return resorted

#### Sorting and Organising Between Categories (Refers To Different Categories In General) - Most to Least

In [None]:
## No Sorting Order, all made for the cleaning function, but the code is there but the argument is removed and relevant code portion is commented out, after all, for no sort, how to determine top xxx category, the top or bottom!!

def categories_wsub_similarity_comparison_resort_cleaning_function(resorted_categories_wsub_similarity_comparison_dict, get_inner_list = False, get_list = False, top_many_cat = 3):
    """Rank the resorted entries across all categories, best score first.

    Args:
        resorted_categories_wsub_similarity_comparison_dict: Output of
            ``categories_wsub_similarity_comparison_resort_function``: each
            category maps to a list of ``((text, sub-category), score)``
            tuples or to a dict ``{(text, sub-category): score}``.
        get_inner_list: Must match the shape above - truthy when the values
            are lists of tuples, falsy when they are dicts.
        get_list: If truthy, return the ranked list; otherwise return a dict.
        top_many_cat: Number of best-scoring entries kept overall.

    Returns:
        With ``get_list``: a list of ``(category, ((text, sub-category), score))``
        sorted by score descending and cut to ``top_many_cat`` entries.
        Otherwise: a dict built from that list, mapping each category to its
        best-scoring kept entry, in order of first appearance.
    """
    resultant_cleaned_list = []
    for category, scored_pairs in resorted_categories_wsub_similarity_comparison_dict.items():
        pair_iter = scored_pairs if get_inner_list else scored_pairs.items()
        for comparison_tuple, pred in pair_iter:
            resultant_cleaned_list.append((category, (comparison_tuple, pred)))

    # The sort_cats argument is gone: cleaning is about the top many, so there
    # is no point offering a choice here - it is restricted to most-to-least.
    # The removed option looked like this:
    # if sort_cats == -1:
    #     sorted_resultant_cleaned_list = list(sorted(resultant_cleaned_list, key=lambda list_element: list_element[1][1], reverse=True))
    # if sort_cats == 0:
    #     sorted_resultant_cleaned_list = resultant_cleaned_list
    # if sort_cats == 1:
    #     sorted_resultant_cleaned_list = list(sorted(resultant_cleaned_list, key=lambda list_element: list_element[1][1], reverse=False))
    sorted_resultant_cleaned_list = sorted(resultant_cleaned_list, key=lambda list_element: list_element[1][1], reverse=True)[:top_many_cat]
    if get_list:
        return sorted_resultant_cleaned_list

    # A category can occur several times in the ranked list when the previous
    # step kept more than one entry per category (top_many_wsub > 1). A plain
    # dict(...) would let the later, lower-scoring entry overwrite the best one,
    # so only the first (best) entry per category is kept. With one entry per
    # category the result is the same as dict(...).
    sorted_resultant_cleaned_list_dict = {}
    for category, tuple_pair in sorted_resultant_cleaned_list:
        sorted_resultant_cleaned_list_dict.setdefault(category, tuple_pair)
    return sorted_resultant_cleaned_list_dict

#### New Resorted Format Display

In [None]:
def cleaned_categories_wsub_similarity_comparison_resorted_result_display(cleaned_resorted_compare_result, get_list):
    """Print a cleaned cross-category ranking, one short section per entry.

    Args:
        cleaned_resorted_compare_result: Either an iterable of
            ``(category, ((text, sub-category), score))`` tuples, or a dict
            mapping category to ``((text, sub-category), score)``.
        get_list: Truthy when the result is the iterable of tuples, falsy
            when it is the dict.

    Returns:
        None. Output goes to stdout.
    """
    if get_list:
        labelled_entries = cleaned_resorted_compare_result
    else:
        labelled_entries = cleaned_resorted_compare_result.items()

    for label, scored_pair in labelled_entries:
        pair = scored_pair[0]
        score = scored_pair[1]
        print(f"Category: {label}")
        print(f"{pair[0]:30.30} /-/ {pair[1]:30.30}: {score:.5}")
        print()

##### Example Demo

In [None]:
# Arguments to use for function
get_inner_list = True  # each category maps to a list of ((text, sub-category), score) tuples
sort_within_cat = 1  # ascending: lowest scores first
top_many_wsub = 3  # keep at most 3 entries per general category
limit_value = 0.07  # with the ascending sort, only scores <= 0.07 are kept

# cat_wsub_demo and splitted_sentence_text are not defined in this section - presumably earlier cells provide them.
categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict=categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, categories_wsub_dict=cat_wsub_demo, texts=splitted_sentence_text), get_inner_list=get_inner_list, sort_within_cat=sort_within_cat, top_many_wsub=top_many_wsub, limit_value=limit_value)

{'Passage provided for business purpose': [(('a year and',
    'Air tickets for work purposes'),
   0.03580521225597774),
  (('a year and', 'Air tickets for business trip'), 0.0504765864001799),
  (('a year and', 'Passage provided for business purpose'),
   0.06552286772996774)],
 'Passage provided when taking up employment and upon termination': [(('a year and',
    'Air tickets to work in Singapore'),
   0.030742087031399204),
  (('a year and',
    'Air tickets back to home country after termination of employment'),
   0.05293060206713229),
  (('a year and',
    'Flight back to home country after termination of employment'),
   0.0552523757358474)],
 'Home leave passage': [(('a year and', 'Air tickets to home country'),
   0.056946677215592184),
  (('a year and', 'Air passage benefit'), 0.06636109112551498)]}

In [None]:
# Arguments to use for function
get_inner_list = True  # resort step returns lists of ((text, sub-category), score) tuples
sort_within_cat = -1  # descending: highest scores first
top_many_wsub = 1  # one entry per category, as the cleaning function expects for its dict output
limit_value = -1  # negative limit: the resort function replaces it with 0 for the descending sort

top_many_cat = 2  # keep the 2 best entries across all categories
get_list = False  # return a dict keyed by category instead of a list

# Pipeline: compare -> resort within each category -> rank across categories.
categories_wsub_similarity_comparison_resort_cleaning_function(resorted_categories_wsub_similarity_comparison_dict=categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict=categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, categories_wsub_dict=cat_wsub_demo, texts=splitted_sentence_text), get_inner_list=get_inner_list, sort_within_cat=sort_within_cat, top_many_wsub=top_many_wsub, limit_value=limit_value), get_inner_list=get_inner_list, get_list=get_list, top_many_cat=top_many_cat)

{'Passage provided for business purpose': (('free flight benefits for business trips',
   'Flight for business trip'),
  0.7536517813135932),
 'Home leave passage': (('to return to home country once',
   'Flight to home country'),
  0.6656597017594097)}

In [None]:
# Arguments to use for function
get_inner_list = True  # resort step returns lists of ((text, sub-category), score) tuples
sort_within_cat = -1  # descending: highest scores first
top_many_wsub = 1  # one entry per category
limit_value = 0.6  # with the descending sort, only scores >= 0.6 are kept

top_many_cat = 3  # keep up to 3 entries across all categories
get_list = True  # cleaning returns a list, and the display is told to expect a list
        
# Pipeline: compare -> resort within each category -> rank across categories -> print.
cleaned_categories_wsub_similarity_comparison_resorted_result_display(categories_wsub_similarity_comparison_resort_cleaning_function(resorted_categories_wsub_similarity_comparison_dict=categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict=categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, categories_wsub_dict=cat_wsub_demo, texts=splitted_sentence_text), get_inner_list=get_inner_list, sort_within_cat=sort_within_cat, top_many_wsub=top_many_wsub, limit_value=limit_value), get_inner_list=get_inner_list, get_list=get_list, top_many_cat=top_many_cat), get_list=get_list)

Category: Passage provided for business purpose
free flight benefits for busin /-/ Flight for business trip      : 0.75365

Category: Home leave passage
to return to home country once /-/ Flight to home country        : 0.66566



In [None]:
# Arguments to use for function
get_inner_list = True  # resort step returns lists of ((text, sub-category), score) tuples
sort_within_cat = -1  # descending: highest scores first
top_many_wsub = 1  # one entry per category
limit_value = 0.5  # lower floor than the previous cell: scores >= 0.5 are kept

top_many_cat = 3  # keep up to 3 entries across all categories
get_list = True  # cleaning returns a list, and the display is told to expect a list
        
# Same pipeline as the previous cell; only limit_value differs.
cleaned_categories_wsub_similarity_comparison_resorted_result_display(categories_wsub_similarity_comparison_resort_cleaning_function(resorted_categories_wsub_similarity_comparison_dict=categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict=categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, categories_wsub_dict=cat_wsub_demo, texts=splitted_sentence_text), get_inner_list=get_inner_list, sort_within_cat=sort_within_cat, top_many_wsub=top_many_wsub, limit_value=limit_value), get_inner_list=get_inner_list, get_list=get_list, top_many_cat=top_many_cat), get_list=get_list)

Category: Passage provided for business purpose
free flight benefits for busin /-/ Flight for business trip      : 0.75365

Category: Home leave passage
to return to home country once /-/ Flight to home country        : 0.66566

Category: Passage provided when taking up employment and upon termination
to return to home country once /-/ Flight back to home country af: 0.59888

