# Overall Useful Functions
- ```get_length(embedding_1d)```
- ```normalise_embedding(embedding_1d)```
- ```get_normalise_embedding(embedding_1d)```
- ```cosine_sim(embedding_1, embedding_2)```
- ```norm_ed_cosine_sim(embedding_1, embedding_2)```
- ```generic_sent_cos_sim(model_emb_func, t1, t2, additional_nesting = False)```

In [None]:
## Overall Useful Functions

import torch
import math
import numpy as np
def get_length(embedding_1d):
    """Return the Euclidean (L2) length of a 1-D embedding.

    Args:
        embedding_1d: Iterable of numeric components (list, tuple, NumPy
            array, 1-D tensor, ...).

    Returns:
        float: Square root of the sum of the squared components; ``0.0`` for
        an empty embedding.
    """
    # Use the builtin ``sum`` on a generator; the previous implementation
    # accumulated into a local named ``sum``, shadowing the builtin.
    return math.sqrt(sum(component ** 2 for component in embedding_1d))
def normalise_embedding(embedding_1d):
    """Scale ``embedding_1d`` in place so that its Euclidean length becomes 1.

    The argument itself is modified and nothing is returned.

    Args:
        embedding_1d: Mutable, indexable sequence of numeric components.
    """
    norm = get_length(embedding_1d)
    # Divide by index so that the caller's container is what gets updated.
    for position in range(len(embedding_1d)):
        embedding_1d[position] /= norm
def get_normalise_embedding(embedding_1d):
    """Return a unit-length copy of ``embedding_1d`` without modifying the input.

    Args:
        embedding_1d: A 1-D ``torch.Tensor`` or any object with a ``copy()``
            method that supports item assignment (NumPy array, list, ...).

    Returns:
        A NumPy array for tensor/array input (otherwise whatever ``copy()``
        returned) whose components are divided by the Euclidean length.
    """
    # isinstance also accepts Tensor subclasses; .cpu() lets tensors that live
    # on another device be converted, since .numpy() only works on CPU tensors.
    if isinstance(embedding_1d, torch.Tensor):
        temp_embedding_1d = embedding_1d.detach().cpu().numpy().copy()
    else:
        temp_embedding_1d = embedding_1d.copy()
    # Writing a quotient back into an integer/bool array truncates it (every
    # component of a normalised vector would become 0), so promote to float.
    if isinstance(temp_embedding_1d, np.ndarray) and temp_embedding_1d.dtype.kind in "iub":
        temp_embedding_1d = temp_embedding_1d.astype(float)
    length = get_length(temp_embedding_1d)
    for i in range(len(temp_embedding_1d)):
        temp_embedding_1d[i] /= length
    return temp_embedding_1d


def cosine_sim(embedding_1, embedding_2):
    """Return the cosine similarity of two embeddings.

    Both inputs are first copied and normalised with
    ``get_normalise_embedding``; the similarity is then the dot product of the
    two unit vectors. Components are paired with ``zip``, so extra components
    of the longer input are ignored.
    """
    unit_1 = get_normalise_embedding(embedding_1)
    unit_2 = get_normalise_embedding(embedding_2)
    dot_product = 0
    for component_1, component_2 in zip(unit_1, unit_2):
        dot_product = dot_product + component_1 * component_2
    return dot_product
def norm_ed_cosine_sim(embedding_1, embedding_2):
    """Return the dot product of two embeddings.

    No normalisation is performed here, so the result equals the cosine
    similarity only when both inputs already have unit length. Components are
    paired with ``zip``; extra components of the longer input are ignored.
    """
    dot_product = 0
    for left, right in zip(embedding_1, embedding_2):
        dot_product = dot_product + left * right
    return dot_product

In [None]:
## Cosine Similarity -- Embedding Model

def generic_sent_cos_sim(model_emb_func, t1, t2, additional_nesting = False):
    """Embed ``t1`` and ``t2`` with ``model_emb_func`` and return their cosine similarity.

    Args:
        model_emb_func: Callable mapping one text to its embedding.
        t1: First text.
        t2: Second text.
        additional_nesting: When true, the embedding is taken from index 0 of
            whatever ``model_emb_func`` returns (for models that wrap the
            embedding in one extra level).
    """
    first_embedding = model_emb_func(t1)
    if additional_nesting:
        first_embedding = first_embedding[0]
    second_embedding = model_emb_func(t2)
    if additional_nesting:
        second_embedding = second_embedding[0]
    return cosine_sim(first_embedding, second_embedding)

In [None]:
## Embedding Model's Function / Model's Embedding Function
# Returns a 1d tensor with the embedding
# If given a list of input strings, it returns a 2d tensor in which each 1d row corresponds to one input string of the list

def embedding_model_function(text): ## single string or a list of strings
    """Placeholder embedding function that illustrates the expected interface.

    Args:
        text: A single string, or a list/tuple of strings.

    Returns:
        torch.Tensor: The fixed 1-D example embedding for a single string; for
        a list/tuple, a 2-D tensor with one (identical) row per input string.
    """
    example_embedding = torch.tensor([1, 2, 3, 4, 5, 6]) ## eg
    if isinstance(text, (list, tuple)):
        # One row per input string, matching the documented list behaviour.
        return example_embedding.repeat(len(text), 1)
    return example_embedding

## EXAMPLE ABOVE (ONLY AN EXAMPLE)
### Format of this function is important!

In [None]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings, counting only positions kept by the mask.

    Args:
        model_output: Indexable whose first element holds all token embeddings.
        attention_mask: Mask tensor; it is given a trailing axis and expanded
            to the size of the token embeddings (presumably 1 = real token,
            0 = padding - confirm with the tokenizer in use).

    Returns:
        torch.Tensor: Mask-weighted sum over dimension 1 divided by the mask
        total, which is clamped to at least 1e-9 to avoid division by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = torch.sum(token_embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total

In [None]:
# Normalise `sentence_embeddings` to unit L2 norm (p=2) along dim=1.
# NOTE(review): dim=1 only exists for tensors with at least 2 dimensions, so a single 1d embedding
# would presumably need dim=0 (or an extra leading batch dimension) - confirm against the torch docs.
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

In [None]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Mask-aware mean of the token embeddings in ``model_output[0]``.

    The attention mask is given a trailing axis, expanded to the size of the
    token embeddings and converted to float; the masked embeddings are summed
    over dimension 1 and divided by the mask total (clamped to >= 1e-9).
    """
    token_embeddings = model_output[0]  # first element holds all token embeddings
    expanded_mask = attention_mask.unsqueeze(-1)
    expanded_mask = expanded_mask.expand(token_embeddings.size()).float()
    # Debugging aids kept from the original cell:
    #   print(attention_mask.shape, attention_mask)
    #   print(attention_mask.unsqueeze(-1).shape, attention_mask.unsqueeze(-1))
    #   print(expanded_mask)
    #   print(len(expanded_mask), token_embeddings.size(), expanded_mask.size())
    masked_sum = torch.sum(token_embeddings * expanded_mask, 1)
    return masked_sum / torch.clamp(expanded_mask.sum(1), min=1e-9)

# Semantic Segmentation Function Section

In [None]:
## Semantic Segmentation Function PREPARATION FUNCTIONS

from itertools import islice

def window(seq, n=3):
    """Yield every run of ``n`` consecutive items of ``seq`` as a tuple.

    Consecutive windows overlap by ``n - 1`` items. Nothing is yielded when
    ``seq`` has fewer than ``n`` items.
    """
    stream = iter(seq)
    current = tuple(islice(stream, n))
    if len(current) < n:
        # The input ran out before the first window could be filled.
        return
    yield current
    for item in stream:
        # Slide by one: drop the oldest item and append the newest.
        current = (*current[1:], item)
        yield current
        
        
        
        
def climb(co_score_list, list_index, mode = "l"):
    """Return the peak reached by climbing from ``list_index`` in one direction.

    Starting at ``co_score_list[list_index]``, keep stepping left
    (``mode == "l"``) or right (any other mode) while the next score is
    strictly greater than the current peak.

    Args:
        co_score_list: Sequence of coherence scores.
        list_index: Position to start from.
        mode: ``"l"`` to climb towards index 0, anything else to climb
            towards the end of the list.

    Returns:
        The highest score reached, or ``0`` when ``list_index`` is outside
        the list.
    """
    list_len = len(co_score_list)
    if not 0 <= list_index < list_len:
        return 0
    step = -1 if mode == "l" else 1
    # Start from the score at the index itself rather than from 0: the old
    # start value of 0 ended the climb immediately for scores <= 0 (cosine
    # similarity can be negative) and reported a peak of 0.
    peak = co_score_list[list_index]
    neighbour = list_index + step
    while 0 <= neighbour < list_len and co_score_list[neighbour] > peak:
        peak = co_score_list[neighbour]
        neighbour += step
    return peak
    
def get_depth_score_list(co_score_list):
    """Compute a depth score for every coherence score.

    For each position the nearest peaks to the left and to the right are found
    with ``climb``; the depth is half of the summed drop from those two peaks
    down to the score at that position.

    Returns:
        numpy.ndarray: One depth score per entry of ``co_score_list``.
    """
    depth_scores = []
    for position, score in enumerate(co_score_list):
        left_peak = climb(co_score_list, position, "l")
        right_peak = climb(co_score_list, position, "r")
        depth_scores.append(0.5 * (left_peak + right_peak - (2 * score)))
    return np.array(depth_scores)




import matplotlib.pyplot as plt

from scipy.signal import argrelmax

def get_local_maxima(depth_scores, order=1):
    """Keep only the local maxima of ``depth_scores``; zero everything else.

    Args:
        depth_scores: NumPy array of scores.
        order: Passed to ``scipy.signal.argrelmax`` as its ``order`` argument.

    Returns:
        numpy.ndarray: Array of the same length holding the original value at
        each local maximum and 0 at every other position.
    """
    peak_positions = argrelmax(depth_scores, order=order)[0]
    peaks_only = np.zeros(len(depth_scores))
    peaks_only[peak_positions] = depth_scores[peak_positions]
    return peaks_only

def compute_threshold(scores, std_divisor=2):
    """Return ``mean - std / std_divisor`` of the non-zero entries of ``scores``.

    Args:
        scores: NumPy array of scores; zero entries are ignored.
        std_divisor: Divisor applied to the standard deviation. A larger
            value (e.g. 3 or 4) raises the threshold and so makes the
            selection pickier. Defaults to 2.

    Returns:
        The threshold, or NaN when ``scores`` has no non-zero entry.
    """
    nonzero_scores = scores[np.nonzero(scores)]
    if nonzero_scores.size == 0:
        # The mean/std of an empty array is NaN anyway; returning it directly
        # avoids NumPy's "mean of empty slice" runtime warnings.
        return float("nan")
    return np.mean(nonzero_scores) - (np.std(nonzero_scores) / std_divisor)

def get_threshold_segments(scores, threshold=0.1):
    """Return the indexes of the entries of ``scores`` that are >= ``threshold``."""
    return np.nonzero(scores >= threshold)[0]




def primitively_naive_tokeniser(text):
    """Split ``text`` on single space characters.

    Consecutive spaces therefore produce empty-string tokens, and other
    whitespace (tabs, newlines) is not treated as a separator.
    """
    return text.split(" ")

In [None]:
## Semantic Segmentation Function Portions

# Number of consecutive tokens per sliding window; also used to offset score indexes back to token positions when splitting.
WINDOW_SIZE = 3

def sentence_to_sliding_window(sentence_s):
    """Return the overlapping ``WINDOW_SIZE``-token windows of ``sentence_s`` as strings.

    The text is tokenised with ``primitively_naive_tokeniser`` and every
    window of tokens is re-joined with single spaces.
    """
    tokens = primitively_naive_tokeniser(sentence_s)
    return [" ".join(window_tokens) for window_tokens in window(tokens, WINDOW_SIZE)]

def coherence_score_list_from_embedding_list(window_splited_embedding_list):
    """Return the cosine similarity of every pair of neighbouring window embeddings.

    Entry ``i`` of the result compares embedding ``i`` with embedding
    ``i + 1``, so the result is one element shorter than the input.
    """
    earlier_windows = window_splited_embedding_list[:-1]
    later_windows = window_splited_embedding_list[1:]
    scores = []
    for previous_embedding, next_embedding in zip(earlier_windows, later_windows):
        scores.append(cosine_sim(previous_embedding, next_embedding))
    return scores

def plot_data_points(vary_data, thres = -1):
    """Plot ``vary_data`` and, unless ``thres`` is -1, a horizontal line at ``thres``.

    Args:
        vary_data: Series of values to plot.
        thres: Threshold drawn as a constant line across the plot; the
            default -1 means "no threshold line".
    """
    plt.plot(vary_data)
    if not (thres == -1):
        # A constant series as long as the data draws the threshold line.
        plt.plot([thres] * len(vary_data))
    plt.show()

def filtered_indexes_list_to_splitted_segments_by_semantics(original_sent, filtered_indexes_list):
    """Cut ``original_sent`` into segments at the scores that reach the threshold.

    Args:
        original_sent: Text to split.
        filtered_indexes_list: Filtered depth scores (NumPy array); positions
            whose score is at least ``compute_threshold(...)`` become breaks.

    Returns:
        list[str]: The segments in order, each re-joined with single spaces.
    """
    words = primitively_naive_tokeniser(original_sent)
    cut_threshold = compute_threshold(filtered_indexes_list)
    break_score_positions = get_threshold_segments(filtered_indexes_list, cut_threshold)

    # Score positions are shifted by WINDOW_SIZE - 1 to obtain token positions.
    boundaries = [0]
    for score_position in break_score_positions:
        boundaries.append(score_position + (WINDOW_SIZE - 1))
    boundaries.append(len(words))

    segments = []
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        segments.append(" ".join(words[start:stop]))
    return segments

In [None]:
## Semantic Segmentation Function

def semantic_segmentation_function(embedding_model_function, sentence_text, intermediate_status = False, graph_status = False):
    """Split ``sentence_text`` into segments at detected semantic breaks.

    Steps: sliding word windows -> window embeddings -> coherence between
    neighbouring windows -> depth scores -> local maxima -> thresholded break
    points -> text segments.

    Args:
        embedding_model_function: Callable that receives the list of window
            strings in one call and returns one embedding per window.
        sentence_text: Text to segment.
        intermediate_status: When true, print every intermediate result.
        graph_status: When true, plot the score series of each step.

    Returns:
        list[str]: The segments of ``sentence_text`` in order.
    """
    window_texts = sentence_to_sliding_window(sentence_text)
    if intermediate_status:
        print(f"windowed_parts: {window_texts}")

    # All windows are embedded in a single batched call. If the model only
    # accepts one string at a time, embed per window instead:
    #   [embedding_model_function(part) for part in window_texts]
    window_embeddings = embedding_model_function(window_texts)
    if intermediate_status:
        print(f"embedding_list: {window_embeddings}")
    # (Plotting the raw embeddings was tried and found not to be useful.)

    coherence_scores = coherence_score_list_from_embedding_list(window_embeddings)
    if intermediate_status:
        print(f"windowed_parts_coherence_score_list: {coherence_scores}")
    if graph_status:
        print("Coherence Score Plot:")
        plot_data_points(coherence_scores)

    depth_scores = get_depth_score_list(coherence_scores)
    if intermediate_status:
        print(f"windowed_parts_depth_score_list: {depth_scores}")
    if graph_status:
        print("Depth Score Plot:")
        plot_data_points(depth_scores)

    peak_depth_scores = get_local_maxima(depth_scores)
    if intermediate_status:
        print(f"windowed_parts_filtered_depth_score_list: {peak_depth_scores}")
    if graph_status:
        print("Filtered Depth Score Plot:")
        plot_data_points(peak_depth_scores)

    # Only used for reporting/plotting here; the splitting step below derives
    # the threshold again from the same scores.
    peak_threshold = compute_threshold(peak_depth_scores)
    if intermediate_status:
        print(f"filtered_threshold: {peak_threshold}")
    if graph_status:
        print("Filtered Depth Score With Threshold Line Plot:")
        plot_data_points(peak_depth_scores, peak_threshold)

    return filtered_indexes_list_to_splitted_segments_by_semantics(sentence_text, peak_depth_scores)

In [None]:
# Lock Model
def lock_semantic_segmentation_function(embedding_model_function):
    """Return ``semantic_segmentation_function`` with the embedding model pre-bound.

    The returned callable takes ``sentence_text`` plus the optional
    ``intermediate_status`` / ``graph_status`` flags (both default to False,
    so they may be omitted).
    """
    def lockED_semantic_segmentation_function(sentence_text, intermediate_status = False, graph_status = False):
        return semantic_segmentation_function(
            embedding_model_function,
            sentence_text,
            intermediate_status=intermediate_status,
            graph_status=graph_status,
        )

    return lockED_semantic_segmentation_function

##### Example Demo

In [None]:
# Example: segment one sentence using "get_sentence_embedding_MiniLM_L6_v2" as the embedding function
# (presumably defined elsewhere in the notebook), with both status flags switched off.
semantic_segmentation_function(get_sentence_embedding_MiniLM_L6_v2, "Employees receive free flight benefits to return to home country once a year and free flight benefits for business trips", intermediate_status=False, graph_status=False)

['Employees receive free flight benefits',
 'to return to home country once',
 'a year and',
 'free flight benefits for business trips']

In [None]:
# Locked model: bind the embedding function once, then call the returned function with only the text and flags.
semantic_segmentation_locked_model_MiniLM_L6_v2 = lock_semantic_segmentation_function(get_sentence_embedding_MiniLM_L6_v2)
semantic_segmentation_locked_model_MiniLM_L6_v2("Employees receive free flight benefits to return to home country once a year and free flight benefits for business trips", intermediate_status=False, graph_status=False)

['Employees receive free flight benefits',
 'to return to home country once',
 'a year and',
 'free flight benefits for business trips']

# Similarity Comparison Function Section

### Generic Similarity Comparison Function + Lock Version

In [None]:
# Generic Similarity Comparison Function (comparison tuples in a list for comparison!)

def generic_similarity_comparison_function(embedding_model_function, comparison_tuple_in_list, sort_output = 0):
    """Compute the cosine similarity of every pair in ``comparison_tuple_in_list``.

    Args:
        embedding_model_function: Callable that embeds a list of two strings
            and returns the two embeddings, indexable as ``[0]`` and ``[1]``.
        comparison_tuple_in_list: Iterable of pairs (tuples); each pair is
            used as a key of the result.
        sort_output: -1 sorts by descending similarity, 1 by ascending
            similarity, anything else keeps the input order (default 0).

    Returns:
        dict: Maps each pair to its cosine similarity.
    """
    scores = {}
    for pair in comparison_tuple_in_list:
        # Both items are embedded in one call. For a model that accepts only
        # one string at a time, embed pair[0] and pair[1] separately instead.
        pair_embeddings = embedding_model_function([pair[0], pair[1]])
        scores[pair] = cosine_sim(pair_embeddings[0], pair_embeddings[1])

    if sort_output == -1 or sort_output == 1:
        descending = sort_output == -1
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=descending)
        scores = dict(ranked)
    return scores
        

In [None]:
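# functools.partial is not used here: when it binds `embedding_model_function` by keyword, the remaining arguments can no longer be passed positionally (the first positional argument would collide with the bound keyword), so every argument would have to be given by keyword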

## Error is like:
# generic_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
# generic_similarity_comparison_locked_model_MiniLM_L6_v2([("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=1)

## The fix is to pass the arguments by keyword (or to change the position of the embedding callback parameter), for example:
# generic_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
# generic_similarity_comparison_locked_model_MiniLM_L6_v2(comparison_tuple_in_list=[("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=1)
## see the "comparison_tuple_in_list=" specified, for example! ^

"""
from functools import partial

def lock_generic_similarity_comparison_function(embedding_model_function):
    return partial(generic_similarity_comparison_function, embedding_model_function=embedding_model_function)
"""

## instead of def new function, lambda approach!

def lock_generic_similarity_comparison_function(embedding_model_function):
    """Return ``generic_similarity_comparison_function`` with the embedding model pre-bound.

    The returned callable accepts ``comparison_tuple_in_list`` and an optional
    ``sort_output`` (default 0), positionally or by keyword.
    """
    def locked_generic_similarity_comparison_function(comparison_tuple_in_list, sort_output = 0):
        return generic_similarity_comparison_function(
            embedding_model_function=embedding_model_function,
            comparison_tuple_in_list=comparison_tuple_in_list,
            sort_output=sort_output,
        )

    return locked_generic_similarity_comparison_function

### ONE Category Similarity Comparison Function + Lock Version

In [None]:
# ONE Category Similarity Comparison Function (compare to each string in a list!)
def single_category_similarity_comparison_function(embedding_model_function, category_single, texts, sort_output = 0):
    """Compare every text in ``texts`` against one category.

    Args:
        embedding_model_function: Embedding callable forwarded to
            ``generic_similarity_comparison_function``.
        category_single: The category string every text is compared with.
        texts: A list/tuple of texts, or a single text (which is wrapped in
            a list).
        sort_output: -1 descending, 0 no sort (default), 1 ascending.

    Returns:
        dict: Maps ``(text, category_single)`` to the cosine similarity.
    """
    # isinstance instead of an exact type() comparison, so list subclasses
    # and tuples are treated as collections of texts rather than one text.
    if not isinstance(texts, (list, tuple)):
        texts = [texts]
    text_category_pairs = [(text, category_single) for text in texts]
    return generic_similarity_comparison_function(
        embedding_model_function=embedding_model_function,
        comparison_tuple_in_list=text_category_pairs,
        sort_output=sort_output,
    )

In [None]:
def lock_single_category_similarity_comparison_function(embedding_model_function):
    """Return ``single_category_similarity_comparison_function`` with the embedding model pre-bound.

    The returned callable takes ``category_single``, ``texts`` and an optional
    ``sort_output`` (default 0, so it may be omitted).
    """
    def lockED_single_category_similarity_comparison_function(category_single, texts, sort_output=0):
        return single_category_similarity_comparison_function(
            embedding_model_function,
            category_single,
            texts,
            sort_output,
        )

    return lockED_single_category_similarity_comparison_function

### Categories Similarity Comparison Function + Lock Version

In [None]:
# Categories Similarity Comparison Function (compare to each string in a list!)

# sort by -1 is descending, 0 is no sort, 1 is ascending!
# default is no sort, 0
    
def categories_similarity_comparison_function(embedding_model_function, categories, texts, sort_output = 0):
    """Compare ``texts`` against each of several categories.

    Args:
        embedding_model_function: Embedding callable forwarded to
            ``single_category_similarity_comparison_function``.
        categories: A list/tuple of category strings, or a single category
            (which is wrapped in a list).
        texts: Text or texts compared against every category.
        sort_output: -1 descending, 0 no sort (default), 1 ascending; applied
            within each category.

    Returns:
        dict: Maps each category to the result dict of
        ``single_category_similarity_comparison_function`` for that category.
    """
    # isinstance instead of an exact type() comparison, so list subclasses
    # and tuples are treated as collections of categories.
    if not isinstance(categories, (list, tuple)):
        categories = [categories]
    return {
        category: single_category_similarity_comparison_function(
            embedding_model_function=embedding_model_function,
            category_single=category,
            texts=texts,
            sort_output=sort_output,
        )
        for category in categories
    }

In [None]:
"""
from functools import partial

def lock_categories_similarity_comparison_function(embedding_model_function):
    return partial(categories_similarity_comparison_function, embedding_model_function=embedding_model_function)
"""

# lambda approach somewhat!, no keyword in the lambda now, just based off positional cos it can! but the sort_output=0 is a must, so that when call the locked function, if leave blank for it, wont error!
def lock_categories_similarity_comparison_function(embedding_model_function):
    """Return ``categories_similarity_comparison_function`` with the embedding model pre-bound.

    The returned callable takes ``categories``, ``texts`` and an optional
    ``sort_output`` (default 0, so it may be omitted).
    """
    def locked_categories_similarity_comparison_function(categories, texts, sort_output=0):
        return categories_similarity_comparison_function(
            embedding_model_function, categories, texts, sort_output
        )

    return locked_categories_similarity_comparison_function

#### Comparison Result Display

In [None]:
# Categories Similarity Result Display

# Very specific use case only for "single_category_similarity_comparison_function" which returns a dict of compare_key and result_value
# Not usable on the result of "categories_similarity_comparison_function", since that returns a dict of dicts!

def category_similarity_result_display(category_result_dict, sort_display = 0):
    """Print the result of ``single_category_similarity_comparison_function``.

    Args:
        category_result_dict: Dict mapping ``(text, category)`` to a
            similarity score. The category name in the header is taken from
            the first key.
        sort_display: -1 prints rows by descending similarity, 1 by ascending
            similarity, anything else keeps the dict order (default 0).

    Nothing is printed for an empty dict.
    """
    if not category_result_dict:
        # There is no first key to read the category name from.
        return
    first_pair = next(iter(category_result_dict))
    print(f"Category: {first_pair[1]}")
    print("Similarity Level:")

    rows = category_result_dict.items()
    if sort_display == -1 or sort_display == 1:
        # 1 means ascending; both sorted branches used to sort descending.
        rows = sorted(rows, key=lambda dict_item: dict_item[1], reverse=(sort_display == -1))
    for pair, score in rows:
        print(f"{pair[0]:30.30} /-/ {pair[1]:30.30} : {score:.5}")

In [None]:
# Categories Similarity Result Display
def categories_similarity_result_display(categories_result_dict, sort_display = 0):
    """Print the result of ``categories_similarity_comparison_function``.

    Args:
        categories_result_dict: Dict mapping each category to a dict of
            ``(text, category) -> similarity``.
        sort_display: -1 prints rows by descending similarity, 1 by ascending
            similarity, anything else keeps the dict order (default 0).
    """
    for category, category_similarity_results_dict in categories_result_dict.items():
        print(f"Category: {category}")
        print("Similarity Level:")
        rows = category_similarity_results_dict.items()
        if sort_display == -1 or sort_display == 1:
            # 1 means ascending; both sorted branches used to sort descending.
            rows = sorted(rows, key=lambda dict_item: dict_item[1], reverse=(sort_display == -1))
        for pair, score in rows:
            print(f"{pair[0]:30.30} /-/ {pair[1]:30.30} : {score:.5}")

##### Example Demo

In [None]:
# sort_output=-1: the returned dict is ordered by descending similarity
generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, [("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=-1)

{('i like people', 'people love me'): 0.6048486815689496,
 ('the world is green', 'the ocean is blue'): 0.4271523653033571,
 ('hi there', 'the world is bad'): 0.1287177563239086}

In [None]:
# sort_output=0: no sorting, the returned dict keeps the order of the input pairs
generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, [("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=0)

{('hi there', 'the world is bad'): 0.1287177563239086,
 ('i like people', 'people love me'): 0.6048486815689496,
 ('the world is green', 'the ocean is blue'): 0.4271523653033571}

In [None]:
# sort_output=1: the returned dict is ordered by ascending similarity
generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, [("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")], sort_output=1)

{('hi there', 'the world is bad'): 0.1287177563239086,
 ('the world is green', 'the ocean is blue'): 0.4271523653033571,
 ('i like people', 'people love me'): 0.6048486815689496}

In [None]:
# Locked model: bind the embedding function once; sort_output is omitted here, so the default 0 (no sort) applies
generic_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_generic_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
generic_similarity_comparison_locked_model_MiniLM_L6_v2([("hi there", "the world is bad"), ("i like people", "people love me"), ("the world is green", "the ocean is blue")])

{('hi there', 'the world is bad'): 0.1287177563239086,
 ('i like people', 'people love me'): 0.6048486815689496,
 ('the world is green', 'the ocean is blue'): 0.4271523653033571}

In [None]:
# Example: compare several food names against the single category "food", sorted by descending similarity
single_category_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, "food", ["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"], sort_output=-1)

{('steak', 'food'): 0.5003130997095016,
 ('sirloin steak with mushroom sauce', 'food'): 0.2937494543044752,
 ('laksa bowl', 'food'): 0.259120507793398,
 ('curry prata', 'food'): 0.2585698555536739,
 ('laksa', 'food'): 0.21327309103382694,
 ('prata', 'food'): 0.21317178691606403}

In [None]:
# Locked model version of the single-category comparison, sorted by descending similarity
single_category_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_single_category_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
single_category_similarity_comparison_locked_model_MiniLM_L6_v2("food", ["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"], sort_output=-1)

{('steak', 'food'): 0.5003130997095016,
 ('sirloin steak with mushroom sauce', 'food'): 0.2937494543044752,
 ('laksa bowl', 'food'): 0.259120507793398,
 ('curry prata', 'food'): 0.2585698555536739,
 ('laksa', 'food'): 0.21327309103382694,
 ('prata', 'food'): 0.21317178691606403}

In [None]:
# Example: compare the same texts against several categories; each category's results are sorted by descending similarity
categories_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, ["food", "asian food", "western food"], ["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"], sort_output=-1)

{'food': {('steak', 'food'): 0.5003130997095016,
  ('sirloin steak with mushroom sauce', 'food'): 0.2937494543044752,
  ('laksa bowl', 'food'): 0.259120507793398,
  ('curry prata', 'food'): 0.2585698555536739,
  ('laksa', 'food'): 0.21327309103382694,
  ('prata', 'food'): 0.21317178691606403},
 'asian food': {('steak', 'asian food'): 0.30798892917323445,
  ('curry prata', 'asian food'): 0.30237662711869023,
  ('sirloin steak with mushroom sauce', 'asian food'): 0.3007048767274798,
  ('laksa bowl', 'asian food'): 0.22997642560937964,
  ('laksa', 'asian food'): 0.20722211920235642,
  ('prata', 'asian food'): 0.18665660738247442},
 'western food': {('steak', 'western food'): 0.39804313466154906,
  ('sirloin steak with mushroom sauce', 'western food'): 0.3040080630479595,
  ('curry prata', 'western food'): 0.288779802737338,
  ('laksa', 'western food'): 0.24079780320414573,
  ('laksa bowl', 'western food'): 0.23389015706789013,
  ('prata', 'western food'): 0.15397132982047892}}

In [None]:
# Locked model version of the multi-category comparison; sort_output is omitted, so the default 0 (no sort) applies
categories_similarity_comparison_locked_model_MiniLM_L6_v2 = lock_categories_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2)
categories_similarity_comparison_locked_model_MiniLM_L6_v2(["food", "asian food", "western food"], texts=["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"])

{'food': {('laksa', 'food'): 0.21327309103382694,
  ('laksa bowl', 'food'): 0.259120507793398,
  ('prata', 'food'): 0.21317178691606403,
  ('curry prata', 'food'): 0.2585698555536739,
  ('steak', 'food'): 0.5003130997095016,
  ('sirloin steak with mushroom sauce', 'food'): 0.2937494543044752},
 'asian food': {('laksa', 'asian food'): 0.20722211920235642,
  ('laksa bowl', 'asian food'): 0.22997642560937964,
  ('prata', 'asian food'): 0.18665660738247442,
  ('curry prata', 'asian food'): 0.30237662711869023,
  ('steak', 'asian food'): 0.30798892917323445,
  ('sirloin steak with mushroom sauce', 'asian food'): 0.3007048767274798},
 'western food': {('laksa', 'western food'): 0.24079780320414573,
  ('laksa bowl', 'western food'): 0.23389015706789013,
  ('prata', 'western food'): 0.15397132982047892,
  ('curry prata', 'western food'): 0.288779802737338,
  ('steak', 'western food'): 0.39804313466154906,
  ('sirloin steak with mushroom sauce', 'western food'): 0.3040080630479595}}

In [None]:
# Only for the result of "single_category_similarity_comparison_function", which returns a dict of comparison key -> similarity value.
# Not usable on the result of "categories_similarity_comparison_function", since that returns a dict of dicts!
category_similarity_result_display(single_category_similarity_comparison_locked_model_MiniLM_L6_v2("food", ["super prata", "caifan", "ramen"]))

Category: food
Similarity Level:
super prata                    /-/ food                           : 0.16437
caifan                         /-/ food                           : 0.21156
ramen                          /-/ food                           : 0.43923


In [None]:
# Example: display a multi-category comparison, rows ordered by descending similarity (sort_display=-1)
categories_similarity_result_display(categories_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, ["food", "asian food", "western food"], ["laksa", "laksa bowl", "prata", "curry prata", "steak", "sirloin steak with mushroom sauce"]), sort_display=-1)

Category: food
Similarity Level:
steak                          /-/ food                           : 0.50031
sirloin steak with mushroom sa /-/ food                           : 0.29375
laksa bowl                     /-/ food                           : 0.25912
curry prata                    /-/ food                           : 0.25857
laksa                          /-/ food                           : 0.21327
prata                          /-/ food                           : 0.21317
Category: asian food
Similarity Level:
steak                          /-/ asian food                     : 0.30799
curry prata                    /-/ asian food                     : 0.30238
sirloin steak with mushroom sa /-/ asian food                     : 0.3007
laksa bowl                     /-/ asian food                     : 0.22998
laksa                          /-/ asian food                     : 0.20722
prata                          /-/ asian food                     : 0.18666
Category: western

### Categories with "sub-categories"/"synonyms"

#### Both Comparisons and Display (General Category with "sub-categories"/"synonyms")

##### Comparisons (General Category with "sub-categories"/"synonyms" variation)

In [None]:
## Categories with sub-categories is in format of dictionary where general_category-key:sub-"categories"_in_list(actually more like "synonyms" of the general categories)-value
## returns a dictionary of general_category-key:{sub-"category"(general category "synonyms")-key:{(xyz_comparison, sub-"category"/"synonym"):cosine_similarity}}

def categories_wsub_similarity_comparison_function(embedding_model_function, categories_wsub_dict, texts, sort_output=0):
    """Compare ``texts`` against every general category and its sub-categories.

    Args:
        embedding_model_function: Embedding callable forwarded to
            ``categories_similarity_comparison_function``.
        categories_wsub_dict: Dict mapping a general category to the list of
            its sub-categories ("synonyms").
        texts: Text or texts to compare.
        sort_output: -1 descending, 0 no sort (default), 1 ascending.

    Returns:
        dict: For each general category, the result of
        ``categories_similarity_comparison_function`` over the general
        category followed by its sub-categories.
    """
    return {
        general_category: categories_similarity_comparison_function(
            embedding_model_function=embedding_model_function,
            categories=([general_category] + sub_categories),
            texts=texts,
            sort_output=sort_output,
        )
        for general_category, sub_categories in categories_wsub_dict.items()
    }

##### Comparisons - Lock (General Category with "sub-categories"/"synonyms" variation)

In [None]:
def lock_categories_wsub_similarity_comparison_function(embedding_model_function):
    """Return ``categories_wsub_similarity_comparison_function`` with the embedding model pre-bound.

    The returned callable takes ``categories_wsub_dict``, ``texts`` and an
    optional ``sort_output`` (default 0).
    """
    def locked_categories_wsub_similarity_comparison_function(categories_wsub_dict, texts, sort_output=0):
        return categories_wsub_similarity_comparison_function(
            embedding_model_function=embedding_model_function,
            categories_wsub_dict=categories_wsub_dict,
            texts=texts,
            sort_output=sort_output,
        )

    return locked_categories_wsub_similarity_comparison_function

##### Display (General Category with "sub-categories"/"synonyms" variation)

In [None]:
def categories_wsub_similarity_result_display(categories_wsub_result_dict, sort_display = 0):
    """Print the result of ``categories_wsub_similarity_comparison_function``.

    For every general category a header line is printed, followed by the
    output of ``categories_similarity_result_display`` for its nested results
    and a blank line.

    Args:
        categories_wsub_result_dict: Dict mapping a general category to its
            per-category result dicts.
        sort_display: Forwarded to ``categories_similarity_result_display``.
    """
    for general_category in categories_wsub_result_dict:
        nested_results = categories_wsub_result_dict[general_category]
        print(f"General Category: {general_category}")
        categories_similarity_result_display(nested_results, sort_display=sort_display)
        print()

#### Display, similar to but variation [Top xxx (Count) and Limit yyy (Value)]

In [None]:
## Top xxx and Limit yyy, display function different mainly

# Prep
def categories_similarity_result_display_top_limit(categories_result_dict, top_many = 5, limit_value = 0.5):
    """Print, per category, the best matches limited by count and by score.

    Rows are always listed by descending similarity. Listing stops as soon as
    ``top_many`` rows were printed or a similarity falls below ``limit_value``.

    Args:
        categories_result_dict: Dict mapping each category to a dict of
            ``(text, category) -> similarity``.
        top_many: Maximum number of rows printed per category.
        limit_value: Minimum similarity a row needs in order to be printed.
    """
    for category, category_similarity_results_dict in categories_result_dict.items():
        print(f"Sub-Categories: {category}")
        print(f"Similarity Level Of Top {top_many} (Limit={limit_value}):")
        ranked_rows = sorted(
            category_similarity_results_dict.items(),
            key=lambda dict_item: dict_item[1],
            reverse=True,
        )
        # The row's rank equals the number of rows printed so far.
        for rank, (pair, score) in enumerate(ranked_rows):
            if rank == top_many or score < limit_value:
                break
            print(f"{pair[0]:30.30} /-/ {pair[1]:30.30} : {score:.5}")
# Actual using function
def categories_wsub_similarity_result_display_top_limit(categories_wsub_result_dict, top_many = 5, limit_value = 0.5):
    """Print the top pairs of every general category, pooled over its sub-categories.

    Args:
        categories_wsub_result_dict: Mapping of general category ->
            {sub_category: {(item_a, item_b): score}}.
        top_many: Row cap forwarded to the per-category display.
        limit_value: Minimum score forwarded to the per-category display.
    """
    for general_category, per_subcat_results in categories_wsub_result_dict.items():
        print(f"General Category: {general_category}")
        # Pool every sub-category's pairs so the ranking spans the whole
        # general category rather than one sub-category at a time.
        pooled_pairs = {
            pair: score
            for subcat_pairs in per_subcat_results.values()
            for pair, score in subcat_pairs.items()
        }
        # The tuple of sub-category names becomes the printed "Sub-Categories" label.
        subcat_names = tuple(per_subcat_results.keys())
        categories_similarity_result_display_top_limit({subcat_names: pooled_pairs}, top_many=top_many, limit_value=limit_value)
        print()

##### Example Demo

In [None]:
# Demo: raw nested comparison result for cat_wsub_demo against hlp_biz_demo_cut_third
# (both inputs and the embedding function are defined outside this section).
categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo_cut_third)

{'Passage provided for business purpose': {'Passage provided for business purpose': {('Employees receive free flight benefits ',
    'Passage provided for business purpose'): 0.14779233235003653,
   ('to return to home country once a year and',
    'Passage provided for business purpose'): 0.23613265901235564,
   (' free flight benefits for business trips',
    'Passage provided for business purpose'): 0.29568490649739965},
  'Air tickets for business trip': {('Employees receive free flight benefits ',
    'Air tickets for business trip'): 0.4639808954117372,
   ('to return to home country once a year and',
    'Air tickets for business trip'): 0.14540735152911588,
   (' free flight benefits for business trips',
    'Air tickets for business trip'): 0.6598189842771236},
  'Air tickets for work purposes': {('Employees receive free flight benefits ',
    'Air tickets for work purposes'): 0.5376104533584114,
   ('to return to home country once a year and',
    'Air tickets for work purpos

In [None]:
# Demo: the same comparison printed per sub-category with sort_display=-1
# (the saved output lists the highest score first).
categories_wsub_similarity_result_display(categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo_cut_third), sort_display=-1)

General Category: Passage provided for business purpose
Category: Passage provided for business purpose
Similarity Level:
 free flight benefits for busi /-/ Passage provided for business  : 0.29568
to return to home country once /-/ Passage provided for business  : 0.23613
Employees receive free flight  /-/ Passage provided for business  : 0.14779
Category: Air tickets for business trip
Similarity Level:
 free flight benefits for busi /-/ Air tickets for business trip  : 0.65982
Employees receive free flight  /-/ Air tickets for business trip  : 0.46398
to return to home country once /-/ Air tickets for business trip  : 0.14541
Category: Air tickets for work purposes
Similarity Level:
Employees receive free flight  /-/ Air tickets for work purposes  : 0.53761
 free flight benefits for busi /-/ Air tickets for work purposes  : 0.49501
to return to home country once /-/ Air tickets for work purposes  : 0.17532
Category: Flight for business trip
Similarity Level:
 free flight benefits for

In [None]:
# Demo: pooled display per general category, at most 3 rows and none scoring below 0.55.
categories_wsub_similarity_result_display_top_limit(categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo_cut_third), top_many=3, limit_value=0.55)

General Category: Passage provided for business purpose
Sub-Categories: ('Passage provided for business purpose', 'Air tickets for business trip', 'Air tickets for work purposes', 'Flight for business trip', 'Flight for work purposes', 'Travel incidental to employment')
Similarity Level Of Top 3 (Limit=0.55):
 free flight benefits for busi /-/ Flight for business trip       : 0.75365
 free flight benefits for busi /-/ Air tickets for business trip  : 0.65982
 free flight benefits for busi /-/ Flight for work purposes       : 0.60962

General Category: Passage provided when taking up employment and upon termination
Sub-Categories: ('Passage provided when taking up employment and upon termination', 'Air tickets back to home country after contract end', 'Air tickets back to home country after termination of employment', 'Air tickets to work in Singapore', 'Flight back to home country after contract end', 'Flight back to home country after termination of employment', 'Flight to work in Sin

### Categories Resorting (To Different Format) Function

#### Sorting and Organising Within a Category (Refers To "sub-categories"/"synonyms")

In [None]:
def categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict, get_inner_list = False, sort_within_cat=0, top_many_wsub = 3, limit_value = 0.5):
    """Flatten each category's sub-category results, then filter, order and trim them.

    Args:
        categories_wsub_similarity_comparison_result_dict: Mapping of
            category -> {sub_category: {(text, sub_category_text): score}}.
        get_inner_list: When true each category maps to a list of
            ``((text, sub_category_text), score)`` tuples; otherwise to a dict
            keyed by the pair.
        sort_within_cat: -1 keeps scores >= ``limit_value`` ordered high to
            low; 1 keeps scores <= ``limit_value`` ordered low to high; 0 keeps
            everything in insertion order. Any other value keeps nothing.
        top_many_wsub: Slice bound applied to each category's entries.
        limit_value: Score threshold. A negative value selects the per-mode
            default (0 for -1, 1 for 1; unused for 0).

    Returns:
        dict: category -> list or dict of the retained entries.
    """
    if limit_value < 0:
        limit_value = {-1: 0, 0: None, 1: 1}.get(sort_within_cat, limit_value)

    def passes_limit(score):
        # Mode 0 never looks at the threshold (it may be None at this point).
        if sort_within_cat == 0:
            return True
        if sort_within_cat == -1:
            return score >= limit_value
        if sort_within_cat == 1:
            return score <= limit_value
        return False

    resorted = {}
    for category, per_subcat in categories_wsub_similarity_comparison_result_dict.items():
        kept = [
            (pair, score)
            for pair_scores in per_subcat.values()
            for pair, score in pair_scores.items()
            if passes_limit(score)
        ]
        if not get_inner_list:
            # Dict mode: a repeated pair keeps its first position and last score.
            kept = list(dict(kept).items())
        # Ordering happens within one category only.
        if sort_within_cat in (-1, 1):
            kept.sort(key=lambda entry: entry[1], reverse=(sort_within_cat == -1))
        kept = kept[:top_many_wsub]
        resorted[category] = kept if get_inner_list else dict(kept)
    return resorted

#### Sorting and Organising Between Categories (Refers To Different Categories In General) - Most to Least

In [None]:
## This cleaning step always ranks categories from most to least similar; there is deliberately no sort-order argument, because a "top N categories" cut-off is meaningless without an ordering.

def categories_wsub_similarity_comparison_resort_cleaning_function(resorted_categories_wsub_similarity_comparison_dict, get_inner_list = False, get_list = False, top_many_cat = 3):
    """Rank the resorted per-category entries across categories, best score first.

    Args:
        resorted_categories_wsub_similarity_comparison_dict: Mapping of
            category -> list of ``((text, sub_category_text), score)`` tuples
            (``get_inner_list=True``) or -> dict ``{(text, sub_category_text): score}``.
        get_inner_list: Must describe the input shape, as above.
        get_list: When true return a list of ``(category, (pair, score))``
            tuples; a category may then appear more than once.
        top_many_cat: Slice bound on the number of returned entries.

    Returns:
        list | dict: The ranked entries. In dict form each category maps to its
        single best ``(pair, score)``.
    """
    flattened = []
    for category, entries in resorted_categories_wsub_similarity_comparison_dict.items():
        pair_scores = entries if get_inner_list else entries.items()
        for comparison_tuple, pred in pair_scores:
            flattened.append((category, (comparison_tuple, pred)))

    # Always most-to-least similar: a "top N" cut needs an ordering to be meaningful.
    ranked = sorted(flattened, key=lambda element: element[1][1], reverse=True)
    if get_list:
        return ranked[:top_many_cat]

    # A dict holds one value per category. Keep the first (best) entry of a
    # repeated category, and de-duplicate BEFORE slicing so repeats do not use
    # up the top_many_cat slots. With one entry per category this matches
    # dict(ranked[:top_many_cat]).
    best_per_category = {}
    for category, pair_and_score in ranked:
        best_per_category.setdefault(category, pair_and_score)
    return dict(list(best_per_category.items())[:top_many_cat])

#### New Resorted Format Display

In [None]:
def cleaned_categories_wsub_similarity_comparison_resorted_result_display(cleaned_resorted_compare_result, get_list):
    """Print one "Category" block per cleaned entry.

    Args:
        cleaned_resorted_compare_result: Iterable of
            ``(category, ((text, sub_category_text), score))`` tuples when
            ``get_list`` is true, otherwise a dict
            ``{category: ((text, sub_category_text), score)}``.
        get_list: Tells which of the two shapes was passed in.
    """
    if get_list:
        entries = cleaned_resorted_compare_result
    else:
        entries = cleaned_resorted_compare_result.items()
    for label, pair_and_score in entries:
        compared_pair = pair_and_score[0]
        score = pair_and_score[1]
        print(f"Category: {label}")
        print(f"{compared_pair[0]:30.30} /-/ {compared_pair[1]:30.30}: {score:.5}")
        print()

##### Example Demo

In [None]:
# Arguments to use for function
get_inner_list = True  # each category maps to a list of ((text, sub-category), score) tuples
sort_within_cat = 1  # ascending mode: keep scores <= limit_value, lowest first
top_many_wsub = 3  # keep at most three entries per category
limit_value = 0.07

# Demo: per category, the (up to) three least similar pairs scoring at most 0.07.
categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict=categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, categories_wsub_dict=cat_wsub_demo, texts=splitted_sentence_text), get_inner_list=get_inner_list, sort_within_cat=sort_within_cat, top_many_wsub=top_many_wsub, limit_value=limit_value)

{'Passage provided for business purpose': [(('a year and',
    'Air tickets for work purposes'),
   0.03580521225597774),
  (('a year and', 'Air tickets for business trip'), 0.0504765864001799),
  (('a year and', 'Passage provided for business purpose'),
   0.06552286772996774)],
 'Passage provided when taking up employment and upon termination': [(('a year and',
    'Air tickets to work in Singapore'),
   0.030742087031399204),
  (('a year and',
    'Air tickets back to home country after termination of employment'),
   0.05293060206713229),
  (('a year and',
    'Flight back to home country after termination of employment'),
   0.0552523757358474)],
 'Home leave passage': [(('a year and', 'Air tickets to home country'),
   0.056946677215592184),
  (('a year and', 'Air passage benefit'), 0.06636109112551498)]}

In [None]:
# Arguments to use for function
get_inner_list = True
sort_within_cat = -1  # descending mode: best pair first
top_many_wsub = 1  # one pair per category, so the dict result below is unambiguous
limit_value = -1  # negative: the resort function substitutes 0 in descending mode

top_many_cat = 2
get_list = False  # return {category: (pair, score)} instead of a list

# Demo: the two best-matching categories, each with its single best pair.
categories_wsub_similarity_comparison_resort_cleaning_function(resorted_categories_wsub_similarity_comparison_dict=categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict=categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, categories_wsub_dict=cat_wsub_demo, texts=splitted_sentence_text), get_inner_list=get_inner_list, sort_within_cat=sort_within_cat, top_many_wsub=top_many_wsub, limit_value=limit_value), get_inner_list=get_inner_list, get_list=get_list, top_many_cat=top_many_cat)

{'Passage provided for business purpose': (('free flight benefits for business trips',
   'Flight for business trip'),
  0.7536517813135932),
 'Home leave passage': (('to return to home country once',
   'Flight to home country'),
  0.6656597017594097)}

In [None]:
# Arguments to use for function
get_inner_list = True
sort_within_cat = -1  # descending mode: best pair first
top_many_wsub = 1  # one pair per category
limit_value = 0.6  # pairs scoring below 0.6 are dropped before ranking

top_many_cat = 3
get_list = True  # list form; the display call receives the same flag
        
# Demo: up to three categories whose best pair scores at least 0.6, printed best first.
cleaned_categories_wsub_similarity_comparison_resorted_result_display(categories_wsub_similarity_comparison_resort_cleaning_function(resorted_categories_wsub_similarity_comparison_dict=categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict=categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, categories_wsub_dict=cat_wsub_demo, texts=splitted_sentence_text), get_inner_list=get_inner_list, sort_within_cat=sort_within_cat, top_many_wsub=top_many_wsub, limit_value=limit_value), get_inner_list=get_inner_list, get_list=get_list, top_many_cat=top_many_cat), get_list=get_list)

Category: Passage provided for business purpose
free flight benefits for busin /-/ Flight for business trip      : 0.75365

Category: Home leave passage
to return to home country once /-/ Flight to home country        : 0.66566



In [None]:
# Arguments to use for function
get_inner_list = True
sort_within_cat = -1  # descending mode: best pair first
top_many_wsub = 1  # one pair per category
limit_value = 0.5  # lower threshold than the previous cell, so more categories can qualify

top_many_cat = 3
get_list = True  # list form; the display call receives the same flag
        
# Demo: same pipeline as the previous cell with the threshold lowered to 0.5.
cleaned_categories_wsub_similarity_comparison_resorted_result_display(categories_wsub_similarity_comparison_resort_cleaning_function(resorted_categories_wsub_similarity_comparison_dict=categories_wsub_similarity_comparison_resort_function(categories_wsub_similarity_comparison_result_dict=categories_wsub_similarity_comparison_function(get_sentence_embedding_MiniLM_L6_v2, categories_wsub_dict=cat_wsub_demo, texts=splitted_sentence_text), get_inner_list=get_inner_list, sort_within_cat=sort_within_cat, top_many_wsub=top_many_wsub, limit_value=limit_value), get_inner_list=get_inner_list, get_list=get_list, top_many_cat=top_many_cat), get_list=get_list)

Category: Passage provided for business purpose
free flight benefits for busin /-/ Flight for business trip      : 0.75365

Category: Home leave passage
to return to home country once /-/ Flight to home country        : 0.66566

Category: Passage provided when taking up employment and upon termination
to return to home country once /-/ Flight back to home country af: 0.59888



# Zero Shot Classification Function Section

### Generic Zero Shot Classification Function + Lock Version

In [None]:
def classify_sentence(classifier, candidate_labels, sequence_to_classify, multi_label = True):
    """Run a zero-shot classifier and index its scores by sequence and label.

    Args:
        classifier: Callable invoked as
            ``classifier(sequences, labels, multi_label=...)``. It must return
            one result mapping, or a list of them, each holding "sequence",
            "labels" and "scores" entries.
        candidate_labels: Labels forwarded to the classifier.
        sequence_to_classify: One sequence or a list of sequences.
        multi_label: Forwarded to the classifier unchanged.

    Returns:
        dict: ``{sequence: {label: score}}`` with labels in the order the
        classifier returned them.
    """
    classifier_results = classifier(sequence_to_classify, candidate_labels, multi_label=multi_label)
    # A single input comes back as one bare result. isinstance (not an exact
    # type comparison) also recognises list subclasses as "already a list".
    if not isinstance(classifier_results, list):
        classifier_results = [classifier_results]
    return {
        result["sequence"]: dict(zip(result["labels"], result["scores"]))
        for result in classifier_results
    }

In [None]:
def lock_classify_sentence(classifier):
    """Return a version of ``classify_sentence`` with ``classifier`` already bound.

    The returned callable takes ``(candidate_labels, sequence_to_classify,
    multi_label=True)`` and forwards them together with the locked classifier.
    """
    def locked_classify_sentence(candidate_labels, sequence_to_classify, multi_label = True):
        return classify_sentence(
            classifier=classifier,
            candidate_labels=candidate_labels,
            sequence_to_classify=sequence_to_classify,
            multi_label=multi_label,
        )
    return locked_classify_sentence

### Categories Classification Function (+ Resort Format Function) + Lock Version

In [None]:
def categories_classification_function(classification_model_function, categories_candidate_labels, texts, multi_label = True, sort_output = 0):
    """Classify ``texts`` against the candidate labels and order each label dict.

    Args:
        classification_model_function: Classifier passed to ``classify_sentence``.
        categories_candidate_labels: Iterable of candidate labels.
        texts: One sequence (a string) or an iterable of sequences.
        multi_label: Forwarded to the classifier.
        sort_output: -1 keeps the classifier's own label order; 0 orders labels
            as in ``categories_candidate_labels``; 1 orders labels by ascending
            score.

    Returns:
        dict | None: ``{sequence: {label: score}}``, or ``None`` for any other
        ``sort_output`` value.
    """
    classification_results = classify_sentence(classifier=classification_model_function, candidate_labels=categories_candidate_labels, sequence_to_classify=texts, multi_label=multi_label)
    if sort_output == -1:
        return classification_results

    # A bare string is ONE sequence. Iterating it directly would walk its
    # characters and fail the result lookup below.
    sequences = [texts] if isinstance(texts, str) else texts

    if sort_output == 0:
        return {
            seq: {label: classification_results[seq][label] for label in categories_candidate_labels}
            for seq in sequences
        }
    if sort_output == 1:
        final_classified_dict = {}
        for seq in sequences:
            in_candidate_order = {label: classification_results[seq][label] for label in categories_candidate_labels}
            final_classified_dict[seq] = dict(sorted(in_candidate_order.items(), key=lambda dict_item: dict_item[1]))
        return final_classified_dict
    return None

## Resort Format Function
def categories_classification_additional_resort_function(seq_classified_dictionary, categories_candidate_labels, sort_output = 0, top_many = 5, limit_value = 0.5):
    """Regroup ``{sequence: {label: score}}`` into ``{label: {sequence: score}}``.

    Args:
        seq_classified_dictionary: Mapping of sequence -> {label: score}.
        categories_candidate_labels: Labels to build groups for.
        sort_output: -1 keeps scores >= ``limit_value`` ordered high to low;
            1 keeps scores <= ``limit_value`` ordered low to high; 0 keeps every
            score in input order (no threshold applies without an ordering).
            Any other value keeps nothing.
        top_many: When >= 0, each label keeps only its first ``top_many`` entries.
        limit_value: Score threshold. A negative value selects the per-mode
            default (0 for -1, 1 for 1; unused for 0).

    Returns:
        dict: label -> {sequence: score}.
    """
    if limit_value < 0:
        limit_value = {-1: 0, 0: None, 1: 1}.get(sort_output, limit_value)

    grouped = {label: {} for label in categories_candidate_labels}
    for seq, label_scores in seq_classified_dictionary.items():
        for label in categories_candidate_labels:
            if sort_output == 0:
                keep = True
            elif sort_output == -1:
                keep = label_scores[label] >= limit_value
            elif sort_output == 1:
                keep = label_scores[label] <= limit_value
            else:
                keep = False
            if keep:
                grouped[label][seq] = label_scores[label]

    if sort_output in (-1, 1):
        descending = sort_output == -1
        for label in categories_candidate_labels:
            ordered = sorted(grouped[label].items(), key=lambda entry: entry[1], reverse=descending)
            grouped[label] = dict(ordered)

    if top_many >= 0:
        for label in categories_candidate_labels:
            grouped[label] = dict(list(grouped[label].items())[:top_many])
    return grouped

In [None]:
def lock_categories_classification_function(classification_model_function):
    """Return ``categories_classification_function`` with the model already bound.

    The returned callable takes ``(categories_candidate_labels, texts,
    multi_label=True, sort_output=0)`` and forwards them with the locked model.
    """
    def locked_categories_classification(categories_candidate_labels, texts, multi_label = True, sort_output = 0):
        return categories_classification_function(
            classification_model_function=classification_model_function,
            categories_candidate_labels=categories_candidate_labels,
            texts=texts,
            multi_label=multi_label,
            sort_output=sort_output,
        )
    return locked_categories_classification


#### Classification Result Display

In [None]:
def categories_classification_resorted_result_display(classification_resorted_dictionary_result, sort_display = 0, top_many = 5, limit_value = 0.5):
    """Print each label followed by its sequences and scores.

    Args:
        classification_resorted_dictionary_result: Mapping of
            label -> {sequence: score}.
        sort_display: -1 prints scores >= ``limit_value`` from high to low;
            1 prints scores <= ``limit_value`` from low to high; 0 prints every
            entry in stored order. Any other value prints only the label lines.
        top_many: When > 0, only the first ``top_many`` sorted entries are
            considered. Otherwise there is no row cap. Ignored for
            ``sort_display == 0``.
        limit_value: Score threshold. A negative value selects the per-mode
            default (0 for -1, 1 for 1; unused for 0).
    """
    if limit_value < 0:
        limit_value = {-1: 0, 0: None, 1: 1}.get(sort_display, limit_value)
    capped = top_many > 0

    for label, seq_scores in classification_resorted_dictionary_result.items():
        print(f"Category: {label}")
        if sort_display == 0:
            # Without an ordering, "top N" and a threshold have no meaning.
            for seq, score in seq_scores.items():
                print(f"{seq:65.65}: {score:.5}")
        elif sort_display == -1 or sort_display == 1:
            descending = sort_display == -1
            ranked = sorted(seq_scores.items(), key=lambda entry: entry[1], reverse=descending)
            if capped:
                ranked = ranked[:top_many]
            for seq, score in ranked:
                within_limit = score >= limit_value if descending else score <= limit_value
                if within_limit:
                    print(f"{seq:65.65}: {score:.5}")
        print()

##### Example Demo

In [None]:
# Demo: multi-label scores for every sequence in sequences_list
# (classifier, labels and sequences are defined outside this section).
classify_sentence(bart_mnli_classifier, candidate_possible_labels, sequences_list, multi_label=True)

{'one day i will see the world': {'travel': 0.994157612323761,
  'exploration': 0.92877596616745,
  'dancing': 0.005361784249544144,
  'cooking': 0.0016605753917247057},
 'i will explore sweden next semester': {'travel': 0.9911717772483826,
  'exploration': 0.9684410691261292,
  'dancing': 0.0032393524888902903,
  'cooking': 0.00020078456145711243},
 'I love popping and locking!': {'exploration': 0.7612733244895935,
  'dancing': 0.22573687136173248,
  'cooking': 0.17265444993972778,
  'travel': 0.0074744801968336105}}

In [None]:
# Demo: only the first sequence, with multi_label=False.
classify_sentence(bart_mnli_classifier, candidate_possible_labels, sequences_list[0], multi_label=False)

{'one day i will see the world': {'travel': 0.8104696869850159,
  'exploration': 0.1847233921289444,
  'dancing': 0.0025745946913957596,
  'cooking': 0.0022323287557810545}}

In [None]:
# Demo: sort_output=-1 returns the classify_sentence result unchanged (classifier's label order).
categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=-1)

{'one day i will see the world': {'travel': 0.994157612323761,
  'exploration': 0.92877596616745,
  'dancing': 0.005361784249544144,
  'cooking': 0.0016605753917247057},
 'i will explore sweden next semester': {'travel': 0.9911717772483826,
  'exploration': 0.9684410691261292,
  'dancing': 0.0032393524888902903,
  'cooking': 0.00020078456145711243},
 'I love popping and locking!': {'exploration': 0.7612733244895935,
  'dancing': 0.22573687136173248,
  'cooking': 0.17265444993972778,
  'travel': 0.0074744801968336105}}

In [None]:
# Demo: regroup by label, keeping sequences scoring >= 0.5, highest first (at most 5 per label).
categories_classification_additional_resort_function(categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=-1), candidate_possible_labels, sort_output=-1, top_many=5, limit_value=0.5)

{'travel': {'one day i will see the world': 0.994157612323761,
  'i will explore sweden next semester': 0.9911717772483826},
 'cooking': {},
 'dancing': {},
 'exploration': {'i will explore sweden next semester': 0.9684410691261292,
  'one day i will see the world': 0.92877596616745,
  'I love popping and locking!': 0.7612733244895935}}

In [None]:
# Demo: display of the regrouped result. top_many=-1 means no row cap, and
# limit_value=-1 falls back to 0 for sort_display=-1.
categories_classification_resorted_result_display(categories_classification_additional_resort_function(categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=-1), candidate_possible_labels, sort_output=-1, top_many=5, limit_value=0.5),sort_display=-1, top_many=-1, limit_value=-1)

Category: travel
one day i will see the world : 0.994157612323761
i will explore sweden next semester : 0.9911717772483826

Category: cooking

Category: dancing

Category: exploration
i will explore sweden next semester : 0.9684410691261292
one day i will see the world : 0.92877596616745
I love popping and locking! : 0.7612733244895935



In [None]:
# Demo: sort_output=0 lists each sequence's labels in candidate_possible_labels order.
categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=0)

{'one day i will see the world': {'travel': 0.994157612323761,
  'cooking': 0.0016605753917247057,
  'dancing': 0.005361784249544144,
  'exploration': 0.92877596616745},
 'i will explore sweden next semester': {'travel': 0.9911717772483826,
  'cooking': 0.00020078456145711243,
  'dancing': 0.0032393524888902903,
  'exploration': 0.9684410691261292},
 'I love popping and locking!': {'travel': 0.0074744801968336105,
  'cooking': 0.17265444993972778,
  'dancing': 0.22573687136173248,
  'exploration': 0.7612733244895935}}

In [None]:
# Demo: sort_output=0 regroups by label without score filtering or sorting.
# NOTE(review): the saved output below looks filtered by limit_value, which the
# sort_output=0 path does not do — presumably a stale output; re-run to confirm.
categories_classification_additional_resort_function(categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=0), candidate_possible_labels, sort_output=0, top_many=5, limit_value=0.5)

{'travel': {'one day i will see the world': 0.994157612323761,
  'i will explore sweden next semester': 0.9911717772483826},
 'cooking': {},
 'dancing': {},
 'exploration': {'one day i will see the world': 0.92877596616745,
  'i will explore sweden next semester': 0.9684410691261292,
  'I love popping and locking!': 0.7612733244895935}}

In [None]:
# Demo: unsorted display (sort_display=0) of the unfiltered regrouping; every entry is printed.
categories_classification_resorted_result_display(categories_classification_additional_resort_function(categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=-1), candidate_possible_labels, sort_output=0, top_many=5, limit_value=0.5),sort_display=0, top_many=-1, limit_value=-1)

Category: travel
one day i will see the world : 0.994157612323761
i will explore sweden next semester : 0.9911717772483826
I love popping and locking! : 0.0074744801968336105

Category: cooking
one day i will see the world : 0.0016605753917247057
i will explore sweden next semester : 0.00020078456145711243
I love popping and locking! : 0.17265444993972778

Category: dancing
one day i will see the world : 0.005361784249544144
i will explore sweden next semester : 0.0032393524888902903
I love popping and locking! : 0.22573687136173248

Category: exploration
one day i will see the world : 0.92877596616745
i will explore sweden next semester : 0.9684410691261292
I love popping and locking! : 0.7612733244895935



In [None]:
# Demo: sort_output=1 orders each sequence's labels by ascending score.
categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=1)

{'one day i will see the world': {'cooking': 0.0016605753917247057,
  'dancing': 0.005361784249544144,
  'exploration': 0.92877596616745,
  'travel': 0.994157612323761},
 'i will explore sweden next semester': {'cooking': 0.00020078456145711243,
  'dancing': 0.0032393524888902903,
  'exploration': 0.9684410691261292,
  'travel': 0.9911717772483826},
 'I love popping and locking!': {'travel': 0.0074744801968336105,
  'cooking': 0.17265444993972778,
  'dancing': 0.22573687136173248,
  'exploration': 0.7612733244895935}}

In [None]:
# Demo: regroup by label, keeping sequences scoring <= 0.5, lowest first.
# NOTE(review): the saved output below contains scores above 0.5, which this
# call should filter out — presumably a stale output; re-run to confirm.
categories_classification_additional_resort_function(categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=1), candidate_possible_labels, sort_output=1, top_many=5, limit_value=0.5)

{'travel': {'i will explore sweden next semester': 0.9911717772483826,
  'one day i will see the world': 0.994157612323761},
 'cooking': {},
 'dancing': {},
 'exploration': {'I love popping and locking!': 0.7612733244895935,
  'one day i will see the world': 0.92877596616745,
  'i will explore sweden next semester': 0.9684410691261292}}

In [None]:
# Demo: ascending display of the sequences scoring <= 0.5 per label.
# limit_value=-1 falls back to 1 for sort_display=1.
categories_classification_resorted_result_display(categories_classification_additional_resort_function(categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=-1), candidate_possible_labels, sort_output=1, top_many=5, limit_value=0.5),sort_display=1, top_many=-1, limit_value=-1)

Category: travel
I love popping and locking! : 0.0074744801968336105

Category: cooking
i will explore sweden next semester : 0.00020078456145711243
one day i will see the world : 0.0016605753917247057
I love popping and locking! : 0.17265444993972778

Category: dancing
i will explore sweden next semester : 0.0032393524888902903
one day i will see the world : 0.005361784249544144
I love popping and locking! : 0.22573687136173248

Category: exploration



#### Additional Cleaning Up Function, Into Different Format + Display 

In [None]:
def categories_classification_additional_resort_cleaning_function(classification_resorted_dictionary_result, get_list = False, top_many_cat = 3, limit_value = 0.5):
    """Reduce every label to its best-scoring sequence and keep the top labels.

    Args:
        classification_resorted_dictionary_result: Mapping of
            label -> {sequence: score}.
        get_list: When true return a list of ``(label, (sequence, score))``
            tuples instead of a dict ``{label: (sequence, score)}``.
        top_many_cat: Slice bound on the number of labels returned.
        limit_value: Not used by this function; kept so existing calls that
            pass it continue to work.

    Returns:
        list | dict: Labels ordered by their best score, highest first. A label
        whose sequence dict is empty is left out.
    """
    best_per_label = {}
    for label, seq_pred_dict in classification_resorted_dictionary_result.items():
        best = None
        for seq, pred in seq_pred_dict.items():
            # Pick the maximum instead of whichever entry happens to come last,
            # so the result does not depend on how the input was sorted or
            # trimmed. ">=" keeps the later entry on ties.
            if best is None or pred >= best[1]:
                best = (seq, pred)
        # An empty sequence dict contributes no entry at all.
        if best is not None:
            best_per_label[label] = best

    ranked = sorted(best_per_label.items(), key=lambda dict_item: dict_item[1][1], reverse=True)[:top_many_cat]
    if get_list:
        return ranked
    return dict(ranked)


def cleaned_categories_classification_resorted_result_display(cleaned_classification_resorted_result, get_list):
    """Print one "Category / sequence: score" paragraph per label.

    Args:
        cleaned_classification_resorted_result: either a list of
            ``(label, (sequence, score))`` tuples or a dict
            ``label -> (sequence, score)``.
        get_list: true when the result above is the list form, false when
            it is the dict form.

    The sequence is padded/truncated to 65 characters and the score is
    printed with format spec ``.5``; a blank line follows each label.
    """
    # Both shapes boil down to the same stream of (label, pair) entries.
    if get_list:
        label_entries = cleaned_classification_resorted_result
    else:
        label_entries = cleaned_classification_resorted_result.items()

    for label, seq_pred_tuple in label_entries:
        print(f"Category: {label}")
        sequence = seq_pred_tuple[0]
        score = seq_pred_tuple[1]
        print(f"{sequence:65.65}: {score:.5}")
        print()

##### Example Demo

In [None]:
## Both sort_output=-1 and top_many=1 are important in the inner resort call below:
## the cleaning function keeps a single (sequence, score) pair per label.

# Arguments for the call below: limit_value goes to both the resort step and
# the cleaning step; top_many_cat and get_list go to the cleaning step only.
limit_value = 0.1
top_many_cat = 3
get_list = False

categories_classification_additional_resort_cleaning_function(classification_resorted_dictionary_result=categories_classification_additional_resort_function(categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=-1), candidate_possible_labels, sort_output=-1, top_many=1, limit_value=limit_value), get_list=get_list, top_many_cat=top_many_cat, limit_value=limit_value)

{'travel': ('one day i will see the world', 0.994157612323761),
 'exploration': ('i will explore sweden next semester', 0.9684410691261292),
 'dancing': ('I love popping and locking!', 0.22573687136173248)}

In [None]:
# Arguments to use for function
limit_value = 0.1
top_many_cat = 2
get_list = True

cleaned_categories_classification_resorted_result_display(cleaned_classification_resorted_result=categories_classification_additional_resort_cleaning_function(classification_resorted_dictionary_result=categories_classification_additional_resort_function(categories_classification_function(bart_mnli_classifier, candidate_possible_labels, sequences_list, sort_output=-1), candidate_possible_labels, sort_output=-1, top_many=1, limit_value=limit_value), get_list=get_list, top_many_cat=top_many_cat, limit_value=limit_value), get_list=get_list)


Category: travel
one day i will see the world: 0.994157612323761

Category: exploration
i will explore sweden next semester: 0.9684410691261292



# Overall Combined Function + Lock

## Overall Combined Function (Similarity Comparison)

In [None]:
# Overall Combined Function (Similarity Comparison)

def split_and_compare(split_embed_function, compare_embed_function, categories, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_split = True , display_end = True, sort_display = 0):
    """Segment ``sentence_text`` and compare every segment with ``categories``.

    Args:
        split_embed_function: embedding function handed to
            ``semantic_segmentation_function`` for the split step.
        compare_embed_function: embedding function handed to
            ``categories_similarity_comparison_function`` for the compare step.
        categories: categories to compare the segments against.
        sentence_text: text to segment.
        intermediate: forwarded as ``intermediate_status`` to the segmenter.
        graph: forwarded as ``graph_status`` to the segmenter.
        sort_compare: forwarded as ``sort_output`` to the comparison function.
        display_split: print the segments when true.
        display_end: show the comparison result via
            ``categories_similarity_result_display`` when true.
        sort_display: forwarded to the display function
            (the demo uses ``-1``; output there is highest-first -- TODO
            confirm the exact meaning in the display helper).

    Returns:
        ``(segments, comparison_result)``.
    """
    # Step 1: split the sentence into segments.
    segments = semantic_segmentation_function(
        embedding_model_function=split_embed_function,
        sentence_text=sentence_text,
        intermediate_status=intermediate,
        graph_status=graph,
    )
    if display_split:
        print(f"Splitted texts: {segments}")

    # Step 2: score each segment against each category.
    similarity_result = categories_similarity_comparison_function(
        embedding_model_function=compare_embed_function,
        categories=categories,
        texts=segments,
        sort_output=sort_compare,
    )
    if display_end:
        categories_similarity_result_display(similarity_result, sort_display=sort_display)

    return segments, similarity_result
    

In [None]:
## Lock split and compare overall combined function
def lock_split_and_compare(split_embed_function, compare_embed_function):
    """Return a ``split_and_compare`` shortcut with both embedding functions fixed.

    Lambda-based variant. The returned callable takes the remaining
    ``split_and_compare`` arguments (same names and defaults) and forwards
    everything by keyword. A nested-function variant with the same name is
    defined right after this one in the same cell and replaces it.
    """
    return lambda categories, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_split = True, display_end = True, sort_display = 0: split_and_compare(split_embed_function=split_embed_function, compare_embed_function=compare_embed_function, categories=categories, sentence_text=sentence_text, intermediate=intermediate, graph=graph, sort_compare=sort_compare, display_split=display_split, display_end=display_end, sort_display=sort_display)
    # NOTE: the attempt below does NOT work. It fails with
    # "SyntaxError: positional argument follows keyword argument" because
    # `categories, sentence_text` are passed positionally after keyword arguments.
    # return lambda categories, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_end = True, sort_display = 0: split_and_compare(split_embed_function=split_embed_function, compare_embed_function=compare_embed_function, categories, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_end = True, sort_display = 0)

### OR, equivalently, as a nested function (this redefinition replaces the lambda version above)

def lock_split_and_compare(split_embed_function, compare_embed_function):
    """Return a ``split_and_compare`` shortcut with both embedding functions fixed.

    Args:
        split_embed_function: embedding function used for the split step.
        compare_embed_function: embedding function used for the compare step.

    Returns:
        A function taking the remaining ``split_and_compare`` arguments
        (same names and defaults) and returning whatever
        ``split_and_compare`` returns.
    """
    def lockED_split_and_compare(categories, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_split = True , display_end = True, sort_display = 0):
        # The two embedding functions are captured from the enclosing call;
        # everything else is passed straight through.
        return split_and_compare(
            split_embed_function=split_embed_function,
            compare_embed_function=compare_embed_function,
            categories=categories,
            sentence_text=sentence_text,
            intermediate=intermediate,
            graph=graph,
            sort_compare=sort_compare,
            display_split=display_split,
            display_end=display_end,
            sort_display=sort_display,
        )

    return lockED_split_and_compare


##### Example Demo

In [None]:
demo_cat = ["food", "asian food", "western food", "Home Leave Passage", "Employee flight benefits", "subsidised flight tickets for employees to go home", "covered business trip expenses"]
demo_sent = "Employees receive free flight benefits to return to home country once a year and free flight benefits for business trips"

In [None]:
# Same MiniLM embedding function for both the split and the compare step.
split_and_compare(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_MiniLM_L6_v2, demo_cat, demo_sent, graph=False, sort_display=-1)
# Trailing `pass` so the cell's last statement is not the call expression,
# which keeps the notebook from echoing the returned tuple.
pass

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: food
Similarity Level:
to return to home country once /-/ food                           : 0.20525
a year and                     /-/ food                           : 0.18797
free flight benefits for busin /-/ food                           : 0.02749
Employees receive free flight  /-/ food                           : -0.0047651
Category: asian food
Similarity Level:
to return to home country once /-/ asian food                     : 0.16057
a year and                     /-/ asian food                     : 0.061953
free flight benefits for busin /-/ asian food                     : 0.042717
Employees receive free flight  /-/ asian food                     : 0.017726
Category: western food
Similarity Level:
to return to home country once /-/ western food                   : 0.22035
a year and                     /-/ western food

In [None]:
locked_split_MiniLM_L6_v2_and_compare_MiniLM_L6_v2 = lock_split_and_compare(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_MiniLM_L6_v2)
locked_split_MiniLM_L6_v2_and_compare_MiniLM_L6_v2(demo_cat, demo_sent, sort_display=-1)
pass

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: food
Similarity Level:
to return to home country once /-/ food                           : 0.20525
a year and                     /-/ food                           : 0.18797
free flight benefits for busin /-/ food                           : 0.02749
Employees receive free flight  /-/ food                           : -0.0047651
Category: asian food
Similarity Level:
to return to home country once /-/ asian food                     : 0.16057
a year and                     /-/ asian food                     : 0.061953
free flight benefits for busin /-/ asian food                     : 0.042717
Employees receive free flight  /-/ asian food                     : 0.017726
Category: western food
Similarity Level:
to return to home country once /-/ western food                   : 0.22035
a year and                     /-/ western food

### Split and Compare Variations

#### Split and Compare (with "sub-categories"/"synonyms" of general category)

##### Overall Function (General Category with "sub-categories"/"synonyms" variation)

In [None]:
def split_and_compare_wsub(split_embed_function, compare_embed_function, categories_wsub, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_split = True , display_end = True, sort_display = 0):
    """Segment ``sentence_text`` and compare the segments with categories that have sub-categories.

    Args:
        split_embed_function: embedding function for the segmentation step.
        compare_embed_function: embedding function for the comparison step.
        categories_wsub: passed as ``categories_wsub_dict`` to the comparison
            function (the demo uses a dict of
            ``general category -> list of sub-categories``).
        sentence_text: text to segment.
        intermediate: forwarded as ``intermediate_status`` to the segmenter.
        graph: forwarded as ``graph_status`` to the segmenter.
        sort_compare: forwarded as ``sort_output`` to the comparison function.
        display_split: print the segments when true.
        display_end: show the result via
            ``categories_wsub_similarity_result_display`` when true.
        sort_display: forwarded to that display function.

    Returns:
        ``(segments, comparison_result)``.
    """
    segments = semantic_segmentation_function(
        embedding_model_function=split_embed_function,
        sentence_text=sentence_text,
        intermediate_status=intermediate,
        graph_status=graph,
    )
    if display_split:
        print(f"Splitted texts: {segments}")

    wsub_similarity_result = categories_wsub_similarity_comparison_function(
        embedding_model_function=compare_embed_function,
        categories_wsub_dict=categories_wsub,
        texts=segments,
        sort_output=sort_compare,
    )
    if display_end:
        categories_wsub_similarity_result_display(wsub_similarity_result, sort_display=sort_display)

    return segments, wsub_similarity_result

##### Overall Function - Lock (General Category with "sub-categories"/"synonyms" variation)

In [None]:
def lock_split_and_compare_wsub(split_embed_function, compare_embed_function):
    """Return a ``split_and_compare_wsub`` shortcut with both embedding functions fixed.

    Args:
        split_embed_function: embedding function used for the split step.
        compare_embed_function: embedding function used for the compare step.

    Returns:
        A function taking the remaining ``split_and_compare_wsub`` arguments
        (same names and defaults).
    """
    def lockED_split_and_compare_wsub(categories_wsub, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_split = True , display_end = True, sort_display = 0):
        # Collect the pass-through options once, then forward them by keyword.
        forwarded_options = {
            "intermediate": intermediate,
            "graph": graph,
            "sort_compare": sort_compare,
            "display_split": display_split,
            "display_end": display_end,
            "sort_display": sort_display,
        }
        return split_and_compare_wsub(
            split_embed_function,
            compare_embed_function,
            categories_wsub=categories_wsub,
            sentence_text=sentence_text,
            **forwarded_options,
        )

    return lockED_split_and_compare_wsub

#### Split and Compare [Top (Count) & Limit (Value)] - Display

##### Overall Function [Top xxx (Count) and Limit yyy (Value)]

In [None]:
def split_and_compare_wsub_top_limit(split_embed_function, compare_embed_function, categories_wsub, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_split = True , display_end = True, top_many = 5, limit_value = 0.5):
    """Segment, compare against categories with sub-categories, and display a top/limit view.

    ``top_many`` and ``limit_value`` are only passed to
    ``categories_wsub_similarity_result_display_top_limit``; the returned
    comparison result is whatever the comparison function produced.

    Args:
        split_embed_function: embedding function for the segmentation step.
        compare_embed_function: embedding function for the comparison step.
        categories_wsub: passed as ``categories_wsub_dict`` to the comparison function.
        sentence_text: text to segment.
        intermediate: forwarded as ``intermediate_status`` to the segmenter.
        graph: forwarded as ``graph_status`` to the segmenter.
        sort_compare: forwarded as ``sort_output`` to the comparison function.
        display_split: print the segments when true.
        display_end: call the top/limit display function when true.
        top_many: forwarded to the display function.
        limit_value: forwarded to the display function.

    Returns:
        ``(segments, comparison_result)``.
    """
    segments = semantic_segmentation_function(
        embedding_model_function=split_embed_function,
        sentence_text=sentence_text,
        intermediate_status=intermediate,
        graph_status=graph,
    )
    if display_split:
        print(f"Splitted texts: {segments}")

    wsub_similarity_result = categories_wsub_similarity_comparison_function(
        embedding_model_function=compare_embed_function,
        categories_wsub_dict=categories_wsub,
        texts=segments,
        sort_output=sort_compare,
    )
    if display_end:
        categories_wsub_similarity_result_display_top_limit(
            wsub_similarity_result,
            top_many=top_many,
            limit_value=limit_value,
        )

    return segments, wsub_similarity_result

##### Overall Function - Lock [Top xxx (Count) and Limit yyy (Value)]

In [None]:
def lock_split_and_compare_wsub_top_limit(split_embed_function, compare_embed_function):
    """Return a ``split_and_compare_wsub_top_limit`` shortcut with both embedding functions fixed.

    Args:
        split_embed_function: embedding function used for the split step.
        compare_embed_function: embedding function used for the compare step.

    Returns:
        A function taking the remaining ``split_and_compare_wsub_top_limit``
        arguments (same names and defaults).
    """
    def locked_split_and_compare_wsub_top_limit(categories_wsub, sentence_text, intermediate = False, graph = False, sort_compare = 0, display_split = True , display_end = True, top_many = 5, limit_value = 0.5):
        # Every argument is forwarded by keyword; only the two embedding
        # functions come from the enclosing scope.
        return split_and_compare_wsub_top_limit(
            split_embed_function=split_embed_function,
            compare_embed_function=compare_embed_function,
            categories_wsub=categories_wsub,
            sentence_text=sentence_text,
            intermediate=intermediate,
            graph=graph,
            sort_compare=sort_compare,
            display_split=display_split,
            display_end=display_end,
            top_many=top_many,
            limit_value=limit_value,
        )

    return locked_split_and_compare_wsub_top_limit

##### Example Demo

In [None]:
# Demo input: each key is a general category, each value is the list of its
# "sub-categories"/"synonyms".
cat_wsub_demo = {"Passage provided for business purpose": ["Air tickets for business trip", "Air tickets for work purposes", "Flight for business trip", "Flight for work purposes", "Travel incidental to employment"],
            "Passage provided when taking up employment and upon termination": ["Air tickets back to home country after contract end", "Air tickets back to home country after termination of employment", "Air tickets to work in Singapore", "Flight back to home country after contract end", "Flight back to home country after termination of employment", "Flight to work in Singapore"],
            "Home leave passage": ["Air passage benefit", "Air tickets to home country", "Flight to home country"]}
# Demo sentence that mentions both a home-leave flight and a business-trip flight.
hlp_biz_demo = "Employees receive free flight benefits to return to home country once a year and free flight benefits for business trips"

In [None]:
split_and_compare_wsub(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo, sort_display=-1)
pass

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
General Category: Passage provided for business purpose
Category: Passage provided for business purpose
Similarity Level:
free flight benefits for busin /-/ Passage provided for business  : 0.29568
to return to home country once /-/ Passage provided for business  : 0.17853
Employees receive free flight  /-/ Passage provided for business  : 0.14779
a year and                     /-/ Passage provided for business  : 0.065523
Category: Air tickets for business trip
Similarity Level:
free flight benefits for busin /-/ Air tickets for business trip  : 0.65982
Employees receive free flight  /-/ Air tickets for business trip  : 0.46398
to return to home country once /-/ Air tickets for business trip  : 0.11836
a year and                     /-/ Air tickets for business trip  : 0.050477
Category: Air tickets for work purposes
Similarity Level:
Em

In [None]:
split_and_compare_wsub(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_bge, cat_wsub_demo, hlp_biz_demo, sort_display=-1)
pass

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
General Category: Passage provided for business purpose
Category: Passage provided for business purpose
Similarity Level:
free flight benefits for busin /-/ Passage provided for business  : 0.60146
to return to home country once /-/ Passage provided for business  : 0.55901
Employees receive free flight  /-/ Passage provided for business  : 0.52735
a year and                     /-/ Passage provided for business  : 0.49561
Category: Air tickets for business trip
Similarity Level:
free flight benefits for busin /-/ Air tickets for business trip  : 0.80979
Employees receive free flight  /-/ Air tickets for business trip  : 0.66801
to return to home country once /-/ Air tickets for business trip  : 0.57646
a year and                     /-/ Air tickets for business trip  : 0.39768
Category: Air tickets for work purposes
Similarity Level:
free

In [None]:
split_and_compare_wsub(get_sentence_embedding_bge, get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo, sort_display=-1)
pass

Splitted texts: ['Employees receive free flight benefits to', 'return to home country once', 'a year and', 'free flight benefits for business trips']
General Category: Passage provided for business purpose
Category: Passage provided for business purpose
Similarity Level:
free flight benefits for busin /-/ Passage provided for business  : 0.29568
Employees receive free flight  /-/ Passage provided for business  : 0.16618
a year and                     /-/ Passage provided for business  : 0.065523
return to home country once    /-/ Passage provided for business  : 0.039441
Category: Air tickets for business trip
Similarity Level:
free flight benefits for busin /-/ Air tickets for business trip  : 0.65982
Employees receive free flight  /-/ Air tickets for business trip  : 0.46172
return to home country once    /-/ Air tickets for business trip  : 0.081763
a year and                     /-/ Air tickets for business trip  : 0.050477
Category: Air tickets for work purposes
Similarity Level:


In [None]:
split_and_compare_wsub(get_sentence_embedding_bge, get_sentence_embedding_bge, cat_wsub_demo, hlp_biz_demo, sort_display=-1)
pass

Splitted texts: ['Employees receive free flight benefits to', 'return to home country once', 'a year and', 'free flight benefits for business trips']
General Category: Passage provided for business purpose
Category: Passage provided for business purpose
Similarity Level:
free flight benefits for busin /-/ Passage provided for business  : 0.60146
Employees receive free flight  /-/ Passage provided for business  : 0.54829
return to home country once    /-/ Passage provided for business  : 0.54396
a year and                     /-/ Passage provided for business  : 0.49561
Category: Air tickets for business trip
Similarity Level:
free flight benefits for busin /-/ Air tickets for business trip  : 0.80979
Employees receive free flight  /-/ Air tickets for business trip  : 0.70321
return to home country once    /-/ Air tickets for business trip  : 0.57358
a year and                     /-/ Air tickets for business trip  : 0.39768
Category: Air tickets for work purposes
Similarity Level:
Empl

In [None]:
split_and_compare_wsub_top_limit(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo, top_many=3, limit_value=0.5)
pass

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
General Category: Passage provided for business purpose
Sub-Categories: ('Passage provided for business purpose', 'Air tickets for business trip', 'Air tickets for work purposes', 'Flight for business trip', 'Flight for work purposes', 'Travel incidental to employment')
Similarity Level Of Top 3 (Limit=0.5):
free flight benefits for busin /-/ Flight for business trip       : 0.75365
free flight benefits for busin /-/ Air tickets for business trip  : 0.65982
free flight benefits for busin /-/ Flight for work purposes       : 0.60962

General Category: Passage provided when taking up employment and upon termination
Sub-Categories: ('Passage provided when taking up employment and upon termination', 'Air tickets back to home country after contract end', 'Air tickets back to home country after termination of employment', 'Air tickets to work i

In [None]:
# Naming: <split model>_<compare model>_s_c<N>, where
#   c1 = split_and_compare, c2 = split_and_compare_wsub, c3 = split_and_compare_wsub_top_limit.
minilm62_bge_s_c1 = lock_split_and_compare(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_bge)
bge_minilm62_s_c2 = lock_split_and_compare_wsub(get_sentence_embedding_bge, get_sentence_embedding_MiniLM_L6_v2)

# Two top/limit variants that use the same model for both steps.
minilm62_minilm62_s_c3 = lock_split_and_compare_wsub_top_limit(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_MiniLM_L6_v2)
bge_bge_s_c3 = lock_split_and_compare_wsub_top_limit(get_sentence_embedding_bge, get_sentence_embedding_bge)

In [None]:
minilm62_bge_s_c1(list(cat_wsub_demo.keys())[0], hlp_biz_demo, )

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: Passage provided for business purpose
Similarity Level:
Employees receive free flight  /-/ Passage provided for business  : 0.52735
to return to home country once /-/ Passage provided for business  : 0.55901
a year and                     /-/ Passage provided for business  : 0.49561
free flight benefits for busin /-/ Passage provided for business  : 0.60146


(['Employees receive free flight benefits',
  'to return to home country once',
  'a year and',
  'free flight benefits for business trips'],
 {'Passage provided for business purpose': {('Employees receive free flight benefits',
    'Passage provided for business purpose'): 0.5273454640409909,
   ('to return to home country once',
    'Passage provided for business purpose'): 0.5590083551667142,
   ('a year and', 'Passage provided for business purpose'): 0.4956094203318848,
   ('free flight benefits for business trips',
    'Passage provided for business purpose'): 0.6014551911575552}})

In [None]:
bge_minilm62_s_c2(cat_wsub_demo, hlp_biz_demo)
pass

Splitted texts: ['Employees receive free flight benefits to', 'return to home country once', 'a year and', 'free flight benefits for business trips']
General Category: Passage provided for business purpose
Category: Passage provided for business purpose
Similarity Level:
Employees receive free flight  /-/ Passage provided for business  : 0.16618
return to home country once    /-/ Passage provided for business  : 0.039441
a year and                     /-/ Passage provided for business  : 0.065523
free flight benefits for busin /-/ Passage provided for business  : 0.29568
Category: Air tickets for business trip
Similarity Level:
Employees receive free flight  /-/ Air tickets for business trip  : 0.46172
return to home country once    /-/ Air tickets for business trip  : 0.081763
a year and                     /-/ Air tickets for business trip  : 0.050477
free flight benefits for busin /-/ Air tickets for business trip  : 0.65982
Category: Air tickets for work purposes
Similarity Level:


In [None]:
bge_bge_s_c3(cat_wsub_demo, hlp_biz_demo, top_many=3)
pass

Splitted texts: ['Employees receive free flight benefits to', 'return to home country once', 'a year and', 'free flight benefits for business trips']
General Category: Passage provided for business purpose
Sub-Categories: ('Passage provided for business purpose', 'Air tickets for business trip', 'Air tickets for work purposes', 'Flight for business trip', 'Flight for work purposes', 'Travel incidental to employment')
Similarity Level Of Top 3 (Limit=0.5):
free flight benefits for busin /-/ Flight for business trip       : 0.81393
free flight benefits for busin /-/ Air tickets for business trip  : 0.80979
free flight benefits for busin /-/ Flight for work purposes       : 0.77811

General Category: Passage provided when taking up employment and upon termination
Sub-Categories: ('Passage provided when taking up employment and upon termination', 'Air tickets back to home country after contract end', 'Air tickets back to home country after termination of employment', 'Air tickets to work i

In [None]:
minilm62_minilm62_s_c3(cat_wsub_demo, hlp_biz_demo, top_many=3)
pass

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
General Category: Passage provided for business purpose
Sub-Categories: ('Passage provided for business purpose', 'Air tickets for business trip', 'Air tickets for work purposes', 'Flight for business trip', 'Flight for work purposes', 'Travel incidental to employment')
Similarity Level Of Top 3 (Limit=0.5):
free flight benefits for busin /-/ Flight for business trip       : 0.75365
free flight benefits for busin /-/ Air tickets for business trip  : 0.65982
free flight benefits for busin /-/ Flight for work purposes       : 0.60962

General Category: Passage provided when taking up employment and upon termination
Sub-Categories: ('Passage provided when taking up employment and upon termination', 'Air tickets back to home country after contract end', 'Air tickets back to home country after termination of employment', 'Air tickets to work i

### Split And Compare (Cleaned Up Top and Limit) + Lock Version

In [None]:
def split_and_compare_wsub_top_limit_cleaned(split_embed_function, compare_embed_function, categories_wsub, sentence_text, intermediate = False, graph = False, sort_compare = 0, get_inner_list = False, get_list = False, display_split = True , display_end = True, top_many_cat = 3, limit_value = 0.5, extra_clean_output=False):
    """Segment, compare, then reduce the result to the best match per general category.

    Pipeline: segmentation -> wsub similarity comparison -> resort
    (``sort_within_cat=-1``, ``top_many_wsub=1``, ``limit_value``) ->
    cleaning (``top_many_cat``) -> optional display.

    Args:
        split_embed_function: embedding function for the segmentation step.
        compare_embed_function: embedding function for the comparison step.
        categories_wsub: passed as ``categories_wsub_dict`` to the comparison function.
        sentence_text: text to segment.
        intermediate: forwarded as ``intermediate_status`` to the segmenter.
        graph: forwarded as ``graph_status`` to the segmenter.
        sort_compare: forwarded as ``sort_output`` to the comparison function.
        get_inner_list: forwarded to both the resort and the cleaning function.
        get_list: forwarded to the cleaning and display functions; also
            selects list vs dict handling for ``extra_clean_output``.
        display_split: print the segments when true.
        display_end: display the cleaned result when true.
        top_many_cat: forwarded to the cleaning function.
        limit_value: forwarded to the resort function.
        extra_clean_output: when true, keep only element ``[1]`` of each
            category's entry (the similarity value in the demo output).

    Returns:
        ``(segments, cleaned_result)``, where ``cleaned_result`` is reduced to
        ``category -> value`` (dict) or ``(category, value)`` pairs (list)
        when ``extra_clean_output`` is true.
    """
    segments = semantic_segmentation_function(
        embedding_model_function=split_embed_function,
        sentence_text=sentence_text,
        intermediate_status=intermediate,
        graph_status=graph,
    )
    if display_split:
        print(f"Splitted texts: {segments}")

    raw_comparison = categories_wsub_similarity_comparison_function(
        embedding_model_function=compare_embed_function,
        categories_wsub_dict=categories_wsub,
        texts=segments,
        sort_output=sort_compare,
    )
    ## sort_within_cat=-1 and top_many_wsub=1 are both needed for the cleaning step that follows!!
    resorted_comparison = categories_wsub_similarity_comparison_resort_function(
        categories_wsub_similarity_comparison_result_dict=raw_comparison,
        get_inner_list=get_inner_list,
        sort_within_cat=-1,
        top_many_wsub=1,
        limit_value=limit_value,
    )
    cleaned_comparison = categories_wsub_similarity_comparison_resort_cleaning_function(
        resorted_categories_wsub_similarity_comparison_dict=resorted_comparison,
        get_inner_list=get_inner_list,
        get_list=get_list,
        top_many_cat=top_many_cat,
    )
    if display_end:
        cleaned_categories_wsub_similarity_comparison_resorted_result_display(cleaned_comparison, get_list=get_list)

    if not extra_clean_output:
        return segments, cleaned_comparison
    if get_list:
        return segments, [(category, entry[1]) for category, entry in cleaned_comparison]
    return segments, {category: entry[1] for category, entry in cleaned_comparison.items()}

In [None]:
def lock_split_and_compare_wsub_top_limit_cleaned(split_embed_function, compare_embed_function):
    """Return a ``split_and_compare_wsub_top_limit_cleaned`` shortcut with both embedding functions fixed.

    Args:
        split_embed_function: embedding function used for the split step.
        compare_embed_function: embedding function used for the compare step.

    Returns:
        A function taking the remaining
        ``split_and_compare_wsub_top_limit_cleaned`` arguments (same names
        and defaults).
    """
    def locked_split_and_compare_wsub_top_limit_cleaned(categories_wsub, sentence_text, intermediate = False, graph = False, sort_compare = 0, get_inner_list = False, get_list = False, display_split = True , display_end = True, top_many_cat = 3, limit_value = 0.5, extra_clean_output=False):
        # Pure pass-through: all arguments are forwarded by keyword.
        return split_and_compare_wsub_top_limit_cleaned(
            split_embed_function=split_embed_function,
            compare_embed_function=compare_embed_function,
            categories_wsub=categories_wsub,
            sentence_text=sentence_text,
            intermediate=intermediate,
            graph=graph,
            sort_compare=sort_compare,
            get_inner_list=get_inner_list,
            get_list=get_list,
            display_split=display_split,
            display_end=display_end,
            top_many_cat=top_many_cat,
            limit_value=limit_value,
            extra_clean_output=extra_clean_output,
        )

    return locked_split_and_compare_wsub_top_limit_cleaned

##### Example Demo

In [None]:
split_and_compare_wsub_top_limit_cleaned(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo, display_end=True, get_inner_list=False, get_list=False, top_many_cat=2)

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: Passage provided for business purpose
free flight benefits for busin /-/ Flight for business trip      : 0.75365

Category: Home leave passage
to return to home country once /-/ Flight to home country        : 0.66566



(['Employees receive free flight benefits',
  'to return to home country once',
  'a year and',
  'free flight benefits for business trips'],
 {'Passage provided for business purpose': (('free flight benefits for business trips',
    'Flight for business trip'),
   0.7536517813135932),
  'Home leave passage': (('to return to home country once',
    'Flight to home country'),
   0.6656597017594097)})

In [None]:
split_and_compare_wsub_top_limit_cleaned(get_sentence_embedding_MiniLM_L6_v2, get_sentence_embedding_MiniLM_L6_v2, cat_wsub_demo, hlp_biz_demo, display_end=True, get_inner_list=False, get_list=True, top_many_cat=2, extra_clean_output=True)

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: Passage provided for business purpose
free flight benefits for busin /-/ Flight for business trip      : 0.75365

Category: Home leave passage
to return to home country once /-/ Flight to home country        : 0.66566



(['Employees receive free flight benefits',
  'to return to home country once',
  'a year and',
  'free flight benefits for business trips'],
 [('Passage provided for business purpose', 0.7536517813135932),
  ('Home leave passage', 0.6656597017594097)])

## Overall Combined Function (Zero Shot Classification)

### Normal Split and Classify + Lock Version

In [None]:
## Split and Classify, using Zero Shot Classification
def split_and_classify(split_embed_function, classify_function, candidate_possible_labels, sentence_text, intermediate = False, graph = False, multi_label=True, sort_classify = 0, additional_resort = True, display_split = True , display_end = True, sort_display = 0):
    """Segment ``sentence_text`` and classify every segment against the candidate labels.

    Args:
        split_embed_function: embedding function for the segmentation step.
        classify_function: passed as ``classification_model_function`` to
            ``categories_classification_function``.
        candidate_possible_labels: candidate labels for classification.
        sentence_text: text to segment.
        intermediate: forwarded as ``intermediate_status`` to the segmenter.
        graph: forwarded as ``graph_status`` to the segmenter.
        multi_label: forwarded to the classification function.
        sort_classify: forwarded as ``sort_output`` to the classification
            function and to the resort used for the return value.
        additional_resort: when true, return the resorted result (built with
            ``top_many=-1``, ``limit_value=0``) instead of the raw
            classification result.
        display_split: print the segments when true.
        display_end: display a separately resorted view (``sort_output=0``,
            ``top_many=-1``, ``limit_value=-1``) when true.
        sort_display: forwarded to the display function.

    Returns:
        ``(segments, result)`` where ``result`` is the resorted result if
        ``additional_resort`` is true, otherwise the raw classification result.
    """
    segments = semantic_segmentation_function(
        embedding_model_function=split_embed_function,
        sentence_text=sentence_text,
        intermediate_status=intermediate,
        graph_status=graph,
    )
    if display_split:
        print(f"Splitted texts: {segments}")

    classification_result = categories_classification_function(
        classification_model_function=classify_function,
        categories_candidate_labels=candidate_possible_labels,
        texts=segments,
        multi_label=multi_label,
        sort_output=sort_classify,
    )

    # What gets returned: the raw result, or its resorted form.
    returned_result = classification_result
    if additional_resort:
        returned_result = categories_classification_additional_resort_function(
            seq_classified_dictionary=classification_result,
            categories_candidate_labels=candidate_possible_labels,
            sort_output=sort_classify,
            top_many=-1,
            limit_value=0,
        )

    if display_end:
        # The display always works from its own resorted copy, regardless
        # of additional_resort.
        display_view = categories_classification_additional_resort_function(
            seq_classified_dictionary=classification_result,
            categories_candidate_labels=candidate_possible_labels,
            sort_output=0,
            top_many=-1,
            limit_value=-1,
        )
        categories_classification_resorted_result_display(
            classification_resorted_dictionary_result=display_view,
            sort_display=sort_display,
            top_many=-1,
            limit_value=-1,
        )

    return segments, returned_result

In [None]:
def lock_split_and_classify(split_embed_function, classify_function):
    """Return a ``split_and_classify`` shortcut with the split and classify functions fixed.

    Args:
        split_embed_function: embedding function used for the split step.
        classify_function: classification function used for the classify step.

    Returns:
        A function taking the remaining ``split_and_classify`` arguments
        (same names and defaults).
    """
    def locked_split_and_classify(candidate_possible_labels, sentence_text, intermediate = False, graph = False, multi_label=True, sort_classify = 0, additional_resort = True, display_split = True , display_end = True, sort_display = 0):
        # Pure pass-through: all arguments are forwarded by keyword.
        return split_and_classify(
            split_embed_function=split_embed_function,
            classify_function=classify_function,
            candidate_possible_labels=candidate_possible_labels,
            sentence_text=sentence_text,
            intermediate=intermediate,
            graph=graph,
            multi_label=multi_label,
            sort_classify=sort_classify,
            additional_resort=additional_resort,
            display_split=display_split,
            display_end=display_end,
            sort_display=sort_display,
        )

    return locked_split_and_classify

### Split and Classify [Top xxx (Count) and Limit yyy (Value)] + Lock Version

In [None]:
def split_and_classify_top_limit(split_embed_function, classify_function, candidate_possible_labels, sentence_text, intermediate = False, graph = False, multi_label=True, sort_classify = 0, additional_resort = True, display_split = True , display_end = True, sort_display = 0, top_many=5, limit_value=0.5):
    """Segment and classify, applying ``top_many`` / ``limit_value`` to the resort and the display.

    Args:
        split_embed_function: embedding function for the segmentation step.
        classify_function: passed as ``classification_model_function`` to
            ``categories_classification_function``.
        candidate_possible_labels: candidate labels for classification.
        sentence_text: text to segment.
        intermediate: forwarded as ``intermediate_status`` to the segmenter.
        graph: forwarded as ``graph_status`` to the segmenter.
        multi_label: forwarded to the classification function.
        sort_classify: forwarded as ``sort_output`` to the classification
            function and to the resort used for the return value.
        additional_resort: when true, return the resorted result (built with
            ``top_many`` and ``limit_value``) instead of the raw result.
        display_split: print the segments when true.
        display_end: when true, build an unrestricted resorted view
            (``sort_output=0``, ``top_many=-1``, ``limit_value=-1``) and pass
            it to the display function together with ``top_many`` and
            ``limit_value``.
        sort_display: forwarded to the display function.
        top_many: forwarded to the resort (return value) and the display.
        limit_value: forwarded to the resort (return value) and the display.

    Returns:
        ``(segments, result)`` where ``result`` is the resorted result if
        ``additional_resort`` is true, otherwise the raw classification result.
    """
    segments = semantic_segmentation_function(
        embedding_model_function=split_embed_function,
        sentence_text=sentence_text,
        intermediate_status=intermediate,
        graph_status=graph,
    )
    if display_split:
        print(f"Splitted texts: {segments}")

    classification_result = categories_classification_function(
        classification_model_function=classify_function,
        categories_candidate_labels=candidate_possible_labels,
        texts=segments,
        multi_label=multi_label,
        sort_output=sort_classify,
    )

    # What gets returned: the raw result, or its top/limit-restricted resort.
    returned_result = classification_result
    if additional_resort:
        returned_result = categories_classification_additional_resort_function(
            seq_classified_dictionary=classification_result,
            categories_candidate_labels=candidate_possible_labels,
            sort_output=sort_classify,
            top_many=top_many,
            limit_value=limit_value,
        )

    if display_end:
        # The display receives an unrestricted resort and is given
        # top_many / limit_value itself.
        display_view = categories_classification_additional_resort_function(
            seq_classified_dictionary=classification_result,
            categories_candidate_labels=candidate_possible_labels,
            sort_output=0,
            top_many=-1,
            limit_value=-1,
        )
        categories_classification_resorted_result_display(
            classification_resorted_dictionary_result=display_view,
            sort_display=sort_display,
            top_many=top_many,
            limit_value=limit_value,
        )

    return segments, returned_result

In [None]:
def lock_split_and_classify_top_limit(split_embed_function, classify_function):
    """Bind the two model callables and return a ready-to-use classifier.

    The returned callable takes the remaining arguments of
    ``split_and_classify_top_limit`` (same names, same defaults) and forwards
    them, together with the bound ``split_embed_function`` and
    ``classify_function``, to that function.
    """

    def locked(
        candidate_possible_labels,
        sentence_text,
        intermediate=False,
        graph=False,
        multi_label=True,
        sort_classify=0,
        additional_resort=True,
        display_split=True,
        display_end=True,
        sort_display=0,
        top_many=5,
        limit_value=0.5,
    ):
        return split_and_classify_top_limit(
            split_embed_function=split_embed_function,
            classify_function=classify_function,
            candidate_possible_labels=candidate_possible_labels,
            sentence_text=sentence_text,
            intermediate=intermediate,
            graph=graph,
            multi_label=multi_label,
            sort_classify=sort_classify,
            additional_resort=additional_resort,
            display_split=display_split,
            display_end=display_end,
            sort_display=sort_display,
            top_many=top_many,
            limit_value=limit_value,
        )

    return locked

### Split And Classify (Cleaned Up Top and Limit) + Lock Version

In [None]:
def split_and_classify_top_limit_cleaned(
    split_embed_function,
    classify_function,
    candidate_possible_labels,
    sentence_text,
    intermediate=False,
    graph=False,
    multi_label=True,
    get_list=False,
    display_split=True,
    display_end=True,
    top_many_cat=3,
    limit_value=0.5,
    extra_clean_output=False,
):
    """Segment, classify, resort and clean a text into a per-category summary.

    Every step delegates to a helper defined elsewhere in this notebook:
    segmentation, classification, a resort with ``sort_output=-1`` and
    ``top_many=1``, and a cleaning pass driven by ``get_list``,
    ``top_many_cat`` and ``limit_value``.

    Args:
        split_embed_function: Embedding callable forwarded to the segmenter.
        classify_function: Classifier callable forwarded to the scoring step.
        candidate_possible_labels: Candidate category labels.
        sentence_text: Text to segment and classify.
        intermediate: Forwarded to the segmenter as ``intermediate_status``.
        graph: Forwarded to the segmenter as ``graph_status``.
        multi_label: Forwarded to the classification helper.
        get_list: Forwarded to the cleaning and display helpers. Also selects
            list versus dict handling when ``extra_clean_output`` is true.
        display_split: Whether to print the list of segments.
        display_end: Whether to print the cleaned result.
        top_many_cat: Forwarded to the cleaning helper.
        limit_value: Score threshold forwarded to the resort and cleaning
            helpers.
        extra_clean_output: When true, keep only element ``[1]`` of each
            category's entry in the returned result.

    Returns:
        A ``(segments, result)`` tuple. By default ``result`` is whatever the
        cleaning helper produced. With ``extra_clean_output`` it is a list of
        ``(label, entry[1])`` pairs when ``get_list`` is true, otherwise a
        ``{label: entry[1]}`` dict.
    """
    segments = semantic_segmentation_function(
        embedding_model_function=split_embed_function,
        sentence_text=sentence_text,
        intermediate_status=intermediate,
        graph_status=graph,
    )
    if display_split:
        print(f"Splitted texts: {segments}")

    raw_scores = categories_classification_function(
        classification_model_function=classify_function,
        categories_candidate_labels=candidate_possible_labels,
        texts=segments,
        multi_label=multi_label,
    )

    # sort_output=-1 and top_many=1 are both required here; the original
    # author flagged them as important. NOTE(review): presumably the cleaning
    # helper relies on that exact shape - confirm before changing either value.
    resorted = categories_classification_additional_resort_function(
        seq_classified_dictionary=raw_scores,
        categories_candidate_labels=candidate_possible_labels,
        sort_output=-1,
        top_many=1,
        limit_value=limit_value,
    )
    cleaned = categories_classification_additional_resort_cleaning_function(
        classification_resorted_dictionary_result=resorted,
        get_list=get_list,
        top_many_cat=top_many_cat,
        limit_value=limit_value,
    )

    if display_end:
        cleaned_categories_classification_resorted_result_display(
            cleaned_classification_resorted_result=cleaned,
            get_list=get_list,
        )

    if not extra_clean_output:
        return segments, cleaned

    # Strip each entry down to its element [1].
    if get_list:
        # List form: iterate (label, entry) pairs directly.
        slim_pairs = [(label, entry[1]) for label, entry in cleaned]
        return segments, slim_pairs

    # Dict form: iterate the label -> entry mapping.
    slim_map = {label: entry[1] for label, entry in cleaned.items()}
    return segments, slim_map

In [None]:
def lock_split_and_classify_top_limit_cleaned(split_embed_function, classify_function):
    """Bind the two model callables and return a ready-to-use cleaned classifier.

    The returned callable takes the remaining arguments of
    ``split_and_classify_top_limit_cleaned`` (same names, same defaults) and
    forwards them, together with the bound ``split_embed_function`` and
    ``classify_function``, to that function.
    """

    def locked(
        candidate_possible_labels,
        sentence_text,
        intermediate=False,
        graph=False,
        multi_label=True,
        get_list=False,
        display_split=True,
        display_end=True,
        top_many_cat=3,
        limit_value=0.5,
        extra_clean_output=False,
    ):
        return split_and_classify_top_limit_cleaned(
            split_embed_function=split_embed_function,
            classify_function=classify_function,
            candidate_possible_labels=candidate_possible_labels,
            sentence_text=sentence_text,
            intermediate=intermediate,
            graph=graph,
            multi_label=multi_label,
            get_list=get_list,
            display_split=display_split,
            display_end=display_end,
            top_many_cat=top_many_cat,
            limit_value=limit_value,
            extra_clean_output=extra_clean_output,
        )

    return locked

##### Example Demo

In [None]:
# Demo: run the basic split-and-classify pipeline on the sample passage.
# The call is the cell's last expression, so the notebook also echoes the
# returned (segments, per-category scores) tuple below the printed report.
split_and_classify(get_sentence_embedding_MiniLM_L6_v2, bart_mnli_classifier, new_temp_l_cat, hlp_biz_demo, sort_classify=-1)

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: Passage provided for business purpose
Employees receive free flight benefits : 0.964426577091217
to return to home country once : 0.13720235228538513
a year and : 0.5728473663330078
free flight benefits for business trips : 0.9442800283432007

Category: Passage provided when taking up employment and upon termination
Employees receive free flight benefits : 0.48903337121009827
to return to home country once : 0.06560046970844269
a year and : 0.2950785160064697
free flight benefits for business trips : 0.010641817934811115

Category: Home leave passage
Employees receive free flight benefits : 0.49825218319892883
to return to home country once : 0.9193651676177979
a year and : 0.12018565833568573
free flight benefits for business trips : 0.07301679998636246



(['Employees receive free flight benefits',
  'to return to home country once',
  'a year and',
  'free flight benefits for business trips'],
 {'Passage provided for business purpose': {'Employees receive free flight benefits': 0.964426577091217,
   'free flight benefits for business trips': 0.9442800283432007,
   'a year and': 0.5728473663330078,
   'to return to home country once': 0.13720235228538513},
  'Passage provided when taking up employment and upon termination': {'Employees receive free flight benefits': 0.48903337121009827,
   'a year and': 0.2950785160064697,
   'to return to home country once': 0.06560046970844269,
   'free flight benefits for business trips': 0.010641817934811115},
  'Home leave passage': {'to return to home country once': 0.9193651676177979,
   'Employees receive free flight benefits': 0.49825218319892883,
   'a year and': 0.12018565833568573,
   'free flight benefits for business trips': 0.07301679998636246}})

In [None]:
# Demo: top/limit variant called with top_many=3 and limit_value=-1.
split_and_classify_top_limit(get_sentence_embedding_MiniLM_L6_v2, bart_mnli_classifier, new_temp_l_cat, hlp_biz_demo, display_end=True, sort_classify=-1, top_many=3, limit_value=-1)
# Trailing `pass` keeps the notebook from echoing the returned tuple, so only
# the printed report appears in the cell output.
pass


Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: Passage provided for business purpose
Employees receive free flight benefits : 0.964426577091217
to return to home country once : 0.13720235228538513
a year and : 0.5728473663330078
free flight benefits for business trips : 0.9442800283432007

Category: Passage provided when taking up employment and upon termination
Employees receive free flight benefits : 0.48903337121009827
to return to home country once : 0.06560046970844269
a year and : 0.2950785160064697
free flight benefits for business trips : 0.010641817934811115

Category: Home leave passage
Employees receive free flight benefits : 0.49825218319892883
to return to home country once : 0.9193651676177979
a year and : 0.12018565833568573
free flight benefits for business trips : 0.07301679998636246



In [None]:
# Demo: top/limit variant with sort_display=-1, top_many=2 and
# limit_value=0.45 applied to the printed report.
split_and_classify_top_limit(get_sentence_embedding_MiniLM_L6_v2, bart_mnli_classifier, new_temp_l_cat, hlp_biz_demo, display_end=True, sort_classify=-1, sort_display=-1, top_many=2, limit_value=0.45)
# Trailing `pass` suppresses the notebook echo of the returned tuple.
pass

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: Passage provided for business purpose
Employees receive free flight benefits : 0.964426577091217
free flight benefits for business trips : 0.9442800283432007

Category: Passage provided when taking up employment and upon termination
Employees receive free flight benefits : 0.48903337121009827

Category: Home leave passage
to return to home country once : 0.9193651676177979
Employees receive free flight benefits : 0.49825218319892883



In [None]:
# Demo: cleaned variant in list mode (get_list=True) with top_many_cat=2 and
# limit_value=0.5.
split_and_classify_top_limit_cleaned(get_sentence_embedding_MiniLM_L6_v2, bart_mnli_classifier, new_temp_l_cat, hlp_biz_demo, display_end=True, get_list=True, top_many_cat=2, limit_value=0.5)
# Trailing `pass` suppresses the notebook echo of the returned tuple.
pass

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: Passage provided for business purpose
Employees receive free flight benefits: 0.964426577091217

Category: Home leave passage
to return to home country once: 0.9193651676177979



In [None]:
# Demo: cleaned variant in dict mode with extra_clean_output=True. The call is
# the cell's last expression, so the notebook echoes the returned
# (segments, {label: score}) tuple after the printed report.
split_and_classify_top_limit_cleaned(get_sentence_embedding_MiniLM_L6_v2, bart_mnli_classifier, new_temp_l_cat, hlp_biz_demo, display_end=True, get_list=False, top_many_cat=2, limit_value=0.5, extra_clean_output=True)

Splitted texts: ['Employees receive free flight benefits', 'to return to home country once', 'a year and', 'free flight benefits for business trips']
Category: Passage provided for business purpose
Employees receive free flight benefits                           : 0.96443

Category: Home leave passage
to return to home country once                                   : 0.91937



(['Employees receive free flight benefits',
  'to return to home country once',
  'a year and',
  'free flight benefits for business trips'],
 {'Passage provided for business purpose': 0.964426577091217,
  'Home leave passage': 0.9193651676177979})