# Semantic Segmentation Function Section

### Semantic Segmentation: Preparation Functions

In [None]:
## Semantic segmentation: preparation functions (sliding windows, depth scores, thresholding, tokenising)

from itertools import islice

def window(seq, n=3):
    """Yield overlapping tuples of ``n`` consecutive items from ``seq``.

    The window advances one item at a time. For ``n >= 1`` nothing is
    yielded when ``seq`` holds fewer than ``n`` items.
    """
    source = iter(seq)
    # Prime the first window with up to n leading items.
    current = tuple(islice(source, n))
    if len(current) == n:
        yield current
    # Every further item pushes the oldest one out of the window.
    for item in source:
        current = (*current[1:], item)
        yield current
        
        
        
        
def climb(co_score_list, list_index, mode = "l"):
    """Return the peak reached by walking uphill from ``list_index``.

    Starting at ``co_score_list[list_index]``, step left (``mode == "l"``)
    or right (any other ``mode``) for as long as each next score is strictly
    greater than the current peak, and return the highest score reached.

    The peak is seeded from the starting element itself rather than from 0,
    so starting scores that are zero or negative still climb to their
    neighbouring peak and the result is always a value from the list.

    Args:
        co_score_list: Indexable sequence of comparable scores.
        list_index: Position to start climbing from.
        mode: ``"l"`` to climb towards index 0; anything else climbs
            towards the end of the list.

    Returns:
        The highest score reached, or ``0`` when ``list_index`` is already
        past the boundary of the chosen direction (negative for ``"l"``,
        ``>= len(co_score_list)`` otherwise).
    """
    going_left = mode == "l"
    step = -1 if going_left else 1
    list_len = len(co_score_list)

    def in_bounds(index):
        # Only the boundary in the direction of travel is checked.
        return index >= 0 if going_left else index < list_len

    if not in_bounds(list_index):
        return 0

    peak = co_score_list[list_index]
    list_index += step
    # Strict comparison: a plateau or a dip ends the climb.
    while in_bounds(list_index) and co_score_list[list_index] > peak:
        peak = co_score_list[list_index]
        list_index += step
    return peak
    
def get_depth_score_list(co_score_list):
    """Turn a sequence of coherence scores into a NumPy array of depth scores.

    For each position the depth is
    ``0.5 * (left_peak + right_peak - 2 * score)``, where the two peaks are
    the values returned by ``climb(..., "l")`` and ``climb(..., "r")`` from
    that position.
    """
    depth_scores = [
        0.5 * (
            climb(co_score_list, position, "l")
            + climb(co_score_list, position, "r")
            - (2 * score)
        )
        for position, score in enumerate(co_score_list)
    ]
    return np.array(depth_scores)




import matplotlib.pyplot as plt

from scipy.signal import argrelmax

def get_local_maxima(depth_scores, order=1):
    """Keep only the local maxima of ``depth_scores`` and zero everything else.

    Maxima are located with ``scipy.signal.argrelmax`` using the given
    ``order``. The result is a new array with the same length as
    ``depth_scores``.
    """
    # argrelmax returns one index array per axis; the first one is used.
    peak_positions, *_ = argrelmax(depth_scores, order=order)
    peaks_only = np.zeros(len(depth_scores))
    peaks_only[peak_positions] = depth_scores[peak_positions]
    return peaks_only

def compute_threshold(scores):
    """Return the cut-off used to select segment breaks from ``scores``.

    The threshold is ``mean - std / 2`` computed over the non-zero entries
    of ``scores`` only.

    Args:
        scores: Array-like of numeric scores; zeros are ignored.

    Returns:
        The threshold, or NaN when ``scores`` has no non-zero entry. NaN
        compares false against everything, so ``scores >= threshold``
        selects nothing in that case. Returning it explicitly avoids the
        "mean of empty slice" RuntimeWarnings NumPy would otherwise emit.
    """
    scores = np.asarray(scores)
    nonzero_scores = scores[np.nonzero(scores)]
    if nonzero_scores.size == 0:
        return np.nan
    # Tuning idea: dividing the std by 3 or 4 instead of 2 raises the
    # threshold and makes the selection pickier; subtracting the full std
    # lowers it.
    return np.mean(nonzero_scores) - (np.std(nonzero_scores) / 2)

def get_threshold_segments(scores, threshold=0.1):
    """Return the indices of ``scores`` whose value is at least ``threshold``."""
    at_or_above = scores >= threshold
    return np.nonzero(at_or_above)[0]




def primitively_naive_tokeniser(text):
    """Split ``text`` on single space characters and return the pieces.

    Consecutive spaces produce empty-string tokens, and no other whitespace
    is treated as a separator.
    """
    return text.split(" ")

### Semantic Segmentation: Pipeline Steps

In [None]:
## Semantic segmentation: pipeline steps

# Number of tokens in each sliding window. Also used (as WINDOW_SIZE - 1) to
# shift break indices back onto token positions when cutting the sentence.
WINDOW_SIZE = 3

def sentence_to_sliding_window(sentence_s):
    """Return the text of every ``WINDOW_SIZE``-token sliding window of ``sentence_s``.

    The sentence is tokenised with ``primitively_naive_tokeniser`` and each
    window's tokens are re-joined with single spaces.
    """
    tokens = primitively_naive_tokeniser(sentence_s)
    return [' '.join(token_window) for token_window in window(tokens, WINDOW_SIZE)]

def coherence_score_list_from_embedding_list(window_splited_embedding_list):
    """Score each pair of adjacent embeddings with ``cosine_sim``.

    Returns a list with one score per neighbouring pair, i.e. one fewer
    entry than there are embeddings. ``cosine_sim`` is defined outside this
    section.
    """
    earlier = window_splited_embedding_list[:-1]
    later = window_splited_embedding_list[1:]
    scores = []
    for left, right in zip(earlier, later):
        scores.append(cosine_sim(left, right))
    return scores

def plot_data_points(vary_data, thres = -1):
    """Plot ``vary_data`` and show the figure.

    When ``thres`` is anything other than the sentinel ``-1``, a flat line
    at that value is drawn across the same number of points.
    """
    plt.plot(vary_data)
    if thres != -1:
        plt.plot([thres] * len(vary_data))
    plt.show()

def filtered_indexes_list_to_splitted_segments_by_semantics(original_sent, filtered_indexes_list):
    """Cut ``original_sent`` into text segments at the selected break points.

    Break points are the indices of ``filtered_indexes_list`` whose score
    reaches ``compute_threshold(filtered_indexes_list)``. Each index is
    shifted by ``WINDOW_SIZE - 1`` to obtain the token position where a new
    segment starts. Returns the list of segments, each joined with single
    spaces.
    """
    tokens = primitively_naive_tokeniser(original_sent)
    cutoff = compute_threshold(filtered_indexes_list)
    break_ids = get_threshold_segments(filtered_indexes_list, cutoff)

    # Segment boundaries: sentence start, every shifted break, sentence end.
    boundaries = [0]
    boundaries.extend(break_id + (WINDOW_SIZE - 1) for break_id in break_ids)
    boundaries.append(len(tokens))

    segments = []
    for start, stop in zip(boundaries[:-1], boundaries[1:]):
        segments.append(" ".join(tokens[start:stop]))
    return segments

### Semantic Segmentation Function

In [None]:
## Semantic Segmentation Function

def semantic_segmentation_function(embedding_model_function, sentence_text, intermediate_status = False, graph_status = False):
    """Split ``sentence_text`` into segments using embedding coherence.

    Steps: sliding-window texts -> embeddings -> coherence scores between
    adjacent windows -> depth scores -> local maxima only -> thresholded
    break points -> segments.

    Args:
        embedding_model_function: Callable invoked once with the full list
            of window strings. Its result is passed to
            ``coherence_score_list_from_embedding_list``, so it should
            yield one embedding per window.
        sentence_text: The text to segment.
        intermediate_status: When truthy, print every intermediate value.
        graph_status: When truthy, plot the score series of each stage.

    Returns:
        The list of segment strings produced by
        ``filtered_indexes_list_to_splitted_segments_by_semantics``.
    """
    def show_value(label, value):
        # Debug print of one intermediate result.
        if intermediate_status:
            print(f"{label}: {value}")

    def show_plot(heading, *plot_args):
        # Announce and draw one diagnostic plot.
        if graph_status:
            print(heading)
            plot_data_points(*plot_args)

    sliding_windows = sentence_to_sliding_window(sentence_text)
    show_value("windowed_parts", sliding_windows)

    # All windows are embedded in a single batched call. A model that only
    # accepts one string at a time would need a per-window loop here instead.
    window_embeddings = embedding_model_function(sliding_windows)
    show_value("embedding_list", window_embeddings)
    # The raw embeddings are deliberately not plotted: that graph was tried
    # and judged not useful.

    coherence_scores = coherence_score_list_from_embedding_list(window_embeddings)
    show_value("windowed_parts_coherence_score_list", coherence_scores)
    show_plot("Coherence Score Plot:", coherence_scores)

    depth_scores = get_depth_score_list(coherence_scores)
    show_value("windowed_parts_depth_score_list", depth_scores)
    show_plot("Depth Score Plot:", depth_scores)

    peak_depth_scores = get_local_maxima(depth_scores)
    show_value("windowed_parts_filtered_depth_score_list", peak_depth_scores)
    show_plot("Filtered Depth Score Plot:", peak_depth_scores)

    # Computed here for reporting only; the splitting step derives the same
    # threshold again from the filtered scores.
    cutoff = compute_threshold(peak_depth_scores)
    show_value("filtered_threshold", cutoff)
    show_plot("Filtered Depth Score With Threshold Line Plot:", peak_depth_scores, cutoff)

    return filtered_indexes_list_to_splitted_segments_by_semantics(sentence_text, peak_depth_scores)

In [None]:
# Lock Model
def lock_semantic_segmentation_function(embedding_model_function):
    """Return a segmentation callable with ``embedding_model_function`` pre-bound.

    The returned function takes ``sentence_text`` plus the optional
    ``intermediate_status`` and ``graph_status`` flags and forwards them to
    ``semantic_segmentation_function``.
    """
    # The flag defaults are repeated on the inner signature so callers of the
    # bound function can still leave them out.
    def lockED_semantic_segmentation_function(sentence_text, intermediate_status = False, graph_status = False):
        return semantic_segmentation_function(
            embedding_model_function,
            sentence_text,
            intermediate_status=intermediate_status,
            graph_status=graph_status,
        )

    return lockED_semantic_segmentation_function

##### Example Demo

In [None]:
# Eg. "get_sentence_embedding_MiniLM_L6_v2" embedding and statuses = False
# Direct call: the embedding function is passed explicitly, with printing and plotting off.
semantic_segmentation_function(
    get_sentence_embedding_MiniLM_L6_v2,
    "Employees receive free flight benefits to return to home country once a year and free flight benefits for business trips",
    intermediate_status=False,
    graph_status=False,
)

['Employees receive free flight benefits',
 'to return to home country once',
 'a year and',
 'free flight benefits for business trips']

In [None]:
# Locked Model
# Bind the embedding function once, then call the resulting segmenter with just the text.
semantic_segmentation_locked_model_MiniLM_L6_v2 = lock_semantic_segmentation_function(
    get_sentence_embedding_MiniLM_L6_v2
)
semantic_segmentation_locked_model_MiniLM_L6_v2(
    "Employees receive free flight benefits to return to home country once a year and free flight benefits for business trips",
    intermediate_status=False,
    graph_status=False,
)

['Employees receive free flight benefits',
 'to return to home country once',
 'a year and',
 'free flight benefits for business trips']