split_paragraph.py
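"""Split a long ASR transcript into paragraphs.

Sentences are embedded with a SentenceTransformer model, pairwise cosine
similarities are weighted with a reversed sigmoid so that nearby sentences
count more, and paragraph breaks are placed at the local minima of the
resulting score.
"""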
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import argrelextrema
import math
# The transcript arrives as a single line, so we replace question marks with
# dots to make the sentence split below behave consistently.
def split_sentence(text):
    text = text.replace("?", ".")
    sentences = text.split('. ')
    # Remove the leftover dot from the final sentence
    sentences[-1] = sentences[-1].replace('.', '')
    return sentences
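# Illustrative behaviour on a made-up transcript line:
#   split_sentence("Hello there. How are you? Fine.")
#   -> ['Hello there', 'How are you', 'Fine']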
def unify_sentence(sentences):
    # Get the length of each sentence
    sentence_length = [len(each) for each in sentences]
    # Determine the threshold for outlying long sentences
    long = np.mean(sentence_length) + np.std(sentence_length) * 2
    # Determine the threshold for outlying short sentences
    short = np.mean(sentence_length) - np.std(sentence_length) * 2
    # Shorten long sentences
    text = ''
    prev_each = ''
    for i, each in enumerate(sentences):
        # Skip consecutive duplicates and empty sentences
        if each == prev_each or len(each.strip()) == 0:
            continue
        if len(each) > long:
            # Break an overly long sentence apart at its commas
            comma_splitted = each.replace(',', '.')
            text += f'{comma_splitted}. '
        else:
            text += f'{each}. '
        prev_each = each
    sentences = text.split('. ')
    sentences[-1] = sentences[-1].replace('.', '')
    # Now concatenate the short ones onto the sentence that follows
    text = ''
    for each in sentences:
        if len(each) == 0:
            continue
        if len(each) < short:
            text += f'{each} '
        else:
            text += f'{each}. '
    return text
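# Illustrative numbers (made up): for ten sentences with lengths
# [20]*9 + [200], the long threshold is 38 + 2*54 = 146, so only the
# 200-character sentence is re-split at its commas; the short threshold is
# negative there, so nothing gets merged.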
def rev_sigmoid(x: float) -> float:
    return 1 / (1 + math.exp(0.5 * x))
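# The reversed sigmoid decays from ~1 to ~0 as x grows, e.g.
# rev_sigmoid(-10) ~= 0.993, rev_sigmoid(0) == 0.5, rev_sigmoid(10) ~= 0.007.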
def activate_similarities(similarities: np.array, p_size=10, order=5) -> np.array:
    """Return the indices at which to split, based on weighted sums of
    activated sentence similarities.

    Args:
        similarities (numpy array): square matrix where each entry is the
            cosine similarity between a pair of sentences
        p_size (int): number of following sentences used to calculate each
            weighted sum
        order (int): how many points on each side must be larger for a point
            to count as a relative minimum
    Returns:
        tuple of numpy arrays: indices of the relative minima
    """
    if similarities.shape[0] < p_size:
        p_size = similarities.shape[0]
    x = np.linspace(-10, 10, p_size)
    # Apply the activation function to the created space
    y = np.vectorize(rev_sigmoid)
    # Because we only apply the activation to p_size sentences, we pad with
    # zeros to neglect the effect of every additional sentence and to match
    # the length of the vector we will multiply
    activation_weights = np.pad(y(x), (0, similarities.shape[0] - p_size))
    ### 1. Take each diagonal to the right of the main diagonal
    diagonals = [similarities.diagonal(each) for each in range(0, similarities.shape[0])]
    ### 2. Pad each diagonal with zeros at the end, because the diagonals have different lengths
    diagonals = [np.pad(each, (0, similarities.shape[0] - len(each))) for each in diagonals]
    ### 3. Stack those diagonals into a new matrix
    diagonals = np.stack(diagonals)
    ### 4. Apply the activation weights to each row, i.e. multiply the similarities by the activation
    diagonals = diagonals * activation_weights.reshape(-1, 1)
    ### 5. Calculate the weighted sum of activated similarities
    activated_similarities = np.sum(diagonals, axis=0)
    ### 6. Find the relative minima of the vector with argrelextrema. The order
    ### parameter controls how frequent the splits are; I would not recommend changing it.
    minima = argrelextrema(activated_similarities, np.less, order=order)
    return minima
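# Shape intuition for the diagonal trick above (3x3 example): diagonal(0) has
# length 3, diagonal(1) length 2, diagonal(2) length 1. After zero-padding and
# stacking, row k holds sim(i, i+k) at column i, so column i of the weighted
# sum scores how similar sentence i is to the sentences that follow it, with
# closer sentences weighted more heavily by the reversed sigmoid.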
def correct_chunks(chunks):
    # Merge chunks that do not end on a full sentence into the following
    # chunk, carrying the earlier start timestamp forward.
    prev_chunk = None
    new_chunks = []
    for chunk in chunks:
        if prev_chunk:
            chunk['text'] = prev_chunk['text'] + chunk['text']
            chunk['timestamp'] = (prev_chunk['timestamp'][0], chunk['timestamp'][1])
        if not chunk['text'].endswith('.'):
            prev_chunk = chunk
        else:
            new_chunks.append(chunk)
            prev_chunk = None
    return new_chunks
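# Illustrative example with hypothetical chunks in the usual ASR pipeline
# format (dicts with 'text' and 'timestamp' keys):
#   correct_chunks([{'text': 'Hello wor', 'timestamp': (0.0, 1.0)},
#                   {'text': 'ld.', 'timestamp': (1.0, 2.0)}])
#   -> [{'text': 'Hello world.', 'timestamp': (0.0, 2.0)}]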
def gen_parag(input_chunks, model_name='all-MiniLM-L6-v2', p_size=10, order=5):
    sentences_all = []
    timestamps_all = []
    corrected_chunks = correct_chunks(input_chunks)
    for chunk in corrected_chunks:
        sentences = split_sentence(chunk['text'])
        text = unify_sentence(sentences)
        text = text.strip()
        sentences = text.split('. ')
        sentences[-1] = sentences[-1].replace('.', '')
        # Every sentence inherits the timestamp of the chunk it came from
        timestamps = [chunk['timestamp']] * len(sentences)
        sentences_all += sentences
        timestamps_all += timestamps
    # Embed sentences
    model = SentenceTransformer(model_name)
    embeddings = model.encode(sentences_all)
    # Create the similarity matrix
    similarities = cosine_similarity(embeddings)
    # Apply our function. For long texts I recommend using 10 or more sentences
    minima = activate_similarities(similarities, p_size=p_size, order=order)
    split_points = [each for each in minima[0]]
    # Accumulate sentences into paragraphs, cutting at the split points
    text = ''
    para_chunks = []
    para_timestamp = []
    start_timestamp = 0
    for num, each in enumerate(sentences_all):
        current_timestamp = timestamps_all[num]
        if text == '' and (start_timestamp == current_timestamp[1]):
            start_timestamp = current_timestamp[0]
        if num in split_points:
            para_chunks.append(text)
            para_timestamp.append([start_timestamp, current_timestamp[1]])
            text = f'{each}. '
            start_timestamp = current_timestamp[1]
        else:
            text += f'{each}. '
    # Flush the last paragraph
    if len(text):
        para_chunks.append(text)
        para_timestamp.append([start_timestamp, timestamps_all[-1][1]])
    return para_chunks, para_timestamp
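# Minimal usage sketch. The chunks below are hypothetical stand-ins for the
# output of an ASR pipeline (e.g. Whisper with timestamps enabled); running
# this downloads the embedding model on first use.
if __name__ == '__main__':
    demo_chunks = [
        {'text': 'Cats are small domestic animals. They like to sleep a lot.',
         'timestamp': (0.0, 5.0)},
        {'text': 'The stock market fell sharply today. Investors are worried.',
         'timestamp': (5.0, 10.0)},
    ]
    paragraphs, timestamps = gen_parag(demo_chunks)
    for para, (start, end) in zip(paragraphs, timestamps):
        print(f'[{start}-{end}] {para}')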