split_paragraph.py
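"""Split a long ASR transcript into paragraphs.

Sentences are embedded with a SentenceTransformer model, pairwise cosine
similarities are weighted with a reversed sigmoid so that nearby sentences
count more, and paragraph breaks are placed at the local minima of the
resulting score.
"""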
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import argrelextrema
import math
# The transcript arrives as a single line, so we replace question marks with
# dots to make the sentence split below behave consistently.
def split_sentence(text):
    text = text.replace("?", ".")
    sentences = text.split('. ')
    # Remove the leftover dot from the final sentence
    sentences[-1] = sentences[-1].replace('.', '')
    return sentences
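# Illustrative behaviour on a made-up transcript line:
#   split_sentence("Hello there. How are you? Fine.")
#   -> ['Hello there', 'How are you', 'Fine']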
def unify_sentence(sentences):
    # Get the length of each sentence
    sentence_length = [len(each) for each in sentences]
    # Determine the threshold for outlying long sentences
    long = np.mean(sentence_length) + np.std(sentence_length) * 2
    # Determine the threshold for outlying short sentences
    short = np.mean(sentence_length) - np.std(sentence_length) * 2
    # Shorten long sentences
    text = ''
    prev_each = ''
    for i, each in enumerate(sentences):
        # Skip consecutive duplicates and empty sentences
        if each == prev_each or len(each.strip()) == 0:
            continue
        if len(each) > long:
            # Break an overly long sentence apart at its commas
            comma_splitted = each.replace(',', '.')
            text += f'{comma_splitted}. '
        else:
            text += f'{each}. '
        prev_each = each
    sentences = text.split('. ')
    sentences[-1] = sentences[-1].replace('.', '')
    # Now concatenate the short ones onto the sentence that follows
    text = ''
    for each in sentences:
        if len(each) == 0:
            continue
        if len(each) < short:
            text += f'{each} '
        else:
            text += f'{each}. '
    return text
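# Illustrative numbers (made up): for ten sentences with lengths
# [20]*9 + [200], the long threshold is 38 + 2*54 = 146, so only the
# 200-character sentence is re-split at its commas; the short threshold is
# negative there, so nothing gets merged.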
def rev_sigmoid(x: float) -> float:
    return 1 / (1 + math.exp(0.5 * x))
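# The reversed sigmoid decays from ~1 to ~0 as x grows, e.g.
# rev_sigmoid(-10) ~= 0.993, rev_sigmoid(0) == 0.5, rev_sigmoid(10) ~= 0.007.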
def activate_similarities(similarities: np.array, p_size=10, order=5) -> np.array:
    """Return the indices at which to split, based on weighted sums of
    activated sentence similarities.

    Args:
        similarities (numpy array): square matrix where each entry is the
            cosine similarity between a pair of sentences
        p_size (int): number of following sentences used to calculate each
            weighted sum
        order (int): how many points on each side must be larger for a point
            to count as a relative minimum
    Returns:
        tuple of numpy arrays: indices of the relative minima
    """
    if similarities.shape[0] < p_size:
        p_size = similarities.shape[0]
    x = np.linspace(-10, 10, p_size)
    # Apply the activation function to the created space
    y = np.vectorize(rev_sigmoid)
    # Because we only apply the activation to p_size sentences, we pad with
    # zeros to neglect the effect of every additional sentence and to match
    # the length of the vector we will multiply
    activation_weights = np.pad(y(x), (0, similarities.shape[0] - p_size))
    ### 1. Take each diagonal to the right of the main diagonal
    diagonals = [similarities.diagonal(each) for each in range(0, similarities.shape[0])]
    ### 2. Pad each diagonal with zeros at the end, because the diagonals have different lengths
    diagonals = [np.pad(each, (0, similarities.shape[0] - len(each))) for each in diagonals]
    ### 3. Stack those diagonals into a new matrix
    diagonals = np.stack(diagonals)
    ### 4. Apply the activation weights to each row, i.e. multiply the similarities by the activation
    diagonals = diagonals * activation_weights.reshape(-1, 1)
    ### 5. Calculate the weighted sum of activated similarities
    activated_similarities = np.sum(diagonals, axis=0)
    ### 6. Find the relative minima of the vector with argrelextrema. The order
    ### parameter controls how frequent the splits are; I would not recommend changing it.
    minima = argrelextrema(activated_similarities, np.less, order=order)
    return minima
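# Shape intuition for the diagonal trick above (3x3 example): diagonal(0) has
# length 3, diagonal(1) length 2, diagonal(2) length 1. After zero-padding and
# stacking, row k holds sim(i, i+k) at column i, so column i of the weighted
# sum scores how similar sentence i is to the sentences that follow it, with
# closer sentences weighted more heavily by the reversed sigmoid.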
def correct_chunks(chunks):
    # Merge chunks that do not end on a full sentence into the following
    # chunk, carrying the earlier start timestamp forward.
    prev_chunk = None
    new_chunks = []
    for chunk in chunks:
        if prev_chunk:
            chunk['text'] = prev_chunk['text'] + chunk['text']
            chunk['timestamp'] = (prev_chunk['timestamp'][0], chunk['timestamp'][1])
        if not chunk['text'].endswith('.'):
            prev_chunk = chunk
        else:
            new_chunks.append(chunk)
            prev_chunk = None
    return new_chunks
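# Illustrative example with hypothetical chunks in the usual ASR pipeline
# format (dicts with 'text' and 'timestamp' keys):
#   correct_chunks([{'text': 'Hello wor', 'timestamp': (0.0, 1.0)},
#                   {'text': 'ld.', 'timestamp': (1.0, 2.0)}])
#   -> [{'text': 'Hello world.', 'timestamp': (0.0, 2.0)}]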
def gen_parag(input_chunks, model_name='all-MiniLM-L6-v2', p_size=10, order=5):
    sentences_all = []
    timestamps_all = []
    corrected_chunks = correct_chunks(input_chunks)
    for chunk in corrected_chunks:
        sentences = split_sentence(chunk['text'])
        text = unify_sentence(sentences)
        text = text.strip()
        sentences = text.split('. ')
        sentences[-1] = sentences[-1].replace('.', '')
        # Every sentence inherits the timestamp of the chunk it came from
        timestamps = [chunk['timestamp']] * len(sentences)
        sentences_all += sentences
        timestamps_all += timestamps
    # Embed sentences
    model = SentenceTransformer(model_name)
    embeddings = model.encode(sentences_all)
    # Create the similarity matrix
    similarities = cosine_similarity(embeddings)
    # Apply our function. For long texts I recommend using 10 or more sentences
    minima = activate_similarities(similarities, p_size=p_size, order=order)
    split_points = [each for each in minima[0]]
    # Accumulate sentences into paragraphs, cutting at the split points
    text = ''
    para_chunks = []
    para_timestamp = []
    start_timestamp = 0
    for num, each in enumerate(sentences_all):
        current_timestamp = timestamps_all[num]
        if text == '' and (start_timestamp == current_timestamp[1]):
            start_timestamp = current_timestamp[0]
        if num in split_points:
            para_chunks.append(text)
            para_timestamp.append([start_timestamp, current_timestamp[1]])
            text = f'{each}. '
            start_timestamp = current_timestamp[1]
        else:
            text += f'{each}. '
    # Flush the last paragraph
    if len(text):
        para_chunks.append(text)
        para_timestamp.append([start_timestamp, timestamps_all[-1][1]])
    return para_chunks, para_timestamp
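# Minimal usage sketch. The chunks below are hypothetical stand-ins for the
# output of an ASR pipeline (e.g. Whisper with timestamps enabled); running
# this downloads the embedding model on first use.
if __name__ == '__main__':
    demo_chunks = [
        {'text': 'Cats are small domestic animals. They like to sleep a lot.',
         'timestamp': (0.0, 5.0)},
        {'text': 'The stock market fell sharply today. Investors are worried.',
         'timestamp': (5.0, 10.0)},
    ]
    paragraphs, timestamps = gen_parag(demo_chunks)
    for para, (start, end) in zip(paragraphs, timestamps):
        print(f'[{start}-{end}] {para}')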