This notebook extracts individual sentences from a raw recording. It runs voice activity detection (VAD) to find speech segments, transcribes each segment with ASR, and then tries to match every segment to the reference sentence it belongs to.

In [None]:
import re

from pydub import AudioSegment
import pickle
import pandas as pd
from tqdm import tqdm

import os

from pyannote.audio import Model

# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being stored in the notebook.  When the variable is unset or
# empty, `True` is passed, which asks the hub client to use the token saved by
# `huggingface-cli login`.
# NOTE(review): a literal token used to be hard-coded here; it should be
# revoked on huggingface.co because it is present in the file history.
modelPyannote = Model.from_pretrained(
    "pyannote/segmentation",
    use_auth_token=os.environ.get("HF_TOKEN") or True,
)

from pyannote.audio.pipelines import VoiceActivityDetection

# Voice activity detection pipeline built on top of the segmentation model.
pipeline = VoiceActivityDetection(segmentation=modelPyannote)
HYPER_PARAMETERS = {
    # onset/offset activation thresholds
    "onset": 0.5,
    "offset": 0.5,
    # remove speech regions shorter than that many seconds.
    "min_duration_on": 0.0,
    # fill non-speech regions shorter than that many seconds.
    "min_duration_off": 0.05,
}
pipeline.instantiate(HYPER_PARAMETERS)


import re, os

from whisper_model import WhisperASR

# Create the project's Whisper ASR wrapper (model_size 'large-v2', French) and
# load it once here; the same instance is reused for every speech segment in
# the transcription loop further down.
whisper_model = WhisperASR(model_size='large-v2', language='french')
whisper_model.load()

In [None]:
import editdistance
import pickle
import json


def edit_distance(s1, s2):
    """Return the edit distance between ``s1`` and ``s2``.

    Thin wrapper that delegates to ``editdistance.eval``.
    """
    return editdistance.eval(s1, s2)

def format_int(i):
    """Render ``i`` as text, zero-padded on the left to at least 8 characters."""
    as_text = str(i)
    return as_text.zfill(8)

def getBestMatch(text, textArray, doSubSentence):
    """Return the candidate in ``textArray`` that is closest to ``text``.

    Args:
        text: Query string (for example an ASR transcript).
        textArray: Iterable of candidate strings.
        doSubSentence: If falsy, ``text`` is compared against each whole
            candidate.  If truthy, the shorter of the two strings is slid
            over the longer one and the best-aligned window is used.

    Returns:
        A ``(bestMatch, score)`` tuple.  ``score`` is the smallest edit
        distance found divided by the length of the shorter of ``text`` and
        the winning candidate.  If nothing can be scored (no candidates, or
        the query or every candidate is empty) the sentinel
        ``('no match', 10)`` is returned.
    """
    dist = 10000
    score = 10
    bestMatch = 'no match'
    for t in textArray:
        # The score is normalised by the shorter string.  A zero length means
        # one side is empty and cannot be scored, so skip the candidate
        # instead of dividing by zero.
        norm = min(len(text), len(t))
        if norm == 0:
            continue

        if not doSubSentence:
            d = edit_distance(text, t)
        elif len(t) > len(text):
            # Slide the query over the longer candidate.
            width = len(text)
            d = min(
                edit_distance(text, t[i:i + width])
                for i in range(len(t) - width + 1)
            )
        else:
            # Slide the candidate over the (equal or longer) query.
            width = len(t)
            d = min(
                edit_distance(t, text[i:i + width])
                for i in range(len(text) - width + 1)
            )

        # Strict comparison: on ties the earliest candidate wins.
        if d < dist:
            bestMatch = t
            dist = d
            score = dist / norm
    return bestMatch, score


def buildSegmentRunASRandAlign(data, filename, segments, sentences, inverseSentences, offset):
    """Match the combined ASR text of ``segments`` to one reference sentence.

    Args:
        data: Not used by this function; kept so existing callers still work.
        filename: Name of the recording; only used in the failure message.
        segments: Iterable of dicts, each holding a transcript under ``'asr'``.
        sentences: Mapping of sentence id to reference text.
        inverseSentences: Mapping of reference text to sentence id.
        offset: Returned unchanged as the second element when no match is
            accepted.

    Returns:
        ``(text, sentence_id, reference_text)`` when a match is accepted,
        otherwise ``(text, offset, message)`` where ``message`` describes the
        failure.  ``text`` is the space-joined, stripped ASR transcript.
    """
    # Never populated here; kept only so the failure message stays unchanged.
    used_segments = []

    text = ' '.join(seg['asr'] for seg in segments).strip()
    matchingSentences, score = getBestMatch(text, list(inverseSentences.keys()), False)

    # Reject the 'no match' sentinel as well as matches whose normalised edit
    # distance is above 0.5.
    if matchingSentences not in inverseSentences or score > 0.5:
        return text, offset, 'nothing found for segment ' + json.dumps(used_segments) + ' in ' + filename

    outputFilename = inverseSentences[matchingSentences]
    return text, outputFilename, sentences[outputFilename]


# Cut the span between `start` and `end` (both in seconds) out of a wav file.
def trim_audio(path, start, end, out_path):
    """Write the part of ``path`` between ``start`` and ``end`` to ``out_path``.

    ``start`` and ``end`` are given in seconds and are converted to the
    millisecond positions used for slicing.  Both input and output are
    handled as wav.  Returns ``out_path``.
    """
    recording = AudioSegment.from_file(path, format="wav")
    start_ms = start * 1000
    end_ms = end * 1000
    clip = recording[start_ms:end_ms]
    clip.export(out_path, format="wav")
    return out_path

In [None]:

batch = 'batches/'
# Seconds added before and after every detected speech region.
padding = 0.25

# Recordings of this batch.  Only the first entry is processed by the cells
# below; add further paths here to keep them at hand.
filenames = [batch + 'From 10002 - 11000.wav']

filename = filenames[0]

# The VAD result is cached next to the recording so that re-running this cell
# does not run the VAD pipeline again.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load cache files that were produced by this notebook.
vad_cache = filename + '.vad.bin'
if os.path.exists(vad_cache):
    with open(vad_cache, 'rb') as f:
        vad = pickle.load(f)
else:
    vad = pipeline(filename)
    with open(vad_cache, 'wb') as f:
        pickle.dump(vad, f)

# Full recording, used later to cut out the individual segments.
data = AudioSegment.from_file(filename)

# Reference sentences: `sentences` maps id -> text, `inverseSentences` maps
# text -> id.
sentences = {}
inverseSentences = {}

df_sentences = pd.read_csv('batches/fr - fr.csv')
# The numeric id is what follows the first two characters of
# `unique_identifier` (presumably a two-letter language prefix -- confirm
# against the CSV).
df_sentences["id_int"] = df_sentences["unique_identifier"].apply(lambda x: int(x[2:]))
df_sentences = df_sentences.set_index("id_int")
# Keep only the ids 10002..11000 (label-based slice).
df_sentences = df_sentences.loc[10002:11000]

for sentenceNum, sentence in zip(df_sentences.index, df_sentences['text']):
    sentenceNum = int(sentenceNum)
    sentences[sentenceNum] = sentence
    # Identical texts would collide in the reverse lookup, so later duplicates
    # get ' _' appended to their key until it is unique.
    key = sentence
    while key in inverseSentences:
        key += ' _'
    inverseSentences[key] = sentenceNum
                    

sentenceNumber = -1
segments = {}
# Transcripts are cached as JSON next to the recording so ASR only runs once.
# Note that the float start times used as keys come back as strings after a
# JSON round-trip; later cells only use the values.
segments_cache = filename + '.segments.json'
if os.path.exists(segments_cache):
    with open(segments_cache, 'r') as fin:
        segments = json.load(fin)
else:
    # Scratch file handed to the ASR model; make sure its folder exists.
    tmp_wav = batch + 'TMP/tmp.wav'
    os.makedirs(os.path.dirname(tmp_wav), exist_ok=True)

    timeline = vad.get_timeline().support()
    for segment in tqdm(timeline):
        start, end = list(segment)
        # Widen the region by `padding` seconds, clamped to the recording.
        start = max(0, start - padding)
        end = min(end + padding, len(data)/1000)
        seg = {'SegmentStart': start, 'SegmentEnd': end}
        # Positions in `data` are milliseconds.
        data[start * 1000 : end * 1000].export(tmp_wav, format='wav')
        # run ASR
        try:
            result = whisper_model.predict({"instances": [{"url": tmp_wav}]})
            seg['asr'] = result['predictions'][0]
        except Exception as err:
            # Best effort: keep going with an empty transcript, but report the
            # failure instead of hiding it.  KeyboardInterrupt is no longer
            # swallowed, so a long run can still be stopped.
            tqdm.write('ASR failed for segment {:.2f}-{:.2f}s: {!r}'.format(start, end, err))
            seg['asr'] = ''
        segments[start] = seg
    # save segments
    with open(segments_cache, 'w') as fout:
        json.dump(segments, fout, indent=4)

In [None]:
import numpy as np

segments_list = list(segments.values())
sentences_list = list(sentences.values())

# distances_matrix[i, j] = edit distance between the ASR text of segment i and
# reference sentence j, divided by the length of the shorter of the two.
# Every cell is overwritten in the loop below.
distances_matrix = np.ones((len(segments_list), len(sentences_list))) * 1000

for ik, seg in enumerate(segments_list):
    asr_text = seg['asr']
    for jk, sentence in enumerate(sentences_list):
        try:
            shortest = min(len(asr_text), len(sentence))
            distances_matrix[ik, jk] = edit_distance(asr_text, sentence) / shortest
        except (ZeroDivisionError, TypeError):
            # An empty string (zero length) or a non-string value cannot be
            # scored; mark the pair as infinitely far apart.
            distances_matrix[ik, jk] = np.inf
    

In [None]:
# For each segment (row), take the column index of the smallest normalised
# distance, i.e. the position of the closest sentence in `sentences_list`.
# np.argmin returns the first index on ties, so a row made up entirely of inf
# yields index 0.
best_matches = np.argmin(distances_matrix, axis=1)


In [None]:

# Build one result row per speech segment and create the DataFrame in a single
# step (DataFrame.append was removed in pandas 2.0 and copies the whole frame
# on every call).
columns = ["status", "filename", "sentenceNumber", "sentence", "asr", "start", "end", "ed_dist", "len_dif"]
best_matched_sentences = [sentences_list[k] for k in best_matches]

rows = []
for ik, seg in enumerate(segments_list):
    asr = seg['asr']
    sentence = best_matched_sentences[ik]
    ed_dist = distances_matrix[ik, best_matches[ik]]
    try:
        # Relative length difference, normalised by the shorter string.
        len_dif = abs(len(asr) - len(sentence)) / min(len(asr), len(sentence))
    except (ZeroDivisionError, TypeError):
        len_dif = np.inf
    start = seg['SegmentStart']
    end = seg['SegmentEnd']
    sentenceNumber = inverseSentences[sentence]
    # A segment is accepted only when the transcript is close to the sentence
    # in both edit distance and length.
    if ed_dist < 0.25 and len_dif < 0.15:
        status = "assigned"
    else:
        status = "not_assigned"

    rows.append({
        "status": status,
        "filename": filename,
        "sentenceNumber": sentenceNumber,
        "sentence": sentence,
        "asr": asr,
        "start": start,
        "end": end,
        "ed_dist": ed_dist,
        "len_dif": len_dif,
    })

df = pd.DataFrame(rows, columns=columns)

In [None]:
# Turn +/-inf into NaN, then drop every row that contains a NaN in any column.
df = df.replace([np.inf, -np.inf], np.nan).dropna()


In [None]:
# Several segments can be matched to the same sentenceNumber; keep one row per
# sentence.  A stable sort is used so that rows with the same sentenceNumber
# keep their existing relative order, which makes keep='last' well defined:
# it retains the row that came last before sorting.
# NOTE(review): presumably that is the latest take in the recording -- confirm
# this is the one that should win.
df = df.sort_values(by=['sentenceNumber'], kind='mergesort')
df = df.drop_duplicates(subset=['sentenceNumber'], keep='last')

df

In [None]:


# Output layout: <batch>/<recording name without ".wav">/{assigned,not_assigned}/
wav_folder = os.path.join(batch, os.path.basename(filename).replace(".wav", ""))
os.makedirs(wav_folder, exist_ok=True)
for status_dir in ("assigned", "not_assigned"):
    os.makedirs(os.path.join(wav_folder, status_dir), exist_ok=True)

# Write one wav per remaining row, named after its sentence number, into the
# folder that matches the row's status.
for index, row in tqdm(df.iterrows(), total=len(df)):
    status_dir = "assigned" if row['status'] == "assigned" else "not_assigned"
    wav_path = os.path.join(wav_folder, status_dir, 'FR' + format_int(row['sentenceNumber']) + ".wav")

    # Slice the recording that is already loaded in `data` (positions are in
    # milliseconds) instead of re-reading the whole file from disk per row.
    data[row['start'] * 1000 : row['end'] * 1000].export(wav_path, format="wav")
    



In [None]:
from glob import glob
# List every wav file in the batch folder whose name starts with "From".
# This rebinds `filenames`; `filename` is left untouched.
filenames = glob(batch + "From*.wav")

In [None]:
# Display the list of recordings found by the glob.
filenames

In [None]:
# Display the recording that the cells above operate on.
filename