# Trying out just one file and concatenating the speech segments to assess them

In [None]:
import pyannote.core
import torch
from pydub import AudioSegment

# pyannote-style file description: a URI label plus the path of the audio to analyse
file = {
    'uri': '1043_meal',
    'audio': '/Users/andrei-macpro/Documents/Data/Audio/Meal/1043_meal.wav',
}

# speech activity detection (SAD) model trained on the AMI training set,
# fetched through torch.hub
sad = torch.hub.load('pyannote/pyannote-audio', 'sad_ami')

# raw SAD scores for this file (a `pyannote.core.SlidingWindowFeature` instance)
sad_scores = sad(file)

In [None]:
from pyannote.audio.utils.signal import Binarize

# Turn the raw scores into speech / non-speech decisions: the same 0.52
# threshold is used for onset and offset, with 0.1 minimum on/off durations.
binarize = Binarize(
    onset=0.52,
    offset=0.52,
    log_scale=True,
    min_duration_on=0.1,
    min_duration_off=0.1,
)

# detected speech regions (a `pyannote.core.Timeline` instance)
speech = binarize.apply(sad_scores, dimension=1)

In [None]:
# inspect the type of the first element of `speech` (no conversion happens in this cell)
type(speech[0])
# speech is a timeline made of segments

In [None]:
# Map each segment's start time to its end time: once for the speech regions
# and once for the gaps between them (non-speech).
timestamps_speech = {start: end for start, end in speech}
timestamps_silence = {start: end for start, end in speech.gaps()}

In [None]:
# Load the recording with pydub. This rebinds `file`, replacing the dict that
# was passed to the SAD model earlier.
# NOTE(review): this path has no `Meal/` component, unlike the 'audio' path used
# for the SAD model — confirm both point at the same recording.
file = AudioSegment.from_wav(
    '/Users/andrei-macpro/Documents/Data/Audio/1043_meal.wav'
)

In [None]:
# pydub works in milliseconds while the pyannote timestamps are in seconds,
# so every start/end time is multiplied by 1000.
start_times_speech = [start * 1000 for start in timestamps_speech]
end_times_speech = [end * 1000 for end in timestamps_speech.values()]
start_times_silence = [start * 1000 for start in timestamps_silence]
end_times_silence = [end * 1000 for end in timestamps_silence.values()]

In [None]:
# cut the recording into one audio chunk per speech interval and per silence interval
speech_segments = [
    file[begin:end]
    for begin, end in zip(start_times_speech, end_times_speech)
]
silence_segments = [
    file[begin:end]
    for begin, end in zip(start_times_silence, end_times_silence)
]

In [None]:
# Join each list of chunks into one recording (via `sum`) and write the two
# results to disk as WAV files.
sum(speech_segments).export(
    '/Users/andrei-macpro/Documents/Data/Audio/speech.wav',
    format="wav",
)
sum(silence_segments).export(
    '/Users/andrei-macpro/Documents/Data/Audio/non-speech.wav',
    format="wav",
)

# Now let's get on with the full speech detection processing

In [None]:
import os
import pyannote.core
import torch
from pyannote.core import Timeline, Segment
from pyannote.audio.utils.signal import Binarize
from tqdm import tqdm
import pandas as pd

In [None]:
# Work from the folder holding the meal recordings: the cells below list "."
# and pass bare file names on, so they depend on this working directory.
os.chdir('/Users/andrei-macpro/Documents/Data/Audio/Meal')


In [None]:
path = '/Users/andrei-macpro/Documents/Data/Audio/Meal'  # absolute, because it starts with a slash
dirs = os.listdir(path)
# Keep only the visible WAV recordings of the working directory, in sorted
# order. `write_disk` saves its `<name>.json` files relative to the working
# directory too, so an unfiltered listing would hand those JSON files to the
# SAD model when the notebook is re-run.
file_names = sorted(
    name
    for name in os.listdir(".")
    if not name.startswith(".") and name.lower().endswith(".wav")
)

In [None]:
# load the SAD model once; `timeline()` below reuses this module-level `sad`
# for every file (author's note: this isn't taking up too much memory)
sad = torch.hub.load('pyannote/pyannote-audio', 'sad_ami')

In [None]:
# Same binarisation settings as in the single-file trial: a 0.52 threshold for
# both onset and offset, with 0.1 minimum on/off durations.
binarize = Binarize(
    onset=0.52,
    offset=0.52,
    log_scale=True,
    min_duration_on=0.1,
    min_duration_off=0.1,
)

In [None]:
def timeline(file_names):
    """Run speech activity detection on every file and collect the results.

    Uses the module-level `sad` model and `binarize` post-processor.

    Parameters
    ----------
    file_names : iterable of dict
        Mappings that carry at least an 'audio' entry; each one is passed to
        `sad` unchanged.

    Returns
    -------
    dict
        Maps each mapping's 'audio' value to the result of
        `binarize.apply(...)` for that file.
    """
    speech_by_audio = {}
    for entry in tqdm(file_names):
        scores = sad(entry)
        speech_by_audio[entry.get('audio')] = binarize.apply(scores, dimension=1)
        # drop the raw scores before moving on to the next file
        del scores
    return speech_by_audio

In [None]:
def write_disk(sad_segments):
    """Write every timeline in `sad_segments` to its own JSON file.

    Parameters
    ----------
    sad_segments : dict
        Maps a key (used as the file name stem) to an object exposing
        `for_json()`, e.g. the timelines returned by `timeline()`.

    Each value is serialised with `for_json()` and dumped to `<key>.json`.
    The file name is built from `str(key)` alone, so a relative key is
    written relative to the current working directory. Returns None.
    """
    # Imported here so the function works regardless of whether a later
    # notebook cell has already executed `import json`.
    import json

    for key, segments in sad_segments.items():
        # serialise first, so a failure in `for_json()` does not leave an empty file behind
        payload = segments.for_json()
        with open(str(key) + '.json', 'w') as outfile:
            json.dump(payload, outfile)
In [None]:
# wrap every file name in the {'audio': ...} mapping that `timeline()` reads
files = {}
list_dict_filenames = [{'audio': str(name)} for name in file_names]

In [None]:
# run speech detection on every listed file; the result maps file name -> speech timeline
sad_segments = timeline(list_dict_filenames)

In [None]:
import json
# save each Timeline in `sad_segments` as a `<file name>.json` file on disk
write_disk(sad_segments)

# Now we load the JSON files back into timelines to get the speech and non-speech durations

In [None]:
# Switch to the folder the JSON timelines are read from; the next cell opens
# one of them by bare file name.
# NOTE(review): `write_disk` wrote its files relative to the previous working
# directory — presumably they were moved here by hand; confirm.
os.chdir('/Users/andrei-macpro/Documents/Data/Audio/speech_detection_meal')

In [None]:
import json
# load a single saved JSON file (relative to the working directory) into `data`
with open('1043_meal.wav.json') as json_file:
    data = json.load(json_file)

In [None]:
# Load every saved JSON file found in `path`, in sorted file-name order.
path = '/Users/andrei-macpro/Documents/Data/Audio/speech_detection_meal'
# Only `.json` entries are read: the folder may also hold hidden files or the
# spreadsheet exported at the end of the notebook, which `json.load` cannot parse.
json_files = sorted(name for name in os.listdir(path) if name.endswith('.json'))
list_json = []
for filename in json_files:
    # join with `path` so loading does not depend on the current working directory
    with open(os.path.join(path, filename)) as json_file:
        data = json.load(json_file)
        list_json.append(data)

In [None]:
# create an empty Timeline object
# NOTE(review): this rebinds the name `timeline`, which earlier referred to the
# `timeline()` helper function, so that helper can no longer be called after
# this cell runs. The loop in the next cell creates its own Timeline anyway —
# consider a different name or dropping this cell.
timeline=Timeline() # load a new Timeline object that can load json files

In [None]:
# Rebuild one Timeline per loaded JSON payload, in the same order as `list_json`.
# `from_json` is called on the class itself, so no throwaway Timeline instance
# is needed and this cell does not rebind the names `timeline` and `file`
# that earlier cells use for other things.
list_timelines = [Timeline.from_json(segments) for segments in list_json]

In [None]:
# quick look: `duration()` of the first loaded timeline
list_timelines[0].duration()

In [None]:
# keep the `segmentation()` of the first timeline for comparison in the next cell
segmented = list_timelines[0].segmentation()

In [None]:
# `duration()` of the segmentation built from the first timeline
segmented.duration()

In [None]:
# `duration()` of the gaps of the first timeline (used below as the non-speech time)
list_timelines[0].gaps().duration()

In [None]:
# Per recording: the share of time classified as speech and the number of
# speech intervals.
# The share is a percentage (0-100) of speech time over speech + gap time;
# it is NOT a duration in seconds.
duration_speech_meal = []
for speech_timeline in list_timelines:
    if len(speech_timeline) == 0:
        # no speech segments at all: report 0 % instead of attempting the division
        duration_speech_meal.append(0.0)
        continue
    speech_time = speech_timeline.duration()
    total_time = speech_time + speech_timeline.gaps().duration()
    duration_speech_meal.append(speech_time / total_time * 100)
intervals_speech_meal = [len(speech_timeline) for speech_timeline in list_timelines]

In [None]:
# Build the DataFrame index from the numeric prefix (first four characters) of
# each saved JSON file name in the working directory.
# Only `.json` entries are used: any other entry (a hidden file, or the
# spreadsheet this notebook exports into the same folder) would make `int()` fail.
index_participants = sorted(
    int(name[:4]) for name in os.listdir(".") if name.endswith(".json")
)

In [None]:
# Assemble the per-participant summary table, one row per entry of `index_participants`.
# NOTE(review): the first column is labelled "(sec)", but `duration_speech_meal`
# is computed above as a percentage — confirm the intended label.
df = pd.DataFrame(
    {
        'Speech Duration Meal(sec)': duration_speech_meal,
        'No of speech intervals meal': intervals_speech_meal,
    },
    index=index_participants,
)

In [None]:
# save the summary table to an Excel file; the path is relative, so it is
# written to the current working directory
df.to_excel('speech_detection.xlsx')