# Trying out just one file and concatenating the speech segments to assess them

In [2]:
import pyannote.core
from pydub import AudioSegment
file = {'uri':'1043_meal', 'audio':'/Users/andrei-macpro/Documents/Data/Audio/Meal/1043_meal.wav'}
import torch
# speech activity detection model trained on AMI training set
sad = torch.hub.load('pyannote/pyannote-audio', 'sad_ami')
# obtain raw SAD scores (as `pyannote.core.SlidingWindowFeature` instance)
sad_scores = sad(file)

Using cache found in /Users/andrei-macpro/.cache/torch/hub/pyannote_pyannote-audio_master


In [7]:
from pyannote.audio.utils.signal import Binarize
binarize = Binarize(offset=0.52, onset=0.52, log_scale=True, 
                    min_duration_off=0.1, min_duration_on=0.1)

# speech regions (as `pyannote.core.Timeline` instance)
speech = binarize.apply(sad_scores, dimension=1)

In [8]:
# convert timestamps to miliseconds 
type(speech[0])
# speech is a timeline made of segments

pyannote.core.segment.Segment

In [None]:
#extract speech and non-speech segments (start and end time) as a dictionary
timestamps_speech=dict(speech)
timestamps_silence = dict(speech.gaps())

In [None]:
file=AudioSegment.from_wav('/Users/andrei-macpro/Documents/Data/Audio/1043_meal.wav')

In [None]:
# pydub works in miliseconds so we're going to convert the timestamps from the pyannote seconds to miliseconds
start_times_speech=[x*1000 for x in list(timestamps_speech.keys())]
end_times_speech=[x*1000 for x in list(timestamps_speech.values())]
start_times_silence=[x*1000 for x in list(timestamps_silence.keys())]
end_times_silence=[x*1000 for x in list(timestamps_silence.values())]

In [None]:
speech_segments=[file[start_time:end_time] for start_time,end_time in zip(start_times_speech, end_times_speech)]
silence_segments=[file[start_time:end_time] for start_time,end_time in zip(start_times_silence, end_times_silence)]

In [None]:
# export the concatenated speech and silence segments to disk
sum(speech_segments).export('/Users/andrei-macpro/Documents/Data/Audio/speech.wav', format="wav")
sum(silence_segments).export('/Users/andrei-macpro/Documents/Data/Audio/non-speech.wav', format="wav")

# Now let's get on with the full speech detection processing

In [26]:
import os
import pyannote.core
import torch
from pyannote.core import Timeline, Segment
from pyannote.audio.utils.signal import Binarize
from tqdm import tqdm
import pandas as pd

In [None]:
os.chdir('/Users/andrei-macpro/Documents/Data/Audio/Meal')


In [None]:
path = '/Users/andrei-macpro/Documents/Data/Audio/Meal'  #if string starts with slash it is considered absolute
dirs = os.listdir( path )
file_names=sorted([i for i in os.listdir(".") if not i.startswith(".")])
#file_names = [x.replace('.wav','') for x in file_names]

In [None]:
sad = torch.hub.load('pyannote/pyannote-audio', 'sad_ami') # ok so this isn't taking up too much memory 

In [None]:
binarize = Binarize(offset=0.52, onset=0.52, log_scale=True, 
                    min_duration_off=0.1, min_duration_on=0.1)

In [None]:
# extract a dictionary of key-filename : value-#pyannoteTimeline 
def timeline(file_names):
    temp_list1=list()
    temp_list2=list()
    for file_name in tqdm(file_names):
        sad_scores=sad(file_name)
        speech=binarize.apply(sad_scores,dimension=1)
        del sad_scores
        temp_list1.append(file_name.get('audio'))
        temp_list2.append(speech)
    trial_dict=dict(zip(temp_list1,temp_list2))
    return trial_dict

In [None]:
# extract json files of the #pyannoteTimeline objects which can then be loaded again
def write_disk(sad_segments):
    for key in sad_segments.keys():
        temp_json=sad_segments[key].for_json()
        with open(str(key)+'.json', 'w') as outfile:
            json.dump(temp_json, outfile)

In [None]:
# make dictionary for files and filenames
list_dict_filenames=list()
files = dict()
for file_name in file_names:
    list_dict_filenames.append({'audio':str(file_name)})

In [None]:
sad_segments=timeline(list_dict_filenames) 

In [None]:
import json
write_disk(sad_segments) #save Timeline objects as json files to disk

# now we load the json files back to timelines to get the durations and no of intervals

In [159]:
# first for meal
import os
import json
import pyannote.core
from pyannote.core import Timeline, Segment
import statistics
os.chdir('/Users/andrei-macpro/Documents/Data/Audio/speech_detection_timestamps/speech_detection_meal')

In [160]:
# function that takes as input a list of file-names and outputs a list of pyannote timelines 
path = '/Users/andrei-macpro/Documents/Data/Audio/speech_detection_timestamps/speech_detection_meal'
json_files = [pos_json for pos_json in sorted(os.listdir(path))]
def get_timelines(json_files):
    list_json=list()
    list_timelines=list()
    for filename in json_files: # loop that imports all json data into separate dictionaries
        with open(filename) as json_file:
            data = json.load(json_file)
            list_json.append(data)
    for file, segments in zip(json_files, list_json): # loop to iterate through the files and create a new timeline for each of them 
        timeline=Timeline()
        list_timelines.append(timeline.from_json(segments))
    return list_timelines
    

In [161]:
def std_intervals(list_timelines): # standard dev of intervals of speech
    std_interval_duration_speech_meal=list()
    for timeline in list_timelines: # go thru each timeline
        segment_duration=list()
        for segment in timeline: # take the duration of each interval/segment
            segment_duration.append(segment.duration) #append to a list ==> list of all interval durations from one TL
        std_interval_duration_speech_meal.append(statistics.stdev(segment_duration)) # take std of that list
    return std_interval_duration_speech_meal


def std_intervals_silence(list_timelines):
    std_interval_duration_silence_meal=list()
    for timeline in list_timelines: # go thru each timeline
        segment_duration=list()
        for segment in timeline.gaps(): # take the duration of each interval/segment
            segment_duration.append(segment.duration) #append to a list ==> list of all interval durations from one TL
        std_interval_duration_silence_meal.append(statistics.stdev(segment_duration)) # take std of that list
    return std_interval_duration_silence_meal

In [162]:
list_timelines_meal=get_timelines(json_files) # list containing the pyannote timeline of each meal recording 

In [163]:
# now need to extract duration of speech and duration of non-speech and number of segments/intervals for each
duration_speech_meal = [timeline.duration()/(timeline.duration()+timeline.gaps().duration())*100 for timeline in list_timelines_meal] # this is in seconds
intervals_speech_meal =[len(timeline) for timeline in list_timelines_meal]
intervals_per_min_meal= [(interval/timeline.duration())*60 for interval,timeline in zip(intervals_speech_meal, list_timelines_meal)]

In [164]:
avg_interval_duration__speech_meal= [timeline.duration()/len(timeline) for timeline in list_timelines_meal]
std_interval_duration_speech_meal=std_intervals(list_timelines_meal)

In [165]:
avg_interval_duration__silence_meal= [timeline.gaps().duration()/len(timeline.gaps()) for timeline in list_timelines_meal]
std_interval_duration_silence_meal=std_intervals_silence(list_timelines_meal)

In [150]:
list_timelines[0].gaps().duration()

NameError: name 'list_timelines' is not defined

In [166]:
# now for play
os.chdir('/Users/andrei-macpro/Documents/Data/Audio/speech_detection_timestamps/speech_detection_play')

In [167]:
path = '/Users/andrei-macpro/Documents/Data/Audio/speech_detection_timestamps/speech_detection_play'
json_files = [pos_json for pos_json in sorted(os.listdir(path))]
list_timelines_play=get_timelines(json_files)

In [168]:
duration_speech_play = [timeline.duration()/(timeline.duration()+timeline.gaps().duration())*100 for timeline in list_timelines_play] # this is in seconds
intervals_speech_play =[len(timeline) for timeline in list_timelines_play]
intervals_per_min_play= [(interval/timeline.duration())*60 for interval,timeline in zip(intervals_speech_play, list_timelines_play)]

In [169]:
avg_interval_duration__speech_play= [timeline.duration()/len(timeline) for timeline in list_timelines_play]
std_interval_duration_speech_play=std_intervals(list_timelines_play)

In [170]:
avg_interval_duration__silence_play= [timeline.gaps().duration()/len(timeline.gaps()) for timeline in list_timelines_play]
std_interval_duration_silence_play=std_intervals_silence(list_timelines_play)

# Let's do number of transitions after 1,2,3 seconds

In [171]:
# now let's write them to pandas and then save it to a spreadsheet
# first need to get an index for pandas 
index_participants=sorted([int(i[:4]) for i in os.listdir(".")])

In [174]:
# create pandas df
df = pd.DataFrame(list(zip(duration_speech_meal, intervals_per_min_meal,avg_interval_duration__speech_meal, 
                           std_interval_duration_speech_meal,avg_interval_duration__silence_meal, std_interval_duration_silence_meal,
                           duration_speech_play, 
                           intervals_per_min_play, 
                           avg_interval_duration__speech_play,
                           std_interval_duration_speech_play,
                           avg_interval_duration__silence_play,
                           std_interval_duration_silence_play)), 
               columns =['Percent speech meal', 'intervals/min meal', 'Avg speech duration meal', 
                         'std speech duration meal', 'Avg silence duration meal',
                         'std silence duration meal',
                         'Percent speech play',
                        'intervals/min play','Avg speech duration play', 
                         'std speech duration play', 'Avg silence duration play', 
                         'std silence duration play'], index=index_participants) 
df.index.name='Subject_ID'

In [175]:
df.to_excel('speech_detection_features.xlsx')

In [173]:
df.head()

Unnamed: 0_level_0,Percent speech meal,intervals/min meal,Avg speech duration meal,std speech duration meal,Avg silence duration meal,std silence duration meal,Percent speech play,intervals/min play,Avg speech duration play,std speech duration play,Avg silence duration play,std silence duration play
Subject_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1043,62.701336,17.529338,3.422833,3.947883,2.046831,2.348845,82.601449,11.20497,5.354767,5.890128,1.141813,0.947178
1047,77.610462,13.859264,4.329234,5.094716,1.254307,1.216747,88.128893,10.087324,5.948059,4.799366,0.81527,0.668524
1049,43.958353,26.899321,2.23054,4.11205,2.860906,3.518392,59.860975,26.081915,2.300445,1.990709,1.572198,2.032243
1053,80.624091,11.131207,5.390251,6.769518,1.304219,1.227086,81.19465,16.224784,3.698046,4.286129,0.86591,0.78611
1059,77.260313,12.069696,4.971127,8.375445,1.47295,1.586202,67.370598,21.286176,2.818731,3.38016,1.376956,1.25187
