In [1]:
# Citation
# Please use the following citation

# @inproceedings{Hogg2019,
#   Title = {Speaker change detection using fundamental frequency with application to multi-talker segmentation},
#   Author = {Hogg, Aidan O. T. and Naylor, Patrick A. and Evers, Christine},
#   Booktitle = {ICASSP 2019, IEEE International Conference on Acoustics, Speech, and Signal Processing},
#   Address = {Brighton, UK},
#   Month = {April},
#   Year = {2019},
# }

# Copyright (C) Aidan Hogg 2019
# Home page: https://aidanhogg.co.uk/

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You can obtain a copy of the GNU General Public License from
# http://www.gnu.org/copyleft/gpl.html or by writing to
# Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

import numpy as np
import math
import scipy.io as sio

from KalmanFilter import KalmanFilter as kf

def merge_vad_onsets(onsets, dt, speaker_changes):
    """Add VAD onsets that are not already close to a known speaker change.

    An onset is appended to ``speaker_changes`` only when no existing entry
    lies strictly inside ``(onset - dt, onset + dt)``. Entries appended
    earlier in the same call take part in that check, so two onsets that are
    close to each other yield a single new entry.

    Args:
        onsets: Iterable of VAD onset positions, in frames.
        dt: Exclusion half-width in seconds (multiplied by 100 to obtain
            frames).
        speaker_changes: List of speaker-change positions, in frames. It is
            extended and sorted in place.

    Returns:
        The same ``speaker_changes`` list object, sorted in ascending order.
    """
    window = dt * 100  # convert delta from seconds to frames

    for vad_change in onsets:
        low = vad_change - window
        high = vad_change + window

        # Strict inequalities: a change exactly ``window`` frames away does
        # not suppress the onset.
        if not any(low < change < high for change in speaker_changes):
            speaker_changes.append(vad_change)

    # The previous code called sorted(), which returns a new list and leaves
    # its argument untouched; sort in place so the result really is ordered.
    speaker_changes.sort()

    return speaker_changes


def test_bench(file, changes, oracle_changes, collar):
    """Print detection statistics for estimated changes against an oracle.

    Two figures are printed:

    * False alarms: the share of ``changes`` that have no oracle change
      strictly within ``collar`` of them.
    * Hit / miss / multi-hit: for each scored oracle change, whether exactly
      one, none, or several entries of ``changes`` fall strictly within
      ``collar`` of it. An oracle change is skipped (not scored) when the
      previously scored one is less than ``collar`` before it; the first
      comparison uses frame 0 as the "previous" change.

    When a percentage has an empty denominator (no ``changes``, or no oracle
    change was scored) it is reported as 0.0 instead of raising
    ``ZeroDivisionError``.

    Args:
        file: Label printed in the header line.
        changes: Sequence of estimated change positions, in frames.
        oracle_changes: Sequence of reference change positions, in frames.
        collar: Tolerance in seconds (multiplied by 100 to obtain frames).

    Returns:
        ``changes``, unmodified.
    """

    def _percent(count, total):
        # Guard the empty case so an empty meeting does not abort the run.
        if not total:
            return 0.0
        return round(float(count) / total * 100.0, 2)

    print(file + ' Pitch features (collar ' + str(collar)+'s):')

    collar_frames = collar * 100  # convert collar from seconds to frames

    # --- false alarms: estimated changes with no oracle change nearby ---
    identified = 0
    unidentified = 0
    for change in changes:
        low = change - collar_frames
        high = change + collar_frames
        if any(low < oracle_change < high for oracle_change in oracle_changes):
            identified += 1
        else:
            unidentified += 1

    print('False alarms: ' + str(_percent(unidentified, identified + unidentified)) + '%')

    # --- hits / misses: oracle changes matched by estimated changes ---
    hit = 0
    miss = 0
    multi_hit = 0
    last_speaker_change = 0

    for oracle_change in oracle_changes:
        # Skip oracle changes that follow the last scored one too closely.
        if last_speaker_change > oracle_change - collar_frames:
            continue
        last_speaker_change = oracle_change

        low = oracle_change - collar_frames
        high = oracle_change + collar_frames
        detected = sum(1 for change in changes if low < change < high)

        if detected == 1:
            hit += 1
        elif detected == 0:
            miss += 1
        else:
            multi_hit += 1

    scored = hit + miss + multi_hit
    print('When there is a speaker change how often is there a change in pitch: '
          + str(_percent(hit, scored)) + '% (hit) '
          + str(_percent(miss, scored)) + '% (miss) '
          + str(_percent(multi_hit, scored)) + '% (multi-hit)')

    print('\n')
    return changes


# The name starts with "test_", so pytest would try to collect this reporting
# helper as a test case; opt out explicitly.
test_bench.__test__ = False


def multi_track_kalman_filter(pv, fx):
    """Find frames where the tracked pitch jumps, using a bank of Kalman filters.

    One filter ("track") is active at a time. Every frame all tracks are
    advanced with an identity transition model; the active track is updated
    with ``fx[t]`` only when ``pv`` exceeds 0.99 at ``t`` and at the two
    frames on either side.

    A two-state detector then inspects the active track's post-fit residual:

    * state 0 (settling): wait until ``|residual| < 7``, then arm.
    * state 1 (armed): when ``|residual| > 10`` record ``t`` as a change and
      go back to state 0. The active track is then switched to the other
      track whose state is closest to the current one, provided the distance
      is below 50; otherwise a new track is created and selected.

    Args:
        pv: Indexable per-frame values compared against the 0.99 threshold
            (the caller passes the pitch track's voicing probability).
        fx: Array of per-frame measurements fed to the filter (the caller
            passes the fundamental frequency); ``fx.size`` sets the range of
            frames processed, ``2 .. fx.size - 3``.

    Returns:
        List of frame indices ``t`` at which a change was declared.
    """
    ###############################################################
    # Multi Track Kalman filter
    ###############################################################

    # initialise Kalman filter
    x = np.array([[0]])
    p = np.array([[0]])
    h = np.array([[1]])
    r = np.array([[math.sqrt(400)]])
    q = np.array([[0.01]])

    # NOTE(review): every track is constructed from the same x/p array
    # objects. That is only safe if KalmanFilter never mutates them in
    # place - confirm against the KalmanFilter implementation.
    multi_tracks = [kf(x, p, h, q, r)]

    ###############################################################

    # Loop invariants, hoisted out of the per-frame loop.
    transition_model = np.array([[1]])
    threshold = 0.99      # minimum pv for a frame to be used as a measurement
    threshold_pu = 10     # residual above this (while armed) declares a change
    threshold_pl = 7      # residual below this arms the detector
    threshold_state = 50  # max state distance for re-using an existing track

    changes = []
    select = 0  # index of the active track
    state = 0   # 0 = settling, 1 = armed

    for t in range(2, fx.size - 2):

        for track in multi_tracks:
            track.prediction(transition_model)

        # Only use the measurement when the surrounding 5-frame window passes
        # the pv threshold.
        if pv[t] > threshold and pv[t+1] > threshold and pv[t-1] > threshold \
                and pv[t+2] > threshold and pv[t-2] > threshold:
            multi_tracks[select].update(fx[t])

        residual = abs(multi_tracks[select].get_post_fit_residual())

        if state == 0:
            if residual < threshold_pl:
                state = 1
            continue

        if residual <= threshold_pu:
            continue

        # Armed and the residual jumped: declare a change at this frame.
        changes.append(t)
        state = 0

        # Look for the closest *other* track to hand over to.
        current_state = multi_tracks[select].get_state()
        min_state_value = threshold_state
        min_state = 0
        for idx, track in enumerate(multi_tracks):
            if idx == select:
                continue
            diff = abs(track.get_state() - current_state)
            if min_state_value > diff:
                min_state = idx
                min_state_value = diff

        if min_state_value < threshold_state:
            select = min_state
        else:
            # No existing track is close enough: start a fresh one.
            multi_tracks.append(kf(x, p, h, q, r))
            select = len(multi_tracks) - 1

    return changes


###############################################################
# Main body
###############################################################

# Evaluate every AMI meeting session: estimate pitch-based speaker changes,
# write them as an Audacity label file, merge in VAD onsets and score the
# result against the oracle segmentation.
for meeting in ['ES2004','IS1009','EN2002', 'TS3003']:
    for meeting_id in ['a', 'b', 'c', 'd']:
        meeting_name = meeting+meeting_id

        # load pitch file generated in MATLAB by PEFAC
        pitch = 'AMI/pitch/' + meeting_name + '_D01-01_UNKNOWN_TRACK.mat'

        pitch_track = sio.loadmat(pitch)
        probability_voiced = pitch_track['track']['p'].item(0)
        fundamental_frequency = pitch_track['track']['f'].item(0)

        # estimate speaker changes based on pitch
        pitch_changes = multi_track_kalman_filter(probability_voiced, fundamental_frequency)

        # write pitch changes to file (Audacity label file format);
        # each label spans the 0.1 s that ends at the detected change.
        # The context manager guarantees the file is flushed and closed
        # before the next meeting is processed.
        with open('results/' + meeting_name + '_label.txt', "w") as lab_file:
            for change in pitch_changes:
                lab_file.write(str((change / 100.0) - 0.1) + '\t' + str(change / 100.0) + '\tPITCH\n')

        # import VAD onsets for meeting
        vad_file = 'AMI/vad/' + meeting_name + '_D01-01_UNKNOWN_TRACK_VAD.mat'

        vad_track = sio.loadmat(vad_file)
        # NOTE(review): presumably converts sample indices to the frame units
        # used for pitch_changes - confirm the 160 samples/frame assumption.
        vstart = vad_track['vadStart'].flatten() / 160

        # if VAD onset is within delta of a speaker change it will not be included
        delta = 0.5 # seconds

        # merge VAD onsets with speaker changes
        pitch_changes = merge_vad_onsets(vstart, delta, pitch_changes)

        # load oracle segmentation file
        segmentation_file = 'AMI/lab/' + meeting_name + '_D01-01_UNKNOWN.txt'

        oracle_changes = []
        try:
            with open(segmentation_file) as fp:
                for line in fp:
                    line = (line.rstrip()).split('\t')
                    # Skip blank lines (e.g. a trailing newline at end of
                    # file) instead of crashing in float('').
                    if not line[0]:
                        continue
                    # first column is a time in seconds -> frames
                    oracle_changes.append(int(float(line[0]) * 100))
        except IOError as e:
            print("I/O error({0}): {1}".format(e.errno, e.strerror))
            # exit() is an interactive helper injected by the site module and
            # would report success; raise SystemExit with a failure status.
            raise SystemExit(1)

        # Compare pitch estimates for speaker change boundaries against oracle segmentation
        collar = 0.5 # seconds
        test_bench(meeting_name, pitch_changes, oracle_changes, collar)



ES2004a Pitch features (collar 0.5s):
False alarms: 67.37%
When there is a speaker change how often is there a change in pitch: 72.0% (hit) 15.2% (miss) 12.8% (multi-hit)


ES2004b Pitch features (collar 0.5s):
False alarms: 69.58%
When there is a speaker change how often is there a change in pitch: 73.59% (hit) 14.72% (miss) 11.69% (multi-hit)


ES2004c Pitch features (collar 0.5s):
False alarms: 68.33%
When there is a speaker change how often is there a change in pitch: 64.65% (hit) 27.27% (miss) 8.08% (multi-hit)


ES2004d Pitch features (collar 0.5s):
False alarms: 60.07%
When there is a speaker change how often is there a change in pitch: 69.96% (hit) 22.75% (miss) 7.3% (multi-hit)


IS1009a Pitch features (collar 0.5s):
False alarms: 68.38%
When there is a speaker change how often is there a change in pitch: 65.12% (hit) 27.91% (miss) 6.98% (multi-hit)


IS1009b Pitch features (collar 0.5s):
False alarms: 74.56%
When there is a speaker change how often is there a change in pitch: