In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:
from types import SimpleNamespace
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.io import wavfile
from scipy import signal
import matplotlib.pyplot as plt
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the files available in the test input directory.
print(os.listdir("../input/test"))

# Any results you write to the current directory are saved as output.

['M3.wav', 'M1.wav', 'F1.wav', 'F5.wav', 'F4.wav', 'M2.wav', 'M5.wav', 'F2.wav', 'F3.wav', 'M4.wav']


In [2]:
def autocorr_method(frame, sfreq, filter_b, filter_a, threshold=0.46, fmin=50, fmax=400):
    """Estimate the pitch of one frame with a multi-band summary autocorrelation.

    Processing chain:
      1. remove the mean and normalise the frame to a peak amplitude of 1;
      2. center-clip at 25% of the peak;
      3. zero-phase low-pass at 3 kHz;
      4. filter with every band described by ``filter_b``/``filter_a``;
      5. low-pass each band at 700 Hz and autocorrelate it;
      6. sum the per-band autocorrelations (SACF) and pick the highest value
         located after the first rising point of the SACF.

    Args:
        frame: 1-D sequence of samples (any numeric dtype).
        sfreq: sampling frequency in Hz.
        filter_b: sequence of numerator coefficient arrays, one per band.
        filter_a: sequence of denominator coefficient arrays, one per band.
        threshold: minimum SACF peak value, relative to lag 0, required to
            accept the frame as voiced.
        fmin: lowest accepted f0 in Hz.
        fmax: highest accepted f0 in Hz.

    Returns:
        The estimated f0 in Hz, or 0 when the frame is empty, silent, too
        short to filter, or no peak satisfies ``threshold``/``fmin``/``fmax``.
    """
    # `np.float` was removed in NumPy 1.24; the builtin `float` is the same dtype.
    frame = np.asarray(frame).astype(float)
    if frame.size == 0:
        return 0

    frame -= frame.mean()
    amax = np.abs(frame).max()
    if amax == 0:
        return 0
    frame /= amax

    # Pre-processing: center clipping. The signal is normalised, so the
    # clipping level is 25% of the peak amplitude.
    cl = 0.25
    frame = np.where(frame >= cl, frame - cl,
                     np.where(frame <= -cl, frame + cl, 0.0))

    # Zero-phase low-pass filter (cut-off 3 kHz).
    fh = 3000
    b, a = signal.butter(5, fh / (sfreq / 2), 'low')
    # filtfilt pads with 3 * max(len(a), len(b)) samples by default and
    # rejects inputs that are not longer than that padding.
    if frame.size <= 3 * max(len(a), len(b)):
        return 0
    frame = signal.filtfilt(b, a, frame)

    # Low-pass applied to every band output before autocorrelation.
    lfreq = 700
    b_low, a_low = signal.butter(3, 2 * lfreq / sfreq, 'low')

    band_acfs = []
    for band_b, band_a in zip(filter_b, filter_a):
        band = signal.lfilter(band_b, band_a, frame)
        band = signal.lfilter(b_low, a_low, band)
        acf = signal.correlate(band, band)
        band_acfs.append(acf[len(acf) // 2:])  # keep non-negative lags only

    if not band_acfs:
        return 0

    # Summary autocorrelation: sum over bands for every lag.
    SACF = np.sum(band_acfs, axis=0)

    # First lag at which the SACF starts rising (end of the zero-lag lobe).
    rising = np.where(np.diff(SACF) > 0)[0]
    if len(rising) == 0:
        return 0
    rmin1 = rising[0]

    # Highest peak after that point.
    peak = np.argmax(SACF[rmin1:]) + rmin1
    if peak == 0 or SACF[0] <= 0:
        # Avoid dividing by a zero lag or by zero energy.
        return 0
    rmax = SACF[peak] / SACF[0]
    f0 = sfreq / peak

    if rmax > threshold and fmin <= f0 <= fmax:
        return f0
    return 0

In [3]:
class Counters:
    """Accumulator for pitch-detection evaluation statistics.

    An instance holds either the statistics of a single file or, after
    repeated calls to :meth:`add`, the totals over several files.
    """

    def __init__(self, gross_threshold=0.2):
        """Create an empty set of counters.

        Args:
            gross_threshold: relative f0 error above which a voiced frame
                counts as a gross error.
        """
        self.num_voiced = 0
        self.num_unvoiced = 0
        self.num_voiced_unvoiced = 0
        self.num_unvoiced_voiced = 0
        self.num_voiced_voiced = 0
        self.num_gross_errors = 0
        self.fine_error = 0
        self.e2 = 0
        self.gross_threshold = gross_threshold
        self.nfiles = 0

    def add(self, other):
        """Add the statistics of ``other`` to this instance.

        ``None`` is ignored. Every non-``None`` ``other`` counts as one more
        file in ``nfiles``.
        """
        if other is not None:
            self.num_voiced += other.num_voiced
            self.num_unvoiced += other.num_unvoiced
            self.num_voiced_unvoiced += other.num_voiced_unvoiced
            self.num_unvoiced_voiced += other.num_unvoiced_voiced
            self.num_voiced_voiced += other.num_voiced_voiced
            self.num_gross_errors += other.num_gross_errors
            self.fine_error += other.fine_error
            self.e2 += other.e2
            self.nfiles += 1

    @staticmethod
    def _percent(count, total):
        """Return ``100 * count / total``, or NaN when ``total`` is zero."""
        if total == 0:
            return float('nan')
        return 100 * count / total

    def __repr__(self):
        """Return a multi-line, human-readable summary of the counters.

        The summary is computed without modifying the instance, so calling
        it repeatedly gives the same text. Ratios with a zero denominator
        are shown as ``nan`` instead of raising ``ZeroDivisionError``.
        """
        nframes = self.num_voiced + self.num_unvoiced
        # fine_error holds a sum over files after add(); average it here
        # using a local so that the stored value is left untouched.
        if self.nfiles > 0:
            mean_fine_error = self.fine_error / self.nfiles
        else:
            mean_fine_error = self.fine_error
        rmse = np.sqrt(self.e2 / nframes) if nframes > 0 else float('nan')
        lines = [
            f"Num. frames:\t{self.num_unvoiced + self.num_voiced} = {self.num_unvoiced} unvoiced + {self.num_voiced} voiced",
            f"Unvoiced frames as voiced:\t{self.num_unvoiced_voiced}/{self.num_unvoiced} ({self._percent(self.num_unvoiced_voiced, self.num_unvoiced):.2f}%)",
            f"Voiced frames as unvoiced:\t{self.num_voiced_unvoiced}/{self.num_voiced} ({self._percent(self.num_voiced_unvoiced, self.num_voiced):.2f}%)",
            f"Gross voiced errors (>{100*self.gross_threshold}%):\t{self.num_gross_errors}/{self.num_voiced_voiced} ({self._percent(self.num_gross_errors, self.num_voiced_voiced):.2f}%)",
            f"MSE of fine errors:\t{100*mean_fine_error:.2f}%",
            f"RMSE:\t{rmse:.2f}"
        ]
        return '\n'.join(lines)

In [4]:
def compare(fref, pitch):
    """Compare an estimated pitch track against a reference file.

    Args:
        fref: path of a text file readable by ``np.loadtxt`` containing the
            reference f0 values (0 marks an unvoiced frame).
        pitch: sequence of estimated f0 values (0 marks an unvoiced frame).

    Returns:
        A ``Counters`` instance with the statistics of this file, or ``None``
        (after printing an error) when the two tracks differ by more than
        5 frames.
    """
    # atleast_1d keeps len() and boolean indexing valid for one-value inputs.
    vref = np.atleast_1d(np.loadtxt(fref))
    vtest = np.atleast_1d(np.array(pitch))

    diff_frames = len(vref) - len(vtest)
    if abs(diff_frames) > 5:
        print(f"Error: number of frames in ref ({len(vref)}) != number of frames in test ({len(vtest)})")
        return None

    # Small length mismatches are tolerated: truncate both to the shorter one.
    nframes = min(len(vref), len(vtest))
    vref = vref[:nframes]
    vtest = vtest[:nframes]

    counters = Counters()
    counters.num_voiced = np.count_nonzero(vref)
    counters.num_unvoiced = len(vref) - counters.num_voiced
    counters.num_unvoiced_voiced = np.count_nonzero(np.logical_and(vref == 0, vtest != 0))
    counters.num_voiced_unvoiced = np.count_nonzero(np.logical_and(vref != 0, vtest == 0))

    voiced_voiced = np.logical_and(vref != 0, vtest != 0)
    counters.num_voiced_voiced = np.count_nonzero(voiced_voiced)

    # Relative f0 error on frames that both tracks consider voiced.
    rel_error = np.absolute(vref[voiced_voiced] - vtest[voiced_voiced]) / vref[voiced_voiced]
    gross_errors = rel_error > counters.gross_threshold
    counters.num_gross_errors = np.count_nonzero(gross_errors)

    # RMS of the relative errors that are not gross. With no such frame the
    # mean of an empty array would be NaN and would poison the totals, so
    # report 0 instead.
    fine = rel_error[np.logical_not(gross_errors)]
    counters.fine_error = np.sqrt(np.square(fine).mean()) if fine.size > 0 else 0.0
    counters.e2 = np.square(vref - vtest).sum()

    return counters

In [5]:
def _build_filter_bank(sfreq, startFreq=100, stopFreq=3500, numFilts=10, b0=50):
    """Design the band-pass filter bank used by the pitch estimator.

    Center frequencies are log-spaced between ``startFreq`` and ``stopFreq``;
    bandwidths are log-spaced between ``b0`` and ``stopFreq/2*b0/startFreq``.

    NOTE(review): the upper edge of the last band must stay below sfreq/2,
    otherwise the filter design is expected to fail - confirm for low
    sampling rates.

    Returns:
        ``(filter_b, filter_a)``: two lists with the numerator and
        denominator coefficients of each second-order Butterworth band-pass.
    """
    bf = stopFreq / 2 * b0 / startFreq  # last filter bandwidth
    filtFreq = np.logspace(np.log10(startFreq), np.log10(stopFreq), numFilts)
    bw = np.logspace(np.log10(b0), np.log10(bf), numFilts)

    filter_b = []
    filter_a = []
    for center, width in zip(filtFreq, bw):
        band_edges = [2 * (center - width / 2) / sfreq, 2 * (center + width / 2) / sfreq]
        b, a = signal.butter(2, band_edges, btype='band')
        filter_b.append(b)
        filter_a.append(a)
    return filter_b, filter_a


def wav2f0(options, gui):
    """Estimate f0 for every file listed in ``gui`` and report the results.

    For each non-empty line ``name`` of ``gui`` the file
    ``<datadir>/<name>.wav`` is read, split into frames and passed to
    ``autocorr_method``; the resulting pitch track is smoothed with a
    5-point median filter. If ``<datadir>/<name>.f0ref`` exists, the track
    is scored against it and a summary over all scored files is printed.

    Args:
        options: object with the attributes ``datadir``, ``submission``
            (output CSV path or ``None``), ``windowlength``, ``frameshift``,
            ``left_padding`` and ``right_padding`` (all four in milliseconds).
        gui: path of a text file with one file stem per line.
    """
    totalCounters = Counters()
    fs = open(options.submission, 'w') if options.submission is not None else None
    try:
        with open(gui) as f:
            if fs is not None:
                print('id,frequency', file=fs)
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    continue
                filename = os.path.join(options.datadir, line + ".wav")
                f0ref_filename = os.path.join(options.datadir, line + ".f0ref")
                print("Processing:", filename)
                sfreq, data = wavfile.read(filename)
                nsamples = len(data)

                # From milliseconds to samples
                ns_windowlength = int(round((options.windowlength * sfreq) / 1000))
                ns_frameshift = int(round((options.frameshift * sfreq) / 1000))
                ns_left_padding = int(round((options.left_padding * sfreq) / 1000))
                ns_right_padding = int(round((options.right_padding * sfreq) / 1000))

                # The filter bank depends on the sampling rate of this file.
                filter_b, filter_a = _build_filter_bank(sfreq)

                pitch = []
                frame_starts = range(-ns_left_padding,
                                     nsamples - ns_windowlength + ns_right_padding + 1,
                                     ns_frameshift)
                for ini in frame_starts:
                    # Frames that overhang either end of the signal are truncated.
                    first_sample = max(0, ini)
                    last_sample = min(nsamples, ini + ns_windowlength)
                    frame = data[first_sample:last_sample]
                    pitch.append(autocorr_method(frame, sfreq, filter_b, filter_a))

                # Post-processing: 5-point median filter over the pitch track.
                pitch = signal.medfilt(pitch, 5)
                print(pitch[40:55])  # short sample of the track for inspection

                if fs is not None:
                    for i, f0 in enumerate(pitch):
                        print(line + '_' + str(i) + ',', f0, file=fs)

                if os.path.isfile(f0ref_filename):
                    counters = compare(f0ref_filename, pitch)
                    totalCounters.add(counters)
    finally:
        # Close the submission file even if processing fails, so everything
        # written so far reaches the disk.
        if fs is not None:
            fs.close()

    if totalCounters.num_voiced + totalCounters.num_unvoiced > 0:
        print("### Summary")
        print(totalCounters)
        print("-------------------------------\n")

In [6]:
# Evaluation run on the FDA-UE set (time values in milliseconds);
# submission=None means no CSV file is written.
fda_ue_options = SimpleNamespace(
    windowlength=32,
    frameshift=15,
    left_padding=16,
    right_padding=16,
    datadir='../input',
    submission=None,
)
wav2f0(fda_ue_options, '../input/fda_ue.gui')

Processing: ../input/fda_ue/rl001.wav


  b = a[a_slice]
  return x[reverse].conj()


[140.84507042 139.86013986 138.88888889 136.98630137   0.
   0.           0.           0.           0.           0.
   0.           0.           0.         119.04761905 121.21212121]
Processing: ../input/fda_ue/rl002.wav
[140.84507042 134.22818792 130.71895425 126.58227848 120.48192771
 116.95906433 114.28571429   0.           0.           0.
   0.           0.           0.           0.           0.        ]
Processing: ../input/fda_ue/rl003.wav
[145.98540146 144.92753623 143.88489209   0.           0.
   0.           0.           0.           0.           0.
   0.           0.         150.37593985 150.37593985 151.51515152]
Processing: ../input/fda_ue/rl004.wav
[141.84397163 119.76047904   0.           0.           0.
   0.           0.           0.           0.           0.
   0.           0.           0.           0.           0.        ]
Processing: ../input/fda_ue/rl005.wav
[108.69565217 104.71204188 102.56410256 100.50251256  97.56097561
  97.08737864   0.           0.           

In [7]:
# Run on the test set (time values in milliseconds) and write the
# estimated pitch tracks to submission.csv.
test_options = SimpleNamespace(
    windowlength=26.5,
    frameshift=10,
    left_padding=13.25,
    right_padding=7,
    datadir='../input/test',
    submission='submission.csv',
)
wav2f0(test_options, '../input/test.gui')

Processing: ../input/test/F1.wav


  b = a[a_slice]
  return x[reverse].conj()


[  0.           0.           0.           0.         215.05376344
 215.05376344 215.05376344 210.52631579 208.33333333 206.18556701
 204.08163265 202.02020202 202.02020202 202.02020202 202.02020202]
Processing: ../input/test/F2.wav
[  0.    0.    0.    0.    0.    0.    0.  312.5 312.5 312.5 312.5 312.5
 312.5 312.5 312.5]
Processing: ../input/test/F3.wav
[281.69014085 281.69014085 281.69014085   0.           0.
   0.           0.           0.           0.           0.
   0.           0.           0.           0.         229.88505747]
Processing: ../input/test/F4.wav
[370.37037037 377.35849057 384.61538462 384.61538462 392.15686275
 400.         400.         400.         400.         392.15686275
 384.61538462   0.           0.           0.           0.        ]
Processing: ../input/test/F5.wav
[259.74025974 263.15789474 266.66666667 270.27027027 277.77777778
 281.69014085 289.85507246 298.50746269 307.69230769 312.5
 317.46031746 327.86885246 333.33333333 344.82758621 350.87719298]
Pr