In [1]:
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import sounddevice as sd
import scipy
import pickle
import librosa
%matplotlib qt5

In [2]:
# helpers to play/write/read audio
# Sample rate shared by every helper in this notebook: it is handed to
# sd.play / sf.write and used to convert between seconds and sample counts.
fs=44100

def play_audio(y):
    """Hand the signal `y` to sounddevice for playback at the notebook rate `fs`."""
    sd.play(data=y, samplerate=fs)

def write_audio(y, filename):
    """Save the signal `y` as ``<filename>.wav`` using the notebook rate `fs`.

    `filename` is given without the extension; '.wav' is appended here.
    """
    target = filename+'.wav'
    sf.write(target, y, fs)

def read_audio(filename):
    """Load ``<filename>.wav`` and return its sample data only.

    `filename` is given without the extension; '.wav' is appended here.
    """
    # sf.read also returns the file's own sample rate. It is discarded, so the
    # module-level `fs` is never compared against what the file was recorded at.
    samples, _file_rate = sf.read(filename +'.wav')
    return samples

def graph_signal(y, start=None, end=None):
    """Plot `y` against time (sample index divided by `fs`) in a new figure.

    `start` and `end` are forwarded to plt.xlim as the left and right limits
    of the time axis; None leaves that side unchanged.
    """
    plt.figure()
    seconds = np.arange(len(y))/fs
    plt.plot(seconds, y)
    plt.xlim(start, end)

def graph_spectrum(y, distance, title=""):
    """Plot the magnitude spectrum of `y` and mark its detected peaks.

    The x axis is the rFFT bin index (not Hz). Peaks are found with
    scipy.signal.find_peaks and drawn as 'x' markers. When at least two peaks
    exist, red markers are also drawn on the baseline at the first 20 integer
    multiples of the second peak's bin index, which this notebook treats as
    the fundamental.

    Parameters
    ----------
    y : 1-D array of samples.
    distance : minimum spacing between peaks, in bins, forwarded to find_peaks.
    title : figure title.
    """
    magnitude = np.abs(scipy.fft.rfft(y))
    peaks, _ = scipy.signal.find_peaks(magnitude, distance=distance)

    plt.figure()
    plt.title(title)
    plt.plot(magnitude)
    plt.plot(peaks, magnitude[peaks], "x")

    # also graph multiples of the fundamental frequency
    # With fewer than two peaks there is no peaks[1] to use as the spacing, so
    # the harmonic markers are skipped instead of raising IndexError.
    if len(peaks) < 2:
        return
    space = peaks[1]
    a = (np.arange(20) + 1) * space
    plt.plot(a, np.zeros_like(a), '2', color='red')

def normalize(y, scale=1.0):
    """Rescale `y` so that its largest absolute value equals `scale`.

    Parameters
    ----------
    y : array-like of samples (any shape).
    scale : target peak magnitude.

    Returns
    -------
    numpy array with the same shape as `y`. A silent (all-zero) or empty
    input is returned as zeros rather than dividing by zero.
    """
    y = np.asarray(y)
    if y.size == 0:
        return np.zeros_like(y, dtype=float)

    # np.max over np.abs works on arrays of any rank and avoids iterating the
    # samples in Python the way the builtin max() does.
    peak = np.max(np.abs(y))
    if peak == 0:
        return np.zeros_like(y, dtype=float)
    return scale * y/peak


In [3]:
class FormantSlide:
    """Plain container for the pitch-glide settings read by the synthesizer.

    Attributes
    ----------
    formant_thresh : frequency threshold; simple_synth compares each harmonic's
        frequency against it to choose between the "low" and "high" glide.
    low_slide_start, high_slide_start : initial rate factor of each glide,
        passed to time_slide as `diff` (1.0 means no glide).
    low_slide_T, high_slide_T : glide duration settings for the two groups,
        in seconds.
    """

    def __init__(self, formant_thresh=1000, low_slide_start=1.0, high_slide_start=1.0, low_slide_T = 0.1, high_slide_T = 0.11):
        # Split point between the two harmonic groups.
        self.formant_thresh = formant_thresh
        # Settings for harmonics at or below the threshold.
        self.low_slide_start, self.low_slide_T = low_slide_start, low_slide_T
        # Settings for harmonics above the threshold.
        self.high_slide_start, self.high_slide_T = high_slide_start, high_slide_T

        

In [4]:
# brownian noise to add slight fluctuation to pitch
def gen_brown_noise(N: int):
    """Return `N` samples of zero-mean Brownian noise scaled to a peak of 1.

    The noise is the running sum of standard-normal samples drawn from numpy's
    global RNG, with its mean removed and then divided by its largest absolute
    value.

    Degenerate lengths are handled explicitly: N == 0 gives an empty array, and
    a sequence whose centred values are all zero (always the case for N == 1)
    is returned as zeros instead of 0/0 = NaN.
    """
    white_noise = np.random.randn(N)
    brownian_noise = np.cumsum(white_noise)
    if brownian_noise.size == 0:
        return brownian_noise

    brownian_noise -= np.mean(brownian_noise)
    peak = np.max(np.abs(brownian_noise))
    # Only rescale when there is something to scale; avoids dividing by zero.
    if peak > 0:
        brownian_noise /= peak

    return brownian_noise

In [5]:
# time dilation
def time_slide(diff, slide_T, N, noise=0):
    """Build a warped sample-position axis whose rate glides from `diff` to 1.

    t[0] is 0 and every later entry advances by the current rate plus
    `noise * brown_noise[i]`. The rate starts at `diff` and is moved toward 1
    by a fixed step per sample, sized so the gap closes in about
    `slide_T * fs` samples; if a step carries it past 1 it is pinned to
    exactly 1 on the following sample.

    Returns an array of `N` positions measured in samples.
    """
    # Direction is decided once from the starting rate and never re-evaluated.
    rising = diff < 1
    step = abs(1 - diff) / (slide_T * fs)
    if not rising:
        step = -step

    jitter = gen_brown_noise(N)
    t = np.zeros(N)
    for i in range(1, N):
        # add slight noise to each gap
        t[i] = t[i - 1] + diff + noise * jitter[i]

        if rising:
            still_sliding, overshot = diff < 1, diff > 1
        else:
            still_sliding, overshot = diff > 1, diff < 1

        if still_sliding:
            diff += step
        elif overshot:
            diff = 1
    return t


# synthesizer (similar to A2)
# TODO: good parameters: 0.7 low start, 1.2 high start
def simple_synth(f1, a, phi, T:float, noise, slide:FormantSlide):
    """Additive synthesis of harmonics of `f1` on two warped time axes.

    Harmonic k (1-based) has frequency k * f1, amplitude a[k-1] and phase
    phi[k-1]. Harmonics above `slide.formant_thresh` are evaluated on the
    "high" time axis, the rest on the "low" one. Each axis comes from
    time_slide with its own start rate and duration, so the two groups can
    glide in pitch independently.

    Parameters
    ----------
    f1 : fundamental frequency.
    a, phi : per-harmonic amplitudes and phases; must have equal length.
    T : duration in seconds (truncated to a whole number of samples).
    noise : pitch-jitter amount forwarded to time_slide.
    slide : FormantSlide holding the glide settings.

    Returns
    -------
    1-D float array of int(T * fs) samples (not normalized).
    """
    assert(len(a) == len(phi))

    # TODO: rounding; loses information
    N=int(T*fs)

    # Warped time axes in seconds; time_slide returns positions in samples.
    t_low = time_slide(slide.low_slide_start, slide.low_slide_T, N, noise)/fs
    # The high group uses its own duration (high_slide_T); it previously
    # reused low_slide_T, so that setting had no effect.
    t_high = time_slide(slide.high_slide_start, slide.high_slide_T, N, noise)/fs

    y = np.zeros(N)
    for k, (amp, phase) in enumerate(zip(a, phi), start=1):
        f = k * f1
        t_axis = t_high if f > slide.formant_thresh else t_low
        y += amp * np.cos(2*np.pi*f*t_axis + phase)

    return y


In [6]:
def glottal_pulse(f0, T, noise, slide:FormantSlide):
    """Synthesize a peak-normalized pulse train at fundamental `f0`.

    Every harmonic of `f0` up to fs/2 is summed by simple_synth with unit
    amplitude and zero phase; the result is divided by its largest absolute
    value. `T`, `noise` and `slide` are forwarded to simple_synth unchanged.
    """
    # Count of harmonics that fit at or below half the sample rate.
    nharm = int(np.floor((fs/2)/f0))
    a = [1] * nharm
    phi = np.zeros_like(a)

    pulse = simple_synth(f0, a, phi, T, noise, slide)
    peak = max(abs(pulse))
    return pulse/peak

    
def glottal_pulse_simple(f0, T):
    """Return an impulse train of int(T * fs) samples with period near fs / f0.

    The spacing between impulses starts 50 samples longer than fs / f0, and
    that extra gap shrinks by 5 samples after every impulse until it reaches
    zero. Impulses have value 1; every other sample is 0.
    """
    N = int(T * fs)
    pulses = np.zeros(N)

    period = fs/f0
    extra_gap = 50
    gap_step = 5

    position = 0.0
    while position < N:
        pulses[int(position)] = 1
        position += period + extra_gap
        # Shrink the extra gap only while some of it is left.
        extra_gap = extra_gap - gap_step if extra_gap > 0 else extra_gap

    return pulses


In [None]:
# noise = 0.008 is a good value
# here, noise means fluctuation in pitch
# TODO: the a2 crossfade path is not exercised by any caller visible in this notebook.
def glottal_filter(a, f0, T, noise, slide:FormantSlide = FormantSlide(), a2 = None, hold=None, fade = None):
    """Filter a glottal pulse train through an all-pole filter.

    With only `a`, the source from glottal_pulse is filtered by 1/A(z). With
    `a2` as well, the output uses `a` for the first `hold` seconds, blends the
    coefficients from `a` to `a2` along a sigmoid over the next `fade`
    seconds (one coefficient set per sample), then uses `a2` for the rest.

    Parameters
    ----------
    a : denominator coefficients of the (first) filter.
    f0, T, noise, slide : forwarded to glottal_pulse.
    a2 : optional denominator coefficients of the second filter; must be
        combinable element-wise with `a`.
    hold, fade : durations in seconds; required when `a2` is given.

    Returns
    -------
    1-D array with the same length as the source signal.

    Raises
    ------
    TypeError : `a2` is given without `hold` or `fade`.
    ValueError : `hold`/`fade` are negative or extend past the signal.
    """
    # TODO: simple glottal pulse or smooth?
    src = glottal_pulse(f0, T, noise, slide)

    # just filter with one filter
    if a2 is None:
        return scipy.signal.lfilter([1], a, src)

    # otherwise, transition between two filters
    if hold is None or fade is None:
        raise TypeError("glottal_filter: hold and fade are required when a2 is given")

    a = np.asarray(a, dtype=float)
    a2 = np.asarray(a2, dtype=float)
    hold = int(fs * hold)
    fade = int(fs * fade)
    if hold < 0 or fade < 0 or hold + fade > len(src):
        raise ValueError("glottal_filter: hold + fade must lie within the signal duration")

    def sigmoid(x):
        return 1/(1 + np.exp(-x))

    # Sigmoid crossfade (smooth)
    x = np.linspace(-6, 6, fade)
    w2 = sigmoid(x)
    w1 = 1 - w2

    order = len(a) - 1
    y2 = np.zeros_like(src)

    def recent_outputs(k):
        # lfiltic wants earlier outputs most recent first: y[k-1], y[k-2], ...
        # Slice forward and then reverse; a negative-step slice written as
        # y2[k - order:k:-1] is always empty. lfiltic zero-pads short history.
        return y2[max(0, k - order):k][::-1]

    if hold > 0:
        y2[:hold] = scipy.signal.lfilter([1], a, src[:hold])

    for i in range(fade):
        k = hold + i
        # slowly transition the filter
        aa = w1[i] * a + w2[i] * a2
        # Rebuild the state from actual past outputs so each new coefficient
        # set continues the signal produced so far.
        zi = scipy.signal.lfiltic([1], aa, recent_outputs(k))
        res, _ = scipy.signal.lfilter([1], aa, src[k:k + 1], zi=zi)
        y2[k] = res[0]

    k = hold + fade
    if k < len(src):
        # State for the final filter is derived with a2 itself, not carried
        # over from the last blended coefficient set.
        zi = scipy.signal.lfiltic([1], a2, recent_outputs(k))
        res, _ = scipy.signal.lfilter([1], a2, src[k:], zi=zi)
        y2[k:] = res

    return y2



# breathiness + voiceless part of consonants
def noise_filter(a, T, magnitude):
    """Shape Gaussian white noise with the all-pole filter 1/A(z).

    Draws int(T * fs) standard-normal samples from numpy's global RNG, scales
    them by `magnitude`, and filters them with numerator [1] and denominator
    `a`. Returns the filtered signal.
    """
    n_samples = int(T * fs)
    excitation = magnitude * np.random.normal(0, 1, n_samples)
    return scipy.signal.lfilter([1], a, excitation)

# Compute LPC coefficients for vowels and consonants

In [160]:
order = 100
# TODO: adjust order based on frequency
f0 = 174.61

vowels = ['a', 'e', 'i', 'o', 'u']
consonants = ['m', 'n', 'r', 'w', 'j', 'z']


def _lpc_from_sample(name, lpc_order):
    """Read ``<name>.wav`` and return the LPC coefficients of its first channel."""
    y = read_audio(name)
    # Multi-channel files come back as a 2-D array; keep the first channel.
    # A mono file is already 1-D, where y[:, 0] would raise IndexError.
    if y.ndim > 1:
        y = y[:, 0]
    return librosa.lpc(y, order=lpc_order)


# Maps a sound name to its LPC coefficient vector. Each vowel also gets a
# second entry, "<vowel>_air", computed from the matching "_air" recording.
coeffs = {}
for v in vowels:
    coeffs[v] = _lpc_from_sample(f"samples/vowel/{v}", order)
    v_air = v + "_air"
    coeffs[v_air] = _lpc_from_sample(f"samples/vowel/{v_air}", order)
for c in consonants:
    coeffs[c] = _lpc_from_sample(f"samples/consonant/{c}", order)

# NOTE: the lpc_coeffs/ directory must already exist.
with open('lpc_coeffs/vowelsf3', 'wb') as file:
    pickle.dump(coeffs, file)


In [None]:
# tester code

# yy = np.array([])

# for v in ['a', 'e', 'i', 'o', 'u']:
#     lpc = coeffs[v]
#     v_air = v + "_air"
#     lpc_air = coeffs[v_air]

#     y2 = glottal_filter(lpc, f0, T=1.0, noise=0.008, slide=FormantSlide(formant_thresh=1000, low_slide_start=0.8, high_slide_start=0.8))
#     y2 = 0.8 * y2/max(abs(y2))

#     yy = np.concatenate((yy, y2))
#     yy = np.concatenate((yy, np.zeros(11000)))

# play_audio(yy)
# write_audio(yy, 'output/lpc_vowels_formant_shift')

In [10]:
# k = glottal_pulse(174.61, 2.0, noise=0.02, slide=FormantSlide())
# play_audio(k)
# graph_spectrum(k, 300)

In [None]:
# tester code

# yy = np.array([])

# for v in ['m', 'n', 'r', 'w', 'j', 'z']:
#     lpc = coeffs[v]


#     y2 = glottal_filter(lpc, f0, T=1.0, noise=0.008, slide=FormantSlide(formant_thresh=1000, low_slide_start=0.8, high_slide_start=0.8))
#     y2 = 0.8 * y2/max(abs(y2))

#     yy = np.concatenate((yy, y2))
#     yy = np.concatenate((yy, np.zeros(11000)))

# play_audio(yy)
# write_audio(yy, 'output/lpc_vowels_formant_shift') 