In [2]:
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import sounddevice as sd
import scipy
import pickle
import librosa
%matplotlib qt5

# SYLLABLE_GEN
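This notebook generates the audio for a single syllable: an optional consonant followed by an optional vowel. Vowels and voiced consonants are synthesized with `gen_vowel` (loaded from `vowel_gen.ipynb`), the remaining consonants are read from recorded samples, and the pieces are joined with crossfades.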

Main components:
- `cross_fade()`: A function to smoothly crossfade between two signals, similar to one you would see in a DAW. Uses sigmoid crossfade for smoothness
- `apply_smooth_filter()`: Given a signal, apply a lowpass filter to the beginning, and then slowly blend the signal back to original. Essentially a "fading lowpass"
- `Consonant_profile`: A class representing the phonetic properties of a consonant. This includes the consonant's influence on formant transitions, whether it has a vocal bar, etc.
- `gen_syllable()`: Generate a syllable, given that syllable's consonant, vowel, pitch, and some other adjustable factors. Performs a lot of casework on what consonant/vowel are used.

In [14]:
# import vowel gen
# Executes the vowel-generation notebook inside this kernel so that its definitions become available here.
# NOTE(review): gen_vowel, FormantSlide, read_audio and play_audio are used below but not defined in this
# notebook; presumably they are provided by vowel_gen.ipynb -- confirm.
%run vowel_gen.ipynb
# Sample rate in samples per second. Every function below multiplies a duration in seconds by `fs`
# to obtain a sample count.
fs = 44100

In [4]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise to scalars or NumPy arrays."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

# Cross-fade concatenation
def cross_fade(arr1, arr2, T):
    """Join two signals, blending the end of `arr1` into the start of `arr2`.

    Args:
        arr1: Signal that comes first.
        arr2: Signal that comes second.
        T: Length of the overlap region in seconds (converted to samples with `fs`).

    Returns:
        A new array of length len(arr1) + len(arr2) - overlap.

    Raises:
        ValueError: If the overlap is negative or longer than either input.
    """
    overlap = int(T * fs)

    # No overlap requested: plain concatenation.
    if overlap == 0:
        return np.concatenate((arr1, arr2))

    shortest = min(len(arr1), len(arr2))
    if overlap < 0 or overlap > shortest:
        raise ValueError(f"invalid overlap {overlap}, {len(arr1)}, {len(arr2)}")

    # Sigmoid-shaped gains over [-6, 6]: the incoming signal rises while the
    # outgoing one falls, and the two gains always sum to 1.
    gain_in = sigmoid(np.linspace(-6, 6, overlap))
    gain_out = 1 - gain_in

    blended = arr1[-overlap:] * gain_out + arr2[0:overlap] * gain_in
    return np.concatenate((arr1[:-overlap], blended, arr2[overlap:]))

def fade_in(arr, T):
    """Fade `arr` in from silence over `T` seconds.

    Implemented as a crossfade from a block of zeros that is one sample longer
    than the fade, so the result is one sample longer than `arr` (a single
    leading zero is prepended).
    """
    silence = np.zeros(int(T * fs) + 1)
    return cross_fade(silence, arr, T)

def fade_out(arr, T):
    """Fade `arr` out to silence over its final `T` seconds.

    Implemented as a crossfade into a block of zeros that is one sample longer
    than the fade, so the result is one sample longer than `arr` (a single
    trailing zero is appended).
    """
    silence = np.zeros(int(T * fs) + 1)
    return cross_fade(arr, silence, T)

In [None]:
# make a lowpass filter
def lowpass_filter(cutoff, order=2):
    """Design a digital Butterworth low-pass filter.

    Args:
        cutoff: Cutoff frequency, in the same units as `fs` (i.e. Hz).
        order: Filter order.

    Returns:
        Tuple (b, a) of numerator and denominator coefficients.
    """
    half_rate = 0.5 * fs  # Nyquist frequency
    numerator, denominator = scipy.signal.butter(
        order, cutoff / half_rate, btype='low', analog=False
    )
    return (numerator, denominator)


# Apply a gradually shifting filter to a signal
def apply_smooth_filter(y, T1, T2, lowpass_cutoff):
    """Low-pass the start of a signal, then blend smoothly back to the original.

    The first `T1` seconds of the output are the low-passed signal. Over the
    following `T2` seconds the output crossfades from the low-passed signal to
    the unfiltered one. Everything after that is the input unchanged.

    Args:
        y: Input signal.
        T1: Time (seconds) spent fully under the low-pass filter.
        T2: Time (seconds) taken to transition from low-passed to unfiltered.
        lowpass_cutoff: Cutoff frequency handed to `lowpass_filter`.

    Returns:
        A new array with the same length as `y`; the input is not modified.

    Raises:
        ValueError: If `T1` or `T2` is negative, or if T1 + T2 is longer than `y`.
    """
    n_total = len(y)
    n_lowpassed = int(T1 * fs)
    n_blend = int(T2 * fs)

    # Negative lengths would turn the slices below into "count from the end"
    # slices and silently filter the wrong region, so reject them up front.
    if n_lowpassed < 0 or n_blend < 0:
        raise ValueError("T1 and T2 must be non-negative")
    if n_lowpassed + n_blend > n_total:
        raise ValueError("A + B cannot be greater than the total number of samples")

    # Forward-backward filtering of the whole signal; only the first
    # n_lowpassed + n_blend samples of the result are actually used.
    b, a = lowpass_filter(lowpass_cutoff)
    filtered = scipy.signal.filtfilt(b, a, y)

    out = np.copy(y)

    # Fully filtered region
    out[:n_lowpassed] = filtered[:n_lowpassed]

    # Transition region: both inputs are exactly n_blend samples long and the
    # overlap is also n_blend samples, so cross_fade returns n_blend samples.
    blend = slice(n_lowpassed, n_lowpassed + n_blend)
    out[blend] = cross_fade(filtered[blend], y[blend], T2)

    return out


In [None]:
# Consonant groups by manner of articulation
plosives = ["p", "t", "b", "d", "g", "k"]
fricatives = ["s", "z", "h"]
nasals = ["m", "n"]
approximants = ["w", "r", "j"]

# Every consonant accepted by gen_syllable
consonants = ["k", "g", "s", "z", "t", "d", "n", "h", "b", "p", "m", "r", "w", "j"]

# Consonants that are read from recorded samples instead of being synthesized
sample_based = plosives + ["s", "h"]


class Consonant_profile:
    """Hand-tuned parameters describing how one consonant is rendered.

    Attributes (as stored):
        duration: Consonant length in samples (the constructor takes seconds).
        fade: Crossfade time in seconds between the consonant and what follows.
        bar: None, or a (T1, T2) pair in seconds that gen_syllable passes to
            apply_smooth_filter to low-pass the onset of the following vowel.
        slide: Formant-slide object that gen_syllable forwards to gen_vowel.
        silence: None, or a gap length in samples (the constructor takes seconds)
            that gen_syllable inserts between the consonant and the vowel.
    """

    def __init__(self, duration, fade, bar, slide, silence=None):
        # Durations arrive in seconds; the two used as array lengths are stored as sample counts.
        self.duration = int(fs * duration)
        self.silence = None if silence is None else int(fs * silence)
        self.fade = fade
        self.bar = bar
        self.slide = slide


# Consonant profiles, one per consonant. Parameters were chosen by hand, manually selected for comprehensibility.
# duration / fade / silence are in seconds; bar is None or a (T1, T2) pair in seconds.
consonant_profiles = {
    # Voiceless stops
    "p": Consonant_profile(
        duration=0.06, fade=0.02, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.97, high_slide_start=0.97),
    ),
    "t": Consonant_profile(
        duration=0.08, fade=0.01, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.98, high_slide_start=0.98),
    ),
    "k": Consonant_profile(
        duration=0.095, fade=0.02, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=1.01, high_slide_start=1.01),
    ),
    # Voiced stops
    "g": Consonant_profile(
        duration=0.025, fade=0.01, bar=(0.05, 0.05),
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.75, high_slide_start=1.3),
    ),
    "b": Consonant_profile(
        duration=0.03, fade=0.02, bar=(0.02, 0.05),
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.8, high_slide_start=0.8),
    ),
    "d": Consonant_profile(
        duration=0.012, fade=0.001, bar=(0.04, 0.03),
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.8, high_slide_start=0.9),
    ),
    # Voiceless fricatives
    "s": Consonant_profile(
        duration=0.13, fade=0.03, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.8, high_slide_start=0.8),
        silence=0.03,
    ),
    "h": Consonant_profile(
        duration=0.12, fade=0.04, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.8, high_slide_start=0.8),
        silence=0.04,
    ),
    # Nasals
    "m": Consonant_profile(
        duration=0.08, fade=0.04, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=1.1, high_slide_start=1.1),
    ),
    "n": Consonant_profile(
        duration=0.06, fade=0.03, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.95, high_slide_start=0.95),
    ),
    # Approximants
    "w": Consonant_profile(
        duration=0.01, fade=0.13, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.65, high_slide_start=0.8),
    ),
    "j": Consonant_profile(
        duration=0.01, fade=0.12, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.75, high_slide_start=0.85),
    ),
    "r": Consonant_profile(
        duration=0.01, fade=0.12, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.75, high_slide_start=0.85),
    ),
    # Voiced fricatives
    "z": Consonant_profile(
        duration=0.02, fade=0.09, bar=None,
        slide=FormantSlide(formant_thresh=1000, low_slide_start=0.75, high_slide_start=0.85),
    ),
}

# Generate a syllable!
def gen_syllable(c, v, f0, T, tremolo = 0.2, air = 0.012, next_c = None):
    """Generate the audio for one syllable from an optional consonant and vowel.

    Args:
        c: Consonant key (one of `consonants`), or None for a bare vowel.
        v: Vowel identifier forwarded to `gen_vowel`, or None for a bare consonant.
        f0: Pitch, forwarded to `gen_vowel` (presumably in Hz -- confirm against gen_vowel).
        T: Syllable duration in seconds.
        tremolo: Forwarded to `gen_vowel`.
        air: Forwarded to `gen_vowel`.
        next_c: Consonant of the following syllable, or None. When given, its
            profile's formant slide is passed to `gen_vowel` as `slide2`.

    Returns:
        The syllable as an array of samples.
        - Consonant + vowel: cut to at most int(T * fs) samples.
        - Bare sampled consonant: zero-padded up to int(T * fs) samples, but a
          longer sample is returned at its full length.
        - Bare vowel or bare synthesized consonant: whatever `gen_vowel` returns.

    Raises:
        KeyError: If `next_c` (or a sample-based `c`) has no entry in `consonant_profiles`.
        AssertionError: If a non-sample-based `c` is not in `consonants`.
        ValueError: From `cross_fade` if a segment is shorter than the fade it must overlap.
    """
    N = int(T * fs)

    # determine what formant slide the syllable should end with
    end_slide = consonant_profiles[next_c].slide if next_c is not None else None

    if c is None:
        # Case 1: no consonant
        return gen_vowel(v, f0, T, tremolo=tremolo, slide=None, slide2=end_slide, air = air)

    if v is None:
        # Case 2: no vowel
        if c in sample_based:
            consonant_y = read_audio(f"samples/consonant/{c}")[:,0]
            # NOTE(review): short samples are padded to N, but long samples are not
            # truncated, unlike the consonant+vowel paths which return y[:N] -- confirm intent.
            if len(consonant_y) < N:
                consonant_y = np.concatenate((consonant_y, np.zeros(N - len(consonant_y))))
        else:
            assert c in consonants
            # NOTE(review): `air` is not forwarded here although every other
            # gen_vowel call in this function passes it -- confirm intent.
            consonant_y = gen_vowel(c, f0, T=T, tremolo=tremolo, slide2=end_slide)
        return consonant_y

    # Case 3: both vowel and consonant
    if c in sample_based:
        # sample-based consonant: read the recording and blend it into the vowel
        profile = consonant_profiles[c]

        consonant_y = read_audio(f"samples/consonant/{c}")[0:profile.duration,0]
        vowel_y = gen_vowel(v, f0, T, tremolo=tremolo, slide=profile.slide, air = air, slide2=end_slide)

        # slightly smoothen entrance
        vowel_y = fade_in(vowel_y, 0.005)

        if profile.silence is not None:
            # consonant -> short silence -> vowel, crossfading at both joins
            silence_y = np.zeros(profile.silence)
            y = cross_fade(silence_y, vowel_y, profile.fade)
            y = cross_fade(consonant_y, y, profile.fade)
        elif profile.bar is None:
            y = cross_fade(consonant_y, vowel_y, profile.fade)
        else:
            # Profile has a bar: low-pass the vowel onset, then release it gradually.
            # TODO cutoff
            vowel_y = apply_smooth_filter(vowel_y, profile.bar[0], profile.bar[1], lowpass_cutoff=500)
            y = cross_fade(consonant_y, vowel_y, profile.fade)
        return y[:N]
    else:
        # non-sample based consonant; must synthesize and blend
        assert c in consonants
        profile = consonant_profiles[c]

        # The synthesized consonant lasts its own duration plus the fade it overlaps with the vowel.
        consonant_y = gen_vowel(c, f0, T=profile.duration/fs + profile.fade, tremolo=tremolo, slide=profile.slide, air=air)
        vowel_y = gen_vowel(v, f0, T, tremolo=tremolo, slide=profile.slide, air = air, slide2=end_slide)

        y = cross_fade(consonant_y, vowel_y, profile.fade)

        # for z specifically, add some "sizzle"
        if c == 'z':
            sizzle = read_audio("samples/consonant/s")[0:7000,0] * 0.1
            sizzle = fade_out(sizzle, profile.fade)
            # Mix over the overlapping part only. Padding the sizzle up to len(y)
            # fails with a negative size when a short syllable is shorter than the sizzle.
            n_mix = min(len(sizzle), len(y))
            y[:n_mix] += sizzle[:n_mix]
        return y[:N]


In [None]:
# y = gen_syllable('p', 'a', f0=173, T=1.0, next_c = 'g')
# play_audio(y)