# P1
### Arthur Bright

# Imports

In [67]:
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
import sounddevice as sd
import scipy
import pickle
%matplotlib qt5

In [239]:
# import syllable_gen
%run syllable_gen.ipynb
fs = 44100

# Representation of Sung Notes

In [84]:
# Convert a piano key index to its frequency (equal temperament).
# Keys are counted in semitones upward from A0, e.g. A0 = 0, A#0 = 1, ..., A4 = 48.
def piano(ind: int):
    """Return the frequency of the key `ind` semitones above A0.

    `ind` must be a non-negative int; anything else trips an assertion.
    """
    assert isinstance(ind, int)
    assert ind >= 0
    # A4 = 440 sits four octaves above A0, hence A0 = 440 / 16 = 27.5.
    a0 = 27.5
    octaves_above_a0 = ind / 12
    return a0 * 2 ** octaves_above_a0

# A positive frequency value. To build one from a piano key index, convert the
# index with `piano` first, e.g. Freq(piano(48)).
class Freq:
    def __init__(self, freq):
        """Store `freq` as a float in `self.v`; it must be a positive int or float."""
        assert isinstance(freq, (float, int))
        assert freq > 0
        self.v = float(freq)

    def __str__(self):
        """Render the stored float exactly as str() would."""
        return f"{self.v!s}"


# A single phoneme. Only a subset of the Japanese sound inventory is supported.
class Phoneme:
    # Every supported consonant, followed by the same consonants grouped by
    # manner of articulation.
    consonants =   ["k", "g", "s", "z", "t", "d", "n", "h", "b", "p", "m", "r", "w", "j"]
    plosives =     ["p", "t", "b", "d", "g", "k"]
    fricatives =   ["s", "z", "h"]
    nasals =       ["m", "n"]
    approximants = ["w", "r", "j"]
    # Sanity check at class-creation time: the four groups together must cover
    # exactly the consonant list.
    assert {*plosives, *fricatives, *nasals, *approximants} == set(consonants)

    vowels =       ["a", "o", "u", "i", "e"]

    def __init__(self, s: str):
        """Wrap the symbol `s`, which must be one of the known vowels or consonants."""
        is_known = s in self.vowels or s in self.consonants
        assert is_known
        self.v = s

    def __str__(self):
        """Return the phoneme symbol."""
        return str(self.v)


# A class to represent a syllable: consonant + vowel, a lone vowel, a lone
# consonant, or nothing at all (a silent syllable, used for rests).
class Syllable:
    def __init__(self, vowel: "Phoneme | None" = None, cons: "Phoneme | None" = None):
        """Build a syllable from an optional vowel and an optional consonant.

        Each argument, when given, must be a Phoneme of the matching kind;
        otherwise an assertion fails. Leaving both as None gives a silent syllable.
        """
        if vowel is not None:
            assert isinstance(vowel, Phoneme)
            assert vowel.v in Phoneme.vowels
        if cons is not None:
            assert isinstance(cons, Phoneme)
            assert cons.v in Phoneme.consonants
        self.vowel = vowel
        self.cons = cons

    def __str__(self):
        """Return the consonant symbol followed by the vowel symbol.

        A silent syllable keeps its historical rendering, the string "None".
        """
        if self.cons is None:
            # Lone vowel, or "None" when the syllable is silent.
            return str(self.vowel)
        if self.vowel is None:
            # Lone consonant: do not append the text "None" for the missing vowel.
            return str(self.cons)
        return str(self.cons) + str(self.vowel)

    @staticmethod
    def of_str(s: str):
        """Parse a one- or two-character string into a Syllable.

        One character is a lone vowel or a lone consonant; two characters are
        read as consonant then vowel.

        Raises:
            ValueError: if `s` is not one or two characters long.
        """
        if len(s) == 1:
            if s in Phoneme.vowels:
                return Syllable(vowel=Phoneme(s))
            return Syllable(cons=Phoneme(s))
        if len(s) == 2:
            return Syllable(vowel=Phoneme(s[1]), cons=Phoneme(s[0]))
        # This must be raised, not returned: returning the exception object
        # would hand callers a ValueError instance in place of a Syllable.
        raise ValueError(f"Illegal representation of syllable: {str(s)}")

    def is_silent(self):
        """Return True when the syllable has neither a vowel nor a consonant."""
        return self.vowel is None and self.cons is None

# A sung note: a (fundamental) frequency, the syllable sung on it, and a duration.
class Note:
    def __init__(self, f : Freq, s: Syllable, duration: float):
        """Store the three parts after checking their types.

        `f` must be a Freq, `s` a Syllable and `duration` a float (an int is
        rejected); a wrong type trips an assertion.
        """
        assert isinstance(f, Freq)
        assert isinstance(s, Syllable)
        assert isinstance(duration, float)
        self.f, self.s, self.duration = f, s, duration

    def __str__(self):
        """Render the note as "[syllable|frequency|duration]"."""
        parts = (self.s, self.f, self.duration)
        return "[" + "|".join(str(part) for part in parts) + "]"

    def is_silent(self):
        """Return True when the note's syllable is silent, i.e. the note is a rest."""
        return self.s.is_silent()

    

In [70]:
# TESTER CODE
# Smoke test for the note representation: build one object of each kind, then
# print them in the order frequency, phoneme, syllable, note.
_f1 = Freq(piano(48))
_p1 = Phoneme('a')
_p2 = Phoneme('b')
_s1 = Syllable(vowel=_p1, cons=_p2)
_n1 = Note(_f1, _s1, 1.0)

print(_f1, _p1, _s1, _n1, sep="\n")


440.0
a
ba
[ba|440.0|1.0]


# Interface to Convert Sung Notes to Signal

#### Converting to/from files, and sonification/visualization

In [71]:
# helpers to play/write/read audio
fs=44100

def play_audio(y):
    """Play the signal `y` through sounddevice at the module-level sample rate `fs`."""
    sd.play(y, samplerate=fs)

def write_audio(y, filename):
    """Write the signal `y` to "<filename>.wav" at the module-level sample rate `fs`."""
    path = filename + '.wav'
    sf.write(path, y, fs)

def read_audio(filename):
    """Read "<filename>.wav" and return its sample data.

    The sample rate stored in the file is read but discarded.
    """
    samples, _file_rate = sf.read(filename + '.wav')
    return samples

def graph_signal(y, start=None, end=None):
    """Plot the signal `y` against time in a new figure.

    The time axis is the sample index divided by the module-level `fs`;
    `start` and `end` are passed to plt.xlim as the x-axis limits.
    """
    seconds = np.arange(len(y)) / fs
    plt.figure()
    plt.plot(seconds, y)
    plt.xlim(start, end)

def graph_spectrum(y, distance, title=""):
    """Plot the magnitude spectrum of `y` in a new figure and mark its peaks.

    Args:
        y: real-valued signal.
        distance: minimum spacing between detected peaks, in spectrum bins
            (passed to scipy.signal.find_peaks).
        title: figure title.

    The x axis is the rfft bin index. When at least two peaks are found, red
    markers are also drawn at the first 20 multiples of the second peak's bin.
    """
    # Import the submodules explicitly: a bare `import scipy` does not expose
    # scipy.fft / scipy.signal on every SciPy version.
    import scipy.fft
    import scipy.signal

    magnitude = np.abs(scipy.fft.rfft(y))
    peaks, _ = scipy.signal.find_peaks(magnitude, distance=distance)

    plt.figure()
    plt.title(title)
    plt.plot(magnitude)
    plt.plot(peaks, magnitude[peaks], "x")

    # also graph multiples of the fundamental frequency
    # NOTE(review): the second detected peak is used as the fundamental --
    # confirm this is intended rather than peaks[0].
    if len(peaks) < 2:
        # Too few peaks to take peaks[1]; skip the overlay instead of raising IndexError.
        return
    space = peaks[1]
    a = (np.arange(20) + 1) * space
    plt.plot(a, np.zeros_like(a), '2', color='red')


#### Putting it together

In [204]:
# Determine the transition between two consecutive syllables:
#   - smooth crossfade if the second syllable starts with a nasal (m, n),
#     an approximant (w, j, r), or z, or if it is a lone vowel
#   - short pause ("block off") otherwise
# All values below are in seconds.
FADE_DUR = 0.07
FADE_VOWEL_DUR = 0.13
BLOCK_OFF = 0.01
def fade_mode(s1, s2):
    """Choose how to join syllable `s1` to the following syllable `s2`.

    Returns a tuple whose first item names the mode and whose second item is
    the padding (seconds) that generate_notes adds to the duration of `s1`'s note:
        ('concat', 0)                 s1 is silent: plain concatenation
        ('fade', 0, FADE_DUR)         s2 is silent: fade s1 out over FADE_DUR
        ('crossfade', duration)       s2 is a lone vowel (FADE_VOWEL_DUR) or
                                      starts with m, n, j, r, w or z (FADE_DUR)
        ('blockoff', -BLOCK_OFF, FADE_DUR)
                                      any other consonant: shorten s1 by
                                      BLOCK_OFF and fade it out over FADE_DUR
    """
    if s1.is_silent():
        # if previous note was silence, just concatenate
        return ('concat', 0)
    if s2.is_silent():
        # if current note is silence and previous was not, fade out previous note
        return ('fade', 0, FADE_DUR)

    # Only the onset of the second syllable decides between crossfade and block-off.
    consonant2 = None if s2.cons is None else s2.cons.v
    if consonant2 is None:
        return ('crossfade', FADE_VOWEL_DUR)
    if consonant2 in ('m', 'n', 'j', 'r', 'w', 'z'):
        return ('crossfade', FADE_DUR)
    return ('blockoff', -BLOCK_OFF, FADE_DUR)



# Render a list of notes into one audio signal.
# Each note is synthesized on its own and then joined to the signal so far
# using the transition chosen by fade_mode for the (previous, current) pair.
# gen_syllable, cross_fade and fade_out are not defined in this section
# (presumably they come from syllable_gen.ipynb -- verify).
def generate_notes(notes: list):
    """Return the concatenated signal for `notes` (an empty array for an empty list).

    Rests become runs of zeros of int(duration * fs) samples. A voiced note is
    synthesized with its duration extended by the padding that fade_mode
    reports for the transition into the next note.
    """
    yy = np.array([])
    for i, note in enumerate(notes):
        if note.is_silent():
            N = int(note.duration * fs)
            y = np.zeros(N)
        else:
            vowel = None if note.s.vowel is None else note.s.vowel.v
            consonant = None if note.s.cons is None else note.s.cons.v

            # Padding only applies when there is a following note. Reset it on
            # every iteration so the last note neither reuses the previous
            # note's padding nor hits an unbound name (e.g. a single-note list).
            fade_pad = 0
            if i < len(notes) - 1:
                fade_pad = fade_mode(note.s, notes[i + 1].s)[1]
            y = gen_syllable(consonant, vowel, f0=note.f.v, T=note.duration + fade_pad)

        if len(yy) == 0:
            # Nothing rendered yet: the first segment starts the signal as-is.
            yy = y
        else:
            fm = fade_mode(notes[i - 1].s, notes[i].s)
            if fm[0] == 'concat':
                yy = np.concatenate((yy, y))
            elif fm[0] == 'fade':
                yy = fade_out(yy, fm[2])
                yy = np.concatenate((yy, y))
            elif fm[0] == 'crossfade':
                yy = cross_fade(yy, y, fm[1])
            elif fm[0] == 'blockoff':
                yy = fade_out(yy, fm[2])
                # fm[1] is negative here; insert that much silence before the new note.
                yy = np.concatenate((yy, np.zeros(int(0 - fm[1] * fs))))
                yy = np.concatenate((yy, y))
            else:
                raise ValueError("invalid fade mode option")
    return yy

## Custom Input Format
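A compact plain-text format for entering a song: a beat length, then one line each of pitches, syllables, and note lengths (parsed by `convert_from_file` below).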

In [107]:
# Map piano notes to their index (eg A0 = 0)
def piano_note_to_index(note: str):
    """Convert a note name such as "C#4" into its semitone index above A0.

    The name is a pitch (C, C#, D, ..., B; sharps only) followed by a
    non-negative octave number of any length. Octaves change at C, so notes
    below A0 (e.g. "C0") produce a negative index.

    Raises:
        ValueError: if the pitch is unknown or the octave is missing.
    """
    note_map = {'C': 0, 'C#': 1, 'D': 2, 'D#': 3, 'E': 4, 'F': 5, 'F#': 6, 'G': 7, 'G#': 8, 'A': 9, 'A#': 10, 'B': 11}

    # Peel the trailing decimal digits (the octave) off the pitch name, so an
    # octave with more than one digit (e.g. "A10") is accepted too.
    split = len(note)
    while split > 0 and note[split - 1].isdecimal():
        split -= 1
    pitch, octave_text = note[:split], note[split:]

    if pitch not in note_map or not octave_text:
        raise ValueError("Invalid note format")

    # The -9 shifts the origin from C0 to A0 (A is 9 semitones above C).
    return int(octave_text) * 12 + note_map[pitch] - 9

In [192]:
# convert from a file in the following format:
# 0.5                (beat length)
# E4 D4 C4 D4 E4     (pitches)
# he lo wo ru do     (syllables)
# +- + +-----+  +   (note lengths; '+' starts a note, each '-' extends it by
#                     one beat, each space is a one-beat rest)

def convert_from_file(filename):
    """Parse a song file in the format above into a list of Note objects.

    Only the first four lines are read, and only the first token of line 1 is
    used as the beat length. Rests become silent notes carrying a placeholder
    frequency.

    Raises:
        ValueError: if a '-' in the note-length line does not extend a note.
        AssertionError: if the number of '+' marks differs from the number of
            pitches or syllables.
    """
    with open(filename, "r") as file:
        lines = file.readlines()

    beat_length = float(lines[0].split()[0])
    pitches_arr = [piano_note_to_index(p) for p in lines[1].split()]
    syllables_arr = [Syllable.of_str(s) for s in lines[2].split()]

    # Lengths in beats: positive for notes, -1 for each one-beat rest.
    # Any other character (e.g. the trailing newline) is ignored.
    durations_arr = []
    for c in lines[3]:
        if c == '+':
            durations_arr.append(1)
        elif c == '-':
            # A '-' may only lengthen a note. Applied to a rest it would turn
            # the -1 marker into 0, which is then mistaken for a note and
            # shifts every later pitch/syllable; with nothing before it there
            # is no entry to lengthen at all.
            if not durations_arr or durations_arr[-1] < 0:
                raise ValueError("'-' in the note-length line must follow '+' or '-'")
            durations_arr[-1] += 1
        elif c == ' ':
            durations_arr.append(-1)  # -1 indicates a rest

    # check that specified number of notes is consistent
    _notes = sum(d > 0 for d in durations_arr)
    assert(_notes == len(pitches_arr))
    assert(_notes == len(syllables_arr))

    durations_arr = np.array(durations_arr) * beat_length

    res = []
    ind = 0  # index of the next unused pitch/syllable
    silent_syllable = Syllable(None, None)
    for d in durations_arr:
        if d < 0:
            # Rest: Freq must be positive, so 0.1 is only a placeholder.
            res.append(Note(Freq(0.1), silent_syllable, 0 - d))
        else:
            freq = Freq(piano(pitches_arr[ind]))
            res.append(Note(freq, syllables_arr[ind], d))
            ind += 1

    return res


# TEST CODE
Code to actually run the above functions and generate output

In [263]:
# Render input/<filename>.txt, play it, and save it to output/<filename>.wav.
filename = "senbonzakura"
notes = convert_from_file(f"input/{filename}.txt")
y = generate_notes(notes)
# Normalize by the peak absolute amplitude so the signal fits in [-1, 1].
# Dividing by max(y) alone ignores negative peaks (which may be larger) and
# divides by zero when the signal is all zeros.
peak = np.max(np.abs(y)) if len(y) else 0
if peak > 0:
    y = y / peak
play_audio(y)
write_audio(y, f"output/{filename}")

In [None]:
# DEBUGGING code
# Plays a short run of lone-vowel 'a' notes (keys 48, 46, 44, then 46, 48)
# with a rest in the middle, to listen to the note-to-note transitions.
# NOTE(review): `Silence` is not defined anywhere in the visible notebook.
# Unless syllable_gen.ipynb provides it, this cell raises NameError; a rest is
# built elsewhere as Note(Freq(0.1), Syllable(None, None), duration) -- confirm.
_dur = 0.2
notes = [
    Note(Freq(piano(48)), Syllable.of_str('a'), _dur),
    Note(Freq(piano(46)), Syllable.of_str('a'), _dur),
    Note(Freq(piano(44)), Syllable.of_str('a'), _dur),
    Silence(_dur),
    Note(Freq(piano(46)), Syllable.of_str('a'), _dur),
    Note(Freq(piano(48)), Syllable.of_str('a'), _dur)
]

y = generate_notes(notes)
play_audio(y)



In [None]:
# make pink noise (for a crude /k/ sound)
def generate_pink_noise(size):
    """Return `size` noise samples built by averaging 16 held random sources.

    Every source holds a value (initially 0). At each step one randomly chosen
    source is refreshed with a new normally distributed value, and the output
    sample is the mean of all held values.
    """
    n_sources = 16  # Number of random sources to mix
    fresh = np.random.randn(n_sources, size)

    held = np.zeros(n_sources)  # value currently held by each source
    noise = np.zeros(size)

    for step in range(size):
        source = np.random.randint(n_sources)
        held[source] = fresh[source, step]
        noise[step] = held.sum() / n_sources

    return noise

In [None]:
# DEBUGGING code
# Hand-assembles an "s" + vowel + click sound (the commented-out output name
# below is 'output/sak') from a recorded consonant sample, a synthesized
# vowel, and a burst of noise.
# NOTE(review): `Silence` is not defined anywhere in the visible notebook;
# unless syllable_gen.ipynb provides it, this cell raises NameError -- confirm.
_dur = 0.3
_dur2 = 0.2
notes = [
    Note(Freq(piano(32)), Syllable.of_str('a'), _dur),
    Silence(0.05),
    # Note(Freq(piano(32)), Syllable.of_str('i'), _dur),
    # Silence(0.05),
    # Note(Freq(piano(32)), Syllable.of_str('a'), _dur),
    # Silence(0.05),
    # Note(Freq(piano(32)), Syllable.of_str('i'), _dur),
    # Silence(0.05),
    # Note(Freq(piano(32)), Syllable.of_str('a'), _dur),

]

# Synthesize the vowel at half amplitude.
y = generate_notes(notes) * 0.5
# First 7000 frames of channel 0 of the recorded sample.
ss = read_audio("samples/consonant/s")[0:7000,0]
_sil = np.zeros(3000)

# NOTE(review): cross_fade receives 3000 here, while generate_notes passes it a
# duration in seconds (e.g. FADE_DUR = 0.07) -- confirm which unit it expects.
y = cross_fade(_sil, y, 3000)
y = cross_fade(ss, y, 3000)

# add a click (short pink noise)
k = generate_pink_noise(2000)
# k = scipy.fft.irfft(k, 3000)

# Append 1000 samples of silence, then the click.
y = np.concatenate((y, np.zeros(1000), k))

play_audio(y)
# write_audio(y, 'output/sak')
