# Pitch estimation with speech mnist

In [0]:
# download speech mnist
!wget --no-check-certificate -qq -r 'https://docs.google.com/uc?export=download&id=1vD12AFfbA8vrTHeXb1IKk4wKbZwWDARg' -O recordings.zip
!mkdir speech_mnist
!unzip -qq /content/recordings.zip -d ./speech_mnist 

In [0]:
import glob
import librosa
import IPython.display as ipd
import librosa.display

# glob returns paths in arbitrary (filesystem-dependent) order, so sort the
# list to make `speech_mnist[0]` refer to the same recording on every run.
speech_mnist = sorted(glob.glob('./speech_mnist/*.wav'))
assert speech_mnist, 'no .wav files found in ./speech_mnist - run the download cell first'

# sr=None keeps the file's native sampling rate instead of resampling.
X, fs = librosa.load(speech_mnist[0], sr=None)

print('file name : ' + speech_mnist[0])
print('sampling rate : ' + str(fs))
ipd.Audio(X, rate=fs, autoplay=False)

# with PYSPTK

In [0]:
# using pysptk
!pip install pysptk

In [0]:
import pysptk
import numpy as np 
import librosa

# Analysis settings: F0 search bounds handed to the pitch tracker as min/max,
# plus frame and hop sizes (only the hop size is used in this cell).
setting = dict(f0_min=60.0, f0_max=500.0, frame_length=512, hop_length=128)

# Load at the file's native sampling rate, then scale the samples by 32768
# (2**15) and convert them to float64 before running SWIPE.
audio, sr = librosa.load(speech_mnist[0], sr=None)
audio_int = (audio * 32768.).astype('float64')

pitch = pysptk.swipe(
    audio_int,
    fs=sr,
    hopsize=setting['hop_length'],
    min=setting['f0_min'],
    max=setting['f0_max'],
    otype="f0",
)

In [0]:
# Shape of the F0 array returned by pysptk.swipe.
pitch.shape

In [0]:
import matplotlib.pyplot as plt 
plt.figure(figsize=(10,10))

# 1) waveform
plt.subplot(3,1,1)
plt.plot(audio)
plt.xlim([0,len(audio)])
plt.title('waveform')

# 2) F0 track estimated by pysptk.swipe
plt.subplot(3,1,2)
plt.plot(pitch)
plt.xlim([0,np.shape(pitch)[0]])
plt.title('pitch (pysptk.swipe)')

# 3) log-mel spectrogram. The rows are mel bands, so the y axis must be drawn
# with y_axis='mel' (a linear axis would mislabel the frequencies), and the
# axis is scaled with the audio's actual sampling rate instead of a literal 8000.
plt.subplot(3,1,3)
spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=512, hop_length=128)
librosa.display.specshow(np.log(spectrogram + 1e-3), sr=sr, cmap='viridis', x_axis=None, y_axis='mel')
plt.xlim([0,np.shape(spectrogram)[1]])
plt.ylim([0,2000])
plt.title('log-mel spectrogram')
plt.tight_layout()

# with CREPE

In [0]:
# using crepe

# https://arxiv.org/abs/1802.06182


!pip install crepe

In [0]:
import crepe
from scipy.io import wavfile

# Reload the first recording at its native sampling rate and run CREPE with
# Viterbi decoding enabled; the call returns four values that are unpacked below.
audio, sr = librosa.load(speech_mnist[0], sr=None)
crepe_output = crepe.predict(audio, sr, viterbi=True)
time, frequency, confidence, activation = crepe_output

In [0]:
import matplotlib.pyplot as plt 
plt.figure(figsize=(10,10))

# 1) waveform
plt.subplot(5,1,1)
plt.plot(audio)
plt.xlim([0,len(audio)])
plt.title('waveform')

# 2) CREPE pitch estimate
plt.subplot(5,1,2)
plt.plot(frequency)
plt.xlim([0,np.shape(frequency)[0]])
plt.title('frequency (crepe)')

# 3) confidence returned alongside the pitch estimate
plt.subplot(5,1,3)
plt.plot(confidence)
plt.xlim([0,np.shape(confidence)[0]])
plt.title('confidence')

# 4) activation. This is a 2-D array (frames x pitch bins per the CREPE docs);
# plt.plot would draw one curve per column, so show it as an image with time
# on the x axis instead.
plt.subplot(5,1,4)
plt.imshow(activation.T, origin='lower', aspect='auto')
plt.xlim([0,np.shape(activation)[0]])
plt.title('activation')

# 5) log-mel spectrogram, drawn on a mel axis scaled with the audio's actual
# sampling rate (a linear axis with a literal sr=8000 mislabels the bands).
plt.subplot(5,1,5)
spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=512, hop_length=128)
librosa.display.specshow(np.log(spectrogram + 1e-3), sr=sr, cmap='viridis', x_axis=None, y_axis='mel')
plt.xlim([0,np.shape(spectrogram)[1]])
plt.ylim([0,2000])
plt.title('log-mel spectrogram')
plt.tight_layout()



# with Pyworld

In [0]:
# f0 extract with pyworld vocoder

# https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder
# 
!pip install pyworld

In [0]:
import pyworld as pw


# Reload the first recording at its native rate; `x` is the float64 copy
# passed to every pyworld call below.
audio, sr = librosa.load(speech_mnist[0], sr=None)
fs = sr
x = audio.astype(np.float64)

# Analysis: raw F0 -> refined F0 -> smoothed spectrogram -> aperiodicity.
_f0, t = pw.dio(x, fs)
f0 = pw.stonemask(x, _f0, t, fs)
sp = pw.cheaptrick(x, f0, t, fs)
ap = pw.d4c(x, f0, t, fs)

# Synthesis: rebuild a waveform from the three extracted parameter sets.
y = pw.synthesize(f0, sp, ap, fs)

In [0]:
# Shape of the raw F0 array returned by pw.dio.
_f0.shape

In [0]:
import matplotlib.pyplot as plt 
plt.figure(figsize=(10,10))

# 1) waveform
plt.subplot(6,1,1)
plt.plot(audio)
plt.xlim([0,len(audio)])
plt.title('waveform')

# 2) raw F0 from pw.dio
plt.subplot(6,1,2)
plt.plot(_f0)
plt.xlim([0,np.shape(_f0)[0]])
plt.title('raw f0 (dio)')

# 3) refined F0 from pw.stonemask
plt.subplot(6,1,3)
plt.plot(f0)
plt.xlim([0,np.shape(f0)[0]])
plt.title('refined f0 (stonemask)')

# 4) / 5) WORLD spectral envelope and aperiodicity, transposed so time runs
# along x. The axis is scaled with the sampling rate used for the analysis
# (fs) instead of a literal 8000.
plt.subplot(6,1,4)
librosa.display.specshow(np.log(sp.T + 1e-3), sr=fs, cmap='viridis', x_axis=None, y_axis='linear')
plt.xlim([0,np.shape(sp)[0]])
plt.title('spectral envelope (cheaptrick)')

plt.subplot(6,1,5)
librosa.display.specshow(np.log(ap.T + 1e-3), sr=fs, cmap='viridis', x_axis=None, y_axis='linear')
plt.xlim([0,np.shape(ap)[0]])
plt.title('aperiodicity (d4c)')

# 6) log-mel spectrogram: rows are mel bands, so use a mel y axis.
plt.subplot(6,1,6)
spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=512, hop_length=128)
librosa.display.specshow(np.log(spectrogram + 1e-3), sr=sr, cmap='viridis', x_axis=None, y_axis='mel')
plt.xlim([0,np.shape(spectrogram)[1]])
plt.ylim([0,2000])
plt.title('log-mel spectrogram')
plt.tight_layout()

In [0]:
# Reconstructing the audio with different pitches.
# First, the original recording for reference.
ipd.Audio(audio, rate=sr)


In [0]:
# enhanced audio: waveform re-synthesized from the unmodified f0 / sp / ap
y = pw.synthesize(f0, sp, ap, fs)
ipd.Audio(y, rate=fs)

In [0]:
# pitch shift: synthesize again with every F0 value doubled,
# keeping the spectral envelope and aperiodicity unchanged
y = pw.synthesize(2 * f0, sp, ap, fs)
ipd.Audio(y, rate=fs)

In [0]:
# with another audio (singing)

# Download the clip behind the docs.google.com link and save it as singing.wav
# in the current working directory (-qq silences wget's output).
!wget --no-check-certificate -qq -r 'https://docs.google.com/uc?export=download&id=1JsNKqb2Q1A6dMZ1XXFZc_vEhgAlnUOcq' -O singing.wav

In [0]:
# Same WORLD analysis/synthesis chain as before, now on the singing clip.
audio, sr = librosa.load('./singing.wav', sr=None)
fs = sr
x = audio.astype(np.float64)

# Analysis: raw F0 -> refined F0 -> smoothed spectrogram -> aperiodicity.
_f0, t = pw.dio(x, fs)
f0 = pw.stonemask(x, _f0, t, fs)
sp = pw.cheaptrick(x, f0, t, fs)
ap = pw.d4c(x, f0, t, fs)

# Synthesis from the unmodified parameters.
y = pw.synthesize(f0, sp, ap, fs)

In [0]:
# Original singing clip, for comparison with the re-synthesized versions.
ipd.Audio(audio, rate=sr)

In [0]:
# Re-synthesis of the singing clip from its unmodified f0 / sp / ap.
y = pw.synthesize(f0, sp, ap, fs)
ipd.Audio(y, rate=fs)

In [0]:
# Scale every non-zero F0 value by 0.7 on a copy (zero entries, presumably
# unvoiced frames, are left as they are), then synthesize with the lowered
# contour and the original sp / ap.
f0_copy = f0.copy()
nonzero_mask = f0_copy != 0
f0_copy[nonzero_mask] = f0_copy[nonzero_mask] * 0.7

y = pw.synthesize(f0_copy, sp, ap, fs)
ipd.Audio(y, rate=fs)

# External example : MIREX


https://www.music-ir.org/mirex/wiki/2019:Main_Page

* Cover song identification
* Audio Fingerprinting
* Patterns for Prediction
* Multiple Fundamental Frequency Estimation & Tracking
* Chord Estimation
* Onset / Key / Tempo Estimation
* Automatic Lyrics-to-Audio Alignment
* Audio Melody Extraction
* Query by Singing/Humming
...



# External example : Cover Song Identification


In [0]:
# Download the three tracks used in the cover-song example and save them as
# A_1.mp3, A_2.mp3 and B_1.mp3 in the current working directory.
!wget --no-check-certificate -qq -r 'https://docs.google.com/uc?export=download&id=1U-SQSgL26TsFMCSDkXpue0WpH_GT0Dfg' -O A_1.mp3

!wget --no-check-certificate -qq -r 'https://docs.google.com/uc?export=download&id=1108dc7EeKwRBh302YoY20bmDqffWQzr7' -O A_2.mp3

!wget --no-check-certificate -qq -r 'https://docs.google.com/uc?export=download&id=1XnH9pY186Knh1S-lShgB1YI5AOKEiEHa' -O B_1.mp3


In [0]:
# Load the three tracks with librosa's default loading options. `sr` is
# rebound on each load, so it ends up holding the rate returned for B_1.
cover_song_paths = ('./A_1.mp3', './A_2.mp3', './B_1.mp3')
(A_1, sr), (A_2, sr), (B_1, sr) = [librosa.load(path) for path in cover_song_paths]

In [0]:
import matplotlib.pyplot as plt

# Chroma-CQT settings shared by the three tracks. The hop is
# (2**6)*344 = 22016 samples, i.e. roughly one frame per second at 22050 Hz.
chroma_kwargs = dict(sr=22050, hop_length=(2**6)*344, n_chroma=12, n_octaves=7)

# Only the first 120*sr samples of each track are analysed.
clip_len = 120*sr
A_1_cpt = librosa.feature.chroma_cqt(y=A_1[:clip_len], **chroma_kwargs)
A_2_cpt = librosa.feature.chroma_cqt(y=A_2[:clip_len], **chroma_kwargs)
B_1_cpt = librosa.feature.chroma_cqt(y=B_1[:clip_len], **chroma_kwargs)

# One chromagram per row: A_1, A_2, B_1.
plt.figure(figsize=(20,10))

plt.subplot(3,1,1)
plt.imshow(A_1_cpt)

plt.subplot(3,1,2)
plt.imshow(A_2_cpt)

plt.subplot(3,1,3)
plt.imshow(B_1_cpt)

In [0]:
import numpy as np

# np.roll shifts elements circularly: rolling by 2 moves the last two
# entries to the front.
a = np.arange(10)
b = np.roll(a, 2)

print(a, b, sep='\n')

In [0]:
# transposition check
# Roll the rows of A_1's chromagram by each of the 12 possible shifts and
# record the mean squared difference to A_2's chromagram; the index of the
# smallest value is the best-matching shift.
x = A_1_cpt
y = A_2_cpt
result = [np.mean(np.square(np.roll(x, shift, axis=0) - y)) for shift in range(12)]

np.argmin(result)

In [0]:
# if transposition?
# Shift the first 120*sr samples of A_2 by 2 steps (12 bins per octave, i.e.
# 2 semitones). `sr` and `n_steps` are passed by keyword: newer librosa
# releases make them keyword-only, and older releases accept the keyword form
# as well.
A_2_trans = librosa.effects.pitch_shift(A_2[:120*sr], sr=sr, n_steps=2, bins_per_octave=12)
ipd.Audio(A_2_trans, rate=sr, autoplay=False)



In [0]:
# Chromagram of the pitch-shifted A_2, computed with the same settings as the
# other chromagrams.
A_2_cpt_trans = librosa.feature.chroma_cqt(
    y=A_2_trans[:120*sr],
    sr=22050,
    hop_length=(2**6)*344,
    n_chroma=12,
    n_octaves=7,
)

# Top: original A_2 chromagram. Bottom: pitch-shifted version.
plt.figure(figsize=(20,7))

plt.subplot(2,1,1)
plt.imshow(A_2_cpt)

plt.subplot(2,1,2)
plt.imshow(A_2_cpt_trans)

In [0]:
# transposition check (if transposition == True)
# Same search as before, now against the pitch-shifted A_2 chromagram: roll
# A_1's chroma rows by each of the 12 shifts and keep the mean squared
# difference for every shift.
x = A_1_cpt
y = A_2_cpt_trans
result = [np.mean(np.square(np.roll(x, shift, axis=0) - y)) for shift in range(12)]

np.argmin(result)

In [0]:
# calculate cross-similarity-matrix
# csm[i, j] is the mean squared difference between chroma frame i of x and
# chroma frame j of y (smaller value = more similar frames).
# The matrix size is derived from the inputs through broadcasting instead of a
# hard-coded 121 x 121, so tracks with fewer frames no longer raise an
# IndexError and longer ones are not silently truncated.
x = A_1_cpt
y = A_2_cpt
csm = np.mean(np.square(x[:, :, np.newaxis] - y[:, np.newaxis, :]), axis=0, dtype=np.float64)


plt.imshow(csm)

In [0]:
# calculate cross-similarity-matrix (transposition==True)
# csm[i, j] is the mean squared difference between chroma frame i of x and
# chroma frame j of y (smaller value = more similar frames).
# The matrix size is derived from the inputs through broadcasting instead of a
# hard-coded 121 x 121, so tracks with fewer frames no longer raise an
# IndexError and longer ones are not silently truncated.
x = A_1_cpt
y = A_2_cpt_trans
csm = np.mean(np.square(x[:, :, np.newaxis] - y[:, np.newaxis, :]), axis=0, dtype=np.float64)


plt.imshow(csm)

In [0]:
# Cross-similarity matrix between A_1 and B_1.
# csm[i, j] is the mean squared difference between chroma frame i of x and
# chroma frame j of y (smaller value = more similar frames).
# The matrix size is derived from the inputs through broadcasting instead of a
# hard-coded 121 x 121, so tracks with fewer frames no longer raise an
# IndexError and longer ones are not silently truncated.
x = A_1_cpt
y = B_1_cpt
csm = np.mean(np.square(x[:, :, np.newaxis] - y[:, np.newaxis, :]), axis=0, dtype=np.float64)


plt.imshow(csm)

In [0]:
# Cross-similarity matrix between A_2 and B_1.
# csm[i, j] is the mean squared difference between chroma frame i of x and
# chroma frame j of y (smaller value = more similar frames).
# The matrix size is derived from the inputs through broadcasting instead of a
# hard-coded 121 x 121, so tracks with fewer frames no longer raise an
# IndexError and longer ones are not silently truncated.
x = A_2_cpt
y = B_1_cpt
csm = np.mean(np.square(x[:, :, np.newaxis] - y[:, np.newaxis, :]), axis=0, dtype=np.float64)


plt.imshow(csm)

In [0]:
# if slower than original
# Time-stretch the first 120*sr samples of A_2 with rate 0.8 (a rate below 1
# slows the audio down). `rate` is passed by keyword: newer librosa releases
# make it keyword-only, and older releases accept the keyword form as well.
A_2_slow = librosa.effects.time_stretch(A_2[:120*sr], rate=0.8)
ipd.Audio(A_2_slow, rate=sr, autoplay=False)

In [0]:
# Chromagram of the slowed-down A_2 (first 120*sr samples), computed with the
# same settings as the other chromagrams.
A_2_cpt_slow = librosa.feature.chroma_cqt(y=A_2_slow[:120*sr], sr=22050, hop_length=(2**6)*344, n_chroma=12, n_octaves=7)

# csm[i, j] is the mean squared difference between chroma frame i of x and
# chroma frame j of y (smaller value = more similar frames).
# The matrix size is derived from the inputs through broadcasting instead of a
# hard-coded 121 x 121, so tracks with fewer frames no longer raise an
# IndexError and longer ones are not silently truncated.
x = A_1_cpt
y = A_2_cpt_slow
csm = np.mean(np.square(x[:, :, np.newaxis] - y[:, np.newaxis, :]), axis=0, dtype=np.float64)


plt.imshow(csm)

# External example : onsets and frames (piano transcription)

arXiv paper: https://arxiv.org/pdf/1710.11153.pdf


In [0]:
#@title Setup Environment

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import glob

print('Copying checkpoint from GCS...')
!rm -r /content/onsets-frames
!mkdir /content/onsets-frames
!gsutil -q -m cp -R gs://magentadata/models/onsets_frames_transcription/* /content/onsets-frames/
!unzip -o /content/onsets-frames/maestro_checkpoint.zip -d /content/onsets-frames
CHECKPOINT_DIR = '/content/onsets-frames/train'
  
print('Installing dependencies...')
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev ffmpeg  
!pip install pyfluidsynth pretty_midi

if glob.glob('/content/onsets-frames/magenta*.whl'):
  !pip install -q /content/onsets-frames/magenta*.whl
else:
  !pip install -qU magenta

# Hack to allow python to pick up the newly-installed fluidsynth lib. 
# This is only needed for the hosted Colab environment.
import ctypes.util

# If this cell has already run, ctypes.util.find_library is the proxy installed
# by the previous run. Unwrap it so the fallback always points at the real
# lookup function; otherwise the previous proxy (which resolves this global
# name at call time) would end up calling itself and recurse forever.
orig_ctypes_util_find_library = getattr(
    ctypes.util.find_library, '_wrapped_find_library', ctypes.util.find_library)

def proxy_find_library(lib):
  """Resolve 'fluidsynth' to a fixed shared-object name; defer everything else.

  Args:
    lib: library name, as passed to ctypes.util.find_library.

  Returns:
    'libfluidsynth.so.1' when `lib` is 'fluidsynth', otherwise whatever the
    original ctypes.util.find_library returns for `lib`.
  """
  if lib == 'fluidsynth':
    return 'libfluidsynth.so.1'
  return orig_ctypes_util_find_library(lib)

# Remember the real function on the proxy so that a re-run can recover it.
proxy_find_library._wrapped_find_library = orig_ctypes_util_find_library
ctypes.util.find_library = proxy_find_library

In [0]:
# onsets and frames (multi-pitch piano transcription)

# https://arxiv.org/abs/1710.11153

import tensorflow as tf
import librosa
import numpy as np

from google.colab import files

from magenta.common import tf_utils
from magenta.music import audio_io
import magenta.music as mm
from magenta.models.onsets_frames_transcription import audio_label_data_utils
from magenta.models.onsets_frames_transcription import configs
from magenta.models.onsets_frames_transcription import constants
from magenta.models.onsets_frames_transcription import data
from magenta.models.onsets_frames_transcription import infer_util
from magenta.models.onsets_frames_transcription import train_util
from magenta.music import midi_io
from magenta.music.protobuf import music_pb2
from magenta.music import sequences_lib

## Define model and load checkpoint
## Only needs to be run once.

# Take the 'onsets_frames' entry of the config map and adjust its
# hyperparameters for this notebook: cuDNN is switched off and the batch size
# is set to 1.
config = configs.CONFIG_MAP['onsets_frames']
hparams = config.hparams
hparams.use_cudnn = False
hparams.batch_size = 1

# Placeholder for a 1-D batch of strings; it is fed with the serialized
# examples when the iterator is initialised after the upload step.
examples = tf.placeholder(tf.string, [None])

# Input dataset built from the placeholder, with preprocessing enabled and
# training mode, shuffling and record skipping all turned off.
dataset = data.provide_batch(
    examples=examples,
    preprocess_examples=True,
    params=hparams,
    is_training=False,
    shuffle_examples=False,
    skip_n_initial_records=0)

# Estimator created from the config's model_fn, the checkpoint directory and
# the adjusted hyperparameters.
estimator = train_util.create_estimator(
    config.model_fn, CHECKPOINT_DIR, hparams)

# Initializable iterator over the dataset; `next_record` is the tensor that
# yields one record each time it is evaluated in a session.
iterator = dataset.make_initializable_iterator()
next_record = iterator.get_next()

In [0]:
# upload audio

# Ask for files through the Colab upload widget; the result maps each file
# name to the uploaded bytes.
uploaded = files.upload()

# Serialized examples, one per uploaded file.
to_process = []
for fn, wav_data in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(wav_data)))

  # Turn the raw audio into examples, paired with an empty NoteSequence.
  record_examples = audio_label_data_utils.process_record(
      wav_data=wav_data,
      ns=music_pb2.NoteSequence(),
      example_id=fn,
      min_length=0,
      max_length=-1,
      allow_empty_notesequence=True)
  example_list = list(record_examples)

  # Exactly one example is expected per file.
  assert len(example_list) == 1
  to_process.append(example_list[0].SerializeToString())

  print('Processing complete for', fn)
  
sess = tf.Session()

# Run the global- and local-variable initializers in one call.
init_ops = [
    tf.initializers.global_variables(),
    tf.initializers.local_variables(),
]
sess.run(init_ops)

# Point the dataset iterator at the serialized examples collected from the uploads.
sess.run(iterator.initializer, feed_dict={examples: to_process})

def transcription_data(params):
  """Build a one-element dataset from the next record of the input iterator.

  Args:
    params: ignored; the argument is accepted and immediately discarded.

  Returns:
    A tf.data.Dataset holding the value obtained by evaluating `next_record`
    in the module-level session `sess`.
  """
  del params
  record = sess.run(next_record)
  return tf.data.Dataset.from_tensors(record)

# Wrap the dataset function with infer_util's labels-to-features wrapper to
# obtain the input function used for prediction.
input_fn = infer_util.labels_to_features_wrapper(transcription_data)

In [0]:
# Ignore warnings caused by pyfluidsynth
import warnings

# Run inference; with yield_single_examples=False each item is a whole batch,
# and exactly one batch is expected.
prediction_list = list(estimator.predict(input_fn, yield_single_examples=False))
assert len(prediction_list) == 1

# Take the first (only) entry of each prediction array in that batch.
prediction = prediction_list[0]
frame_predictions = prediction['frame_predictions'][0]
onset_predictions = prediction['onset_predictions'][0]
velocity_values = prediction['velocity_values'][0]

# Convert the frame-level piano roll, together with the onset and velocity
# predictions, into a NoteSequence.
sequence_prediction = sequences_lib.pianoroll_to_note_sequence(
    frame_predictions,
    frames_per_second=data.hparams_frames_per_second(hparams),
    min_duration_ms=0,
    min_midi_pitch=constants.MIN_MIDI_PITCH,
    onset_predictions=onset_predictions,
    velocity_values=velocity_values)

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Show the transcription and play it through fluidsynth.
mm.plot_sequence(sequence_prediction)
mm.play_sequence(sequence_prediction, mm.midi_synth.fluidsynth,
                 colab_ephemeral=False)