In [None]:
import numpy as np
import pandas as pd
import xarray as xr

import matplotlib.pyplot as plt
plt.rcParams["figure.facecolor"] = "w"

from pathlib import Path

import librosa 
import librosa.display


#### Read with pydub
import pydub
def read_mp3_pydub(f, normalized=False):
    """Decode an MP3 file into a NumPy array using pydub.

    Parameters
    ----------
    f : path-like or file object
        Passed unchanged to ``pydub.AudioSegment.from_mp3``.
    normalized : bool, default False
        If False, return the decoded integer samples as-is. If True, return
        ``float32`` samples divided by the full-scale value implied by the
        segment's sample width (``2**15`` for 16-bit audio), so values lie in
        ``[-1.0, 1.0)``.

    Returns
    -------
    frame_rate : int
        Sampling rate reported by the decoded segment.
    samples : numpy.ndarray
        Shape ``(n_frames,)`` for mono input and ``(n_frames, n_channels)``
        for multi-channel input.
    """
    segment = pydub.AudioSegment.from_mp3(f)
    samples = np.array(segment.get_array_of_samples())

    # The sample array interleaves channels, so split it into one column per
    # channel for any multi-channel layout (not only stereo).
    if segment.channels > 1:
        samples = samples.reshape((-1, segment.channels))

    if not normalized:
        return segment.frame_rate, samples

    # sample_width is in bytes; full scale for signed PCM is 2**(bits - 1).
    # Using it instead of a fixed 2**15 keeps 8/24/32-bit audio in [-1, 1).
    full_scale = float(1 << (8 * segment.sample_width - 1))
    return segment.frame_rate, samples.astype(np.float32) / full_scale

In [None]:
# Locate the sample clip inside the project's data folder.
data_dir = Path("../data")
audio_file = data_dir.joinpath("cv-valid-train/sample-000001.mp3")

# sr=None asks librosa to keep the file's native sampling rate rather than
# resampling to its default rate.
y, sr = librosa.load(
    str(audio_file.resolve()),
    sr=None,
)

In [None]:
# STFT parameters. For speech the recommended n_fft is 512 at 22,050 Hz; the
# sampling rate here is 48,000 Hz, so a 1024-sample window is used instead.
N_FFT = 1024
HOP_LENGTH = 512

# Power spectrogram (|STFT|**2) expressed in dB relative to its peak.
# power_to_db is the conversion that matches squared magnitudes;
# amplitude_to_db would square the values a second time and double every dB
# reading before its dynamic-range clipping is applied.
power_spec = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP_LENGTH)) ** 2
specgram = librosa.power_to_db(power_spec, ref=np.max)

# Mel power spectrogram in dB. n_fft is left at librosa's default here, so
# only the hop length is shared with the STFT above.
mel = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=HOP_LENGTH)
mel_db = librosa.power_to_db(mel, ref=np.max)

# Three stacked panels sharing one time axis: waveform, STFT, mel.
fig, ax = plt.subplots(3, 1, figsize=(16, 7), sharex=True)

# A time axis is waveshow's default; the explicit `x_axis` keyword is omitted
# because newer librosa releases deprecate it in favour of `axis`.
librosa.display.waveshow(y, sr=sr, ax=ax[0])
ax[0].set_xlabel("")
ax[0].set_title("Audio signal")

librosa.display.specshow(
    specgram, sr=sr, hop_length=HOP_LENGTH, x_axis="time", y_axis="log", ax=ax[1]
)
ax[1].set_xlabel("")
ax[1].set_title("Spectrogram")

librosa.display.specshow(
    mel_db, sr=sr, hop_length=HOP_LENGTH, x_axis="time", y_axis="mel", ax=ax[2]
)
ax[2].set_title("Mel spectrogram");