# Image creation from audio data



# Image generation process
- Compute a dB-scaled mel power spectrogram for each 5-second interval.
- Use the file's primary label for each of these intervals.
- Zero-pad a trailing interval shorter than 5 seconds to a full 5-second image, provided it is at least the minimal duration long.
- Cap the number of images created per file, so that only the beginning of very long recordings is used.
- Create each image independently.

In [1]:
import os, pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pydantic import BaseModel as ConfigBaseModel
from joblib import delayed, Parallel
import librosa
print("librosa:", librosa.__version__)
import tensorflow as tf
print("tensorflow:", tf.__version__)
import cv2
print("opencv:", cv2.__version__)
from IPython.display import Audio

librosa: 0.10.2.post1
tensorflow: 2.16.1
opencv: 4.10.0


# Config

In [3]:
import json
from typing import ClassVar
from pydantic import BaseModel

class Config(BaseModel):
    """Settings for converting the training audio into mel-spectrogram images.

    Attributes annotated with ``ClassVar`` (the input/output paths) are plain
    class attributes, not pydantic fields, so they are not part of
    ``model_dump()``. All other attributes are model fields.
    """

    # --- data: input locations and the rate audio is loaded at ---
    base_dir: ClassVar[str] = "/kaggle/input/birdclef-2023/"
    train_sound_dir: ClassVar[str] = "/kaggle/input/birdclef-2023/train_audio/"
    path_train: ClassVar[str] = base_dir + "train_metadata.csv"
    path_sample_submission: ClassVar[str] = base_dir + "sample_submission.csv"
    # Passed as `sr` to librosa.load and librosa.feature.melspectrogram.
    sample_rate: int = 32_000

    # --- spec: spectrogram geometry and mel parameters ---
    # (mel bands, time frames): index 0 feeds n_mels, index 1 feeds hop_length.
    img_size: tuple[int, int] = (128, 256)
    # Length of one window in seconds.
    seconds: int = 5
    # Upper bound on the number of windows taken from a single file.
    num_offset_max: int = 24
    # Duration (seconds) subtracted before counting windows, so a trailing
    # remainder shorter than this does not get its own window.
    min_duration: float = 0.5
    n_fft: int = 2048
    # NOTE: n_mels and hop_length are evaluated once, while the class body
    # runs, from the DEFAULT values above. Passing a different img_size,
    # seconds, sample_rate or n_fft to Config(...) does not recompute them.
    n_mels: int = img_size[0]
    # With the defaults this is 619; together with center=False a window of
    # seconds * sample_rate samples then yields img_size[1] (256) frames.
    hop_length: int = (seconds * sample_rate - n_fft) // (img_size[1] - 1)
    center: bool = False
    # Frequency range handed to the mel filter bank (fmin / fmax arguments).
    fmin: int = 500
    fmax: int = 12_500
    # Passed as `top_db` to librosa.power_to_db.
    top_db: int = 80

    # --- output ---
    # Root directory; image paths are built as out_dir + primary_label + file name.
    out_dir: ClassVar[str] = "/kaggle/working/train/"
    # Only referenced by the (currently commented-out) cv2.imwrite call.
    jpeg_quality: int = 100

cfg = Config()

# Write the model fields to cfg.json as pretty-printed JSON
# (the ClassVar paths are not model fields and are therefore not included).
cfg_json = json.dumps(cfg.model_dump(), indent=2)
with open("cfg.json", "w") as f:
    f.write(cfg_json)


# Prepare

In [6]:
# Load the training metadata; one row per recording.
data = pd.read_csv(cfg.path_train)
# Full path of each recording: the "filename" column is appended to the audio root.
data["path_ogg"] = cfg.train_sound_dir + data["filename"]

In [7]:
# Every column of the sample submission except the first one is a class label.
sample_submission = pd.read_csv(cfg.path_sample_submission)
labels = sample_submission.columns[1:].to_list()
assert labels == sorted(labels), "labels are not sorted"

# Map each label name to its position in the (sorted) label list.
label_ids = np.arange(len(labels))
label_encoder = pd.Series(label_ids, index=labels)
data["label"] = data["primary_label"].map(label_encoder)

In [None]:
def get_duration(rec):
    """Return the duration reported by librosa for one metadata record.

    Only ``rec["path_ogg"]`` is read; it is handed to
    ``librosa.get_duration`` as the file path.
    """
    ogg_path = rec["path_ogg"]
    return librosa.get_duration(path=ogg_path)

def get_duration_df(df):
    """Run ``get_duration`` on every row of *df* and return the results.

    The function is applied row-wise, so the returned object is indexed
    like *df*.
    """
    return df.apply(get_duration, axis="columns")

In [None]:
# Split the metadata into one chunk per CPU, measure durations in worker
# processes, then stitch the per-chunk results back together (pd.concat keeps
# the original row index, so the assignment aligns with `data`).
durations = Parallel(
    n_jobs=os.cpu_count(),
    verbose=1,
    backend='multiprocessing',
)(map(delayed(get_duration_df), np.array_split(data, os.cpu_count())))
data["duration"] = pd.concat(durations)

In [None]:
print(data["duration"].head(10))

In [None]:
# Number of windows per file: one, plus one more for every further
# `cfg.seconds` of audio that remains once `cfg.min_duration` is subtracted;
# the count is then capped at `cfg.num_offset_max`.
full_steps = (data["duration"] - cfg.min_duration) // cfg.seconds
data["num_offset"] = (1 + full_steps).astype('int').clip(upper=cfg.num_offset_max)

In [None]:
data["num_offset"].head(10)

## Get spectrogram image
In short, we use spectrograms of 5-second intervals as input images. But how should the corner cases be handled?
- For very long audio files, only a maximum number of offsets is considered.
- Very short files are zero-padded to the full interval length.

In [5]:
# NOTE: this rebinds `data`, replacing the frame built in the cells above with
# a pre-computed metadata file. Later cells read `label`, `filename`,
# `primary_label` and `num_offset` from it — presumably the CSV already
# contains those columns; verify before re-running.
data = pd.read_csv("/kaggle/input/unsampled-metadata/upsampled_metadata (2).csv")
print(data.head(10))

  primary_label secondary_labels                                type  \
0       abethr1               []                            ['song']   
1       abethr1               []                            ['call']   
2       abethr1               []                            ['song']   
3       abethr1               []                            ['song']   
4       abethr1               []                    ['call', 'song']   
5       abethr1      ['rbsrob1']                            ['song']   
6       abethr1               []                    ['call', 'song']   
7       abethr1               []                            ['song']   
8       abethr1               []                            ['song']   
9       abethr1               []  ['adult', 'sex uncertain', 'song']   

   latitude  longitude     scientific_name               common_name  \
0    4.3906    38.2788  Turdus tephronotus  African Bare-eyed Thrush   
1   -2.9524    38.2921  Turdus tephronotus  African Bare-eyed T

In [8]:
def get_mel_spec_db(path_ogg, offset):
    """Return the dB-scaled mel power spectrogram of one fixed-length window.

    Args:
        path_ogg: Path of the audio file to read.
        offset: Zero-based window index; the window starts
            ``offset * cfg.seconds`` seconds into the file and is
            ``cfg.seconds`` long.

    Returns:
        The output of ``librosa.power_to_db`` for the window's mel
        spectrogram, expressed relative to the window's own maximum and
        limited to a dynamic range of ``cfg.top_db``.
    """
    required_len = cfg.seconds * cfg.sample_rate
    sig, _ = librosa.load(
        path=path_ogg,
        sr=cfg.sample_rate,
        offset=offset * cfg.seconds,
        duration=cfg.seconds,
    )
    # Force the window to exactly `required_len` samples. Trimming first keeps
    # the pad width non-negative even if the loader returns a few samples too
    # many; short windows (end of file) are then zero-padded at the end.
    sig = sig[:required_len]
    sig = np.pad(sig, (0, required_len - len(sig)))
    mel_spec = librosa.feature.melspectrogram(
        y=sig,
        hop_length=cfg.hop_length,
        sr=cfg.sample_rate,
        n_fft=cfg.n_fft,
        n_mels=cfg.n_mels,
        center=cfg.center,
        fmin=cfg.fmin,
        fmax=cfg.fmax,
    )
    return librosa.power_to_db(mel_spec, ref=np.max, top_db=cfg.top_db)

def normalize_img(img):
    """Min-max scale a 2-D array to the uint8 image range 0..255.

    Args:
        img: 2-D numeric array.

    Returns:
        ``uint8`` array of the same shape, with the minimum of *img* mapped
        to 0 and the maximum to 255. A constant input (no contrast) maps to
        an all-zero image.

    Raises:
        AssertionError: If *img* is not 2-dimensional.
    """
    assert img.ndim == 2, "unexpected dimension"
    v_min, v_max = np.min(img), np.max(img)
    span = v_max - v_min
    if span == 0:
        # Constant image (e.g. a fully silent window): scaling would be 0/0,
        # which yields NaN and an undefined uint8 cast. Return black instead.
        return np.zeros(img.shape, dtype='uint8')
    return ((img - v_min) / span * 255).astype('uint8')

def process_record(rec):
    """Build the image bookkeeping table for a single metadata record.

    One row is produced per window index in ``range(rec.num_offset)`` with the
    columns ``label``, ``orig_filename``, ``offset``, ``ret`` and ``filename``
    (the image path relative to the output root, i.e. ``<label dir>/<name>``).

    NOTE: image generation is switched off in this version. The steps
        os.makedirs(rec_dir, exist_ok=True)
        img = normalize_img(get_mel_spec_db(rec.path_ogg, offset=offset))
        ret = cv2.imwrite(path_img, img, [cv2.IMWRITE_JPEG_QUALITY, cfg.jpeg_quality])
    are not executed, nothing is written to disk, and ``ret`` is the
    placeholder string "TRUE" rather than a real write status.
    """
    target_dir = cfg.out_dir + rec.primary_label
    label, source_name = rec.label, rec.filename
    rows = []
    for idx in range(rec.num_offset):
        img_name = f"{pathlib.Path(rec.filename).stem}_{idx}.jpeg"
        img_path = os.path.join(target_dir, img_name)
        rows.append({
            "label": label,
            "orig_filename": source_name,
            "offset": idx,
            "ret": "TRUE",
            # keep only "<label dir>/<file name>"
            "filename": "/".join(pathlib.Path(img_path).parts[-2:]),
        })
    return pd.DataFrame(rows)


def process_data(data):
    """Run ``process_record`` on every row of a metadata frame.

    A failure on one record is logged and recorded, and processing continues
    with the next record, so the function always returns its two lists.

    Args:
        data: DataFrame whose rows expose at least a ``filename`` attribute
            (via ``itertuples``) plus whatever ``process_record`` reads.

    Returns:
        Tuple ``(l_stats, errors)``: ``l_stats`` holds the per-record results
        of ``process_record``; ``errors`` holds ``(filename, message)`` pairs
        for the records that raised.
    """
    errors = []
    l_stats = []
    for rec in data.itertuples():
        try:
            stats = process_record(rec)
        except Exception as err:
            # Batch boundary: one unreadable file must not abort the chunk or
            # discard the results gathered so far.
            print(f"Error reading {rec.filename}: {str(err)}")
            errors.append((rec.filename, str(err)))
        else:
            l_stats.append(stats)
    return l_stats, errors

#### Dev

# Run all

In [9]:
# One chunk of the metadata per CPU; each worker process returns the
# (stats, errors) pair produced by process_data for its chunk.
results = Parallel(
    n_jobs=os.cpu_count(),
    verbose=1,
    backend='multiprocessing',
)(map(delayed(process_data), np.array_split(data, os.cpu_count())))

[Parallel(n_jobs=4)]: Using backend MultiprocessingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    8.0s remaining:    8.0s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   10.1s finished


In [10]:
# Flatten the per-worker (stats, errors) pairs into two flat lists.
errors = [err for _, worker_errors in results for err in worker_errors]
img_stats = [frame for worker_frames, _ in results for frame in worker_frames]
# Merge the per-record frames into one table with a fresh 0..n-1 index;
# an empty result stays an empty list.
if img_stats:
    img_stats = pd.concat(img_stats, ignore_index=True)
img_stats

Unnamed: 0,label,orig_filename,offset,ret,filename
0,0,abethr1/XC128013.ogg,0,TRUE,abethr1/XC128013_0.jpeg
1,0,abethr1/XC128013.ogg,1,TRUE,abethr1/XC128013_1.jpeg
2,0,abethr1/XC128013.ogg,2,TRUE,abethr1/XC128013_2.jpeg
3,0,abethr1/XC128013.ogg,3,TRUE,abethr1/XC128013_3.jpeg
4,0,abethr1/XC128013.ogg,4,TRUE,abethr1/XC128013_4.jpeg
...,...,...,...,...,...
136526,75,crefra2/XC667443.ogg,4,TRUE,crefra2/XC667443_4.jpeg
136527,75,crefra2/XC667443.ogg,5,TRUE,crefra2/XC667443_5.jpeg
136528,75,crefra2/XC667443.ogg,6,TRUE,crefra2/XC667443_6.jpeg
136529,75,crefra2/XC667443.ogg,7,TRUE,crefra2/XC667443_7.jpeg


In [11]:
print("Expected number of images:", data["num_offset"].sum())

Expected number of images: 136531


In [12]:
errors

[]

In [14]:
img_stats.to_csv("/kaggle/working/" + "img_stats.csv", index=False)

In [None]:
def convert_bytes(num):
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it is below 1024 or the largest unit
    (TB) is reached, and is printed with one decimal place.

    Args:
        num: Size in bytes.

    Returns:
        String such as ``"1.5 KB"``. Values of 1024 TB or more are still
        reported in TB.
    """
    units = ('bytes', 'KB', 'MB', 'GB', 'TB')
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # Largest unit: always return a string instead of falling off the loop.
    return "%3.1f %s" % (num, units[-1])

        
# Total size of everything located exactly one directory level below the
# output root (the per-label folders' contents).
bs = sum(entry.stat().st_size for entry in pathlib.Path(cfg.out_dir).glob("*/*"))
print(cfg.out_dir, convert_bytes(bs))

In [2]:
%cd /kaggle/working/
!ls

/kaggle/working
cfg.json  train


In [3]:
%cd train

/kaggle/working/train


In [5]:
%cd ../
%cd ../

/kaggle/working
/kaggle


In [10]:
%cd /kaggle/

/kaggle


In [11]:
import pandas as pd
# The stats file is written to /kaggle/working/, so read it from there by
# absolute path; a bare "img_stats.csv" depends on the current directory
# (which the %cd cells above have moved to /kaggle).
data = pd.read_csv("/kaggle/working/img_stats.csv")
print(data.head(10))

FileNotFoundError: [Errno 2] No such file or directory: 'img_stats.csv'