# Let's Think of a Better Split Method

In [1]:
import numpy as np
import librosa as lb
import librosa.display as lbd
import soundfile as sf
from  soundfile import SoundFile
import pandas as pd
from  IPython.display import Audio
from pathlib import Path

from matplotlib import pyplot as plt

from tqdm.notebook import tqdm
import joblib, json, re

from  sklearn.model_selection  import StratifiedKFold
tqdm.pandas()

In [2]:
# Load the competition metadata. `secondary_labels` arrives as text, so every
# single-quoted token is extracted to turn each cell into a real list of labels.
df = pd.read_csv('../input/birdclef-2023/train_metadata.csv')
df['secondary_labels'] = df['secondary_labels'].str.findall(r"'(\w+)'")
# Number of secondary labels attached to each recording.
df['len_sec_labels'] = df['secondary_labels'].str.len()


In [3]:
# Peek at three random recordings that carry at least one secondary label.
df[df.len_sec_labels>0].sample(3)

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename,len_sec_labels
16369,yebapa1,[reftin1],"['duet', 'song']",-33.4733,26.9505,Apalis flavida,Yellow-breasted Apalis,Tim Cockcroft,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/622672,yebapa1/XC622672.ogg,1
13488,subbus1,[reedov1],['song'],-26.086,27.844,Telophorus sulfureopectus,Sulphur-breasted Bushshrike,Niall Perrins,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/336397,subbus1/XC336397.ogg,1
9667,libeat1,"[afmdov1, blnmou1, slcbou1]","['call', 'song']",0.5824,37.5861,Merops pusillus,Little Bee-eater,Rory Nefdt,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/235185,libeat1/XC235185.ogg,3


In [4]:
# Number of recordings per primary label.
df.primary_label.value_counts()

barswa     500
wlwwar     500
thrnig1    500
eaywag1    500
comsan     500
          ... 
lotcor1      1
whctur2      1
whhsaw1      1
afpkin1      1
crefra2      1
Name: primary_label, Length: 264, dtype: int64

# Something has to be done for birds that have only a single sample.

## Also, since inference has to finish within 2 hours on CPU, I think the best solution is to use a single train/validation split rather than multiple folds.

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd

def birds_stratified_split(df, target_col, test_size=0.2, random_state=42):
    """
    Split `df` into a training and a validation frame, stratified on `target_col`.

    Rows whose class occurs fewer than 2 times are held out of the stratified
    split and appended to the training frame, so such classes never end up in
    validation only.

    Arguments:
        df {pd.DataFrame} -- one row per sample; left unmodified
        target_col {str} -- name of the column holding the class label
        test_size {float} -- forwarded to train_test_split for the splittable rows
        random_state {int} -- seed forwarded to train_test_split (42 was previously hard-coded)
    Returns:
        tuple -- (train_df, val_df); train_df gets a fresh 0..n-1 index,
                 val_df keeps the row labels it had in `df`
    """
    class_counts = df[target_col].value_counts()
    singleton_classes = class_counts[class_counts < 2].index  # classes with a single sample

    # Boolean mask kept local instead of being written into the caller's frame.
    is_singleton = df[target_col].isin(singleton_classes)
    splittable = df[~is_singleton]

    train_part, val_df = train_test_split(
        splittable,
        test_size=test_size,
        stratify=splittable[target_col],
        random_state=random_state,
    )

    train_df = pd.concat([train_part, df[is_singleton]], axis=0).reset_index(drop=True)

    # Return an independent copy so later column assignments on the validation
    # frame do not operate on a slice of `df`.
    return train_df, val_df.copy()

In [6]:
# Single split on the primary label, with test_size=0.2 for validation.
train_df, valid_df = birds_stratified_split(df, 'primary_label', 0.2)

In [7]:
# Class counts of the full metadata again, for comparison with the two splits below.
df.primary_label.value_counts()

barswa     500
wlwwar     500
thrnig1    500
eaywag1    500
comsan     500
          ... 
lotcor1      1
whctur2      1
whhsaw1      1
afpkin1      1
crefra2      1
Name: primary_label, Length: 264, dtype: int64

In [8]:
# Class counts in the training split.
train_df.primary_label.value_counts()

barswa     400
wlwwar     400
thrnig1    400
eaywag1    400
comsan     400
          ... 
crefra2      1
lotcor1      1
whctur2      1
whhsaw1      1
yebsto1      1
Name: primary_label, Length: 264, dtype: int64

In [9]:
# Class counts in the validation split.
valid_df.primary_label.value_counts()

comsan     100
eaywag1    100
wlwwar     100
barswa     100
thrnig1    100
          ... 
gobsta5      1
palpri1      1
pabspa1      1
macshr1      1
darter3      1
Name: primary_label, Length: 254, dtype: int64

In [10]:
class Config:
    """Constants shared by the audio-loading and spectrogram-export cells of this notebook."""
    sampling_rate = 32000  # sample rate AudioToImage resamples to and uses for the mel-spectrogram
    duration = 5  # length of one spectrogram window; multiplied by sampling_rate to get samples
    fmin = 0  # lower frequency bound handed to the mel-spectrogram
    fmax = None  # upper frequency bound; AudioToImage substitutes sampling_rate // 2 when this is falsy
    audios_path = Path("../input/birdclef-2023/train_audio")  # root that each row's `filename` is joined onto
    # NOTE(review): both output dirs live under ../input, which is usually read-only
    # on Kaggle -- confirm they are writable where this notebook runs.
    out_dir_train = Path("../input/split-creating-melspecs-stage-1/specs/train")  # .npy stacks of the training split
    
    out_dir_valid = Path("../input/split-creating-melspecs-stage-1/specs/valid")  # .npy stacks of the validation split


In [11]:
# Create both spectrogram output folders up front (no error if they already exist).
for out_dir in (Config.out_dir_train, Config.out_dir_valid):
    out_dir.mkdir(parents=True, exist_ok=True)

In [12]:
def get_audio_info(filepath):
    """Open an audio file and report its frame count, sample rate and duration (frames / sample rate)."""
    with SoundFile(filepath) as audio_file:
        n_frames, sample_rate = audio_file.frames, audio_file.samplerate
    return {
        "frames": n_frames,
        "sr": sample_rate,
        "duration": float(n_frames) / sample_rate,
    }

In [13]:
def add_path_df(df, n_jobs=2):
    """
    Return a copy of `df` extended with the audio path and basic file properties.

    Adds a `path` column (Config.audios_path joined with each row's `filename`)
    and the `frames`, `sr` and `duration` columns produced by get_audio_info,
    which is run for every file through a joblib worker pool.

    Arguments:
        df {pd.DataFrame} -- metadata with a `filename` column; left unmodified
        n_jobs {int} -- number of joblib workers (2 was previously hard-coded)
    Returns:
        pd.DataFrame -- new frame with a fresh 0..n-1 index and the extra columns
    """
    info_cols = ["frames", "sr", "duration"]

    # Build a new frame instead of writing into the caller's (possibly sliced)
    # frame, and drop info columns left over from a previous call so re-running
    # the cell does not create duplicate column names.
    out = df.drop(columns=info_cols, errors="ignore").reset_index(drop=True)
    out["path"] = [str(Config.audios_path/filename) for filename in out.filename]

    tasks = [joblib.delayed(get_audio_info)(filepath) for filepath in out.path]
    records = joblib.Parallel(n_jobs)(tqdm(tasks))

    # Both frames carry a 0..n-1 index, so the column-wise concat lines rows up by position.
    info = pd.DataFrame(records, columns=info_cols)
    return pd.concat([out, info], axis=1)

In [14]:
# NOTE(review): redundant -- tqdm.pandas() is already called in the imports cell,
# and no `progress_apply` call is visible in this notebook.
tqdm.pandas()


In [15]:
# Attach file paths and frames / sr / duration to the training metadata.
train_df = add_path_df(train_df)

  0%|          | 0/13554 [00:00<?, ?it/s]

In [16]:
# Attach file paths and frames / sr / duration to the validation metadata.
valid_df = add_path_df(valid_df)

  0%|          | 0/3387 [00:00<?, ?it/s]

In [17]:
# Summary statistics of the recording durations (frames / sample rate) in the training split.
train_df["duration"].describe()

count    13554.000000
mean        40.584178
std         69.757744
min          0.548000
25%         12.816000
50%         24.659297
75%         45.733570
max       2373.528000
Name: duration, dtype: float64

In [18]:
def compute_melspec(y, sr, n_mels, fmin, fmax):
    """
    Build the mel-spectrogram of a signal and convert it from power to decibels.
    Arguments:
        y {np array} -- audio signal
        sr {int} -- sampling rate of `y`
        n_mels {int} -- number of mel bands
        fmin {float} -- lowest frequency passed to the mel-spectrogram
        fmax {float} -- highest frequency passed to the mel-spectrogram
    Returns:
        np array -- mel-spectrogram in dB, cast to float32
    """
    mel_kwargs = dict(y=y, sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax)
    power_spec = lb.feature.melspectrogram(**mel_kwargs)
    return lb.power_to_db(power_spec).astype(np.float32)

In [19]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """
    Standardise an array and rescale it to the 0..255 range as uint8.

    Arguments:
        X {np array} -- input values (e.g. a dB mel-spectrogram)
        eps {float} -- added to `std` before dividing; also the minimum value
                       range required for rescaling
        mean {float or np array} -- value subtracted from X; X.mean() when None.
                       An explicit 0 or an array that broadcasts against X is used as given.
        std {float or np array} -- value X is divided by (plus eps); X.std() when None
    Returns:
        np array -- uint8 array shaped like X; all zeros when the standardised
                    range is not larger than eps
    """
    # `is None` instead of `mean or ...`: a truthiness test discards an explicit 0
    # and raises on multi-element arrays.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Min-max scale to [0, 255]; the uint8 cast truncates the fractional part.
        V = 255 * (X - _min) / (_max - _min)
        V = V.astype(np.uint8)
    else:
        # (Nearly) constant input: nothing to stretch.
        V = np.zeros_like(X, dtype=np.uint8)

    return V

def crop_or_pad(y, length, is_train=True, start=None):
    """
    Force a signal to exactly `length` samples along its first axis.

    Shorter input is zero-padded at the end; longer input is cropped; input
    that already has the right length is returned unchanged.

    Arguments:
        y {np array} -- signal; the first axis is the sample axis
        length {int} -- number of samples to return
        is_train {bool} -- when cropping without an explicit `start`: pick a
                           random offset if True, use offset 0 if False
        start {int} -- explicit crop offset; an explicit 0 is honoured in both modes
    Returns:
        np array -- `length` samples with the dtype of `y`
    """
    y = np.asarray(y)
    n_samples = len(y)

    if n_samples < length:
        # Pad with zeros of the input dtype so float32 audio is not upcast to
        # float64; trailing axes (e.g. channels) are kept.
        pad = np.zeros((length - n_samples,) + y.shape[1:], dtype=y.dtype)
        return np.concatenate([y, pad])

    if n_samples > length:
        if start is None:
            start = np.random.randint(n_samples - length) if is_train else 0
        return y[start:start + length]

    return y

In [20]:
class AudioToImage:
    """
    Callable that turns one audio recording into a stack of mel-spectrogram images.

    The recording is cut into windows of `duration` seconds taken every `step`
    samples. Only windows that start early enough to be full length are produced;
    a recording shorter than one window yields a single window that crop_or_pad
    brings to full length. Each window goes through compute_melspec and
    mono_to_color, and the resulting images are stacked along a new first axis.
    """

    def __init__(self, sr=Config.sampling_rate, n_mels=128, fmin=Config.fmin, fmax=Config.fmax, duration=Config.duration, step=None, res_type="kaiser_fast", resample=True, train = True):
        """
        Arguments:
            sr {int} -- sampling rate the audio is resampled to
            n_mels {int} -- number of mel bands
            fmin {float} -- lowest mel frequency
            fmax {float} -- highest mel frequency; sr // 2 when falsy
            duration {number} -- window length in seconds
            step {int} -- hop between window starts, in samples; one full window when falsy
            res_type {str} -- resampling method forwarded to librosa
            resample {bool} -- whether to resample files whose rate differs from `sr`
            train {bool} -- save under Config.out_dir_train if True, else Config.out_dir_valid
        """
        self.sr = sr
        self.n_mels = n_mels
        self.fmin = fmin
        self.fmax = fmax or self.sr//2

        self.duration = duration
        # int() keeps the window length usable as a slice bound even for a float duration.
        self.audio_length = int(self.duration*self.sr)
        self.step = step or self.audio_length

        self.res_type = res_type
        self.resample = resample

        self.train = train

    def audio_to_image(self, audio):
        """Convert one audio window into a uint8 mel-spectrogram image."""
        melspec = compute_melspec(audio, self.sr, self.n_mels, self.fmin, self.fmax)
        return mono_to_color(melspec)

    def __call__(self, row, save=True):
        """
        Process the recording described by `row` (needs `.path` and `.filename`).

        With save=True the image stack is written to "<out_dir>/<row.filename>.npy"
        and nothing is returned; otherwise (row.filename, images) is returned.
        """
        audio, orig_sr = sf.read(row.path, dtype="float32")

        # soundfile returns (frames, channels) for multi-channel files; down-mix so
        # the sample axis is the only axis for resampling and windowing.
        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        if self.resample and orig_sr != self.sr:
            # orig_sr / target_sr are passed by keyword: librosa >= 0.10 rejects them
            # as positionals, while older releases accept the keyword form too.
            audio = lb.resample(audio, orig_sr=orig_sr, target_sr=self.sr, res_type=self.res_type)

        # max(1, ...) guarantees at least one window for short recordings.
        n_starts = max(1, len(audio) - self.audio_length + 1)
        audios = [audio[i:i+self.audio_length] for i in range(0, n_starts, self.step)]
        audios[-1] = crop_or_pad(audios[-1], length=self.audio_length)
        images = np.stack([self.audio_to_image(chunk) for chunk in audios])

        if not save:
            return row.filename, images

        out_dir = Config.out_dir_train if self.train else Config.out_dir_valid
        path = out_dir/f"{row.filename}.npy"
        # `filename` contains a sub-folder, so the parent may not exist yet.
        path.parent.mkdir(exist_ok=True, parents=True)
        np.save(str(path), images)

In [21]:
# NOTE(review): redundant -- tqdm.pandas() is already called in the imports cell,
# and no `progress_apply` call is visible in this notebook.
tqdm.pandas()

In [22]:
def get_audios_as_images(df, train = True):
    """
    Convert every recording listed in `df` to spectrogram images with AudioToImage.

    Rows are processed by a 2-worker joblib pool. The window hop is 0.666 of the
    window length, so consecutive windows overlap. `train` is forwarded to
    AudioToImage. Nothing is returned.
    """
    hop = int(Config.duration*0.666*Config.sampling_rate)
    to_image = AudioToImage(step=hop, train=train)

    jobs = [joblib.delayed(to_image)(record) for record in df.itertuples(False)]
    joblib.Parallel(2)(tqdm(jobs))

In [23]:
# Export spectrogram stacks for the training split (written under Config.out_dir_train).
get_audios_as_images(train_df, train = True)


  0%|          | 0/13554 [00:00<?, ?it/s]

In [24]:
# Export spectrogram stacks for the validation split (written under Config.out_dir_valid).
get_audios_as_images(valid_df, train = False)

  0%|          | 0/3387 [00:00<?, ?it/s]

In [25]:
#