In [None]:
import torchaudio
import IPython.display as ipd
import numpy as np
import pandas as pd
import torch

# Load the BirdCLEF 2022 training metadata table and preview its first rows.
train_metadata_path = '../input/birdclef-2022/train_metadata.csv'
df = pd.read_csv(train_metadata_path)
df.head()

In [None]:
# Build the path of the first training example and embed an audio player for it.
filename_1 = df["filename"].iloc[0]
filename = f"../input/birdclef-2022/train_audio/{filename_1}"
ipd.Audio(filename)


In [None]:
import matplotlib.pyplot as plt

# Decode the file selected above and report the tensor shape and sample rate
# returned by torchaudio.load.
waveform, sample_rate = torchaudio.load(filename)
print(waveform.shape)
print(sample_rate)

# Transpose before plotting so that every row of `waveform` is drawn as its own line.
plt.figure()
plt.plot(waveform.transpose(0, 1).numpy())
plt.show()

In [None]:
# STFT settings for the first spectrogram demo.
n_fft, win_length, hop_length = 1024, None, 512

spectrogram = torchaudio.transforms.Spectrogram(
    n_fft=n_fft,            # FFT size; the output has n_fft // 2 + 1 frequency bins
    win_length=win_length,  # window size in samples; None leaves torchaudio's default in effect
    hop_length=hop_length,  # number of samples between successive frames
    power=2.0,              # exponent applied to the magnitude (2.0 -> power spectrogram)
    center=True,
    pad_mode='reflect',
)

In [None]:
# Apply the Spectrogram transform configured above to the loaded waveform
# and show the shape of the result.
spec = spectrogram(waveform)
spec.shape

In [None]:
import librosa
def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None, top_db=80):
    """Draw a spectrogram on a decibel scale with a colorbar.

    Args:
        spec: Spectrogram to draw. It is converted with
            ``torchaudio.transforms.AmplitudeToDB`` and handed to ``imshow``;
            callers in this notebook pass a single channel such as ``spec[0]``.
        title: Axes title; ``'Spectrogram (db)'`` is used when this is falsy.
        ylabel: Label of the vertical axis.
        aspect: Forwarded unchanged to ``imshow``.
        xmax: Optional upper x-axis limit; the limit is left alone when falsy.
        top_db: Forwarded to ``AmplitudeToDB(top_db=...)``. The default of 80
            reproduces the previously hard-coded value; pass ``None`` to call
            ``AmplitudeToDB`` without a ``top_db`` cut-off.

    Returns:
        None. The figure is shown with ``plt.show(block=False)``.
    """
    fig, ax = plt.subplots(1, 1)
    ax.set_title(title or 'Spectrogram (db)')
    ax.set_ylabel(ylabel)
    ax.set_xlabel('frame')
    spec_db = torchaudio.transforms.AmplitudeToDB(top_db=top_db)(spec)
    # origin='lower' puts row 0 of the image at the bottom of the axes.
    im = ax.imshow(spec_db, origin='lower', aspect=aspect)
    if xmax:
        ax.set_xlim((0, xmax))
    fig.colorbar(im, ax=ax)
    plt.show(block=False)

In [None]:
# Plot the first entry along dim 0 of the spectrogram computed above.
plot_spectrogram(spec[0], title='torchaudio')

In [None]:
# Reference plot: the spectrogram before any masking is applied.
plot_spectrogram(spec[0], title="Original")

# time_mask_param is the maximum possible length of the mask.
masking = torchaudio.transforms.TimeMasking(time_mask_param=80)
# The masked result is stored under a new name (`spec1`); `spec` is not rebound here.
spec1 = masking(spec)
plot_spectrogram(spec1[0], title="Masked along time axis")

In [None]:
# Reference plot: the spectrogram before frequency masking is applied.
plot_spectrogram(spec[0], title="Original")

# freq_mask_param (int): maximum possible length of the mask.
masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
# Store the masked result under its own name instead of rebinding `spec`.
# Rebinding made this cell non-idempotent: every re-run masked the already
# masked spectrogram again, and the "Original" plot stopped showing the original.
spec_freq_masked = masking(spec)

plot_spectrogram(spec_freq_masked[0], title="Masked along frequency axis")

In [None]:
from tqdm import tqdm

# Read the header information of every training file (via torchaudio.info) and
# attach channel count, frame count and sample rate to `df`.
audio_info_columns = ['num_channels', 'num_frames', 'sample_rate']
audio_info_rows = []
# Iterate over the filename column directly: the previous version fed index
# labels to the positional .iloc, which is only correct for a default RangeIndex.
for audio_name in tqdm(df['filename'], total=df.shape[0]):
    filename = f"../input/birdclef-2022/train_audio/{audio_name}"
    metadata = torchaudio.info(filename)
    audio_info_rows.append((metadata.num_channels, metadata.num_frames, metadata.sample_rate))

# Build the new columns once, aligned on df's own index, instead of growing the
# frame with one .loc assignment per cell inside the loop.
audio_info = pd.DataFrame(audio_info_rows, columns=audio_info_columns, index=df.index)
for column in audio_info_columns:
    df[column] = audio_info[column]


In [None]:
# Clip length: number of frames divided by the sample rate (frames per second),
# i.e. the duration in seconds.
df['duration'] = df['num_frames']/df['sample_rate']

In [None]:
# df.to_csv('train_metadata.csv', index=False)

# Data Analysis

In [None]:
# Summary statistics of the numeric columns, including the audio info columns added above.
df.describe()

* num_channels 考虑合并还是拆分
* duration 多数小于50s，但也有超长时长，需要统一处理
* rating意义能否使用

In [None]:
# Listen to the recording with the longest duration.
# idxmax() returns the index *label* of the maximum, which is what .loc expects;
# argmax() returns a position and only matches while df has a default RangeIndex.
longest_row = df.loc[df['duration'].idxmax()]
filename = f"../input/birdclef-2022/train_audio/{longest_row.filename}"
ipd.Audio(filename)

In [None]:
# Count the recordings of every primary label, then list the labels that have
# fewer than 5 recordings.
labelCount = df.groupby('primary_label')[['filename']].count()
labelCount[labelCount['filename'] < 5]

In [None]:
# Distribution (count, mean, quartiles, ...) of the number of recordings per primary label.
df.groupby('primary_label')[['filename']].count().describe()

# Change signal to mono

In [None]:
# Load the recording at row position 3 and play it exactly as decoded.
filename = f"../input/birdclef-2022/train_audio/{df.iloc[3].filename}"
waveform, sample_rate = torchaudio.load(filename)
print('origin', waveform.shape)
display(ipd.Audio(waveform, rate=sample_rate))

# Average over dim 0; keepdim=True keeps that dimension with size 1, so the
# result has the same number of dimensions as the input.
waveform = waveform.mean(dim=0, keepdim=True)
print('mono', waveform.shape)
display(ipd.Audio(waveform, rate=sample_rate))

In [None]:
def print_stats(waveform, sample_rate=None, src=None):
    """Print a short textual summary of a waveform tensor.

    Args:
        waveform: Tensor to summarise (shape, dtype, max, min, mean, std dev,
            followed by the tensor itself).
        sample_rate: Printed as "Sample Rate" when truthy.
        src: Printed as a "Source" banner between two dashed rules when truthy.

    Returns:
        None; everything is written to stdout.
    """
    if src:
        rule = "-" * 10
        print(rule)
        print("Source:", src)
        print(rule)
    if sample_rate:
        print("Sample Rate:", sample_rate)

    print("Shape:", tuple(waveform.shape))
    print("Dtype:", waveform.dtype)

    # Each statistic is computed right before it is printed, in the same order as before.
    peak = waveform.max().item()
    print(f" - Max:     {peak:6.3f}")
    trough = waveform.min().item()
    print(f" - Min:     {trough:6.3f}")
    average = waveform.mean().item()
    print(f" - Mean:    {average:6.3f}")
    spread = waveform.std().item()
    print(f" - Std Dev: {spread:6.3f}")

    # Raw tensor, framed by blank lines.
    print()
    print(waveform)
    print()
    
# Summarise the current `waveform`; src=None means no "Source" banner is printed.
print_stats(waveform, sample_rate=sample_rate, src=None)

# Waveform to Spectrogram

In [None]:
# Show the shape of the current waveform tensor together with its sample rate.
print(waveform.shape,sample_rate)

In [None]:
   
# STFT settings; unlike the first spectrogram cell, the window length is given explicitly.
n_fft, win_length, hop_length = 1024, 1024, 512

spectrogram = torchaudio.transforms.Spectrogram(
    n_fft=n_fft,            # FFT size; the output has n_fft // 2 + 1 frequency bins
    win_length=win_length,  # window size in samples
    hop_length=hop_length,  # number of samples between successive frames
    power=2.0,              # exponent applied to the magnitude (2.0 -> power spectrogram)
    center=True,
    pad_mode='reflect',
)
# Transform the current waveform and display the resulting tensor.
spec = spectrogram(waveform)
spec

In [None]:
# Plot the spectrogram computed above. `plot_spectrogram` (dB scale, 80 dB
# cut-off) is defined in an earlier cell, so it is reused here instead of being
# redefined with a duplicate body.
plot_spectrogram(spec[0])

In [None]:
def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None, top_db=None):
    """Draw a spectrogram on a decibel scale with a colorbar.

    NOTE: this definition replaces the `plot_spectrogram` from an earlier cell,
    which converts to dB with an 80 dB cut-off. Here the cut-off is an explicit
    parameter that defaults to ``None``.

    Args:
        spec: Spectrogram to draw. It is converted with
            ``torchaudio.transforms.AmplitudeToDB`` and handed to ``imshow``;
            the call below passes a single channel, ``spec[0]``.
        title: Axes title; ``'Spectrogram (db)'`` is used when this is falsy.
        ylabel: Label of the vertical axis.
        aspect: Forwarded unchanged to ``imshow``.
        xmax: Optional upper x-axis limit; the limit is left alone when falsy.
        top_db: Forwarded to ``AmplitudeToDB(top_db=...)``. ``None`` (the
            default) matches calling ``AmplitudeToDB()`` with no arguments.

    Returns:
        None. The figure is shown with ``plt.show(block=False)``.
    """
    fig, ax = plt.subplots(1, 1)
    ax.set_title(title or 'Spectrogram (db)')
    ax.set_ylabel(ylabel)
    ax.set_xlabel('frame')
    spec_db = torchaudio.transforms.AmplitudeToDB(top_db=top_db)(spec)
    # origin='lower' puts row 0 of the image at the bottom of the axes.
    im = ax.imshow(spec_db, origin='lower', aspect=aspect)
    if xmax:
        ax.set_xlim((0, xmax))
    fig.colorbar(im, ax=ax)
    plt.show(block=False)
plot_spectrogram(spec[0])

In [None]:
# Mel-spectrogram settings: STFT parameters plus the number of mel bands.
n_fft, win_length, hop_length, n_mels = 1024, None, 512, 128

mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=sample_rate,
    # STFT part
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    power=2.0,
    onesided=True,
    # mel filterbank part
    n_mels=n_mels,
    norm='slaney',
    mel_scale="htk",
)
# Transform the current waveform and display the resulting tensor.
melspec = mel_spectrogram(waveform)
melspec

# How to use rating and deal with secondary_labels? -- Sample

In [None]:
groupSampleNum = 10  # maximum number of rows kept per primary_label

# For every primary_label keep the first `groupSampleNum` rows after sorting by
# secondary_labels and then rating, both in descending order.
top_rows_per_label = [
    groupData.sort_values(['secondary_labels', 'rating'], ascending=False).head(groupSampleNum)
    for _, groupData in df.groupby('primary_label')
]

# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every
# iteration; collect the pieces first and concatenate them once instead.
if top_rows_per_label:
    resample_data = pd.concat(top_rows_per_label).reset_index(drop=True)
else:
    # No groups at all: keep the previous result, an empty frame with df's columns.
    resample_data = pd.DataFrame(columns=df.columns.to_list())

In [None]:
# Persist the sampled metadata. index=False keeps the row index out of the file,
# so reading it back with pd.read_csv does not add an extra 'Unnamed: 0' column.
resample_data.to_csv('resampleTrainMetadata.csv', index=False)

In [None]:
# Display the sampled metadata table.
resample_data

In [None]:
# Reload the sampled metadata from the CSV file written above.
# NOTE(review): this rebinds `df`, so the full metadata frame is no longer
# reachable under that name; earlier cells re-run after this point will operate
# on the sampled table instead.
df = pd.read_csv('resampleTrainMetadata.csv')


In [None]:
# Display the current `df` (at this point the table reloaded from resampleTrainMetadata.csv).
df