### Centro Universitário da Fundação Educacional Inaciana "Padre Sabóia de Medeiros" (FEI)


*FEI's Stricto Sensu Graduate Program in Electrical Engineering*

Concentration area: ARTIFICIAL INTELLIGENCE APPLIED TO AUTOMATION AND ROBOTICS

Master's thesis student Andre Luiz Florentino

***

## Check for GPU

In [None]:
import os

# TensorFlow reads TF_CPP_MIN_LOG_LEVEL when its native library is loaded, so the
# variable must be set before the import to have any effect on the start-up logs.
os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL', '2')

import tensorflow as tf
print(tf.__version__)

# Kept under a descriptive name (not `pd`) so that re-running this cell later
# can never shadow the pandas alias used by the rest of the notebook.
physical_devices = tf.config.list_physical_devices()
for device in physical_devices:
    print(device)
print('------------------------------------------------------------------------------------------')


print(tf.config.list_physical_devices('GPU'))
# e.g. [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

# is_built_with_cuda is a function: it has to be called to obtain the boolean answer,
# printing the bare attribute only shows the function object.
print(tf.test.is_built_with_cuda())
# e.g. True

print(tf.test.gpu_device_name())
# e.g. /device:GPU:0

for visible_device in tf.config.get_visible_devices():
    print(visible_device)
# e.g. PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')
#      PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')

# Chapter 8: Feature extraction for CNN 2D (Convolutional Neural Network)

***

## Import modules

In [None]:
import librosa
import librosa.display
import os
import warnings
import random
import mimetypes


import pandas     as pd
import seaborn    as sns
import numpy      as np
import IPython.display as ipd

from random import sample

from matplotlib  import pyplot  as plt

from tqdm                        import tqdm

In [None]:
# Globals
current_path = os.getcwd()  # working directory at start-up; dataset and output paths are built from it

# For the picture names
pic_first_name = '08_Feature_extraction_for_CNN_2D_agg_'  # prefix of every saved figure; a running number is appended

# For Librosa
FRAME_SIZE  = 1024   # frame_length used when splitting the audio on silence
HOP_LENGTH  = 512    # hop length used for silence splitting and for the mel spectrogram
SEED        = 1000   # seed handed to the TensorFlow and NumPy random generators
SR          = 22050  # sampling rate used for loading, playback and feature extraction
N_FTT       = 2048   # passed as n_fft to the mel spectrogram (NOTE(review): name probably meant N_FFT; kept because other cells read it)
BANDS       = 60     # number of mel bands (n_mels)

# Values for feature extraction
threshold   = 60     # top_db passed to librosa.effects.split
frames      = 44     # frame count from which window_audio_file derives its window length

In [None]:
# Hide TensorFlow's C++ INFO and WARNING messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

warnings.filterwarnings('ignore')

# Display options for the DataFrames shown in the notebook
pd.set_option('display.max_columns', 9)
pd.set_option('display.width', 300)
pd.set_option('display.max_colwidth', 120)

# Seed every random number generator the notebook draws from. The standard-library
# `random` module is used for the random sample/index selection further down, so it
# is seeded together with TensorFlow and NumPy to make a full re-run reproducible.
random.seed(SEED)
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Register the .ogg extension so it is recognised as audio
mimetypes.init()
mimetypes.add_type('audio/ogg','.ogg')

## Loading the dataset

In [None]:
# Select the dataset

# Settings of every supported dataset, keyed by its menu number.
#   name        : dataset name; folder, csv and pickle file names are derived from it
#   fold        : value stored in `fold` (its type differs per dataset)
#   dog_set     : label of the dog-bark class in 'Class_categorical'
#   time_length : clip length in seconds used by the pre-processing
#   windowingNo : windows per clip (presumably what the windowed pickle contains -- TODO confirm)
#   wind_tag    : suffix of the third pickle file. The US8K variants are only windowed
#                 (augmented and windowed makes no sense, the dataset is already quite large)
DATASET_OPTIONS = {
    1: {'name': 'ESC-10',  'fold': 1,        'dog_set': 'Dog bark', 'time_length': 5,  'windowingNo': 9,  'wind_tag': 'augmented'},
    2: {'name': 'BDLib2',  'fold': 'fold-1', 'dog_set': 'dogs',     'time_length': 10, 'windowingNo': 19, 'wind_tag': 'augmented'},
    3: {'name': 'US8K',    'fold': '1',      'dog_set': 'dog_bark', 'time_length': 4,  'windowingNo': 7,  'wind_tag': 'windowed'},
    4: {'name': 'US8K_AV', 'fold': '1',      'dog_set': 'dog_bark', 'time_length': 4,  'windowingNo': 7,  'wind_tag': 'windowed'},
}

# Keep asking until the answer is exactly one of the menu numbers. Comparing the
# integer against the valid keys (instead of a substring test on '1234') rejects
# answers such as 12 or 234, which used to leave the loop without configuring anything.
opcD = 0
while opcD not in DATASET_OPTIONS:
    print()
    for option, settings in DATASET_OPTIONS.items():
        print(f"{option}-) {settings['name']}")

    answer = input("\nSelect the dataset: ").strip()
    try:
        opcD = int(answer)
    except ValueError:
        opcD = 0

dataset_cfg = DATASET_OPTIONS[opcD]
nom_dataset = dataset_cfg['name']

path        = os.path.join(current_path, "_dataset", nom_dataset)
path_pic    = os.path.join(current_path, f"{nom_dataset}_results")
path_models = os.path.join(current_path, f"{nom_dataset}_saved_models")

# Create the output folders when they are missing. The results folder is needed
# as well: the figures are saved there and its content is listed to number them.
os.makedirs(path_models, exist_ok=True)
os.makedirs(path_pic, exist_ok=True)

subfolders  = next(os.walk(path))[1]
csv_file    = f'{nom_dataset}.csv'
fold        = dataset_cfg['fold']
dog_set     = dataset_cfg['dog_set']
time_length = dataset_cfg['time_length']
windowingNo = dataset_cfg['windowingNo']
aug_factor  = 6

pkl_features          = f'{nom_dataset}_features_original.pkl'
pkl_aug_features      = f'{nom_dataset}_features_augmented_no_windowing.pkl'
pkl_aug_wind_features = f"{nom_dataset}_features_{dataset_cfg['wind_tag']}.pkl"

pkl_features_CNN_2D          = f'{nom_dataset}_features_CNN_2D_original.pkl'
pkl_aug_features_CNN_2D      = f'{nom_dataset}_features_CNN_2D_augmented_no_windowing.pkl'
pkl_aug_wind_features_CNN_2D = f"{nom_dataset}_features_CNN_2D_{dataset_cfg['wind_tag']}.pkl"

In [None]:
def get_next_file_number(folder: str, prefix: str = None):
    """Return the next free running number for the pictures saved in ``folder``.

    Files are considered when their name starts with ``prefix``; the number is the
    last ``_``-separated token of the name up to its first dot
    (e.g. ``<prefix>07.png`` -> 7).

    Parameters
    ----------
    folder : str
        Directory that holds the saved pictures.
    prefix : str, optional
        File-name prefix to look for. Defaults to the notebook-wide ``pic_first_name``.

    Returns
    -------
    int
        Highest number found plus one, or 1 when the folder does not exist or
        contains no numbered file with that prefix.
    """
    if prefix is None:
        prefix = pic_first_name

    # Nothing has been saved yet if the folder itself is missing
    if not os.path.isdir(folder):
        return 1

    numbers = []
    for name in os.listdir(folder):
        if not name.startswith(prefix):
            continue
        if not os.path.isfile(os.path.join(folder, name)):
            continue
        token = name.split('.')[0].split('_')[-1]
        try:
            numbers.append(int(token))
        except ValueError:
            # A file that shares the prefix but carries no number must not break the numbering
            continue

    return max(numbers) + 1 if numbers else 1

In [None]:
from MT_loadDataset import loadDataset

In [None]:
# The loader instance gets its own name: rebinding `loadDataset` would shadow the
# imported class and make this cell fail as soon as it is executed a second time.
dataset_loader = loadDataset(path)
DB             = dataset_loader.db_B

print("\nClasses:\n--------------------")
print(DB["Class_categorical"].value_counts())
print("\nTotal number of unique files..........: ", len(np.unique(DB["File_name"])))
print("Total number of AUDIO files...........: ", len(DB))
DB

In [None]:
# Analysis of the class balancing

sns.set_style("darkgrid")

class_total = len(pd.Series(DB['Class_categorical']).unique())
gTitle      = f'{nom_dataset} - Number of classes = ' + str(class_total)

# One bar per class, coloured by class
g = sns.displot(DB, x='Class_categorical', hue='Class_categorical', height = 5, aspect = 2)
g = g.set(title=gTitle)
g.set_xticklabels(rotation=90)
g.set_titles('Number of classes')

# Axes that hold the bars
axes = g.ax

# Write the count above every bar that is not empty
for bar in axes.patches:
    bar_height = bar.get_height()
    if not bar_height > 0:
        continue
    left, bottom = bar.get_xy()
    axes.annotate(f'{bar_height}', (left + bar.get_width()/2, bottom + bar_height), ha='center')

# The x tick labels already name the classes, so the legend is dropped
g._legend.remove()

plt.tight_layout()

## Pre-processing the data (Features extraction)

### Exploratory code that led to a class for extracting the features

Hand crafting the features into the dataframe

In [None]:
os.chdir(path)

In [None]:
pwd

In [None]:
# Draw one random row per class; the resulting Series is indexed by class and original row label
k = DB.groupby('Class_categorical')['Class_OHEV'].apply(lambda rows: rows.sample(1))
print(k)

# Turn the index levels into columns, discard the original row label ('level_1')
# and use the class name as the index of the resulting frame
temp_k_df = (
    k.reset_index()
     .drop(columns='level_1')
     .set_index("Class_categorical")
)

# Dictionary that maps each class name to its encoded vector (Class: Class_encoder)
encoder_dict = temp_k_df["Class_OHEV"].to_dict()
encoder_dict

In [None]:
type(DB['Class_OHEV'][0][0])

In [None]:
encoder_dict[dog_set]

In [None]:
# Read the pkl file with the (original / augmented / windowed) audio data

# Keep asking until the answer is exactly 1, 2 or 3. Testing the integer itself
# (instead of a substring test on '123') rejects answers such as 12 or 23, which
# used to leave the loop without loading anything.
opc = 0
while opc not in (1, 2, 3):
    print()
    print("1-) Features original")
    print("2-) Features augmented")
    print("3-) Features augmented and windowed (US8K is only windowed)")

    answer = input("\nSelect the dataset: ").strip()
    try:
        opc = int(answer)
    except ValueError:
        opc = 0

# check_agg is the number of rows each original file contributes to the selected pickle
if opc == 1:
    source_pkl       = pkl_features
    model_surname    = '_original'
    pkl_feature_file = pkl_features_CNN_2D
    check_agg        = 1

elif opc == 2:
    source_pkl       = pkl_aug_features
    model_surname    = '_augmented'
    pkl_feature_file = pkl_aug_features_CNN_2D
    check_agg        = aug_factor

else:
    source_pkl       = pkl_aug_wind_features
    model_surname    = '_windowed'
    pkl_feature_file = pkl_aug_wind_features_CNN_2D
    # The US8K variants (options 3 and 4) are windowed only; the others are also augmented
    check_agg        = windowingNo if opcD in (3, 4) else windowingNo * aug_factor

DB_from_pkl = pd.read_pickle(os.path.join(path_models, source_pkl))

In [None]:
check_agg

In [None]:
DB_from_pkl.dtypes

In [None]:
# Add up the duration of every clip (seconds) and report the total in hours
clip_durations = (librosa.get_duration(y=clip) for clip in DB_from_pkl['Audio'])
total_duration = sum(clip_durations)
print('Total duration of the dataset: ' , "{:0.4f} h".format(total_duration / 3600))

In [None]:
DB_from_pkl

In [None]:
for col in DB_from_pkl.columns:
    print(col)

In [None]:
DB_from_pkl = DB_from_pkl[['Audio', 'Class_categorical', 'Class_OHEV', 'Fold']]
DB_from_pkl

In [None]:
dog = DB_from_pkl[DB_from_pkl['Class_categorical'] == dog_set]
dog

In [None]:
random_idx = random.choice(dog.index.tolist())
random_sample = dog['Audio'][random_idx]
print(f'Dataframe index....: {random_idx}')
print(f'Sample file name...: {random_sample}')

In [None]:
ipd.Audio(random_sample, rate = SR)

In [None]:
X      = DB_from_pkl['Audio'].values
y_cat  = DB_from_pkl['Class_categorical'].values
y_OHEV = DB_from_pkl['Class_OHEV'].values
folds  = DB_from_pkl['Fold'].values

In [None]:
type(DB_from_pkl['Class_OHEV'][random_idx])

In [None]:
librosa.get_duration(y = dog['Audio'][random_idx])

In [None]:
len(dog['Audio'][random_idx])

In [None]:
pkl_features_CNN_2D

In [None]:
# Method to extract the Log-Mel + Deltas as input for the CNN 2D

def LogMel_extractor(audio_clips:list):
    """Compute log-mel spectrograms and their first/second deltas for a batch of clips.

    Every clip is converted to a mel spectrogram (BANDS mel bands, N_FTT-point FFT,
    HOP_LENGTH hop, SR sampling rate) and then to decibels. All clips must produce
    the same number of frames, because the results are stacked into one array.

    Parameters
    ----------
    audio_clips : sequence of 1-D arrays
        Audio signals to process.

    Returns
    -------
    aggregated : np.ndarray, shape (n_clips, 3 * BANDS, n_frames, 1)
        Log-mel, delta and delta-delta stacked vertically along the band axis.
    features : np.ndarray, shape (n_clips, BANDS, n_frames, 3)
        The same three maps stored as channels (0: log-mel, 1: delta, 2: delta-delta).

    Raises
    ------
    ValueError
        If no clip is given or the clips do not all yield the same number of frames.
    """
    if len(audio_clips) == 0:
        raise ValueError("LogMel_extractor needs at least one audio clip")

    log_specgrams = []
    framesLst     = []

    for signal in tqdm(audio_clips):
        melspec = librosa.feature.melspectrogram(y          = signal,
                                                 n_mels     = BANDS,
                                                 hop_length = HOP_LENGTH,
                                                 n_fft      = N_FTT,
                                                 sr         = SR)

        logspec = librosa.power_to_db(melspec)
        log_specgrams.append(logspec)
        framesLst.append(logspec.shape[1])

    # The frame count of the whole batch must be unique; relying on the value of the
    # last clip only would hide clips of a different length behind a reshape error.
    if len(set(framesLst)) != 1:
        raise ValueError(f"All clips must yield the same number of frames, got {sorted(set(framesLst))}")
    n_frames = framesLst[0]

    # Shape: (clips, bands, frames, 1 channel), stored with float32 precision
    log_specgrams = np.asarray(log_specgrams, dtype='float32').reshape(len(log_specgrams), BANDS, n_frames, 1)

    # Append two zero-filled channels that will receive the deltas
    features = np.concatenate((log_specgrams,
                               np.zeros(np.shape(log_specgrams)),
                               np.zeros(np.shape(log_specgrams))), axis=3)

    # Fill the delta and delta-delta channels from the log-mel channel
    for i in tqdm(range(len(features))):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0], order = 1)
        features[i, :, :, 2] = librosa.feature.delta(features[i, :, :, 0], order = 2)

    # Vertically stack the three channels to create the aggregated structure of features
    mel, delta1, delta2 = np.split(features, 3, axis=3)
    aggregated          = np.concatenate((mel, delta1, delta2), axis=1)

    duration = "{:.2f}".format((n_frames*HOP_LENGTH)/SR)
    print(f"Mel spectrogram created by a {duration} seconds audio. Number of frames: {n_frames}")

    return np.array(aggregated), np.array(features)

In [None]:
# Always extract from the raw audio column. Passing `X` back into the extractor and
# rebinding it would feed the features in as audio when this cell is executed again.
X, X_channel = LogMel_extractor(DB_from_pkl['Audio'].values)

In [None]:
X.shape

In [None]:
len(X)

In [None]:
len(X[0])

In [None]:
len(X[0][0])

In [None]:
X[0].shape

In [None]:
# randrange(len(X)) covers every valid position 0 .. len(X) - 1.
# randint(1, len(X)) is inclusive on both ends: it never selected position 0 and
# could return len(X), which is out of range.
index = random.randrange(len(X))
random_sample_agg = X[index]
print(index)
print(random_sample_agg.shape)

In [None]:
random_sample = X_channel[index]
print(index)
print(random_sample.shape)

In [None]:
X[index]

In [None]:
DB_from_pkl['Audio'][index]

In [None]:
DB_from_pkl['Class_categorical'][index]

In [None]:
ipd.Audio(DB_from_pkl['Audio'][index], rate = SR)

In [None]:
X_mel, X_mel_delta, X_mel_delta2 = np.split(random_sample, 3, axis=2)

print(X_mel.shape)
print(X_mel_delta.shape)
print(X_mel_delta2.shape)


X_mel = np.squeeze(X_mel) 
X_mel_delta = np.squeeze(X_mel_delta)
X_mel_delta2 = np.squeeze(X_mel_delta2)


print(X_mel.shape)
print(X_mel_delta.shape)
print(X_mel_delta2.shape)

In [None]:
X_mel[0]

In [None]:
X_mel_delta[0]

In [None]:
X_mel_delta2[0]

In [None]:
t = X[index]
t.shape

In [None]:
t_s = np.squeeze(t) 
t_s.shape

In [None]:
# Split based on the number of mel bands: the aggregated feature stacks the
# log-mel, delta and delta-delta maps vertically, BANDS rows each.

array1 = t_s[:BANDS, :]
array2 = t_s[BANDS:2 * BANDS, :]
array3 = t_s[2 * BANDS:, :]

In [None]:
array1[0]

In [None]:
array2[0]

In [None]:
array3[0]

In [None]:
(X_mel[0] == array1[0]).all()

In [None]:
(X_mel_delta[0] == array2[0]).all()

In [None]:
(X_mel_delta2[0] == array3[0]).all()

In [None]:
array1.shape

In [None]:
X_mel.shape

In [None]:
picture_name = f'{pic_first_name}{get_next_file_number(path_pic):02d}.png'

plt.figure(figsize=(17,7))

librosa.display.specshow(X_mel, sr=SR, x_axis='time', y_axis='mel')
plt.colorbar()
plt.title(f"Mel frequency spectrogram of {DB_from_pkl['Class_categorical'][index]} (model: {model_surname})" )

plt.savefig(os.path.join(path_pic, picture_name))

plt.tight_layout()

In [None]:
picture_name = f'{pic_first_name}{get_next_file_number(path_pic):02d}.png'

plt.figure(figsize=(17,7))

librosa.display.specshow(X_mel_delta, sr=SR, x_axis='time', y_axis='mel')
plt.colorbar()
plt.title(f"Delta Mel frequency spectrogram of {DB_from_pkl['Class_categorical'][index]} (model: {model_surname})" )

plt.savefig(os.path.join(path_pic, picture_name))

plt.tight_layout()

In [None]:
picture_name = f'{pic_first_name}{get_next_file_number(path_pic):02d}.png'

plt.figure(figsize=(17,7))

librosa.display.specshow(X_mel_delta2, sr=SR, x_axis='time', y_axis='mel')
plt.colorbar()
plt.title(f"Delta Delta Mel frequency spectrogram of {DB_from_pkl['Class_categorical'][index]} (model: {model_surname})" )

plt.savefig(os.path.join(path_pic, picture_name))

plt.tight_layout()

In [None]:
picture_name = f'{pic_first_name}{get_next_file_number(path_pic):02d}.png'

plt.figure(figsize=(17,7))

librosa.display.specshow(array1, sr=SR, x_axis='time', y_axis='mel')
plt.colorbar()
plt.title(f"Mel frequency spectrogram of {DB_from_pkl['Class_categorical'][index]} from split aggregated feature (model: {model_surname})" )

plt.savefig(os.path.join(path_pic, picture_name))

plt.tight_layout()

In [None]:
picture_name = f'{pic_first_name}{get_next_file_number(path_pic):02d}.png'

plt.figure(figsize=(17,7))

librosa.display.specshow(array2, sr=SR, x_axis='time', y_axis='mel')
plt.colorbar()
plt.title(f"Delta Mel frequency spectrogram of {DB_from_pkl['Class_categorical'][index]} from split aggregated feature (model: {model_surname})" )

plt.savefig(os.path.join(path_pic, picture_name))

plt.tight_layout()

In [None]:
picture_name = f'{pic_first_name}{get_next_file_number(path_pic):02d}.png'

plt.figure(figsize=(17,7))

librosa.display.specshow(array3, sr=SR, x_axis='time', y_axis='mel')
plt.colorbar()
plt.title(f"Delta Delta Mel frequency spectrogram of {DB_from_pkl['Class_categorical'][index]} from split aggregated feature (model: {model_surname})" )

plt.savefig(os.path.join(path_pic, picture_name))

plt.tight_layout()

In [None]:
# One feature array per row of the DataFrame
array_list = list(X)

# Build the feature table from the source frame without modifying it: the audio
# column is dropped and the features are appended as the last column. Inserting
# the column into DB_from_pkl itself would raise as soon as this cell is re-run.
DB_features = DB_from_pkl.drop(columns='Audio').assign(features=array_list)

In [None]:
DB_features

In [None]:
pkl_feature_file

In [None]:
# Save the features to a pickle file

DB_features.to_pickle(os.path.join(path_models, pkl_feature_file))

In [None]:
# Read the pkl file with the features extracted for the CNN classifier

DB_retrieved_pkl = pd.read_pickle(os.path.join(path_models, pkl_feature_file))

In [None]:
DB_retrieved_pkl

In [None]:
# Retrieve the information from DataFrame as numpy array
X3 = DB_retrieved_pkl['features'].to_numpy()
y3 = DB_retrieved_pkl['Class_OHEV'].values

#Reshape to the correct dimension
X3_reshaped = np.stack(X3)

In [None]:
X3_reshaped.shape

In [None]:
(X == X3_reshaped).all()

In [None]:
# Count the positions at which the stored label differs from the in-memory one
mismatch_flags = [not np.array_equal(in_memory, stored) for in_memory, stored in zip(y_OHEV, y3)]
count          = sum(mismatch_flags)

print("The arrays are NOT identical" if count > 0 else "The arrays are identical")

## Checking the manually created log-mel spectrogram data against the aggregated method

In [None]:
# Windowing function

def windows(data, window_size):
    """Yield ``(start, end)`` index pairs of half-overlapping windows over ``data``.

    Successive windows start ``window_size / 2`` apart. The last pairs may reach
    beyond ``len(data)``; the caller decides whether to keep such partial windows.

    Parameters
    ----------
    data : sized sequence
        Only its length is used.
    window_size : int or float
        Window length in samples; must be positive.

    Raises
    ------
    ValueError
        If ``window_size`` is not positive (the start position would never advance).
    """
    if window_size <= 0:
        raise ValueError("window_size must be positive")

    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += (window_size / 2)

In [None]:
# Function to cut an audio signal into fixed-length, half-overlapping windows

def window_audio_file(data: list, SR: int, frames_no: int = None):
    """Split ``data`` into complete windows of ``HOP_LENGTH * (frames_no - 1)`` samples.

    Window positions come from ``windows``; windows that would run past the end of
    the signal are discarded.

    Parameters
    ----------
    data : 1-D array
        Audio signal to split.
    SR : int
        Sampling rate. Not used by the function; kept for the existing call sites.
    frames_no : int, optional
        Frame count that defines the window length. Defaults to the notebook-wide
        ``frames`` value.

    Returns
    -------
    np.ndarray
        The complete windows stacked in order of appearance.
    """
    if frames_no is None:
        frames_no = frames

    # Same hop length as the feature extraction instead of a second hard-coded 512
    window_size    = HOP_LENGTH * (frames_no - 1)
    audio_windowed = []

    for (start, end) in windows(data, window_size):
        signal = data[start:end]

        # Keep complete windows only
        if len(signal) == window_size:
            audio_windowed.append(signal)

    return np.array(audio_windowed)

In [None]:
def pre_processing(files:list, time_length:int):
    """Remove silence from every clip and loop the remainder to a fixed length.

    For each clip the non-silent intervals are located with ``librosa.effects.split``
    (``threshold`` dB below the reference, FRAME_SIZE / HOP_LENGTH framing), joined
    together, repeated until they cover ``time_length`` seconds and finally
    truncated to exactly ``int(time_length * SR)`` samples.

    Parameters
    ----------
    files : sequence of 1-D arrays
        Audio signals to normalise.
    time_length : int
        Target duration in seconds.

    Returns
    -------
    list of np.ndarray
        One array of ``int(time_length * SR)`` samples per input clip.

    Raises
    ------
    ValueError
        If a clip contains no non-silent audio, so there is nothing to repeat.
    """
    silence_threshold = threshold
    target_samples    = int(time_length * SR)
    audio_array       = []

    for position, audio in enumerate(tqdm(files)):

        # Split the audio into non-silent intervals
        non_silent_intervals = librosa.effects.split(audio,
                                                     top_db       = silence_threshold,
                                                     frame_length = FRAME_SIZE,
                                                     hop_length   = HOP_LENGTH)

        # Join the non-silent segments with one array operation instead of
        # extending a Python list sample by sample
        segments    = [audio[start:end] for start, end in non_silent_intervals]
        kept_length = sum(len(segment) for segment in segments)
        if kept_length == 0:
            raise ValueError(f"Clip at position {position} contains no non-silent audio")
        non_silent_audio_array = np.concatenate(segments)

        # Repeat the non-silent audio so that it covers the target length
        repeats        = target_samples // len(non_silent_audio_array) + 1
        extended_audio = np.tile(non_silent_audio_array, repeats)

        # Truncate the extended audio to match the desired duration
        audio_array.append(extended_audio[:target_samples])

    return audio_array

In [None]:
def augmentation(files:list):
    """Create six variants of every clip and bring them all to the dataset's clip length.

    Per clip, in this order: the unchanged signal, a randomly time-shifted copy
    (the gap is filled with low-amplitude uniform noise), two time-stretched
    copies (rates 0.85 and 1.15) and two pitch-shifted copies (+4 and -4 steps).
    The collected signals are then passed through ``pre_processing`` with the
    notebook-wide ``time_length``.
    """
    variants = []

    for clip in files:

        # Random shift in samples; the sign selects the direction
        shift = int(np.random.uniform(-4800,4800))
        noise = np.random.uniform(-0.001,0.001, abs(shift))

        if shift < 0:
            # Noise first, the tail of the clip is cut off
            shifted = np.r_[noise, clip[:shift]]
        else:
            # The head of the clip is cut off, noise is appended
            shifted = np.r_[clip[shift:], noise]

        variants.extend([
            clip,
            shifted,
            librosa.effects.time_stretch(clip, rate=0.85),
            librosa.effects.time_stretch(clip, rate=1.15),
            librosa.effects.pitch_shift(clip, sr = SR, n_steps = 4),
            librosa.effects.pitch_shift(clip, sr = SR, n_steps = -4),
        ])

    return pre_processing(variants, time_length)

In [None]:
# Gets a random position from the original dataset.
# randrange(len(DB)) covers 0 .. len(DB) - 1; randint(1, len(DB)) never picked the
# first file and could return len(DB), which does not exist.
# (assumes DB keeps the default 0-based integer index -- TODO confirm)

index_chk = random.randrange(len(DB))
index_chk

In [None]:
# Creates a probe list from the random index

probe_list = []
t = DB['Path'][index_chk]
print(t)
probe, _ = librosa.load(t, sr = SR)
probe_list.append(probe)
print(probe_list)

In [None]:
# Sample from original audio dataset

print(DB['Class_categorical'][index_chk])
print('Audio file duration: ' , "{:0.4f} s".format(librosa.get_duration(y=probe)))
print(len(probe))
ipd.Audio(probe, rate = SR)

In [None]:
probe.shape

In [None]:
np.shape(probe_list)

In [None]:
# Audio normalization

audio_pp = pre_processing(probe_list, time_length)

In [None]:
np.shape(audio_pp)

In [None]:
# Audio augmentation

audio_aug = augmentation(probe_list)

In [None]:
np.shape(audio_aug)

In [None]:
# Sample from original audio dataset pre-processed individually

print('Audio file duration: ' , "{:0.4f} s".format(librosa.get_duration(y=audio_pp[0])))
print(len(audio_pp[0]))
ipd.Audio(audio_pp[0], rate = SR)

In [None]:
# Audio framing (windowing)

windowed = window_audio_file(audio_pp[0], SR)

In [None]:
windowed.shape

In [None]:
# Sample from original audio dataset pre-processed individually and windowed

print('Audio file duration: ' , "{:0.4f} s".format(librosa.get_duration(y=windowed[0])))
print(len(windowed[0]))
ipd.Audio(windowed[0], rate = SR)

In [None]:
windowed[0]

In [None]:
# Equivalent sample from the augmented dataset

print(DB_from_pkl['Class_categorical'][index_chk * check_agg])
print('Audio file duration: ' , "{:0.4f} s".format(librosa.get_duration(y=DB_from_pkl['Audio'][index_chk * check_agg])))
DB_from_pkl['Audio'][index_chk * check_agg]
ipd.Audio(DB_from_pkl['Audio'][index_chk * check_agg], rate = SR)

In [None]:
windowed[0].shape

In [None]:
DB_from_pkl['Audio'][index_chk * check_agg].shape

In [None]:
index_chk * check_agg

In [None]:
if opc == 1:
    array_check = audio_pp[0]
    
elif opc == 2:
    array_check = audio_aug[0]
    
else:
    array_check = windowed[0]

In [None]:
(array_check == DB_from_pkl['Audio'][index_chk * check_agg]).all()

In [None]:
# Check the audio data if uncommented in line 62 --> DB_features = DB_features.drop(columns='Audio')
# Not needed anymore; it was used during script development to check the data consistency

# (array_check == DB_retrieved_pkl['Audio'][index_chk * check_agg]).all()

In [None]:
# Check the audio data if uncommented in line 62 --> DB_features = DB_features.drop(columns='Audio')
# Not needed anymore; it was used during script development to check the data consistency

# (array_check == DB_features['Audio'][index_chk * check_agg]).all()

In [None]:
XS  = librosa.feature.melspectrogram(y          = array_check,
                                     sr         = SR, 
                                     n_fft      = N_FTT,
                                     hop_length = HOP_LENGTH,
                                     n_mels     = BANDS)

Xdb = librosa.power_to_db(XS)

In [None]:
XS.shape

In [None]:
Xdb.shape

In [None]:
Xdb_delta = librosa.feature.delta(Xdb, order = 1)
Xdb_delta.shape

In [None]:
Xdb_delta2 = librosa.feature.delta(Xdb, order = 2)
Xdb_delta2.shape

In [None]:
#X_mel, X_mel_delta, X_mel_delta2 = np.split(X_channel[0], 3, axis=2)
#print(X_mel.shape)  # Output: (60, 41, 1)
#print(X_mel_delta.shape)  # Output: (60, 41, 1)
#print(X_mel_delta2.shape)  # Output: (60, 41, 1)

#X_mel = np.squeeze(X_mel) 
#X_mel_delta = np.squeeze(X_mel_delta)
#X_mel_delta2 = np.squeeze(X_mel_delta2)

#print(X_mel.shape)  # Output: (60, 41, 1)
#print(X_mel_delta.shape)  # Output: (60, 41, 1)
#print(X_mel_delta2.shape)  # Output: (60, 41, 1)

In [None]:
index_chk * check_agg

In [None]:
print(DB_retrieved_pkl['Class_categorical'][index_chk * check_agg])

# Drop the trailing channel axis and split the aggregated map back into its three
# parts; each part occupies BANDS rows (log-mel, delta, delta-delta).
temp = np.squeeze(X3[index_chk * check_agg]).astype('float32')
X_mel, X_mel_delta, X_mel_delta2 = temp[:BANDS, :], temp[BANDS:2 * BANDS, :], temp[2 * BANDS:, :]

print(X_mel.shape)         # 2-D: (BANDS, number of frames)
print(X_mel_delta.shape)   # 2-D: (BANDS, number of frames)
print(X_mel_delta2.shape)  # 2-D: (BANDS, number of frames)

In [None]:
print((Xdb        == X_mel).all())
print((Xdb_delta  == X_mel_delta).all())
print((Xdb_delta2 == X_mel_delta2).all())

In [None]:
# Just a test

# Stored under its own name: assigning the result to `frames` would overwrite the
# global frame count that window_audio_file reads and break any later call to it.
frames_axis_test = np.linspace(1, 216, num=216)
frames_axis_test

In [None]:
# Compute mel spectrogram and plot result

def mel_spectrogram_aug(audio_mel:np.ndarray, title_add:str):
    """Plot a spectrogram with numbered frames and save it as the next numbered picture.

    Parameters
    ----------
    audio_mel : np.ndarray
        2-D map to display; rows are shown on the mel axis, columns on the time axis.
    title_add : str
        Text appended to the standard plot title.
    """
    picture_name = f'{pic_first_name}{get_next_file_number(path_pic):02d}.png'

    n_frames = audio_mel.shape[1]

    plt.figure(figsize = (20, 8))

    librosa.display.specshow(audio_mel, sr=SR, hop_length=HOP_LENGTH, x_axis='time', y_axis='mel')
    plt.colorbar(format = "%+2.0f dB")

    plt.title(nom_dataset + " - Log mel frequency spectrogram for the selected sample " + title_add, fontsize = 16)
    plt.xlabel("Time")

    # Number every frame column once, centred below the column. Exact integer
    # indices are used: truncating linspace(0, n, n) skipped one number and
    # placed the labels on the column edges.
    for frame_idx in range(n_frames):
        plt.annotate(str(frame_idx), ((frame_idx + 0.5) / n_frames, 0.0), xycoords='axes fraction', ha='center')

    plt.tight_layout()
    plt.savefig(os.path.join(path_pic, picture_name))
    plt.show()

In [None]:
for i in X_mel:
    print(i)

In [None]:
for i in Xdb:
    print(i)

In [None]:
mel_spectrogram_aug(Xdb, '(manually created for ' + model_surname + ')')

In [None]:
mel_spectrogram_aug(X_mel, '(from the method for ' + model_surname + ')')

In [None]:
mel_spectrogram_aug(Xdb_delta, '(Delta - manually created for ' + model_surname + ')')

In [None]:
mel_spectrogram_aug(X_mel_delta, '(Delta - from the method for ' + model_surname + ')')

In [None]:
mel_spectrogram_aug(Xdb_delta2, '(Delta 2 - manually created for ' + model_surname + ')')

In [None]:
mel_spectrogram_aug(X_mel_delta2, '(Delta 2 - from the method for ' + model_surname + ')')

In [None]:
(Xdb == X_mel).all()

In [None]:
(Xdb_delta == X_mel_delta.astype(np.float32)).all()

In [None]:
(Xdb_delta2 == X_mel_delta2.astype(np.float32)).all()

## End of the notebook
