# STEP 3 Audio Preparation and Model


In [231]:
import os
import pandas as pd
import numpy as np
import librosa
import soundfile as sf

In [232]:
#Global parameters
language_dic = {"it" : "Italian", "es" : "Spanish"}
analysis_window_length = 0.01  # 10 ms in seconds
language_mp3_path = "/Users/Andee/Documents/CBS - Data Science/Second Semester/Machine Learning/Assignments/MLAssignments/FinalProject/languages"

In [229]:
# number of repeats defined by minimum duration: ceil(10 seconds / min duration)
min_clip = 1.3035
x_seconds = 5
num_reps = int(np.ceil(x_seconds / min_clip))
def repeat_audio_x_seconds(track, dur, num_repeats=10, fs=16000):
    num_samples_xs = int(fs * dur)
    track = np.concatenate([track]*num_repeats, axis=0)
    track = track[0:num_samples_xs]
    return track

In [255]:
data = {"filename": [],"language": [], "tracks": [], "mfccs": [] }
language_list = os.listdir(language_mp3_path)

for language in language_list:
    language_path = os.path.join(language_mp3_path, language)

    # Skip non-directory items
    if not os.path.isdir(language_path):
        continue

    if language in language_dic:
        language_name = language_dic[language]
    else:
        print("Unknown language!")
        language_name = language

    clips_path = os.path.join(language_path, "clips")

    # Check if the clips directory exists
    if not os.path.isdir(clips_path):
        print(f"No 'clips' directory found in {language_path}")
        continue

    # get a list of all files in the folder
    mp3_list = os.listdir(clips_path)
    mp3_list = mp3_list[:200]
    # looping through all mp3s in one language
    for mp3 in mp3_list:
        # adding the filename as key
        data["filename"].append(mp3)
        # adding the label/language
        data["language"].append(language_name)
        # adding the clip
        audio_path = os.path.join(clips_path, mp3)
        audio_samples, fs = sf.read(audio_path)
        audio_samples = repeat_audio_x_seconds(audio_samples, x_seconds, num_repeats=num_reps, fs=fs)
        data["tracks"].append(audio_samples)
        # calculate MFCC for the clip
        #y, sr = sf.read(audio_path)
        y = audio_samples
        sr = fs
        hop_length = int(analysis_window_length * sr)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=128, hop_length=hop_length)
        mfccs_scaled_features = np.mean(mfcc.T,axis=0)
        data["mfccs"].append(mfccs_scaled_features)

In [256]:
df = pd.DataFrame(data)
print(df.head())
df.mfccs[50].shape

                       filename language  \
0  common_voice_it_35120759.mp3  Italian   
1  common_voice_it_35392147.mp3  Italian   
2  common_voice_it_35163674.mp3  Italian   
3  common_voice_it_35270868.mp3  Italian   
4  common_voice_it_35219606.mp3  Italian   

                                              tracks  \
0  [0.0, -9.25904426914148e-13, -2.37487178360307...   
1  [0.0, 1.898078048900853e-12, 8.351942184722794...   
2  [0.0, 9.291632133839878e-13, 1.317183666825483...   
3  [0.0, 3.016666777488908e-13, -5.72718142442113...   
4  [0.0, -1.1740968440532296e-12, 4.2296575313363...   

                                               mfccs  
0  [-183.24911437422236, 128.47573750048255, -14....  
1  [-545.4151837868718, 115.31204428122385, 13.94...  
2  [-350.3443815564079, 132.89305024105846, 13.59...  
3  [-431.7047493775508, 59.977884118447584, 17.76...  
4  [-365.42030358211827, 91.3890718105903, 19.736...  


(128,)

In [257]:
#Check-point to see if all data was correctly loaded
print(f'The number of files loaded is: {len(df["filename"])}, in {df["language"].nunique()} languages and {len(data["tracks"])} tracks ')

The number of files loaded is: 400, in 2 languages and 400 tracks 


In [258]:
print(df["mfccs"].shape)
X=np.array(df['mfccs'].tolist())
y=np.array(df['language'].tolist())

(400,)


In [259]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))

In [260]:
# using the train test split function
X_train, X_test, y_train, y_test = train_test_split(X,y ,
                                                    random_state=42,
                                                    train_size=0.8)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(320, 128)
(80, 128)
(320, 2)
(80, 2)


Model Creation

In [261]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [262]:
### No of classes
num_labels=y.shape[1]

In [263]:
model=Sequential()
###first layer
model.add(Dense(100,input_shape=(128,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
###second layer
model.add(Dense(200))
model.add(Activation('relu'))
model.add(Dropout(0.5))
###third layer
model.add(Dense(100))
model.add(Activation('relu'))
model.add(Dropout(0.5))

###final layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [264]:
model.summary()

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_104 (Dense)           (None, 100)               12900     
                                                                 
 activation_104 (Activation)  (None, 100)              0         
                                                                 
 dropout_78 (Dropout)        (None, 100)               0         
                                                                 
 dense_105 (Dense)           (None, 200)               20200     
                                                                 
 activation_105 (Activation)  (None, 200)              0         
                                                                 
 dropout_79 (Dropout)        (None, 200)               0         
                                                                 
 dense_106 (Dense)           (None, 100)             

In [265]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [266]:
## Training model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 100
num_batch_size = 32

checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.hdf5',
                               verbose=1, save_best_only=True)
start = datetime.now()

model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100
 1/10 [==>...........................] - ETA: 1:10 - loss: 28.6835 - accuracy: 0.4688
Epoch 1: val_loss improved from inf to 8.22651, saving model to saved_models/audio_classification.hdf5
Epoch 2/100
Epoch 2: val_loss improved from 8.22651 to 2.57317, saving model to saved_models/audio_classification.hdf5
Epoch 3/100
 1/10 [==>...........................] - ETA: 0s - loss: 13.7121 - accuracy: 0.5000
Epoch 3: val_loss improved from 2.57317 to 0.85716, saving model to saved_models/audio_classification.hdf5
Epoch 4/100
Epoch 4: val_loss improved from 0.85716 to 0.55576, saving model to saved_models/audio_classification.hdf5
Epoch 5/100
Epoch 5: val_loss improved from 0.55576 to 0.46197, saving model to saved_models/audio_classification.hdf5
Epoch 6/100
Epoch 6: val_loss did not improve from 0.46197
Epoch 7/100
Epoch 7: val_loss did not improve from 0.46197
Epoch 8/100
Epoch 8: val_loss did not improve from 0.46197
Epoch 9/100
Epoch 9: val_loss did not improve from 0.46197
Epo

In [267]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.9125000238418579
