In [195]:
import librosa.display
import IPython.display as ipd

# feature extractoring and preprocessing data
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL import Image
import pathlib

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Keras
import keras

# Normalization
from sklearn import preprocessing 

import warnings
warnings.filterwarnings('ignore')

In [3]:
np.random.seed(43)
file_path = '.\TUT-acoustic-scenes-2017-development.meta\TUT-acoustic-scenes-2017-development\evaluation_setup'
fold1 = os.listdir(file_path)

In [414]:
train1 = os.path.join(file_path,fold1[2])
fold1_filename = []
fold1_label = []
with open(train1, 'r') as f:
    data = f.readlines()  #data reading line by line
    for line in data:
        record = line.split()        #split the filename and label
        fold1_filename.append(record[0][6:])
        fold1_label.append(record[1])
print('fold 1 has {} sounds to train'.format(len(fold1_filename)))


fold 1 has 3510 sounds to train


Because training set is too large, so we take first 100 songs for testing our network

In [4]:
# audio = []
# for root,dirnames,filenames in os.walk('./'):
#     for filename in filenames:
#         f = os.path.join(root, filename)
#         if f.endswith('.wav'):
#             audio.append(f)

In [415]:
audio = []

indices = np.arange(len(fold1_filename))
np.random.shuffle(indices)
n_sound = 1000
indices = indices[:n_sound]
fold1_filename_small = np.array(fold1_filename)[indices]
fold1_label_small = np.array(fold1_label)[indices]

for root,dirnames,filenames in os.walk('./'):
    for filename in filenames:
         if filename in fold1_filename_small:
            f = os.path.join(root, filename)
            audio.append(f)
print(f'Small training test has {len(audio)} data')
print(f'Small training test label has {len(audio)} records')
classes = set(fold1_label_small)
n_classes = len(classes)
print(f'There are {n_classes} classes in the training data')

Small training test has 1000 data
Small training test label has 1000 records
There are 15 classes in the training data


In [416]:
dictionary = dict(zip(classes, list(np.arange(n_classes))))  # build the connection between labels and numbers
print('label list is the following:')
print(dictionary)
temp=[]
for label in fold1_label_small:
    temp.append(dictionary[label])
fold1_label_small = temp
# print(fold1_label_small)

label list is the following:
{'office': 0, 'residential_area': 1, 'library': 2, 'home': 3, 'beach': 4, 'forest_path': 5, 'metro_station': 6, 'car': 7, 'bus': 8, 'tram': 9, 'cafe/restaurant': 10, 'park': 11, 'city_center': 12, 'grocery_store': 13, 'train': 14}


Run this block for the first time to run this script

In [29]:
# M is a matrix who stores the infomation about each audio
# Structure of matrix M: song stft rmse spec_cent spec_bw rolloff zcr mfcc(39 coef) label
M = []

for song,label in zip(audio,fold1_label_small):
    y, sr = librosa.load(song, mono=True, duration=5) 
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    to_append = f'{np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'    
    for e in mfcc[1:]:  # delete the first coefficient
        to_append += f' {np.mean(e)}'
    to_append += f' {label}'
    M.append(to_append)

In [234]:
# np.save('M.npy',M)   # 6 min to load 1000 sound files
M = np.load('M.npy')

In [196]:
DATA_train= np.zeros((n_sound,26))
for i in range(n_sound):
    DATA_train[i,:] = np.array(M[i].split(' '))
print(DATA_train.shape)


(1000, 26)


In [221]:
X = DATA_train[:,:-1]    #  25 coefficients which define the caracteristics of each sound
y = DATA_train[:,-1].astype(np.int32)   # last column is label
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Normalie each column of X, which means each dimension
# Convert class vector y to binary class matrices, in one-hot form
X_train, y_train = preprocessing.scale(X),keras.utils.to_categorical(y, n_classes)  
n_features = X_train.shape[1]

In [402]:
from keras import models
from keras import layers
import keras.backend as K
from keras import metrics

model = models.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(n_features,)))
# model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(n_classes, activation='softmax'))

In [403]:
model.summary()   # The model is complex for the dataset or not?

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_35 (Dense)             (None, 32)                832       
_________________________________________________________________
dense_36 (Dense)             (None, 15)                495       
Total params: 1,327
Trainable params: 1,327
Non-trainable params: 0
_________________________________________________________________


In [224]:
def r2(y_true, y_pred): # correlation coefficient
    a = K.square(y_pred - y_true)
    b = K.sum(a)
    c = K.mean(y_true)
    d = K.square(y_true - c)
    e = K.sum(d)
    f = 1 - b/e
    return f
def precision_top_k(y_true, y_pred):
    return metrics.top_k_categorical_accuracy(y_true, y_pred,k=5)

In [405]:
# opt = keras.optimizers.RMSprop(learning_rate=0.001)
opt = keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(optimizer=opt,
              loss='categorical_crossentropy',
              metrics=[precision_top_k])    # metrics.categorical_accuracy

In [406]:
# Training
history = model.fit(X_train,y_train,epochs=20,batch_size=100)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [227]:
test1 = os.path.join(file_path,fold1[0])
fold1_filename = []
fold1_label = []
with open(test1, 'r') as f:
    data = f.readlines()  #data reading line by line
 
    for line in data:
        record = line.split()        #split the filename and label
        fold1_filename.append(record[0][6:])
        fold1_label.append(record[1])
print('fold 1 has {} documents to test'.format(len(fold1_filename)))


fold 1 has 1170 documents to test


In [236]:
audio = []

for root,dirnames,filenames in os.walk('./'):
    for filename in filenames:
         if filename in fold1_filename:
            f = os.path.join(root, filename)
            audio.append(f)

print(f'Test set has {len(audio)} records')
classes = set(fold1_label)
n_classes = len(classes)
print(f'There are {n_classes} classes in the test set')

Test set has 1170 records
There are 15 classes in the test set


In [229]:
temp=[]
for label in fold1_label:
    temp.append(dictionary[label])
fold1_label= temp
# print(fold1_label)

Run this block for the first time to run this script

In [237]:
# M is a matrix who stores the infomation about each audio
# Structure of matrix M: song stft rmse spec_cent spec_bw rolloff zcr mfcc(39 coef) label

M_test = []

for song in audio:
    y, sr = librosa.load(song, mono=True, duration=5) 
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    to_append = f'{np.mean(chroma_stft)} {np.mean(rmse)} {np.mean(spec_cent)} {np.mean(spec_bw)} {np.mean(rolloff)} {np.mean(zcr)}'    
    for e in mfcc[1:]:  # delete the first coefficient
        to_append += f' {np.mean(e)}'
    M_test.append(to_append)

In [239]:
# np.save('M.npy',M_test)   # 8min 40s
M_test = np.load('M.npy')

In [256]:
DATA_test = np.zeros((1170,25))
for i in range(1170):
    DATA_test[i,:] = np.array(M_test[i].split(' '))
X_test = DATA_test
y_test = np.array(fold1_label)
X_test, y_test = preprocessing.scale(X_test),keras.utils.to_categorical(y_test, n_classes)

In [407]:
y_pred = model.predict(X_test)
label_pred = np.argsort(y_pred,axis=1)[:,-5:]  # top_5_categorical_accuracy  shape:1170*5
label_true = np.array(fold1_label)
acc=0
for i in range(1170):
    a = label_true[i]
    b = label_pred[i,:]
    acc+=np.isin(a,b,invert=False)
print('top_5_categorical_accuracy on test set is {} %'.format(acc*100/1170))

top_5_categorical_accuracy on test set is 36.41025641025641 %


In [408]:
# Score trained model.
test_loss, test_acc = model.evaluate(X_test,y_test, verbose=1)
print('Test loss:', test_loss)
print(f'Test_acc: {100*test_acc} %') # Low test accuracy less than training data accuracy indicating Overfitting

Test loss: 3.0875690248277454
Test_acc: 36.41025722026825 %
