In [1]:
import os
import uuid
import numpy as np
import librosa
import pylab
import io
import json
import math

from io import BytesIO
import base64
from werkzeug.utils import secure_filename
from flask import Flask, request, jsonify, url_for, render_template
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from matplotlib import cm
from librosa import display
from pydub import AudioSegment
from pydub.utils import make_chunks
from sklearn.preprocessing import normalize



In [2]:
# File extensions (without the leading dot) that allowed_file() accepts for uploads.
ALLOWED_EXTENSION = {'wav'}
# Not referenced by the code visible in this notebook; presumably the image size
# the model expects (predict() hard-codes the same 300x300) -- TODO confirm.
IMAGE_SIZE = 300, 300

In [3]:
def splice(audio, chunk_length_ms=1920):
    """Split a WAV recording into fixed-length chunks, dropping short ones.

    Args:
        audio: Path or file-like object handed to ``AudioSegment.from_file``
            and decoded as WAV.
        chunk_length_ms: Length of each chunk in milliseconds. Defaults to
            1920 (1.92 s), the value that used to be hard-coded here.

    Returns:
        list: One entry per kept chunk, each being the object returned by
        ``chunk.export(BytesIO(), format="wav")``. Chunks whose duration is
        below ``chunk_length_ms`` (beyond floating-point tolerance) are
        skipped, so the list can be empty.

    Raises:
        ValueError: If ``chunk_length_ms`` is not positive.
    """
    if chunk_length_ms <= 0:
        raise ValueError("chunk_length_ms must be a positive number of milliseconds")

    recording = AudioSegment.from_file(audio, "wav")
    # Derive the threshold from the chunk length so the two can never drift
    # apart (they used to be two independent literals, 1920 and 1.92).
    min_duration_s = chunk_length_ms / 1000

    chunk_list = []
    for chunk in make_chunks(recording, chunk_length_ms):
        duration_s = float(chunk.duration_seconds)
        # isclose() guards against a full-length chunk being reported a hair
        # under the threshold because of floating-point rounding.
        too_short = duration_s < min_duration_s and not math.isclose(duration_s, min_duration_s)
        if too_short:
            continue
        chunk_list.append(chunk.export(BytesIO(), format="wav"))
    return chunk_list

def convert_logmel(chunks):
    """Render every audio chunk as a log-mel spectrogram JPEG held in memory.

    Args:
        chunks: Iterable of audio sources accepted by ``librosa.load``
            (the callers in this notebook pass the buffers produced by
            ``splice``). Each one is loaded with ``sr=8000``.

    Returns:
        list[BytesIO]: One rewound JPEG buffer per chunk, in input order.
    """
    n_fft = 2048
    hop_length = 512
    n_mels = 128
    # Forwarded unchanged as librosa's ``window`` argument.
    # NOTE(review): a bare number is presumably interpreted by
    # scipy.signal.get_window -- confirm this is the intended window.
    window = 30

    converted_chunks = []
    for chunk in chunks:
        samples, sr = librosa.load(chunk, sr=8000)
        mel_power = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=n_mels, n_fft=n_fft, window=window, hop_length=hop_length)
        log_power = librosa.power_to_db(mel_power, ref=np.max)

        fig = pylab.figure(figsize=(3,3))
        try:
            pylab.axis('off')
            pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) # Remove the white edge
            librosa.display.specshow(log_power, cmap=cm.jet)
            buf = BytesIO()
            pylab.savefig(buf, bbox_inches=None, pad_inches=0, format="jpg")
        finally:
            # pylab keeps every figure alive until it is closed explicitly;
            # without this a long-running server leaks one figure per chunk.
            pylab.close(fig)

        buf.seek(0)
        converted_chunks.append(buf)
    return converted_chunks

def convert_mfcc(chunks):
    """Render every audio chunk as an MFCC image (JPEG) held in memory.

    Args:
        chunks: Iterable of audio sources accepted by ``librosa.load``
            (the callers in this notebook pass the buffers produced by
            ``splice``). Each one is loaded with ``sr=8000``.

    Returns:
        list[BytesIO]: One rewound JPEG buffer per chunk, in input order.
    """
    n_fft = 2048
    hop_length = 512
    n_mfcc = 13 # 13 - 20 mfcc
    n_mels = 128
    # Forwarded unchanged as librosa's ``window`` argument.
    # NOTE(review): a bare number is presumably interpreted by
    # scipy.signal.get_window -- confirm this is the intended window.
    window = 30

    converted_chunks = []
    for chunk in chunks:
        samples, sr = librosa.load(chunk, sr=8000)
        coefficients = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=n_mfcc, n_mels=n_mels, n_fft=n_fft, window=window, hop_length=hop_length)

        fig = pylab.figure(figsize=(3,3))
        try:
            pylab.axis('off')
            pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) # Remove the white edge
            librosa.display.specshow(coefficients, cmap=cm.jet)
            buf = BytesIO()
            pylab.savefig(buf, bbox_inches=None, pad_inches=0, format="jpg")
        finally:
            # pylab keeps every figure alive until it is closed explicitly;
            # without this a long-running server leaks one figure per chunk.
            pylab.close(fig)

        buf.seek(0)
        converted_chunks.append(buf)
    return converted_chunks

def convert_logmel_display(audio):
    """Render a whole recording as a single log-mel spectrogram JPEG.

    Args:
        audio: Audio source accepted by ``librosa.load``; it is loaded with
            ``sr=8000``.

    Returns:
        BytesIO: JPEG image data, rewound to position 0.
    """
    n_fft = 2048
    hop_length = 512
    n_mels = 128
    # Forwarded unchanged as librosa's ``window`` argument.
    # NOTE(review): a bare number is presumably interpreted by
    # scipy.signal.get_window -- confirm this is the intended window.
    window = 30

    samples, sr = librosa.load(audio, sr=8000)
    mel_power = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=n_mels, n_fft=n_fft, window=window, hop_length=hop_length)
    log_power = librosa.power_to_db(mel_power, ref=np.max)

    fig = pylab.figure(figsize=(3,3))
    try:
        pylab.axis('off')
        pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) # Remove the white edge
        librosa.display.specshow(log_power, cmap=cm.jet)
        buf = BytesIO()
        pylab.savefig(buf, bbox_inches=None, pad_inches=0, format="jpg")
    finally:
        # pylab keeps every figure alive until it is closed explicitly;
        # without this each call leaks a figure in a long-running server.
        pylab.close(fig)

    buf.seek(0)
    return buf

def convert_mfcc_display(audio):
    """Render a whole recording as a single MFCC image (JPEG).

    Args:
        audio: Audio source accepted by ``librosa.load``; it is loaded with
            ``sr=8000``.

    Returns:
        BytesIO: JPEG image data, rewound to position 0.
    """
    n_fft = 2048
    hop_length = 512
    n_mfcc = 13 # 13 - 20 mfcc
    n_mels = 128
    # Forwarded unchanged as librosa's ``window`` argument.
    # NOTE(review): a bare number is presumably interpreted by
    # scipy.signal.get_window -- confirm this is the intended window.
    window = 30

    samples, sr = librosa.load(audio, sr=8000)
    coefficients = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=n_mfcc, n_mels=n_mels, n_fft=n_fft, window=window, hop_length=hop_length)

    fig = pylab.figure(figsize=(3,3))
    try:
        pylab.axis('off')
        pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) # Remove the white edge
        librosa.display.specshow(coefficients, cmap=cm.jet)
        buf = BytesIO()
        pylab.savefig(buf, bbox_inches=None, pad_inches=0, format="jpg")
    finally:
        # pylab keeps every figure alive until it is closed explicitly;
        # without this each call leaks a figure in a long-running server.
        pylab.close(fig)

    buf.seek(0)
    return buf

def predict(logmel_specs, mfcc_specs):
    """Run the module-level ``model`` on paired log-mel / MFCC images.

    Args:
        logmel_specs: Iterable of image sources readable by ``load_img``.
        mfcc_specs: Iterable of image sources readable by ``load_img``,
            paired positionally with ``logmel_specs`` (extra items on the
            longer side are ignored, as with ``zip``).

    Returns:
        list: The result of ``model.predict`` for each pair, in order.
    """
    def _as_model_input(spec):
        # Resize to 300x300, scale pixel values by 1/255 and prepend a
        # batch axis of length 1.
        pixels = img_to_array(load_img(spec, target_size=(300,300)))
        scaled = pixels / 255.0
        return scaled.reshape((1,) + tuple(scaled.shape[:3]))

    return [
        model.predict([_as_model_input(logmel), _as_model_input(mfcc)])
        for logmel, mfcc in zip(logmel_specs, mfcc_specs)
    ]

In [4]:
def allowed_file(filename, allowed_extensions=None):
    """Tell whether ``filename`` carries an accepted extension.

    The comparison is case-insensitive on the filename side, so ``clip.WAV``
    is accepted when ``wav`` is allowed.

    Args:
        filename: Name of the uploaded file.
        allowed_extensions: Container of lower-case extensions without the
            dot. Defaults to the module-level ``ALLOWED_EXTENSION``.

    Returns:
        bool: True when the text after the last ``.`` is an allowed
        extension; False when there is no ``.`` or the extension is not
        allowed.
    """
    if allowed_extensions is None:
        allowed_extensions = ALLOWED_EXTENSION
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in allowed_extensions

In [None]:
# The Flask application object that the route decorators below register on.
app = Flask(__name__)

# Load the saved network once, at cell execution time, so that predict() can
# reuse the same module-level ``model`` for every request.
model = load_model(
    './saved_models/fine-tuning_siamese-vgg19_logmel+mfcc_30epochs_6.h5',
    compile=False,
)

@app.route('/index/')
def index():
    """Serve the page rendered from the ``base.html`` template."""
    page = render_template('base.html')
    return page

@app.route('/upload_and_classify/')
def upload_and_classify():
    """Serve the page rendered from the ``upload_and_classify.html`` template."""
    page = render_template('upload_and_classify.html')
    return page

@app.route('/api/audio', methods=['POST'])
def upload_audio():
    """Classify an uploaded WAV file and render the result in ``base.html``.

    Expects a multipart POST with the file under the ``audio`` field.

    Pipeline:
        1. Read the uploaded audio.
        2. Splice it into 1.92 s chunks (shorter chunks are dropped).
        3. Convert each chunk into log-mel and MFCC spectrogram images.
        4. Get a prediction for each image pair.
        5. Average the predictions.
        6. Render the averaged prediction together with log-mel and MFCC
           images of the whole recording.

    Returns:
        The rendered ``base.html`` template. On success it receives
        ``logmel_image`` and ``mfcc_image`` (base64-encoded JPEG text) and
        ``prediction`` (JSON list of averaged values); on any validation
        failure it receives only ``prediction`` with an explanatory message.
    """
    if 'audio' not in request.files:
        return render_template('base.html', prediction='No posted audio.')
    file = request.files['audio']

    if file.filename == '':
        return render_template('base.html', prediction='You did not select an audio.')

    if not (file and allowed_file(file.filename)):
        return render_template('base.html', prediction='Invalid File')

    filename = secure_filename(file.filename)
    print("***"+filename)

    # Read the upload exactly once and give every consumer its own fresh
    # buffer, so no step depends on where a previous step left the stream.
    file.seek(0)
    audio_bytes = file.read()

    predictions = predict(
        convert_logmel(splice(BytesIO(audio_bytes))),
        convert_mfcc(splice(BytesIO(audio_bytes))),
    )
    if not predictions:
        # No chunk reached the full length; np.concatenate would raise on
        # an empty list, so report the problem to the user instead.
        return render_template('base.html', prediction='Audio is too short: at least 1.92 seconds are required.')

    logmel_buf = convert_logmel_display(BytesIO(audio_bytes))
    mfcc_buf = convert_mfcc_display(BytesIO(audio_bytes))

    # Base64 text so the template can embed the images directly.
    logmel_image = base64.b64encode(logmel_buf.getvalue()).decode('utf-8')
    mfcc_image = base64.b64encode(mfcc_buf.getvalue()).decode('utf-8')

    # Stack the per-chunk predictions and average them over the chunks.
    average_values = np.mean(np.concatenate(predictions, axis=0), axis=0)

    return render_template('base.html', logmel_image=logmel_image, mfcc_image=mfcc_image, prediction = json.dumps(average_values.tolist()))
    

if __name__ == '__main__':
    # Start the development server from inside the notebook kernel with the
    # reloader switched off.
    # NOTE(review): debug=True enables Flask's debug mode -- keep this for
    # local use only and confirm it is never exposed beyond localhost.
    app.run(
        use_reloader=False,
        debug=True,
    )

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
