In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
#         print(os.path.join(dirname, filename))


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
import pandas as pd       
import os 
import math 
import numpy as np
import matplotlib.pyplot as plt  
import IPython.display as ipd  # To play sound in the notebook
import librosa
import librosa.display
from tqdm import *
from keras.callbacks import (EarlyStopping, LearningRateScheduler,
                             ModelCheckpoint, TensorBoard, ReduceLROnPlateau)
!apt install -y ffmpeg
# os.chdir("/kaggle/input/freesound-audio-tagging/audio_train")
#os.getcwd()
os.chdir("/kaggle/input/speech-accent-archive/recordings")
import warnings
warnings.filterwarnings('ignore')

OUTPUT_DIR = '/kaggle/working/'


In [6]:
# Play female from Kentucky
fname_f = 'recordings/' + 'english385.mp3'   
ipd.Audio(fname_f)

In [7]:
# Play male from Kentucky
fname_m = 'recordings/' + 'english381.mp3'
ipd.Audio(fname_m)

In [8]:
# MFCC heat map for the female speaker.
SAMPLE_RATE = 22050
fname_f = 'recordings/' + 'english385.mp3'

y, sr = librosa.load(fname_f, sr=SAMPLE_RATE, duration = 20)# Load at most 20 seconds of audio at SAMPLE_RATE
mfcc = librosa.feature.mfcc(y=y, sr=SAMPLE_RATE, n_mfcc = 13)# 13 MFCC coefficients per frame

# Draw the MFCC matrix in the first of three stacked subplot slots.
plt.figure(figsize=(15, 7))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc)
plt.ylabel('MFCC')
plt.colorbar()

In [9]:
# MFCC for male  
SAMPLE_RATE = 22050
fname_m = 'recordings/' + 'english381.mp3'  
y, sr = librosa.load(fname_m, sr=SAMPLE_RATE, duration = 20)
mfcc = librosa.feature.mfcc(y=y, sr=SAMPLE_RATE, n_mfcc = 13)

plt.figure(figsize=(15, 7))
plt.subplot(3,1,1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()

In [10]:
data = pd.read_csv('/kaggle/input/speech-accent-archive/speakers_all.csv')

In [11]:
data.head()

In [12]:
data.tail()

In [13]:
data[data['native_language']=='english']

In [14]:
# Count the rows flagged as having no audio file. The flag is normalised to text
# before comparing so the filter matches whether pandas parsed the column as
# booleans (True) or as strings ('True'); comparing a boolean column directly
# with the string 'True' never matches.
data[data['file_missing?'].astype(str) == 'True'].count()

Let's find the gender class distribution:

In [15]:
data['sex'].value_counts()

In [16]:
res=data['native_language'].value_counts()

In [17]:
res1 =data[data['native_language']!='english'].native_language.value_counts()

In [18]:
res1[res>40].sum()

In [19]:
dg=data.groupby("native_language").filter(lambda x: len(x) >40)

In [97]:
# Unseen data for testing later: every language group that is NOT kept for
# training. The training frame keeps groups with more than 40 speakers, so the
# complement is `<= 40`; a strict `< 40` would silently discard any language
# with exactly 40 speakers from both sets.
test_data = data.groupby("native_language").filter(lambda x: len(x) <= 40)

In [98]:
dg.sex.value_counts()

In [21]:
dg['native_language'].value_counts()

In [22]:
data.shape

In [23]:
dg.shape

In [24]:
# Turn the bare speaker id into the audio file name. Only append the extension
# when it is missing so that re-running this cell does not yield '.mp3.mp3'.
dg['filename'] = dg['filename'].apply(lambda x: x if x.endswith('.mp3') else x+'.mp3')

In [25]:
dg.head()

In [26]:
# Remove the empty spill-over columns. errors='ignore' makes the cell safe to
# re-run: without it a second execution raises KeyError because the columns
# are already gone.
dg.drop(columns=['Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11'], inplace=True, errors='ignore')

In [27]:
dg.head()

In [28]:
dg['accent'] = dg['native_language'].apply(lambda x: 'native' if x=='english' else 'non-native')

In [29]:
dg['accent'].value_counts()

In [30]:
def feature_extractor(files, feature_type='mfcc', sample_rate=22050, duration=10,
                      n_mfcc=10, directory='recordings/'):
    """Compute one feature matrix per audio file.

    Parameters
    ----------
    files : iterable of str
        File names, relative to ``directory``.
    feature_type : str
        Kind of feature to extract. Only ``'mfcc'`` is implemented.
    sample_rate : int
        Rate the audio is loaded at (default 22050, the previous hard-coded value).
    duration : float
        Maximum number of seconds read from each file (default 10).
    n_mfcc : int
        Number of MFCC coefficients per frame (default 10).
    directory : str
        Prefix concatenated in front of every file name (default 'recordings/').

    Returns
    -------
    list
        One ``librosa.feature.mfcc`` result per input file, in input order.

    Raises
    ------
    ValueError
        If ``feature_type`` is not supported. Previously an unknown type
        silently produced an empty list, which hid typos from the caller.
    """
    if feature_type != 'mfcc':
        raise ValueError("Unsupported feature_type: %r (only 'mfcc' is implemented)" % (feature_type,))

    features = []
    for file in files:
        f_name = str(directory + file)
        y, _ = librosa.load(f_name, sr=sample_rate, duration=duration)
        features.append(librosa.feature.mfcc(y=y, sr=sample_rate, n_mfcc=n_mfcc))
    return features

In [31]:
# features = feature_extractor(dg['filename'])

In [32]:
# features[0]

In [33]:
class Config(object):
    """Bundle of audio and training hyper-parameters.

    Besides storing the constructor arguments, two values are derived:

    * ``audio_length`` - number of samples per clip
      (``sampling_rate * audio_duration``).
    * ``dim`` - shape of one feature array, ``(n_mfcc, frames, 1)`` with
      ``frames = 1 + floor(audio_length / 512)``.
    """

    def __init__(self,sampling_rate=16000, audio_duration=2, n_classes=10, learning_rate=0.0001, max_epochs=20, n_mfcc=40):
        # Audio settings.
        self.sampling_rate = sampling_rate
        self.audio_duration = audio_duration
        self.n_mfcc = n_mfcc
        # Training settings.
        self.n_classes = n_classes
        self.learning_rate = learning_rate
        self.max_epochs = max_epochs
        # Derived sizes. The divisor 512 is hard-coded here; presumably it is
        # the hop length used by the MFCC extraction - confirm if that changes.
        self.audio_length = self.sampling_rate * self.audio_duration
        n_frames = int(np.floor(self.audio_length/512)) + 1
        self.dim = (self.n_mfcc, n_frames, 1)

def prepare_data(fnames, config, data_dir):
    """Load audio files and convert each one into a fixed-size MFCC array.

    Parameters
    ----------
    fnames : sequence of str
        File names, joined onto ``data_dir``.
    config : Config
        Supplies ``sampling_rate``, ``audio_length``, ``n_mfcc`` and ``dim``.
    data_dir : str
        Directory that contains the audio files.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(fnames), config.dim[0], config.dim[1], 1)``.

    Notes
    -----
    Clips longer than ``config.audio_length`` are cropped at a random offset
    and shorter clips are zero-padded at a random offset, both drawn from
    NumPy's global random state. Seed it beforehand for reproducible features.
    """
    X = np.empty(shape=(len(fnames), config.dim[0], config.dim[1], 1))
    input_length = config.audio_length
    for i, fname in tqdm_notebook(enumerate(fnames), total=len(fnames)):
        file_path = os.path.join(data_dir, fname)
        data, _ = librosa.core.load(file_path, sr=config.sampling_rate, res_type="kaiser_fast")

        # Bring every clip to exactly `input_length` samples.
        if len(data) > input_length:
            # Too long: keep a random window.
            offset = np.random.randint(len(data) - input_length)
            data = data[offset:(input_length+offset)]
        elif len(data) < input_length:
            # Too short: zero-pad on both sides, split at a random position.
            offset = np.random.randint(input_length - len(data))
            data = np.pad(data, (offset, input_length - len(data) - offset), "constant")
        # Exactly `input_length` samples: nothing to do.

        # The signal must be passed as `y=`: recent librosa releases make it
        # keyword-only, and the other MFCC calls in this notebook already do so.
        data = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        X[i,] = np.expand_dims(data, axis=-1)
    return X


In [34]:
X_fnames = np.array(dg['filename'])
data_path = 'recordings/'

config = Config(sampling_rate=22050, audio_duration=20, learning_rate=0.0001, n_mfcc=13, n_classes=2)

# Feature cache lookup order:
#   1. a pre-computed file shipped as an attached input dataset,
#   2. a file written to the working directory by an earlier run,
#   3. otherwise extract the features and cache them in the working directory.
# The cache is never written under /kaggle/input: per the notebook header that
# tree is read-only, so saving there would fail after the expensive extraction.
precomputed_feature_path = '/kaggle/input/speechdetection/mfcc_features.npy'
feature_file_path = OUTPUT_DIR+'mfcc_features.npy'
if os.path.exists(precomputed_feature_path):
    feature_file_path = precomputed_feature_path

if os.path.exists(feature_file_path):
    X = np.load(feature_file_path)
else:
    X = prepare_data(X_fnames, config, data_path)
    np.save(feature_file_path, X)

In [35]:
# with open(OUTPUT_DIR+'mfcc_features.npy', 'wb+') as f:
#     np.save(f, X)

In [36]:
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Map each string label column to integer class ids and then to one-hot rows.
# One encoder per task is kept so predictions can be decoded later.
# NOTE(review): to_categorical emits one column per distinct label; the models
# below have 2-unit heads, so each column is assumed to hold exactly two
# distinct values - confirm against the value_counts() output above.
encoder = LabelEncoder()
y_gender = encoder.fit_transform(dg['sex'])

encoder1 = LabelEncoder()
y_accent = encoder1.fit_transform(dg['accent'])

y_gender = to_categorical(np.array(y_gender))
y_accent = to_categorical(np.array(y_accent))

Normalization

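Normalization is a crucial preprocessing step. The simplest method is min-max rescaling, which maps each feature to the range [0, 1]. The cell below uses standardization instead: each feature is shifted and scaled to zero mean and unit variance, using statistics computed on the training split only.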

In [37]:
from sklearn.model_selection import train_test_split

# First split: hold out 20% of all samples as the test set.
(X_train, X_test,
 y_gender_train, y_gender_test,
 y_accent_train, y_accent_test) = train_test_split(
    X, y_gender, y_accent, test_size=0.2, random_state=10)
print ('Train set:', X_train.shape,  y_gender_train.shape)
print ('Test set:', X_test.shape,  y_gender_test.shape)

# Second split: carve 15% of the remaining training data off for validation.
(X_train, X_val,
 y_gender_train, y_gender_val,
 y_accent_train, y_accent_val) = train_test_split(
    X_train, y_gender_train, y_accent_train, test_size=0.15, random_state=10)

# Standardise every split with statistics taken from the training split only,
# so no information from validation/test data leaks into the scaling.
# NOTE(review): a feature position with zero variance across the training
# split would divide by zero here - confirm that cannot occur for this data.
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)

X_train, X_val, X_test = [(part - mean)/std for part in (X_train, X_val, X_test)]

In [38]:
print ('Label accent set:', y_accent_train.shape,  y_accent_test.shape)

# Simple Multilayer Feedforward Neural Network

In [39]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense,Dropout,Activation, Flatten, Input, Conv2D, BatchNormalization, MaxPooling2D, LSTM, Concatenate
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
import tensorflow as tf
from keras.regularizers import l2

In [40]:
# Multi-task feed-forward network: one shared trunk feeding two 2-unit heads.
input_layer = Input(shape=(13, 862), name='Input')

# Stage 1. NOTE(review): the per-sample input is still 2-D at this point, so
# presumably each Dense works on the last (862) axis for each of the 13 rows
# and flattening only happens after stage 3 - confirm this is intended.
dense1=Dense(128, activation='relu',name='Hidden_1')(input_layer)
dropout1 = Dropout(0.5)(dense1)

# Stage 2.
dense2=Dense(256, activation='relu',name='Hidden_2')(dropout1)
dropout2 = Dropout(0.5)(dense2)

# Stage 3, then collapse everything into one vector per sample.
dense3=Dense(128, activation='relu',name='Hidden_3')(dropout2)
dropout3 = Dropout(0.5)(dense3)
m = tf.keras.layers.Flatten()(dropout3)

# Task heads; the layer names are the keys used for losses, metrics and targets.
out_accent = Dense(2, activation='sigmoid',name='output_accent')(m)
out_gender = Dense(2, activation='sigmoid',name='output_gender')(m)


model_ffn = Model(inputs = input_layer ,outputs=[out_accent, out_gender])


In [41]:
model_ffn.summary()

In [42]:
model_ffn.compile(optimizer='Adam',loss={'output_accent':'binary_crossentropy','output_gender': 'binary_crossentropy'}, metrics ={'output_accent': 'accuracy', 'output_gender': 'accuracy'})

In [43]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 70
num_batch_size = 32

# Keep only the weights with the lowest validation loss. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one (depending on the Keras
# version it is ignored or rejected), so it is set on EarlyStopping only.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR + 'saved_models/audio_classification_ffn.hdf5', mode='min',
                               verbose=1, save_best_only=True, monitor='val_loss')
# Stop once the validation loss has not improved for 10 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=10)

start = datetime.now()

# Targets are keyed by output-layer name so each head gets its own labels.
history = model_ffn.fit(X_train, {"output_accent": y_accent_train, "output_gender": y_gender_train},validation_data=(X_val,{"output_accent": y_accent_val, "output_gender": y_gender_val}),
                        batch_size=num_batch_size, epochs=num_epochs, callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

### Evaluation

In [44]:
results = model_ffn.evaluate(X_test, {"output_accent": y_accent_test, "output_gender": y_gender_test})
print(results)

In [45]:
# Plot training and validation accuracy values
plt.plot(history.history['output_accent_accuracy'])
plt.plot(history.history['val_output_accent_accuracy'])
plt.plot(history.history['output_gender_accuracy'])
plt.plot(history.history['val_output_gender_accuracy'])
plt.title('Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train(Accent)','Validation(Accent)', 'Train(Gender)','Validation(Gender)'], loc = 'upper left')
plt.show()

# Plot training and validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train','Validation'], loc = 'upper left')
plt.show()

In [255]:
accent_prediction, gender_prediction = model_ffn.predict(X_test)
prediction_accent_rounded = [np.argmax(i) for i in accent_prediction]
# prediction_ANN_rounded[0]
y_test_index = [np.argmax(i) for i in y_accent_test]

In [256]:
from sklearn.metrics import classification_report

print(classification_report(y_test_index, prediction_accent_rounded))

In [48]:
import tensorflow as tf

#Confusion Matrix - verify accuracy of each class
import seaborn as sns
cm = tf.math.confusion_matrix(labels = y_test_index, predictions = prediction_accent_rounded)
plt.figure(figsize = (12,7))
sns.heatmap(cm,annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('True_value')

In [257]:
prediction_gender_rounded = [np.argmax(i) for i in gender_prediction]
# prediction_ANN_rounded[0]
y_test_gender_index = [np.argmax(i) for i in y_gender_test]

In [258]:
cm = tf.math.confusion_matrix(labels = y_test_gender_index, predictions = prediction_gender_rounded)
plt.figure(figsize = (12,7))
sns.heatmap(cm,annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('True_value')

In [259]:
from sklearn.metrics import classification_report

print(classification_report(y_test_gender_index, prediction_gender_rounded))

# CNN

### CNN- Accent classification

In [51]:
def create_cnn_model(input_shape=None):
    """Build a single-task CNN classifier with a 2-unit sigmoid output.

    Parameters
    ----------
    input_shape : tuple or None
        Shape of one input sample. ``None`` (the default) selects
        ``(13, 862, 1)``, the value that used to be hard-coded. Previously the
        argument was accepted but ignored.

    Returns
    -------
    Model
        Uncompiled Keras model: three Conv2D stages (32, 32, 128 filters), each
        followed by BatchNorm -> 2x2 MaxPool -> BatchNorm, then
        Flatten -> Dense(64) -> Dropout(0.5) -> Dense(2, sigmoid) named 'output'.
    """
    if input_shape is None:
        input_shape = (13, 862, 1)
    input_layer = Input(shape=input_shape, name='Input')
    # conv 1
    conv = Conv2D(32, 3, padding='same', activation='relu')(input_layer)
    batchnorm = BatchNormalization()(conv)
    maxpool = MaxPooling2D(pool_size=(2, 2))(batchnorm)
    batchnorm1 = BatchNormalization()(maxpool)
    # conv 2
    conv1 = Conv2D(32, 3, padding='same', activation='relu')(batchnorm1)
    batchnorm2 = BatchNormalization()(conv1)
    maxpool1 = MaxPooling2D(pool_size=(2, 2))(batchnorm2)
    batchnorm3 = BatchNormalization()(maxpool1)
    # conv 3
    conv2 = Conv2D(128, 3, padding='same', activation='relu', name='conv2')(batchnorm3)
    batchnorm4 = BatchNormalization()(conv2)
    maxpool2 = MaxPooling2D(pool_size=(2, 2))(batchnorm4)
    batchnorm5 = BatchNormalization()(maxpool2)
    # classifier head
    flatten = Flatten()(batchnorm5)
    dense = Dense(64, activation='relu',name='Hidden_1')(flatten)
    dropout = Dropout(0.5)(dense)
    out = Dense(2, activation='sigmoid',name='output')(dropout)
    model = Model(inputs = input_layer ,outputs=out)
    return model


In [52]:
model_cnn_accent = create_cnn_model()

In [53]:
model_cnn_accent.summary()

In [54]:
# tf.keras.utils.plot_model(model_cnn_accent, to_file='model.png', show_shapes=True, show_layer_names=False)


In [55]:
model_cnn_accent.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

In [56]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 35
num_batch_size = 32

# Save the weights with the best validation accuracy. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR +'saved_models/accent_classification_cnn.h5',
                               verbose=1, save_best_only=True, mode='max', monitor='val_accuracy')
# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

history = model_cnn_accent.fit(X_train, y_accent_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_val, y_accent_val), callbacks=[checkpointer,early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [57]:
results_accent = model_cnn_accent.evaluate(X_test, y_accent_test)
print(results_accent)

In [58]:
# Plot training and validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train','Validation'], loc = 'upper left')
plt.show()

# Plot training and validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train','Validation'], loc = 'upper left')
plt.show()

### CNN - Gender classification

In [59]:
model_cnn_gender = create_cnn_model()

In [60]:
model_cnn_gender.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')

In [61]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 35
num_batch_size = 32

# Save the weights with the best validation accuracy. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR +'saved_models/gender_classification_cnn.h5',
                               verbose=1, save_best_only=True, mode='max', monitor='val_accuracy')
# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

history_gender = model_cnn_gender.fit(X_train, y_gender_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_val, y_gender_val), callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [62]:
results_gender = model_cnn_gender.evaluate(X_test, y_gender_test)
print(results_gender)

In [63]:
# Plot training and validation accuracy values
plt.plot(history_gender.history['accuracy'])
plt.plot(history_gender.history['val_accuracy'])
plt.title('Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train','Validation'], loc = 'upper left')
plt.show()

# Plot training and validation loss values
plt.plot(history_gender.history['loss'])
plt.plot(history_gender.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train','Validation'], loc = 'upper left')
plt.show()

### Multi-task Learning using ConvNets: Accent and Gender

In [64]:
def create_multitask_cnn():
    """Build a CNN with a shared trunk and two 2-unit sigmoid heads.

    The trunk has the same layout as the single-task CNN in this notebook:
    three Conv2D stages (32, 32, 128 filters), each followed by
    BatchNorm -> 2x2 MaxPool -> BatchNorm, then Flatten -> Dense(64) ->
    Dropout(0.5).

    Returns
    -------
    Model
        Uncompiled Keras model taking ``(13, 862, 1)`` inputs with outputs
        ``[output_accent, output_gender]``.
    """
    input_layer = Input(shape=(13, 862,1), name='Input')
    # conv 1
    conv = Conv2D(32, 3, padding='same', activation='relu')(input_layer)
    batchnorm = BatchNormalization()(conv)
    maxpool = MaxPooling2D(pool_size=(2, 2))(batchnorm)
    batchnorm1 = BatchNormalization()(maxpool)
    # conv 2
    conv1 = Conv2D(32, 3, padding='same', activation='relu')(batchnorm1)
    batchnorm2 = BatchNormalization()(conv1)
    # Pool the output of the second conv stage (batchnorm2). Pooling batchnorm1
    # here, as before, left conv1/batchnorm2 disconnected from the outputs, so
    # the second convolution was silently dropped from the model.
    maxpool1 = MaxPooling2D(pool_size=(2, 2))(batchnorm2)
    batchnorm3 = BatchNormalization()(maxpool1)
    # conv 3
    conv2 = Conv2D(128, 3, padding='same', activation='relu')(batchnorm3)
    batchnorm4 = BatchNormalization()(conv2)
    maxpool2 = MaxPooling2D(pool_size=(2, 2))(batchnorm4)
    batchnorm5 = BatchNormalization()(maxpool2)
    # shared dense representation
    flatten = Flatten()(batchnorm5)
    dense = Dense(64, activation='relu',name='Hidden_1')(flatten)
    dropout = Dropout(0.5)(dense)
    # task heads; the names are the keys used for losses, metrics and targets
    out_accent = Dense(2, activation='sigmoid',name='output_accent')(dropout)
    out_gender = Dense(2, activation='sigmoid',name='output_gender')(dropout)

    model_cnn = Model(inputs = input_layer ,outputs=[out_accent, out_gender])
    return model_cnn

In [65]:
multitask_cnn = create_multitask_cnn()
multitask_cnn.summary()

In [66]:
multitask_cnn.compile(optimizer='Adam',loss={'output_accent':'binary_crossentropy','output_gender': 'binary_crossentropy'}, metrics ={'output_accent': 'accuracy', 'output_gender': 'accuracy'})

In [67]:
## Training my model

num_epochs = 35
num_batch_size = 32

# Keep only the weights with the lowest validation loss. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR + 'saved_models/multitask_classification_cnn.h5',
                               verbose=1, save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=10)

start = datetime.now()

# early_stopping was created above but never handed to fit(), so training
# always ran all epochs; it is now registered like in the other training cells.
history_multitask_cnn = multitask_cnn.fit(X_train, {"output_accent": y_accent_train, "output_gender": y_gender_train},validation_data=(X_val,{"output_accent": y_accent_val, "output_gender": y_gender_val}),
                        batch_size=num_batch_size, epochs=num_epochs, callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [68]:
results_multitask_cnn = multitask_cnn.evaluate(X_test, {"output_accent": y_accent_test, "output_gender": y_gender_test})
print(results_multitask_cnn)

In [250]:
accent_prediction, gender_prediction = multitask_cnn.predict(X_test)
prediction_accent_rounded = [np.argmax(i) for i in accent_prediction]
# prediction_ANN_rounded[0]
y_test_index = [np.argmax(i) for i in y_accent_test]

In [251]:
import tensorflow as tf

#Confusion Matrix - verify accuracy of each class
import seaborn as sns
cm = tf.math.confusion_matrix(labels = y_test_index, predictions = prediction_accent_rounded)
plt.figure(figsize = (12,7))
sns.heatmap(cm,annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('True_value')

In [252]:
prediction_gender_rounded = [np.argmax(i) for i in gender_prediction]
# prediction_ANN_rounded[0]
y_test_gender_index = [np.argmax(i) for i in y_gender_test]
cm = tf.math.confusion_matrix(labels = y_test_gender_index, predictions = prediction_gender_rounded)
plt.figure(figsize = (12,7))
sns.heatmap(cm,annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('True_value')

In [253]:
from sklearn.metrics import classification_report

print(classification_report(y_test_gender_index, prediction_gender_rounded))

In [254]:
print(classification_report(y_test_index, prediction_accent_rounded))

In [69]:
# Plot training and validation accuracy values
plt.plot(history_multitask_cnn.history['output_accent_accuracy'])
plt.plot(history_multitask_cnn.history['val_output_accent_accuracy'])
plt.plot(history_multitask_cnn.history['output_gender_accuracy'])
plt.plot(history_multitask_cnn.history['val_output_gender_accuracy'])
plt.title('Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train(Accent)','Validation(Accent)', 'Train(Gender)','Validation(Gender)'], loc = 'upper left')
plt.show()

# Plot training and validation loss values
plt.plot(history_multitask_cnn.history['loss'])
plt.plot(history_multitask_cnn.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train','Validation'], loc = 'upper left')
plt.show()

# LSTM

### LSTM - Accent classification

In [None]:
def create_LSTM():
    """Build the single-task LSTM classifier.

    Layout: Input(13, 862) -> LSTM(64, sequences) -> LSTM(64) ->
    Dense(64, relu) -> Dropout(0.5) -> Dense(2, sigmoid) named 'output_accent'.

    Returns
    -------
    Model
        Uncompiled Keras model with a single output.
    """
    # NOTE(review): with shape (13, 862) the LSTM presumably steps over the 13
    # MFCC rows and sees 862 values per step; confirm the time axis is the
    # intended one.
    inputs = Input(shape=(13, 862), name='Input')

    sequence = LSTM(64, return_sequences=True)(inputs)
    summary = LSTM(64)(sequence)
    hidden = Dense(64, activation='relu',name='Hidden_1')(summary)
    regularised = Dropout(0.5)(hidden)

    prediction = Dense(2, activation='sigmoid',name='output_accent')(regularised)
    return Model(inputs = inputs ,outputs=prediction)

In [None]:
accent_lstm = create_LSTM()

In [None]:
accent_lstm.compile(optimizer='Adam',loss='binary_crossentropy', metrics =['accuracy'])

In [None]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 35
num_batch_size = 32

# Save the weights with the best validation accuracy. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR +'saved_models/accent_classification.h5',
                               verbose=1, save_best_only=True, mode='max', monitor='val_accuracy')
# Stop once the validation loss has not improved for 7 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=7)

start = datetime.now()

history_accent_lstm = accent_lstm.fit(X_train, y_accent_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_val, y_accent_val), callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [None]:
results_accent_lstm = accent_lstm.evaluate(X_test, y_accent_test)
print(results_accent_lstm)

### LSTM multitask

In [None]:
def create_LSTM_multi():
    """Build the multi-task LSTM classifier.

    Layout: Input(13, 862) -> LSTM(100) -> Dropout(0.25), followed by two
    Dense(2, sigmoid) heads named 'output_accent' and 'output_gender'.

    Returns
    -------
    Model
        Uncompiled Keras model with outputs ``[output_accent, output_gender]``.
    """
    inputs = Input(shape=(13, 862), name='Input')

    # Shared recurrent representation used by both heads.
    shared = Dropout(0.25)(LSTM(100)(inputs))

    # Task heads; the names are the keys used for losses, metrics and targets.
    accent_head = Dense(2, activation='sigmoid',name='output_accent')(shared)
    gender_head = Dense(2, activation='sigmoid',name='output_gender')(shared)

    return Model(inputs = inputs ,outputs=[accent_head, gender_head])

In [None]:
multitask_lstm = create_LSTM_multi()

In [None]:
multitask_lstm.compile(optimizer='Adam',loss={'output_accent':'binary_crossentropy','output_gender': 'binary_crossentropy'}, metrics ={'output_accent': 'accuracy', 'output_gender': 'accuracy'})

In [None]:
## Training my model

num_epochs = 35
num_batch_size = 32

# Keep only the weights with the lowest validation loss. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR + 'saved_models/multitask_classification_lstm.h5',
                               verbose=1, save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=10)

start = datetime.now()

# early_stopping was created above but never handed to fit(), so training
# always ran all epochs; it is now registered like in the other training cells.
history_multitask_lstm = multitask_lstm.fit(X_train, {"output_accent": y_accent_train, "output_gender": y_gender_train},validation_data=(X_val,{"output_accent": y_accent_val, "output_gender": y_gender_val}),
                        batch_size=num_batch_size, epochs=num_epochs, callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [None]:
results_multitask_lstm = multitask_lstm.evaluate(X_test, {"output_accent": y_accent_test, "output_gender": y_gender_test})
print(results_multitask_lstm)

# CRNN

### CRNN - Accent classification

In [75]:
# Import from tensorflow.keras like the rest of the notebook. The previous
# `keras.layers.core` / `keras.layers.wrappers` module paths are not available
# in current Keras releases, and they rebound `Dense` to a different package
# than the one the other layers and `Sequential` come from.
from tensorflow.keras.layers import Bidirectional, Dense, Permute, Reshape


def create_CRNN_model(input_shape, config, is_training=True):
    """Build a single-task convolutional-recurrent classifier.

    Layout: three Conv2D stages (32, 32, 128 filters, 3x3, 'same' padding),
    each followed by BatchNorm and 2x2 MaxPool; the feature map is then
    transposed so its second axis comes first, the remaining two axes are
    merged, and the sequence is fed to a bidirectional LSTM(256) and a
    Dense(2, sigmoid) output.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, e.g. ``(13, 862, 1)``.
    config : object
        Unused; kept so existing calls keep working.
    is_training : bool
        Unused; kept so existing calls keep working.

    Returns
    -------
    Sequential
        Uncompiled Keras model.
    """
    model = Sequential()

    for stage, filters in enumerate((32, 32, 128)):
        # Only the first layer of a Sequential model declares the input shape.
        extra = {'input_shape': input_shape} if stage == 0 else {}
        model.add(Conv2D(filters, 3, activation="relu", padding='same', **extra))
        model.add(BatchNormalization())
        model.add(MaxPooling2D(pool_size=(2, 2)))

    # (bs, y, x, c) --> (bs, x, y, c)
    model.add(Permute((2, 1, 3)))

    # (bs, x, y, c) --> (bs, x, y * c); sizes are read from the model so the
    # reshape follows whatever `input_shape` was given.
    _, x, y, c = model.output_shape
    model.add(Reshape((x, y*c)))

    model.add(Bidirectional(LSTM(256, return_sequences=False), merge_mode="concat"))
    model.add(Dense(2, activation="sigmoid"))

    return model

In [76]:
crnn_model = create_CRNN_model((13,862,1), config)

In [77]:
crnn_model.compile(optimizer='Adam',loss='binary_crossentropy', metrics =['accuracy'])

In [78]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 35
num_batch_size = 32

# Save the weights with the best validation accuracy. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR +'saved_models/accent_crnn_classification.h5',
                               verbose=1, save_best_only=True, mode='max', monitor='val_accuracy')
# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

# NOTE(review): this stores the CRNN history under the name used by the LSTM
# section and overwrites it. The name is kept so later cells keep working;
# consider renaming it to `history_accent_crnn`.
history_accent_lstm = crnn_model.fit(X_train, y_accent_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_val, y_accent_val), callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [79]:
# Reload the best checkpoint written during training and score it on the test
# split. The loader comes from tensorflow.keras (the package the model was
# built and saved with) and the path is derived from OUTPUT_DIR, matching the
# checkpoint callback.
from tensorflow.keras.models import load_model
crnn_accent_model = load_model(OUTPUT_DIR + 'saved_models/accent_crnn_classification.h5')
results_accent_crnn = crnn_accent_model.evaluate(X_test, y_accent_test)
print(results_accent_crnn)

### CRNN - Gender Classification 

In [80]:
crnn_model_gender = create_CRNN_model((13,862,1), config)

In [81]:
crnn_model_gender.compile(optimizer='Adam',loss='binary_crossentropy', metrics =['accuracy'])

In [82]:
## Training my model

# Save the weights with the best validation accuracy. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR +'saved_models/gender_crnn_classification.h5',
                               verbose=1, save_best_only=True, mode='max', monitor='val_accuracy')
# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

# num_batch_size and num_epochs are the values left by the previous training cell.
history_gender_crnn = crnn_model_gender.fit(X_train, y_gender_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_val, y_gender_val), callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [83]:
# Reload the best gender checkpoint and score it on the test split. The loader
# comes from tensorflow.keras and the path is derived from OUTPUT_DIR, matching
# the checkpoint callback.
from tensorflow.keras.models import load_model
crnn_gender_model = load_model(OUTPUT_DIR + 'saved_models/gender_crnn_classification.h5')
results_agender_crnn = crnn_gender_model.evaluate(X_test, y_gender_test)
print(results_agender_crnn)

### Multitask Learning- CRNN

In [214]:
def create_CRNN_multitask_model(input_shape=(13, 862, 1)):
    """Build a convolutional-recurrent network with accent and gender heads.

    Layout: three Conv2D stages (32, 32, 128 filters, 3x3, 'same' padding),
    each followed by BatchNorm and 2x2 MaxPool; the feature map is transposed
    so its second axis comes first, the remaining two axes are merged, and the
    sequence feeds a bidirectional LSTM(256) shared by two Dense(2, sigmoid)
    heads.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample. Defaults to ``(13, 862, 1)``, the value
        that used to be hard-coded.

    Returns
    -------
    Model
        Uncompiled Keras model with outputs ``[output_accent, output_gender]``.
    """
    input_layer = Input(shape=input_shape, name='Input')
    # conv 1
    conv = Conv2D(32, 3, padding='same', activation='relu')(input_layer)
    batchnorm = BatchNormalization()(conv)
    maxpool = MaxPooling2D(pool_size=(2, 2))(batchnorm)
    # conv 2
    conv1 = Conv2D(32, 3, padding='same', activation='relu')(maxpool)
    batchnorm2 = BatchNormalization()(conv1)
    maxpool1 = MaxPooling2D(pool_size=(2, 2))(batchnorm2)
    # conv 3
    conv2 = Conv2D(128, 3, padding='same', activation='relu')(maxpool1)
    batchnorm4 = BatchNormalization()(conv2)
    maxpool2 = MaxPooling2D(pool_size=(2, 2))(batchnorm4)

    # (bs, y, x, c) --> (bs, x, y, c)
    permute = Permute((2, 1, 3))(maxpool2)
    # (bs, x, y, c) --> (bs, x, y * c). The sizes are read from the tensor
    # instead of being hard-coded; for the default input this is (107, 128).
    _, x, y, c = tuple(permute.shape)
    reshaped = Reshape((x, y * c))(permute)

    lstm = Bidirectional(LSTM(256, return_sequences=False))(reshaped)

    # Task heads; the names are the keys used for losses, metrics and targets.
    out_accent = Dense(2, activation='sigmoid',name='output_accent')(lstm)
    out_gender = Dense(2, activation='sigmoid',name='output_gender')(lstm)

    model = Model(inputs = input_layer ,outputs=[out_accent, out_gender])

    return model

In [215]:
multitask_crnn = create_CRNN_multitask_model()

In [216]:
multitask_crnn.summary()

In [217]:
multitask_crnn.compile(optimizer='Adam',loss={'output_accent':'binary_crossentropy','output_gender': 'binary_crossentropy'}, metrics ={'output_accent': 'accuracy', 'output_gender': 'accuracy'})

In [218]:
## Training my model

num_epochs = 35
num_batch_size = 32

# Keep only the weights with the lowest validation loss. `patience` is an
# EarlyStopping option, not a ModelCheckpoint one, so it is not passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR + 'saved_models/multitask_classification_crnn.h5',
                               verbose=1, save_best_only=True, monitor='val_loss', mode='min')
# Stop once the validation loss has not improved for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

# Targets are keyed by output-layer name so each head gets its own labels.
history_multitask_crnn = multitask_crnn.fit(X_train, {"output_accent": y_accent_train, "output_gender": y_gender_train},validation_data=(X_val,{"output_accent": y_accent_val, "output_gender": y_gender_val}),
                        batch_size=num_batch_size, epochs=num_epochs, callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [219]:
from keras.models import load_model

# Reload the saved multitask CRNN checkpoint and evaluate it on the test split.
crnn_multi_model = load_model('/kaggle/working/saved_models/multitask_classification_crnn.h5')
results_multitask_crnn = crnn_multi_model.evaluate(
    X_test,
    dict(output_accent=y_accent_test, output_gender=y_gender_test),
)
print(results_multitask_crnn)

In [220]:
import tensorflow as tf
import seaborn as sns

# Predict both heads on the test set; this cell analyses the accent head.
accent_prediction, gender_prediction = crnn_multi_model.predict(X_test)

# Reduce every row of scores / labels to the index of its largest entry.
prediction_accent_rounded = list(map(np.argmax, accent_prediction))
y_test_index = list(map(np.argmax, y_accent_test))

#Confusion Matrix - verify accuracy of each class
cm = tf.math.confusion_matrix(labels=y_test_index, predictions=prediction_accent_rounded)

plt.figure(figsize=(12, 7))
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('True_value')




In [221]:
# Reduce every row of gender scores / labels to the index of its largest entry.
prediction_gender_rounded = list(map(np.argmax, gender_prediction))
y_test_gender_index = list(map(np.argmax, y_gender_test))

# Confusion matrix for the gender head, drawn as an annotated heatmap.
cm = tf.math.confusion_matrix(labels=y_test_gender_index, predictions=prediction_gender_rounded)

plt.figure(figsize=(12, 7))
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Prediction')
plt.ylabel('True_value')

In [222]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 report for the gender predictions.
print(classification_report(y_test_gender_index, prediction_gender_rounded))

In [223]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 report for the accent predictions.
print(classification_report(y_test_index, prediction_accent_rounded))

# Testing All the models

Prepare test samples

In [224]:
# Keep only the rows where the 'file_missing?' column is False. The explicit
# copy makes test_data an independent frame, so later column assignments do
# not write to a slice of the unfiltered frame.
test_data = test_data[test_data['file_missing?'] == False].copy()

In [225]:
# Preview the first rows of test_data.
test_data.head()

In [160]:
# Count how many rows there are for each value of 'native_language'.
test_data['native_language'].value_counts()

In [161]:
# Show the first 20 rows whose 'sex' column equals 'female'.
test_data[test_data['sex']=='female'].head(20)

In [133]:
# Append the '.mp3' extension to every filename. The endswith guard keeps the
# cell idempotent: running it a second time no longer yields 'name.mp3.mp3'.
test_data['filename'] = test_data['filename'].apply(
    lambda x: x if x.endswith('.mp3') else x + '.mp3')

In [162]:
# One-element list of file names used as the female, non-native test sample.
f_female_non_native = ['afrikaans1.mp3']

In [163]:
# Feature-extraction settings for the single-sample tests, then the features
# of the female, non-native sample.
data_path = 'recordings/'
config = Config(
    sampling_rate=22050,
    audio_duration=20,
    learning_rate=0.0001,
    n_mfcc=13,
    n_classes=2,
)
X_female_non_native = prepare_data(f_female_non_native, config, data_path)

In [164]:
# Play afrikaans1.mp3, the recording listed in f_female_non_native
# (the variable is still called fname_m).
fname_m = 'recordings/' + 'afrikaans1.mp3'
ipd.Audio(fname_m)

In [173]:
# Expected gender target for the female, non-native sample: encode 'female'
# and expand the code with to_categorical (2 classes).
a = encoder.transform(['female'])
y_female_non_native_gender = to_categorical(np.array(a), num_classes=2)
print(y_female_non_native_gender)

In [204]:
# Show the code that encoder1 assigns to 'non-native'.
encoder1.transform(['non-native'])

In [239]:
# Show the code that encoder assigns to 'female'.
encoder.transform(['female'])

# Label Encoding:
Female - 0
Male  - 1

Native - 0
Non-native - 1

In [174]:
# Expected accent target for the female, non-native sample: encode
# 'non-native' and expand the code with to_categorical (2 classes).
a = encoder1.transform(['non-native'])
y_female_non_native_accent = to_categorical(np.array(a), num_classes=2)
print(y_female_non_native_accent)

In [246]:
# Predict on the female, non-native sample with crnn_multi_model.
pred = crnn_multi_model.predict(X_female_non_native)

In [247]:
# pred[0] is printed as the accent prediction and pred[1] as the gender
# prediction, each next to its expected label (shown as X). The labels were
# previously swapped: the prediction was printed as "X" and the expected
# label as "Predicted".
print("Accent Predicted=%s, X=%s" % (pred[0], y_female_non_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred[1], y_female_non_native_gender[0]))

In [186]:
# Same sample through cnn_multi_model. Each prediction is printed first,
# followed by the expected label as X (the two labels were previously swapped).
pred1 = cnn_multi_model.predict(X_female_non_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_female_non_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_female_non_native_gender[0]))

Afrikaans male speaker (non-native accent)

In [228]:
# File list and extracted features for the male, non-native test sample.
f_male_non_native = ['afrikaans4.mp3']
# Fixed: the features were previously extracted from f_female_non_native, so
# the "male" sample was actually the female recording.
X_male_non_native = prepare_data(f_male_non_native, config, data_path)

In [229]:
# Expected gender target for the male, non-native sample: encode 'male' and
# expand the code with to_categorical (2 classes).
a = encoder.transform(['male'])
y_male_non_native_gender = to_categorical(np.array(a), num_classes=2)
print(y_male_non_native_gender)

In [230]:
# Expected accent target for the male, non-native sample: encode 'non-native'
# and expand the code with to_categorical (2 classes).
a = encoder1.transform(['non-native'])
y_male_non_native_accent = to_categorical(np.array(a), num_classes=2)
print(y_male_non_native_accent)

In [231]:
from keras.models import load_model
# Load the saved multitask CNN checkpoint and predict on the male, non-native sample.
cnn_multi_model = load_model('/kaggle/working/saved_models/multitask_classification_cnn.h5')
pred = cnn_multi_model.predict(X_male_non_native)

In [232]:
# Each prediction is printed first, followed by the expected label as X
# (the two labels were previously swapped).
print("Accent Predicted=%s, X=%s" % (pred[0], y_male_non_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred[1], y_male_non_native_gender[0]))

In [249]:
# Male, non-native sample through crnn_multi_model. Each prediction is printed
# first, followed by the expected label as X (the labels were previously swapped).
pred1 = crnn_multi_model.predict(X_male_non_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_male_non_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_male_non_native_gender[0]))

English Male

In [187]:
# Relative path to the English male sample.
# NOTE(review): `file` is not referenced by the cells visible here; the following
# cell passes the directory and file name to prepare_data directly.
file = '../input/common-voice-samples/English-male.mp3'

In [234]:
# Extract features for the male, native sample from the common-voice-samples directory.
X_male_native = prepare_data(['English-male.mp3'], config, '/kaggle/input/common-voice-samples')

In [235]:
# Expected gender target for the male, native sample: encode 'male' and expand
# the code with to_categorical (2 classes).
a = encoder.transform(['male'])
y_male_native_gender = to_categorical(np.array(a), num_classes=2)
print(y_male_native_gender)

In [236]:
# Expected accent target for the male, native sample: encode 'native' and
# expand the code with to_categorical (2 classes).
a = encoder1.transform(['native'])
y_male_native_accent = to_categorical(np.array(a), num_classes=2)
print(y_male_native_accent)

In [237]:
# Male, native sample through crnn_multi_model. Each prediction is printed
# first, followed by the expected label as X (the labels were previously swapped).
pred1 = crnn_multi_model.predict(X_male_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_male_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_male_native_gender[0]))

In [196]:
# Male, native sample through cnn_multi_model. Each prediction is printed
# first, followed by the expected label as X (the labels were previously swapped).
pred1 = cnn_multi_model.predict(X_male_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_male_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_male_native_gender[0]))

US female

In [None]:
# Absolute path of the US female sample. The leading '/' was missing, which
# made the path relative to the current working directory; it now matches the
# directory passed to prepare_data in the following cell.
filename = '/kaggle/input/common-voice/cv-other-dev/cv-other-dev/sample-001204.mp3'

In [198]:
# Extract features for the female, native sample from the cv-other-dev directory.
X_female_native = prepare_data(['sample-001204.mp3'], config, '/kaggle/input/common-voice/cv-other-dev/cv-other-dev')

In [199]:
# Expected gender target for the female, native sample.
a = encoder.transform(['female'])
y_female_native_gender = to_categorical(np.array(a), num_classes=2)
print(y_female_native_gender)

# Expected accent target for the same sample.
a = encoder1.transform(['native'])
y_female_native_accent = to_categorical(np.array(a), num_classes=2)
print(y_female_native_accent)

In [238]:
# Female, native sample through both multitask models.
# Fixed: the predictions were compared against the *male* native labels; they
# are now printed next to y_female_native_accent / y_female_native_gender.
# Each prediction is printed first, followed by the expected label as X.
pred1 = crnn_multi_model.predict(X_female_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_female_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_female_native_gender[0]))

pred1 = cnn_multi_model.predict(X_female_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_female_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_female_native_gender[0]))

In [263]:
from keras.models import load_model
# Load the single-task CNN checkpoints (one for accent, one for gender) and
# print each prediction followed by the expected label (X).
cnn_accent_model = load_model('/kaggle/working/saved_models/accent_classification_cnn.h5')
cnn_gender_model = load_model('/kaggle/working/saved_models/gender_classification_cnn.h5')
pred1 = cnn_accent_model.predict(X_female_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_female_native_accent[0]))
pred1 = cnn_accent_model.predict(X_male_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_male_native_accent[0]))
pred1 = cnn_accent_model.predict(X_male_non_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_male_non_native_accent[0]))
# Fixed: this sample was sent through ffn_multi_model, a different model that
# is only loaded in a later cell; it now uses the accent CNN like the others.
pred1 = cnn_accent_model.predict(X_female_non_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_female_non_native_accent[0]))
# Fixed: the gender model's output was labelled "Accent Predicted".
pred1 = cnn_gender_model.predict(X_female_native)
print("Gender Predicted=%s, X=%s" % (pred1[0], y_female_native_gender[0]))

In [245]:
from keras.models import load_model
# Load the single-task CRNN checkpoints and run both on the female, native
# sample. Each prediction is printed first, followed by the expected label
# as X (the two labels were previously swapped).
crnn_accent_model = load_model('/kaggle/working/saved_models/accent_crnn_classification.h5')
crnn_gender_model = load_model('/kaggle/working/saved_models/gender_crnn_classification.h5')
pred1 = crnn_accent_model.predict(X_female_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_female_native_accent[0]))
pred1 = crnn_gender_model.predict(X_female_native)
print("Gender Predicted=%s, X=%s" % (pred1[0], y_female_native_gender[0]))

In [267]:
# Load the saved FFN checkpoint and run it on the four test samples. pred1[0]
# is printed as the accent prediction and pred1[1] as the gender prediction,
# each followed by the expected label as X (the labels were previously swapped).
ffn_multi_model = load_model('/kaggle/working/saved_models/audio_classification_ffn.hdf5')
# Female, Native
pred1 = ffn_multi_model.predict(X_female_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_female_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_female_native_gender[0]))
# Male, Native
pred1 = ffn_multi_model.predict(X_male_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_male_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_male_native_gender[0]))
# Male, Non-native
pred1 = ffn_multi_model.predict(X_male_non_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_male_non_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_male_non_native_gender[0]))

# Female, Non-native
pred1 = ffn_multi_model.predict(X_female_non_native)
print("Accent Predicted=%s, X=%s" % (pred1[0], y_female_non_native_accent[0]))
print("Gender Predicted=%s, X=%s" % (pred1[1], y_female_non_native_gender[0]))

# Multifeature: MFCC + Chroma

In [101]:
def prepare_data_multiple(fnames, config, data_dir):
    """Load audio files and extract MFCC and chroma features for each one.

    Every clip is brought to exactly ``config.audio_length`` samples before
    feature extraction: longer clips are cropped at a random offset, shorter
    ones are zero-padded with a random amount of leading padding.

    Args:
        fnames: Sequence of audio file names, relative to ``data_dir``.
        config: Object providing ``dim``, ``audio_length``, ``sampling_rate``
            and ``n_mfcc``.
        data_dir: Directory that contains the audio files.

    Returns:
        Tuple ``(X_mfcc, X_chroma)`` of arrays shaped
        ``(len(fnames), config.dim[0], config.dim[1], 1)`` and
        ``(len(fnames), 12, config.dim[1], 1)``.
    """
    n_chroma = 12  # number of rows chroma_cqt returns with its default settings
    X_mfcc = np.empty(shape=(len(fnames), config.dim[0], config.dim[1], 1))
    X_chroma = np.empty(shape=(len(fnames), n_chroma, config.dim[1], 1))
    print(X_chroma.shape)
    input_length = config.audio_length
    for i, fname in tqdm_notebook(enumerate(fnames), total=len(fnames)):
        file_path = os.path.join(data_dir, fname)
        data, _ = librosa.load(file_path, sr=config.sampling_rate, res_type="kaiser_fast")

        # Random offset / Padding
        n_samples = len(data)
        if n_samples > input_length:
            offset = np.random.randint(n_samples - input_length)
            data = data[offset:(input_length + offset)]
        elif n_samples < input_length:
            offset = np.random.randint(input_length - n_samples)
            data = np.pad(data, (offset, input_length - n_samples - offset), "constant")
        # A clip that already has exactly input_length samples is used as is.

        # The audio is passed by keyword (`y=`): recent librosa releases no
        # longer accept it positionally.
        data_mfcc = librosa.feature.mfcc(y=data, sr=config.sampling_rate, n_mfcc=config.n_mfcc)
        X_mfcc[i,] = np.expand_dims(data_mfcc, axis=-1)

        data_chroma = librosa.feature.chroma_cqt(y=data, sr=config.sampling_rate)
        X_chroma[i,] = np.expand_dims(data_chroma, axis=-1)
    return X_mfcc, X_chroma

In [102]:
# Settings and inputs for the MFCC + chroma feature extraction.
data_path = 'recordings/'
feature_file_path = OUTPUT_DIR + 'mfcc_features.npy'
X_fnames = np.array(dg['filename'])

config = Config(
    sampling_rate=22050,
    audio_duration=20,
    learning_rate=0.0001,
    n_mfcc=12,
    n_classes=2,
)

# Extract both feature sets for every file name in X_fnames.
X_mfcc, X_chroma = prepare_data_multiple(X_fnames, config, data_path)

In [103]:
from sklearn.model_selection import train_test_split

# Split MFCC features, chroma features and both label sets in ONE call so all
# four arrays are guaranteed to share the same row permutation. Previously the
# alignment relied on repeating the call with an identical random_state.
(X_train_mfcc, X_test_mfcc,
 X_train_chroma, X_test_chroma,
 y_gender_train, y_gender_test,
 y_accent_train, y_accent_test) = train_test_split(
    X_mfcc, X_chroma, y_gender, y_accent, test_size=0.2, random_state=10)
print ('Train set:', X_train_mfcc.shape,  y_gender_train.shape)
print ('Test set:', X_test_mfcc.shape,  y_gender_test.shape)

# Carve the validation split out of the training portion, again in one call.
(X_train_mfcc, X_val_mfcc,
 X_train_chroma, X_val_chroma,
 y_gender_train, y_gender_val,
 y_accent_train, y_accent_val) = train_test_split(
    X_train_mfcc, X_train_chroma, y_gender_train, y_accent_train,
    test_size=0.15, random_state=10)

# Standardise each feature set with statistics computed on the training split
# only. Positions whose std is exactly 0 are divided by 1 instead, so they
# cannot turn into NaN/inf.
mean = np.mean(X_train_mfcc, axis=0)
std = np.std(X_train_mfcc, axis=0)
std = np.where(std == 0, 1.0, std)

X_train_mfcc = (X_train_mfcc - mean)/std
X_val_mfcc = (X_val_mfcc - mean)/std
X_test_mfcc = (X_test_mfcc - mean)/std

mean = np.mean(X_train_chroma, axis=0)
std = np.std(X_train_chroma, axis=0)
std = np.where(std == 0, 1.0, std)

X_train_chroma = (X_train_chroma - mean)/std
X_val_chroma= (X_val_chroma - mean)/std
X_test_chroma = (X_test_chroma - mean)/std

In [104]:
# Display the shape of the chroma training array.
X_train_chroma.shape

In [105]:
# Display the shape of the MFCC training array.
X_train_mfcc.shape

In [14]:
def create_CRNN_multi_model(input_shape, is_training=True):
    """Build a two-input (MFCC + chroma) CRNN with accent and gender outputs.

    Each input goes through its own stack of three
    Conv2D(32) -> BatchNormalization -> MaxPooling2D blocks. The two feature
    maps are concatenated on the last axis, rearranged into a sequence and fed
    to a bidirectional LSTM whose output drives two 2-unit sigmoid heads.

    Args:
        input_shape: Shape of one sample, used for both inputs. It was
            previously ignored in favour of a hard-coded (12, 862, 1); for
            that shape the resulting model is unchanged.
        is_training: Unused; kept so existing callers keep working.

    Returns:
        An uncompiled Keras ``Model`` with inputs named 'Input' and 'Input2'
        and outputs named 'output_accent' and 'output_gender'.
    """

    def conv_branch(tensor):
        # Three identical conv -> batch-norm -> 2x2 max-pool blocks.
        for _ in range(3):
            tensor = Conv2D(32, 3, padding='same', activation='relu')(tensor)
            tensor = BatchNormalization()(tensor)
            tensor = MaxPooling2D(pool_size=(2, 2))(tensor)
        return tensor

    input_layer_mfcc = Input(shape=input_shape, name='Input')
    input_layer_chroma = Input(shape=input_shape, name='Input2')

    concat = Concatenate()([conv_branch(input_layer_mfcc),
                            conv_branch(input_layer_chroma)])

    # (bs, y, x, c) --> (bs, x, y, c)
    permute = Permute((2, 1, 3))(concat)

    # (bs, x, y, c) --> (bs, x, y * c); the sizes are read from the tensor
    # instead of being hard-coded, so they follow input_shape.
    steps, height, channels = (int(d) for d in permute.shape[1:])
    reshaped = Reshape((steps, height * channels))(permute)

    lstm = Bidirectional(LSTM(256, return_sequences=False))(reshaped)

    out_accent = Dense(2, activation='sigmoid',name='output_accent')(lstm)
    out_gender = Dense(2, activation='sigmoid',name='output_gender')(lstm)

    model = Model(inputs = [input_layer_mfcc, input_layer_chroma] ,outputs=[out_accent, out_gender])

    return model

In [15]:
# Build the two-input multitask CRNN.
multitask_crnn_2 = create_CRNN_multi_model((12,862,1))

In [16]:
# Print the layer-by-layer summary of the two-input CRNN.
multitask_crnn_2.summary()

In [168]:
# Both output heads use binary cross-entropy as loss and accuracy as metric.
multitask_crnn_2.compile(
    optimizer='Adam',
    loss=dict(output_accent='binary_crossentropy', output_gender='binary_crossentropy'),
    metrics=dict(output_accent='accuracy', output_gender='accuracy'),
)

In [169]:
## Training my model
# Fit the two-input CRNN, saving only the checkpoint with the lowest validation
# loss and stopping early once validation loss has not improved for 5 epochs.
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

num_epochs = 35
num_batch_size = 32

# ModelCheckpoint has no `patience` option (that setting belongs to
# EarlyStopping), so it is no longer passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR + 'saved_models/multitask_multiinput_classification_crnn.h5',
                               verbose=1, save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

history_multitask_crnn2 = multitask_crnn_2.fit(
    {"Input": X_train_mfcc, "Input2": X_train_chroma},
    {"output_accent": y_accent_train, "output_gender": y_gender_train},
    validation_data=({"Input": X_val_mfcc, "Input2": X_val_chroma},
                     {"output_accent": y_accent_val, "output_gender": y_gender_val}),
    batch_size=num_batch_size, epochs=num_epochs,
    callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [170]:
from keras.models import load_model

# Reload the saved two-input CRNN checkpoint and evaluate it on the test split.
crnn_multi_model = load_model('/kaggle/working/saved_models/multitask_multiinput_classification_crnn.h5')
results_multitask_crnn = crnn_multi_model.evaluate(
    dict(Input=X_test_mfcc, Input2=X_test_chroma),
    dict(output_accent=y_accent_test, output_gender=y_gender_test),
)
print(results_multitask_crnn)

FFN

In [17]:
def ffn_multiinputs():
    """Build the two-input feed-forward multitask model.

    Each (12, 862) input runs through its own stack of three Dense+Dropout(0.5)
    stages (128, 256, 128 units). The two branch outputs are concatenated,
    flattened and fed to two 2-unit sigmoid heads.

    Returns:
        An uncompiled Keras ``Model`` with inputs 'Input' and 'Input2' and
        outputs 'output_accent' and 'output_gender'.
    """
    input_layer_mfcc = Input(shape=(12, 862), name='Input')
    input_layer_chroma = Input(shape=(12, 862), name='Input2')

    def dense_stack(tensor, suffix):
        # Layer names are 'Hidden_<k>' plus the branch suffix.
        for units, base_name in ((128, 'Hidden_1'), (256, 'Hidden_2'), (128, 'Hidden_3')):
            tensor = Dense(units, activation='relu', name=base_name + suffix)(tensor)
            tensor = Dropout(0.5)(tensor)
        return tensor

    # The MFCC branch is built first, then the chroma branch.
    mfcc_features = dense_stack(input_layer_mfcc, '')
    chroma_features = dense_stack(input_layer_chroma, '1')

    merged = Concatenate()([mfcc_features, chroma_features])
    flat = tf.keras.layers.Flatten()(merged)

    out_accent = Dense(2, activation='sigmoid', name='output_accent')(flat)
    out_gender = Dense(2, activation='sigmoid', name='output_gender')(flat)

    return Model(inputs=[input_layer_mfcc, input_layer_chroma],
                 outputs=[out_accent, out_gender])

In [18]:
# Build the two-input feed-forward model.
model_ffn = ffn_multiinputs()

In [19]:
# Print the layer-by-layer summary of the feed-forward model.
model_ffn.summary()

In [180]:
# Both output heads use binary cross-entropy as loss and accuracy as metric.
model_ffn.compile(
    optimizer='Adam',
    loss=dict(output_accent='binary_crossentropy', output_gender='binary_crossentropy'),
    metrics=dict(output_accent='accuracy', output_gender='accuracy'),
)

In [181]:
## Training my model
# Fit the two-input FFN, saving only the checkpoint with the lowest validation
# loss and stopping early once validation loss has not improved for 5 epochs.
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

num_epochs = 35
num_batch_size = 32

# ModelCheckpoint has no `patience` option (that setting belongs to
# EarlyStopping), so it is no longer passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR + 'saved_models/multitask_multiinput_classification_ffn.h5',
                               verbose=1, save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

# NOTE: the history is stored under the same name the CRNN training cell uses.
history_multitask_crnn2 = model_ffn.fit(
    {"Input": X_train_mfcc, "Input2": X_train_chroma},
    {"output_accent": y_accent_train, "output_gender": y_gender_train},
    validation_data=({"Input": X_val_mfcc, "Input2": X_val_chroma},
                     {"output_accent": y_accent_val, "output_gender": y_gender_val}),
    batch_size=num_batch_size, epochs=num_epochs,
    callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [182]:
from keras.models import load_model

# Reload the saved two-input FFN checkpoint and evaluate it on the test split.
# NOTE(review): the model is bound to the name crnn_multi_model, which other
# cells use for CRNN checkpoints.
crnn_multi_model = load_model('/kaggle/working/saved_models/multitask_multiinput_classification_ffn.h5')
results_multitask_crnn = crnn_multi_model.evaluate(
    dict(Input=X_test_mfcc, Input2=X_test_chroma),
    dict(output_accent=y_accent_test, output_gender=y_gender_test),
)
print(results_multitask_crnn)

### CNN with multi inputs

In [194]:
def create_cnn_model2():
    """Build the two-input (MFCC + chroma) multitask CNN.

    Each (12, 862, 1) input passes through three
    Conv2D -> BatchNormalization -> MaxPooling2D -> BatchNormalization stages
    with 16, 16 and 128 filters. The two branches are concatenated, flattened
    and fed to two 2-unit sigmoid heads.

    Returns:
        An uncompiled Keras ``Model`` with inputs 'Input' and 'Input2' and
        outputs 'output_accent' and 'output_gender'.
    """

    def conv_branch(tensor):
        # Every stage pools the normalised output of its OWN convolution.
        # Previously the second pooling layer was fed from the first stage's
        # output, which left the second convolution disconnected from the
        # model in both branches.
        for filters in (16, 16, 128):
            tensor = Conv2D(filters, 3, padding='same', activation='relu')(tensor)
            tensor = BatchNormalization()(tensor)
            tensor = MaxPooling2D(pool_size=(2, 2))(tensor)
            tensor = BatchNormalization()(tensor)
        return tensor

    # Create CNN model
    input_layer_mfcc = Input(shape=(12, 862,1), name='Input')
    input_layer_chroma = Input(shape=(12, 862,1), name='Input2')

    concat = Concatenate()([conv_branch(input_layer_mfcc),
                            conv_branch(input_layer_chroma)])

    # flatten
    flatten = Flatten()(concat)
    # output
    out_accent = Dense(2, activation='sigmoid',name='output_accent')(flatten)
    out_gender = Dense(2, activation='sigmoid',name='output_gender')(flatten)
    model = Model(inputs = [input_layer_mfcc,input_layer_chroma] ,outputs=[out_accent, out_gender])
    return model

In [195]:
# Build the CNN returned by create_cnn_model2.
multinput_cnn_multi = create_cnn_model2()

In [196]:
# Both output heads use binary cross-entropy as loss and accuracy as metric.
multinput_cnn_multi.compile(
    optimizer='Adam',
    loss=dict(output_accent='binary_crossentropy', output_gender='binary_crossentropy'),
    metrics=dict(output_accent='accuracy', output_gender='accuracy'),
)

In [197]:
## Training my model
# Fit the two-input CNN, saving only the checkpoint with the lowest validation
# loss and stopping early once validation loss has not improved for 5 epochs.
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

num_epochs = 35
num_batch_size = 32

# ModelCheckpoint has no `patience` option (that setting belongs to
# EarlyStopping), so it is no longer passed here.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR + 'saved_models/multitask_multiinput_classification_cnn.h5',
                               verbose=1, save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

history_multinput_cnn_multi = multinput_cnn_multi.fit(
    {"Input": X_train_mfcc, "Input2": X_train_chroma},
    {"output_accent": y_accent_train, "output_gender": y_gender_train},
    validation_data=({"Input": X_val_mfcc, "Input2": X_val_chroma},
                     {"output_accent": y_accent_val, "output_gender": y_gender_val}),
    batch_size=num_batch_size, epochs=num_epochs,
    callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)

In [198]:
from keras.models import load_model

# Reload the saved two-input CNN checkpoint and evaluate it on the test split.
# NOTE(review): the model is bound to the name crnn_multi_model, which other
# cells use for CRNN checkpoints.
crnn_multi_model = load_model('/kaggle/working/saved_models/multitask_multiinput_classification_cnn.h5')
results_multitask_crnn = crnn_multi_model.evaluate(
    dict(Input=X_test_mfcc, Input2=X_test_chroma),
    dict(output_accent=y_accent_test, output_gender=y_gender_test),
)
print(results_multitask_crnn)

In [184]:
def create_cnn_model2():
    """Build a single-input, single-output CNN that uses chroma features only.

    The (12, 862, 1) input named 'Input2' passes through three
    Conv2D -> BatchNormalization -> MaxPooling2D -> BatchNormalization stages
    with 32, 64 and 128 filters, followed by Flatten, Dense(64, relu),
    Dropout(0.5) and a 2-unit sigmoid layer named 'output'.

    NOTE: this definition reuses the name ``create_cnn_model2`` and therefore
    replaces the two-input version defined earlier in the notebook once this
    cell has run.

    Returns:
        An uncompiled Keras ``Model``.
    """
    # Create CNN model
    input_layer_chroma = Input(shape=(12, 862,1), name='Input2')

    # Every stage pools the normalised output of its OWN convolution.
    # Previously the second pooling layer was fed from the first stage's
    # output, which left the 64-filter convolution disconnected from the model.
    features = input_layer_chroma
    for filters in (32, 64, 128):
        features = Conv2D(filters, 3, padding='same', activation='relu')(features)
        features = BatchNormalization()(features)
        features = MaxPooling2D(pool_size=(2, 2))(features)
        features = BatchNormalization()(features)

    # flatten
    flatten = Flatten()(features)
    dense = Dense(64, activation='relu',name='Hidden_1')(flatten)
    dropout = Dropout(0.5)(dense)
    # output
    out = Dense(2, activation='sigmoid',name='output')(dropout)
    model = Model(inputs = input_layer_chroma ,outputs=out)
    return model

In [None]:
# Build the CNN returned by create_cnn_model2; it is trained on the accent labels below.
multinput_cnn_accent = create_cnn_model2()

In [None]:
# Single-output model: binary cross-entropy loss, accuracy metric.
multinput_cnn_accent.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
## Training my model
# Fit the chroma-only accent CNN, saving only the checkpoint with the lowest
# validation loss and stopping early once validation loss has not improved
# for 5 epochs.
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

num_epochs = 35
num_batch_size = 32

# Fixed: the checkpoint used to be written to
# 'multitask_multiinput_classification_ffn.h5', the file the FFN training cell
# saves to, so training this model would have overwritten the FFN checkpoint.
# ModelCheckpoint also has no `patience` option, so it is no longer passed.
checkpointer = ModelCheckpoint(filepath=OUTPUT_DIR + 'saved_models/accent_classification_chroma_cnn.h5',
                               verbose=1, save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)

start = datetime.now()

# The model only has the 'Input2' (chroma) input, so only chroma features are
# fed; the MFCC array passed before matched no model input.
history_multitask_crnn2 = multinput_cnn_accent.fit(
    {"Input2": X_train_chroma}, y_accent_train,
    validation_data=({"Input2": X_val_chroma}, y_accent_val),
    batch_size=num_batch_size, epochs=num_epochs,
    callbacks=[checkpointer, early_stopping], verbose=1)


duration = datetime.now() - start
print("Training completed in time: ", duration)