<a href="https://colab.research.google.com/github/atick-faisal/Crowd-Emotion/blob/main/src/tl-generic/Spectrogram_TL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import time
import json
import joblib
import librosa
import datetime

import numpy as np
import seaborn as sns
import soundfile as sf
import matplotlib.pyplot as plt

import tensorflow as tf

from scipy.signal import spectrogram
from sklearn.metrics import classification_report, confusion_matrix

from matplotlib.mlab import specgram
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
from matplotlib.figure import Figure

# Seed TensorFlow's global random generator to make runs more repeatable.
tf.random.set_seed(42)

# Show the TensorFlow version this notebook is running against.
tf.__version__

'2.4.1'

In [2]:
# Run configuration. Besides driving dataset generation and training, this dict
# is filled in with results near the end of the notebook and appended to
# LOG_FILE as JSON.
CONFIG = {
    # Time at which this cell was executed.
    'timestamp'             : str(datetime.datetime.now()),
    # Free-text description of the experiment.
    'model'                 : 'Transfer Learning on MobileNet',
    # Backbone name checked in TransferLearning.__init__.
    'base_model'            : 'mobile_net_v2',
    # Input shape given to the backbone and to the Keras Input layer.
    'input_shape'           : (160, 160, 3),
    # Fold of Atick's dataset held out of training; the split cell uses it as
    # the validation set (val_X / val_y).
    'test_fold'             : 'Fold 5',
    # Samples per audio frame; audio is loaded at sr=44100, so one second.
    'frame_length'          : 44100,
    # Samples between the starts of consecutive frames (a quarter of a frame).
    'frame_inc'             : 11025,
    # Figure size in inches for the rendered spectrogram.
    # NOTE(review): 160x160 px presumably assumes a figure dpi of 72 - confirm.
    'fig_size'              : (2.23, 2.23), # 160x160
    # Filled later with the lines of model.summary().
    'architecture'          : '',
    'batch_size'            : 32,
    # Upper bound on training epochs; overwritten later by the results cell.
    'epochs'                : 300,
    'learning_rate'         : 0.0001,
    # Quantity watched by the EarlyStopping callback, and its patience in epochs.
    'monitor'               : 'val_loss',
    'patience'              : 10,
    # Passed to model.fit as class_weight; keys are label ids (indices into EMOTIONS).
    'class_weight'          : { 0: 0.71, 1:  2.32, 2: 0.86 },
    # Wall-clock durations in seconds, filled later from time.time() differences.
    'training_time'         : 0,
    'testing_time'          : 0,
    # Confusion matrices (cm_*) and classification reports (cr_*), filled later.
    'cm_atick'              : '',
    'cr_atick'              : '',
    'cm_valentina'          : '',
    'cr_valentina'          : ''
}

In [3]:
# generate_dataset joins BASE_DIR in front of the dataset directory; since the
# dataset directories below are absolute paths, os.path.join discards BASE_DIR.
BASE_DIR            = os.getcwd()
# File that the JSON-serialized CONFIG (including results) is appended to.
LOG_FILE            = '/content/drive/MyDrive/Research/Crowd Emotion Logs/tl_generic.txt'
# Fold sub-directory names; a fold is identified by its index in this list.
FOLDS               = ['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5']
# Class names; a label is the index of the emotion in this list.
EMOTIONS            = ['Approval', 'Disapproval', 'Neutral']

# Root directories of the two extracted datasets.
ATICK_DATA_DIR      = '/content/Dataset-Atick/'
VALENTINA_DATA_DIR  = '/content/Dataset-Valentina/'

# Lower and upper frequency limits (Hz) of the spectrogram y-axis.
RANGE               = np.array([20, 20000])

# Passed to generate_dataset as data_len, i.e. the number of frames to
# pre-allocate for Atick's dataset.
ATICK_DATASET_LEN   = 13806
# NOTE(review): presumably the frame count of Valentina's dataset; it is not
# referenced by any visible cell - confirm.
VAL_DATASET_LEN     = 9484

In [4]:
%%time

# # ----------------- Loading my dataset -------------------
# !mkdir /content/Dataset-Atick/
# !gdown --id '1zGfANn9GKi9OUMbMehkfzk_Pvhc-b68N'
# !tar -xf /content/Atick_CE_Dataset.tar.xz -C /content/Dataset-Atick/

# # ----------------- Loading Valentina's dataset -------------------
# !mkdir /content/Dataset-Valentina/
# !gdown --id '11tC2Nmie9v3ljo60oQJ3sN1rVkorV-N1'
# !tar -xf /content/Valentina_CE_Dataset.tar.xz -C /content/Dataset-Valentina/


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs


## Log Spectrogram

In [5]:
# def generate_dataset(data_len, data_dir):
#     X = np.zeros((data_len, 160, 160, 3), dtype=np.uint8)
#     y = np.zeros((data_len, 1), dtype=np.uint8)
#     f = np.zeros((data_len, ), dtype=np.uint8)

#     count = 0

#     for emotion in EMOTIONS:
#         emo_path = os.path.join(BASE_DIR, data_dir, emotion)

#         print('processing data for ' + emotion + ' ... ')

#         for fold in ['']:
#             fold_path = os.path.join(emo_path, fold)
#             files = os.listdir(fold_path)

#             print('processing data for ' + fold, end=' ... ')
            
#             for filename in files:
#                 wav, sr = librosa.load(
#                     path            = os.path.join(fold_path, filename),
#                     sr              = 44100,
#                     mono            = True
#                 )
#                 # wav, sr = sf.read(os.path.join(fold_path, filename))
#                 n = len(wav)
#                 idx = 0
#                 while idx < (n - CONFIG['frame_length']):
#                     frame = wav[idx:(idx + CONFIG['frame_length'])]
#                     idx = idx + CONFIG['frame_inc']

#                     fig = Figure(figsize=CONFIG['fig_size'])
#                     canvas = FigureCanvas(fig)
#                     ax = fig.gca()

#                     ax.axis('off')
#                     fig.tight_layout(pad=0)
#                     ax.margins(0)

#                     ax.specgram(
#                         x               = frame,
#                         Fs              = 44100,
#                         window          = np.hamming(400),
#                         NFFT            = 400,
#                         cmap            = 'jet',
#                         noverlap        = 200,
#                         mode            = 'psd',
#                         scale           = 'dB',
#                         detrend         = None,
#                         scale_by_freq   = True,
#                         vmin            = -160,
#                         vmax            = -25,
#                         interpolation   = 'hamming'      
#                     )
#                     ax.set_yscale('symlog')
#                     ax.set_ylim(RANGE)

#                     fig.canvas.draw()
#                     img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
#                     img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))

#                     X[count, :] = img
#                     y[count, 0] = EMOTIONS.index(emotion)
#                     # f[count] = FOLDS.index(fold)

#                     count = count + 1

#             print('√')

#     return X, y, f

## MEL Spectrogram

In [4]:
def generate_dataset(data_len, data_dir, folds=None):
    """Render every audio frame of a dataset as a mel-spectrogram image.

    The directory layout read is ``<data_dir>/<emotion>/<fold>/<file>``. Each
    file is loaded as mono audio at 44.1 kHz and cut into overlapping frames of
    ``CONFIG['frame_length']`` samples, advancing ``CONFIG['frame_inc']``
    samples per frame. Every frame is drawn off-screen as a mel-scaled power
    spectrogram and stored as an RGB image.

    Args:
        data_len: Number of frames to pre-allocate. It must be at least the
            number of frames the dataset yields; rows that are never filled
            stay all-zero (image, label 0 and fold 0).
        data_dir: Dataset root containing one sub-directory per emotion.
        folds: Fold sub-directory names to read inside each emotion directory.
            Defaults to ``FOLDS``. Pass ``['']`` for a dataset whose audio
            files sit directly inside the emotion directories.

    Returns:
        Tuple ``(X, y, f)``:
            X: uint8 array of shape (data_len, 160, 160, 3) with the images.
            y: uint8 array of shape (data_len, 1) with indices into EMOTIONS.
            f: uint8 array of shape (data_len,) holding, for every frame, the
               position of its fold in ``folds``.

    Raises:
        IndexError: If the dataset yields more than ``data_len`` frames.
    """
    if folds is None:
        folds = FOLDS

    X = np.zeros((data_len, 160, 160, 3), dtype=np.uint8)
    y = np.zeros((data_len, 1), dtype=np.uint8)
    # Fold index per frame. The spectrogram frequency vector below must NOT
    # reuse this name, otherwise the fold labels are lost.
    f = np.zeros((data_len, ), dtype=np.uint8)

    frame_length = CONFIG['frame_length']
    frame_inc = CONFIG['frame_inc']

    # y-axis limits converted from Hz to mel (loop invariant).
    mel_range = 2595 * np.log10(1 + (RANGE / 700))

    # Floor applied to the power before taking the log so that silent bins do
    # not become -inf. It corresponds to -200 dB, which is below vmin and thus
    # maps to the same colour as any other value under vmin.
    power_floor = 1e-20

    count = 0

    for emotion in EMOTIONS:
        emo_path = os.path.join(BASE_DIR, data_dir, emotion)
        label = EMOTIONS.index(emotion)

        print('processing data for ' + emotion + ' ... ')

        for fold_idx, fold in enumerate(folds):
            fold_path = os.path.join(emo_path, fold)
            # Sorted so that the sample order does not depend on the file system.
            files = sorted(os.listdir(fold_path))

            print('processing data for ' + fold, end=' ... ')

            for filename in files:
                wav, sr = librosa.load(
                    path            = os.path.join(fold_path, filename),
                    sr              = 44100,
                    mono            = True
                )
                n = len(wav)
                idx = 0
                while idx < (n - frame_length):
                    if count >= data_len:
                        raise IndexError(
                            'dataset yields more than data_len=' + str(data_len)
                            + ' frames; increase data_len'
                        )

                    frame = wav[idx:(idx + frame_length)]
                    idx = idx + frame_inc

                    # Off-screen figure; attaching an Agg canvas is what makes
                    # fig.canvas.draw() / tostring_rgb() available.
                    fig = Figure(figsize=CONFIG['fig_size'])
                    FigureCanvas(fig)
                    ax = fig.gca()

                    ax.axis('off')
                    fig.tight_layout(pad=0)
                    ax.margins(0)

                    power, freqs, times = specgram(
                        x               = frame,
                        NFFT            = 400,
                        Fs              = 44100,
                        window          = np.hamming(400),
                        noverlap        = 200,
                        scale_by_freq   = True,
                        mode            = 'psd'
                    )

                    # Hz -> mel; the 0 Hz bin is nudged off zero first, as in
                    # the original implementation.
                    freqs[0] = 1e-10
                    mel_freqs = 2595 * np.log10(1 + (freqs / 700))

                    # Power in dB.
                    Z = 10. * np.log10(np.maximum(power, power_floor))

                    ax.pcolormesh(times, mel_freqs, Z, cmap='jet', vmin=-160,
                                  vmax=-25, shading='gouraud')
                    ax.axis('auto')
                    ax.set_ylim(mel_range)

                    # Rasterize and read the canvas back as (height, width, 3).
                    fig.canvas.draw()
                    img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
                    img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))

                    X[count, :] = img
                    y[count, 0] = label
                    f[count] = fold_idx

                    count = count + 1

            print('√')

    return X, y, f

In [None]:
%%time
# Render Atick's dataset into images (X_AF), labels (y_AF) and the third
# array returned by generate_dataset (f_AF). The joblib.dump lines below are
# disabled; they would cache the arrays on Google Drive.
X_AF, y_AF, f_AF = generate_dataset(ATICK_DATASET_LEN, ATICK_DATA_DIR)
# joblib.dump(X_AF, '/content/drive/MyDrive/Research/Crowd Emotion Dataset v4/X_MEL_AF.joblib')
# joblib.dump(y_AF, '/content/drive/MyDrive/Research/Crowd Emotion Dataset v4/y_AF.joblib')
# joblib.dump(f_AF, '/content/drive/MyDrive/Research/Crowd Emotion Dataset v4/f_AF.joblib')

processing data for Approval ... 
processing data for Fold 1 ... 

In [None]:
class TransferLearning():
    """Image classifier built from a frozen ImageNet backbone plus a small head.

    The head is global average pooling, dropout (0.2) and a dense layer that
    outputs one logit per class. Only the head is trainable.
    """

    def __init__(self, config):
        """Read the hyper-parameters and build the frozen backbone.

        Args:
            config: Mapping with the keys 'base_model', 'input_shape',
                'learning_rate', 'batch_size', 'epochs', 'monitor',
                'patience' and 'class_weight'.

        Raises:
            ValueError: If ``config['base_model']`` is not a supported
                backbone name (currently only 'mobile_net_v2').
        """
        base_model_name = config['base_model']
        self.input_shape = config['input_shape']
        self.learning_rate = config['learning_rate']
        self.batch_size = config['batch_size']
        self.epochs = config['epochs']
        self.monitor = config['monitor']
        self.patience = config['patience']
        self.class_weight = config['class_weight']

        # Both are created by fit().
        self.model = None
        self.callbacks = None

        # Fail early: without this check an unknown name would leave the
        # instance half-initialized and only break later inside fit().
        if base_model_name != 'mobile_net_v2':
            raise ValueError(
                "unsupported base_model " + repr(base_model_name)
                + "; expected 'mobile_net_v2'"
            )

        self.base_model = tf.keras.applications.MobileNetV2(
            input_shape     = self.input_shape,
            include_top     = False,
            weights         = 'imagenet'
        )
        self.base_model.trainable = False
        self.preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

    def __init_model(self, num_classes):
        """Assemble and compile the full model with ``num_classes`` logits."""
        inputs = tf.keras.Input(shape=self.input_shape)
        x = self.preprocess(inputs)
        # training=False keeps the backbone in inference mode.
        x = self.base_model(x, training=False)
        x = tf.keras.layers.GlobalAveragePooling2D()(x)
        x = tf.keras.layers.Dropout(0.2)(x)
        # No activation: the model outputs logits (see from_logits=True below).
        outputs = tf.keras.layers.Dense(num_classes)(x)
        self.model = tf.keras.Model(inputs, outputs)
        self.model.compile(
            # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
            optimizer      = tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            loss           = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics        = ['accuracy']
        )

    def __init_callbacks(self):
        """Create the training callbacks (early stopping on ``self.monitor``)."""
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor        = self.monitor,
            patience       = self.patience,
            verbose        = 1
        )
        self.callbacks = [
            early_stopping,
        ]

    def fit(self, train_X, train_y, val_X, val_y):
        """Build a fresh model and train it.

        The number of classes is the number of distinct values in ``train_y``.

        Args:
            train_X: Training images.
            train_y: Integer class labels for ``train_X``.
            val_X: Validation images.
            val_y: Integer class labels for ``val_X``.

        Returns:
            Tuple ``(model, history)`` with the trained Keras model and the
            History object returned by ``model.fit``.
        """
        num_classes = np.unique(train_y).shape[0]
        self.__init_model(num_classes)
        self.__init_callbacks()
        history = self.model.fit(
            x                 = train_X,
            y                 = train_y,
            batch_size        = self.batch_size,
            epochs            = self.epochs,
            verbose           = 1,
            validation_data   = (val_X, val_y),
            shuffle           = True,
            callbacks         = self.callbacks,
            class_weight      = self.class_weight
        )

        return self.model, history

    def evaluate(self, test_X, test_y):
        """Return sklearn's classification report for the given test data.

        Args:
            test_X: Test images.
            test_y: Integer class labels; flattened before scoring.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.model is None:
            raise RuntimeError('fit() must be called before evaluate()')

        # Softmax is appended so that predictions are probabilities.
        prob_model = tf.keras.Sequential([self.model, tf.keras.layers.Softmax()])
        y_pred_hot = prob_model.predict(test_X)
        y_pred = np.argmax(y_pred_hot, axis=1)
        return classification_report(test_y.ravel(), y_pred)

In [None]:
# Load the cached arrays of Atick's dataset from Google Drive.
# NOTE(review): the (disabled) dump in the generation cell writes the images
# to 'X_MEL_AF.joblib', whereas 'X_AF.joblib' is loaded here - confirm which
# feature set this run is meant to use.
# joblib.load unpickles its input, so only load files from a trusted source.
X_AF = joblib.load('/content/drive/MyDrive/Research/Crowd Emotion Dataset v4/X_AF.joblib')
y_AF = joblib.load('/content/drive/MyDrive/Research/Crowd Emotion Dataset v4/y_AF.joblib')
f_AF = joblib.load('/content/drive/MyDrive/Research/Crowd Emotion Dataset v4/f_AF.joblib')

# Samples whose fold index equals CONFIG['test_fold'] become the validation
# set; all remaining samples are used for training.
mask      = (f_AF == FOLDS.index(CONFIG['test_fold']))
train_X   = X_AF[~mask, :]
train_y   = y_AF[~mask, :]
val_X     = X_AF[mask, :]
val_y     = y_AF[mask, :]

# Drop the reference to the full image array now that it has been split.
del X_AF

# Valentina's dataset is used as the test set.
test_X = joblib.load('/content/drive/MyDrive/Research/Crowd Emotion Dataset v4/X_VAL.joblib')
test_y = joblib.load('/content/drive/MyDrive/Research/Crowd Emotion Dataset v4/y_VAL.joblib')

In [None]:
%%time
# Build and train the model, timing backbone construction plus training.
start_time = time.time()

tl = TransferLearning(CONFIG)
model, history = tl.fit(train_X, train_y, val_X, val_y)

# Elapsed wall-clock seconds; stored in CONFIG['training_time'] later.
training_time = time.time() - start_time


In [None]:
%%time

# ---------------- Testing on Valentina's data -------------------

start_time = time.time()

# Returns the loss followed by the metrics the model was compiled with
# (accuracy only).
loss, accuracy = model.evaluate(test_X, test_y, batch_size=CONFIG['batch_size'])

print("Loss: ", loss)
print("Accuracy: ", accuracy)

# Elapsed wall-clock seconds; stored in CONFIG['testing_time'] later.
testing_time = time.time() - start_time

In [None]:
# Training vs. validation loss per epoch.
# Each curve is plotted explicitly against history.epoch. The previous single
# call plt.plot(x, loss, val_loss) only drew val_loss correctly because
# matplotlib falls back to an implicit 0..N-1 x-axis for a lone third argument.
metrics = history.history
plt.plot(history.epoch, metrics['loss'], label='loss')
plt.plot(history.epoch, metrics['val_loss'], label='val_loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# ------------------ Testing on Atick's data -----------------
# Evaluated on the held-out fold (val_X / val_y) of Atick's dataset.

y_true = val_y.ravel()
# The model outputs logits; argmax over them gives the predicted class id.
y_pred = np.argmax(model.predict(val_X), axis=1)

# Text report from sklearn; logged later as CONFIG['cr_atick'].
result_atick = classification_report(y_true, y_pred)
print(result_atick)

In [None]:
# confusion_mtx_atick = tf.math.confusion_matrix(y_true, y_pred) 
# Row-normalized confusion matrix in whole percent (each row is a true class).
# Values are rounded to the nearest integer; a plain astype('int') would
# truncate, e.g. turn 99.9 % into 99 %.
confusion_mtx_atick = np.rint(
    confusion_matrix(y_true, y_pred, normalize='true') * 100
).astype('int')
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx_atick, xticklabels=EMOTIONS, yticklabels=EMOTIONS, 
            annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()

In [None]:
# ------------------ Testing on Valentina's data -----------------
# Evaluated on the separate test set (test_X / test_y).

y_true = test_y.ravel()
# The model outputs logits; argmax over them gives the predicted class id.
y_pred = np.argmax(model.predict(test_X), axis=1)

# Text report from sklearn; logged later as CONFIG['cr_valentina'].
result_valentina = classification_report(y_true, y_pred)
print(result_valentina)

In [None]:
# confusion_mtx_valentina = tf.math.confusion_matrix(y_true, y_pred) 
# Row-normalized confusion matrix in whole percent (each row is a true class).
# Values are rounded to the nearest integer; a plain astype('int') would
# truncate, e.g. turn 99.9 % into 99 %.
confusion_mtx_valentina = np.rint(
    confusion_matrix(y_true, y_pred, normalize='true') * 100
).astype('int')
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx_valentina, xticklabels=EMOTIONS, yticklabels=EMOTIONS, 
            annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()

In [None]:
# Store the results of this run in CONFIG so that they are logged together
# with the hyper-parameters.

# model.summary() hands each line of the summary to print_fn; collect them.
summary = []
model.summary(print_fn=lambda x: summary.append(x))
CONFIG['architecture'] = summary

# history.epoch holds 0-based epoch indices, so the number of epochs that
# actually ran is its length, not its maximum (which is one less).
# Note: this replaces the configured upper bound on epochs.
CONFIG['epochs'] = len(history.epoch)
CONFIG['training_time'] = training_time
CONFIG['testing_time'] = testing_time

# Confusion matrices are stored as text, reports as lists of lines, so that
# json.dumps can serialize them.
CONFIG['cm_atick'] = np.array2string(confusion_mtx_atick)
result_list_atick = result_atick.split('\n')
CONFIG['cr_atick'] = result_list_atick
CONFIG['cm_valentina'] = np.array2string(confusion_mtx_valentina)
result_list_valentina = result_valentina.split('\n')
CONFIG['cr_valentina'] = result_list_valentina

In [None]:
# Serialize the configuration and results as indented JSON and show it.
config = json.dumps(CONFIG, indent=4)
print(config)

In [None]:
# Append this run's JSON record to the log file, surrounded by blank lines.
# The context manager closes the file even if a write fails.
with open(LOG_FILE, 'a') as log_file:
    log_file.write('\n')
    log_file.write(config)
    log_file.write('\n')

In [None]:
# Preview: render the first frame of one sample file as a spectrogram with a
# symlog frequency axis and display it.
wav, sr = librosa.load(
    path='/content/Dataset-Valentina/Approval/appl0000.wav',
    sr=44100,
    mono=True,
)
n = len(wav)
idx = 0

# Only the first frame is rendered, and only if the clip is longer than one frame.
if idx < (n - CONFIG['frame_length']):
    frame = wav[idx:(idx + CONFIG['frame_length'])]
    idx = idx + CONFIG['frame_inc']

    # Off-screen figure backed by an Agg canvas.
    fig = Figure(figsize=CONFIG['fig_size'])
    canvas = FigureCanvas(fig)
    ax = fig.gca()

    # No axes decorations and no padding: the spectrogram fills the image.
    ax.axis('off')
    fig.tight_layout(pad=0)
    ax.margins(0)

    spec, f1, t1, _ = ax.specgram(
        x=frame,
        NFFT=400,
        Fs=44100,
        window=np.hamming(400),
        noverlap=200,
        detrend=None,
        mode='psd',
        scale='dB',
        scale_by_freq=True,
        cmap='jet',
        vmin=-160,
        vmax=-25,
    )
    ax.set_yscale('symlog')
    ax.set_ylim(RANGE)

    # Rasterize and read the canvas back as a (height, width, 3) uint8 array.
    fig.canvas.draw()
    rgb_bytes = fig.canvas.tostring_rgb()
    img = np.frombuffer(rgb_bytes, dtype=np.uint8)
    img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))

plt.figure(figsize=(7, 7))
plt.imshow(img)
plt.show()

In [None]:
# Preview: render the first frame of one sample file as a mel-scaled
# spectrogram (the same rendering steps as generate_dataset) and display it.
wav, sr = librosa.load(
    path            = '/content/Dataset-Valentina/Approval/appl0000.wav',
    sr              = 44100,
    mono            = True
)
n = len(wav)
idx = 0
# The loop body ends with `break`, so at most the first frame is rendered.
while idx < (n - CONFIG['frame_length']):
    frame = wav[idx:(idx + CONFIG['frame_length'])]
    idx = idx + CONFIG['frame_inc']

    # Off-screen figure backed by an Agg canvas.
    fig = Figure(figsize=CONFIG['fig_size'])
    canvas = FigureCanvas(fig)
    ax = fig.gca()

    ax.axis('off')
    fig.tight_layout(pad=0)
    ax.margins(0)

    S, f, t = specgram(
        x               = frame,
        NFFT            = 400,
        Fs              = 44100,
        window          = np.hamming(400),
        noverlap        = 200,
        scale_by_freq   = True,
        mode            = 'psd'
    )

    # f, t, S = spectrogram(
    #     x               = frame,
    #     fs              = 44100,
    #     window          = np.hamming(400),
    #     nperseg         = 400,
    #     noverlap        = 200,
    #     nfft            = 400
    # )

    # Nudge the 0 Hz bin off zero, then convert frequencies from Hz to mel.
    f[0] = 1e-10
    # f = np.log10(f)
    f = 2595 * np.log10(1 + (f / 700))
    # Named mel_range rather than `range`: at notebook top level, `range`
    # would replace the builtin for every cell executed afterwards.
    mel_range = 2595 * np.log10(1 + (RANGE / 700))

    # Power in dB.
    Z = 10. * np.log10(S)
    # Z = np.flipud(Z)

    # Extent is only needed by the commented-out imshow alternative below.
    pad_xextent = (400 - 200) / 44100 / 2
    xextent = np.min(t) - pad_xextent, np.max(t) + pad_xextent

    xmin, xmax = xextent
    extent = xmin, xmax, f[0], f[-1]

    # ax.imshow(Z, extent=extent, cmap='jet', origin='upper')
    ax.pcolormesh(t, f, Z, cmap='jet', vmin=-160, vmax=-25, shading='gouraud')

    ax.axis('auto')

    # ax.set_yscale('symlog')
    ax.set_ylim(mel_range)

    # Rasterize and read the canvas back as a (height, width, 3) uint8 array.
    fig.canvas.draw()
    img = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    img = img.reshape(fig.canvas.get_width_height()[::-1] + (3,))

    break

plt.figure(figsize=(7, 7))
plt.imshow(img)
plt.show()

In [None]:
# Shape of the last rendered preview image, as (height, width, 3).
img.shape

In [None]:
# Frequency vector returned by ax.specgram in the log-spectrogram preview cell.
f1

In [None]:
# Time vector returned by ax.specgram in the log-spectrogram preview cell.
# The original referenced `t2`, which is never defined in this notebook and
# raised NameError; `t1` is the array returned alongside `f1`.
t1

In [None]:
# log10(0) evaluates to -inf (NumPy emits a divide-by-zero RuntimeWarning);
# this is what zero-power spectrogram bins turn into when converted to dB.
np.log10(0)