In [1]:
import gc
import os
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

from shared.utils import *
from shared.local_path import *

from collections import Counter
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Data Preprocessing
## Audio Data

In [2]:
label = []
filepath = []

for classes in os.listdir(AUDIO_PATH):
    for file in os.listdir(os.path.join(AUDIO_PATH, classes)):
        file_path = os.path.join(AUDIO_PATH, classes, file)

        filepath.append(file_path)
        label.append(classes)

audio = pd.DataFrame({
    'filepath': filepath,
    'label': label,
})

del filepath, label

audio.head()

Unnamed: 0,filepath,label
0,E:\\Skripsi\data\audio\angry\03-01-05-01-01-01...,angry
1,E:\\Skripsi\data\audio\angry\03-01-05-01-01-01...,angry
2,E:\\Skripsi\data\audio\angry\03-01-05-01-01-01...,angry
3,E:\\Skripsi\data\audio\angry\03-01-05-01-01-01...,angry
4,E:\\Skripsi\data\audio\angry\03-01-05-01-01-01...,angry


In [3]:
le = LabelEncoder()
le.fit(audio['label'])
audio['label_encoded'] = le.transform(audio['label'])
num_classes = len(le.classes_)

In [4]:
with tf.device('/GPU:0'):
    audio['data'] = audio.filepath.apply(preprocess_audio)
    audio = audio.sample(frac=1).reset_index(drop=True)



In [5]:
x_audio = np.stack(audio['data'].values)
y_audio = audio['label_encoded'].values
print(x_audio.shape)
del audio

(27406, 128, 110, 3)


In [6]:
x_audio_train, x_temp, y_audio_train, y_temp = train_test_split(
    x_audio, y_audio, test_size=0.3, random_state=100,
)

x_audio_val, x_audio_test, y_audio_val, y_audio_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=222,
)

del x_audio, y_audio
print(x_audio_train.shape)
print(x_audio_train.shape[0] == len(y_audio_train))

(19184, 128, 110, 3)
True


In [7]:
print(x_audio_train.shape[0] == len(y_audio_train))

True


## Image Data

In [8]:
filepath = []
label = []

i = 0
for classes in os.listdir(TRAIN_IMAGE_PATH):
    for file in os.listdir(os.path.join(TRAIN_IMAGE_PATH, classes)):
        path = os.path.join(TRAIN_IMAGE_PATH, classes, file)
        filepath.append(path)
        label.append(classes)

train_df = pd.DataFrame({
    'filepath': filepath,
    'label': label
})

del filepath, label

print(train_df.shape)
print(train_df['label'].unique())
train_df.head()

(33228, 2)
['angry' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']


Unnamed: 0,filepath,label
0,E:\\Skripsi\data\new_data\train\angry\angry_0_...,angry
1,E:\\Skripsi\data\new_data\train\angry\angry_0_...,angry
2,E:\\Skripsi\data\new_data\train\angry\angry_0_...,angry
3,E:\\Skripsi\data\new_data\train\angry\angry_0_...,angry
4,E:\\Skripsi\data\new_data\train\angry\angry_0_...,angry


In [9]:
filepath = []
label = []

i = 0
for classes in os.listdir(TEST_IMAGE_PATH):
    for file in os.listdir(os.path.join(TEST_IMAGE_PATH, classes)):
        path = os.path.join(TEST_IMAGE_PATH, classes, file)
        filepath.append(path)
        label.append(classes)

img_test_df = pd.DataFrame({
    'filepath': filepath,
    'label': label
})

del filepath, label

print(img_test_df.shape)
print(img_test_df['label'].unique())
img_test_df.head()

(7311, 2)
['angry' 'disgust' 'fear' 'happy' 'neutral' 'sad' 'surprise']


Unnamed: 0,filepath,label
0,E:\\Skripsi\data\new_data\test\angry\Anger.jpg,angry
1,E:\\Skripsi\data\new_data\test\angry\Anger_1.jpg,angry
2,E:\\Skripsi\data\new_data\test\angry\Anger_10.jpg,angry
3,E:\\Skripsi\data\new_data\test\angry\Anger_11.jpg,angry
4,E:\\Skripsi\data\new_data\test\angry\Anger_12.jpg,angry


In [10]:
le = LabelEncoder()
le.fit(train_df['label'])
train_df['label_encoded'] = le.transform(train_df['label'])
img_test_df['label_encoded'] = le.transform(img_test_df['label'])

In [11]:
with tf.device('/GPU:0'):
    train_df['data'] = train_df['filepath'].apply(lambda x: preprocess_image(x, (100, 100)))
    img_test_df['data'] = img_test_df['filepath'].apply(lambda x: preprocess_image(x, (100, 100)))

train_df.shape

(33228, 4)

In [12]:
x_img = np.stack(train_df['data'].values)
y_img = train_df['label_encoded'].values

x_img_test = np.stack(img_test_df['data'].values)
y_img_test = img_test_df['label_encoded'].values

del train_df, img_test_df

x_img_train, x_img_val, y_img_train, y_img_val = train_test_split(x_img, y_img, test_size=0.3, random_state=100,
                                                                  shuffle=True, stratify=y_img)
del x_img, y_img

# Modeling
## Creating Model

In [14]:
def create_base_model(inputs: tf.keras.layers.Input, instance_name: str) -> tf.keras.Model:
    base = tf.keras.applications.VGG19(
        include_top=False,
        weights='imagenet',
    )
    outputs = base(inputs)

    for layer in base.layers:
        layer.trainable = False

    return tf.keras.Model(inputs, outputs, name=f"inception_v3_{instance_name}")

In [15]:
input_image = tf.keras.Input(shape=(100, 100, 3), name='input_image')
input_audio = tf.keras.Input(shape=(128, 110, 3), name='input_audio')

image_base = create_base_model(input_image, 'image_base')
audio_base = create_base_model(input_audio, 'audio_base')

left = image_base(input_image)
right = audio_base(input_audio)

left = tf.keras.layers.Flatten()(left)
left = tf.keras.layers.Dense(512, activation='relu')(left)

right = tf.keras.layers.Flatten()(right)
right = tf.keras.layers.Dense(512, activation='relu')(right)

out1 = tf.keras.layers.Dense(7, activation='softmax', name='image_class')(left)
out2 = tf.keras.layers.Dense(8, activation='softmax', name='audio_class')(right)

model = tf.keras.models.Model(inputs=[input_image, input_audio], outputs=[out1, out2])


In [16]:
plot_model(model, show_shapes=True)

## Compile the model

In [17]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss={
        'image_class': tf.keras.losses.SparseCategoricalCrossentropy(),
        'audio_class': tf.keras.losses.SparseCategoricalCrossentropy()
    },
    metrics={
        'image_class': 'accuracy',
        'audio_class': 'accuracy'
    }
)

## Make sure the data has same quantity

In [18]:
min_samples = min(len(x_img_train), len(x_audio_train), len(y_img_train), len(y_audio_train))

x_img_train = x_img_train[:min_samples]
x_audio_train = x_audio_train[:min_samples]
y_img_train = y_img_train[:min_samples]
y_audio_train = y_audio_train[:min_samples]

print("Input Image Shape:", x_img_train.shape)
print("Input Audio Shape:", x_audio_train.shape)
print("Image Labels Shape:", y_img_train.shape)
print("Audio Labels Shape:", y_audio_train.shape)

print("Input Image Type:", x_img_train.dtype)
print("Input Audio Type:", x_audio_train.dtype)
print("Image Labels Type:", y_img_train.dtype)
print("Audio Labels Type:", y_audio_train.dtype)

Input Image Shape: (19184, 100, 100, 3)
Input Audio Shape: (19184, 128, 110, 3)
Image Labels Shape: (19184,)
Audio Labels Shape: (19184,)
Input Image Type: uint8
Input Audio Type: float32
Image Labels Type: int32
Audio Labels Type: int32


In [19]:
min_samples = min(len(x_img_val), len(x_audio_val), len(y_img_val), len(y_audio_val))

x_img_val = x_img_val[:min_samples]
x_audio_val = x_audio_val[:min_samples]
y_img_val = y_img_val[:min_samples]
y_audio_val = y_audio_val[:min_samples]

In [None]:
min_samples = min(len(x_img_test), len(x_audio_test), len(y_img_test), len(y_audio_test))

x_img_test = x_img_test[:min_samples]
x_audio_test = x_audio_test[:min_samples]
y_img_test = y_img_test[:min_samples]
y_audio_test = y_audio_test[:min_samples]

In [14]:
def stratified_downsample(x_img, x_audio, y_img, y_audio, downsample_size=None):
    """
    Downsamples the datasets while preserving class distribution.
    
    Args:
        x_img (np.array): Image inputs.
        x_audio (np.array): Audio inputs.
        y_img (np.array): Image labels.
        y_audio (np.array): Audio labels.
        downsample_size (int, optional): Total number of samples after downsampling.
                                           If None, downsample to the smallest class count.
                                           
    Returns:
        Tuple of downsampled (x_img, x_audio, y_img, y_audio).
    """
    # Count samples per class
    class_counts = Counter(y_img)
    min_class_count = min(class_counts.values())

    if downsample_size is None:
        downsample_size = min_class_count
    else:
        downsample_size = min(downsample_size, min_class_count)

    indices_per_class = {}
    for cls in class_counts:
        indices = np.where(y_img == cls)[0]
        np.random.shuffle(indices)
        selected_indices = indices[:downsample_size]
        indices_per_class[cls] = selected_indices

    selected_indices = np.concatenate(list(indices_per_class.values()))
    np.random.shuffle(selected_indices)

    x_img_ds = x_img[selected_indices]
    x_audio_ds = x_audio[selected_indices]
    y_img_ds = y_img[selected_indices]
    y_audio_ds = y_audio[selected_indices]

    return x_img_ds, x_audio_ds, y_img_ds, y_audio_ds



In [15]:
x_img_train, x_audio_train, y_img_train, y_audio_train = stratified_downsample(
    x_img_train, x_audio_train, y_img_train, y_audio_train
)

min_samples_val = min(len(y_img_val), len(y_audio_val))
x_img_val, x_audio_val, y_img_val, y_audio_val = stratified_downsample(
    x_img_val, x_audio_val, y_img_val, y_audio_val, downsample_size=min_samples_val
)

min_samples_test = min(len(y_img_test), len(y_audio_test))
x_img_test, x_audio_test, y_img_test, y_audio_test = stratified_downsample(
    x_img_test, x_audio_test, y_img_test, y_audio_test, downsample_size=min_samples_test
)

print("Downsampled training shape:", x_img_train.shape)
print("Training labels distribution:", Counter(y_img_train))

IndexError: index 20415 is out of bounds for axis 0 with size 19184

In [20]:
tf.keras.backend.clear_session()
gc.collect()

## Prepare needed callbacks

In [22]:
early = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    mode='min',
    # start_from_epoch=2,
    patience=5
)

checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(MODEL_CHECKPOINT_PATH, 'best_model.keras'),
    verbose=2,
    save_best_only=True,
    monitor='val_loss',
    mode='min'
)

lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                    factor=0.2,
                                                    min_lr=0.000001,
                                                    patience=3,
                                                    mode='min'
                                                    )

## Train

In [24]:
history = model.fit(
    x=[x_img_train, x_audio_train],
    y=[y_img_train, y_audio_train],
    validation_data=(
        [x_img_val, x_audio_val],
        [y_img_val, y_audio_val]
    ),
    callbacks=[lr_scheduler, checkpoint],
    epochs=100,
    batch_size=BATCH_SIZE,
    steps_per_epoch=len(x_img_train) // BATCH_SIZE,
)


InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

## Plot the training result

In [None]:
def plot_and_save(metric_name, history, ylabel, filename):
    plt.figure()
    plt.plot(history.history[metric_name], label=f'Train {ylabel}')
    if f'val_{metric_name}' in history.history:
        plt.plot(history.history[f'val_{metric_name}'], label=f'Val {ylabel}')
    plt.title(f'{ylabel} over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    plt.legend()
    plt.grid(True)
    plt.savefig(filename)
    plt.close()

In [None]:
plot_and_save('loss', history, 'Loss', 'loss_plot.png')
plot_and_save('image_class_loss', history, 'Image Classification Loss', 'image_class_loss_plot.png')
plot_and_save('audio_class_loss', history, 'Audio Classification Loss', 'audio_class_loss_plot.png')
plot_and_save('image_class_accuracy', history, 'Image Classification Accuracy', 'image_class_accuracy_plot.png')
plot_and_save('audio_class_accuracy', history, 'Audio Classification Accuracy', 'audio_class_accuracy_plot.png')

# Evaluation
## Using model.evaluate

In [None]:
model.evaluate([x_img_val, x_audio_val], [y_img_val, y_audio_val], batch_size=BATCH_SIZE, verbose=2,
               steps=len(x_img_val) // BATCH_SIZE)

In [None]:
model.evaluate([x_img_test, x_audio_test], [y_img_test, y_audio_test], batch_size=BATCH_SIZE, verbose=2,
               steps=len(x_img_test) // BATCH_SIZE)

In [None]:
predictions = model.predict([x_img_test, x_audio_test], batch_size=BATCH_SIZE)

image_preds = predictions[0]
audio_preds = predictions[1]

## Using confusion matrix

In [None]:
image_class = np.argmax(predictions[0], axis=1)
audio_class = np.argmax(predictions[1], axis=1)

image_cm = confusion_matrix(y_img_test, image_class)
audio_cm = confusion_matrix(y_audio_test, audio_class)

### Plot the confusion Matrix

In [None]:
cm = confusion_matrix(y_audio_test, predictions)

plt.figure(figsize=(10, 8))  # Set the figure size if needed
sns.heatmap(image_cm, annot=True, cmap='Blues', fmt='g')

plt.title('Confusion Matrix Image', pad=20, fontsize=20, fontweight="bold")
plt.ylabel('Actual')
plt.xlabel('Predicted')

# Set ticks for the x and y axes using class names
plt.xticks(ticks=range(len(image_class)), labels=image_class, rotation=45)
plt.yticks(ticks=range(len(image_class)), labels=image_class, rotation=0)
plt.savefig('confusion_matrix_image.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
cm = confusion_matrix(y_audio_test, predictions)

plt.figure(figsize=(10, 8))  # Set the figure size if needed
sns.heatmap(audio_cm, annot=True, cmap='Blues', fmt='g')

plt.title('Confusion Matrix Audio', pad=20, fontsize=20, fontweight="bold")
plt.ylabel('Actual')
plt.xlabel('Predicted')

# Set ticks for the x and y axes using class names
plt.xticks(ticks=range(len(audio_class)), labels=audio_class, rotation=45)
plt.yticks(ticks=range(len(audio_class)), labels=audio_class, rotation=0)
plt.savefig('confusion_matrix_audio.png', dpi=300, bbox_inches='tight')
plt.show()

## Using Classification Report

In [None]:
aud_classes = [x for x in os.listdir(AUDIO_PATH)]
img_classes = [x for x in os.listdir(TRAIN_IMAGE_PATH)]

print(aud_classes, img_classes, sep='\n')

In [None]:
print(classification_report(y_img_test, image_class))

In [None]:
print(classification_report(y_audio_test, audio_class))

In [None]:
model.save('model_sc3t3.h5')