In [2]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from PIL import Image
import gc

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!unzip -o "/content/drive/MyDrive/Colab Notebooks/speech-emotion-recognition-ravdess-data.zip" -d "./ravdess_data"

Archive:  /content/drive/MyDrive/Colab Notebooks/speech-emotion-recognition-ravdess-data.zip
   creating: ./ravdess_data/Actor_01/
  inflating: ./ravdess_data/Actor_01/03-01-01-01-01-01-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-01-01-01-02-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-01-01-02-01-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-01-01-02-02-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-02-01-01-01-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-02-01-01-02-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-02-01-02-01-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-02-01-02-02-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-02-02-01-01-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-02-02-01-02-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-02-02-02-01-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-02-02-02-02-01.wav  
  inflating: ./ravdess_data/Actor_01/03-01-03-01-01-01-01.wav  
  inflating: ./ravdess_data/Actor_01/

In [5]:
# Define dataset path
dataset_path = "./ravdess_data"

# Verify if dataset path exists
if os.path.isdir(dataset_path):
    print("Dataset extracted successfully!")
    actor_folders = os.listdir(dataset_path)
    print(f"Total Actor Folders: {len(actor_folders)}")
    print(f"Sample Actor Folders: {actor_folders[:5]}")
else:
    print("Extraction failed.")

# Get list of all .wav files
audio_files = []
for actor_folder in actor_folders:
    actor_path = os.path.join(dataset_path, actor_folder)
    if os.path.isdir(actor_path):
        for file in os.listdir(actor_path):
            if file.endswith(".wav"):
                audio_files.append(os.path.join(actor_path, file))
audio_files

Dataset extracted successfully!
Total Actor Folders: 24
Sample Actor Folders: ['Actor_15', 'Actor_19', 'Actor_11', 'Actor_07', 'Actor_14']


['./ravdess_data/Actor_15/03-01-08-01-01-02-15.wav',
 './ravdess_data/Actor_15/03-01-05-02-02-01-15.wav',
 './ravdess_data/Actor_15/03-01-05-02-01-02-15.wav',
 './ravdess_data/Actor_15/03-01-06-02-02-01-15.wav',
 './ravdess_data/Actor_15/03-01-02-02-01-01-15.wav',
 './ravdess_data/Actor_15/03-01-03-02-01-01-15.wav',
 './ravdess_data/Actor_15/03-01-07-01-01-02-15.wav',
 './ravdess_data/Actor_15/03-01-08-01-01-01-15.wav',
 './ravdess_data/Actor_15/03-01-08-02-02-01-15.wav',
 './ravdess_data/Actor_15/03-01-03-01-02-02-15.wav',
 './ravdess_data/Actor_15/03-01-03-02-02-02-15.wav',
 './ravdess_data/Actor_15/03-01-04-02-02-01-15.wav',
 './ravdess_data/Actor_15/03-01-05-02-01-01-15.wav',
 './ravdess_data/Actor_15/03-01-02-02-02-02-15.wav',
 './ravdess_data/Actor_15/03-01-06-02-01-02-15.wav',
 './ravdess_data/Actor_15/03-01-02-01-02-01-15.wav',
 './ravdess_data/Actor_15/03-01-07-02-02-01-15.wav',
 './ravdess_data/Actor_15/03-01-02-01-01-01-15.wav',
 './ravdess_data/Actor_15/03-01-08-02-01-01-15

In [6]:
def get_emotion(filename):
    """Extracts emotion from filename."""
    emotion_dict = {
        "01": "Neutral",
        "02": "Calm",
        "03": "Happy",
        "04": "Sad",
        "05": "Angry",
        "06": "Fearful",
        "07": "Disgust",
        "08": "Surprised"
    }
    parts = filename.split("-")
    emotion_code = parts[2]
    return emotion_dict.get(emotion_code, "unknown")

In [7]:
import librosa
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
from io import BytesIO
from PIL import Image
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import gc

In [10]:
def generate_plot_in_memory(audio_path):
    try:
        sampling_rate, audio_data = wavfile.read(audio_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return None

    if len(audio_data.shape) > 1:
        audio_data = audio_data[:, 0]

    time = np.arange(0, len(audio_data)) / sampling_rate

    plt.figure(figsize=(6, 3), dpi=100)
    plt.plot(time, audio_data, linewidth=0.5)
    plt.axis('off')  # Remove axes for cleaner image
    plt.tight_layout(pad=0)

    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=100)
    # plt.savefig(buf, format='png', optimize=True, dpi=100)  # Enable compression
    buf.seek(0)
    plt.close()

    return Image.open(buf)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import Sequence, to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import gc

class AudioImageGenerator(Sequence):
    def __init__(self, audio_files, labels, label_encoder, batch_size=20, shuffle=True):
        self.audio_files = audio_files
        self.labels = labels
        self.label_encoder = label_encoder
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indices = np.arange(len(self.audio_files))
        self.on_epoch_end()

    def __len__(self):
        return int(np.ceil(len(self.audio_files) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_files = [self.audio_files[i] for i in batch_indices]
        batch_labels = [self.labels[i] for i in batch_indices]

        images = []
        encoded = []

        for f, lbl in zip(batch_files, batch_labels):
            img = generate_plot_in_memory(f)
            if img:
                img = img.convert("RGB").resize((64, 32), resample=Image.LANCZOS)
                img_arr = np.asarray(img, dtype=np.float32) / 255.0
                if img_arr.shape == (32, 64, 3):
                    images.append(img_arr)
                    encoded.append(lbl)


        if not images:
            return np.zeros((1, 32, 64, 3)), np.zeros((1, len(self.label_encoder.classes_)))

        X = np.array(images)
        y = to_categorical(self.label_encoder.transform(encoded), num_classes=len(self.label_encoder.classes_))

        return X, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

labels = [get_emotion(f) for f in audio_files]
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

train_files, val_files, y_train_raw, y_val_raw = train_test_split(
    audio_files, labels, test_size=0.2, stratify=encoded_labels, random_state=42
)

train_gen = AudioImageGenerator(train_files, y_train_raw, label_encoder, batch_size=32, shuffle=True)
val_gen = AudioImageGenerator(val_files, y_val_raw, label_encoder, batch_size=32, shuffle=False)


model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 64, 3)),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


history = model.fit(train_gen, validation_data=val_gen, epochs=10, verbose=1)


val_loss, val_accuracy = model.evaluate(val_gen, verbose=0)
print(f"\nFinal Validation Accuracy: {val_accuracy:.4f}")

# --- Cleanup ---
del model, train_gen, val_gen
tf.keras.backend.clear_session()
gc.collect()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  self._warn_if_super_not_called()


Epoch 1/10
[1m34/36[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m7s[0m 4s/step - accuracy: 0.1270 - loss: 2.0955 

In [None]:
labels = [get_emotion(f) for f in audio_files]
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
fold = 1
acc_per_fold = []
all_histories = []

for train_idx, val_idx in skf.split(audio_files, encoded_labels):
    print(f"\n--- Fold {fold} train-idx {len(train_idx)} val-idx {len(val_idx)}---")

    train_files = [audio_files[i] for i in train_idx]
    val_files = [audio_files[i] for i in val_idx]
    y_train_raw = [labels[i] for i in train_idx]
    y_val_raw = [labels[i] for i in val_idx]

    # Generate train images
    X_train, y_train = [], []
    for f, lbl in zip(train_files, y_train_raw):
        img = generate_plot_in_memory(f)
        if img:
            img = img.convert("RGB").resize((64, 32))
            img_arr = np.array(img) / 255.0
            if img_arr.shape == (32, 64, 3):
                X_train.append(img_arr)
                y_train.append(lbl)

    # Generate val images
    X_val, y_val = [], []
    for f, lbl in zip(val_files, y_val_raw):
        img = generate_plot_in_memory(f)
        if img:
            img = img.convert("RGB").resize((64, 32))
            img_arr = np.array(img) / 255.0
            if img_arr.shape == (32, 64, 3):
                X_val.append(img_arr)
                y_val.append(lbl)

    # Skip if no data was loaded
    if len(X_train) == 0 or len(X_val) == 0:
        print(f"Skipping Fold {fold}: insufficient data after image conversion")
        continue

    # Convert to arrays and encode
    X_train = np.array(X_train)
    y_train = to_categorical(label_encoder.transform(y_train))
    X_val = np.array(X_val)
    y_val = to_categorical(label_encoder.transform(y_val))

    # --- Build CNN ---
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 64, 3)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(y_train.shape[1], activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # --- Train ---
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), verbose=1)
    all_histories.append(history)

    # --- Evaluate ---
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Accuracy for Fold {fold}: {val_accuracy:.4f}")
    acc_per_fold.append(val_accuracy)

    # --- Cleanup ---
    del model, X_train, X_val, y_train, y_val
    tf.keras.backend.clear_session()
    gc.collect()

    fold += 1

# --- Summary ---
print(f"\n✅ Average Validation Accuracy over {k} folds: {np.mean(acc_per_fold):.4f}")

In [None]:
# Predict on test data
y_pred_probs = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_probs, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Decode to emotion labels
predicted_emotions = label_encoder.inverse_transform(y_pred_classes)
true_emotions = label_encoder.inverse_transform(y_true_classes)

# Display a few predictions
for i in range(5):
    print(f"True: {true_emotions[i]} | Predicted: {predicted_emotions[i]}")

In [None]:
from sklearn.metrics import classification_report, accuracy_score

print("Accuracy:", accuracy_score(y_true_classes, y_pred_classes))
print(classification_report(y_true_classes, y_pred_classes, target_names=label_encoder.classes_))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from io import BytesIO
from PIL import Image

def generate_amplitude_plot(audio_path):
    try:
        sampling_rate, audio_data = wavfile.read(audio_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return None

    # If stereo, take one channel
    if len(audio_data.shape) > 1:
        audio_data = audio_data[:, 0]

    time = np.arange(0, len(audio_data)) / sampling_rate

    plt.figure(figsize=(6, 3))
    plt.plot(time, audio_data, linewidth=0.5)
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.title('Time vs. Amplitude')
    plt.grid(True)
    plt.tight_layout()

    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()
    return Image.open(buf)

In [None]:
# import librosa
# import librosa.display
# import matplotlib.pyplot as plt
# import numpy as np
# from scipy.io import wavfile

# # Load the .wav file
# for ind, file_path in enumerate(audio_files):
#   # file_path = audio_files[i]  # Replace with the path to your .wav file
#   try:
#     sampling_rate, audio_data = wavfile.read(file_path)
#   except Exception as e:
#     print(f"Error loading audio file: {e}")
#     exit()

# # Ensure audio_data is 1-dimensional (mono)
#   if len(audio_data.shape) > 1:
#     audio_data = audio_data[:, 0]  # Take the first channel if stereo

# # Time vector
#   time = np.arange(0, len(audio_data)) / sampling_rate

# # Plotting
#   plt.figure(figsize=(10, 4))
#   plt.plot(time, audio_data, linewidth=0.5)  # Explicitly set linewidth
#   plt.xlabel('Time (s)')
#   plt.ylabel('Intensity (Amplitude)')
#   plt.title('Time vs. Intensity Plot of Audio')
#   plt.grid(True)
#   plt.tight_layout()
#   # plt.show()

# # To save the plot as an image (optional)
#   saving_loc = f'/content/drive/MyDrive/Colab Notebooks/plots'
#   if not os.path.exists('./plots'):
#     os.makedirs('./plots')
#     plt.savefig(saving_loc + f'/plot{ind}.png')

In [None]:
def audio_to_melspectrogram(file_path, max_pad_len=128):
    try:
        y, sr = librosa.load(file_path, sr=16000)
        spect = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        log_spect = librosa.power_to_db(spect, ref=np.max)
        if log_spect.shape[1] < max_pad_len:
            pad_width = max_pad_len - log_spect.shape[1]
            log_spect = np.pad(log_spect, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            log_spect = log_spect[:, :max_pad_len]
        return log_spect
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [None]:
import matplotlib.pyplot as plt
import numpy as np

X = []
y = []
count = 0

for row in audio_files:
    spectrogram = audio_to_melspectrogram(row)
    if spectrogram is not None:
        # Visualize the spectrogram
        if count < 5:
          print(row + 'gave this:\n')
          plt.figure(figsize=(10, 4))
          plt.imshow(spectrogram, aspect='auto', origin='lower', cmap='magma')
          plt.title(f"Spectrogram for {row}")
          plt.xlabel("Time")
          plt.ylabel("Mel Frequency")
          plt.colorbar(format='%+2.0f dB')
          plt.tight_layout()
          plt.show()
          count+=1

        X.append(spectrogram)
        y.append(get_emotion(row))

X = np.array(X)
X = X[..., np.newaxis]  # Add channel dimension for CNN
y = np.array(y)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_onehot = to_categorical(y_encoded)

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

In [None]:
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(128, 128, 1)),
    BatchNormalization(),
    MaxPooling2D((2,2)),

    Conv2D(64, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2,2)),

    Conv2D(128, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2,2)),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=25, batch_size=32, validation_data=(X_test, y_test))

# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {acc:.2f}")

y_pred = model.predict(X_test)
y_pred_labels = le.inverse_transform(np.argmax(y_pred, axis=1))
y_true_labels = le.inverse_transform(np.argmax(y_test, axis=1))

print("\nClassification Report:\n", classification_report(y_true_labels, y_pred_labels))

In [None]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

def load_audio_generate_spectrogram(file_path, target_size=(128, 128)):
    try:
        y, sr = librosa.load(file_path, sr=16000)
        # Compute the Short-Time Fourier Transform
        D = librosa.stft(y)
        S_db = librosa.amplitude_to_db(np.abs(D), ref=np.max)

        # Resize to fixed shape (128x128)
        S_db_resized = librosa.util.fix_length(S_db, size=target_size[1], axis=1)
        if S_db_resized.shape[0] < target_size[0]:
            S_db_resized = np.pad(S_db_resized, ((0, target_size[0] - S_db_resized.shape[0]), (0, 0)), mode='constant')
        else:
            S_db_resized = S_db_resized[:target_size[0], :]

        return S_db_resized

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [None]:
X = []
y = []
for actor_folder in os.listdir(dataset_path):
    actor_path = os.path.join(dataset_path, actor_folder)
    if os.path.isdir(actor_path):
        for file in os.listdir(actor_path):
            if file.endswith(".wav"):
                file_path = os.path.join(actor_path, file)
                spectrogram = load_audio_generate_spectrogram(file_path, (128, 128))

                if spectrogram is not None:
                    X.append(spectrogram)
                    y.append(get_emotion(file))

X = np.array(X)
y = np.array(y)

In [None]:
X = np.array(X)
y = np.array(y)

# Normalize X
X = (X - np.min(X)) / (np.max(X) - np.min(X))  # Min-Max scaling to 0-1

# Add channel dimension for CNN
X = X[..., np.newaxis]  # shape becomes (samples, 128, 128, 1)

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded = to_categorical(y_encoded)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)

In [None]:
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(128, 128, 1)),
    BatchNormalization(),
    MaxPooling2D((2,2)),

    Conv2D(64, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2,2)),

    Conv2D(128, (3,3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D((2,2)),

    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(le.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from PIL import Image
import gc

In [None]:
# --- Preprocess all audio files into image paths and labels ---
image_data = []
labels = []

for audio_file in audio_files:
    img = generate_plot_in_memory(audio_file)
    if img:
        img = img.convert("RGB")
        img = img.resize((64, 32))
        img_array = np.array(img) / 255.0
        if img_array.shape == (32, 64, 3):
            image_data.append(img_array)
            labels.append(get_emotion(audio_file))
        else:
            print(f"Skipping {audio_file}, shape: {img_array.shape}")


print(f"Loaded {len(audio_files)}, images")
image_data = np.array(image_data)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)
one_hot_labels = to_categorical(encoded_labels)

# --- K-Fold CNN Training ---
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
fold = 1
acc_per_fold = []

for train_idx, val_idx in skf.split(image_data, encoded_labels):
    print(f"\n--- Fold {fold} ---")

    X_train, X_val = image_data[train_idx], image_data[val_idx]
    y_train, y_val = one_hot_labels[train_idx], one_hot_labels[val_idx]

    # --- Build CNN model ---
    model = tf.keras.models.Sequential([
        tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 64, 3)),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
        tf.keras.layers.MaxPooling2D((2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(one_hot_labels.shape[1], activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # --- Train on current fold ---
    all_histories = []
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), verbose=1)

    all_histories.append(history)

    # --- Evaluate on validation set ---
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Accuracy for Fold {fold}: {val_accuracy:.4f}")
    acc_per_fold.append(val_accuracy)

    # Cleanup
    del model
    tf.keras.backend.clear_session()
    gc.collect()

    fold += 1

print(f"\n✅ Average Validation Accuracy over {k} folds: {np.mean(acc_per_fold):.4f}")


In [None]:
# --- 1. Function to Load Audio and Generate Plot in Memory ---
def generate_plot_in_memory(audio_path):
    try:
        sampling_rate, audio_data = wavfile.read(audio_path)
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return None

    if len(audio_data.shape) > 1:
        audio_data = audio_data[:, 0]

    time = np.arange(0, len(audio_data)) / sampling_rate

    plt.figure(figsize=(6, 3))  # Adjust figure size as needed
    plt.plot(time, audio_data, linewidth=0.5)
    plt.xlabel('Time (s)')
    plt.ylabel('Intensity')
    plt.title('Time vs. Intensity')
    plt.grid(True)
    plt.tight_layout()

    # Save the plot to a BytesIO object (in memory)
    buf = BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    plt.close()
    return Image.open(buf)

In [None]:
import matplotlib.pyplot as plt

def plot_kfold_history(histories):
    for i, history in enumerate(histories):
        plt.figure(figsize=(12, 4))

        # Accuracy Plot
        plt.subplot(1, 2, 1)
        plt.plot(history.history['accuracy'], label='Train Accuracy')
        plt.plot(history.history['val_accuracy'], label='Val Accuracy')
        plt.title(f'Fold {i+1} Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()

        # Loss Plot
        plt.subplot(1, 2, 2)
        plt.plot(history.history['loss'], label='Train Loss')
        plt.plot(history.history['val_loss'], label='Val Loss')
        plt.title(f'Fold {i+1} Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        plt.tight_layout()
        plt.show()

# Call after training
plot_kfold_history(all_histories)