In [1]:
import os #for using operating system-dependent functionality
import numpy as np # for numerical computations
import pandas as pd # for data manipulation and analysis
import matplotlib.pyplot as plt #for creating visualizations 
import seaborn as sns #for high-level interfaces, built on-top plt
from PIL import Image #downsample the images
import warnings
warnings.filterwarnings(action="ignore")

import librosa #for audio analysis
import librosa.display #for displaying audio data

from sklearn.preprocessing import MinMaxScaler #for feature scaling
from tqdm import tqdm, notebook,trange #functions/classes for displaying progress bars during iterations
from sklearn.linear_model import LogisticRegression # for logistic regression modeling

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

from keras.preprocessing import image
from keras import backend as K #pre-trained deep learning models for computer vision tasks
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential #to build sequential neural network models
from keras.layers import Flatten,BatchNormalization #DL layers
from keras.layers import Dense,Dropout
from keras.optimizers import Adam # optimizer used in training
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

import plotly as px #for creating interactive visualizations
import plotly.graph_objects as go #for creating graph objects

from pylab import rcParams #to customize graph figure
rcParams['figure.figsize'] = 15,6
plt.style.use('fivethirtyeight')

In [2]:
df = pd.read_csv("D:/AMINA/PFE24/application/data.csv")
df.head(2)

Unnamed: 0,filename,age,gender,accent
0,D:/AMINA/PFE24/datasets/commonvoice/cv-valid-t...,2,female,us
1,D:/AMINA/PFE24/datasets/commonvoice/cv-valid-t...,7,male,us


In [5]:
#Image preprocessing
output_folder = "D:/AMINA/PFE24/application/audio_imgs"
image_paths = [os.path.join(output_folder, filename) for filename in os.listdir(output_folder) if filename.endswith('_mfcc.png')]

batch_size = 10
total_images = len(image_paths)
num_batches = (total_images + batch_size - 1) // batch_size

images = []

# Define the new size for downsampling
new_width = 200
new_height = 200

# Start processing
for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, total_images)
    
    batch_paths = image_paths[start_idx:end_idx]
    batch_images = []
    for img_path in batch_paths:
        img = image.load_img(img_path)
        img = img.resize((new_width, new_height))
        img = image.img_to_array(img)
        img = img / 255.0
        batch_images.append(img)
    
    images.extend(batch_images)
    print(f"Processed batch {batch_idx + 1}/{num_batches}")

images = np.array(images)

Processed batch 1/12611
Processed batch 2/12611
Processed batch 3/12611
Processed batch 4/12611
Processed batch 5/12611
Processed batch 6/12611
Processed batch 7/12611
Processed batch 8/12611
Processed batch 9/12611
Processed batch 10/12611
Processed batch 11/12611
Processed batch 12/12611
Processed batch 13/12611
Processed batch 14/12611
Processed batch 15/12611
Processed batch 16/12611
Processed batch 17/12611
Processed batch 18/12611
Processed batch 19/12611
Processed batch 20/12611
Processed batch 21/12611
Processed batch 22/12611
Processed batch 23/12611
Processed batch 24/12611
Processed batch 25/12611
Processed batch 26/12611
Processed batch 27/12611
Processed batch 28/12611
Processed batch 29/12611
Processed batch 30/12611
Processed batch 31/12611
Processed batch 32/12611
Processed batch 33/12611
Processed batch 34/12611
Processed batch 35/12611
Processed batch 36/12611
Processed batch 37/12611
Processed batch 38/12611
Processed batch 39/12611
Processed batch 40/12611
Processed

In [4]:
# Load audio features (MFCC) from your DataFrame
audio_features = df.iloc[:, :-1].values  # Exclude 'age' column
# Combine image and audio data
X = np.concatenate((images, audio_features), axis=1)
# Encode target labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['age'])

#bc StratifiedKFold doesn't split into 3 sets: 
#split into temp train set (split into train 60% + val 20% sets) + test set 20%
X_train, X_temp, y_train, y_temp = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42)

NameError: name 'images' is not defined

In [None]:
#THE MODEL
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(new_width, new_height, 3)))  # All imgs are 200x200 pixels
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(8, activation='softmax')) #8 = number of age classes

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)

# Predict classes for the test data
y_pred = model.predict_classes(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

# Print out the evaluation metrics results
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

In [None]:
print("Confusion Matrix:")
print(conf_matrix)