In [None]:
#importing the libraries
import os
import warnings

import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from IPython.display import Audio, display
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_score
from keras.preprocessing import image
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import Model
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.layers import Flatten, Input, concatenate

warnings.filterwarnings(action="ignore")

# Load the metadata CSVs: every split is read file by file and then
# concatenated into a single frame with a fresh 0..n-1 index.

# Training split (valid + other + invalid clips)
file_paths = [
    "D:/AMINA/PFE24/datasets/commonvoice/cv-valid-train.csv",
    "D:/AMINA/PFE24/datasets/commonvoice/cv-other-train.csv",
    "D:/AMINA/PFE24/datasets/commonvoice/cv-invalid.csv"
]
dfs_train = [pd.read_csv(csv_path) for csv_path in file_paths]
df_train = pd.concat(dfs_train, ignore_index=True)

# Validation split
file_paths2 = [
    "D:/AMINA/PFE24/datasets/commonvoice/cv-valid-dev.csv",
    "D:/AMINA/PFE24/datasets/commonvoice/cv-other-dev.csv",
]
dfs_val = [pd.read_csv(csv_path) for csv_path in file_paths2]
df_val = pd.concat(dfs_val, ignore_index=True)

# Test split
file_paths3 = [
    "D:/AMINA/PFE24/datasets/commonvoice/cv-valid-test.csv",
    "D:/AMINA/PFE24/datasets/commonvoice/cv-other-test.csv",
]
dfs_test = [pd.read_csv(csv_path) for csv_path in file_paths3]
df_test = pd.concat(dfs_test, ignore_index=True)

#data cleaning
# Numeric code for each age bucket. The keys are the exact strings looked up
# in the `age` column (including the "fourties" spelling).
cleanup_nums = {"age": {"teens":1.0,"twenties":2.0,"thirties":3.0,"fourties":4.0,"fifties":5.0,"sixties":6.0,"seventies":7.0,"eighties":8.0}}


def _clean_split(frame):
    """Return a cleaned, one-hot encoded copy of one metadata split.

    Steps, identical for every split:
      1. keep only filename / age / gender / accent,
      2. drop rows with no age or no accent,
      3. fill missing gender with "other",
      4. replace the age bucket names with the codes in `cleanup_nums`,
      5. one-hot encode gender and accent.

    The input frame is not modified.
    """
    cleaned = (
        frame[["filename", "age", "gender", "accent"]]
        .dropna(subset=["age", "accent"])
        # assign() returns a new frame, unlike `df[col].fillna(..., inplace=True)`,
        # which is chained assignment and has no effect under copy-on-write.
        .assign(gender=lambda part: part["gender"].fillna("other"))
        .replace(cleanup_nums)
    )
    return pd.get_dummies(cleaned, columns=["gender", "accent"])


# Apply the same pipeline to all three splits. Previously the validation split
# was never age-mapped or encoded, and the test split skipped the gender fill
# and the accent filter.
df_train = _clean_split(df_train)
df_val = _clean_split(df_val)
df_test = _clean_split(df_test)

# get_dummies only emits columns for the categories present in each split, so
# align validation/test to the training columns: categories missing from a
# split become all-zero columns, categories unseen in training are dropped.
df_val = df_val.reindex(columns=df_train.columns, fill_value=0)
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)


In [None]:
# Build the feature matrices first so the network can be sized from the data.
feature_columns = df_train.drop(columns=['filename', 'age']).columns
n_features = len(feature_columns)


def _features_and_labels(frame):
    """Return (features, labels) arrays for one cleaned split.

    features: int32 matrix with one column per entry of `feature_columns`
              (columns missing from `frame` are filled with 0).
    labels:   int32 vector of age codes shifted to start from 0.

    Raises ValueError if any `age` value is not numeric, i.e. the age mapping
    was not applied to this split.
    """
    ages = pd.to_numeric(frame['age'], errors='coerce')
    if ages.isna().any():
        raise ValueError("`age` must be mapped to numeric codes before training")
    features = frame.reindex(columns=feature_columns, fill_value=0).to_numpy(dtype='int32')
    labels = ages.to_numpy(dtype='int32') - 1  # Convert labels to start from 0
    return features, labels


# Define training and validation data
X_train, y_train = _features_and_labels(df_train)
X_val, y_val = _features_and_labels(df_val)

# Define the model: a linear stack of layers
model = Sequential([
    # Each sample is a row of 0/1 indicators, read as a sequence of n_features
    # tokens from a 2-symbol vocabulary. input_length must equal the number of
    # feature columns (it was hard-coded to 1, which does not match X_train).
    Embedding(input_dim=2, output_dim=64, input_length=n_features),
    LSTM(units=64, activation='tanh', dropout=0.2, recurrent_dropout=0.2),  # recurrent layer over the embedded sequence
    Dense(32, activation='relu'),  # fully connected layer with 32 neurons and ReLU activation
    Dropout(0.5),  # randomly zero half of the activations during training to limit overfitting
    Dense(8, activation='softmax')  # one output probability per age class (8 codes in the age mapping)
])

# Compile the model
# sparse_categorical_crossentropy expects integer class labels, which is what
# _features_and_labels produces.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
# An epoch is one complete pass through the entire training dataset.
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model on test data
X_test, y_test = _features_and_labels(df_test)
loss, accuracy = model.evaluate(X_test, y_test)

print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

In [None]:
# Two-branch age classifier: an embedding of the filename ID plus an LSTM over
# the tokenized `text` column.
# NOTE(review): `df` is not created by any cell shown in this notebook. This
# cell reads its `filename`, `text` and `age` columns -- load it explicitly
# before running.

# Split data into X (predictors) and y (target); rows without a label cannot
# be used for supervised training.
labelled_df = df.dropna(subset=['age'])
X = labelled_df.drop(columns=['age'])
y = labelled_df['age']

# Define preprocessing steps
filename_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder())
])

text_preprocessor = Pipeline([
    ('tfidf', TfidfVectorizer()),
])

# Combine preprocessing steps for all features.
# The passthrough columns are taken from X (not df) so the target can never be
# passed through as a feature.
# NOTE(review): `preprocessor` is not fitted or applied anywhere in this cell.
preprocessor = ColumnTransformer([
    ('filename', filename_preprocessor, ['filename']),
    ('text', text_preprocessor, 'text'),
    ('other', 'passthrough', X.select_dtypes(exclude="object").columns)
])

# Encode the age labels as integers 0..num_classes-1, as required by
# sparse_categorical_crossentropy.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

# Encode each distinct filename as an integer ID for the embedding branch.
filename_codes, filename_uniques = pd.factorize(X['filename'].fillna('missing'))
filename_ids = filename_codes.reshape(-1, 1)
# NOTE(review): if every filename identifies a single clip, the embedding rows
# of held-out filenames are never trained -- confirm this branch is intended.

# Tokenize text data (missing transcripts become empty strings)
text_data = X['text'].fillna('').astype(str)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)
X_text = tokenizer.texts_to_sequences(text_data)
X_text = pad_sequences(X_text, maxlen=100)
vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is used for padding

# Split the data (all arrays are split with the same row permutation)
(X_train, X_test,
 filename_train, filename_test,
 X_text_train, X_text_test,
 y_train, y_test) = train_test_split(
    X, filename_ids, X_text, y_encoded, test_size=0.2, random_state=42)

# Define input layers for the filename ID and the text tokens
numeric_input = Input(shape=(1,))
text_input = Input(shape=(X_text_train.shape[1],))  # Use the shape of preprocessed text data

# Embedding branch for categorical data (filename): (batch, 1, 64) -> (batch, 64)
filename_embedding = Embedding(input_dim=len(filename_uniques), output_dim=64, input_length=1)(numeric_input)
filename_vector = Flatten()(filename_embedding)

# Text branch: token IDs must be embedded before the LSTM, which expects a
# 3-D (batch, timesteps, features) input.
text_embedding = Embedding(input_dim=vocab_size, output_dim=64, input_length=X_text_train.shape[1])(text_input)
lstm_layer = LSTM(units=64, activation='tanh', dropout=0.2, recurrent_dropout=0.2)(text_embedding)

# Concatenate the two 2-D branch outputs
concatenated = concatenate([filename_vector, lstm_layer])

# Additional layers
dense_layer = Dense(32, activation='relu')(concatenated)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer: one probability per age class found in the data
output_layer = Dense(num_classes, activation='softmax')(dropout_layer)

# Define the model
model = Model(inputs=[numeric_input, text_input], outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit([filename_train, X_text_train], y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate([filename_test, X_text_test], y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

In [None]:
#preprocessing the audio samples
# Function to play audio file
def play_audio(audio_path):
    """Display an `Audio` object created from the file at `audio_path`."""
    player = Audio(filename=audio_path)
    display(player)

# Specify the folder paths containing MP3 files.
# Every entry has the form <root>/<split>/<split>.
_commonvoice_root = "D:/AMINA/PFE24/datasets/commonvoice"
_split_names = (
    "cv-invalid",
    "cv-other-dev",
    "cv-other-test",
    "cv-other-train",
    "cv-valid-dev",
    "cv-valid-test",
    "cv-valid-train",
)
folder_paths = [f"{_commonvoice_root}/{split}/{split}" for split in _split_names]
# For every folder: report how many MP3 files it holds and play the first one
# (in sorted name order), or say so when the folder has none.
for folder_path in folder_paths:
    all_files = [name for name in os.listdir(folder_path) if name.endswith('.mp3')]
    all_files.sort()

    total_files = len(all_files)
    print(f'Total number of MP3 files in the folder: {total_files}')

    # Nothing to play in this folder
    if not all_files:
        print(f"No MP3 files found in {folder_path}")
        continue

    first_clip = all_files[0]
    file_path = os.path.join(folder_path, first_clip)
    print(f'Playing file: {first_clip}')
    play_audio(file_path)

In [None]:
#extract features
#Spectral Centroid + Spectral Bandwidth + Spectral Rolloff + MFCCs
#this function is used to extract audio frequency features

# Specify the folder paths containing MP3 files.
# Each path is the dataset root followed by the split name twice.
folder_paths = [
    "/".join(("D:/AMINA/PFE24/datasets/commonvoice", split_name, split_name))
    for split_name in (
        "cv-invalid",
        "cv-other-dev",
        "cv-other-test",
        "cv-other-train",
        "cv-valid-dev",
        "cv-valid-test",
        "cv-valid-train",
    )
]
def feature_extraction(filename, folder_path, sampling_rate=48000, metadata=None):
    """Extract the age label and spectral features for one audio file.

    Parameters
    ----------
    filename : str
        Name of the audio file inside `folder_path`.
    folder_path : str
        Directory holding the file. Its last component is combined with
        `filename` to form the key looked up in the metadata `filename` column.
    sampling_rate : int, default 48000
        Rate passed to `librosa.load` and to the feature functions.
    metadata : pandas.DataFrame, optional
        Frame with `filename` and `age` columns. Defaults to the module-level
        `df`, which must then exist.

    Returns
    -------
    list or None
        `[age, mean spectral centroid, mean spectral bandwidth,
        mean spectral rolloff, mean of each MFCC row...]`, or None when the
        metadata has no row for this file.
    """
    if metadata is None:
        metadata = df

    # normpath strips a trailing separator, which would otherwise make
    # basename() return an empty string.
    split_name = os.path.basename(os.path.normpath(folder_path))
    # Build the lookup key with "/" explicitly: os.path.join uses "\" on
    # Windows and would then never match keys stored with forward slashes.
    # The platform-native form is still accepted for backward compatibility.
    full_file_path = f"{split_name}/{filename}"
    candidate_keys = {full_file_path, os.path.join(split_name, filename)}

    # Resolve the label BEFORE decoding the audio so unlabeled files are cheap to skip.
    filtered_df = metadata[metadata['filename'].isin(candidate_keys)]

    # No entry found for the filename
    if filtered_df.empty:
        print(f"No entry found in DataFrame for file: {full_file_path}. Skipping...")
        return None  # Skip this filename and move on to the next one

    age = filtered_df.age.values[0]

    path = os.path.join(folder_path, filename)
    audio, _ = librosa.load(path, sr=sampling_rate)

    # Feature extraction: label first, then one mean per spectral descriptor
    features = [
        age,
        np.mean(librosa.feature.spectral_centroid(y=audio, sr=sampling_rate)),
        np.mean(librosa.feature.spectral_bandwidth(y=audio, sr=sampling_rate)),
        np.mean(librosa.feature.spectral_rolloff(y=audio, sr=sampling_rate)),
    ]

    # One mean per row of the MFCC matrix
    mfcc = librosa.feature.mfcc(y=audio, sr=sampling_rate)
    features.extend(np.mean(el) for el in mfcc)

    return features

# Extract features for all MP3 files in all folder paths
all_features = []
for folder_path in folder_paths:
    # Sorted for a reproducible row order across runs
    for file in sorted(os.listdir(folder_path)):
        # Only audio clips are decoded; any other file in the folder is ignored
        if not file.endswith('.mp3'):
            continue
        file_features = feature_extraction(file, folder_path)
        # feature_extraction returns None for files without a metadata entry;
        # keep those out of the feature table.
        if file_features is not None:
            all_features.append(file_features)

# Print a summary and a small sample instead of the entire feature list
print(f"Extracted features for {len(all_features)} files")
print("first feature rows: ", all_features[:5])

In [None]:
#MFCC per age
# Sampling rate used to decode every clip. It was referenced below without
# ever being defined; 48000 matches the default of feature_extraction.
sample_rate = 48000

mfcc_features = []
age_labels = []

# NOTE(review): `df` is not created by any cell shown in this notebook, and
# row['filename'] is handed to librosa.load unchanged, so it must be a path
# that can be opened from the working directory -- confirm.
for _, row in df.iterrows():
    audio_path = row['filename']  # column holding the audio file path
    age_label = row['age']        # column holding the age label

    audio_array, _ = librosa.load(audio_path, sr=sample_rate)
    mfcc = librosa.feature.mfcc(y=audio_array, sr=sample_rate, n_mfcc=20)  # You can change n_mfcc to 10 if needed

    # Clips differ in length, so average each coefficient over time to get a
    # fixed-size vector (one value per MFCC row) for every clip.
    mfcc_features.append(mfcc.mean(axis=1))
    age_labels.append(age_label)

# A boxplot needs one scalar per observation: use the mean of the clip's
# coefficient averages.
mean_mfcc_per_clip = [float(np.mean(coefficients)) for coefficients in mfcc_features]

# Visualize MFCC features for different age groups
sns.boxplot(x=age_labels, y=mean_mfcc_per_clip)
plt.xlabel('Age Label', fontsize=14)
plt.ylabel('MFCC', fontsize=14)
plt.title('MFCC Variation with Age', fontsize=16)
plt.show()

In [None]:
# Load the images listed in `image_paths` batch by batch, dividing every pixel
# value by 255.0, and stack the results into one array.
# NOTE(review): `num_batches`, `batch_size`, `total_images`, `image_paths` and
# `image` are not defined in this cell -- they must exist before it runs.

# Collect into a fresh local list. The cell used to call `images.extend(...)`
# and then rebind `images` to an ndarray, so a second run failed (ndarray has
# no `.extend`) and a first run needed a pre-existing `images` list.
loaded_images = []

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    # The last batch may be shorter than batch_size
    end_idx = min((batch_idx + 1) * batch_size, total_images)

    batch_paths = image_paths[start_idx:end_idx]
    batch_images = []

    for img_path in batch_paths:
        img = image.load_img(img_path)
        img = image.img_to_array(img)
        img = img / 255.0
        batch_images.append(img)

    loaded_images.extend(batch_images)
    print(f"Processed batch {batch_idx + 1}/{num_batches}")

images = np.array(loaded_images)