In [None]:
# Standard imports
import numpy as np
import pandas as pd
import random
from datetime import datetime
from collections import Counter

# Sklearn imports
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GroupKFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.inspection import permutation_importance

# Imbalanced-learn import
from imblearn.over_sampling import SMOTE

# TensorFlow and Keras imports
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization, concatenate
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adamax, AdamW
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from keras.regularizers import l1, l2, L1L2

# Optuna import
import optuna

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# to save the scaler
import joblib

In [None]:
# Seed every random number generator used in this notebook (stdlib, NumPy, TensorFlow)
# with the same value so repeated runs start from the same state.
SEED = 5390
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Read the embeddings table.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
EMBEDDINGS_CSV = '/Users/astrid/PycharmProjects/audioset-thesis-work/audioset/vggish/embeddings/8april_looped_embeddings.csv'
dataframe = pd.read_csv(EMBEDDINGS_CSV)

# Discard the 'mean_freq' column.
dataframe = dataframe.drop(columns='mean_freq')

def assign_age_group(age, age_groups):
    """Return the name of the first age group whose range contains ``age``.

    Parameters
    ----------
    age : number
        Value to classify.
    age_groups : dict
        Maps a group name to a ``(lower, upper)`` pair. The lower bound is
        inclusive and the upper bound exclusive. Groups are tried in the
        dict's iteration order.

    Returns
    -------
    str
        The matching group name, or ``'Unknown'`` when no range contains ``age``.
    """
    matching_names = (
        name
        for name, bounds in age_groups.items()
        if bounds[0] <= age < bounds[1]
    )
    # Ages outside every defined range fall back to 'Unknown'.
    return next(matching_names, 'Unknown')

# Age brackets keyed by group name; each value is a (lower, upper) pair that
# assign_age_group treats as lower-inclusive / upper-exclusive.
age_groups = dict(
    kitten=(0, 0.5),
    adult=(0.5, 10),
    senior=(10, 20),
)

# Label every row with the bracket its 'target' value falls into.
dataframe['age_group'] = dataframe['target'].apply(
    lambda age: assign_age_group(age, age_groups)
)

# Remove every row labelled 'adult'; all other labels are kept.
is_adult = (dataframe['age_group'] == 'adult').to_numpy()
dataframe = dataframe.drop(index=dataframe.index[is_adult])

# Rows remaining per age group.
print(dataframe['age_group'].value_counts())

# Save demo rows to an external CSV file

In [None]:
# Select every row belonging to the cats reserved for the demo.
selected_cat_ids = ['108A', '109A']
demo_samples = dataframe[dataframe['cat_id'].isin(selected_cat_ids)]

# Save the selected samples to a CSV file (row index omitted).
# This line was previously missing its leading '#', which made the whole cell a SyntaxError.
demo_samples.to_csv('demo_samples.csv', index=False)

In [None]:
# Display the demo rows selected in the previous cell as a visual check.
demo_samples

## Save embeddings and labels from the demo set to .csv files

In [None]:
# Take an independent copy so that adding a column does not trigger SettingWithCopyWarning.
demo_samples = demo_samples.copy()

# Binary target: 0 for 'kitten', 1 for any other age group.
demo_samples['label'] = (demo_samples['age_group'] != 'kitten').astype('int64')

# Features are every column except the trailing five; labels come from the new column.
features = demo_samples.iloc[:, :-5].to_numpy()
labels = demo_samples['label'].to_numpy()

# Write one header-less, single-row CSV per sample: feature values followed by the label.
for i in range(len(features)):
    feature_row, label = features[i], labels[i]
    filename = f'demo_sample_{i}.csv'
    row_df = pd.DataFrame([np.append(feature_row, label)])
    row_df.to_csv(filename, index=False, header=False)
    print(f'Saved {filename}')


In [None]:
# Take an independent copy so that adding a column does not trigger SettingWithCopyWarning.
demo_samples = demo_samples.copy()

# Binary target: 0 for 'kitten', 1 for any other age group.
demo_samples['label'] = (demo_samples['age_group'] != 'kitten').astype('int64')

# Features are every column except the trailing five; labels come from the new column.
features = demo_samples.iloc[:, :-5].to_numpy()
labels = demo_samples['label'].to_numpy()

# Append the labels as one extra, final column of the feature matrix.
combined_data = np.column_stack((features, labels))
combined_df = pd.DataFrame(combined_data)

# Write all demo rows to a single header-less CSV.
combined_filename = 'combined_demo_samples.csv'
combined_df.to_csv(combined_filename, index=False, header=False)

print(f'Saved {combined_filename}')

In [None]:
# # Load the demo samples
# demo_data = pd.read_csv('/Users/astrid/Documents/Thesis/JupyterNotebooks/April/PRODUCTION-MODEL/demo_samples.csv')

# # Extract features (assuming the last four columns are not features)
# X_demo = demo_data.iloc[:, :-4].values

# # Set numpy print options to print the full array
# np.set_printoptions(threshold=np.inf)

# # Print the numpy array
# print(X_demo)

In [None]:
# Number of rows per cat_id, as a two-column frame ('cat_id', 'count').
cat_id_counts = (
    dataframe['cat_id']
    .value_counts()
    .rename_axis('cat_id')
    .reset_index(name='count')
)

# One row per distinct (cat_id, age_group) combination.
age_group_info = dataframe.loc[:, ['cat_id', 'age_group']].drop_duplicates()

# Attach the age group to each cat's row count by joining on cat_id.
cat_id_counts_with_age_group = pd.merge(cat_id_counts, age_group_info, on='cat_id')

print(cat_id_counts_with_age_group)

In [None]:
# Feature matrix: every column except the last four.
# NOTE(review): this slice is positional, so it assumes `dataframe` still has exactly
# four trailing non-feature columns when this cell runs.
X = dataframe.iloc[:, :-4].values

# Integer-encode the 'age_group' column. LabelEncoder numbers classes in sorted order.
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(dataframe['age_group'].values)

# Other cells hard-code 0 = kitten and 1 = senior and train a single-output sigmoid model.
# Any extra class (for example 'Unknown' for ages outside the defined ranges) would shift
# these codes and silently corrupt training, so fail fast instead.
expected_classes = ['kitten', 'senior']
found_classes = list(label_encoder.classes_)
if found_classes != expected_classes:
    raise ValueError(
        f"Expected age groups {expected_classes} (encoded as 0 and 1) "
        f"but found {found_classes}; filter the data before encoding."
    )

# Encoded labels used for training.
y = encoded_y

# Per-row cat identifiers, kept as a NumPy array for group-aware splitting.
groups = dataframe['cat_id'].values

In [None]:
# Standardise the features with a StandardScaler fitted on every row of X.
# NOTE(review): X still contains the rows that are later held out as the demo set,
# so their statistics influence the scaler — confirm this is intended.
scaler_full = StandardScaler()
X_scaled = scaler_full.fit_transform(X)

# Cast the integer class labels to float32 (no one-hot encoding is applied here).
y_encoded = y.astype(np.float32)

### samples for demo

In [None]:
# Cats reserved for the demo. The IDs are fixed here rather than drawn at random.
kitten_cat_id = "109A"
senior_cat_id = "108A"

# Index labels of every row that belongs to either demo cat.
is_demo_row = dataframe['cat_id'].isin([kitten_cat_id, senior_cat_id])
demo_samples = dataframe.loc[is_demo_row].index

# Translate index labels into row positions so they can address the NumPy arrays.
demo_sample_positions = dataframe.index.get_indexer(demo_samples)

# Held-out demo split.
X_demo = X_scaled[demo_sample_positions]
y_demo = y_encoded[demo_sample_positions]

# Training split: every row that was not selected for the demo.
keep_for_training = np.ones(len(X_scaled), dtype=bool)
keep_for_training[demo_sample_positions] = False
X_train_full = X_scaled[keep_for_training]
y_train_full = y_encoded[keep_for_training]

In [None]:
# Display the cat_id stored as the senior demo cat.
senior_cat_id

In [None]:
# Display the cat_id stored as the kitten demo cat.
kitten_cat_id

In [None]:
# Display demo_samples, which at this point holds the index labels of the held-out demo rows.
demo_samples

### train

In [None]:
# Stop training once the training loss stops improving. fit() is called without
# validation data in this notebook, so the callback watches 'loss' rather than 'val_loss'.
early_stopping_options = dict(
    monitor='loss',             # quantity checked after each epoch
    min_delta=0.001,            # smaller changes do not count as an improvement
    patience=30,                # epochs without improvement before stopping
    verbose=1,                  # report when stopping is triggered
    restore_best_weights=True,  # roll back to the weights of the best epoch
)
early_stopping = EarlyStopping(**early_stopping_options)

In [None]:
# Hyperparameters for the final model, gathered in one place.
LEARNING_RATE = 0.00038188800331973483
HIDDEN_UNITS = 480
DROPOUT_RATE = 0.27188281261238406
MAX_EPOCHS = 1500
BATCH_SIZE = 32

# Optimizers keyed by name; only Adamax is configured.
optimizers = {
    'Adamax': Adamax(learning_rate=LEARNING_RATE)
}

# Network: one ReLU hidden layer, batch normalisation, dropout, then a single
# sigmoid unit that outputs the probability of class 1.
n_features = X_train_full.shape[1]
model_full = Sequential()
model_full.add(Dense(HIDDEN_UNITS, activation='relu', input_shape=(n_features,)))
model_full.add(BatchNormalization())
model_full.add(Dropout(DROPOUT_RATE))
model_full.add(Dense(1, activation='sigmoid'))

optimizer = optimizers['Adamax']

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric.
model_full.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Fit on the full training split. No validation data is supplied, so stopping is
# governed by the early_stopping callback, which monitors the training loss.
history_full = model_full.fit(
    x=X_train_full,
    y=y_train_full,
    epochs=MAX_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    callbacks=[early_stopping],
)

In [None]:
# Verify which integer code each age group received.
# The pairs are assembled in a separate frame instead of adding an 'encoded_label'
# column to `dataframe`. An extra column there would shift the positional
# `iloc[:, :-4]` feature slice if the feature-extraction cell were re-run.
label_check = pd.DataFrame({
    'age_group': dataframe['age_group'].to_numpy(),
    'encoded_label': y_encoded,
})

# One row per distinct (age_group, encoded_label) pair.
unique_mappings = label_check.drop_duplicates().reset_index(drop=True)

# Print the unique mappings for verification
print("Class Encoding Verification:")
print(unique_mappings)

# TODO: change this evaluation to use the demo set instead of the training set

In [None]:
# Score the fitted model on the rows it was trained on. The model was compiled with
# a single 'accuracy' metric, so evaluate() yields the loss followed by the accuracy.
train_metrics = model_full.evaluate(x=X_train_full, y=y_train_full, verbose=0)
loss, accuracy = train_metrics
print(f"Total Training Set Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Accuracy on the data the model was fitted on.
loss, accuracy = model_full.evaluate(x=X_train_full, y=y_train_full, verbose=0)
print(f"Total Training Set Accuracy: {accuracy * 100:.2f}%")

# Accuracy on the held-out demo rows.
loss, accuracy = model_full.evaluate(x=X_demo, y=y_demo, verbose=0)
print(f"Demo Set Accuracy: {accuracy * 100:.2f}%")

# Sigmoid outputs for the demo rows.
probabilities = model_full.predict(X_demo)

# Threshold at 0.5: a score strictly above it becomes class 1, otherwise class 0.
predictions = (probabilities > 0.5).astype(int)

# Human-readable names for the two class codes.
label_map = {0: 'Kitten', 1: 'Senior'}
mapped_predictions = [label_map[predicted_code] for predicted_code in predictions[:, 0]]
mapped_actual_labels = [label_map[int(actual_code)] for actual_code in y_demo]

# One line per demo sample: predicted class, true class and raw score.
for i, (predicted, actual, score) in enumerate(
    zip(mapped_predictions, mapped_actual_labels, probabilities[:, 0])
):
    print(f"Sample {i}: Predicted={predicted}, Actual={actual}, Score={score:.4f}")


In [None]:
# Model scores for the held-out demo rows.
probabilities = model_full.predict(X_demo)

# List each sample's unrounded score next to its true label.
for i, (score_row, actual) in enumerate(zip(probabilities, y_demo)):
    print(f"Sample {i}: Probability={score_row[0]}, Actual Label={actual}")

### Save model

In [None]:
# Output locations, relative to the notebook's working directory.
scaler_path = 'scaler_full.pkl'
model_path = 'cat_age_model.keras'

# Persist the fitted scaler so the same standardisation can be applied at inference time.
joblib.dump(scaler_full, scaler_path)

# Persist the trained network.
model_full.save(model_path)