# Notebook for generating the Production Demo for Binary Age Prediction of Domestic Felines (kitten, senior)

Three cat_ids are selected that each have 7 contributions. 

Demo samples are removed from training set and model is built on remaining data. 

Demo samples are available for the production demo in https://github.com/aster-droide/age-prediction-demo-binary

In [4]:
# Standard imports
import numpy as np
import pandas as pd
import random
from datetime import datetime
from collections import Counter

# Sklearn imports
from sklearn.model_selection import train_test_split, GroupShuffleSplit, GroupKFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.inspection import permutation_importance

# Imbalanced-learn import
from imblearn.over_sampling import SMOTE

# TensorFlow and Keras imports
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization, concatenate
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adamax, AdamW
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from keras.regularizers import l1, l2, L1L2

# Optuna import
import optuna

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# to save the scaler
import joblib

In [5]:
# Set a fixed random seed for reproducibility
random.seed(5390) 
np.random.seed(5390)
tf.random.set_seed(5390)

# Load datasets
dataframe = pd.read_csv('/Users/astrid/PycharmProjects/audioset-thesis-work/audioset/vggish/embeddings/8april_looped_embeddings.csv')

dataframe.drop('mean_freq', axis=1, inplace=True)

def assign_age_group(age, age_groups):
    for group_name, age_range in age_groups.items():
        if age_range[0] <= age < age_range[1]:
            return group_name
    return 'Unknown'  # For any age that doesn't fit the defined groups

# Define age groups
age_groups = {
    'kitten': (0, 0.5),
    'adult': (0.5, 10),
    'senior': (10, 20)
}

# Create a new column for the age group
dataframe['age_group'] = dataframe['target'].apply(assign_age_group, age_groups=age_groups)

# Drop Adult
dataframe.drop(dataframe[dataframe['age_group'] == 'adult'].index, inplace=True)

print(dataframe['age_group'].value_counts())

senior    306
kitten    171
Name: age_group, dtype: int64


# save demo rows to external csv

In [6]:
# Select all rows corresponding to the specified cat_id values
selected_cat_ids = ['117A', '050A']
demo_samples = dataframe[dataframe['cat_id'].isin(selected_cat_ids)]

# Save the selected samples to a CSV file
demo_samples.to_csv('demo_samples.csv', index=False)

In [7]:
demo_samples

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,122,123,124,125,126,127,gender,target,cat_id,age_group
202,3426.396,-1453.432,-3635.5828,178.96103,306.44357,5830.955,-2044.4401,-2213.743,7086.1597,2355.9998,...,3041.8423,-527.4547,-3442.1597,4118.902,-7263.379,-1356.3002,F,18.0,117A,senior
209,3355.9521,-1303.412,-3631.605,141.45258,244.16048,5786.3228,-2031.5255,-2233.3796,7078.1445,2368.2544,...,3040.051,-640.22815,-3344.331,3989.8276,-7187.931,-1249.6372,X,0.0,050A,kitten
210,2545.754,-1022.4693,-2688.3418,83.26001,181.08517,4263.4214,-1531.0897,-1716.8507,5238.873,1717.6139,...,2250.8567,-462.13022,-2463.6191,2999.9104,-5323.4185,-910.9742,X,0.0,050A,kitten
211,2886.9104,-1155.1565,-3081.1362,104.03778,181.50655,4943.773,-1759.0626,-1962.9388,6099.0923,1992.6163,...,2565.9219,-504.01306,-2830.1445,3407.9817,-6132.166,-1061.849,X,0.0,050A,kitten
215,3212.699,-1365.4951,-3333.8115,174.9643,375.1787,5477.5283,-1932.7018,-2047.5621,6611.2944,2187.4224,...,2755.9788,-451.632,-3125.1729,3826.057,-6694.327,-1277.1831,F,18.0,117A,senior
239,3224.6812,-1306.4841,-3483.2856,155.92056,214.70952,5574.161,-1957.7963,-2125.3423,6849.415,2297.224,...,2919.4226,-584.5537,-3240.3708,3825.4539,-6930.2627,-1185.7925,X,0.0,050A,kitten
275,2827.5178,-1220.6775,-2959.4004,150.27759,115.613266,4793.8667,-1631.2704,-1904.5444,5927.4536,1959.3539,...,2474.943,-398.57608,-2907.0308,3327.128,-5999.854,-1065.6439,F,18.0,117A,senior
292,3376.4229,-1375.8116,-3668.7173,135.95068,188.34818,5808.946,-2060.1282,-2271.3206,7147.075,2407.6265,...,3057.3933,-641.9585,-3405.468,4017.4436,-7231.245,-1256.056,X,0.0,050A,kitten
339,3467.8066,-1463.304,-3717.7402,158.17169,243.69038,5969.657,-2132.1025,-2287.653,7308.6665,2475.667,...,3129.442,-651.19727,-3503.169,4131.718,-7379.838,-1321.2385,X,0.0,050A,kitten
393,3594.0679,-1586.2054,-3861.7378,175.72461,264.81836,6251.356,-2162.9236,-2404.4595,7608.1875,2509.8042,...,3204.818,-628.9015,-3751.4497,4238.127,-7670.552,-1459.2391,F,18.0,117A,senior


## save embeddings and labels from demo set to .txt

In [8]:
# Ensure the target labels are encoded as 0 for kitten and 1 for senior
demo_samples = demo_samples.copy()  # Avoid SettingWithCopyWarning
demo_samples['label'] = demo_samples['age_group'].apply(lambda x: 0 if x == 'kitten' else 1)

# Extract features and labels
features = demo_samples.iloc[:, :-5].values
labels = demo_samples['label'].values

# Save each row to a separate .csv file
for i, (feature_row, label) in enumerate(zip(features, labels)):
    # Create a DataFrame for the current row
    row_df = pd.DataFrame([np.append(feature_row, label)])
    
    # Create a filename
    filename = f'demo_sample_{i}.csv'
    
    # Save to .csv file
    row_df.to_csv(filename, index=False, header=False)
    
    print(f'Saved {filename}')


Saved demo_sample_0.csv
Saved demo_sample_1.csv
Saved demo_sample_2.csv
Saved demo_sample_3.csv
Saved demo_sample_4.csv
Saved demo_sample_5.csv
Saved demo_sample_6.csv
Saved demo_sample_7.csv
Saved demo_sample_8.csv
Saved demo_sample_9.csv
Saved demo_sample_10.csv
Saved demo_sample_11.csv
Saved demo_sample_12.csv
Saved demo_sample_13.csv


In [9]:
# Ensure the target labels are encoded as 0 for kitten and 1 for senior
demo_samples = demo_samples.copy()  # Avoid SettingWithCopyWarning
demo_samples['label'] = demo_samples['age_group'].apply(lambda x: 0 if x == 'kitten' else 1)

# Extract features and labels
features = demo_samples.iloc[:, :-5].values
labels = demo_samples['label'].values

# Combine features and labels into a single DataFrame
combined_data = np.hstack((features, labels.reshape(-1, 1)))
combined_df = pd.DataFrame(combined_data)

# Create a filename for the combined CSV file
combined_filename = 'combined_demo_samples.csv'

# Save the combined data to a single CSV file
combined_df.to_csv(combined_filename, index=False, header=False)

print(f'Saved {combined_filename}')

Saved combined_demo_samples.csv


In [10]:
# # Load the demo samples
# demo_data = pd.read_csv('/Users/astrid/Documents/Thesis/JupyterNotebooks/April/PRODUCTION-MODEL/demo_samples.csv')

# # Extract features (assuming the last four columns are not features)
# X_demo = demo_data.iloc[:, :-4].values

# # Set numpy print options to print the full array
# np.set_printoptions(threshold=np.inf)

# # Print the numpy array
# print(X_demo)

In [11]:
# Count the occurrences of each cat_id
cat_id_counts = dataframe['cat_id'].value_counts().reset_index()
cat_id_counts.columns = ['cat_id', 'count']

# Merge with the age group information
age_group_info = dataframe[['cat_id', 'age_group']].drop_duplicates()
cat_id_counts_with_age_group = cat_id_counts.merge(age_group_info, on='cat_id')

# Display the result
print(cat_id_counts_with_age_group)

   cat_id  count age_group
0    046A     63    kitten
1    103A     33    senior
2    047A     28    kitten
3    057A     27    senior
4    055A     20    senior
5    097A     16    senior
6    101A     15    senior
7    001A     14    senior
8    106A     14    senior
9    059A     14    senior
10   042A     14    kitten
11   111A     13    kitten
12   028A     13    senior
13   039A     12    senior
14   116A     12    senior
15   051A     12    senior
16   025A     11    senior
17   016A     10    senior
18   014B     10    kitten
19   040A     10    kitten
20   051B      9    senior
21   015A      9    senior
22   045A      9    kitten
23   094A      8    senior
24   117A      7    senior
25   050A      7    kitten
26   053A      6    senior
27   008A      6    senior
28   108A      6    senior
29   109A      6    kitten
30   044A      5    kitten
31   025C      5    senior
32   104A      4    senior
33   056A      3    senior
34   058A      3    senior
35   113A      3    senior
3

In [12]:
# Separate features and labels
X = dataframe.iloc[:, :-4].values  # all columns except the last four

# Encode the 'age_group' column as integers using LabelEncoder
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(dataframe['age_group'].values)

# Use the encoded labels for splitting and one-hot encoding
y = encoded_y  

# Convert 'cat_id' column to numpy array to be used as groups array for GroupKFold
groups = dataframe['cat_id'].values

In [13]:
# Scale the features using StandardScaler
scaler_full = StandardScaler().fit(X)
X_scaled = scaler_full.transform(X)

# Encode labels using one-hot encoding
y_encoded = y.astype('float32')

### samples for demo

In [14]:
# Sample one cat_id for each age group
# kitten_cat_id = dataframe[dataframe['age_group'] == 'kitten']['cat_id'].sample(1, random_state=42).iloc[0]
# senior_cat_id = dataframe[dataframe['age_group'] == 'senior']['cat_id'].sample(1, random_state=42).iloc[0]

kitten_cat_id = "050A"
senior_cat_id = "117A"


# Select all rows corresponding to the sampled cat_id values
demo_samples = dataframe[(dataframe['cat_id'] == kitten_cat_id) | (dataframe['cat_id'] == senior_cat_id)].index

# Convert dataframe indices to positional indices
demo_sample_positions = dataframe.index.get_indexer(demo_samples)

# Separate demonstration samples using positional indices
X_demo = X_scaled[demo_sample_positions]
y_demo = y_encoded[demo_sample_positions]

# Remove demonstration samples from the training set
X_train_full = np.delete(X_scaled, demo_sample_positions, axis=0)
y_train_full = np.delete(y_encoded, demo_sample_positions, axis=0)

In [15]:
senior_cat_id

'117A'

In [16]:
kitten_cat_id

'050A'

In [17]:
demo_samples

Int64Index([202, 209, 210, 211, 215, 239, 275, 292, 339, 393, 528, 551, 582,
            858],
           dtype='int64')

### train

In [18]:
# EarlyStopping callback: monitor 'loss' instead of 'val_loss' for the test set
early_stopping = EarlyStopping(
    monitor='loss',  
    min_delta=0.001, 
    patience=30,  
    verbose=1,  
    restore_best_weights=True  
)

In [19]:
optimizers = {
    'Adamax': Adamax(learning_rate=0.003109800273709165)
}

# Full model definition with dynamic number of layers
model_full = Sequential()
model_full.add(Dense(128, activation='relu', input_shape=(X_train_full.shape[1],))) 
model_full.add(BatchNormalization())
model_full.add(Dropout(0.44571035356880917))  
model_full.add(Dense(1, activation='sigmoid'))  

optimizer = optimizers['Adamax']  # optimizer_key from parameters

# Compile the model
model_full.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model on the full training set
history_full = model_full.fit(X_train_full, y_train_full, epochs=1500, batch_size=16,
                              verbose=1, callbacks=[early_stopping])



Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
Epoch 9/1500
Epoch 10/1500
Epoch 11/1500
Epoch 12/1500
Epoch 13/1500
Epoch 14/1500
Epoch 15/1500
Epoch 16/1500
Epoch 17/1500
Epoch 18/1500
Epoch 19/1500
Epoch 20/1500
Epoch 21/1500
Epoch 22/1500
Epoch 23/1500
Epoch 24/1500
Epoch 25/1500
Epoch 26/1500
Epoch 27/1500
Epoch 28/1500
Epoch 29/1500
Epoch 30/1500
Epoch 31/1500
Epoch 32/1500
Epoch 33/1500
Epoch 34/1500
Epoch 35/1500
Epoch 36/1500
Epoch 37/1500
Epoch 38/1500
Epoch 39/1500
Epoch 40/1500
Epoch 41/1500
Epoch 42/1500
Epoch 43/1500
Epoch 44/1500
Epoch 45/1500
Epoch 46/1500
Epoch 47/1500
Epoch 48/1500
Epoch 49/1500
Epoch 50/1500
Epoch 51/1500
Epoch 52/1500
Epoch 53/1500
Epoch 54/1500
Epoch 55/1500
Epoch 56/1500
Epoch 57/1500
Epoch 58/1500
Epoch 59/1500
Epoch 60/1500
Epoch 61/1500
Epoch 62/1500
Epoch 63/1500
Epoch 64/1500
Epoch 65/1500
Epoch 66/1500
Epoch 67/1500
Epoch 68/1500
Epoch 69/1500
Epoch 70/1500
Epoch 71/1500
Epoch 72/1500
E

In [20]:
# verify encoded labels
dataframe['encoded_label'] = y_encoded

# Drop duplicates to find unique mappings
unique_mappings = dataframe[['age_group', 'encoded_label']].drop_duplicates().reset_index(drop=True)

# Print the unique mappings for verification
print("Class Encoding Verification:")
print(unique_mappings)

Class Encoding Verification:
  age_group  encoded_label
0    kitten            0.0
1    senior            1.0


# to do change this to demo set instead of training

In [21]:
# Evaluate model on training set to get total accuracy
loss, accuracy = model_full.evaluate(X_train_full, y_train_full, verbose=0)
print(f"Total Training Set Accuracy: {accuracy * 100:.2f}%")

Total Training Set Accuracy: 99.57%


In [22]:
# Evaluate the model on the training set to get total accuracy
loss, accuracy = model_full.evaluate(X_train_full, y_train_full, verbose=0)
print(f"Total Training Set Accuracy: {accuracy * 100:.2f}%")

# Evaluate the model on the demo set to get accuracy
loss, accuracy = model_full.evaluate(X_demo, y_demo, verbose=0)
print(f"Demo Set Accuracy: {accuracy * 100:.2f}%")

# Predict probabilities for the demo samples
probabilities = model_full.predict(X_demo)

# Convert probabilities to binary predictions
predictions = (probabilities > 0.5).astype(int)

# Map predictions and actual labels to "Kitten" or "Senior"
label_map = {0: 'Kitten', 1: 'Senior'}
mapped_predictions = [label_map[pred[0]] for pred in predictions]
mapped_actual_labels = [label_map[int(label)] for label in y_demo]

# Print out the probabilities along with actual labels and predictions
for i in range(len(probabilities)):
    print(f"Sample {i}: Predicted={mapped_predictions[i]}, Actual={mapped_actual_labels[i]}, Score={probabilities[i][0]:.4f}")


Total Training Set Accuracy: 99.57%
Demo Set Accuracy: 92.86%
Sample 0: Predicted=Senior, Actual=Senior, Score=0.9992
Sample 1: Predicted=Kitten, Actual=Kitten, Score=0.1999
Sample 2: Predicted=Senior, Actual=Kitten, Score=0.9878
Sample 3: Predicted=Kitten, Actual=Kitten, Score=0.0675
Sample 4: Predicted=Senior, Actual=Senior, Score=1.0000
Sample 5: Predicted=Kitten, Actual=Kitten, Score=0.3152
Sample 6: Predicted=Senior, Actual=Senior, Score=0.9998
Sample 7: Predicted=Kitten, Actual=Kitten, Score=0.0923
Sample 8: Predicted=Kitten, Actual=Kitten, Score=0.0546
Sample 9: Predicted=Senior, Actual=Senior, Score=0.9738
Sample 10: Predicted=Senior, Actual=Senior, Score=0.9995
Sample 11: Predicted=Senior, Actual=Senior, Score=0.9924
Sample 12: Predicted=Kitten, Actual=Kitten, Score=0.0839
Sample 13: Predicted=Senior, Actual=Senior, Score=0.9994


In [23]:
# Predict probabilities for the demonstration samples
probabilities = model_full.predict(X_demo)

# Print out the probabilities along with actual labels
for i in range(len(probabilities)):
    print(f"Sample {i}: Probability={probabilities[i][0]}, Actual Label={y_demo[i]}")

Sample 0: Probability=0.9992295503616333, Actual Label=1.0
Sample 1: Probability=0.19993403553962708, Actual Label=0.0
Sample 2: Probability=0.9877561330795288, Actual Label=0.0
Sample 3: Probability=0.06748417019844055, Actual Label=0.0
Sample 4: Probability=0.9999938011169434, Actual Label=1.0
Sample 5: Probability=0.31522753834724426, Actual Label=0.0
Sample 6: Probability=0.9997661709785461, Actual Label=1.0
Sample 7: Probability=0.09229150414466858, Actual Label=0.0
Sample 8: Probability=0.05456143617630005, Actual Label=0.0
Sample 9: Probability=0.9737595915794373, Actual Label=1.0
Sample 10: Probability=0.999523401260376, Actual Label=1.0
Sample 11: Probability=0.9924240112304688, Actual Label=1.0
Sample 12: Probability=0.08393140882253647, Actual Label=0.0
Sample 13: Probability=0.9993523955345154, Actual Label=1.0


### Save model

In [24]:
# Save the StandardScaler
joblib.dump(scaler_full, 'scaler_full.pkl')

# Save the trained model
model_full.save('cat_age_model.keras')