Necessary imports

In [9]:
import pandas as pd
import numpy as np
!pip install gdown
import gdown
import zipfile
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import  f1_score, classification_report, confusion_matrix



Loading Data

In [10]:
# Google Drive file ID and the local filename the archive is saved under
file_id = '15qfdoxXF_6QbHpIKX6cdLfhn5EqykrVz'
destination = 'downloaded_file.zip'

# Download only when no usable archive is already on disk, so re-running this
# cell does not fetch the file again. zipfile.is_zipfile() returns False both
# for a missing file and for a file that is not a zip archive (for example an
# interrupted download), so either case triggers a fresh download.
if not zipfile.is_zipfile(destination):
    downloaded_path = gdown.download(
        f'https://drive.google.com/uc?id={file_id}', destination, quiet=False)
    # Fail here with a clear message instead of letting ZipFile raise a less
    # helpful error further down.
    if downloaded_path is None or not zipfile.is_zipfile(destination):
        raise RuntimeError(
            f"Could not download a valid zip archive to {destination!r}.")

# Unpack the zip file
with zipfile.ZipFile(destination, 'r') as zip_ref:
    zip_ref.extractall('unzipped_content')

print("Download and extraction complete.")

Downloading...
From (original): https://drive.google.com/uc?id=15qfdoxXF_6QbHpIKX6cdLfhn5EqykrVz
From (redirected): https://drive.google.com/uc?id=15qfdoxXF_6QbHpIKX6cdLfhn5EqykrVz&confirm=t&uuid=b6ace14b-4909-4fe0-9727-64f71de65e47
To: /content/downloaded_file.zip
100%|██████████| 560M/560M [00:11<00:00, 48.1MB/s]


Download and extraction complete.


Data information

In [11]:
# Read the annotation table. The path is relative, matching the relative
# 'unzipped_content' folder the archive is extracted into, so the notebook
# does not depend on the working directory being /content.
train_data = pd.read_csv('unzipped_content/data/train.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35332 entries, 0 to 35331
Data columns (total 49 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Filename                35332 non-null  object
 1   Identity                35332 non-null  int64 
 2   Male                    35332 non-null  int64 
 3   Young                   35332 non-null  int64 
 4   Middle_Aged             35332 non-null  int64 
 5   Senior                  35332 non-null  int64 
 6   Asian                   35332 non-null  int64 
 7   White                   35332 non-null  int64 
 8   Black                   35332 non-null  int64 
 9   Rosy_Cheeks             35332 non-null  int64 
 10  Shiny_Skin              35332 non-null  int64 
 11  Bald                    35332 non-null  int64 
 12  Wavy_Hair               35332 non-null  int64 
 13  Receding_Hairline       35332 non-null  int64 
 14  Bangs                   35332 non-null  int64 
 15  Si

Load and Preprocess images

In [12]:
# Folder with the extracted images. Relative, like the 'unzipped_content'
# extraction target, so it resolves against the same working directory.
image_directory = 'unzipped_content/data/image_data'

# Remap the attribute encoding from {-1, 1} to {0, 1} so the columns can be
# used directly as class labels. Any value without an entry in the mapping
# becomes NaN.
# NOTE: this overwrites the columns in place, so running the cell a second
# time would turn the already-remapped 0s into NaN; reload train_data first.
age_columns = ['Young', 'Middle_Aged', 'Senior']
for col in ['Male'] + age_columns:
    train_data[col] = train_data[col].map({1: 1, -1: 0})

# Collapse the three age indicator columns into a single integer class id
def age_label(row):
    """Return the age class of one row.

    The indicator columns are checked in the order Young, Middle_Aged, Senior
    and the position of the first one equal to 1 is returned (0, 1 or 2).
    If none of them equals 1 the row has no age label and -1 is returned.
    """
    for class_id, column in enumerate(('Young', 'Middle_Aged', 'Senior')):
        if row[column] == 1:
            return class_id
    return -1  # Undefined

# Attach one integer age class per row (-1 when no age indicator is set)
train_data['AgeLabel'] = train_data.apply(age_label, axis=1)

# Keep only rows that can be used for training: a defined age class AND a
# gender label that is exactly 0 or 1. A NaN in 'Male' would not raise in
# `.values.astype(int)` below; it would be cast to a meaningless integer and
# silently show up as an extra class in the split and the oversampling.
has_age = train_data['AgeLabel'] != -1
has_gender = train_data['Male'].isin([0, 1])
train_data = train_data[has_age & has_gender]

# Inputs are the image filenames; targets are the two integer label vectors
X = train_data['Filename'].values
y_gender = train_data['Male'].values.astype(int)
y_age = train_data['AgeLabel'].values.astype(int)

# Split into train and validation (80/20), once stratified by gender and once
# stratified by age. Each splitter is configured for a single split, so the
# first (only) pair of index arrays is taken directly.
# NOTE(review): rows are split per image; the table also has an 'Identity'
# column, so the same person may end up on both sides - confirm this is intended.
sss_gender = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=33)
train_idx, val_idx = next(sss_gender.split(X, y_gender))
X_train_gender, y_train_gender = X[train_idx], y_gender[train_idx]
X_val_gender, y_val_gender = X[val_idx], y_gender[val_idx]

sss_age = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=33)
train_idx, val_idx = next(sss_age.split(X, y_age))
X_train_age, y_train_age = X[train_idx], y_age[train_idx]
X_val_age, y_val_age = X[val_idx], y_age[val_idx]

# Balance the training classes by random oversampling. The sampler is given
# the filenames as a single-column 2-D array, so they are reshaped on the way
# in and flattened back to 1-D afterwards. Validation data is left untouched.
ros_gender = RandomOverSampler(random_state=33)
gender_resampled = ros_gender.fit_resample(
    X_train_gender.reshape(-1, 1), y_train_gender)
X_train_gender_res = gender_resampled[0].ravel()
y_train_gender_res = gender_resampled[1]

ros_age = RandomOverSampler(random_state=33)
age_resampled = ros_age.fit_resample(
    X_train_age.reshape(-1, 1), y_train_age)
X_train_age_res = age_resampled[0].ravel()
y_train_age_res = age_resampled[1]

def load_and_preprocess_image(filename):
    """Load one image from `image_directory` as a 64x64, 3-channel float tensor.

    The file is read, decoded as JPEG with 3 channels, resized to 64x64 and
    divided by 255 to normalize the pixel values to [0, 1].
    """
    path = tf.strings.join([image_directory, filename], separator="/")
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, [64, 64])
    return resized / 255.0  # Normalize to [0,1]


Feature engineering

In [13]:
# Augmentation
def augment(image):
    """Apply a chain of small random photometric changes to one image.

    In order: random horizontal flip, brightness shift (up to 0.1), contrast
    and saturation scaling (each within 0.9-1.1) and hue shift (up to 0.02).
    The result is clipped back into [0, 1] before it is returned.
    """
    flipped = tf.image.random_flip_left_right(image)
    brightened = tf.image.random_brightness(flipped, max_delta=0.1)
    contrasted = tf.image.random_contrast(brightened, lower=0.9, upper=1.1)
    saturated = tf.image.random_saturation(contrasted, lower=0.9, upper=1.1)
    hue_shifted = tf.image.random_hue(saturated, max_delta=0.02)
    return tf.clip_by_value(hue_shifted, 0.0, 1.0)

# Shared input pipeline used by both the gender and the age task
def _prepare_dataset(filenames, labels, training, batch_size=32):
    """Build a batched tf.data pipeline of (image, label) pairs.

    Args:
        filenames: image filenames, passed to `load_and_preprocess_image`.
        labels: one label per filename.
        training: when true, the (filename, label) pairs are shuffled with a
            buffer as large as the whole set and each loaded image goes
            through `augment`; when false, order is kept and no augmentation
            is applied.
        batch_size: number of examples per batch.

    Returns:
        A `tf.data.Dataset` yielding batches of (image, label), with prefetching.
    """
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    if training:
        # Shuffle the cheap (filename, label) pairs before any image is loaded
        dataset = dataset.shuffle(len(filenames))
    dataset = dataset.map(lambda x, y: (load_and_preprocess_image(x), y),
                          num_parallel_calls=tf.data.AUTOTUNE)
    if training:
        dataset = dataset.map(lambda x, y: (augment(x), y),
                              num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# preprocessing the images for gender training
def prepare_dataset_gender(filenames, labels, training=True):
    """Return the batched image dataset for the gender task.

    See `_prepare_dataset`; `training=True` enables shuffling and augmentation.
    """
    return _prepare_dataset(filenames, labels, training)


# preprocessing the images for age training
def prepare_dataset_age(filenames, labels, training=True):
    """Return the batched image dataset for the age task.

    See `_prepare_dataset`; `training=True` enables shuffling and augmentation.
    """
    return _prepare_dataset(filenames, labels, training)

# Build the input pipelines: oversampled data with shuffling/augmentation for
# training, the untouched validation split without either for evaluation.
train_dataset_gender = prepare_dataset_gender(
    filenames=X_train_gender_res, labels=y_train_gender_res, training=True)
val_dataset_gender = prepare_dataset_gender(
    filenames=X_val_gender, labels=y_val_gender, training=False)

train_dataset_age = prepare_dataset_age(
    filenames=X_train_age_res, labels=y_train_age_res, training=True)
val_dataset_age = prepare_dataset_age(
    filenames=X_val_age, labels=y_val_age, training=False)

Building CNNs

In [14]:
def _build_cnn(num_outputs, output_activation, input_shape=(64, 64, 3)):
    """Build the small CNN shared by the gender and the age classifier.

    Two blocks of Conv2D (3x3, ReLU) -> MaxPooling2D(2) -> BatchNormalization
    with 32 and 64 filters, then Flatten, an L2-regularized Dense(64, ReLU),
    Dropout(0.5) and the output layer.

    Args:
        num_outputs: number of units in the output layer.
        output_activation: activation of the output layer.
        input_shape: shape of one input image.

    Returns:
        An uncompiled Keras `Model`.
    """
    inputs = tf.keras.Input(shape=input_shape)
    x = inputs
    for filters in (32, 64):
        x = layers.Conv2D(filters, (3, 3), activation='relu')(x)
        x = layers.MaxPooling2D(2)(x)
        x = layers.BatchNormalization()(x)
    x = layers.Flatten()(x)
    x = layers.Dense(64, activation='relu',
                     kernel_regularizer=tf.keras.regularizers.l2(0.001))(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_outputs, activation=output_activation)(x)
    return Model(inputs, outputs)


def create_gender_model():
    """Return the gender classifier: the shared CNN with one sigmoid output unit."""
    return _build_cnn(1, 'sigmoid')


def create_age_model():
    """Return the age classifier: the shared CNN with a 3-way softmax output."""
    return _build_cnn(3, 'softmax')

# Instantiate one independent network per task (gender first, then age)
gender_model, age_model = create_gender_model(), create_age_model()

Loss functions

In [15]:
def binary_focal_loss(gamma=2.0, alpha=0.25):
    """Build a focal loss for binary labels and sigmoid probabilities.

    Per element the loss is ``-alpha * (1 - pt) ** gamma * log(pt)``, where
    ``pt`` is the predicted probability of the true class (``y_pred`` for
    label 1, ``1 - y_pred`` otherwise). The result is the mean over all elements.

    Args:
        gamma: exponent of the ``(1 - pt)`` factor; larger values shrink the
            loss of well-classified examples more strongly.
        alpha: constant factor applied to every element. Note that it is the
            same for both classes here, so it only rescales the loss.

    Returns:
        A ``loss(y_true, y_pred)`` callable returning a scalar tensor.
    """
    def loss(y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        # Give the labels exactly the shape of the predictions. Without this,
        # labels of shape (batch,) against predictions of shape (batch, 1)
        # would broadcast to (batch, batch) and average over wrong pairs.
        y_true = tf.reshape(y_true, tf.shape(y_pred))
        epsilon = 1e-7
        # Keep probabilities away from 0 and 1 so the logarithm stays finite
        y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
        pt = tf.where(tf.equal(y_true, 1), y_pred, 1 - y_pred)
        focal = -alpha * tf.pow(1. - pt, gamma) * tf.math.log(pt)
        return tf.reduce_mean(focal)
    return loss
def sparse_categorical_focal_loss(gamma=2.0, alpha=0.25):
    """Build a focal loss for integer class labels and softmax probabilities.

    Per sample the loss is ``-alpha * (1 - p) ** gamma * log(p)``, where ``p``
    is the predicted probability of the true class. The result is the mean
    over the samples.

    Args:
        gamma: exponent of the ``(1 - p)`` factor; larger values shrink the
            loss of well-classified examples more strongly.
        alpha: constant factor applied to every sample's loss.

    Returns:
        A ``loss(y_true, y_pred)`` callable returning a scalar tensor.
        ``y_pred`` is expected to hold one probability per class along its
        last axis.
    """
    def loss(y_true, y_pred):
        # Flatten the labels so both (batch,) and (batch, 1) inputs one-hot
        # to (batch, classes) instead of picking up an extra axis.
        class_ids = tf.reshape(tf.cast(y_true, tf.int32), [-1])
        # Take the number of classes from the predictions rather than
        # hard-coding it, so the loss works for any output width.
        num_classes = tf.shape(y_pred)[-1]
        y_true_one_hot = tf.one_hot(class_ids, depth=num_classes)
        # Keep probabilities away from 0 and 1 so the logarithm stays finite
        y_pred = tf.clip_by_value(y_pred, 1e-7, 1. - 1e-7)
        cross_entropy = -y_true_one_hot * tf.math.log(y_pred)
        weights = alpha * tf.pow(1 - y_pred, gamma) * y_true_one_hot
        focal = weights * cross_entropy
        # Only the true-class entry is non-zero, so the sum selects it
        return tf.reduce_mean(tf.reduce_sum(focal, axis=-1))
    return loss

Compiling and training the models

In [16]:
# Compile each model with its own focal loss. Both use the Adam optimizer
# with a learning rate of 0.0001 and report accuracy while training.
gender_model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss=binary_focal_loss(),
    metrics=['accuracy'],
)

age_model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss=sparse_categorical_focal_loss(),
    metrics=['accuracy'],
)

# Stop training once val_loss has not improved for 5 epochs and restore the
# best weights. The same callback object is passed to both fit() calls.
# NOTE(review): the age model trains for only 5 epochs, so with patience=5
# the callback presumably never stops that run early - confirm this is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train gender model (at most 10 epochs)
history_gender = gender_model.fit(
    train_dataset_gender,
    validation_data=val_dataset_gender,
    epochs=10,
    callbacks=[early_stopping],
)

# Train age model (at most 5 epochs)
history_age = age_model.fit(
    train_dataset_age,
    validation_data=val_dataset_age,
    epochs=5,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 256ms/step - accuracy: 0.7546 - loss: 0.1809 - val_accuracy: 0.8658 - val_loss: 0.1288
Epoch 2/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 235ms/step - accuracy: 0.8505 - loss: 0.1249 - val_accuracy: 0.8779 - val_loss: 0.1050
Epoch 3/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 219ms/step - accuracy: 0.8766 - loss: 0.1005 - val_accuracy: 0.8932 - val_loss: 0.0851
Epoch 4/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 215ms/step - accuracy: 0.8976 - loss: 0.0802 - val_accuracy: 0.9042 - val_loss: 0.0679
Epoch 5/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 212ms/step - accuracy: 0.9147 - loss: 0.0628 - val_accuracy: 0.9044 - val_loss: 0.0557
Epoch 6/10
[1m652/652[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 206ms/step - accuracy: 0.9257 - loss: 0.0497 - val_accuracy: 0.9154 - val_loss: 0.0438
Epoc

Evaluate the models on the validation set

In [17]:
# Gender: the model outputs one probability per image; threshold it at 0.5 to
# get hard 0/1 predictions. The validation dataset is built without shuffling,
# so the predictions line up with y_val_gender.
gender_preds = gender_model.predict(val_dataset_gender)
gender_preds_binary = (gender_preds.flatten() > 0.5).astype(int)
f1_gender = f1_score(y_val_gender, gender_preds_binary)
print(f"F1 Score for Gender: {f1_gender}")
print("Classification Report for Gender:",
      classification_report(y_val_gender, gender_preds_binary), sep="\n")
print("Confusion Matrix for Gender:",
      confusion_matrix(y_val_gender, gender_preds_binary), sep="\n")

# Age: pick the class with the highest predicted probability; the F1 score is
# averaged over the classes weighted by their support.
age_preds = age_model.predict(val_dataset_age)
age_preds_class = age_preds.argmax(axis=1)
f1_age = f1_score(y_val_age, age_preds_class, average='weighted')
print(f"F1 Score for Age: {f1_age}")
print("Classification Report for Age:",
      classification_report(y_val_age, age_preds_class), sep="\n")
print("Confusion Matrix for Age:",
      confusion_matrix(y_val_age, age_preds_class), sep="\n")


[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 67ms/step
F1 Score for Gender: 0.9092409240924092
Classification Report for Gender:
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      2605
           1       0.88      0.93      0.91      1768

    accuracy                           0.92      4373
   macro avg       0.92      0.93      0.92      4373
weighted avg       0.93      0.92      0.92      4373

Confusion Matrix for Gender:
[[2390  215]
 [ 115 1653]]
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 70ms/step
F1 Score for Age: 0.7983732824963466
Classification Report for Age:
              precision    recall  f1-score   support

           0       0.96      0.80      0.87      3070
           1       0.49      0.71      0.58       760
           2       0.61      0.81      0.69       543

    accuracy                           0.78      4373
   macro avg       0.69      0.77      0.72      4