In [1]:
!pip install -q tensorflow tensorflow-addons tensorflow-hub tensorflow-datasets
!pip install -q gdown matplotlib seaborn
!pip install keras-facenet

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers, losses, metrics, Model
import tensorflow_hub as hub
from keras_facenet import FaceNet
import pandas as pd
import numpy as np
import os
from pathlib import Path
import zipfile
import gdown
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive, files
import warnings
warnings.filterwarnings('ignore')



In [2]:
drive.mount('/content/drive')
DATASET_PATH = "/content/drive/MyDrive/datasets/FairFace"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:

if os.path.exists(DATASET_PATH):
    print("Path exists.")
else:
    print("Path does not exist.")

# Verify dataset structure
expected_files = [
    "train_labels.csv",
    "val_labels.csv",
    "train/",
    "val/"
]

print(f"Loading dataset from: {DATASET_PATH}")

# Check if all required files/folders exist
missing_files = []
for item in expected_files:
    if not os.path.exists(os.path.join(DATASET_PATH, item)):
        missing_files.append(item)

if missing_files:
    raise FileNotFoundError(
        f"Dataset incomplete. Missing: {missing_files}\n"
        f"Expected structure:\n"
        f"{DATASET_PATH}/\n"
        f"├── train_labels.csv\n"
        f"├── val_labels.csv\n"
        f"├── train/ [contains images]\n"
        f"└── val/   [contains images]"
    )


# Set paths for data loading
TRAIN_CSV_PATH = os.path.join(DATASET_PATH, "train_labels.csv")
VAL_CSV_PATH = os.path.join(DATASET_PATH, "val_labels.csv")
TRAIN_IMG_DIR = os.path.join(DATASET_PATH, "train")
VAL_IMG_DIR = os.path.join(DATASET_PATH, "val")

print("\nDataset structure verified:")
print(f"Training CSV:   {TRAIN_CSV_PATH}")
print(f"Validation CSV: {VAL_CSV_PATH}")
print(f"Training images: {TRAIN_IMG_DIR} ({len(os.listdir(TRAIN_IMG_DIR))} files)")
print(f"Validation images: {VAL_IMG_DIR} ({len(os.listdir(VAL_IMG_DIR))} files)")

Path exists.
Loading dataset from: /content/drive/MyDrive/datasets/FairFace

Dataset structure verified:
Training CSV:   /content/drive/MyDrive/datasets/FairFace/train_labels.csv
Validation CSV: /content/drive/MyDrive/datasets/FairFace/val_labels.csv
Training images: /content/drive/MyDrive/datasets/FairFace/train (26399 files)
Validation images: /content/drive/MyDrive/datasets/FairFace/val (0 files)


In [4]:
class FairFaceDataProcessor:
    def __init__(self, csv_path, img_dir, img_size=160, sample_size=None, is_validation=False):
        self.img_size = img_size
        self.img_dir = img_dir
        self.is_validation = is_validation

        # Load CSV data
        self.df = pd.read_csv(csv_path)

        # Sample data if specified
        if sample_size and sample_size < len(self.df):
            self.df = self.df.sample(n=sample_size, random_state=42).reset_index(drop=True)
            print(f"Sampled {'validation' if is_validation else 'training'}: {len(self.df)} samples")

        # Initialize label encoders only once (use training data to fit)
        if not hasattr(self, 'age_encoder'):
            self.age_encoder = LabelEncoder()
            self.gender_encoder = LabelEncoder()
            self.race_encoder = LabelEncoder()

            # Fit encoders on this dataset
            self.age_encoder.fit(self.df['age'])
            self.gender_encoder.fit(self.df['gender'])
            self.race_encoder.fit(self.df['race'])

        # Encode labels
        self.df['age_encoded'] = self.age_encoder.transform(self.df['age'])
        self.df['gender_encoded'] = self.gender_encoder.transform(self.df['gender'])
        self.df['race_encoded'] = self.race_encoder.transform(self.df['race'])

        self.num_classes = {
            'age': len(self.age_encoder.classes_),
            'gender': len(self.gender_encoder.classes_),
            'race': len(self.race_encoder.classes_)
        }

        print(f"Classes - Age: {self.num_classes['age']}, Gender: {self.num_classes['gender']}, Race: {self.num_classes['race']}")
        if not is_validation:  # Only print once
            print(f"Age groups: {list(self.age_encoder.classes_)}")
            print(f"Gender groups: {list(self.gender_encoder.classes_)}")
            print(f"Race groups: {list(self.race_encoder.classes_)}")

    def load_and_preprocess_image(self, image_path, augment=False):
        """Load and preprocess single image"""
        try:
            # Check if file exists
            if not tf.io.gfile.exists(image_path):
                print(f"Warning: File not found: {image_path}")
                return tf.zeros([self.img_size, self.img_size, 3], dtype=tf.float32)

            image = tf.io.read_file(image_path)
            image = tf.image.decode_image(image, channels=3)
            image = tf.image.resize(image, [self.img_size, self.img_size])
            image = tf.cast(image, tf.float32) / 255.0

            if augment:
                # Data augmentation
                image = tf.image.random_flip_left_right(image)
                image = tf.image.random_brightness(image, 0.1)
                image = tf.image.random_contrast(image, 0.9, 1.1)
                image = tf.image.random_saturation(image, 0.9, 1.1)
                image = tf.image.random_hue(image, 0.05)

            # Normalize using ImageNet statistics (for FaceNet compatibility)
            image = tf.image.per_image_standardization(image)
            return image
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            # Return black image if file not found
            return tf.zeros([self.img_size, self.img_size, 3], dtype=tf.float32)

    def create_dataset(self, batch_size=32, augment=False, shuffle=True):
        """Create TensorFlow dataset"""
        def generator():
            indices = np.arange(len(self.df))
            if shuffle:
                np.random.shuffle(indices)

            for idx in indices:
                row = self.df.iloc[idx]
                # Fix the path construction - don't add extra subdirectory
                img_path = os.path.join(self.img_dir, row['file'])

                # Debug: Print first few paths to verify
                if idx < 3:
                    print(f"Loading image from: {img_path}")
                    print(f"File exists: {os.path.exists(img_path)}")

                image = self.load_and_preprocess_image(img_path, augment)

                yield (
                    image,
                    {
                        'age_output': tf.cast(row['age_encoded'], dtype=tf.int32),
                        'gender_output': tf.cast(row['gender_encoded'], dtype=tf.int32),
                        'race_output': tf.cast(row['race_encoded'], dtype=tf.int32)
                    }
                )

        # Create dataset
        dataset = tf.data.Dataset.from_generator(
            generator,
            output_signature=(
                tf.TensorSpec(shape=(self.img_size, self.img_size, 3), dtype=tf.float32),
                {
                    'age_output': tf.TensorSpec(shape=(), dtype=tf.int32),
                    'gender_output': tf.TensorSpec(shape=(), dtype=tf.int32),
                    'race_output': tf.TensorSpec(shape=(), dtype=tf.int32)
                }
            )
        )

        if shuffle:
            dataset = dataset.shuffle(buffer_size=1000)

        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        return dataset

In [5]:
class FaceNetMultiTask(tf.keras.Model):
    def __init__(self, num_age_classes, num_gender_classes, num_race_classes,
                 freeze_backbone=False):
        super(FaceNetMultiTask, self).__init__()

        #  Load FaceNet backbone
        # self.facenet = FaceNet()
        # self.facenet_model = self.facenet.model

        # # Remove the final layer to get embeddings
        # self.backbone = Model(
        #     inputs=self.facenet_model.input,
        #     outputs=self.facenet_model.layers[-2].output  # Get embeddings before final layer
        # )

        self.backbone = FaceNet().model

        # Freeze backbone if specified
        if freeze_backbone:
            self.backbone.trainable = False

        # Classification heads
        self.age_classifier = tf.keras.Sequential([
            layers.Dropout(0.5),
            layers.Dense(256, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
            layers.Dense(num_age_classes, activation='softmax', name='age_output')
        ], name='age_head') # Add name to sequential model

        self.gender_classifier = tf.keras.Sequential([
            layers.Dropout(0.5),
            layers.Dense(128, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
            layers.Dense(num_gender_classes, activation='softmax', name='gender_output')
        ], name='gender_head') # Add name to sequential model

        self.race_classifier = tf.keras.Sequential([
            layers.Dropout(0.5),
            layers.Dense(256, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(0.3),
            layers.Dense(num_race_classes, activation='softmax', name='race_output')
        ], name='race_head') # Add name to sequential model


    def call(self, inputs, training=None):
        # Get FaceNet embeddings
        embeddings = self.backbone(inputs, training=training)

        # Multi-task predictions
        age_pred = self.age_classifier(embeddings, training=training)
        gender_pred = self.gender_classifier(embeddings, training=training)
        race_pred = self.race_classifier(embeddings, training=training)

        return {
            'age_output': age_pred, # Changed key to match data generator and compile
            'gender_output': gender_pred, # Changed key to match data generator and compile
            'race_output': race_pred # Changed key to match data generator and compile
        }

def create_and_compile_model(num_age_classes, num_gender_classes, num_race_classes, freeze_backbone=True):
    """Create and compile the multi-task model with proper loss functions"""

    model = FaceNetMultiTask(
        num_age_classes=num_age_classes,
        num_gender_classes=num_gender_classes,
        num_race_classes=num_race_classes,
        freeze_backbone=freeze_backbone
    )

    # Build the model
    model.build((None, 160, 160, 3))

    # Optimizer with learning rate schedule
    initial_lr = 0.001
    optimizer = optimizers.AdamW(learning_rate=initial_lr, weight_decay=1e-4)

    # FIXED: Use SparseCategoricalCrossentropy for each output
    # This works with integer labels (no need for one-hot encoding)
    model.compile(
        optimizer=optimizer,
        loss={
            'age_output': losses.SparseCategoricalCrossentropy(),
            'gender_output': losses.SparseCategoricalCrossentropy(),
            'race_output': losses.SparseCategoricalCrossentropy()
        },
        loss_weights={
            'age_output': 1.0,
            'gender_output': 1.0,
            'race_output': 1.0
        },
        metrics={
            'age_output': ['sparse_categorical_accuracy'],
            'gender_output': ['sparse_categorical_accuracy'],
            'race_output': ['sparse_categorical_accuracy']
        }
    )

    return model

# # Custom loss function for multi-task learning
# class MultiTaskLoss(tf.keras.losses.Loss):
#     def __init__(self, age_weight=1.0, gender_weight=1.0, race_weight=1.0, name="multi_task_loss"):
#         super().__init__(name=name)
#         self.age_weight = age_weight
#         self.gender_weight = gender_weight
#         self.race_weight = race_weight
#         # Use SparseCategoricalCrossentropy as the dataset provides integer labels
#         self.sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy()

#     def call(self, y_true, y_pred):
#         # y_true will be integer labels, y_pred will be one-hot encoded predictions
#         # SparseCategoricalCrossentropy expects integer labels and one-hot predictions

#         age_loss = self.sparse_ce(y_true['age_output'], y_pred['age_output'])
#         gender_loss = self.sparse_ce(y_true['gender_output'], y_pred['gender_output'])
#         race_loss = self.sparse_ce(y_true['race_output'], y_pred['race_output'])

#         total_loss = (self.age_weight * age_loss +
#                      self.gender_weight * gender_loss +
#                      self.race_weight * race_loss)

#         return total_loss

#     def build(self, input_shape):
#         # Properly initialize all layers
#         dummy_input = tf.keras.Input(shape=input_shape[1:])
#         self.call(dummy_input)

# # Custom metrics
# class MultiTaskAccuracy(tf.keras.metrics.Metric):
#     def __init__(self, task_name, name=None, **kwargs):
#         if name is None:
#             name = f'{task_name}_accuracy'
#         super().__init__(name=name, **kwargs)
#         self.task_name = task_name
#         # Use SparseCategoricalAccuracy as the dataset provides integer labels
#         self.accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

#     def update_state(self, y_true, y_pred, sample_weight=None):
#         # y_true will be integer labels, y_pred will be one-hot encoded predictions
#         # SparseCategoricalAccuracy expects integer labels and one-hot predictions
#         self.accuracy.update_state(y_true[self.task_name], y_pred[self.task_name], sample_weight)

#     def result(self):
#         return self.accuracy.result()

#     def reset_state(self):
#         self.accuracy.reset_state()

In [6]:
BATCH_SIZE = 32
IMG_SIZE = 160
EPOCHS = 20
SAMPLE_SIZE = 5000  # Reduced sample size to avoid memory issues

print("Creating training data processor...")
train_processor = FairFaceDataProcessor(
    TRAIN_CSV_PATH,
    DATASET_PATH,
    img_size=IMG_SIZE,
    sample_size=SAMPLE_SIZE,
    is_validation=False
)

print("\nCreating validation data processor...")
val_processor = FairFaceDataProcessor(
    VAL_CSV_PATH,
    DATASET_PATH,
    img_size=IMG_SIZE,
    sample_size=SAMPLE_SIZE//5 if SAMPLE_SIZE else None,
    is_validation=True
)

# Copy encoders from training to validation processor
val_processor.age_encoder = train_processor.age_encoder
val_processor.gender_encoder = train_processor.gender_encoder
val_processor.race_encoder = train_processor.race_encoder

# Re-encode validation labels with training encoders
val_processor.df['age_encoded'] = val_processor.age_encoder.transform(val_processor.df['age'])
val_processor.df['gender_encoded'] = val_processor.gender_encoder.transform(val_processor.df['gender'])
val_processor.df['race_encoded'] = val_processor.race_encoder.transform(val_processor.df['race'])

print("\nCreating datasets...")
train_dataset = train_processor.create_dataset(
    batch_size=BATCH_SIZE,
    augment=True,
    shuffle=True
)

val_dataset = val_processor.create_dataset(
    batch_size=BATCH_SIZE,
    augment=False,
    shuffle=False
)


Creating training data processor...
Sampled training: 5000 samples
Classes - Age: 9, Gender: 2, Race: 7
Age groups: ['0-2', '10-19', '20-29', '3-9', '30-39', '40-49', '50-59', '60-69', 'more than 70']
Gender groups: ['Female', 'Male']
Race groups: ['Black', 'East Asian', 'Indian', 'Latino_Hispanic', 'Middle Eastern', 'Southeast Asian', 'White']

Creating validation data processor...
Sampled validation: 1000 samples
Classes - Age: 9, Gender: 2, Race: 7

Creating datasets...


In [7]:
print("Building model...")
model = create_and_compile_model(
    num_age_classes=train_processor.num_classes['age'],
    num_gender_classes=train_processor.num_classes['gender'],
    num_race_classes=train_processor.num_classes['race'],
    freeze_backbone=True  # Freeze FaceNet weights initially
)

print(model.summary())

Building model...


None


In [8]:
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        '/content/best_fairface_facenet_model.h5',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=False,
        verbose=1
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-7,
        verbose=1
    ),
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
        verbose=1
    )
]

# Recreate datasets for training (fresh iterators)
# train_dataset = train_gen.create_dataset(
#     train_gen.train_df,
#     batch_size=BATCH_SIZE,
#     augment=True,
#     shuffle=True
# )

# val_dataset = val_gen.create_dataset(
#    val_gen.val_df,
#     batch_size=BATCH_SIZE,
#     augment=False,
#     shuffle=False
# )

In [9]:
def verify_dataset_structure():
    print("=== Dataset Structure Verification ===")

    # Check CSV files
    train_csv = pd.read_csv(TRAIN_CSV_PATH)
    val_csv = pd.read_csv(VAL_CSV_PATH)

    print(f"Training CSV shape: {train_csv.shape}")
    print(f"Validation CSV shape: {val_csv.shape}")
    print(f"Training CSV columns: {list(train_csv.columns)}")

    # Check image directories
    train_images = os.listdir(TRAIN_IMG_DIR)
    val_images = os.listdir(VAL_IMG_DIR) if os.path.exists(VAL_IMG_DIR) else []

    print(f"Training images found: {len(train_images)}")
    print(f"Validation images found: {len(val_images)}")

    # Check if CSV files match image files
    print("\n=== File Matching Check ===")

    # Sample a few files from CSV and check if they exist
    sample_files = train_csv['file'].head(10).tolist()
    missing_files = []

    for file_name in sample_files:
        # full_path = os.path.join(TRAIN_IMG_DIR, file_name)
        full_path = os.path.join(DATASET_PATH, file_name)
        if not os.path.exists(full_path):
            missing_files.append(full_path)
        else:
            print(f"✓ Found: {full_path}")

    if missing_files:
        print(f"✗ Missing files: {missing_files}")
    else:
        print("✓ All sampled files found!")

    # Check first few rows of CSV
    print(f"\nFirst few rows of training CSV:")
    print(train_csv.head())

    return len(missing_files) == 0

# Run verification
dataset_ok = verify_dataset_structure()

if dataset_ok:
    print("\n✓ Dataset structure looks good! Proceeding with training...")
else:
    print("\n✗ Dataset structure issues found. Please check file paths.")

=== Dataset Structure Verification ===
Training CSV shape: (86744, 5)
Validation CSV shape: (10954, 5)
Training CSV columns: ['file', 'age', 'gender', 'race', 'service_test']
Training images found: 26399
Validation images found: 0

=== File Matching Check ===
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/1.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/2.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/3.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/4.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/5.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/6.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/7.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/8.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/9.jpg
✓ Found: /content/drive/MyDrive/datasets/FairFace/train/10.jpg
✓ All sampled files found!

First few rows of training CSV:
          file    age  gender        race  service_test
0  

In [None]:
print("Starting training...")
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
    callbacks=callbacks,
    verbose=1
)

print("Training completed!")


Starting training...
Epoch 1/10


In [None]:
# Plot training history
def plot_history(history):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Loss
    ax1.plot(history.history['loss'], label='Train')
    ax1.plot(history.history['val_loss'], label='Validation')
    ax1.set_title('Loss')
    ax1.set_xlabel('Epoch')
    ax1.legend()

    # Accuracy (Age)
    ax2.plot(history.history['age_output_accuracy'], label='Age Train')
    ax2.plot(history.history['val_age_output_accuracy'], label='Age Val')
    ax2.plot(history.history['gender_output_accuracy'], label='Gender Train')
    ax2.plot(history.history['val_gender_output_accuracy'], label='Gender Val')
    ax2.plot(history.history['race_output_accuracy'], label='Race Train')
    ax2.plot(history.history['val_race_output_accuracy'], label='Race Val')
    ax2.set_title('Task Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.legend()

    plt.tight_layout()
    plt.show()

plot_history(history)

# Load best model
model = models.load_model('best_model.h5')

# Evaluate
results = model.evaluate(val_gen)
print("\nEvaluation Results:")
print(f"Age Accuracy: {results[4]:.4f}")
print(f"Gender Accuracy: {results[6]:.4f}")
print(f"Race Accuracy: {results[8]:.4f}")


In [None]:
def predict_sample(model, img_path, img_size=(160, 160)):
    # Load and preprocess image
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, img_size)
    img = tf.subtract(img, 0.5)
    img = tf.multiply(img, 2.0)
    img = tf.expand_dims(img, axis=0)  # Add batch dim

    # Predict
    preds = model.predict(img)
    age_pred = np.argmax(preds[0][0])
    gender_pred = np.argmax(preds[1][0])
    race_pred = np.argmax(preds[2][0])

    # Decode labels
    age_label = train_gen.age_encoder.inverse_transform([age_pred])[0]
    gender_label = train_gen.gender_encoder.inverse_transform([gender_pred])[0]
    race_label = train_gen.race_encoder.inverse_transform([race_pred])[0]

    # Display
    plt.imshow(tf.squeeze(img) * 0.5 + 0.5)  # Undo normalization
    plt.title(f"Age: {age_label}\nGender: {gender_label}\nRace: {race_label}")
    plt.axis('off')
    plt.show()

    return age_label, gender_label, race_label

# Test on a sample image
sample_img = os.path.join(VAL_IMG_DIR, os.listdir(VAL_IMG_DIR)[0])
predict_sample(model, sample_img)
