## Same intial loading and cleaning steps ##

In [1]:
# Image Preparation Imports
import cv2
import pandas as pd 
import numpy as np
import datetime
import matplotlib.pyplot as plt
from glob import glob
from scipy.io import loadmat
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

file_name = '../data/wiki.mat'

def matlab_datenum_to_date(matlab_dn):
    return datetime.date.fromordinal(int(matlab_dn) - 366)

def compute_wiki_age(photo_year, birth_datenum):
    photo_date = datetime.date(photo_year, 7, 1)
    birth_date = matlab_datenum_to_date(birth_datenum)
    delta = photo_date - birth_date
    return delta.days / 365.2425


def process_matfile(file_name):
    # Load the data
    data = loadmat(file_name)['wiki'][0, 0]
    data_dict = {key: data[key] for key in data.dtype.names}

    # Convert data into filename-based dictionary
    filename_dict = {}

    for i in range(len(data_dict['full_path'][0])):
        # Remove invalid faces
        if np.isinf(data_dict['face_score'][0][i]) or not np.isnan(data_dict['second_face_score'][0][i]):
            continue

        filename = data_dict['full_path'][0][i][0]
        filename_dict[filename] = compute_wiki_age(data_dict['photo_taken'][0][i], data_dict['dob'][0][i])

    return filename_dict

# Load filenames anda ssociated data
data = process_matfile(file_name)

In [2]:
#some data cleaning to get rid of excessively large or negative values in the predicted age category
print(len(data))
mean_val = sum(data.values()) / len(data)
print(mean_val)
for key in list(data.keys()):
    if data[key] > 100 or data[key] < 1:
        del data[key]
print(len(data))
print(min(data.values()), max(data.values()))

40216
38.60996556567274
40147
1.3525260614523227 99.87337180092678


In [3]:
# Load in images
images = []
for i in data.keys():  
    img = cv2.imread('../data/wiki_crop/'+i)
    images.append(img)

print(f"Number of images loaded: {len(images)}")

Number of images loaded: 40147


In [10]:
#resize all images (can change width and height to what we find to be best)
new_images = []
width = 200
height = 200
for image in images:
    new_im = cv2.resize(image, (width,height), interpolation=cv2.INTER_LINEAR)
    new_images.append(new_im)

## new way to classify ages into bins of 5 ##

In [9]:
edges = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]
labels = ['0-5 years', '5-10 years', '10-15 years', '15-20 years',
              '20-25 years', '25-30 years', '30-35 years', '35-40 years',
             '40-45 years', '45-50 years', '50-55 years', '55-60 years',
              '60-65 years', '65-70 years', '70-75 years', '75-80 years', 
             '80-85 years', '85-90 years', '90-95 years', '95-100 years']

def classify_age(age, edges, labels):
    for i in range(len(edges) - 1):
        if edges[i] <= age < edges[i + 1]:
            return labels[i]
    return None  

classification = [classify_age(data[key], edges, labels) for key in data.keys()]

In [12]:
# split in to train and test sets
x_train, x_test, y_train, y_test = train_test_split(new_images, classification, test_size = 0.8, random_state = 1)

In [17]:
int_labels = sorted(set(y_train))
label_to_int = {label: idx for idx, label in enumerate(int_labels)}

def label_to_int(label):
    return label_to_int[label]

# Convert datasets
X_train = np.array(x_train, dtype='float32')
X_test = np.array(x_test, dtype='float32')

# Map labels to integers
y_train_int = np.array([label_to_int(lbl) for lbl in y_train], dtype='int32')
y_test_int = np.array([label_to_int(lbl) for lbl in y_test], dtype='int32')

# Normalize pixel values to range [0, 1]
X_train /= 255.0
X_test /= 255.0

# Print shapes
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train_int.shape)

MemoryError: Unable to allocate 14.4 GiB for an array with shape (32118, 200, 200, 3) and data type float32

In [16]:
model = Sequential([
    Conv2D(32, (3,3), activation='relu', input_shape=(height, width, 3)),
    MaxPooling2D((2,2)),

    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D((2,2)),

    Flatten(),
    Dense(128, activation='relu'),
    Dense(20, activation='softmax')  #the 20 different categories for age 
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [18]:
history = model.fit(
    X_train, y_train_int,
    validation_split=0.1,  # 10% of training as validation
    epochs=5,
    batch_size=32,
    verbose=1
)

test_loss, test_acc = model.evaluate(X_test, y_test_int, verbose=1)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")

predictions = model.predict(X_test[:5])
pred_classes = np.argmax(predictions, axis=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 2.4476, Test Accuracy: 0.1766

Sample Predictions:


NameError: name 'int_to_label_mapping' is not defined

In [19]:
int_to_label = {v: k for k, v in label_to_int.items()}

print("\nSample Predictions:")
for i in range(5):
    predicted_label = int_to_label[pred_classes[i]]
    actual_label = int_to_label[y_test_int[i]]
    print(f"Predicted: {predicted_label}, Actual: {actual_label}")


Sample Predictions:
Predicted: 20-25 years, Actual: 25-30 years
Predicted: 20-25 years, Actual: 20-25 years
Predicted: 20-25 years, Actual: 35-40 years
Predicted: 20-25 years, Actual: 20-25 years
Predicted: 25-30 years, Actual: 20-25 years
