In [1]:
import pandas as pd
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt

In [3]:
# Load the CSV
df_utk = pd.read_csv("utkface_400.csv")
# Show first five rows
print(df_utk.head())
print(df_utk.columns)

                                            filepath  age  gender dress_color  \
0  utkface_400/utkface_400_subset/UTKFace/12_1_0_...   12       1         red   
1  utkface_400/utkface_400_subset/UTKFace/10_0_4_...   10       0        blue   
2  utkface_400/utkface_400_subset/UTKFace/12_1_0_...   12       1       green   
3  utkface_400/utkface_400_subset/UTKFace/12_1_3_...   12       1      orange   
4  utkface_400/utkface_400_subset/UTKFace/10_0_0_...   10       0        pink   

   race nationality  
0     0    American  
1     1     African  
2     2     Chinese  
3     3      Indian  
4     4        Arab  
Index(['filepath', 'age', 'gender', 'dress_color', 'race', 'nationality'], dtype='object')


In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [6]:
#  Define preprocessing code
# Set image size
IMG_SIZE = 48

# Lists to store data
X_utk = []
gender_labels = []
age_labels = []
race_labels = []
nationality_labels = []
dress_labels = []

for index, row in df_utk.iterrows():
    img_path = row['filepath']
    
    # Read and resize image
    img = cv2.imread(img_path)
    if img is not None:
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
        img = img / 255.0  # Normalize
        X_utk.append(img)
        
        # Labels
        gender_labels.append(row['gender'])
        age_labels.append(row['age'])
        race_labels.append(row['race'])
        nationality_labels.append(row['nationality'])
        dress_labels.append(row['dress_color'])

X_utk = np.array(X_utk)
print("Image data shape:", X_utk.shape)

Image data shape: (400, 48, 48, 3)


In [10]:
# Encode labels
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Gender
le_gender = LabelEncoder()
y_gender = le_gender.fit_transform(gender_labels)
y_gender = to_categorical(y_gender)

# Age (optional: can bin ages later)
le_age = LabelEncoder()
y_age = le_age.fit_transform(age_labels)
y_age = to_categorical(y_age)

# Race
le_race = LabelEncoder()
y_race = le_race.fit_transform(race_labels)
y_race = to_categorical(y_race)

# Nationality
le_nat = LabelEncoder()
y_nationality = le_nat.fit_transform(nationality_labels)
y_nationality = to_categorical(y_nationality)

# Dress Color
le_dress = LabelEncoder()
y_dress = le_dress.fit_transform(dress_labels)
y_dress = to_categorical(y_dress)

In [9]:
print("Gender classes:", le_gender.classes_)
print("Age classes:", le_age.classes_)
print("Race classes:", le_race.classes_)
print("Nationality classes:", le_nat.classes_)
print("Dress color classes:", le_dress.classes_)

print("y_gender shape:", y_gender.shape)
print("y_age shape:", y_age.shape)

Gender classes: [0 1]
Age classes: [ 10  11  12  13 100 101 103 105 110 111 115 116]
Race classes: [0 1 2 3 4]
Nationality classes: ['African' 'American' 'Arab' 'Chinese' 'Indian']
Dress color classes: ['black' 'blue' 'green' 'nan' 'orange' 'pink' 'red' 'yellow']
y_gender shape: (400, 2)
y_age shape: (400, 12)


In [24]:
# Train test split code
from sklearn.model_selection import train_test_split

# X_utk: image data
# y_gender: labels

X_train, X_test, y_train, y_test = train_test_split(
    X_utk, y_gender, test_size=0.2, random_state=42, stratify=y_gender)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (320, 48, 48, 3) (320, 2)
Test shape: (80, 48, 48, 3) (80, 2)


In [25]:
# Define age bins and labels
bins = [0, 12, 19, 35, 60, 100]
labels = ['child', 'teenager', 'young_adult', 'adult', 'senior']

# Convert numeric ages into age groups
age_groups = pd.cut(age_labels, bins=bins, labels=labels, include_lowest=True)

# Encode the age groups
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

le_age_group = LabelEncoder()
y_age_group = le_age_group.fit_transform(age_groups)
y_age_group = to_categorical(y_age_group)

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_utk, y_age_group, test_size=0.2, random_state=42, stratify=y_age_group)

print("Train shape:", X_train.shape, y_train.shape)
print("Train shape:",  X_train.shape, y_train.shape)

Train shape: (320, 48, 48, 3) (320, 4)
Train shape: (320, 48, 48, 3) (320, 4)


In [27]:
from sklearn.model_selection import train_test_split

# X_utk: image data
# y_race: labels

X_train, X_test, y_train, y_test = train_test_split(
    X_utk, y_race, test_size=0.2, random_state=42, stratify=y_race)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (320, 48, 48, 3) (320, 5)
Test shape: (80, 48, 48, 3) (80, 5)


In [28]:
from sklearn.model_selection import train_test_split

# X_utk: image data
# y_nationality: labels

X_train, X_test, y_train, y_test = train_test_split(
    X_utk, y_nationality, test_size=0.2, random_state=42, stratify=y_nationality)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (320, 48, 48, 3) (320, 5)
Test shape: (80, 48, 48, 3) (80, 5)
