Import the dataset from Kaggle

In [None]:
import kagglehub

# Download latest version
# The return value is kept in `path` and reused by the following cells as the dataset root
# (presumably a local directory path — confirm against the kagglehub documentation).
path = kagglehub.dataset_download("frabbisw/facial-age")

print("Path to dataset files:", path)

Check the structure of the dataset

In [None]:
from pathlib import Path

# Wrap the download location in a Path so it can be walked with is_dir()/iterdir().
path = Path(path)

def hierarchy(root: Path):
    """Recursively describe the directory tree below ``root``.

    Returns ``None`` when ``root`` is not a directory. Otherwise returns a
    dict that maps each child ``Path`` of ``root`` to the result of calling
    ``hierarchy`` on that child, so files map to ``None`` and directories map
    to nested dicts (an empty directory maps to ``{}``).
    """
    if not root.is_dir():
        return None

    tree = {}
    for entry in root.iterdir():
        tree[entry] = hierarchy(entry)
    return tree

# Build the nested {Path: subtree-or-None} mapping for the downloaded dataset and show it.
path_dict = hierarchy(path)
path_dict

In [None]:
# Drop the first top-level entry of the tree and keep the rest.
# NOTE(review): "first" is whatever Path.iterdir() yielded first inside hierarchy(), and
# iterdir() does not guarantee an order, so the removed entry may differ between machines.
# Re-running this cell also removes one more entry each time — consider filtering by name instead.
path_dict = dict(list(path_dict.items())[1:])

path_dict

In [None]:
# Map every image file to its age label; the label is the name of the
# second-level entry (the folder that directly contains the image).
data = {}

for directory, age_dirs in path_dict.items():
    # hierarchy() stores None for anything that is not a directory, so a stray
    # file at either level is skipped here instead of raising
    # "TypeError: 'NoneType' object is not iterable".
    for age, images in (age_dirs or {}).items():
        for image in (images or {}):
            if image.is_file():
                data[image] = age.name

print(data)

In [None]:
import pandas as pd


# One row per image: the image path in 'file' and its folder-derived label in 'age'.
df = pd.DataFrame(
    {
        'file': list(data),
        'age': list(data.values()),
    }
)
print(df.head())

After manually inspecting the dataset, we decided to drop some of the examples because they were corrupted files, showed body parts other than a face, or were labeled with the wrong age.

In [None]:
print(len(df))
files_to_drop = [3829, 4313, 7034, 7326, 9378, 1490]

# Compare against the exact file name rather than searching the whole path:
# a substring test for "1490.png" would also hit names such as "11490.png"
# (or a matching directory component) and silently drop unrelated images.
names_to_drop = [f"{file_id}.png" for file_id in files_to_drop]
basenames = df['file'].map(lambda file: Path(file).name)

# Report what is about to be removed, grouped in the order of files_to_drop.
for name in names_to_drop:
    for dropped in df.loc[basenames == name, 'file']:
        print(dropped)

# Remove all matches in a single pass instead of rebuilding the frame once per
# matching row inside a nested iterrows() loop.
rows_to_drop = basenames.isin(names_to_drop).to_numpy()
df = df.drop(index=df.index[rows_to_drop])

print(len(df))

In [None]:
# Save the file/age table after the manual clean-up; with pandas' default
# index=True the row index is written as the first (unnamed) CSV column.
df.to_csv('facial-age.csv')

In [None]:
import numpy as np
# Shuffle once, then cut at 75% and 90% to get the train / validation / test partitions.
# Positional slicing with .iloc keeps every part a DataFrame; calling np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
# The fixed random_state makes the shuffle, and therefore the split, reproducible.
# NOTE(review): the split is taken from df as it is at this point, so columns added
# to df further down are not present in these partitions — confirm that is intended.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.75 * len(df))
validate_end = int(.9 * len(df))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

print(len(train), len(validate), len(test))

We can group ages into bins to simplify the classification task.
To start with, we can simply create bins with an approximately equal number of examples in each.
The downside is that the predicted age is less precise for the wider bins (those covering ages for which the number of examples is low). The upside is that each bin contains approximately the same number of examples, which in theory should allow us to predict the category more accurately.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count images per age label. The column has to be passed as the categorical
# variable (x): when a Series is given only as data=, seaborn interprets it as
# wide-form input instead of tallying how often each value occurs.
sns.countplot(x=df['age'])

In [None]:
# The labels were taken from folder names, so convert them to integers before binning.
ages_as_int = df['age'].astype(int)
df['age'] = ages_as_int

# Eight quantile-based bins; precision=0 controls how the bin edges are rounded in the labels.
df['age_bins'] = pd.qcut(ages_as_int, q=8, precision=0)

print(df.sample(n=5))

In [None]:
# Count images per age bin. The column is passed as x so seaborn tallies each
# bin; a Series given only as data= is interpreted as wide-form input instead.
sns.countplot(x=df['age_bins'])