In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import scienceplots

# Apply the "science" Matplotlib style sheet to every figure drawn below.
# NOTE(review): presumably this style is registered by the otherwise-unused
# `import scienceplots` above — confirm before removing that import.
plt.style.use("science")

In [None]:
# Seed both random number generators with the same value.
SEED = 0
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Width passed as the first `figsize` component of the saved figures (inches).
# NOTE(review): presumably the text width of the target LaTeX document — confirm.
LATEX_WIDTH = 5.9

In [None]:
# Read every file in the data directory and collect, as three parallel lists,
# the decoded image plus the age and gender parsed from its file name.
# File names are split on "_": field 0 is read as the age, field 1 as the gender.
images = []
ages = []
genders = []
data_path = "../data/UTKFace/"
# os.listdir returns entries in arbitrary order; sorting fixes the row order so
# the seeded train/test split further down selects the same rows on every machine.
for file_name in sorted(os.listdir(data_path)):
    fields = file_name.split("_")
    ages.append(min(100, int(fields[0])))  # ages above 100 are capped at 100
    genders.append(int(fields[1]))
    images.append(mpimg.imread(os.path.join(data_path, file_name)))

In [None]:
# Turn the three parallel lists into named Series (rebinding the original
# names), then place them side by side as the columns of one DataFrame.
raw_columns = {"image": images, "age": ages, "gender": genders}
images, ages, genders = (
    pd.Series(list(values), name=label) for label, values in raw_columns.items()
)

df = pd.concat([images, ages, genders], axis=1)
df

In [None]:
# Density-normalised histogram of the ages with a kernel density estimate
# drawn on top, saved as a PDF.
fig, ax = plt.subplots(figsize=(LATEX_WIDTH, 3))
df["age"].plot.hist(
    ax=ax,
    bins=45,
    density=True,
    title="Age Distribution",
    xlabel="Age",
)
df["age"].plot.kde(ax=ax, color="red", linewidth=2)
ax.set_xlim(0, 120)
# NOTE(review): "ouputs" looks like a misspelling of "outputs" — confirm the
# directory name on disk before changing the path.
fig.savefig("../ouputs/age_distribution.pdf")

In [None]:
# Summary statistics of the age column.
df["age"].describe()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set with a fixed random_state.
# Stratify on the "age" column — the frame has no "Ages" column, so
# `df.Ages` raised AttributeError.
train, test = train_test_split(
    df, test_size=0.2, random_state=0, stratify=df["age"]
)

In [None]:
# Overlay the age density of the train and test splits on one set of axes
# so the two distributions can be compared visually.
kde_ax = train["age"].plot.kde(label="train")
test["age"].plot.kde(ax=kde_ax, label="test")
kde_ax.legend()

In [None]:
# Discretization: bucket the ages into ten-year classes, with a final "70+"
# class covering 70 up to (but excluding) 120.
classes = ["0-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70+"]
# Left-closed bin edges: 0, 10, ..., 70, with 120 closing the last bin.
age_bin_edges = [*range(0, 71, 10), 120]

df["age_discrete"] = pd.cut(
    df["age"], bins=age_bin_edges, labels=classes, right=False
)

# Pie chart of the fraction of rows in each class, labelled with one decimal.
class_shares = df["age_discrete"].value_counts(normalize=True)
class_shares.plot(
    kind="pie", autopct="%.1f", legend=False, figsize=(LATEX_WIDTH, 3)
)
# NOTE(review): "ouputs" looks like a misspelling of "outputs" — confirm the
# directory name on disk before changing the path.
plt.savefig("../ouputs/age_discrete_distribution.pdf")