In [2]:
# Core libraries
import os                              # File and path operations
import numpy as np                     # Numerical computations
import pandas as pd                    # Data handling and analysis
import matplotlib.pyplot as plt        # Plotting
import seaborn as sns                  # Enhanced plotting
from tqdm import tqdm                  # Progress bars

# Scikit-learn (data preprocessing, metrics, splitting, etc.)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Deep learning: TensorFlow + Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Conv2D, MaxPooling2D, AveragePooling2D, Flatten, Dense, Dropout, BatchNormalization)

# Image processing
import cv2                             # OpenCV for image reading/processing
from PIL import Image                  # Pillow for image file handling

In [3]:
# Read the per-split ground-truth CSVs (file names and real ages).
train_df, val_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('gt_avg_train.csv', 'gt_avg_valid.csv', 'gt_avg_test.csv')
)

# Row count of each split, one per line, in train / valid / test order.
print(len(train_df), len(val_df), len(test_df), sep='\n')

# Choose whether to use the full image or the cropped face.
# Set to True to use the cropped face (<name>_face.jpg).
use_cropped_faces = True

def get_file_path(folder, fname):
    """Return the path of image `fname` inside `folder`.

    When the module-level flag `use_cropped_faces` is true, a name ending in
    '.jpg' is redirected to its cropped-face variant ('x.jpg' -> 'x_face.jpg').
    Names without a trailing '.jpg' are joined to `folder` unchanged.

    Parameters
    ----------
    folder : str
        Directory that holds the image.
    fname : str
        Image file name as listed in the CSV.

    Returns
    -------
    str
        `folder` and the (possibly rewritten) file name joined with os.path.join.
    """
    jpg_ext = '.jpg'
    # Only rewrite the trailing extension; str.replace would also touch any
    # '.jpg' that happens to appear earlier in the name.
    if use_cropped_faces and fname.endswith(jpg_ext):
        fname = fname[:-len(jpg_ext)] + '_face' + jpg_ext
    return os.path.join(folder, fname)

def _attach_file_paths(split_df, folder):
    """Add a 'file_path' column to `split_df`, built from 'file_name' via get_file_path."""
    split_df['file_path'] = split_df['file_name'].apply(
        lambda name: get_file_path(folder, name)
    )

# Each split's images are looked up under its own folder.
_attach_file_paths(train_df, 'train')
_attach_file_paths(val_df, 'valid')
_attach_file_paths(test_df, 'test')


4113
1500
1978


In [4]:
def build_dataset(df, batch_size=32, shuffle=True, image_size=(224, 224)):
    """Build a batched tf.data pipeline of (image, real_age) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'file_path' column (paths to JPEG files) and a
        'real_age' column (used as the label).
    batch_size : int
        Number of examples per batch.
    shuffle : bool
        When True, the examples are shuffled before being decoded.
    image_size : (int, int)
        (height, width) every image is resized to. Defaults to (224, 224).

    Returns
    -------
    tf.data.Dataset
        Yields (images, labels) batches; images are resized to `image_size`
        and divided by 255.
    """
    file_paths = df['file_path'].values
    labels = df['real_age'].values

    dataset = tf.data.Dataset.from_tensor_slices((file_paths, labels))

    if shuffle:
        # Shuffle the lightweight (path, label) pairs *before* decoding: the
        # buffer then holds short strings instead of decoded float images, so
        # it can cover the whole split and give a uniform shuffle cheaply.
        dataset = dataset.shuffle(buffer_size=max(len(file_paths), 1))

    def load_and_preprocess(path, label):
        image = tf.io.read_file(path)
        image = tf.image.decode_jpeg(image, channels=3)         # Decode to 3 channels
        image = tf.image.resize(image, list(image_size))        # Resize to uniform size
        image = image / 255.0                                   # Normalize to [0, 1]
        return image, label

    dataset = dataset.map(load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE)

    dataset = dataset.batch(batch_size)
    # Prefetch so batch preparation overlaps with whatever consumes the dataset.
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset

In [5]:
# One pipeline per split; only the training split is shuffled.
train_ds, val_ds, test_ds = (
    build_dataset(split_df, batch_size=32, shuffle=do_shuffle)
    for split_df, do_shuffle in ((train_df, True), (val_df, False), (test_df, False))
)

In [15]:
# Stack the three splits into a single frame with a fresh 0..N-1 index.
eda_df = pd.concat(
    objs=(train_df, val_df, test_df),
    axis=0,
    ignore_index=True,
)

In [17]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
eda_df.info()
eda_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7591 entries, 0 to 7590
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   file_name         7591 non-null   object 
 1   num_ratings       7591 non-null   int64  
 2   apparent_age_avg  7591 non-null   float64
 3   apparent_age_std  7591 non-null   float64
 4   real_age          7591 non-null   int64  
 5   file_path         7591 non-null   object 
dtypes: float64(2), int64(2), object(2)
memory usage: 356.0+ KB
None


Unnamed: 0,file_name,num_ratings,apparent_age_avg,apparent_age_std,real_age,file_path
0,000000.jpg,36,5.0,1.146423,4,train\000000_face.jpg
1,000001.jpg,63,20.079365,4.096819,18,train\000001_face.jpg
2,000002.jpg,38,76.815789,6.133009,80,train\000002_face.jpg
3,000003.jpg,38,55.657895,7.864653,50,train\000003_face.jpg
4,000004.jpg,15,17.666667,3.457222,17,train\000004_face.jpg


The only column we are really interested in is `real_age`, so it will be the focus of our exploratory data analysis.

In [None]:
# The bare 'seaborn' style sheet was renamed 'seaborn-v0_8' in Matplotlib 3.6 and
# the old name is no longer shipped by newer releases; use whichever this
# Matplotlib version provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

sns.set_theme(style='whitegrid')

# The keyword is `figsize`, and it takes one (width, height) tuple in inches.
plt.figure(figsize=(12, 5))

# Left panel of a 1-row, 2-column grid.
plt.subplot(1, 2, 1)

sns.boxplot(data=eda_df, x=real_age, 