In [2]:
import cv2
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input,InputLayer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.losses import Huber


In [3]:
def load_and_filter_images(image_paths, base_directory):
    valid_images = []
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
    for img_path in image_paths:
        full_path = os.path.join(base_directory, img_path)  # Correctly join the base directory with the image path
        image = cv2.imread(full_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            continue
        faces = face_cascade.detectMultiScale(image, scaleFactor=1.1, minNeighbors=5)
        if len(faces) == 1:
            valid_images.append(img_path)
    return valid_images

In [4]:
data = pd.read_csv('/kaggle/input/age-metadata/wiki_labels.csv')
data['full_path'] = data['full_path'].str.extract(r"(\d+\/[^']+\.png)")  # Regex to extract the relative path


In [5]:
base_directory = '/kaggle/input/dataset-age/wiki_labeled/wiki_labeled/'

In [6]:
valid_image_paths = load_and_filter_images(data['full_path'].dropna().tolist(), base_directory)
data = data[data['full_path'].isin(valid_image_paths)]

In [7]:
data['face_score'] = pd.to_numeric(data['face_score'], errors='coerce')
data = data[data['face_score'].notna() & (data['face_score'] != float('inf')) & data['second_face_score'].isna()]

In [8]:
base_year = 1.0
data['approx_dob_year'] = base_year + (data['dob'] - 366) / 365.25
data['age'] = data['photo_taken'] - data['approx_dob_year']
data['age'] = data['age'].round().astype('float64')
data = data[(data['age'] > 0) & (data['age'] <= 100)]


In [9]:
data['full_path'] = data['full_path'].str.extract(r"(\d+\/[^']+\.png)")

In [10]:
datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    zoom_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest',
    brightness_range=[0.5, 1.5],
    validation_split=0.2
)


In [11]:
train_generator = datagen.flow_from_dataframe(
    dataframe=data,
    directory=base_directory,
    x_col='full_path',
    y_col='age',
    target_size=(100, 100),
    batch_size=32,
    color_mode='grayscale',
    class_mode='raw',
    subset='training'
)

Found 22836 validated image filenames.


In [12]:
validation_generator = datagen.flow_from_dataframe(
    dataframe=data,
    directory=base_directory,
    x_col='full_path',
    y_col='age',
    target_size=(100, 100),
    batch_size=32,
    color_mode='grayscale',
    class_mode='raw',
    subset='validation'
)


Found 5709 validated image filenames.


In [13]:
def hubosh_loss(y_true, y_pred, delta=1.345):
    # Define the Huber loss component
    def huber_loss(y_true, y_pred, delta):
        error = y_true - y_pred
        is_small_error = tf.abs(error) <= delta
        squared_loss = tf.square(error) / 2
        linear_loss = delta * (tf.abs(error) - (0.5 * delta))
        return tf.where(is_small_error, squared_loss, linear_loss)

    # Define the LogCosh loss component
    def logcosh_loss(y_true, y_pred):
        def _logcosh(x):
            return x + tf.nn.softplus(-2. * x) - tf.math.log(2.0)
        return tf.reduce_mean(_logcosh(y_pred - y_true))

    # Combine the losses
    huber = huber_loss(y_true, y_pred, delta)
    logcosh = logcosh_loss(y_true, y_pred)
    return huber + logcosh


In [15]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.metrics import MeanAbsoluteError
model = Sequential([
    InputLayer(input_shape=(100, 100, 1)),
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    MaxPooling2D(2, 2),
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    MaxPooling2D(2, 2),
    Conv2D(256, (3, 3), activation='relu', padding='same'),
    MaxPooling2D(2, 2),
    Flatten(),
    Dense(1024, activation='relu', kernel_regularizer=l2(0.01)), 
    Dropout(0.5),  
    Dense(1) 
])

optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer='adam', loss=hubosh_loss,  metrics=[MeanAbsoluteError()])




In [16]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    train_generator,
    steps_per_epoch=train_generator.samples // train_generator.batch_size,
    epochs=50,
    validation_data=validation_generator,
    validation_steps=validation_generator.samples // validation_generator.batch_size,
    callbacks=[early_stopping, reduce_lr]
)


Epoch 1/50


  self._warn_if_super_not_called()


[1m  3/713[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m45s[0m 64ms/step - loss: 101.8364 - mean_absolute_error: 36.1832   

I0000 00:00:1714395067.388487      91 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1714395067.409837      91 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step - loss: 36.5576 - mean_absolute_error: 15.1418

W0000 00:00:1714395142.363568      91 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 127ms/step - loss: 36.5528 - mean_absolute_error: 15.1403 - val_loss: 31.8605 - val_mean_absolute_error: 12.9238 - learning_rate: 0.0010
Epoch 2/50
[1m  1/713[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22s[0m 31ms/step - loss: 28.7203 - mean_absolute_error: 11.5815

  self.gen.throw(typ, value, traceback)


[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 28.7203 - mean_absolute_error: 11.5815 - val_loss: 56.4271 - val_mean_absolute_error: 23.4417 - learning_rate: 0.0010
Epoch 3/50
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 111ms/step - loss: 31.5075 - mean_absolute_error: 13.2591 - val_loss: 55.6110 - val_mean_absolute_error: 12.9705 - learning_rate: 0.0010
Epoch 4/50
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27us/step - loss: 58.0545 - mean_absolute_error: 13.9993 - val_loss: 1744.4390 - val_mean_absolute_error: 733.2439 - learning_rate: 0.0010
Epoch 5/50
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 112ms/step - loss: 65.5672 - mean_absolute_error: 16.4687 - val_loss: 54.8710 - val_mean_absolute_error: 12.7074 - learning_rate: 0.0010
Epoch 6/50
[1m713/713[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27us/step - loss: 55.5473 - mean_absolute_error: 12.9938 - val_loss: 725.742

In [17]:

judge_datagen = ImageDataGenerator(rescale=1./255)

judge_data = pd.read_csv('/kaggle/input/age-testdata/wiki_judge.csv')




In [18]:
judge_data['full_path'] = judge_data['full_path'].str.extract(r"(\d+\.png)")

In [19]:
judge_generator = judge_datagen.flow_from_dataframe(
    dataframe=judge_data,
    directory='/kaggle/input/age-testimages/wiki_judge_images/wiki_judge_images/',
    x_col='full_path',
    y_col=None,
    target_size=(100, 100),
    batch_size=32,
    color_mode='grayscale',
    class_mode=None,
    shuffle=False
)

Found 1409 validated image filenames.


In [20]:
import numpy as np
predicted_ages = model.predict(judge_generator)
predicted_ages = predicted_ages.flatten()

predicted_ages = np.clip(predicted_ages, 0, 100)




  self._warn_if_super_not_called()


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 150ms/step


In [21]:
submission_df = pd.DataFrame({
    'ID': judge_data['ID'],
    'age': predicted_ages
})

submission_df.to_csv('/kaggle/working/submissionhubosh.csv', index=False)
