In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate, Flatten, Dropout, BatchNormalization, Conv2D, MaxPooling2D, Add, LayerNormalization, Multiply
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy.sparse import csr_matrix
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.optimizers.schedules import CosineDecay
from keras_tuner.tuners import RandomSearch  # Updated import
from tensorflow.keras.regularizers import l2
import joblib
import os

# --- Tabular preprocessing -------------------------------------------------
# Loads the CSV, standardises the age target, scales/encodes the features,
# persists both fitted transformers, and splits everything 80/20.

# Directory that receives the fitted transformers and saved models.
output_dir = '/kaggle/working/models'
os.makedirs(output_dir, exist_ok=True)

# Load tabular data
tabular_data_path = '/kaggle/input/asifazmain/tabulardata1.csv'
tabular_data = pd.read_csv(tabular_data_path)

# "faceImage" is only an identifier for the matching image file, so it is
# removed from the feature frame; "Age(years)" is the regression target.
tabular_features = tabular_data.drop(columns=["faceImage"])
tabular_labels = tabular_data["Age(years)"]

# Standardise the age target. The model is trained on these scaled values, so
# predictions must go through label_scaler.inverse_transform to get years.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# below, so test-set statistics take part in the scaling.
label_scaler = StandardScaler()
y_tabular_scaled = label_scaler.fit_transform(tabular_labels.values.reshape(-1, 1)).flatten()

# Persist the label scaler so predictions can be mapped back to years later.
joblib.dump(label_scaler, os.path.join(output_dir, 'label_scaler.pkl'))

# Feature frame without the target column, and the scaled target vector.
X_tabular = tabular_features.drop(columns=["Age(years)"])
y_tabular = y_tabular_scaled

# "Blood Pressure (s/d)" is treated as a categorical column and one-hot
# encoded; every other remaining column is treated as numerical.
categorical_features = ["Blood Pressure (s/d)"]
numerical_features = [col for col in X_tabular.columns if col not in categorical_features]

# Per-type transformers. handle_unknown="ignore" lets transform() accept
# category values that were not present when the encoder was fitted.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Combine transformers; output columns are the scaled numerical features
# followed by the one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ],
    sparse_threshold=0
)

# Fit on, and transform, the full feature frame.
# NOTE(review): like the label scaler, this is fit before the split.
X_tabular_preprocessed = preprocessor.fit_transform(X_tabular)

# Persist the fitted preprocessor for use at prediction time.
joblib.dump(preprocessor, os.path.join(output_dir, 'preprocessor.pkl'))

# Safety net: with sparse_output=False and sparse_threshold=0 the result is
# expected to be dense already; densify only if a CSR matrix comes back.
if isinstance(X_tabular_preprocessed, csr_matrix):
    X_tabular_preprocessed = X_tabular_preprocessed.toarray()

# Image identifiers, in the same row order as the features and labels.
image_ids = tabular_data["faceImage"]

# One call splits features, labels and image ids together so the three stay
# row-aligned; random_state fixes the 80/20 partition.
X_tabular_train, X_tabular_test, y_train, y_test, image_ids_train, image_ids_test = train_test_split(
    X_tabular_preprocessed, y_tabular, image_ids, test_size=0.2, random_state=42
)

# --- Image pipeline --------------------------------------------------------
# Images live in one directory and are named "<faceImage id>.jpg".
image_data_path = '/kaggle/input/asifazmain/imagedata/imagedata/'
image_size = (128, 128)
batch_size = 4

# Training images: rescale to [0, 1] plus random geometric/brightness
# augmentation.
train_image_datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest'
)

# Test images: rescale only, no augmentation.
test_image_datagen = ImageDataGenerator(rescale=1.0/255.0)

# class_mode=None / y_col=None: the generators yield image batches only.
# shuffle=False keeps the generator order equal to the order of the id
# series, which create_tf_dataset relies on to pair image i with tabular
# row i.
train_image_generator = train_image_datagen.flow_from_dataframe(
    pd.DataFrame({'filename': image_ids_train.apply(lambda x: f'{x}.jpg')}),
    directory=image_data_path,
    x_col='filename',
    y_col=None,
    target_size=image_size,
    class_mode=None,
    batch_size=batch_size,
    shuffle=False
)

# Same configuration for the held-out split, using the non-augmenting
# generator.
test_image_generator = test_image_datagen.flow_from_dataframe(
    pd.DataFrame({'filename': image_ids_test.apply(lambda x: f'{x}.jpg')}),
    directory=image_data_path,
    x_col='filename',
    y_col=None,
    target_size=image_size,
    class_mode=None,
    batch_size=batch_size,
    shuffle=False
)

# Align images and tabular data
def create_tf_dataset(image_gen, tabular_data, labels, batch_size):
    """Load every image from ``image_gen`` and pair it with tabular rows.

    Pairing is purely positional: the i-th image produced by ``image_gen`` is
    matched with ``tabular_data[i]`` and ``labels[i]``. The generator must
    therefore be unshuffled and yield exactly one image per row. When the
    counts differ the data is truncated to the common prefix and a
    ``RuntimeWarning`` is emitted, because the pairs may then be misaligned
    (for example if the generator skipped image files).

    All images are read once and held in memory; any random augmentation
    configured on the generator is consequently applied a single time, not
    re-sampled per epoch.

    Args:
        image_gen: Indexable batch iterator supporting ``len()``, ``[i]`` and
            ``.batch_size``, whose batches are arrays with samples on axis 0.
        tabular_data: NumPy array of per-sample feature rows.
        labels: NumPy array of targets, aligned with ``tabular_data``.
        batch_size: Batch size of the returned dataset.

    Returns:
        A batched, prefetched ``tf.data.Dataset`` yielding
        ``({'image_input': ..., 'tabular_input': ...}, label)`` with the
        tabular features and labels cast to float32.
    """
    import warnings  # local import keeps this block self-contained

    n_rows = len(tabular_data)
    image_batches = []
    valid_indices = []
    dropped_images = False
    for i in range(len(image_gen)):
        batch = image_gen[i]
        # Position of this batch's first sample, assuming every earlier
        # batch was full.
        start_idx = i * image_gen.batch_size
        end_idx = start_idx + batch.shape[0]
        if end_idx > n_rows:
            # More images than rows: keep only the images that still have a
            # row and stop reading.
            batch = batch[:n_rows - start_idx]
            image_batches.append(batch)
            valid_indices.extend(range(start_idx, start_idx + batch.shape[0]))
            dropped_images = True
            break
        image_batches.append(batch)
        valid_indices.extend(range(start_idx, end_idx))

    if dropped_images or len(valid_indices) != n_rows:
        warnings.warn(
            f"create_tf_dataset: image count does not match the {n_rows} "
            f"tabular rows ({len(valid_indices)} pairs kept); pairing is "
            "positional, so images and rows may be misaligned (e.g. image "
            "files skipped by the generator).",
            RuntimeWarning,
            stacklevel=2,
        )

    images = np.concatenate(image_batches, axis=0)
    tabular_data = tabular_data[valid_indices]
    labels = labels[valid_indices]

    # Dict keys must match the names of the model's Input layers.
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'image_input': images,
            'tabular_input': tabular_data.astype(np.float32)
        },
        labels.astype(np.float32)
    ))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Materialise both splits as in-memory tf.data datasets.
# create_tf_dataset reads each generator exactly once, so the training
# images carry one fixed draw of the random augmentation.
train_dataset = create_tf_dataset(train_image_generator, X_tabular_train, y_train, batch_size)
test_dataset = create_tf_dataset(test_image_generator, X_tabular_test, y_test, batch_size)

# CNN model
def build_model(hp):
    """Build and compile the two-branch (image + tabular) age regressor.

    Used as the model-building function for the tuner: every ``hp.Int`` /
    ``hp.Float`` call below declares one tunable hyperparameter.

    Architecture:
      * Image branch: six Conv2D layers (two of them wrapped in residual
        blocks with a 1x1 projection shortcut), three 2x2 max-poolings,
        then Flatten -> Dense -> Dropout -> BatchNorm -> Dense(64).
      * Tabular branch: Dense -> LayerNorm -> Dropout -> Dense -> LayerNorm
        -> Dense(64).
      * Fusion: concatenation of both 64-d embeddings plus a Dense(64) over
        their element-wise product, followed by a Dense head with a single
        linear output.

    Reads the module-level ``image_size`` and ``X_tabular_preprocessed`` to
    size the two inputs.

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        A compiled Keras ``Model`` with inputs named ``image_input`` and
        ``tabular_input`` and output ``final_output`` (MSE loss, MAE metric).
    """
    # ---- Image branch ----
    image_input = Input(shape=(*image_size, 3), name="image_input")
    x = Conv2D(
        filters=hp.Int('conv1_filters', min_value=32, max_value=96, step=32),
        kernel_size=3,
        activation="relu",
        padding="same"
    )(image_input)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=2)(x)
    
    # Residual block 1: 3x3 conv path plus 1x1 projection shortcut.
    shortcut = x
    x = Conv2D(
        filters=hp.Int('conv2_filters', min_value=64, max_value=128, step=32),
        kernel_size=3,
        activation="relu",
        padding="same"
    )(x)
    x = BatchNormalization()(x)
    # The hyperparameter name 'conv2_filters' is reused so the shortcut is
    # meant to get the same channel count as the main path, which Add()
    # requires (presumably the tuner returns the already-registered value
    # for a repeated name; verify against the KerasTuner docs).
    shortcut = Conv2D(
        filters=hp.Int('conv2_filters', min_value=64, max_value=128, step=32),
        kernel_size=1,
        padding="same"
    )(shortcut)
    x = Add()([x, shortcut])
    
    x = Conv2D(
        filters=hp.Int('conv3_filters', min_value=96, max_value=192, step=32),
        kernel_size=3,
        activation="relu",
        padding="same"
    )(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=2)(x)
    
    # Residual block 2: same pattern, sharing 'conv4_filters'.
    shortcut = x
    x = Conv2D(
        filters=hp.Int('conv4_filters', min_value=128, max_value=256, step=64),
        kernel_size=3,
        activation="relu",
        padding="same"
    )(x)
    x = BatchNormalization()(x)
    shortcut = Conv2D(
        filters=hp.Int('conv4_filters', min_value=128, max_value=256, step=64),
        kernel_size=1,
        padding="same"
    )(shortcut)
    x = Add()([x, shortcut])
    
    # Dilated convolution (rate 2) before the last pooling step.
    x = Conv2D(
        filters=hp.Int('conv5_filters', min_value=192, max_value=384, step=64),
        kernel_size=3,
        dilation_rate=2,
        activation="relu",
        padding="same"
    )(x)
    x = BatchNormalization()(x)
    x = MaxPooling2D(pool_size=2)(x)
    
    x = Conv2D(
        filters=hp.Int('conv6_filters', min_value=256, max_value=512, step=64),
        kernel_size=3,
        activation="relu",
        padding="same"
    )(x)
    x = BatchNormalization()(x)
    
    # Flatten the feature map and reduce it to a 64-d image embedding.
    # 'l2_reg' is declared once and reused by every regularised Dense layer.
    x = Flatten()(x)
    x = Dense(
        units=hp.Int('image_dense_units', min_value=128, max_value=512, step=128),
        activation="relu",
        kernel_regularizer=l2(hp.Float('l2_reg', min_value=1e-4, max_value=5e-2, sampling='log'))
    )(x)
    x = Dropout(hp.Float('dropout_image', min_value=0.3, max_value=0.6))(x)
    x = BatchNormalization()(x)
    image_output = Dense(64, activation="relu", name="image_output")(x)

    # ---- Tabular branch ----
    # Input width equals the column count of the preprocessed feature matrix.
    tabular_input = Input(shape=(X_tabular_preprocessed.shape[1],), name="tabular_input")
    y = Dense(
        units=hp.Int('tabular_units_1', min_value=128, max_value=512, step=128),
        activation="relu",
        kernel_regularizer=l2(hp.Float('l2_reg', min_value=1e-4, max_value=5e-2, sampling='log'))
    )(tabular_input)
    y = LayerNormalization()(y)
    y = Dropout(hp.Float('dropout_tabular_1', min_value=0.3, max_value=0.6))(y)
    y = Dense(
        units=hp.Int('tabular_units_2', min_value=64, max_value=256, step=64),
        activation="relu",
        kernel_regularizer=l2(hp.Float('l2_reg', min_value=1e-4, max_value=5e-2, sampling='log'))
    )(y)
    y = LayerNormalization()(y)
    tabular_output = Dense(64, activation="relu", name="tabular_output")(y)

    # ---- Fusion ----
    # Both embeddings are 64-d, so they can be multiplied element-wise; the
    # head sees [image, tabular, Dense(image * tabular)].
    concatenated = Concatenate()([image_output, tabular_output])
    fused = Multiply()([image_output, tabular_output])
    fused = Dense(64, activation="relu")(fused)
    combined = Concatenate()([concatenated, fused])
    
    z = Dense(
        units=hp.Int('concat_units', min_value=128, max_value=512, step=128),
        activation="relu",
        kernel_regularizer=l2(hp.Float('l2_reg', min_value=1e-4, max_value=5e-2, sampling='log'))
    )(combined)
    z = Dropout(hp.Float('dropout_concat', min_value=0.3, max_value=0.6))(z)
    z = BatchNormalization()(z)
    # Linear activation: the output is the (standardised) age.
    final_output = Dense(1, activation="linear", name="final_output")(z)

    # ---- Optimisation ----
    # Cosine decay from a tunable initial learning rate over a fixed 10000
    # optimizer steps.
    lr_schedule = CosineDecay(
        initial_learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-3, sampling='log'),
        decay_steps=10000
    )
    model = Model(inputs=[image_input, tabular_input], outputs=final_output)
    model.compile(
        optimizer=AdamW(learning_rate=lr_schedule, weight_decay=1e-4, clipnorm=1.0),
        loss="mse",
        metrics=["mae"]
    )
    return model

# --- Hyperparameter search ---------------------------------------------------
# Random search over the space declared in build_model: up to 15 trials, each
# trained twice (executions_per_trial=2), compared on validation MAE.
tuner = RandomSearch(
    build_model,
    objective='val_mae',
    max_trials=15,
    executions_per_trial=2,
    directory='tuner_results',
    project_name='cnn_6layers_low_mae'
)

# Callbacks (also reused by the final training run in the next cell).
# Early stopping waits 50 epochs without val_mae improvement and restores the
# best weights; the checkpoint writes the best-so-far model to output_dir.
early_stopping = EarlyStopping(monitor='val_mae', patience=50, restore_best_weights=True, mode='min')
checkpoint = ModelCheckpoint(
    os.path.join(output_dir, 'best_model.keras'),  # native Keras format
    monitor='val_mae',
    save_best_only=True,
    mode='min',
    verbose=1
)

# Run the search.
# NOTE(review): validation_data is test_dataset, i.e. hyperparameters are
# selected on the same split that is later reported as the "test" result,
# so those test metrics are not an unbiased estimate.
tuner.search(
    train_dataset,
    validation_data=test_dataset,
    epochs=100,
    callbacks=[early_stopping, checkpoint]
)

# Retrieve the top-ranked model from the search.
best_model = tuner.get_best_models(num_models=1)[0]

# Save it under a separate file name from the checkpoint above.
best_model.save(os.path.join(output_dir, 'final_model.keras'))  # native Keras format

# --- Evaluate the tuned model on the held-out split --------------------------
# r2_score is needed below but was only imported in a later cell, so this
# cell failed with NameError when run in order; import it here.
from sklearn.metrics import r2_score

# evaluate() returns [loss, mae] in the order given to compile(); both are in
# standardised-label units because the targets were scaled.
loss, mae_scaled = best_model.evaluate(test_dataset)
print(f"Test Loss (Scaled): {loss}, Test MAE (Scaled): {mae_scaled}")

# Convert MAE back to original scale. MAE is a difference of labels, so the
# scaler's mean cancels and only its scale factor applies.
mae_original = mae_scaled * label_scaler.scale_[0]
print(f"Test MAE (Original Scale): {mae_original}")

# Get predictions for the test dataset in a single predict() call (the
# dataset is unshuffled, so prediction order matches label order) instead of
# calling predict() once per 4-sample batch.
y_pred_scaled = best_model.predict(test_dataset, verbose=0).flatten()
y_true_scaled = np.concatenate(
    [labels.numpy().flatten() for _, labels in test_dataset]
)

# Inverse-transform predictions and true labels to original scale (years).
y_pred_original = label_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
y_true_original = label_scaler.inverse_transform(y_true_scaled.reshape(-1, 1)).flatten()

# Share of predictions within ±0.5 years of the true age.
accuracy_0_5 = np.mean(np.abs(y_pred_original - y_true_original) <= 0.5) * 100
print(f"Overall Accuracy (±0.5 years): {accuracy_0_5:.2f}%")

# Calculate R-squared in original scale
r2 = r2_score(y_true_original, y_pred_original)
print(f"R² Score (Original Scale): {r2:.4f}")

# Optional: share of predictions that round to the same integer age.
accuracy_rounded = np.mean(np.round(y_pred_original) == np.round(y_true_original)) * 100
print(f"Accuracy (Rounded to Nearest Integer): {accuracy_rounded:.2f}%")



In [None]:
# Final training: continue fitting the best model found by the tuner, with
# the same callbacks and the same validation split as the search.
# NOTE(review): validation_data is test_dataset here as well, so early
# stopping and checkpoint selection are driven by the test split.
history = best_model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=100,
    callbacks=[early_stopping, checkpoint]
)

# --- Evaluate the retrained model and plot predictions -----------------------
import numpy as np
from sklearn.metrics import r2_score  # Added import for r2_score
import matplotlib.pyplot as plt

# Get predictions for the test dataset in a single predict() call (the
# dataset is unshuffled, so prediction order matches label order) instead of
# calling predict() once per 4-sample batch.
y_pred_scaled = best_model.predict(test_dataset, verbose=0).flatten()
y_true_scaled = np.concatenate(
    [labels.numpy().flatten() for _, labels in test_dataset]
)

# Inverse-transform predictions and true labels to original scale (years).
y_pred_original = label_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
y_true_original = label_scaler.inverse_transform(y_true_scaled.reshape(-1, 1)).flatten()

# Share of predictions within ±0.5 years of the true age.
accuracy_0_5 = np.mean(np.abs(y_pred_original - y_true_original) <= 0.5) * 100
print(f"Overall Accuracy (±0.5 years): {accuracy_0_5:.2f}%")

# Calculate R-squared in original scale
r2 = r2_score(y_true_original, y_pred_original)
print(f"R² Score (Original Scale): {r2:.4f}")

# Optional: share of predictions that round to the same integer age.
accuracy_rounded = np.mean(np.round(y_pred_original) == np.round(y_true_original)) * 100
print(f"Accuracy (Rounded to Nearest Integer): {accuracy_rounded:.2f}%")

# Plot predicted vs. actual ages; the dashed diagonal marks a perfect
# prediction. The file goes to output_dir like every other artifact.
plot_path = os.path.join(output_dir, 'pred_vs_actual.png')
age_range = [y_true_original.min(), y_true_original.max()]
plt.figure(figsize=(8, 8))
plt.scatter(y_true_original, y_pred_original, alpha=0.5, label='Predictions')
plt.plot(age_range, age_range, 'r--', label='Ideal')
plt.xlabel('Actual Age (Years)')
plt.ylabel('Predicted Age (Years)')
plt.title('Predicted vs. Actual Ages')
plt.legend()
plt.savefig(plot_path)
plt.close()
print(f"Predicted vs. actual ages plot saved at {plot_path}")

# Optional: also export the model in TensorFlow.js format under
# <output_dir>/tfjs_model. This is best-effort: any ImportError raised inside
# the try block (including a missing tensorflowjs package) skips the export
# with a message instead of failing the cell.
try:
    import tensorflowjs as tfjs
    tfjs.converters.save_keras_model(best_model, os.path.join(output_dir, 'tfjs_model'))
    print("TensorFlow.js model saved successfully.")
except ImportError:
    print("TensorFlow.js not installed. Skipping TF.js model saving.")

In [None]:
# Prediction function (corrected)
def predict_age(image_path, height_cm, weight_kg, bmi, blood_pressure, blood_oxygen, blood_sugar, model, preprocessor, label_scaler, image_size=(128, 128)):
    """
    Predict age using an image and biomarkers.

    The image is loaded, resized and rescaled to [0, 1]; the biomarkers are
    placed in a one-row DataFrame laid out like the training features and
    passed through the fitted preprocessor; the model's standardised output
    is mapped back to years with the label scaler.

    Relies on the module-level ``X_tabular``, ``numerical_features`` and
    ``categorical_features`` from the training cell for the expected column
    layout and for fill-in values of columns not supplied here.

    Parameters:
    - image_path (str): Path to the user's face image
    - height_cm (float): Height in centimeters
    - weight_kg (float): Weight in kilograms
    - bmi (float): Body Mass Index
    - blood_pressure (str): Blood pressure in 'systolic/diastolic' format (e.g., '120/80')
    - blood_oxygen (float): Blood oxygen level in percentage
    - blood_sugar (float): Blood sugar level in mg/dL
    - model: Trained Keras model (best_model)
    - preprocessor: Fitted ColumnTransformer from training
    - label_scaler: Fitted StandardScaler for age labels
    - image_size (tuple): Target image size (default: (128, 128))

    Returns:
    - predicted_age (float): Predicted age in years
    """
    import warnings

    # These helpers are not imported anywhere else in the notebook, so the
    # function used to fail with NameError; import them where they are used.
    from tensorflow.keras.preprocessing.image import img_to_array, load_img

    # 1. Preprocess the image
    img = load_img(image_path, target_size=image_size)
    img_array = img_to_array(img) / 255.0  # Rescale as in test_image_datagen
    img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension

    # 2. Preprocess the tabular data (biomarkers)
    biomarkers = {
        'Height (cm)': height_cm,
        'Weight (kg)': weight_kg,
        'BMI': bmi,
        'Blood Pressure (s/d)': blood_pressure,
        'Blood Oxygen': blood_oxygen,
        'Blood Sugar(mg/dl)': blood_sugar
    }

    # A key that does not match a training column is dropped by the reorder
    # below and replaced by a training median/mode, so the caller's value
    # would be ignored without notice. Surface that instead of hiding it.
    unmatched = [name for name in biomarkers if name not in X_tabular.columns]
    if unmatched:
        warnings.warn(
            f"predict_age: biomarker columns {unmatched} are not training "
            "columns; the supplied values are ignored.",
            RuntimeWarning,
            stacklevel=2,
        )

    # Create DataFrame with all required columns (match training data structure)
    tabular_df = pd.DataFrame([biomarkers])

    # Ensure all columns from training are present, fill missing with median or mode from training data
    for col in X_tabular.columns:
        if col not in tabular_df.columns:
            if col in numerical_features:
                tabular_df[col] = X_tabular[col].median()  # Use median for numerical
            elif col in categorical_features:
                tabular_df[col] = X_tabular[col].mode()[0]  # Use mode for categorical

    # Reorder columns to match training data
    tabular_df = tabular_df[X_tabular.columns]

    # Apply preprocessing; transforming a one-row frame yields a 2-D
    # (1, n_features) result, so no further reshaping is needed.
    tabular_processed = preprocessor.transform(tabular_df)
    if isinstance(tabular_processed, csr_matrix):
        tabular_processed = tabular_processed.toarray()

    # 3. Make prediction (dict keys match the model's Input layer names)
    inputs = {
        'image_input': img_array,
        'tabular_input': tabular_processed.astype(np.float32)
    }
    prediction_scaled = model.predict(inputs)[0][0]  # Get single value

    # 4. Convert to original scale
    predicted_age = label_scaler.inverse_transform([[prediction_scaled]])[0][0]

    return float(predicted_age)