In [49]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.applications import ResNet50
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

# Step 1: Load the data
df = pd.read_csv('augmented_train.csv')

In [50]:
df.head()

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,Image
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,0.63,0007de18844b0dbbb5e1f607da0606e0_aug_1
1,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,0.63,0007de18844b0dbbb5e1f607da0606e0_aug_2
2,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,0.63,0007de18844b0dbbb5e1f607da0606e0_aug_3
3,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,0.63,0007de18844b0dbbb5e1f607da0606e0_aug_4
4,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,0.63,0007de18844b0dbbb5e1f607da0606e0_aug_5


In [51]:
# Step 2: Preprocess the data
# Separate features and target
features = df.drop(columns=['Pawpularity','Id','Image'])
target = df['Pawpularity']

features = features.select_dtypes(include=[np.number])

features = features.astype(np.float32)
target = target.astype(np.float32)

# Split into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Ensure the labels are in the correct shape
y_train = y_train.values.reshape(-1, 1)
y_val = y_val.values.reshape(-1, 1)


Regression Model

In [53]:
model = Sequential([
    Dense(128, input_shape=(X_train.shape[1],), activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    
    Dense(32, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    
    Dense(1, activation='linear')  # Output layer for regression
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [54]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss='mse', 
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

# Step 4: Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Step 5: Evaluate the model
val_predictions = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f"Validation RMSE (before tuning): {rmse:.4f}")

Epoch 1/100
[1m1239/1239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 634us/step - loss: 0.5251 - root_mean_squared_error: 0.6896 - val_loss: 0.0427 - val_root_mean_squared_error: 0.2065 - learning_rate: 0.0010
Epoch 2/100
[1m1239/1239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 526us/step - loss: 0.0466 - root_mean_squared_error: 0.2159 - val_loss: 0.0426 - val_root_mean_squared_error: 0.2064 - learning_rate: 0.0010
Epoch 3/100
[1m1239/1239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 532us/step - loss: 0.0427 - root_mean_squared_error: 0.2065 - val_loss: 0.0425 - val_root_mean_squared_error: 0.2062 - learning_rate: 0.0010
Epoch 4/100
[1m1239/1239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 545us/step - loss: 0.0430 - root_mean_squared_error: 0.2074 - val_loss: 0.0423 - val_root_mean_squared_error: 0.2057 - learning_rate: 0.0010
Epoch 5/100
[1m1239/1239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 560us/step - loss: 0.0425 - root_mea

In [55]:
import keras_tuner as kt
from tensorflow.keras.optimizers import Adam
# Define the model builder function for Keras Tuner
def model_builder(hp):
    model = Sequential()
    # Tune the number of units in the first Dense layer
    hp_units1 = hp.Int('units1', min_value=32, max_value=256, step=32)
    model.add(Dense(units=hp_units1, input_shape=(X_train.shape[1],), activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(rate=hp.Float('dropout1', min_value=0.1, max_value=0.5, step=0.1)))

    # Second Dense layer
    hp_units2 = hp.Int('units2', min_value=32, max_value=128, step=16)
    model.add(Dense(units=hp_units2, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(rate=hp.Float('dropout2', min_value=0.1, max_value=0.5, step=0.1)))

    # Third Dense layer
    hp_units3 = hp.Int('units3', min_value=16, max_value=64, step=16)
    model.add(Dense(units=hp_units3, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(rate=hp.Float('dropout3', min_value=0.1, max_value=0.5, step=0.1)))

    # Output layer
    model.add(Dense(1, activation='linear'))

    # Tune the learning rate for the optimizer
    hp_learning_rate = hp.Choice('learning_rate', values=[0.01, 0.001, 0.0001])

    # Adjust the step size for each parameter
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                  loss='mse',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

# Instantiate the tuner
tuner = kt.RandomSearch(
    model_builder,
    objective='val_root_mean_squared_error',
    max_trials=10,  # Number of trials to run
    executions_per_trial=1,  # Number of models to build and evaluate for each trial
    directory='my_dir',
    project_name='regression_tuning'
)

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

# Perform the hyperparameter search
tuner.search(X_train, y_train, epochs=50, validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr], verbose=0)

# Get the best model and evaluate
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.hypermodel.build(best_hps)
history = best_model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr], verbose=0)

# Evaluate the best model
val_predictions = best_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f"Validation RMSE (after tuning): {rmse:.4f}")


Reloading Tuner from my_dir/regression_tuning/tuner0.json


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 345us/step
Validation RMSE (after tuning): 0.2040


KNN

In [57]:
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialize the KNN regressor
knn = KNeighborsRegressor()

# Hyperparameter tuning using GridSearchCV
param_grid = {'n_neighbors': [3, 5, 7, 10], 'weights': ['uniform', 'distance']}
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Best parameters from GridSearch
best_knn = grid_search.best_estimator_

# Make predictions
y_pred = best_knn.predict(X_val_scaled)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Best Parameters from RandomizedSearchCV: {grid_search.best_params_}")
print(f" GridSearchCV RMSE: {rmse:.4f}")

# Define parameter distributions for RandomizedSearchCV
param_distributions = {
    'n_neighbors': [7, 10, 15, 20, 25],  # Exploring more values
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # Adding 'p' parameter for distance metric (1 for Manhattan, 2 for Euclidean)
}

# Hyperparameter tuning using RandomizedSearchCV
random_search = RandomizedSearchCV(
    knn, 
    param_distributions=param_distributions, 
    n_iter=10,  # Number of parameter settings sampled
    cv=5, 
    scoring='neg_mean_squared_error',
    random_state=42
)
random_search.fit(X_train_scaled, y_train)

# Best parameters from RandomizedSearch
best_knn = random_search.best_estimator_

# Make predictions
y_pred = best_knn.predict(X_val_scaled)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Best Parameters from RandomizedSearchCV: {random_search.best_params_}")
print(f" RandomizedSearchCV RMSE: {rmse:.4f}")

Best Parameters from RandomizedSearchCV: {'n_neighbors': 10, 'weights': 'distance'}
 GridSearchCV RMSE: 0.2225
Best Parameters from RandomizedSearchCV: {'weights': 'distance', 'p': 1, 'n_neighbors': 25}
 RandomizedSearchCV RMSE: 0.2059


Gradient Boosting Regressor

In [59]:
# Initialize the GradientBoostingRegressor with default parameters
model = GradientBoostingRegressor()

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit the model with the best parameters from GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and model
print(f"Best parameters from GridSearchCV: {grid_search.best_params_}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

# Evaluate performance using RMSE
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Grid Search RMSE: {rmse:.4f}")



# Expanded hyperparameter grid for randomized search
param_distributions = {
    'n_estimators': [100, 200, 300, 400], # Broader selection for 'n_estimators'
    'learning_rate': [0.1, 0.2, 0.3, 0.4], # More optimized range for 'learning_rate'
    'max_depth': [3, 4, 5, 6], # Broader selection for depth
    'subsample': [0.6, 0.8, 1.0], # Fraction of training data for fitting base learning
    'min_samples_split': [2, 3, 4, 5], # Specific number of samples to split
    'min_samples_leaf': [1, 2, 3, 4] # Minimum number of samples in each node
}

# Use RandomizedSearchCV for faster and broader exploration
random_search = RandomizedSearchCV(
    estimator=model, 
    param_distributions=param_distributions, 
    n_iter=20,  # Number of parameter settings that are sampled
    cv=5,  # Cross-validation folds
    n_jobs=-1, 
    scoring='neg_mean_squared_error',
    random_state=42
)

# Fit the model with RandomizedSearchCV
random_search.fit(X_train, y_train)

# Extract the best model
best_model = random_search.best_estimator_

# Evaluate the best model
y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(f"Best Parameters from RandomizedSearchCV: {random_search.best_params_}")
print(f"Randomized Search RMSE: {rmse:.4f}")

  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = colu

Best parameters from GridSearchCV: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
Grid Search RMSE: 0.2035


  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = column_or_1d(y, warn=True)  # TODO: Is this still required?
  y = colu

Best Parameters from RandomizedSearchCV: {'subsample': 1.0, 'n_estimators': 400, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_depth': 6, 'learning_rate': 0.3}
Randomized Search RMSE: 0.2033


SVM

In [61]:
# Initialize the Support Vector Regressor model
svm_model = SVR()

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [1, 10, 100],
    'epsilon': [0.1, 0.2, 0.3]
}
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')

# Fit the model with the best parameters from GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and model
print(f"Best parameters: {grid_search.best_params_}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

# Evaluate performance using RMSE
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"GridSearchCV RMSE: {rmse:.4f}")


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Best parameters: {'C': 1, 'epsilon': 0.2, 'kernel': 'rbf'}
GridSearchCV RMSE: 0.2055


In [62]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

svm_model = SVR()

# Expanded hyperparameter grid
param_distributions = {
    'kernel': ['linear', 'rbf'],  # Focus on the most promising kernels
    'C': [0.5, 1, 1.5],  # Smaller range for `C`
    'epsilon': [0.15, 0.2, 0.25],  # More focused range for `epsilon`
    'gamma': ['scale']  # Only one option for simplicity
}

# RandomizedSearchCV for faster exploration
random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_distributions,
    n_iter=10,  # Number of random parameter settings sampled
    cv=3,  # Cross-validation folds
    n_jobs=-1,
    scoring='neg_mean_squared_error',
    random_state=42
)

# Fit the model with RandomizedSearchCV
random_search.fit(X_train, y_train)

# Extract the best model
best_model = random_search.best_estimator_

# Evaluate the best model
y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)

print(f"SVM Best Parameters: {random_search.best_params_}")
print(f"RandomizedSearchCV RMSE: {rmse:.4f}")

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

SVM Best Parameters: {'kernel': 'rbf', 'gamma': 'scale', 'epsilon': 0.15, 'C': 1.5}
RandomizedSearchCV RMSE: 0.2041
