In [2]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.applications import ResNet50
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor


from scikeras.wrappers import KerasClassifier
from scikeras.wrappers import KerasRegressor

from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import RandomizedSearchCV


# Step 1: Load the data
df = pd.read_csv('augmented_train.csv')

In [3]:
df.head()


# Print the number of rows and columns separately
print("Number of rows:", df.shape[0])
print("Number of columns:", df.shape[1])


Number of rows: 59472
Number of columns: 14


In [4]:
# Step 2: Preprocess the data
# Separate features and target
imagedata = df['Id']
features = df.drop(columns=['Pawpularity','Id'])
target = df['Pawpularity']

features = features.select_dtypes(include=[np.number])

features = features.astype(np.float32)
target = target.astype(np.float32)

# Split into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)

# Ensure the labels are in the correct shape
y_train = y_train.values.reshape(-1, 1)
y_val = y_val.values.reshape(-1, 1)



LGBM

In [23]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the LGBMRegressor with default parameters
model = lgb.LGBMRegressor()

# Reduced hyperparameter grid for LGBM
param_grid = {
    'n_estimators': [100, 200],  # Number of boosting iterations (slightly fewer options)
    'learning_rate': [0.05, 0.1],  # More common values for learning rate
    'max_depth': [5, 7],  # Moderate tree depth
    'num_leaves': [31, 50],  # Number of leaves (reasonable values for avoiding overfitting)
    'subsample': [0.8, 1.0],  # Subsampling fraction
    'colsample_bytree': [0.8, 1.0],  # Feature subsampling fraction
}

# Initialize GridSearchCV with the reduced grid and 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)

# Fit the model with the reduced grid
grid_search.fit(X_train, y_train)

# Best parameters and model from GridSearchCV
print(f"Best parameters: {grid_search.best_params_}")

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

# Evaluate performance using RMSE
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.4f}")


Fitting 5 folds for each of 64 candidates, totalling 320 fits




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 47577, number of used features: 12
[LightGBM] [Info] Start training from score 0.380105
Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'num_leaves': 50, 'subsample': 0.8}
RMSE: 0.2035


Voting Ensemble(Soft voting)

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

# Initialize models
model1 = GradientBoostingRegressor(n_estimators=200, learning_rate=0.2, max_depth=5)
model2 = LGBMRegressor(colsample_bytree=1.0, n_estimators=200, learning_rate=0.1, max_depth=7, num_leaves=50, subsample=0.8)

# Create a voting regressor
voting_regressor = VotingRegressor(estimators=[('gb', model1), ('lgb', model2)])

# Train the voting regressor
voting_regressor.fit(X_train, y_train)

# Predict and evaluate performance
y_pred = voting_regressor.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Voting Regressor RMSE: {rmse:.4f}")


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 47577, number of used features: 12
[LightGBM] [Info] Start training from score 0.380105
Voting Regressor RMSE: 0.2034


Stacking Ensemble

In [26]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor

# Define the base models (level 0)
base_learners = [
    ('gb', GradientBoostingRegressor(n_estimators=200, learning_rate=0.2, max_depth=5)),
    ('lgb', LGBMRegressor(colsample_bytree=1.0, n_estimators=200, learning_rate=0.1, max_depth=7, num_leaves=50, subsample=0.8))
]

# Define the meta-learner (level 1)
meta_learner = LinearRegression()

# Create the Stacking Regressor
stacking_regressor = StackingRegressor(estimators=base_learners, final_estimator=meta_learner)

# Train the stacking model
stacking_regressor.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = stacking_regressor.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print(f"Stacking Regressor RMSE: {rmse:.4f}")


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 47577, number of used features: 12
[LightGBM] [Info] Start training from score 0.380105
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 38061, number of used features: 12
[LightGBM] [Info] Start training from score 0.380304
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001995 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

ResNet

In [None]:
# Split into training and validation sets (80% train, 20% validation)
X_train2, X_val2, y_train2, y_val2 = train_test_split(imagedata, target, test_size=0.2, random_state=42)

# Ensure the labels are in the correct shape
y_train2 = y_train2.values.reshape(-1, 1)
y_val2 = y_val2.values.reshape(-1, 1)

In [None]:
# Load ResNet50 model with pre-trained ImageNet weights
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the layers of ResNet
base_model.trainable = False

# Build the model
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(1024, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='linear')  # Use linear for regression output
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss='mse', 
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

# Train the model
history = model.fit(
    X_train2, y_train2, 
    validation_data=(X_val2, y_val2), 
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Evaluate the model
val_predictions = model.predict(X_val2)
rmse = np.sqrt(mean_squared_error(y_val2, val_predictions))
print(f"Validation RMSE: {rmse:.4f}")

Epoch 1/100


ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("sequential_4_1/Cast:0", shape=(32, 1), dtype=float32). Expected shape (None, 224, 224, 3), but input has incompatible shape (32, 1)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 1), dtype=string)
  • training=True
  • mask=None