<a href="https://www.kaggle.com/code/yiiiiiwen/swin-svr-xg?scriptVersionId=199851894" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import os 
import sys
import pandas as pd
import numpy as np
import math 
import gc 
import random 
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
import torch
import xgboost as xgb
from PIL import Image
from transformers import AutoImageProcessor, SwinModel
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

2024-10-07 09:23:44.303648: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-07 09:23:44.303766: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-07 09:23:44.444351: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
torch.cuda.set_per_process_memory_fraction(0.7)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'

In [3]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)

seed_everything(42)


In [4]:
df_train = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')
df_train['norm_score'] = df_train['Pawpularity'] / 100 
df_test  = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/test.csv')

df_train['path'] = df_train['Id'].map(lambda a: f'../input/petfinder-pawpularity-score/train/{a}.jpg')
df_train = df_train.drop(columns=['Id'])
df_test['path'] = df_test['Id'].map(lambda a: f'../input/petfinder-pawpularity-score/test/{a}.jpg')
df_test['Pawpularity'] = [1]*len(df_test)
df_test = df_test.drop(columns=['Id'])

df_train = df_train.sample(frac=1.0).reset_index(drop=True)

In [5]:
def sturage_optimal_bins(data: np.array) -> int:
    n = data.size
    width = 1.0 + np.log2(n)
    nbins = math.ceil((data.max() - data.min()) / width)
    return max(1, nbins)

num_bins = sturage_optimal_bins(df_train['Pawpularity'].values)
df_train['bins'] = pd.cut(df_train['norm_score'], bins=num_bins, labels=False)

In [6]:
seed = 42 
df_train['kfold'] = -1 
n_folds = 10 
strat_kfold = StratifiedKFold(n_folds, random_state=seed, shuffle=True)
for i, (_, train_index) in enumerate(strat_kfold.split(df_train.index, df_train['bins'])):
    df_train.iloc[train_index, -1] = i 
df_train['kfold'] = df_train['kfold'].astype('int')

In [7]:
feature_extractor = AutoImageProcessor.from_pretrained('/kaggle/input/swin-tiny/swin_processor')
swin_model = SwinModel.from_pretrained('/kaggle/input/swin-tiny/swin_model')

def extract_features(image_path):
    image = Image.open(image_path).convert('RGB')
    inputs = feature_extractor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = swin_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [8]:
df_train['features'] = df_train['path'].map(extract_features)
df_test['features'] = df_test['path'].map(extract_features)

In [9]:
# Define base models
base_models = [
    ('ridge', Ridge(alpha=1.0)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=seed)),
    ('svr', SVR(kernel='linear', C=1.0))
]


In [10]:
# Initialize arrays to hold meta-features
train_meta_features = np.zeros((len(df_train), len(base_models)))
test_meta_features = np.zeros((len(df_test), len(base_models)))

kf = KFold(n_splits=n_folds, random_state=seed, shuffle=True)


In [11]:
# Train base models and generate meta-features
for i, (name, model) in enumerate(base_models):
    print(f"Training base model: {name}")
    test_fold_preds = np.zeros((len(df_test), n_folds))
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_train)):
        X_train, X_val = np.vstack(df_train.iloc[train_idx]['features'].values), np.vstack(df_train.iloc[val_idx]['features'].values)
        y_train, y_val = df_train.iloc[train_idx]['Pawpularity'].values, df_train.iloc[val_idx]['Pawpularity'].values

        model.fit(X_train, y_train)
        val_preds = model.predict(X_val)
        train_meta_features[val_idx, i] = val_preds

        test_preds = model.predict(np.vstack(df_test['features'].values))
        test_fold_preds[:, fold] = test_preds

    test_meta_features[:, i] = test_fold_preds.mean(axis=1)

Training base model: ridge
Training base model: rf
Training base model: svr


In [12]:
# Train the second-level model using the meta-features
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, learning_rate=0.05, max_depth=5)
xgb_model.fit(train_meta_features, df_train['Pawpularity'].values)


In [13]:
# Predict on the test set
final_preds = xgb_model.predict(test_meta_features)
print(final_preds)

[54.38139  50.97424  51.84437  51.744244 52.847122 52.46323  55.25689
 53.653767]


In [14]:
# Calculate RMSE for each fold
rmse_scores = []
for fold, (train_idx, val_idx) in enumerate(kf.split(df_train)):
    val_preds = train_meta_features[val_idx, :]
    val_true = df_train.iloc[val_idx]['Pawpularity'].values
    fold_rmse = np.sqrt(mean_squared_error(val_true, xgb_model.predict(val_preds)))
    rmse_scores.append(fold_rmse)

# Output RMSE results
print(f'Cross-Validation RMSE: {rmse_scores}')
print(f'Mean RMSE: {np.mean(rmse_scores)}')

Cross-Validation RMSE: [17.269194613419668, 16.8200658613237, 16.31664839101078, 16.23107424761564, 17.187149944360595, 16.93719863614074, 17.25212909661776, 17.605725380675576, 17.457546754934146, 16.86147722534854]
Mean RMSE: 16.993821015144718


In [15]:
df_sample = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/sample_submission.csv')
df_sample['Pawpularity'] = final_preds
df_sample.head()

Unnamed: 0,Id,Pawpularity
0,4128bae22183829d2b5fea10effdb0c3,54.38139
1,43a2262d7738e3d420d453815151079e,50.974239
2,4e429cead1848a298432a0acad014c9d,51.844372
3,80bc3ccafcc51b66303c2c263aa38486,51.744244
4,8f49844c382931444e68dffbe20228f4,52.847122


In [16]:
df_sample.to_csv('submission.csv', index=False)