# Load Files

In [1]:
%%time

import warnings
warnings.filterwarnings('ignore')

CPU times: user 30 µs, sys: 8 µs, total: 38 µs
Wall time: 41.2 µs


In [2]:
%%time

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report
import xgboost as xgb

data = pd.read_csv('/kaggle/input/bird-data/high_confidence_predictions.csv')

CPU times: user 1.21 s, sys: 104 ms, total: 1.31 s
Wall time: 1.17 s


In [3]:
%%time

# Encode the predicted class labels
le = LabelEncoder()
data['encoded_class'] = le.fit_transform(data['predicted_class'])

CPU times: user 3.22 ms, sys: 101 µs, total: 3.32 ms
Wall time: 6.2 ms


In [4]:
%%time

X = data[['confidence_score']].values  
y = data['encoded_class']

CPU times: user 1.36 ms, sys: 0 ns, total: 1.36 ms
Wall time: 5.13 ms


In [5]:
%%time


# Define K-Fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize arrays to store all predictions and true labels
all_predictions = []
all_true_labels = []

# Set the parameters for multi-class classification
params = {
    'objective': 'multi:softprob',
    'num_class': 10,
    'eval_metric': 'mlogloss',
    'use_label_encoder': False,
    'verbosity': 1
}

# Perform K-Fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Create DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    # Training with early stopping
    evals = [(dtrain, 'train'), (dtest, 'eval')]
    bst = xgb.train(
        params, 
        dtrain, 
        num_boost_round=500, 
        evals=evals, 
        early_stopping_rounds=20, 
        verbose_eval=50
    )
    
    # Make predictions and store them
    y_pred_proba = bst.predict(dtest)
    all_predictions.append(y_pred_proba)
    all_true_labels.append(y_test)

# Combine all predictions and true labels
y_pred_proba_all = np.vstack(all_predictions)
y_true_all = np.concatenate(all_true_labels)

# Calculate single ROC AUC score across all folds
roc_auc = roc_auc_score(y_true_all, y_pred_proba_all, multi_class='ovr')
print(f"\nOverall ROC AUC Score across all folds: {roc_auc:.4f}")

[0]	train-mlogloss:1.77887	eval-mlogloss:1.78231
[36]	train-mlogloss:0.82485	eval-mlogloss:1.17201
[0]	train-mlogloss:1.77460	eval-mlogloss:1.82213
[37]	train-mlogloss:0.83270	eval-mlogloss:1.21412
[0]	train-mlogloss:1.77871	eval-mlogloss:1.78215
[39]	train-mlogloss:0.82003	eval-mlogloss:1.17095
[0]	train-mlogloss:1.75562	eval-mlogloss:1.83730
[36]	train-mlogloss:0.79975	eval-mlogloss:1.29420
[0]	train-mlogloss:1.77291	eval-mlogloss:1.81041
[33]	train-mlogloss:0.82838	eval-mlogloss:1.18441

Overall ROC AUC Score across all folds: 0.5398
CPU times: user 2.38 s, sys: 129 ms, total: 2.51 s
Wall time: 700 ms


* ROC AUC Score is 0.5398

# Create Submission File

In [6]:
%%time

# Make predictions on the test set 
y_pred_proba = bst.predict(dtest)  # Predict probabilities
y_pred_class = np.argmax(y_pred_proba, axis=1)  # Convert probabilities to predicted classes

CPU times: user 14.9 ms, sys: 0 ns, total: 14.9 ms
Wall time: 4.91 ms


In [7]:
%%time

# Decode the predicted classes back to original labels
y_pred_labels = le.inverse_transform(y_pred_class)

CPU times: user 729 µs, sys: 92 µs, total: 821 µs
Wall time: 408 µs


In [8]:
%%time

sample_submission = pd.read_csv('/kaggle/input/birdclef-2024/sample_submission.csv')

CPU times: user 9.82 ms, sys: 984 µs, total: 10.8 ms
Wall time: 12.4 ms


In [9]:
%%time

y_pred_labels_corrected = y_pred_labels[:len(sample_submission)]

CPU times: user 9 µs, sys: 1e+03 ns, total: 10 µs
Wall time: 14.3 µs


In [10]:
%%time

sample_submission['predicted_class'] = y_pred_labels_corrected

CPU times: user 487 µs, sys: 0 ns, total: 487 µs
Wall time: 482 µs


In [11]:
%%time

sample_submission.shape

CPU times: user 8 µs, sys: 1 µs, total: 9 µs
Wall time: 12.4 µs


(3, 184)

In [12]:
%%time

sample_submission.head()

CPU times: user 130 µs, sys: 17 µs, total: 147 µs
Wall time: 128 µs


Unnamed: 0,row_id,asbfly,ashdro1,ashpri1,ashwoo2,asikoe2,asiope1,aspfly1,aspswi1,barfly1,...,whcbar1,whiter2,whrmun,whtkin2,woosan,wynlau1,yebbab1,yebbul3,zitcis1,predicted_class
0,soundscape_1446779_5,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,...,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,Animal
1,soundscape_1446779_10,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,...,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,Animal
2,soundscape_1446779_15,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,...,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,0.005495,Animal


In [13]:
%%time

# Save the submission file
sample_submission.to_csv('submission.csv', index=False)

CPU times: user 3.89 ms, sys: 1.24 ms, total: 5.13 ms
Wall time: 5.18 ms
