In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Load data
# Load data
print("Loading data...")
train_data = pd.read_csv('/kaggle/input/playground-series-s4e3/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s4e3/test.csv')
print("Data loaded successfully!")

Loading data...
Data loaded successfully!


In [3]:
# Separate features and target variables
print("Separating features and target variables...")
X = train_data.drop(columns=['id', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults'])
y = train_data[['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']]

Separating features and target variables...


In [4]:
# Split data into train and validation sets
print("Splitting data into train and validation sets...")
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data split completed!")

Splitting data into train and validation sets...
Data split completed!


In [5]:
# Train individual models for each defect category
def train_model(X_train, y_train, X_val, y_val):
    model = XGBClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    return model, y_pred

models = {}
val_preds = np.zeros_like(y_val)

for defect in y_train.columns:
    print(f"Training model for defect: {defect}")
    model, y_pred = train_model(X_train, y_train[defect], X_val, y_val[defect])
    models[defect] = model
    val_preds[:, y_train.columns.get_loc(defect)] = y_pred

Training model for defect: Pastry
Training model for defect: Z_Scratch
Training model for defect: K_Scatch
Training model for defect: Stains
Training model for defect: Dirtiness
Training model for defect: Bumps
Training model for defect: Other_Faults


In [6]:
# Calculate overall ROC AUC score
overall_auc = roc_auc_score(y_val, val_preds, average='macro')
print("Overall ROC AUC Score:", overall_auc)

Overall ROC AUC Score: 0.5


In [7]:
# Make predictions on test data
test_features = test_data.drop(columns=['id'])
test_preds = np.zeros((len(test_data), len(y_train.columns)))

In [8]:
for defect in y_train.columns:
    print(f"Making predictions for defect: {defect}")
    test_preds[:, y_train.columns.get_loc(defect)] = models[defect].predict_proba(test_features)[:, 1]

Making predictions for defect: Pastry
Making predictions for defect: Z_Scratch
Making predictions for defect: K_Scatch
Making predictions for defect: Stains
Making predictions for defect: Dirtiness
Making predictions for defect: Bumps
Making predictions for defect: Other_Faults


In [9]:
# Prepare submission file
submission = pd.DataFrame(test_preds, columns=y_train.columns)
submission['id'] = test_data['id']
submission = submission[['id', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']]

In [10]:
# Save submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved successfully!")

Submission file saved successfully!
