# Binary Classification with a Software Defects Dataset

This study implements an XGBoost classifier to predict software defects using code metrics.

The model is evaluated with the AUC-ROC score on a held-out validation set, using an 80-20 train-validation split.

Key hyperparameters include 200 estimators, 0.1 learning rate, and max depth of 4.

The model processes source code metrics to output defect probabilities, enabling automated software quality assessment.

Keywords: XGBoost, software defect prediction, machine learning, code metrics

Dataset: https://www.kaggle.com/competitions/playground-series-s3e23/data

Hugging Face: https://huggingface.co/spaces/alperugurcan/software-defect-predictors

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pickle

def load_and_prepare_data(train_path, test_path):
    """Read the train/test CSV files and split them into model inputs.

    Args:
        train_path: Path to the training CSV; must contain the ``id`` and
            ``defects`` columns.
        test_path: Path to the test CSV; must contain the ``id`` column.

    Returns:
        A 4-tuple ``(X, y, X_test, test_ids)``: the training frame without
        ``id``/``defects``, the ``defects`` column, the test frame without
        ``id``, and the test ``id`` column.
    """
    train_frame = pd.read_csv(train_path)
    test_frame = pd.read_csv(test_path)

    # 'id' is dropped from both feature frames; 'defects' is the target.
    features = train_frame.drop(columns=['id', 'defects'])
    target = train_frame['defects']
    test_features = test_frame.drop(columns=['id'])

    return features, target, test_features, test_frame['id']

def train_model(X, y, *, test_size=0.2, random_state=42,
                n_estimators=200, learning_rate=0.1, max_depth=4):
    """Fit an XGBoost classifier and report its validation AUC-ROC.

    The data is split into train/validation parts, the classifier is fitted
    on the training part only, and the AUC-ROC on the validation part is
    printed. The defaults reproduce the previously hard-coded configuration,
    so ``train_model(X, y)`` behaves as before.

    Args:
        X: Feature table.
        y: Target labels aligned with ``X``.
        test_size: Fraction of the data held out for validation.
        random_state: Seed passed to both the split and the classifier.
        n_estimators: Number of boosting rounds.
        learning_rate: Boosting learning rate.
        max_depth: Maximum depth of each tree.

    Returns:
        The fitted ``XGBClassifier`` (trained on the training part only).
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
    )

    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score for the AUC computation.
    y_pred = model.predict_proba(X_val)[:, 1]
    auc_score = roc_auc_score(y_val, y_pred)
    print(f"Validasyon AUC-ROC Skoru: {auc_score:.4f}")

    return model

def make_predictions(model, X_test, test_ids):
    """Build the submission table from the model's predicted probabilities.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Feature table to score.
        test_ids: Identifiers placed in the ``id`` column of the result.

    Returns:
        A DataFrame with columns ``id`` and ``defects``, where ``defects``
        holds column 1 of ``model.predict_proba(X_test)``.
    """
    defect_proba = model.predict_proba(X_test)[:, 1]
    return pd.DataFrame({'id': test_ids, 'defects': defect_proba})

if __name__ == "__main__":
    # Script entry point: load the data, train, persist the model, and write
    # the submission file.
    train_path = '/kaggle/input/playground-series-s3e23/train.csv'
    test_path = '/kaggle/input/playground-series-s3e23/test.csv'

    X, y, X_test, test_ids = load_and_prepare_data(train_path, test_path)

    model = train_model(X, y)

    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of leaking the open handle.
    with open('xgboost_model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    submission_df = make_predictions(model, X_test, test_ids)
    submission_df.to_csv('submission.csv', index=False)

Validasyon AUC-ROC Skoru: 0.7889
