<a href="https://colab.research.google.com/github/agsuvidha/ML-Models-Software-Defect-/blob/main/MLModelAnalysis_KC1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas numpy scikit-learn xgboost



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Load the dataset from the working directory.
data = pd.read_csv("kc1.csv")

# The label column is whichever of "defect" / "defects" is present,
# checked in that order; anything else is treated as a fatal error.
target = next(
    (candidate for candidate in ("defect", "defects") if candidate in data.columns),
    None,
)
if target is None:
    raise ValueError("Target column not found!")

# Separate the label (y) from the remaining feature columns (X).
y = data[target]
X = data.drop(columns=[target])





In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Learn the scaling statistics from the training partition only, then apply
# the same fitted transform to both partitions so no test-set information
# influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# Insertion order is the order in which the models are trained and evaluated.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # probability=True enables predict_proba, which the evaluation step needs
    # for AUC. The probability estimates rely on an internal shuffled
    # cross-validation, so random_state is fixed to keep the SVM's AUC
    # reproducible, like the other seeded models below.
    "SVM": SVC(kernel="rbf", probability=True, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        random_state=42
    ),
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        random_state=42
    )
}

In [None]:
# Metrics computed from hard class predictions, in the column order they
# appear in the results table. AUC is handled separately because it is
# computed from the predicted probabilities instead.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1-Score": f1_score,
}

results = []

for model_name, model in models.items():
    # Train on the scaled training partition.
    model.fit(X_train, y_train)

    # Hard labels for the threshold metrics; second predict_proba column
    # (index 1) as the score for AUC.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # One row per model, every score rounded to three decimals.
    row = {"Model": model_name}
    for label, metric in label_metrics.items():
        row[label] = round(metric(y_test, y_pred), 3)
    row["AUC"] = round(roc_auc_score(y_test, y_prob), 3)

    results.append(row)

In [None]:
# Build the comparison table and rank the models by F1-score, best first.
results_df = pd.DataFrame(results).sort_values(by="F1-Score", ascending=False)

print("\n===== MACHINE LEARNING MODEL COMPARISON =====\n")
print(results_df)


===== MACHINE LEARNING MODEL COMPARISON =====

                 Model  Accuracy  Precision  Recall  F1-Score    AUC
4              XGBoost     0.872      0.612   0.462     0.526  0.811
3        Random Forest     0.858      0.556   0.385     0.455  0.835
2          Naive Bayes     0.832      0.444   0.369     0.403  0.795
1                  SVM     0.865      0.700   0.215     0.329  0.677
0  Logistic Regression     0.860      0.636   0.215     0.322  0.804


In [None]:
# Persist the ranked comparison table next to the notebook; the DataFrame
# index is omitted from the file.
results_df.to_csv(path_or_buf="PROMISE_ML_Analysis_Results.csv", index=False)

print("\nResults saved as PROMISE_ML_Analysis_Results.csv")


Results saved as PROMISE_ML_Analysis_Results.csv
