In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef
import joblib
import os

# Create model folder
os.makedirs('model', exist_ok=True)

# Load dataset directly (no file needed)
cancer = load_breast_cancer(as_frame=True)
df = cancer.frame

# Features and target (target: 0 = malignant, 1 = benign)
X = df.drop('target', axis=1)
y = df['target']

# Scale all features (all are numeric)
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Save scaler for Streamlit app
joblib.dump(scaler, 'model/scaler.pkl')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Dictionary to store results
results = {}

def train_evaluate_save(model, name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else np.nan
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    results[name] = {
        'Accuracy': round(acc, 4),
        'AUC': round(auc, 4),
        'Precision': round(prec, 4),
        'Recall': round(rec, 4),
        'F1': round(f1, 4),
        'MCC': round(mcc, 4)
    }

    joblib.dump(model, f'model/{name}.pkl')
    print(f"{name} trained and saved.")

# 1. Logistic Regression
train_evaluate_save(LogisticRegression(max_iter=1000), 'logistic_regression')

# 2. Decision Tree
train_evaluate_save(DecisionTreeClassifier(random_state=42), 'decision_tree')

# 3. KNN
train_evaluate_save(KNeighborsClassifier(), 'knn')

# 4. Naive Bayes (Gaussian - suitable for continuous features)
train_evaluate_save(GaussianNB(), 'naive_bayes')

# 5. Random Forest
train_evaluate_save(RandomForestClassifier(random_state=42), 'random_forest')

# 6. XGBoost
train_evaluate_save(XGBClassifier(eval_metric='logloss', random_state=42), 'xgboost')

# Show comparison table
metrics_df = pd.DataFrame(results).T
print("\n=== Model Comparison Table ===")
print(metrics_df)

# Observations (update these slightly based on your actual run if needed)
observations = {
    'logistic_regression': "Very strong linear model. High AUC and balanced precision/recall due to well-separated classes.",
    'decision_tree': "Prone to slight overfitting. Good recall but lower precision than ensembles.",
    'knn': "Distance-based; performs well after scaling. Competitive but sensitive to k.",
    'naive_bayes': "Fast and surprisingly effective despite independence assumption. High recall.",
    'random_forest': "Ensemble power → excellent AUC, robust, reduces variance over single tree.",
    'xgboost': "Best overall performance. Highest AUC/MCC thanks to boosting and handling of feature interactions."
}

print("\n=== Observations ===")
for model, obs in observations.items():
    print(f"{model.capitalize().replace('_', ' ')}: {obs}")

logistic_regression trained and saved.
decision_tree trained and saved.
knn trained and saved.
naive_bayes trained and saved.
random_forest trained and saved.
xgboost trained and saved.

=== Model Comparison Table ===
                     Accuracy     AUC  Precision  Recall      F1     MCC
logistic_regression    0.9825  0.9954     0.9861  0.9861  0.9861  0.9623
decision_tree          0.9123  0.9157     0.9559  0.9028  0.9286  0.8174
knn                    0.9649  0.9792     0.9595  0.9861  0.9726  0.9245
naive_bayes            0.9298  0.9868     0.9444  0.9444  0.9444  0.8492
random_forest          0.9561  0.9937     0.9589  0.9722  0.9655  0.9054
xgboost                0.9561  0.9901     0.9467  0.9861  0.9660  0.9058

=== Observations ===
Logistic regression: Very strong linear model. High AUC and balanced precision/recall due to well-separated classes.
Decision tree: Prone to slight overfitting. Good recall but lower precision than ensembles.
Knn: Distance-based; performs well after

In [2]:
# After: cancer = load_breast_cancer(as_frame=True)
df = cancer.frame

# Save a small test set (e.g., 100 rows)
test_sample = df.sample(n=100, random_state=42)  # or df.iloc[-100:]
test_sample.to_csv("breast_cancer_test_sample.csv", index=False)

print("Test CSV created: breast_cancer_test_sample.csv")

Test CSV created: breast_cancer_test_sample.csv
