In [2]:
import os

# Create the model directory to store your trained classifiers
if not os.path.exists('model'):
    os.makedirs('model')
    print("Directory 'model' created successfully.")

Directory 'model' created successfully.


In [5]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Loading the dataset (Change this part if you use a custom Kaggle CSV)
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split into Training and Testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (important for kNN and Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the test data to a CSV for your Streamlit upload requirement [cite: 91]
test_df = pd.DataFrame(X_test_scaled, columns=data.feature_names)
test_df['target'] = y_test
test_df.to_csv('test_data.csv', index=False)
print("Dataset loaded and test_data.csv created.")

Dataset loaded and test_data.csv created.


In [11]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.29.3-py3-none-manylinux_2_18_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.2.0-py3-none-manylinux_2_28_x86_64.whl (131.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.7/131.7 MB[0m [31m168.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.29.3-py3-none-manylinux_2_18_x86_64.whl (289.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.8/289.8 MB[0m [31m104.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.29.3 xgboost-3.2.0


In [13]:
import pandas as pd
import pickle
import os
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

# 1. Ensure the 'model' directory exists 
if not os.path.exists('model'):
    os.makedirs('model')

# 2. Load the dataset (using Breast Cancer as it meets the 12+ feature requirement) [cite: 30]
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# 3. Split and Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Initialize the 6 required models [cite: 34-39]
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}

performance_metrics = []

# 5. Train, Evaluate, and Save [cite: 40-46]
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]
    
    metrics = {
        "ML Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }
    performance_metrics.append(metrics)
    
    # Save model file to 'model/' folder 
    filename = f"model/{name.lower().replace(' ', '_')}.pkl"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)

# 6. Display the Comparison Table for your README [cite: 70, 71]
comparison_df = pd.DataFrame(performance_metrics)
print("--- COMPARISON TABLE ---")
print(comparison_df.to_string(index=False))

--- COMPARISON TABLE ---
      ML Model Name  Accuracy      AUC  Precision   Recall       F1      MCC
Logistic Regression  0.973684 0.997380   0.972222 0.985915 0.979021 0.943898
      Decision Tree  0.947368 0.943990   0.957746 0.957746 0.957746 0.887979
                kNN  0.947368 0.981985   0.957746 0.957746 0.957746 0.887979
        Naive Bayes  0.964912 0.997380   0.958904 0.985915 0.972222 0.925285
      Random Forest  0.964912 0.994923   0.958904 0.985915 0.972222 0.925285
            XGBoost  0.956140 0.990829   0.958333 0.971831 0.965035 0.906379


In [17]:
import pandas as pd
import pickle
import os
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, matthews_corrcoef

# 1. Create folder for saved models 
if not os.path.exists('model'):
    os.makedirs('model')

# 2. Load Dataset (Breast Cancer has 30 features and 569 instances, meeting requirements) [cite: 30]
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# 3. Split & Scale
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Define the 6 Required Models [cite: 34-39]
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "kNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}

results = []

# 5. Train and Save Models
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]
    
    # Calculate 6 Metrics [cite: 40-46]
    res = {
        "ML Model Name": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_proba),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }
    results.append(res)
    
    # Save the model file 
    with open(f"model/{name.lower().replace(' ', '_')}.pkl", 'wb') as f:
        pickle.dump(model, f)

# 6. OUTPUT: This table is for your README and PDF 
comparison_df = pd.DataFrame(results)
print(comparison_df.to_string(index=False))

# 7. Create a small test CSV for the Streamlit App 
test_sample = pd.DataFrame(X_test_scaled, columns=data.feature_names)
test_sample['target'] = y_test
test_sample.to_csv('test_data.csv', index=False)

      ML Model Name  Accuracy      AUC  Precision   Recall       F1      MCC
Logistic Regression  0.973684 0.997380   0.972222 0.985915 0.979021 0.943898
      Decision Tree  0.938596 0.932362   0.944444 0.957746 0.951049 0.868860
                kNN  0.947368 0.981985   0.957746 0.957746 0.957746 0.887979
        Naive Bayes  0.964912 0.997380   0.958904 0.985915 0.972222 0.925285
      Random Forest  0.964912 0.994268   0.958904 0.985915 0.972222 0.925285
            XGBoost  0.956140 0.990829   0.958333 0.971831 0.965035 0.906379
