# Breast Cancer ML Model

In [9]:
!pip install scikit-learn pandas --quiet

In [10]:
import sklearn
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer

## Saving and Importing data

In [11]:
sci_kit_internal_data = load_breast_cancer()

In [12]:
# Assemble the feature matrix and the target labels into one DataFrame:
# one column per feature name plus a trailing 'Diagnosis' label column.
feature_frame = pd.DataFrame(data=sci_kit_internal_data.data,
                             columns=sci_kit_internal_data.feature_names)
breast_cancer_data = feature_frame.assign(
    Diagnosis=sci_kit_internal_data.target)

In [None]:
breast_cancer_data['Diagnosis'].unique()

In [14]:
# Create the output folder first: to_csv raises OSError when the parent
# directory of the target path does not exist.
from pathlib import Path

Path('data').mkdir(parents=True, exist_ok=True)
breast_cancer_data.to_csv('data/breast_cancer_data.csv', index=False)

Or, equivalently, write the CSV through an explicitly opened file handle:

In [15]:
# Same export through an explicit file handle.
# pandas asks for text handles to be opened with newline='' so that it
# controls the line endings itself; without it, Windows output ends up with
# a blank line between rows.
import os

os.makedirs('data', exist_ok=True)  # open() fails if the folder is missing
with open('data/breast_cancer_data.csv', 'w',
          newline='', encoding='utf-8') as file:
    breast_cancer_data.to_csv(file, index=False)

In [None]:
bc_data = pd.read_csv('data/breast_cancer_data.csv')

## Preprocessing data

In [None]:
# Class balance: absolute number of rows per label, and each label's share
# of the whole dataset expressed as a percentage.
diagnosis_counts = bc_data['Diagnosis'].value_counts()
total_rows = len(bc_data)
diagnosis_percentages = diagnosis_counts.div(total_rows).mul(100)
print(diagnosis_counts, diagnosis_percentages)

In [18]:
# Turn the counts Series into a two-column frame (label, number of rows)
# so it can be handed straight to the plotting call.
diagnosis_df = (diagnosis_counts
                .reset_index()
                .set_axis(['Diagnosis', 'Count'], axis=1))

In [None]:
# Bar chart of how many samples fall into each diagnosis class.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=diagnosis_df,
            x='Diagnosis',
            y='Count',
            hue='Diagnosis',   # colour each bar by its own class ...
            palette='muted',
            legend=False,      # ... without drawing a redundant legend
            ax=ax)
ax.set_title('Class Distribution of Breast Cancer Diagnosis', fontsize=14)
ax.set_xlabel('Diagnosis', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.show()

### Handle missing values

In [None]:
# Check for missing values
bc_data.isnull().sum()

In [21]:
# Impute missing entries in place with the median of their own column.
column_medians = bc_data.median()
bc_data.fillna(column_medians, inplace=True)

In [None]:
# dropna() returns a new frame and leaves bc_data untouched, so assign the
# result back; otherwise rows that still contain missing values are only
# hidden in this cell's output and stay in the data used downstream.
bc_data = bc_data.dropna()
bc_data

### Scale Numerical features

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
scaler = StandardScaler()

In [25]:
# Standardise every column except the label.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling. Consider
# fitting on the training split only (e.g. via a Pipeline).
num_features = bc_data.columns.drop('Diagnosis')
scaled_values = scaler.fit_transform(bc_data[num_features])
bc_data[num_features] = scaled_values

In [None]:
bc_data[num_features]

### Split features and labels

In [27]:
# Separate the predictors (every column except the label) from the target.
is_feature_column = bc_data.columns != 'Diagnosis'
X = bc_data.loc[:, is_feature_column]
y = bc_data.loc[:, 'Diagnosis']

### Train / Test Split

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Share of rows held out for testing: whatever is left after an 80 % training
# share. Rounding strips the float noise left by the subtraction.
train_share = 0.8
split_prop = round(1.0 - train_share, 1)

In [30]:
# Hold out `split_prop` of the rows for testing; the fixed random_state makes
# the shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_prop,
    random_state=42)

In [None]:
# Report the size and container type of each split. train_test_split returns
# the same container type it was given, so these are still pandas objects at
# this point, not NumPy arrays as the old message claimed.
print(f'The size of the training set is: {len(X_train)} records and its type is: {type(X_train)}')
print(f'The size of the test set is: {len(X_test)} records and its type is: {type(X_test)}.')

### Let the training commence

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

### Fitting one classifier

In [33]:
# Baseline classifier: logistic regression with the solver's iteration cap
# set to 1000.
model = LogisticRegression(max_iter=1000)
# Reference: https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
# Train the baseline model on the training split.
model.fit(X_train, y_train)

In [35]:
# Predicted class label for every row of the held-out test split.
class_preds = model.predict(X_test)

In [36]:
# Per-class probability estimates for the same test rows.
prob_preds = model.predict_proba(X_test)

In [None]:
# Evaluate on the test split (for scikit-learn classifiers, score() reports
# mean accuracy).
model.score(X_test, y_test)

### Fitting multiple candidate classifiers

In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# Candidate models keyed by the display name used in the results table.
# random_state is pinned on the estimators that accept it here so repeated
# runs give the same numbers.
my_classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    # probability=True is needed so predict_proba() is available on the SVM.
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
    "LDA": LinearDiscriminantAnalysis()
}

In [39]:
# Accumulator for the comparison run: one independent, initially empty list
# per result column; the training loop appends one entry per model.
result_columns = (
    "Model",
    "Accuracy",
    "Precision",
    "Recall",
    "F1 Score",
    "Training Time (s)",
    "Predictions",
    "Probabilities",
)
model_results = {column: [] for column in result_columns}

In [None]:
my_classifiers.keys()

In [None]:
my_classifiers.values()

In [42]:
from time import time
from sklearn.metrics import (precision_score, recall_score, f1_score,
                                              roc_auc_score, accuracy_score)

In [43]:
from time import perf_counter

# Start from empty result lists so that re-running this cell does not append
# a second copy of every model's row.
for collected in model_results.values():
    collected.clear()

for model_name, model in my_classifiers.items():
    # perf_counter() is the monotonic, high-resolution clock intended for
    # measuring durations; time() can jump if the system clock is adjusted.
    fit_started = perf_counter()
    model.fit(X_train, y_train)
    train_time = perf_counter() - fit_started

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)

    # precision/recall/F1 are computed for scikit-learn's default positive
    # label (1). NOTE(review): confirm which diagnosis is encoded as 1 before
    # reading these as "cancer detection" scores.
    model_results["Model"].append(model_name)
    model_results["Accuracy"].append(accuracy_score(y_test, y_pred))
    model_results["Precision"].append(precision_score(y_test, y_pred))
    model_results["Recall"].append(recall_score(y_test, y_pred))
    model_results["F1 Score"].append(f1_score(y_test, y_pred))
    model_results["Training Time (s)"].append(train_time)
    model_results["Predictions"].append(y_pred)
    model_results["Probabilities"].append(y_prob)


In [None]:
model_results.keys()

### Create a copy of our evaluation dictionary

In [None]:
# Shallow copy of the results that keeps only the scalar columns; the
# per-sample prediction and probability arrays do not belong in a summary
# table.
bulky_columns = ('Predictions', 'Probabilities')
model_scores_summary = {
    column: values
    for column, values in model_results.items()
    if column not in bulky_columns
}
print(model_scores_summary)

In [46]:
# One row per model, one column per metric (dict keys become column names).
results_df = pd.DataFrame(model_scores_summary)

In [None]:
results_df

### Visualise results

In [48]:
import matplotlib.pyplot as plt
import numpy as np

def multi_model_visualizer(results_df, metrics=None, colors=None):
    """Draw one horizontal bar chart per metric, comparing all models.

    Args:
        results_df: DataFrame with a "Model" column and one column per entry
            in ``metrics``.
        metrics: Column names to plot. Defaults to Accuracy, Precision,
            Recall, F1 Score and Training Time (s).
        colors: Bar colour per metric, cycled when there are more metrics
            than colours. Defaults to a five-colour palette.

    The panels are laid out three per row. At least two rows are created
    (the historical layout) and more are added when there are more than six
    metrics. Each panel has its own x-axis, because the metrics do not share
    a unit (scores in [0, 1] versus seconds).
    """
    # None sentinels instead of list defaults: a mutable default is shared
    # between calls.
    if metrics is None:
        metrics = ["Accuracy", "Precision", "Recall", "F1 Score",
                   "Training Time (s)"]
    if colors is None or len(colors) == 0:
        colors = ["lightcoral", "cornflowerblue",
                  "mediumseagreen", "mediumpurple", "gold"]
    metrics = list(metrics)
    colors = list(colors)

    n_cols = 3
    # Ceiling division; never fewer than the original two rows.
    n_rows = max(2, (len(metrics) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
    axes = axes.flatten()

    for i, metric in enumerate(metrics):
        ax = axes[i]
        bars = ax.barh(results_df["Model"], results_df[metric],
                       color=colors[i % len(colors)])
        ax.set_title(f"{metric} by Model")
        ax.set_xlabel(metric)
        # Put the first row of results_df at the top. This is row order,
        # not a ranking by score.
        ax.invert_yaxis()

        for bar in bars:
            width = bar.get_width()
            # Value label centred just inside the end of the bar.
            ax.text(width - 0.05 * width,
                    bar.get_y() + bar.get_height() / 2,
                    f"{width:.3f}", ha="center",
                    va="center", color="black", fontsize=10)

    # Remove the grid cells that did not receive a metric.
    for unused_ax in axes[len(metrics):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()


In [None]:
multi_model_visualizer(results_df,
                       colors=["#2b68a5","#1af298","mediumseagreen", "mediumpurple", "gold"])

### Select best model

In [50]:
def get_best_models_by_metric(results_df, metrics):
    """Find, for each metric, the model with the highest value.

    Args:
        results_df: DataFrame with a "Model" column and one column per metric.
        metrics: Iterable of metric column names to inspect.

    Returns:
        Dict mapping each metric name to the name of its top-scoring model.
        On ties the first row holding the maximum wins. A line is printed
        for every metric as it is processed.
    """
    winners = {}
    for metric_name in metrics:
        top_row = results_df[metric_name].idxmax()
        winners[metric_name] = results_df.loc[top_row, "Model"]
        print(f"Best model based on {metric_name}: {winners[metric_name]}")
    return winners

In [None]:
metric = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
best_models = get_best_models_by_metric(results_df, metric)

In [52]:
from collections import Counter
def find_modal_model(best_models):
    """Return the model name that wins the most metrics.

    Args:
        best_models: Dict mapping metric name -> winning model name.

    Returns:
        The most frequent value in ``best_models``. On a tie, the model that
        appears first wins. The result is also printed.
    """
    win_tally = Counter(best_models.values())
    top_entries = win_tally.most_common(1)
    modal_model, _wins = top_entries[0]
    print(f"Modal (most frequently best) model across metrics: {modal_model}")
    return modal_model

In [None]:
modal_model = find_modal_model(best_models)

## K-Fold Cross Validation

### Visualisation step

In [54]:
import matplotlib.pyplot as plt

def plot_kfold_cv(N=10000, K=5,
                  val_color='#432f8e',
                  train_color='#86277b',
                  save_path=None
                  ):
    """Draw a diagram of how K-fold cross-validation partitions N samples.

    One row is drawn per fold. Within a row the samples are shown as K
    consecutive segments; the segment used for validation in that fold is
    filled with ``val_color`` and the remaining ones with ``train_color``.

    Args:
        N: Total number of samples (positive integer).
        K: Number of folds (positive integer, at most N).
        val_color: Fill colour of the validation segment.
        train_color: Fill colour of the training segments.
        save_path: If given, the figure is saved there; otherwise it is
            shown with ``plt.show()``.

    Raises:
        TypeError: If N or K is not an integer (booleans are rejected).
        ValueError: If N or K is not positive, or K is greater than N.
    """
    from numbers import Integral

    # Accept any integer type (including NumPy integers) but not bool, which
    # is an int subclass and would otherwise silently pass as 0 or 1.
    for value in (N, K):
        if isinstance(value, bool) or not isinstance(value, Integral):
            raise TypeError("N and K must be integers.")
    N, K = int(N), int(K)
    if N <= 0 or K <= 0:
        raise ValueError("N and K must be positive integers.")
    if K > N:
        raise ValueError("K cannot be greater than N.")

    # Spread the N % K leftover samples over the first folds, one each.
    base_size, remainder = divmod(N, K)
    fold_sizes = [base_size + 1 if i < remainder else base_size
                  for i in range(K)]

    # Create the plot
    fig, ax = plt.subplots(figsize=(12, K))

    for i in range(K):
        y = K - i - 1  # fold 1 is drawn on the top row
        current = 0
        for j, fold_size in enumerate(fold_sizes):
            color = val_color if j == i else train_color
            rect = plt.Rectangle((current, y), fold_size, 0.8,
                                 facecolor=color, edgecolor='black')
            ax.add_patch(rect)
            current += fold_size
        ax.text(-5, y + 0.4, f'Fold {i + 1}', va='center', ha='right')

    ax.set_xlim(0, N)
    ax.set_ylim(-0.5, K + 0.5)
    ax.set_yticks([])
    ax.set_xlabel('Sample Index')
    ax.set_title(f'K-Fold Cross Validation Visualization (K={K})')
    plt.tight_layout()

    if save_path is not None:
        fig.savefig(save_path)
    else:
        plt.show()




In [None]:
plot_kfold_cv(K=5)

In [56]:
from sklearn.model_selection import train_test_split, KFold

In [57]:
# The fold loop below indexes X and y with positional index arrays, so
# switch from pandas containers to plain NumPy arrays (no-op if already
# converted) and make sure y is one-dimensional.
if isinstance(X, pd.DataFrame):
    X = X.values
if isinstance(y, pd.Series):
    y = y.values
y = y.ravel()

In [58]:
model = LogisticRegression(max_iter=1000)

In [59]:
K_FOLD = 5

In [60]:
# K_FOLD splits; rows are shuffled before splitting and the seed is fixed so
# every use of this splitter yields the same folds.
k_fold = KFold(n_splits=K_FOLD, shuffle=True, random_state=42)

In [61]:
fold_metrics = []

In [None]:
%%time
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

for fold, (train_idx, test_idx) in enumerate(k_fold.split(X)):
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]

  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  y_proba = model.predict_proba(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)

  fold_metrics.append({
      'fold': fold + 1,
      'accuracy_fold': accuracy,
      'precision_fold': precision,
      'recall_fold': recall,
      'f1_fold': f1,
  })

In [63]:
fold_eval_df = pd.DataFrame(fold_metrics)

In [None]:
fold_eval_df

### Visualise per-fold metrics

In [65]:
metrics = ['accuracy_fold', 'precision_fold', 'recall_fold', 'f1_fold']

In [None]:
import random
def generate_random_hex_color():
    """Return a random colour as a '#rrggbb' lowercase hex string."""
    rgb_value = random.randint(0, 0xFFFFFF)
    return f"#{rgb_value:06x}"

chart_colors = [generate_random_hex_color() for _ in range(len(metrics))]
chart_colors

In [None]:
# One panel per metric: bars show each fold's score and the dashed line
# links the same points so fold-to-fold variation is easy to see. The
# y-axis is shared so the panels are directly comparable.
fig, axes = plt.subplots(2, 2, figsize=(15, 10), sharey=True)
axes = axes.flatten()

for i, metric in enumerate(metrics):
    panel = axes[i]
    fold_numbers = fold_eval_df["fold"]
    fold_scores = fold_eval_df[metric]

    panel.bar(fold_numbers, fold_scores,
              color=chart_colors[i], alpha=0.6, label=metric)
    panel.plot(fold_numbers, fold_scores,
               marker="o", color='black', linestyle="--",
               linewidth=1, markersize=6, label=f"{metric} Points")
    panel.set_title(f"{metric} across {K_FOLD} folds")
    panel.set_xlabel("Fold")
    panel.set_ylabel(metric)
    panel.legend()

plt.tight_layout()
plt.show()

## A quicker way to get these folds

In [None]:
from sklearn.model_selection import cross_val_score

# cross_val_score runs the fit/predict/score loop for every fold of k_fold in
# a single call. The scoring here is recall, so the printed labels say so.
scores = cross_val_score(model, X, y, cv=k_fold, scoring='recall')
print("Individual fold recalls:", scores)
print("Mean recall:", scores.mean())

In [None]:
from sklearn.model_selection import cross_validate

scores_across = ['recall', 'precision']

# cross_validate scores every requested metric from one fit per fold, whereas
# calling cross_val_score once per metric refits the model each time.
# Per-fold results are returned under the key "test_<metric name>".
cv_results = cross_validate(model, X, y, cv=k_fold, scoring=scores_across)

for score_name in scores_across:
    scores = cv_results[f"test_{score_name}"]
    print(f"Mean {score_name}:", scores.mean())