In [1]:
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import pandas as pd

# Data Preprocessing
- Load the Breast Cancer dataset using load_breast_cancer from sklearn.
- Partition the data into an 80% training set and a 20% test set.
- Scale the features using StandardScaler for KNN.

In [2]:
# Load Breast Cancer dataset from sklearn
X, y = load_breast_cancer(return_X_y=True)

# Partition data into 80% training and 20% test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, train_size=0.80)

# Scale features for KNN 
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Training
1. K-Nearest Neighbors (KNN): Start with n_neighbors=5.
2. Decision Tree: Use the default settings initially, then experiment
with max_depth.
3. Random Forest: Start with 100 trees (n_estimators=100) and
explore the effect of different max_depth or min_samples_split.

In [3]:
# KNN Model
knn_model = KNeighborsClassifier(n_neighbors=5, weights='uniform')

# Train the KNN model 
knn_model = knn_model.fit(X_train_scaled, y_train) 

# Predict on scaled test set
y_predict_knn = knn_model.predict(X_test_scaled) 
# KNNs are sensitive to scaling of features, so need to scale the test features

In [4]:
# Decision Tree model 
decision_tree_model = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree model
decision_tree_model = decision_tree_model.fit(X_train, y_train)

# Predict on test set
y_predict_dec_tree = decision_tree_model.predict(X_test)

In [5]:
# Random Forest model
random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=42)

# Train the Random Forest Model
random_forest_model = random_forest_model.fit(X_train, y_train)

# Predict on test set
y_predict_rand_forest = random_forest_model.predict(X_test)

# Evaluating and Comparing Results

In [6]:
# Comparing Results
results = [
    {
         "Model": "KNN",
         "Accuracy": accuracy_score(y_test, y_predict_knn),
         "Precision": precision_score(y_test, y_predict_knn),
         "Recall": recall_score(y_test, y_predict_knn),
         "F1-Score": f1_score(y_test, y_predict_knn)},
    {
         "Model": "Decision Tree",
         "Accuracy": accuracy_score(y_test, y_predict_dec_tree),
         "Precision": precision_score(y_test, y_predict_dec_tree),
         "Recall": recall_score(y_test, y_predict_dec_tree),
         "F1-Score": f1_score(y_test, y_predict_dec_tree)},
    {
         "Model": "Random Forest",
         "Accuracy": accuracy_score(y_test, y_predict_rand_forest),
         "Precision": precision_score(y_test, y_predict_rand_forest),
         "Recall": recall_score(y_test, y_predict_rand_forest),
         "F1-Score": f1_score(y_test, y_predict_rand_forest)
    }       
]

table = pd.DataFrame(results)

print(table.to_string(index=False))

        Model  Accuracy  Precision   Recall  F1-Score
          KNN  0.956140   0.934211 1.000000  0.965986
Decision Tree  0.868421   0.900000 0.887324  0.893617
Random Forest  0.947368   0.933333 0.985915  0.958904


# Confusion Matrices

In [7]:
# KNN Confusion Matrix
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, y_predict_knn))

# Decision Tree Confusion Matrix 
print("\nDecision Tree Confusion Matrix:\n", confusion_matrix(y_test, y_predict_dec_tree))

# Random Forest Confusion Matrix 
print("\nRandom Forest Confusion Matrix:\n", confusion_matrix(y_test, y_predict_rand_forest))

KNN Confusion Matrix:
 [[38  5]
 [ 0 71]]

Decision Tree Confusion Matrix:
 [[36  7]
 [ 8 63]]

Random Forest Confusion Matrix:
 [[38  5]
 [ 1 70]]


# Ablation Study
- Modify key hyperparameters (e.g., n_neighbors for KNN, max_depth for Decision Trees and Random Forest) and observe the impact on performance.

In [8]:
ablation_results_knn = []

for k in [5, 7, 10, 13, 15]:
    knn_model_ablation = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    knn_model_ablation = knn_model_ablation.fit(X_train_scaled, y_train) 
    y_predict_knn_ablation = knn_model_ablation.predict(X_test_scaled)

    ablation_results_knn.append({
        "n_neighbors": f"{k}",
        "Accuracy": accuracy_score(y_test, y_predict_knn_ablation),
        "Precision": precision_score(y_test, y_predict_knn_ablation),
        "Recall": recall_score(y_test, y_predict_knn_ablation),
        "F1-Score": f1_score(y_test, y_predict_knn_ablation)
    })
    
ablation_knn_df = pd.DataFrame(ablation_results_knn)
print("KNN modified hyperparameters:")
print(ablation_knn_df.to_string(index=False))

KNN modified hyperparameters:
n_neighbors  Accuracy  Precision  Recall  F1-Score
          5  0.956140   0.934211     1.0  0.965986
          7  0.964912   0.946667     1.0  0.972603
         10  0.964912   0.946667     1.0  0.972603
         13  0.956140   0.934211     1.0  0.965986
         15  0.956140   0.934211     1.0  0.965986


In [9]:
ablation_results_dec_tree = []

for depth in [None, 5, 7, 10, 15]:
    decision_tree_model_ablation = DecisionTreeClassifier(max_depth=depth, random_state=42)
    decision_tree_model_ablation = decision_tree_model_ablation.fit(X_train, y_train)
    y_predict_dec_tree_ablation = decision_tree_model_ablation.predict(X_test)

    ablation_results_dec_tree.append({
        "max_depth": f"{depth}",
        "Accuracy": accuracy_score(y_test, y_predict_dec_tree_ablation),
        "Precision": precision_score(y_test, y_predict_dec_tree_ablation),
        "Recall": recall_score(y_test, y_predict_dec_tree_ablation),
        "F1-Score": f1_score(y_test, y_predict_dec_tree_ablation)
    })
    
ablation_dec_tree_df = pd.DataFrame(ablation_results_dec_tree)
print("Decision Tree modified hyperparameters:")
print(ablation_dec_tree_df.to_string(index=False))

Decision Tree modified hyperparameters:
max_depth  Accuracy  Precision   Recall  F1-Score
     None  0.868421   0.900000 0.887324  0.893617
        5  0.885965   0.902778 0.915493  0.909091
        7  0.850877   0.885714 0.873239  0.879433
       10  0.868421   0.900000 0.887324  0.893617
       15  0.868421   0.900000 0.887324  0.893617


In [10]:
ablation_results_rand_forest = []

for depth in [None, 2, 5, 10, 15]:
    random_forest_model_ablation = RandomForestClassifier(n_estimators=100, max_depth=depth, random_state=42)
    random_forest_model_ablation = random_forest_model_ablation.fit(X_train, y_train)
    y_predict_rand_forest_ablation = random_forest_model_ablation.predict(X_test)

    ablation_results_rand_forest.append({
        "max_depth": f"{depth}",
        "Accuracy": accuracy_score(y_test, y_predict_rand_forest_ablation),
        "Precision": precision_score(y_test, y_predict_rand_forest_ablation),
        "Recall": recall_score(y_test, y_predict_rand_forest_ablation),
        "F1-Score": f1_score(y_test, y_predict_rand_forest_ablation)
    })
    
ablation_rand_forest_df = pd.DataFrame(ablation_results_rand_forest)
print("Random Forest modified hyperparameters - max_depth:")
print(ablation_rand_forest_df.to_string(index=False))

ablation_results_rand_forest_sample = []

for sample in [2, 5, 10, 15, 20]:
    random_forest_model_ablation = RandomForestClassifier(n_estimators=100, min_samples_split=sample, random_state=42)
    random_forest_model_ablation = random_forest_model_ablation.fit(X_train, y_train)
    y_predict_rand_forest_ablation = random_forest_model_ablation.predict(X_test)

    ablation_results_rand_forest_sample.append({
        "min_samples_split": f"{sample}",
        "Accuracy": accuracy_score(y_test, y_predict_rand_forest_ablation),
        "Precision": precision_score(y_test, y_predict_rand_forest_ablation),
        "Recall": recall_score(y_test, y_predict_rand_forest_ablation),
        "F1-Score": f1_score(y_test, y_predict_rand_forest_ablation)
    })
    
ablation_rand_forest_df = pd.DataFrame(ablation_results_rand_forest_sample)
print("\nRandom Forest modified hyperparameters - min_samples_split")
print(ablation_rand_forest_df.to_string(index=False))

Random Forest modified hyperparameters - max_depth:
max_depth  Accuracy  Precision   Recall  F1-Score
     None  0.947368   0.933333 0.985915  0.958904
        2  0.947368   0.933333 0.985915  0.958904
        5  0.947368   0.933333 0.985915  0.958904
       10  0.947368   0.933333 0.985915  0.958904
       15  0.947368   0.933333 0.985915  0.958904

Random Forest modified hyperparameters - min_samples_split
min_samples_split  Accuracy  Precision   Recall  F1-Score
                2  0.947368   0.933333 0.985915  0.958904
                5  0.956140   0.945946 0.985915  0.965517
               10  0.947368   0.933333 0.985915  0.958904
               15  0.929825   0.920000 0.971831  0.945205
               20  0.938596   0.932432 0.971831  0.951724
