We first import the necessary packages

In [152]:
# import libraries

import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score


We create labels for the attributes before reading in the data.

In [153]:
# load dataset and transform to pandas dataframe

# "ID" and "Diagnosis" followed by the ten base measurements of the dataset.
columns = ["ID","Diagnosis","radius","texture","perimeter","area","smoothness","compactness","concavity","concave points","symmetry","fractal dimension"]

def create_labels(columns):
  """Build the column labels for the header-less data file.

  The data file stores its feature columns in three consecutive groups:
  the mean of every base attribute, then the standard error of every base
  attribute, then the "worst" value of every base attribute. The labels
  must follow that same order, otherwise values are attached to the wrong
  attribute name.

  Args:
    columns: sequence whose first two entries are the ID and Diagnosis
      columns and whose remaining entries are the base attribute names.

  Returns:
    A list starting with "ID" and "Diagnosis", followed by ``<attribute>``
    (mean), ``SE_<attribute>`` and ``Worst_<attribute>`` labels, each group
    listed for all base attributes before the next group starts.
  """
  attributes = list(columns[2:])

  dataset_labels = ["ID","Diagnosis"]
  # group by statistic (mean, SE, worst), not by attribute, to match the file layout
  dataset_labels.extend(attributes)
  dataset_labels.extend(f"SE_{attribute}" for attribute in attributes)
  dataset_labels.extend(f"Worst_{attribute}" for attribute in attributes)

  return dataset_labels

# create dataset labels
dataset_labels = create_labels(columns)

We then read the data file.

Enter the File path in the file_path variable **below**

Ensure that the datafile has been uploaded to colab

Ensure that the file path is a **string**

In [None]:
# path of the data file to load
file_path = "/content/wdbc.data"

# the file is read without a header row; the generated labels become the column names
df = pd.read_csv(file_path, header=None, names=dataset_labels)

# number of rows that were read
print(df.shape[0])

df.head(n=10)

We remove duplicate rows from the dataset and fill missing values by replacing them with the class (Diagnosis) mean of each attribute.

In [None]:
# functions pre-processing data

# remove duplicated rows first, while the ID column is still part of the frame
dataset_distinct = df.drop_duplicates(keep="first")
# with duplicates gone, the ID column can be dropped
dataset = dataset_distinct.drop(columns=["ID"])

def replace_missing_data(dataframe):
  """Fill missing attribute values with the mean of the row's Diagnosis class.

  For every column other than "Diagnosis", each missing value is replaced by
  the mean of that column computed over the rows sharing the same Diagnosis
  value.

  Args:
    dataframe: DataFrame containing a "Diagnosis" column; all other columns
      are treated as attributes to be filled.

  Returns:
    The same DataFrame object that was passed in, with its attribute
    columns filled (the frame is modified in place).

  Raises:
    KeyError: if the frame has no "Diagnosis" column.
  """
  grouped = dataframe.groupby("Diagnosis")

  # select attributes by name so the result does not depend on column order
  for attribute in dataframe.columns.drop("Diagnosis"):
    mean_values = grouped[attribute].transform('mean')
    # Assign the filled column back: fillna(inplace=True) on a selected
    # column acts on a temporary object and is not guaranteed to update
    # the parent frame (it has no effect under pandas Copy-on-Write).
    dataframe[attribute] = dataframe[attribute].fillna(mean_values)

  return dataframe

# fill the missing attribute values of the de-duplicated data
dataset_post_process = replace_missing_data(dataframe=dataset)

# preview the first rows of the result
dataset_post_process.head(n=10)


We now visualize our data in order to better understand the attributes being used.

In [None]:
# one subplot per attribute, laid out on a 10 x 3 grid
num_rows = 10
num_cols = 3
fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(30, 50))
fig.suptitle("Box and Whisker Plots for Wisconsin Breast Cancer Dataset", fontsize=16)

# turn the 2-D grid of axes into a flat sequence so it can be indexed per attribute
axes = axes.flatten()


for i, attribute in enumerate(dataset_labels[2:]):
    ax = axes[i]
    values = dataset_post_process[attribute]
    boxplot = ax.boxplot(values, vert=False)
    ax.set_title(attribute)

    # quartiles of the attribute
    quartiles = np.percentile(values, [25, 50, 75])
    q1, median, q3 = quartiles
    # the far end of each whisker line is reported as Min / Max
    lower_whisker, upper_whisker = boxplot['whiskers']
    whisker_min = lower_whisker.get_xdata()[1]  # Minimum whisker
    whisker_max = upper_whisker.get_xdata()[1]  # Maximum whisker

    # (x, y) positions are in axes coordinates
    annotations = [
        (0.05, 0.85, f"Q1: {q1:.4f}"),
        (0.05, 0.75, f"Median: {median:.4f}"),
        (0.05, 0.65, f"Q3: {q3:.4f}"),
        (0.5, 0.85, f"Min: {whisker_min:.4f}"),
        (0.5, 0.75, f"Max: {whisker_max:.4f}"),
    ]
    for x_pos, y_pos, caption in annotations:
        ax.text(x_pos, y_pos, caption, transform=ax.transAxes)


# leave the top 5% of the figure free for the overall title
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

We can observe from the figures above that no attribute has outliers below its min value. In fact, all outliers lie above the max value.

We can also observe that for most attributes, the data is skewed to the right.
This is an aspect that can affect the predictions of the model and possibly introduce bias.


We now normalize the attributes and binarize our Diagnosis column in preparation for feeding the data to the KNN model.

In [None]:

def normalize_data(dataframe):
  """Scale every attribute to the range [0, 1] and binarize the Diagnosis column.

  Args:
    dataframe: DataFrame with a "Diagnosis" column holding 'M' / 'B' labels;
      all remaining columns are scaled with MinMaxScaler.

  Returns:
    A new DataFrame (default integer index, rows in the same order as the
    input) with the scaled attribute columns followed by a "Diagnosis"
    column in which 'M' is encoded as 1 and 'B' as 0. Any other label value
    is passed through unchanged. The input frame is not modified.
  """
  # Build a new label series rather than replacing in place, so the caller's
  # frame keeps its original labels and the result gets an integer dtype.
  label_codes = {'M': 1, 'B': 0}
  diagnosis = dataframe["Diagnosis"].map(lambda label: label_codes.get(label, label))
  features = dataframe.drop("Diagnosis", axis=1)  # returns a new frame; row order is unchanged

  x = features.values
  min_max_scaler = MinMaxScaler()
  x_scaled = min_max_scaler.fit_transform(x)
  df_normalized = pd.DataFrame(x_scaled, columns=features.columns)
  # Assign by position: the scaled frame has a fresh 0..n-1 index, while the
  # input index may have gaps (e.g. after dropping duplicate rows), so an
  # index-aligned assignment would attach labels to the wrong rows or NaN.
  df_normalized["Diagnosis"] = diagnosis.to_numpy()

  return df_normalized

df_normalized = normalize_data(dataset_post_process)

df_normalized.head(10)



In [None]:
# f1 score to score performance

# feature matrix (X) and target column (y)
X = df_normalized.drop(columns="Diagnosis")
y = df_normalized["Diagnosis"]

# hold out 20% of the rows as the test set; stratifying on y keeps the class
# frequencies the same in the training and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# test-set F1 score for every K from 1 to 10
k_values_scores = {}

for k in range(1, 11):
  knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
  y_pred = knn.predict(X_test)
  f1 = f1_score(y_test, y_pred)
  k_values_scores[k] = f1

# the first K (in increasing order) that reaches the maximum F1 score
best_K = max(k_values_scores.items(), key=lambda item: item[1])[0]

print(f"The best K value, with the highest F1 score is {best_K} with an F1 score of {k_values_scores[best_K]}.")

# split the mapping into parallel sequences for plotting
k = tuple(k_values_scores.keys())
score = tuple(k_values_scores.values())

sn.lineplot(x=k, y=score, marker='o')
plt.xlabel("K")
plt.ylabel("F1 Score")

We can observe that, given the current train/test split ratio and a random state of 42, the K value of 5 gives the highest F1 score of 0.95.

This can also be observed in the above line graph.

We can also observe that the F1 score remains relatively unchanged from K = 6 to K = 10.

There is a large jump from K = 1 to 3 and then a slight decrease at K = 4.

We must however note that many attributes in our dataset were skewed. This will likely introduce bias to our model. In order to correct this, more data must be collected, ensuring that it is not biased.