## Breast Cancer Prediction

Dataset found here: https://www.kaggle.com/datasets/vijayaadithyanvg/breast-cancer-prediction

The goal is to predict whether a patient has breast cancer based on the 30 clinical breast-mass features described below.

In [None]:
import pandas as pd

# Load the dataset, discard the 'id' column, and preview the first rows.
df = pd.read_csv('/content/breast-cancer.csv').drop(columns=['id'])
df.head()

In [None]:
# Print a summary of the frame: column names, dtypes, and non-null counts.
df.info()

In [None]:
import seaborn as sns

# Plot how many rows fall into each class of the target column.
# `data` is passed by keyword, like the other seaborn calls in this notebook;
# the positional form is not accepted by every seaborn release.
_ = sns.countplot(data=df, x='diagnosis')

Findings:
- The target classes are imbalanced, with negative outcomes outnumbering positive ones

In [2]:
# Collect the names of every float64/int64 column as the numerical feature list.
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
numerical = numeric_frame.columns.tolist()

In [15]:
import numpy as np

# Convert the categorical target to numerical: 'M' becomes 1, everything else 0.
# Guarded so the cell is safe to re-run: once the column is already numeric,
# comparing it to 'M' again would silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
  df['diagnosis'] = np.where(df['diagnosis'] == 'M', 1, 0)

In [None]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Compute the skewness of each numerical feature and list the most
# positively skewed ones first.
skewness = df[numerical].skew()
skewness.sort_values(ascending=False)

Findings:
- All features are right-skewed to some extent
- Most `_se` features are heavily right-skewed

In [5]:
def plot_continuous_distribution1(ax, data, column):
  """Draw a histogram with a KDE overlay for `column` of `data` on `ax`."""
  hist_axes = sns.histplot(data=data, x=column, kde=True, ax=ax)
  title = 'Distribution of ' + column
  hist_axes.set(title=title)

def plot_categorical_distribution1(ax, data, column):
  """Draw a count plot of the values in `column` of `data` on `ax`."""
  count_axes = sns.countplot(data=data, x=column, ax=ax)
  title = 'Distribution of ' + column
  count_axes.set(title=title)

def plot_continuous_distribution2(ax, data, column):
  """Draw a histogram with a KDE overlay for `column` on `ax`, split by 'diagnosis'.

  `data` must contain a 'diagnosis' column, which is used as the hue.
  """
  hist_axes = sns.histplot(data=data, x=column, hue='diagnosis', kde=True, ax=ax)
  title = 'Distribution of ' + column
  hist_axes.set(title=title)

def plot_categorical_distribution2(ax, data, column):
  """Draw a count plot of `column` on `ax`, split by 'diagnosis'.

  `data` must contain a 'diagnosis' column, which is used as the hue.
  """
  count_axes = sns.countplot(data=data, x=column, hue='diagnosis', ax=ax)
  title = 'Distribution of ' + column
  count_axes.set(title=title)

def plot_outlier_check(data, column):
  """Draw a box plot of `column` from `data` on the current matplotlib axes.

  The values are read from the `data` argument rather than from a global
  frame, so the plot always reflects whichever DataFrame the caller passes.
  """
  _ = sns.boxplot(data=data, x=column)

In [None]:
import matplotlib.pyplot as plt

# plot box-and-whisker plots to check for outliers
n_cols = 3
# Ceiling division so the grid always has room for every numerical feature.
n_rows = (len(numerical) + n_cols - 1) // n_cols
plt.figure(figsize=(12, 20))
for position, feature in enumerate(numerical, start=1):
  plt.subplot(n_rows, n_cols, position)
  plot_outlier_check(df, feature)
  plt.title(feature, size=18)
# Adjust spacing once, after every subplot and its title exist, instead of
# recomputing the layout on each iteration.
plt.tight_layout()

Findings:
- Several features have outliers, but removing them would further reduce an already small dataset

In [None]:
# Plot the distribution of the first half of the numerical features on a
# 3x5 grid, filling it row by row.
fig, axes = plt.subplots(3, 5, figsize=(18, 15))
first_half = numerical[:len(numerical)//2]
for idx, feature in enumerate(first_half):
  row, col = divmod(idx, 5)
  plot_continuous_distribution2(axes[row, col], df, feature)

In [None]:
# Plot the distribution of the second half of the numerical features.
second_half = numerical[len(numerical)//2:]
n_cols = 5
# Ceiling division (at least one row) so every feature gets a cell.
n_rows = max(1, (len(second_half) + n_cols - 1) // n_cols)
# squeeze=False keeps `axes` two-dimensional even for a single-row grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 15), squeeze=False)
# Grid cells are indexed relative to the start of this half. Deriving the row
# from the absolute feature index skipped row 0 and ran past the last row.
for offset, feature in enumerate(second_half):
  row, col = divmod(offset, n_cols)
  plot_continuous_distribution2(axes[row, col], df, feature)

In [None]:
# plot numerical feature correlations
# Pairwise correlation matrix of the frame's columns, kept in `corr` for later use.
corr = df.corr()
# Display the matrix as a table whose cell backgrounds follow the 'coolwarm' colormap.
corr.style.background_gradient(cmap='coolwarm')

In [9]:
# Separate the target column from the feature columns ahead of partitioning.
y1 = df['diagnosis']
X1 = df.drop('diagnosis', axis=1)

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# determine features with highest correlation to target
corrs = corr['diagnosis'].abs()
# Keep only features whose absolute correlation with the target exceeds 0.2.
results = corrs[corrs > 0.2]
# Take the feature names straight from the index (Series.iteritems() no longer
# exists in pandas 2.x) and leave out the target's correlation with itself.
alt_cols = [name for name in results.index if name != 'diagnosis']
print(results)

In [None]:
# Score every feature against the target with the ANOVA F-test (f_classif).
classifier = SelectKBest(score_func=f_classif)
results = classifier.fit(X1, y1)

# Pair each feature name with its score and list the highest scores first.
fscores = pd.DataFrame({'Attribute': X1.columns, 'Score': results.scores_})
fscores = fscores.sort_values(by='Score', ascending=False)
print(fscores)

Findings:
- Many features are correlated with the target; in particular, most variants of the radius and concave points features are highly correlated with it
- concave_points_worst, perimeter_worst, concave_points_mean, radius_worst, perimeter_mean, area_worst, and radius_mean all appear highly relevant for predicting breast cancer


In [20]:
# Import only what this cell uses instead of a wildcard import.
from sklearn.preprocessing import StandardScaler

# Feature-set switches: `feature_select` uses the F-score based columns,
# `alt_feature_select` uses the correlation based columns, and with both
# False every feature is kept. `feature_select` wins if both are True.
feature_select = False
alt_feature_select = True
# Features whose F-score exceeds 100.
ncols = fscores[fscores['Score'] > 100]['Attribute']

# standardize data
ss = StandardScaler()

if feature_select:
  X = X1[ncols]
elif alt_feature_select:
  X = X1[alt_cols]
else:
  X = X1
# The target is the same whichever feature set is chosen.
y = y1
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so statistics from held-out rows leak into the scaled training
# features. Consider fitting on the training partition only.
X = ss.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.layers import Dense, Dropout, Flatten, Input 
from keras.models import Model

# partition data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=14)

# create simple neural network
# `shape` is given as a tuple; a bare int is not accepted by every Keras release.
inputs = Input(shape=(X_train.shape[-1],))
x = Dense(16, activation='relu', kernel_initializer=tf.keras.initializers.RandomNormal())(inputs)
x = Dropout(0.4)(x)
# Chain from the previous layer's output. Feeding `inputs` here would discard
# the first Dense/Dropout pair from the network.
x = Dense(16, activation='relu', kernel_initializer=tf.keras.initializers.RandomNormal())(x)
x = Dropout(0.4)(x)
x = Dense(25, activation='relu', kernel_initializer=tf.keras.initializers.RandomNormal())(x)
# Single sigmoid unit: one output per sample for the binary target.
x = Dense(1, activation='sigmoid', kernel_initializer=tf.keras.initializers.RandomNormal())(x)
model = Model(inputs=inputs, outputs=x)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=100)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Raw model outputs for the held-out samples.
y_pred = model.predict(X_test)
# Turn the outputs into boolean class predictions using a 0.4 cut-off.
y_pred = y_pred > 0.4

accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print(f"Accuracy Score : {accuracy_pct}%")
print(classification_report(y_test, y_pred))

Findings:
- Model performs quite well in terms of both training and held-out test accuracy, but it is likely not generalizable to real-world data given the small size of the training dataset