# Breast Cancer

## Introduction


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedShuffleSplit
import tensorflow as tf

In [None]:
# Load the raw dataset from the CSV file and preview its first rows.
df = pd.read_csv("Cancer_Data.csv")
df.head()

In [None]:
# Dimensions of the raw dataset as (rows, columns).
df.shape

In [None]:
# Per-column summary: names, non-null counts and dtypes.
df.info()

**Conclusion:**
* Sample code number is not relevant to us.
* All columns are numerical data.

### Missing Data

In [None]:
# Number of missing values in each column.
df.isnull().sum()

**Conclusion:** No missing data.

### Duplicated data

In [None]:
# Number of rows that repeat an earlier row exactly.
df.duplicated().sum()

**Conclusion:** There are 8 rows with duplicated data.

In [None]:
# Inspect the rows flagged as duplicates.
df[df.duplicated()]

In [None]:
# Keep only rows that are not flagged as duplicates (the first occurrence of
# each row survives), then confirm that the duplicate count is now 0.
# `~mask` is the idiomatic negation of a boolean mask (instead of `== False`).
df = df[~df.duplicated()]
df.duplicated().sum()

In [None]:
# Dimensions after the duplicated rows were removed.
df.shape

**Conclusion:** The duplicated rows are now removed.

# EDA

In [None]:
# Work on a new frame without the sample identifier, which this notebook
# treats as irrelevant for the analysis.
df_clean = df.drop(columns=['Sample code number'])
df_clean.head()

In [None]:
# Recode the target column: 2 -> 0 and 4 -> 1 (1 is the class the rest of
# the notebook treats as malignant). Any other value becomes NaN.
class_codes = {2: 0, 4: 1}
df_clean["Class"] = df_clean["Class"].map(class_codes)

In [None]:
def hide_current_axis(*args, **kwds):
    """Hide the currently active matplotlib axes.

    All positional and keyword arguments are accepted and ignored, so the
    function can be passed as a plotting callback to a seaborn grid.
    """
    current_axes = plt.gca()
    current_axes.set_visible(False)


# Pairwise plots for every column; the upper triangle of the grid is hidden.
e = sns.pairplot(df_clean)
e.map_upper(hide_current_axis)

**Conclusion:** The pair plot is not useful on this dataset.

### Counting The Malignant Tumor Cases

In [None]:
# Mean of the 0/1 `Class` column, shown in the title as a percentage,
# above a bar chart of the number of samples per class.
positive_pct = df_clean['Class'].mean() * 100
plt.title(f"Average {positive_pct:.2f}%")
sns.countplot(data=df_clean, x="Class")

In [None]:
# Pairwise correlation between all columns of the cleaned data.
corr_matrix = df_clean.corr()
corr_matrix

In [None]:
# Boolean mask covering the upper triangle (diagonal included) so every pair
# of columns is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(15, 8))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the annotated heatmap with the mask applied; the colour scale is fixed
# to [0, 1] and centred on 0.5.
sns.heatmap(
    corr_matrix,
    mask=mask,
    cmap=cmap,
    vmin=0,
    vmax=1,
    center=.5,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
)

**Conclusion:**

| Value 1        | Value 2       | Value | Observation   |
| -------------- | ------------- | ----- | ------------- |
|Class          | Clump Thickness              |  +0.72  | moderate correlation |
|Class          | Uniformity of Cell Size      |  +0.82  | strong correlation |
|Class          | Uniformity of Cell Shape     |  +0.82  | strong correlation |
|Class          | Marginal Adhesion            |  +0.71  | moderate correlation |
|Class          | Single Epithelial Cell Size  |  +0.69  | moderate correlation |
|Class          | Bare Nuclei                  |  +0.82  | strong correlation |
|Class          | Bland Chromatin              |  +0.76  | moderate correlation |
|Class          | Normal Nucleoli              |  +0.72  | moderate correlation |
|Class          | Mitoses                      |  +0.42  | weak correlation |
|Uniformity of Cell Size          | Uniformity of Cell Shape                      |  +0.91  | strongest correlation |

## Preprocess 

In [None]:
# Feature matrix: every column except the last one.
X = df_clean.iloc[:, :-1].values
print("X (inputs)", X, sep="\n")

# Target vector: the last column (expected to be 'Class') as booleans,
# True where the value equals 1.
y = df_clean.iloc[:, -1].values == 1
print("y (outputs)", y, sep="\n")

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

# Fit the scaler on the training part only, then apply the same
# transformation to both parts.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

print("X_train.shape:", X_train.shape)
print("X_train:", X_train)
print("X_test.shape:", X_test.shape)
print("X_test:", X_test)

In [None]:
print("training new model")

# NOTE: `original_acc_per_fold` is not used in this cell; it is kept so that
# any other cell referencing it keeps working.
original_acc_per_fold: list[float] = []
acc_per_fold: list[float] = []
loss_per_fold: list[float] = []

num_folds = 10
max_epochs = 300
# Must be smaller than `max_epochs`; otherwise early stopping can never fire.
early_stopping_patience = 30


def _build_ann():
    """Return a new, compiled binary classifier with untrained weights."""
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units=6, activation='relu'),
        tf.keras.layers.Dropout(rate=0.3),
        tf.keras.layers.Dense(units=6, activation='relu'),
        tf.keras.layers.Dropout(rate=0.3),
        tf.keras.layers.Dense(units=1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


# Repeated stratified random train/test splits. This is not a true K-fold
# partition: the test sets of different splits may overlap.
kfold = StratifiedShuffleSplit(n_splits=num_folds, random_state=0)

for fold_no, (train_indices, test_indices) in enumerate(kfold.split(X, y), start=1):

    # Standardise with statistics from this split's training rows only.
    # `sc` is re-bound on every iteration, so after the loop `sc` and `ann`
    # both belong to the last split. The prediction cell relies on that
    # pairing: it scales its input with `sc` before calling `ann`.
    sc = StandardScaler()
    X_train = sc.fit_transform(X[train_indices, :])
    y_train = y[train_indices]
    X_test = sc.transform(X[test_indices, :])
    y_test = y[test_indices]

    # Hold out part of the training rows for early stopping, so the test rows
    # are used only for the final score and do not influence which weights
    # are kept.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.1, stratify=y_train, random_state=0)

    ann = _build_ann()

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        # Number of epochs without improvement before training stops.
        patience=early_stopping_patience,
        restore_best_weights=True,
        verbose=1,
    )

    ann.fit(
        X_fit, y_fit,
        epochs=max_epochs,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
        verbose=0,
    )

    # Score on rows the model has never seen during fitting or early stopping.
    scores = ann.evaluate(X_test, y_test, verbose=0)

    loss_metric = scores[0]
    accuracy_metric = scores[1] * 100

    print(f'Score for fold {fold_no}: loss={loss_metric:.3f}, accuracy={accuracy_metric:.3f}%')

    acc_per_fold.append(accuracy_metric)
    loss_per_fold.append(loss_metric)

print(f"Accuracy (higher is better):          {np.mean(acc_per_fold):.3f}%")
print(f"Standard Deviation (lower is better): {np.std(acc_per_fold):.3f}%")
print(f"Loss (lower is better):               {np.mean(loss_per_fold):.3f}")

In [None]:
# Does person 'X' have breast cancer?

def predict_one_person_data(
        Clump, uniformity_Cell_Size, uniformity_Cell_Shape, Marginal_Adhesion, Single_Epithelial_Cell_Size, Bare_Nuclei, Bland_Chromatin, Normal_Nucleoli, Mitoses):
    """Print the model's prediction for one sample.

    The nine arguments are the feature values of a single sample. They are
    scaled with the module-level scaler `sc`, passed to the module-level
    model `ann`, and the raw model output is printed together with a verdict
    (an output above 0.5 is reported as cancer). Nothing is returned.
    """
    print("=" * 30)

    raw_features = [[
        Clump,
        uniformity_Cell_Size,
        uniformity_Cell_Shape,
        Marginal_Adhesion,
        Single_Epithelial_Cell_Size,
        Bare_Nuclei,
        Bland_Chromatin,
        Normal_Nucleoli,
        Mitoses,
    ]]
    input_data = sc.transform(raw_features)

    result = ann.predict(input_data)

    message = "this person has cancer" if result > 0.5 else "this person hasn't cancer"

    print(result, message)


predict_one_person_data(4, 1, 1, 3, 2, 1, 3, 1, 1)
predict_one_person_data(8, 10, 10, 8, 7, 10, 9, 7, 1)