# Breast Cancer Diagnosis using TensorFlow

**Objective**: To classify a tumor as benign or malignant <br>
**Dataset used**: Wisconsin Diagnostic Breast Cancer dataset

### About the Dataset

The dataset was obtained by researchers at the University of Wisconsin by applying computer vision techniques to isolate
healthy and cancerous nuclei from images of FNA (Fine Needle Aspirate) biopsies. From these biposies, the following 10 features of each of the 3 nuclei were then extracted: radius, perimeter, area, compactness, smoothness, concavity, concave points, symmetry, fractal dimension, and texture. This data was tabulated for each of the 569 patients. In addition to the 30 features, the dataset also contains the ID Number and Diagnosis of the patient, yielding a database of 569 samples x 32 features.

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

  from ._conv import register_converters as _register_converters


In [2]:
df = pd.read_csv("Datasets/Diagnosis_Breast_Cancer.csv")

In [3]:
df.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,...,Column23,Column24,Column25,Column26,Column27,Column28,Column29,Column30,Column31,Column32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Labelling Columns

In [4]:
df.rename(columns={'Column1': 'ID No', 
                   'Column2': 'Diagnosis',
                   'Column3': 'Radius1',
                   'Column4': 'Texture1',
                   'Column5': 'Perimeter1',
                   'Column6': 'Area1',
                   'Column7': 'Smoothness1',
                   'Column8': 'Compactness1',
                   'Column9': 'Concavity1',
                   'Column10': 'ConcavePoints1',
                   'Column11': 'Symmetry1',
                   'Column12': 'FractalDim1',
                   'Column13': 'Radius2',
                   'Column14': 'Texture2',
                   'Column15': 'Perimeter2',
                   'Column16': 'Area2',
                   'Column17': 'Smoothness2',
                   'Column18': 'Compactness2',
                   'Column19': 'Concavity2',
                   'Column20': 'ConcavePoints2',
                   'Column21': 'Symmetry2',
                   'Column22': 'FractalDim2',
                   'Column23': 'Radius3',
                   'Column24': 'Texture3',
                   'Column25': 'Perimeter3',
                   'Column26': 'Area3',
                   'Column27': 'Smoothness3',
                   'Column28': 'Compactness3',
                   'Column29': 'Concavity3',
                   'Column30': 'ConcavePoints3',
                   'Column31': 'Symmetry3',
                   'Column32': 'FractalDim3'},inplace=True)

In [5]:
df.head()

Unnamed: 0,ID No,Diagnosis,Radius1,Texture1,Perimeter1,Area1,Smoothness1,Compactness1,Concavity1,ConcavePoints1,...,Radius3,Texture3,Perimeter3,Area3,Smoothness3,Compactness3,Concavity3,ConcavePoints3,Symmetry3,FractalDim3
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [6]:
df['Diagnosis'].unique()

array(['M', 'B'], dtype=object)

If tumor is malignant, it is denoted as '1' and if it is benign it is denoted as '0'

In [7]:
df['Diagnosis'] = df['Diagnosis'].apply(lambda x: 0 if x=='B' else 1)

In [8]:
df.head()

Unnamed: 0,ID No,Diagnosis,Radius1,Texture1,Perimeter1,Area1,Smoothness1,Compactness1,Concavity1,ConcavePoints1,...,Radius3,Texture3,Perimeter3,Area3,Smoothness3,Compactness3,Concavity3,ConcavePoints3,Symmetry3,FractalDim3
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


### Features and Labels

In [9]:
x_data = df.drop(["ID No","Diagnosis"],axis=1)
y_labels = df['Diagnosis']

### Convertng pandas DataFrame to numpy matrix

In [10]:
x_data = x_data.as_matrix()
y_labels = y_labels.as_matrix()

### Train-Test Split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(x_data,y_labels,test_size=0.3,random_state=101)

### Normalise Data

In [13]:
from sklearn.preprocessing import MinMaxScaler

In [14]:
scaler = MinMaxScaler()

In [15]:
scaled_x_train = scaler.fit_transform(X_train)
scaled_x_test = scaler.transform(X_test)

In [16]:
onehot_y_train = pd.get_dummies(y_train).as_matrix()

### Defining the Network

In [17]:
X_train.shape

(398, 30)

### Parameters

In [18]:
num_inputs = 30
num_hidden1 = 90
num_hidden2 = 30
num_outputs = 2
learning_rate = 0.005

### Placeholders

In [19]:
X = tf.placeholder(tf.float32,[None,num_inputs],name="X")
y_true = tf.placeholder(tf.float32,[None,num_outputs],name="Labels")

### Initialise Weights

In [20]:
W1 = tf.Variable(tf.random_normal([num_inputs,num_hidden1],stddev=0.01),name="W1")
b1 = tf.Variable(tf.random_normal([num_hidden1],stddev=0.01),name="b1")
W2 = tf.Variable(tf.random_normal([num_hidden1,num_hidden2],stddev=0.01),name="W2")
b2 = tf.Variable(tf.random_normal([num_hidden2],stddev=0.01),name="b2")
W3 = tf.Variable(tf.random_normal([num_hidden2,num_outputs],stddev=0.01),name="W3")
b3 = tf.Variable(tf.random_normal([num_outputs],stddev=0.01),name="b3")


w1s = tf.summary.histogram("W1",W1)
b1s = tf.summary.histogram("b1",b1)
w2s = tf.summary.histogram("W2",W2)
b2s = tf.summary.histogram("b2",b2)
w3s = tf.summary.histogram("W3",W3)
b3s = tf.summary.histogram("b3",b3)

### Choose activation function

In [21]:
actf = tf.nn.relu

### Operation

In [22]:
with tf.name_scope("NeuralNetwork"):
    O = tf.add(tf.matmul(X,W1),b1)
    Z = actf(O)
    Z = tf.nn.dropout(Z,0.25)
    
    O1 = tf.add(tf.matmul(Z,W2),b2)
    Z1 = actf(O1)
    Z1 = tf.nn.dropout(Z1,0.2)
        
    output = tf.add(tf.matmul(Z1,W3),b3)

### Error Calculation

In [23]:
with tf.name_scope("CrossEntropyError"):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_true,logits=output))
cs = tf.summary.scalar('cross_entropy_error',cost)

### Accuracy Calculation

In [24]:
with tf.name_scope('Accuracy'):
    with tf.name_scope('correct_prediction'):
        correct_prediction = tf.equal(tf.argmax(y_true, 1), tf.argmax(output, 1))
    with tf.name_scope('accuracy'):
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    acc = tf.summary.scalar('accuracy', accuracy)

### Optimizer

In [25]:
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(cost)

In [26]:
predict = tf.argmax(output,axis=1)

### Initialise Global Variables

In [27]:
init = tf.global_variables_initializer()

### Saving Model

In [28]:
saver = tf.train.Saver()

In [29]:
training_steps = 3000

with tf.Session() as sess:
    sess.run(init)
    writer = tf.summary.FileWriter("Diagnosis/Logs",sess.graph)
    summaries = tf.summary.merge([w1s,b1s,w2s,b2s,w3s,b3s,cs,acc])
    for i in range(training_steps):
        sess.run(train,feed_dict={X:scaled_x_train,y_true:onehot_y_train})
        pred = sess.run(predict, feed_dict={X:scaled_x_train,y_true:onehot_y_train})
        if i % 300 == 0:
            
            # Printing out accuracy
            correct_prediction = tf.equal(y_train, pred)
            print("Training Accuracy:",sess.run(tf.reduce_mean(tf.cast(correct_prediction, tf.float32))))
            
            s = sess.run(summaries,feed_dict={X:scaled_x_train,y_true:onehot_y_train})
            writer.add_summary(s, global_step=i)
        
    
    # Get predictions
    logits = output.eval(feed_dict={X:scaled_x_test})
    preds = tf.argmax(logits,axis=1)
    results = preds.eval()
    writer.close()
    saver.save(sess,'Diagnosis/Models/my_base_model.ckpt')

Training Accuracy: 0.63316584
Training Accuracy: 0.96231157
Training Accuracy: 0.959799
Training Accuracy: 0.96482414
Training Accuracy: 0.96733665
Training Accuracy: 0.9698492
Training Accuracy: 0.9698492
Training Accuracy: 0.9447236
Training Accuracy: 0.9748744
Training Accuracy: 0.9698492


### Evaluating Performance on Test set

In [30]:
from sklearn.metrics import confusion_matrix,classification_report
print(classification_report(results,y_test))

             precision    recall  f1-score   support

          0       0.99      0.93      0.96       112
          1       0.88      0.98      0.93        59

avg / total       0.95      0.95      0.95       171

