# How to build Skynet: 7 steps
### Intro to Neural Networks


## Pythology Session
## September 2017
## Brandon Boynton
## Breast Cancer Identifier


In [2]:
import pandas as pd
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.metrics import accuracy_score

Using TensorFlow backend.


In [3]:
#Configuration variables
TEST_SPLIT = 0.1
HOLE_SYMBOL = '?'
PREDICTION_COL = 'class'
HM_EPOCHS = 10
LEARNING_RATE = 0.001
HM_INPUTS = 9
HM_OUTPUTS = 2


#   ~~ 1. preprocessing

In [31]:
df = pd.read_csv('breast-cancer-wisconsin.csv') #Load CSV into pandas Dataframe

#   1a. Drop any columns that are not pertinant
df.drop("id", axis=1, inplace=True) #Drop the ID column

#   1b. Replace any holes in the data with an unnoticable value
cols = list(df.columns.values) #Get all of the columns
for col in cols:
    print(col, df[col].dtype)

clump_thickness int64
unif_cell_size int64
unif_cell_shape int64
marg_adhesion int64
single_epith_cell_size int64
bare_nuclei object
bland_chrom int64
norm_nucleoli int64
mitosis int64
class int64


## column 'bare_nuclei' has "?"

In [36]:
df['bare_nuclei'].replace("?",-1,inplace=True)
df['bare_nuclei']=df.bare_nuclei.apply(lambda x: pd.to_numeric(x))

df.bare_nuclei.describe()

count    699.000000
mean       3.440629
std        3.665507
min       -1.000000
25%        1.000000
50%        1.000000
75%        5.000000
max       10.000000
Name: bare_nuclei, dtype: float64

# get the mean for those that are non negative

In [38]:
to_replace_mean = df[df.bare_nuclei>-1].bare_nuclei.mean()


In [39]:
df.bare_nuclei.replace(-1,to_replace_mean,inplace=True)
df.bare_nuclei.describe()

count    699.000000
mean       3.544656
std        3.601852
min        1.000000
25%        1.000000
50%        1.000000
75%        5.000000
max       10.000000
Name: bare_nuclei, dtype: float64

In [40]:
"""for col in cols: #for each column
    only_nums = [] #Initialize array for all the numbers in the array
    for row in df[col].tolist(): #For every row in the column
        if isinstance(row, (int,float,complex)): #(int, long, float, complex)):
            only_nums.append(row) #Add all the numbers to only_nums
    col_mean = np.mean(only_nums) #Get the average of all the rows
    new_col = df[col].tolist() #Create new column with the existing column
    for i, row in enumerate(new_col): #Loop through new column
        if row == HOLE_SYMBOL: #Replace any holes with the average value
            new_col[i] = col_mean
    df[col] = new_col #Replace existing column with new column"""




'for col in cols: #for each column\n    only_nums = [] #Initialize array for all the numbers in the array\n    for row in df[col].tolist(): #For every row in the column\n        if isinstance(row, (int,float,complex)): #(int, long, float, complex)):\n            only_nums.append(row) #Add all the numbers to only_nums\n    col_mean = np.mean(only_nums) #Get the average of all the rows\n    new_col = df[col].tolist() #Create new column with the existing column\n    for row in new_col: #Loop through new column\n        if row == HOLE_SYMBOL: #Replace any holes with the average value\n            new_col[row] = col_mean\n    df[col] = new_col #Replace existing column with new column'

In [41]:
# In other cases, you may need to drop rows, convert strings into numerical data, or clense data

#    1c. Set up dataframe for the neural network
hm_test_rows = int(len(df.index) * float(TEST_SPLIT)) #How many rows should we reserve for testing the AI
hm_train_rows = len(df.index) - hm_test_rows #How many rows should we train on

train = df.head(hm_train_rows) #Get the first # of rows for training
test = df.tail(hm_test_rows) #Get the last # of rows for testing

X = np.array(train.drop([PREDICTION_COL],1).astype(float)) #Format input data into variable X
X = np.array(X).reshape(hm_train_rows, HM_INPUTS) #turn multidimensional array into readable shape
y = np.array(df[PREDICTION_COL]) #get only self.prediction_col




In [43]:

#   1d. Convert values into one-hot encoded tensors
label_vals = [2,4] #Possible outputs

new_y = []
for label in y: #for each value in y
    empty_tensor = [0,0] #create array with 0 for each unique element in y
    modified_tensor = np.array(empty_tensor) # create new modified_tensor from empty_tensor
    label_index = label_vals.index(label) # get the index of that element from all unique elements
    modified_tensor[label_index] = 1 #set that index to 1
    new_y.append(modified_tensor)
y = new_y #replace y with the new formatted y

test_X = np.array(test.drop([PREDICTION_COL],1).astype(float))
test_X = np.array(test_X).reshape(hm_test_rows, HM_INPUTS)
test_y = y[hm_train_rows:]

y = y[:hm_train_rows]
y = np.array(y)

y_true = test_y #format test_y labels for the acuracy test
y_true = [np.argmax(x) for x in y_true]



In [49]:
test_X.shape, len(test_y), len(y_true), X.shape

((69, 9), 69, 69, (630, 9))

#   ~~ 2. Create the neural network architecture

In [50]:


net = Sequential() #Create model

import tensorflow as tf #Reset graph just in case
tf.reset_default_graph()
from tensorflow.python.framework import ops
ops.reset_default_graph()
sess = tf.InteractiveSession()

net.add(Dense(HM_INPUTS,input_dim=HM_INPUTS))
#net.add(Dense(HM_INPUTS,input_shape=(None,HM_INPUTS)))
net.add(Dense(14))
net.add(Dense(HM_OUTPUTS))
#sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) #For custom optimizer
net.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) #Uses default optimizer




#   ~~ 3. Train the neural network

In [51]:
net.fit(X, y, epochs=HM_EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11f398e80>

In [52]:
#   3a. Get the accuracy

y_scores = net.predict(test_X)
y_scores = [np.argmax(x) for x in y_scores]
accuracy = accuracy_score(y_true, y_scores)

print("Network Accuracy: {}".format(accuracy))



Network Accuracy: 0.9565217391304348


In [None]:
#Create infinite loop for test inferencing
while (True):
    inference_str = input('> ')
    inference_tensor = inference_str.split(',')
    inference_tensor = np.array(list(map(int, inference_tensor)))
    p
    inference_tensor = inference_tensor.reshape((1,HM_INPUTS))
    results = net.predict(inference_tensor)
    print(results)

> 1,2,1,3,2,3,1,2,1
[1 2 1 3 2 3 1 2 1]
[[ 1.32536638  1.53314674]]
> 9,1,2,3,4,1,1,2,7
[9 1 2 3 4 1 1 2 7]
[[ 4.16923618 -0.72009552]]
