# Keras Logistic Regression

Using Keras with TensorFlow to run logistic regression on a breast cancer dataset.

dataset: ftp://ftp.cs.wisc.edu/math-prog/cpo-dataset/machine-learn/cancer/
breast cancer dataset from 1984, details: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Prognostic%29

After some data exploration, I found these features to be somewhat correlated with the recurrence of cancer: 'radius', 'perimeter', 'area', 'concave_points', 'col14', 'col16', 'col17', 'col24', 'col26', 'col27'

https://medium.com/@the1ju/simple-logistic-regression-using-keras-249e0cc9a970



In [18]:
# Relative path of the WPBC CSV file that is loaded with pd.read_csv below.
datasetPath = 'datasets/cancer/WPBC/WPBC.csv'
# Feature columns selected from the CSV as model inputs. The length of this
# list also determines the input dimension and the layer widths of the model.
_CSV_COLUMNS = [
    'radius', 'perimeter', 'area', 'concave_points', 'col14', 
    'col16', 'col17', 'col24', 'col26', 'col27',
]

import tensorflow as tf
import tempfile


In [16]:
def normalize(X):
    """Standardize X feature-wise to zero mean and unit standard deviation.

    Statistics are computed along axis 0, i.e. per column for a 2-D input
    (DataFrame or array) and over all values for a 1-D input. The axis is
    passed explicitly because ``np.mean(df)`` / ``np.std(df)`` forward
    ``axis=None`` to pandas, whose meaning changed between versions
    (column-wise in older releases, a single scalar over the whole frame in
    newer ones), which would silently mix the scales of different features.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or numpy.ndarray
        Numeric data with samples along axis 0.

    Returns
    -------
    Same container type as ``X``
        ``(X - mean) / std`` using the population standard deviation
        (``ddof=0``). Constant features (std == 0) are mapped to 0 instead
        of NaN/inf.
    """
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # A constant feature has std == 0; divide by 1 so it becomes all zeros
    # rather than producing NaN/inf values that would poison training.
    std = np.where(std == 0, 1.0, std)
    return (X - mean) / std



In [3]:
import pandas as pd
import numpy as np


In [4]:
# Load the raw dataset from datasetPath into a DataFrame and preview its first rows.
cancers = pd.read_csv(datasetPath)
cancers.head()

Unnamed: 0,ID,outcome,time,radius,texture,perimeter,area,smoothness,compactness,concativity,...,col26,col27,col28,col29,col30,col31,col32,col33,col34,col35
0,119513,N,31,18.02,27.6,117.5,1013.0,0.09489,0.1036,0.1086,...,139.7,1436.0,0.1195,0.1926,0.314,0.117,0.2677,0.08113,5.0,5
1,8423,N,61,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,...,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,3.0,2
2,842517,N,116,21.37,17.44,137.5,1373.0,0.08836,0.1189,0.1255,...,159.1,1949.0,0.1188,0.3449,0.3414,0.2032,0.4334,0.09067,2.5,0
3,843483,N,123,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,...,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,2.0,0
4,843584,R,27,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,...,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,3.5,0


## labels
Create two 0/1 indicator columns (`outcome_R` and `outcome_N`) from the `outcome` column, which holds the value `R` or `N`.

In [56]:
# Store the outcome code as a pandas categorical column.
cancers['outcome'] = cancers['outcome'].astype('category')

# One 0/1 indicator array per outcome code: 1 where the row carries that
# code, 0 otherwise.
r_flags = np.where(cancers['outcome'] == 'R', 1, 0)
n_flags = np.where(cancers['outcome'] == 'N', 1, 0)

# Two-column label frame consumed by the train/test split and the model.
y_data = {'outcome_R': r_flags, 'outcome_N': n_flags}
y = pd.DataFrame(data=y_data)
y.head()

Unnamed: 0,outcome_N,outcome_R
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1


In [57]:
# Features: the selected columns after normalization.
X = normalize(cancers[_CSV_COLUMNS])
# Labels: the two-column indicator frame built above.
Y = y

In [58]:
# Preview the first rows of the normalized feature frame.
X.head()

Unnamed: 0,radius,perimeter,area,concave_points,col14,col16,col17,col24,col26,col27
0,0.192688,0.123934,0.1223,-0.480176,0.069679,-0.129488,0.027606,0.143701,-0.022477,0.053105
1,0.183175,0.372418,0.088138,1.785222,1.589424,1.9801,1.737769,1.029751,1.53551,1.050496
2,1.254939,1.06161,1.147186,-0.147247,-0.058018,-0.149592,0.249081,0.916336,0.650684,0.930741
3,-1.900104,-1.747667,-1.662423,0.545246,-0.348324,-0.370283,-0.89841,-1.4441,-1.43924,-1.432374
4,0.912482,0.949089,0.930821,0.518612,0.49738,0.540353,0.505867,0.358716,0.411261,0.290905


In [59]:
# Preview the first rows of the label frame.
Y.head()

Unnamed: 0,outcome_N,outcome_R
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1


## Train-test division

In [60]:
from sklearn.model_selection import train_test_split
# Fixed random_state so the train/test partition is reproducible.
seed = 7

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=seed)


In [61]:
# Show the (rows, columns) shape of the training and test feature sets.
X_train.shape, X_test.shape

((158, 10), (40, 10))

In [35]:
from keras.models import Sequential

Using TensorFlow backend.


In [149]:
from keras.layers import Dense, Activation
# Number of input features; every layer width below is derived from it.
n_features = len(_CSV_COLUMNS)
# Two output units, one per label column.
output_dim = nb_classes = 2

# Fully connected stack: n_features -> 2 * n_features -> n_features - 5 -> 2.
# NOTE(review): softmax is used on the hidden layers as well as the output;
# this matches the configuration the recorded results were produced with.
model = Sequential([
    Dense(n_features, input_dim=n_features, activation='softmax'),
    Dense(n_features * 2, activation='softmax'),
    Dense(n_features - 5, activation='softmax'),
    Dense(output_dim, activation='softmax'),
])

In [143]:
# Number of selected feature columns (the model's input dimension).
len(_CSV_COLUMNS)

10

## Compile the model

In [167]:
# Configure training: optimizer, loss and the reported metric.
model.compile(
    optimizer='RMSprop',
    loss='binary_crossentropy',
    metrics=['categorical_accuracy'],
)

# Train silently; the held-out split is passed as validation data so the
# per-epoch validation metrics are recorded in `history`.
history = model.fit(
    X_train,
    y_train,
    batch_size=150,
    epochs=40,
    verbose=0,
    validation_data=(X_test, y_test),
)

# evaluate() returns [loss, categorical_accuracy] for the compiled model.
score = model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = score
print('Test score: ', test_loss)
print('Test accuracy: ', test_accuracy)


Test score:  0.5104709148406983
Test accuracy:  0.85


## hyper parameter tuning

### epochs
Above 200 epochs, the number of epochs does not affect the accuracy; it stays at 0.85.

### activation
softmax: 0.85

relu: 0.82

sigmoid: 0.85

tanh: 0.825

### loss
categorical_crossentropy, mean_squared_error, binary_crossentropy: no difference

### optimizer
sgd: 0.85

adam: 0.85

adamax: 0.85

RMSprop: 0.85 (and with this optimizer, 40 epochs are already enough to reach the same result)


### batch size

128: .85

5: 0.775

100: 0.775

