In [35]:
# Hepatocellular Carcinoma Patient Survival Prediction
# Andrea Mazzocchi
# v1 September 18, 2019

# Objective: 
"""Predict patient survival (yes/no) based on condition of liver (pressence of cirrhosis, nodules), lifestyle
(obesity, smoking, diabetes), and standard measures (biological sex, age)."""

# Hypothesis:
"""It is hypothesized that the data set will allow for predicition of survival (yes/no) with up to 
85% accuracy using condition of live, lifestyle, and standard measures factors."""

# Data Background: 
"""Publicly available data set from Kaggle entitled, "HCC dataset: Hepatocellular Carcinoma Dataset", 
uploaded by user mrsantos. The data contains 50 categories, 9 of which will be used for this study.

Features: age (20-93), gender (male/female, 0/1), cirrhosis (neg/pos, 0/1), nodules (0-5), 
          obesity (neg/pos, 0/1), smoking (neg/pos, 0/1), diabetes (neg/pos, 0/1), alcohol (neg/pos, 0/1),
          stage(1-4)
Label: class (dead/alive, 0/1)
"""

# Sample Size: 164 patients, 159 valid


# Plan of execution:
"""
1) Set objective and hypothesis, select categories/measures from dataset 
2) Import relevent libraries
3) Pre-process downloaded dataset (remove unused categories,remove incomplete sets, normalize non-binary data)
4) Randomly divide set into training (3/5), validation(1/5), and test(1/5) sets
5) Utiliize TensorFlow for logistic regression (train model)
6) Determine loss with training and validation data sets
7) Improve model through L1 regularization, L2 regularization, and learning rate
8) Use test set for final model analysis
9) Calculate final loss of model (training vs test) and output ROC and AUC curves
"""


# Step 1 - Set objective and hypothesis, select categories/measures from dataset (shown above)

# Step 2 - Import relevent libraries

#from _future_ import print_function

import math

from IPython import display 
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Step 3 - Pre-process downloaded dataset

# List contents in current directory
!ls

# Print working directory 
! pwd

# Opening dataset from directory 
hccSet = pd.read_csv("hcc-data-complete-balanced.csv")

# Removing unused categories (done by creating a dataframe that only includes categories of interest)
hccSet = hccSet[['Gender','Alcohol','Cirrhosis','Smoking','Diabetes','Obesity','Age','Nodule','Class']]
hccSet.shape


# Adding cancer stage at diagnosis
stage = pd.read_fwf('hcc-stages.txt')
stage.to_csv('hcc-stages.csv')
stage = pd.DataFrame(stage)
stage.columns=['Stage']
stageRow = len(stage)
hccSet[:-(204-stageRow)] # would like to change out use of 204
hccSet['Stage'] = stage

# Re-index Columns
columnsTitles = ["Gender","Alcohol","Cirrhosis","Smoking","Diabetes","Obesity","Age","Nodule","Stage","Class"]
hccSet = hccSet.reindex(columns=columnsTitles)


# Removing incomplete sets
hccSet = hccSet.replace('?', np.NaN)
hccSet = hccSet.dropna()
hccSet = hccSet.reset_index()

# Make all points floating
hccSet = hccSet.astype("float")

# Scale non-binary features to -1<x<1  (age, nodule, stage) - done by iterating through columns to determine if binary
for column in hccSet:
    a = hccSet[column].max() - hccSet[column].min()
    if a > 1 and column != 'index':
        mean = hccSet[column].mean()
        hccSet[column] = (hccSet[column].apply(lambda x: (float(x)-mean)/a))
    else:
        continue;
hccSet = hccSet.round(3)

# Further feature engineering for this data set is not required as all data is as integers or floating point

## Step 4 - Randomly divide set into training (~3/5), validation(~1/5), and test(~1/5) sets

# Reindex so data is randomized (already random, but will further randomize in case there is unknown bias)
hccSet = hccSet.reindex(np.random.permutation(hccSet.index))



# Select training examples (~3/5, 97 examples)
trainSet = hccSet.head(96)
remainSet = hccSet.tail(62)
# Select validation examples (~1/5, 31 examples)
valSet = hccSet.head(31)
# Select test examples (~1/5, 31 examples)
testSet = hccSet.tail(31)

# Separate features from targets (Class = target)
# Targets
hccTarg = hccSet[['Class']].copy()

# Features
hccFeat = hccSet[['Gender','Alcohol','Cirrhosis','Smoking','Diabetes','Obesity','Age','Nodule','Stage']].copy()


trainTarg = hccTarg.head(96)
trainFeat = hccFeat.head(96)

remainTarg = hccTarg.tail(62) 
remainFeat = hccFeat.tail(62)


valTarg = remainTarg.head(31)
valFeat = remainFeat.head(31)


testTarg = remainTarg.tail(31)
testFeat = remainFeat.tail(31)

# Check sets for similar representation
print("Training examples summary:")
display.display(trainFeat.describe())
print("Validation examples summary:")
display.display(valFeat.describe())

print("Training targets summary:")
display.display(trainTarg.describe())
print("Validation targets summary:")
display.display(valTarg.describe())

## Step 5 - Utiliize TensorFlow for logistic regression (train model)

# Creating feature columns
featureColumns = []

for header in ['Gender','Alcohol','Cirrhosis','Smoking','Diabetes','Obesity','Age','Nodule','Stage','Class']:
  featureColumns.append(tf.feature_column.numeric_column(header))


# Creating feature layers
featureLayer = tf.keras.layers.DenseFeatures(featureColumns)

# Creating batch size
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('Class')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.repeat().batch(batch_size)
  return ds

batch_size = 10
trainDS = df_to_dataset(trainSet, batch_size=batch_size)
valDS = df_to_dataset(valSet, shuffle=False, batch_size=batch_size)
testDS = df_to_dataset(testSet, shuffle=False, batch_size=batch_size)

# Train model
model = tf.keras.Sequential()
# Adds a densely-connected layer with 64 units to the model:
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Add another:
model.add(tf.keras.layers.Dense(64, activation='relu'))
# Add a softmax layer with 10 output units:
model.add(tf.keras.layers.Dense(10, activation='softmax'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(trainDS,
          validation_data=valDS,
          epochs=5)



[31mhcc-data-complete-balanced.csv[m[m hcc-stages.txt
[34mhcc-dataset[m[m                    hccSurvivalPrediction.ipynb
hcc-stages.csv
/Users/test-august/Google Drive/HCC ML
Training examples summary:


Unnamed: 0,Gender,Alcohol,Cirrhosis,Smoking,Diabetes,Obesity,Age,Nodule,Stage
count,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0,96.0
mean,0.822917,0.708333,0.875,0.447917,0.28125,0.104167,0.012344,-0.0475,0.016062
std,0.383743,0.456916,0.332455,0.49989,0.451969,0.30708,0.180866,0.34921,0.266132
min,0.0,0.0,0.0,0.0,0.0,0.0,-0.516,-0.535,-0.648
25%,1.0,0.0,1.0,0.0,0.0,0.0,-0.10825,-0.335,-0.148
50%,1.0,1.0,1.0,0.0,0.0,0.0,0.032,-0.135,0.102
75%,1.0,1.0,1.0,1.0,1.0,0.0,0.156,0.465,0.1645
max,1.0,1.0,1.0,1.0,1.0,1.0,0.388,0.465,0.352


Validation examples summary:


Unnamed: 0,Gender,Alcohol,Cirrhosis,Smoking,Diabetes,Obesity,Age,Nodule,Stage
count,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0,31.0
mean,0.806452,0.774194,0.903226,0.451613,0.419355,0.16129,-0.031839,-0.018871,-0.075419
std,0.40161,0.425024,0.300537,0.505879,0.50161,0.373878,0.216182,0.349377,0.310869
min,0.0,0.0,0.0,0.0,0.0,0.0,-0.612,-0.335,-0.648
25%,1.0,1.0,1.0,0.0,0.0,0.0,-0.0705,-0.335,-0.398
50%,1.0,1.0,1.0,0.0,0.0,0.0,0.032,-0.135,-0.148
75%,1.0,1.0,1.0,1.0,1.0,0.0,0.101,0.465,0.102
max,1.0,1.0,1.0,1.0,1.0,1.0,0.279,0.465,0.352


Training targets summary:


Unnamed: 0,Class
count,96.0
mean,0.59375
std,0.49371
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


Validation targets summary:


Unnamed: 0,Class
count,31.0
mean,0.645161
std,0.486373
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


ValueError: Passing a dictionary input to a Sequential Model which doesn't have FeatureLayer as the first layer is an error.