In [287]:
import csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# DATA DESCRIPTION

In [288]:
# Load the feature dictionary (short column name + human-readable description).
# Column names are supplied explicitly, so the first line of the file is read as data.
data_description = pd.read_csv(
    'data_information.csv',
    names=['Name', 'Description'],
)
data_description

Unnamed: 0,Name,Description
0,age,age in years
1,sex,sex (1 = male; 0 = female)
2,cp,chest pain type; Value 1: typical angina Value...
3,trestbps,resting blood pressure (in mm Hg on admission ...
4,chol,serum cholestoral in mg/dl
5,fbs,fasting blood sugar > 120 mg/dl (1 = true; 0 =...
6,restecg,resting electrocardiographic results; Value 0:...
7,thalach,maximum heart rate achieved
8,exang,exercise induced angina (1 = yes; 0 = no)
9,oldpeak,ST depression induced by exercise relative to ...


# RAW DATA

In [289]:
# Read the numeric dataset and label its columns with the names listed in the
# data description table.
raw = np.loadtxt('heart.dat')
data = pd.DataFrame(raw, columns=data_description['Name'].values)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2.0
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1.0
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2.0
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1.0
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1.0


# CATEGORICAL DATA

In [290]:
# Read the categorical (token) version of the dataset. header=None makes pandas
# treat the first line as data; the column names come from the description table.
datacat = pd.read_csv('heartcat.dat', header=None)
datacat.columns = data_description['Name'].tolist()
datacat.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,a4,female,cp4,rb2,sc2,fbs0,hyper,mh2,ang0,op2,flat,nmv3,normal,presence
1,a4,male,cp3,rb0,sc2,fbs0,hyper,mh2,ang0,op1,flat,nmv0,reversible,absence
2,a3,female,cp2,rb1,sc2,fbs0,norm,mh2,ang0,op0,up,nmv0,reversible,presence
3,a3,female,cp4,rb1,sc2,fbs0,norm,mh1,ang1,op0,flat,nmv1,reversible,absence
4,a4,male,cp2,rb1,sc2,fbs0,hyper,mh2,ang1,op0,up,nmv1,normal,absence


# VOCAB

In [291]:
# Load the vocabulary: the comma-separated list of every categorical token.
# newline='' is what the csv module documents for files handed to csv.reader.
with open('heartvocab.csv', 'r', newline='') as csvFile:
    # csv.reader yields an empty list for a blank line; skip those so a
    # trailing newline cannot replace the vocabulary with [].
    vocab_rows = [row for row in csv.reader(csvFile) if row]

if not vocab_rows:
    # Fail loudly instead of leaving `vocab` undefined (or stale from an
    # earlier run of the kernel).
    raise ValueError("heartvocab.csv contains no vocabulary row")

# If the file holds several non-empty rows, the last one wins.
vocab = vocab_rows[-1]

vocab

['a0',
 'a1',
 'a2',
 'a3',
 'a4',
 'male',
 'female',
 'cp1',
 'cp2',
 'cp3',
 'cp4',
 'rb0',
 'rb1',
 'rb2',
 'rb3',
 'rb4',
 'sc0',
 'sc1',
 'sc2',
 'fbs0',
 'fbs1',
 'norm',
 'abnormal',
 'hyper',
 'mh0',
 'mh1',
 'mh2',
 'mh3',
 'mh4',
 'mh5',
 'ang0',
 'ang1',
 'op0',
 'op1',
 'op2',
 'op3',
 'op4',
 'op5',
 'op6',
 'up',
 'flat',
 'down',
 'nmv0',
 'nmv1',
 'nmv2',
 'nmv3',
 'normal',
 'fixed',
 'reversible']

# WORD INDEX

In [292]:
# word_index is the vocabulary lookup: it maps each token to its integer
# position in `vocab`. A repeated token keeps the position of its first
# occurrence. Built in a single pass (no per-token list search).
word_index = {}
for position, token in enumerate(vocab):
    word_index.setdefault(token, position)

word_index.items()

dict_items([('a0', 0), ('a1', 1), ('a2', 2), ('a3', 3), ('a4', 4), ('male', 5), ('female', 6), ('cp1', 7), ('cp2', 8), ('cp3', 9), ('cp4', 10), ('rb0', 11), ('rb1', 12), ('rb2', 13), ('rb3', 14), ('rb4', 15), ('sc0', 16), ('sc1', 17), ('sc2', 18), ('fbs0', 19), ('fbs1', 20), ('norm', 21), ('abnormal', 22), ('hyper', 23), ('mh0', 24), ('mh1', 25), ('mh2', 26), ('mh3', 27), ('mh4', 28), ('mh5', 29), ('ang0', 30), ('ang1', 31), ('op0', 32), ('op1', 33), ('op2', 34), ('op3', 35), ('op4', 36), ('op5', 37), ('op6', 38), ('up', 39), ('flat', 40), ('down', 41), ('nmv0', 42), ('nmv1', 43), ('nmv2', 44), ('nmv3', 45), ('normal', 46), ('fixed', 47), ('reversible', 48)])

# SPLIT DATA TO X (FEATURES) AND Y (TARGET)

In [293]:
# Features: the first 13 columns of the categorical frame, as a NumPy array of tokens.
X = datacat.iloc[:, :13].values
X

array([['a4', 'female', 'cp4', ..., 'flat', 'nmv3', 'normal'],
       ['a4', 'male', 'cp3', ..., 'flat', 'nmv0', 'reversible'],
       ['a3', 'female', 'cp2', ..., 'up', 'nmv0', 'reversible'],
       ...,
       ['a3', 'male', 'cp2', ..., 'flat', 'nmv0', 'normal'],
       ['a3', 'female', 'cp4', ..., 'flat', 'nmv0', 'fixed'],
       ['a4', 'female', 'cp4', ..., 'flat', 'nmv3', 'normal']],
      dtype=object)

In [294]:
# Target: the numeric label column of the raw frame.
# NOTE(review): X is built from heartcat.dat but y from heart.dat, so this relies
# on both files listing the records in the same order -- confirm.
y = data.loc[:, 'target'].values
y[:5]

array([2., 1., 2., 1., 1.])

# ENCODE DATA

In [295]:
# Integer-encode every record: replace each categorical token with its index in
# the vocabulary. (Despite the name, X_hot holds index sequences, not one-hot
# vectors.)
unknown_tokens = {token for row in X for token in row if token not in word_index}
if unknown_tokens:
    # An out-of-vocabulary token has no valid index; substituting a placeholder
    # such as '' would silently mix strings into the integer sequences, so stop
    # here and list the offending tokens instead.
    raise ValueError(
        "Tokens missing from vocab: %s" % sorted(map(str, unknown_tokens))
    )

X_hot = [[word_index[token] for token in row] for row in X]

X_hot[:5]

[[4, 6, 10, 13, 18, 19, 23, 26, 30, 34, 40, 45, 46],
 [4, 5, 9, 11, 18, 19, 23, 26, 30, 33, 40, 42, 48],
 [3, 6, 8, 12, 18, 19, 21, 26, 30, 32, 39, 42, 48],
 [3, 6, 10, 12, 18, 19, 21, 25, 31, 32, 40, 43, 48],
 [4, 5, 8, 12, 18, 19, 23, 26, 31, 32, 39, 43, 46]]

In [296]:
# Map the class labels to consecutive integers starting at 0 (the previews show
# 1.0 -> 0 and 2.0 -> 1). Despite the name, y_hot is label-encoded, not one-hot.
label_encoder = LabelEncoder().fit(y)
y_hot = label_encoder.transform(y)
y_hot[:5]

array([1, 0, 1, 0, 0], dtype=int64)

# RNN

In [297]:
from keras.models import Sequential
from keras.layers import Dense, GRU, Flatten, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [298]:
# Disabled: manual, unshuffled 80/20 train/test split (superseded by the train_test_split call in the next cell)
# split_1 = int(0.8 * len(datacat))
# train_x = X_hot[:split_1]
# test_x = X_hot[split_1:]

# train_y = y_hot[:split_1]
# test_y = y_hot[split_1:]

In [299]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for testing; the fixed random_state makes the
# split repeatable from run to run.
split_parts = train_test_split(X_hot, y_hot, test_size=0.2, random_state=19)
X_train, X_test, y_train, y_test = split_parts

In [300]:
# Invert the vocabulary lookup so integer indices can be turned back into tokens.
reverse_word_index = {index: token for token, index in word_index.items()}
print(reverse_word_index)

{0: 'a0', 1: 'a1', 2: 'a2', 3: 'a3', 4: 'a4', 5: 'male', 6: 'female', 7: 'cp1', 8: 'cp2', 9: 'cp3', 10: 'cp4', 11: 'rb0', 12: 'rb1', 13: 'rb2', 14: 'rb3', 15: 'rb4', 16: 'sc0', 17: 'sc1', 18: 'sc2', 19: 'fbs0', 20: 'fbs1', 21: 'norm', 22: 'abnormal', 23: 'hyper', 24: 'mh0', 25: 'mh1', 26: 'mh2', 27: 'mh3', 28: 'mh4', 29: 'mh5', 30: 'ang0', 31: 'ang1', 32: 'op0', 33: 'op1', 34: 'op2', 35: 'op3', 36: 'op4', 37: 'op5', 38: 'op6', 39: 'up', 40: 'flat', 41: 'down', 42: 'nmv0', 43: 'nmv1', 44: 'nmv2', 45: 'nmv3', 46: 'normal', 47: 'fixed', 48: 'reversible'}


In [301]:
# Sanity check: show the first two encoded training rows next to their decoded
# tokens. The loop variable is deliberately not called `y`: that name holds the
# target array, and rebinding it here would break any later re-run of the cell
# that label-encodes `y`.
for encoded_row in X_train[:2]:
    print(encoded_row)
    decoded_row = ' '.join(reverse_word_index.get(index, '') for index in encoded_row)
    print(decoded_row)

[3, 5, 9, 14, 16, 19, 23, 26, 31, 33, 41, 42, 46]
a3 male cp3 rb3 sc0 fbs0 hyper mh2 ang1 op1 down nmv0 normal
[3, 6, 10, 13, 18, 19, 21, 26, 31, 33, 40, 43, 48]
a3 female cp4 rb2 sc2 fbs0 norm mh2 ang1 op1 flat nmv1 reversible


In [302]:
# Inspect one encoded training example: a list of vocabulary indices, one per feature column.
X_train[0]

[3, 5, 9, 14, 16, 19, 23, 26, 31, 33, 41, 42, 46]

In [303]:
# Convert the lists of encoded rows to NumPy arrays before handing them to the model.
X_train, X_test = np.array(X_train), np.array(X_test)

In [304]:
# Build the recurrent classifier:
#   Embedding - learns a 10-dimensional vector for each vocabulary index
#   GRU       - reads the sequence of embedded feature tokens (10 units)
#   Dense     - single sigmoid unit producing the probability of class 1
model = Sequential()
model.add(Embedding(len(vocab), 10, input_length=(len(X_train[0]))))
model.add(GRU(10))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that only appends a stray "None" line to the output).
model.summary()

# Train. The History object is kept so per-epoch loss/accuracy can be inspected
# afterwards instead of being echoed as an opaque repr.
history = model.fit(X_train, y_train, epochs=100, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 13, 10)            490       
_________________________________________________________________
gru_15 (GRU)                 (None, 10)                630       
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 11        
Total params: 1,131
Trainable params: 1,131
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch

Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1cbc5c67518>

In [305]:
# Evaluate on the held-out split. The model was compiled with a single metric,
# so evaluate() returns [loss, accuracy]; index 1 is the accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_percent)

Accuracy: 92.59%


In [306]:
from sklearn.metrics import accuracy_score, log_loss

# Predicted probability of the positive class for each test record. The final
# layer is a single sigmoid unit, so predict() already returns probabilities;
# Sequential.predict_proba() was only a wrapper around predict() and has been
# removed from later Keras releases.
y_pred = model.predict(X_test)
log_loss(y_test, y_pred)

0.25180029382722247