In [1]:
import cadprep as cpr
import nblog
from nblog import logger as lg
from nblog import NBLog as nblc
import ipynbname

In [2]:
nb_fname = ipynbname.name()

In [3]:
nbl = nblc()
lg.info(nbl.newrun)
lg.info(f'nb name: {nb_fname}')

2021-11-08 14:57:11,873 - nblog - INFO - ------------------------- NEW RUN -------------------------
2021-11-08 14:57:11,878 - nblog - INFO - nb name: telem-NN-one-hot-encoding-breast-data


## Example of one-hot-encoding from
https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/

In [4]:
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pickle

2021-11-08 14:57:12.551702: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-08 14:57:12.551744: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
# define the location of the dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"
# load the dataset
dataset = read_csv(url, header=None)
# retrieve the array of data
data = dataset.values

# deploy any clean and subset methods
lg.info(f'cadprep run')

2021-11-08 14:57:16,792 - nblog - INFO - cadprep run


In [6]:
# separate into input and output columns
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [7]:
lg.info(f'raw training data: {type(X_train).__name__} {X_train.shape}')

2021-11-08 14:57:16,819 - nblog - INFO - raw training data: ndarray (191, 9)


In [8]:
X_train

array([["'50-59'", "'ge40'", "'25-29'", ..., "'left'", "'right_low'",
        "'no'"],
       ["'30-39'", "'premeno'", "'5-9'", ..., "'left'", "'right_low'",
        "'no'"],
       ["'50-59'", "'premeno'", "'50-54'", ..., "'right'", "'left_up'",
        "'yes'"],
       ...,
       ["'60-69'", "'ge40'", "'10-14'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'60-69'", "'ge40'", "'40-44'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'60-69'", "'ge40'", "'45-49'", ..., "'left'", "'central'",
        "'no'"]], dtype='<U11')

In [9]:
y_train

array(["'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'

In [10]:
# one-hot encode input variables
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(X_train)
X_train_enc = onehot_encoder.transform(X_train)
X_test_enc = onehot_encoder.transform(X_test)
lg.info(f'onehot encoding')

2021-11-08 14:57:16,902 - nblog - INFO - onehot encoding


In [11]:
lg.info(f'encoded training data: {type(X_train_enc).__name__} {X_train_enc.shape}')

2021-11-08 14:57:16,908 - nblog - INFO - encoded training data: ndarray (191, 43)


In [12]:
X_train_enc

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [13]:
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_enc = label_encoder.transform(y_train)
y_test_enc = label_encoder.transform(y_test)

In [14]:
y_train_enc

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [15]:
# define the model
model = keras.Sequential()
model.add(layers.Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(layers.Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

2021-11-08 14:57:17.016461: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-08 14:57:17.017326: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-08 14:57:17.017633: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (7A300lap8Y9PR73): /proc/driver/nvidia/version does not exist
2021-11-08 14:57:17.020600: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)

In [16]:
model.summary

Epoch 1/100
12/12 - 1s - loss: 0.6251 - accuracy: 0.7068 - 1s/epoch - 108ms/step
Epoch 2/100
12/12 - 0s - loss: 0.6039 - accuracy: 0.7120 - 21ms/epoch - 2ms/step
Epoch 3/100
12/12 - 0s - loss: 0.5874 - accuracy: 0.7120 - 19ms/epoch - 2ms/step
Epoch 4/100
12/12 - 0s - loss: 0.5761 - accuracy: 0.7120 - 16ms/epoch - 1ms/step
Epoch 5/100
12/12 - 0s - loss: 0.5692 - accuracy: 0.7120 - 23ms/epoch - 2ms/step
Epoch 6/100
12/12 - 0s - loss: 0.5593 - accuracy: 0.7120 - 26ms/epoch - 2ms/step
Epoch 7/100
12/12 - 0s - loss: 0.5529 - accuracy: 0.7120 - 18ms/epoch - 1ms/step
Epoch 8/100
12/12 - 0s - loss: 0.5469 - accuracy: 0.7173 - 23ms/epoch - 2ms/step
Epoch 9/100
12/12 - 0s - loss: 0.5410 - accuracy: 0.7277 - 17ms/epoch - 1ms/step
Epoch 10/100
12/12 - 0s - loss: 0.5360 - accuracy: 0.7277 - 26ms/epoch - 2ms/step
Epoch 11/100
12/12 - 0s - loss: 0.5312 - accuracy: 0.7330 - 27ms/epoch - 2ms/step
Epoch 12/100
12/12 - 0s - loss: 0.5257 - accuracy: 0.7539 - 30ms/epoch - 3ms/step
Epoch 13/100
12/12 - 0s -

<bound method Model.summary of <keras.engine.sequential.Sequential object at 0x7fd13ad07100>>

In [17]:
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

# define the model
# model = LogisticRegression()
# # fit on the training set
# model.fit(X_train, y_train)
# predict on test set
lg.info(f'{model.name} run')

2021-11-08 14:57:21,875 - nblog - INFO - sequential run


Accuracy: 68.42


In [25]:
y_hat = model.predict(X_test_enc)
print(y_hat[:5])
print(y_test_enc[:5])

[[0.77409935]
 [0.37733   ]
 [0.22341153]
 [0.27094275]
 [0.07139531]]
[1 0 0 1 1]


In [23]:
# yhat = model.predict(X_test_enc)
# # print(metrics.classification_report(y_test_enc, yhat))
y_test_enc
# yhat

array([1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0])

In [19]:
# # evaluate predictions
# accuracy = metrics.accuracy_score(y_test_enc, yhat)
lg.info(f'accuracy: {accuracy*100:.2f}')

2021-11-08 14:57:21,900 - nblog - INFO - accuracy: 68.42


In [20]:
plot_loss(model.history.history['loss'], model.history.history['val_loss'])
plot_accuracy(model.history.history['accuracy'], model.history.history['val_accuracy'])


NameError: name 'plot_loss' is not defined

In [None]:
# conf_mat = metrics.confusion_matrix(y_test, yhat)
# (tn, fp, fn, tp) = conf_mat.ravel()
# print('       | pred n',  '| pred p')
# print('-------------------------')
# print('cond n | tn', tn, ' | fp', fp)
# print('cond p | fn', fn, ' | tp', tp)

In [None]:
# precision = tp/(tp+fp) # PPV
# recall    = tp/(tp+fn) # sensitivity

# lg.info(f' precision: {precision:.2f}')
# lg.info(f'    recall: {recall:.2f}')

In [None]:
# save the model to disk
pfilename = f'{nb_fname}.sav'
pickle.dump(model, open(pfilename, 'wb'))

In [None]:
print(pfilename)

In [None]:
# # some time later...
 
# # load the model from disk
# loaded_model = pickle.load(open(pfilename, 'rb'))
# result = loaded_model.score(X_test, y_test)
# print(f'{result*100:.2f}')