In [1]:
import cadprep as cpr
import nblog
from nblog import logger as lg
from nblog import NBLog as nbl

In [2]:
nbl = nbl()
lg.info(nbl.newrun)
lg.info('telem-DT-one-hot-encoding-breast-data')

2021-11-08 10:05:30,216 - nblog - INFO - ------------------------------ NEW RUN ------------------------------
2021-11-08 10:05:30,219 - nblog - INFO - telem-DT-one-hot-encoding-breast-data


## Example of CART with encoding, adapted from
https://machinelearningmastery.com/a-gentle-introduction-to-scikit-learn-a-python-machine-learning-library/

In [3]:
# evaluate logistic regression on the breast cancer dataset with one-hot encoding
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import confusion_matrix
import pickle

In [4]:
# define the location of the dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"
# load the dataset
dataset = read_csv(url, header=None)
# retrieve the array of data
data = dataset.values

# deploy any clean and subset methods
lg.info(f'cadprep run')

2021-11-08 10:05:30,387 - nblog - INFO - cadprep run


In [5]:
# separate into input and output columns
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

In [6]:
lg.info(f'raw training data: {type(X_train).__name__} {X_train.shape}')

2021-11-08 10:05:30,422 - nblog - INFO - raw training data: ndarray (191, 9)


In [7]:
X_train

array([["'50-59'", "'ge40'", "'25-29'", ..., "'left'", "'right_low'",
        "'no'"],
       ["'30-39'", "'premeno'", "'5-9'", ..., "'left'", "'right_low'",
        "'no'"],
       ["'50-59'", "'premeno'", "'50-54'", ..., "'right'", "'left_up'",
        "'yes'"],
       ...,
       ["'60-69'", "'ge40'", "'10-14'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'60-69'", "'ge40'", "'40-44'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'60-69'", "'ge40'", "'45-49'", ..., "'left'", "'central'",
        "'no'"]], dtype='<U11')

In [8]:
y_train

array(["'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'

In [9]:
# one-hot encode input variables
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(X_train)
X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)
lg.info(f'onehot encoding')

2021-11-08 10:05:30,509 - nblog - INFO - onehot encoding


In [10]:
lg.info(f'encoded training data: {type(X_train).__name__} {X_train.shape}')

2021-11-08 10:05:30,528 - nblog - INFO - encoded training data: ndarray (191, 43)


In [11]:
X_train

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [12]:
# ordinal encode target variable
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [13]:
y_train

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [14]:
# define the model
model = DecisionTreeClassifier()
# fit on the training set
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [15]:
# predict on test set
yhat = model.predict(X_test)
lg.info(f'{model} run')

2021-11-08 10:05:30,619 - nblog - INFO - DecisionTreeClassifier() run


In [16]:
print(metrics.classification_report(y_test, yhat))

              precision    recall  f1-score   support

           0       0.70      0.85      0.77        62
           1       0.53      0.30      0.38        33

    accuracy                           0.66        95
   macro avg       0.61      0.58      0.58        95
weighted avg       0.64      0.66      0.63        95



In [17]:
# evaluate predictions
accuracy = metrics.accuracy_score(y_test, yhat)
lg.info(f'accuracy: {accuracy*100:.2f}')

2021-11-08 10:05:30,666 - nblog - INFO - accuracy: 66.32


In [18]:
conf_mat = metrics.confusion_matrix(y_test, yhat)
(tn, fp, fn, tp) = conf_mat.ravel()
print('       | pred n',  '| pred p')
print('-------------------------')
print('cond n | tn', tn, ' | fp', fp)
print('cond p | fn', fn, ' | tp', tp)

       | pred n | pred p
-------------------------
cond n | tn 53  | fp 9
cond p | fn 23  | tp 10


In [19]:
precision = tp/(tp+fp) # PPV
recall    = tp/(tp+fn) # sensitivity

lg.info(f' precision: {precision:.2f}')
lg.info(f'    recall: {recall:.2f}')

2021-11-08 10:05:30,701 - nblog - INFO -  precision: 0.53
2021-11-08 10:05:30,702 - nblog - INFO -     recall: 0.30


In [20]:
# save the model to disk
filename = 'Telem-DT-oneht-breast-model.sav'
pickle.dump(model, open(filename, 'wb'))

In [21]:
# some time later...
 
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(f'{result*100:.2f}')

66.32
