In [1]:
import cadprep as cpr
import nblog
from nblog import logger as lg
from nblog import NBLog as nbl

In [2]:
# Create the notebook-log helper through the module path instead of calling the
# `nbl` class alias and rebinding it (`nbl = nbl()`): after that rebinding the
# alias points at an instance, so re-running this cell would call the instance
# rather than the class. `nbl` is still bound to the instance afterwards.
nbl = nblog.NBLog()
# mark the start of a new run in the log, then tag it with the notebook's name
lg.info(nbl.newrun)
lg.info('telem-LR-ordinal-encoding-breast-data')

2021-11-08 10:04:14,451 - nblog - INFO - ------------------------------ NEW RUN ------------------------------
2021-11-08 10:04:14,454 - nblog - INFO - telem-LR-ordinal-encoding-breast-data


## Example of ordinal-encoding from
https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/

In [3]:
# evaluate logistic regression on the breast cancer dataset with an ordinal encoding
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn import metrics
import pickle

In [4]:
# remote CSV holding the breast cancer dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"
# the file is read with header=None, so columns are numbered rather than named
dataset = read_csv(url, header=None)
# pull the frame's contents out as a plain NumPy array
data = dataset.to_numpy()

# placeholder log entry for the clean/subset stage (no cadprep call is made here)
lg.info('cadprep run')

2021-11-08 10:04:14,622 - nblog - INFO - cadprep run


In [5]:
# the last column is the output; every column before it is an input.
# both are cast to str before splitting.
X, y = data[:, :-1].astype(str), data[:, -1].astype(str)
# hold out 33% of the rows for testing, with a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [6]:
# preview the training inputs while they are still string-valued
X_train

array([["'50-59'", "'ge40'", "'25-29'", ..., "'left'", "'right_low'",
        "'no'"],
       ["'30-39'", "'premeno'", "'5-9'", ..., "'left'", "'right_low'",
        "'no'"],
       ["'50-59'", "'premeno'", "'50-54'", ..., "'right'", "'left_up'",
        "'yes'"],
       ...,
       ["'60-69'", "'ge40'", "'10-14'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'60-69'", "'ge40'", "'40-44'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'60-69'", "'ge40'", "'45-49'", ..., "'left'", "'central'",
        "'no'"]], dtype='<U11')

In [7]:
# preview the training labels while they are still string-valued
y_train

array(["'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'no-recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'", "'recurrence-events'",
       "'recurrence-events'", "'no-recurrence-events'",
       "'recurrence-events'

In [8]:
# learn the category -> number mapping from the training rows only and encode
# them in the same call; the fitted mapping is then reused for the test rows
ordinal_encoder = OrdinalEncoder()
X_train = ordinal_encoder.fit_transform(X_train)
X_test = ordinal_encoder.transform(X_test)

In [9]:
# record the container type and dimensions of the training inputs in the run log
lg.info(
    f'used training data: {type(X_train).__name__} {X_train.shape}'
)

2021-11-08 10:04:14,713 - nblog - INFO - used training data: ndarray (191, 9)


In [10]:
# inspect the training inputs again at this point in the notebook
X_train

array([[ 3.,  0.,  4., ...,  0.,  3.,  0.],
       [ 1.,  2.,  9., ...,  0.,  3.,  0.],
       [ 3.,  2., 10., ...,  1.,  2.,  1.],
       ...,
       [ 4.,  0.,  1., ...,  1.,  1.,  0.],
       [ 4.,  0.,  7., ...,  1.,  1.,  0.],
       [ 4.,  0.,  8., ...,  0.,  0.,  0.]])

In [11]:
# map the class labels to integers: the mapping is learned from the training
# labels (and applied to them in the same call), then reused for the test labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [12]:
# inspect the training labels again at this point in the notebook
y_train

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [13]:
# build a logistic regression with default settings and train it in one step
# (fit() hands back the estimator it was called on)
model = LogisticRegression().fit(X_train, y_train)
# predicted classes for the held-out rows
yhat = model.predict(X_test)
# note in the run log which estimator was used
lg.info(f'{model} run')

2021-11-08 10:04:14,790 - nblog - INFO - LogisticRegression() run


In [None]:
# print scikit-learn's classification report for the test-set predictions
print(metrics.classification_report(y_test, yhat))

In [14]:
# share of test rows predicted correctly, logged as a percentage
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=yhat)
lg.info(f'accuracy: {100 * accuracy:.2f}')

2021-11-08 10:04:14,808 - nblog - INFO - accuracy: 75.79


In [15]:
# confusion matrix for the test set, flattened and unpacked as tn, fp, fn, tp
# (the four-way unpack requires exactly four cells, i.e. a 2x2 matrix)
conf_mat = metrics.confusion_matrix(y_test, yhat)
tn, fp, fn, tp = conf_mat.ravel()
# render a small text table: one row per condition, one column per prediction
print(
    '       | pred n | pred p',
    '-------------------------',
    f'cond n | tn {tn}  | fp {fp}',
    f'cond p | fn {fn}  | tp {tp}',
    sep='\n',
)

       | pred n | pred p
-------------------------
cond n | tn 61  | fp 1
cond p | fn 22  | tp 11


In [16]:
# precision (PPV): true positives over everything predicted positive
# recall (sensitivity): true positives over everything actually positive
# NOTE(review): neither ratio guards against a zero denominator — confirm that
# is acceptable for the data this notebook is run on
precision = tp / (fp + tp)
recall = tp / (fn + tp)

lg.info(f' precision: {precision:.2f}')
lg.info(f'    recall: {recall:.2f}')

2021-11-08 10:04:14,846 - nblog - INFO -  precision: 0.92
2021-11-08 10:04:14,848 - nblog - INFO -     recall: 0.33


In [17]:
# save the model to disk
filename = 'Telem-LR-ord-breast-model.sav'
# a context manager guarantees the file is flushed and closed, even if dump()
# raises; the bare open() call used previously left the handle unclosed
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE(review): only the classifier is written here; the fitted ordinal_encoder
# and label_encoder are not persisted in this cell — confirm they are saved
# elsewhere if raw data must be scored after a reload

In [18]:
# some time later...

# load the model from disk; the context manager closes the file handle as soon
# as the object has been read instead of leaving it open
# NOTE: pickle.load can execute arbitrary code — only load files you trust
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# score the reloaded model on the test set and print the result as a percentage
result = loaded_model.score(X_test, y_test)
print(f'{result*100:.2f}')

75.79
