In [1]:

import math
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV,cross_val_score, cross_val_predict, StratifiedKFold 
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, classification_report
from tabpy.tabpy_tools.client import Client


In [2]:
# Breast Cancer dataset
# Citation: Dr. William H. Wolberg, University of Wisconsin Hospitals, Madison 
# https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)

# Read the dataset (Note that the CSV provided for this demo has rows with the missing data removed)
df =  pd.read_csv('./breastcancer.csv', header=0)

# Take a look at the structure of the file
df.head(n=10)

Unnamed: 0,Id,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,benign
1,1002945,5,4,4,5,7,10,3,2,1,benign
2,1015425,3,1,1,1,2,2,3,1,1,benign
3,1016277,6,8,8,1,3,4,3,7,1,benign
4,1017023,4,1,1,3,2,1,3,1,1,benign
5,1017122,8,10,10,8,7,10,9,7,1,malignant
6,1018099,1,1,1,1,2,10,3,1,1,benign
7,1018561,2,1,2,1,2,1,3,1,1,benign
8,1033078,2,1,1,1,2,1,1,1,5,benign
9,1033078,4,2,1,1,2,1,2,1,1,benign


In [3]:
# Drop Id column not used in analysis
df.drop(['Id'], 1, inplace=True)

# Use LabelEncoder to convert textual classifications to numeric. 
# We will use the same encoder later to convert them back.
encoder = preprocessing.LabelEncoder()
df['Class'] = encoder.fit_transform(df['Class'])

# You could also do this manually in the following way:
# df['Class'] = df['Class'].map( {'benign': 0, 'malignant': 1} ).astype(int)

# Check the result of the transform
df.head(n=6)

Unnamed: 0,Cl.thickness,Cell.size,Cell.shape,Marg.adhesion,Epith.c.size,Bare.nuclei,Bl.cromatin,Normal.nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
5,8,10,10,8,7,10,9,7,1,1


In [4]:
# Split columns into independent/predictor variables vs dependent/response/outcome variable
X = np.array(df.drop(['Class'], 1))
y = np.array(df['Class'])

# Scale the data. We will use the same scaler later for scoring function
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

# 10 fold stratified cross validation
kf = StratifiedKFold(n_splits=3 , random_state=None, shuffle=True)

# Define the parameter grid to use for tuning the Support Vector Machine
parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# Pick the goal you're optimizing for e.g. precision if you prefer fewer false-positives
# recall if you prefer fewer false-negatives. For demonstration purposes let's pick several
# Note that the final model selection will be based on the last item in the list
scoringmethods = ['f1','accuracy','precision', 'recall','roc_auc']

In [5]:
# Iterate through different metrics looking for best parameter set
for score in scoringmethods:
    print("~~~ Hyper-parameter tuning for best %s ~~~" % score)
    
    # Setup for grid search with cross-validation for Support Vector Machine
    # n_jobs=-1 for parallel execution using all available cores
    svmclf = GridSearchCV(svm.SVC(C=1), parameters, cv=kf, scoring=score,n_jobs=-1)
    svmclf.fit(X, y)
    
   
    
# Show classification report for the best model (set of parameters) run over the full dataset
print("Classification report:")
y_pred = svmclf.predict(X)
print(classification_report(y, y_pred))
    
# Show the definition of the best model
print("Best model:")
print(svmclf.best_estimator_)
    
# Show accuracy and area under ROC curve
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y, y_pred))
print("")

~~~ Hyper-parameter tuning for best f1 ~~~
~~~ Hyper-parameter tuning for best accuracy ~~~
~~~ Hyper-parameter tuning for best precision ~~~
~~~ Hyper-parameter tuning for best recall ~~~
~~~ Hyper-parameter tuning for best roc_auc ~~~
Classification report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       444
           1       0.95      0.97      0.96       239

    accuracy                           0.97       683
   macro avg       0.97      0.97      0.97       683
weighted avg       0.97      0.97      0.97       683

Best model:
SVC(C=100, kernel='linear')
Accuracy: 0.974
Aucroc: 0.973



In [6]:
# Logistic regression with 10 fold stratified cross-validation using model specific cross-validation in scikit-learn
lgclf = LogisticRegressionCV(Cs=list(np.power(10.0, np.arange(-10, 10))),penalty='l2',scoring='roc_auc',cv=kf)
lgclf.fit(X, y)
y_pred = lgclf.predict(X)

# Show classification report for the best model (set of parameters) run over the full dataset
print("Classification report:")
print(classification_report(y, y_pred))

# Show accuracy and area under ROC curve
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y, y_pred))

Classification report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       444
           1       0.96      0.96      0.96       239

    accuracy                           0.97       683
   macro avg       0.97      0.97      0.97       683
weighted avg       0.97      0.97      0.97       683

Accuracy: 0.971
Aucroc: 0.968


In [7]:
# Naive Bayes with 10 fold stratified cross-validation
nbclf = GaussianNB()
scores = cross_val_score(nbclf, X, y, cv=kf, scoring='roc_auc')

# Show accuracy statistics for cross-validation
print("Accuracy: %0.3f" % (scores.mean()))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y, cross_val_predict(nbclf, X, y, cv=kf)))

Accuracy: 0.984
Aucroc: 0.968


In [8]:
# Define the parameter grid to use for tuning the Gradient Boosting Classifier
gridparams = dict(learning_rate=[0.01, 0.1],loss=['deviance','exponential'])

# Parameters we're not tuning for this classifier
params = {'n_estimators': 1500, 'max_depth': 4}

# Setup for grid search with cross-validation for Gradient Boosting Classifier
# n_jobs=-1 for parallel execution using all available cores
gbclf = GridSearchCV(ensemble.GradientBoostingClassifier(**params), gridparams, cv=kf, scoring='roc_auc',n_jobs=-1)
gbclf.fit(X,y)

# Show the definition of the best model
print("Best model:")
print(gbclf.best_estimator_)
print("")

# Show classification report for the best model (set of parameters) run over the full dataset
print("Classification report:")    
y_pred = gbclf.predict(X)
print(classification_report(y, y_pred))

# Show accuracy and area under ROC curve
print("Accuracy: %0.3f" % accuracy_score(y, y_pred, normalize=True))
print("Aucroc: %0.3f" % metrics.roc_auc_score(y, y_pred))

Best model:
GradientBoostingClassifier(loss='exponential', max_depth=4, n_estimators=1500)

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       444
           1       1.00      1.00      1.00       239

    accuracy                           1.00       683
   macro avg       1.00      1.00      1.00       683
weighted avg       1.00      1.00      1.00       683

Accuracy: 1.000
Aucroc: 1.000


<b>Tabpy tools use for deploy model</b>

In [8]:
!env

KeyboardInterrupt: 

# run tabpy in another jupyter notebook using !tabpy and don't close it

In [8]:
!ps -ef | grep tabpy


KeyboardInterrupt: 

In [10]:
help(Client)

Help on class Client in module tabpy.tabpy_tools.client:

class Client(builtins.object)
 |  Client(endpoint, query_timeout=1000)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, endpoint, query_timeout=1000)
 |      Connects to a running server.
 |      
 |      The class constructor takes a server address which is then used to
 |      connect for all subsequent member APIs.
 |      
 |      Parameters
 |      ----------
 |      endpoint : str, optional
 |          The server URL.
 |      
 |      query_timeout : float, optional
 |          The timeout for query operations.
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  deploy(self, name, obj, description='', schema=None, override=False)
 |      Deploys a Python function as an endpoint in the server.
 |      
 |      Parameters
 |      ----------
 |      name : str
 |          A unique identifier for the endpoint.
 |      
 |      obj :  function
 |          Refers to a user-defined function with any signature. Howev

In [11]:
# Connect to TabPy server using the client library
connection = Client('http://127.0.0.1:9004/')


In [12]:
connection.get_status()

{'PCA': {'version': 1,
  'type': 'model',
  'status': 'LoadSuccessful',
  'last_error': None},
 'anova': {'version': 1,
  'type': 'model',
  'status': 'LoadSuccessful',
  'last_error': None},
 'ttest': {'version': 1,
  'type': 'model',
  'status': 'LoadSuccessful',
  'last_error': None},
 'Sentiment Analysis': {'version': 1,
  'type': 'model',
  'status': 'LoadSuccessful',
  'last_error': None},
 'add': {'version': 1,
  'type': 'model',
  'status': 'LoadSuccessful',
  'last_error': None},
 'DiagnosticsDemo': {'version': 2,
  'type': 'model',
  'status': 'LoadSuccessful',
  'last_error': None}}

In [15]:
connection.get_endpoints()

{'PCA': {'schema': None, 'version': 1, 'type': 'model', 'dependencies': [], 'name': 'PCA', 'last_modified_time': datetime.datetime(2021, 1, 5, 10, 36, 22), 'description': 'Returns the specified principal component', 'creation_time': datetime.datetime(2021, 1, 5, 10, 36, 22)},
 'anova': {'schema': None, 'version': 1, 'type': 'model', 'dependencies': [], 'name': 'anova', 'last_modified_time': datetime.datetime(2021, 1, 5, 10, 36, 24), 'description': 'Returns the p-value form an ANOVA test', 'creation_time': datetime.datetime(2021, 1, 5, 10, 36, 24)},
 'ttest': {'schema': None, 'version': 1, 'type': 'model', 'dependencies': [], 'name': 'ttest', 'last_modified_time': datetime.datetime(2021, 1, 5, 10, 36, 25), 'description': 'Returns the p-value form a t-test', 'creation_time': datetime.datetime(2021, 1, 5, 10, 36, 25)},
 'Sentiment Analysis': {'schema': None, 'version': 1, 'type': 'model', 'dependencies': [], 'name': 'Sentiment Analysis', 'last_modified_time': datetime.datetime(2021, 1, 5,

In [14]:
def add(x,y):
    import numpy as np
    return np.add(x, y).tolist()

connection.deploy('add2', add, 'Adds two numbers x and y')

In [36]:
# The scoring function that will use the Gradient Boosting Classifier to classify new data points
def SuggestDiagnosis(Cl_thickness, Cell_size, Cell_shape, Marg_adhesion, Epith_c_size, 
                     Bare_nuclei, Bl_cromatin, Normal_nucleoli, Mitoses):
    X = np.column_stack([Cl_thickness, Cell_size, Cell_shape, Marg_adhesion, Epith_c_size, 
                         Bare_nuclei, Bl_cromatin, Normal_nucleoli, Mitoses])
    X = scaler.transform(X)
    return encoder.inverse_transform(gbclf.predict(X)).tolist()
    

In [43]:
# Publish the SuggestDiagnosis function to TabPy server so it can be used from Tableau
# Using the name DiagnosticsDemo and a short description of what it does
connection.deploy(name='DiagnosticsDemo',
                  obj=SuggestDiagnosis,
                  description='Returns diagnosis suggestion based on ensemble model trained using Wisconsin Breast Cancer dataset',
                  override=True)

### kubectl scale --replicas=0 -f tabpy-deployment.yaml -n tabpy
### kubectl scale --replicas=2 -f tabpy-deployment.yaml -n tabpy