# Serverless Example
## Sklearn [CLASSIFICATION]

## Setup

In [0]:
import os
import time
import logging
import numpy as np
import pandas as pd
from joblib import dump, load
from collections import OrderedDict

from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder

### Load database

In [2]:
iris = datasets.load_iris()

# Build a "real" dataset as a Pandas dataframe
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['flower type'] = pd.Series(iris.target).apply(lambda x: iris.target_names[x])

# Fix variables name
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

df.head(5)

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,flower_type
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Database split
Split the data in training and "online" testing chunks.

In [3]:
# In the "Training" section we pretend to don't know the existance of the test dataframe
train, test = train_test_split(df, test_size=0.2)

print('Training examples : {:5d}'.format(len(train)))
print('Testing examples  : {:5d}'.format(len(test)))

Training examples :   120
Testing examples  :    30


### Define variables roles

In [4]:
# Target variables
target_cols = ['flower_type']
train.flower_type = train.flower_type.apply(lambda x: x if x == 'setosa' else 'other')
target_classes = [train[col].unique().tolist() for col in target_cols]


# Features
numeric_columns = ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm', 'petal_width_cm']
categorical_columns = []

print(f'Feature variables : {list(train.columns.difference(target_cols))}\nTarget variables  : {target_cols}')

Feature variables : ['petal_length_cm', 'petal_width_cm', 'sepal_length_cm', 'sepal_width_cm']
Target variables  : ['flower_type']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


## Training Process

### Dataset Definition

In [0]:
X_train = train.drop(target_cols, axis=1).values
y_train = np.ravel(train[target_cols].values)

### Training Parameters&Environment

In [6]:
# Number of folds for cross-validation
CV_FOLDS = 3

LOG_DIR = './logs/model_{}'.format(time.strftime("%Y-%m-%d__%H.%M.%S", time.localtime()))
os.makedirs(LOG_DIR, exist_ok=True)
print('Model Directory : {}'.format(LOG_DIR))

Model Directory : ./logs/model_2019-07-05__20.21.54


### Training Pipeline

In [0]:
pipe = Pipeline([
    # Scale numeric features
    ('scaler', StandardScaler()),
    
    # Feature creation based on interactions beetween original features
    ('interactions', PolynomialFeatures()),
    
    # Classification model
    ('classifier', SGDClassifier())
])

#print(pipe.steps[1][1].explained_variance_)

### Hyperparameters Tuning
Cross-Validation is proposted here.
Eventually (if CV is too computational onerous):

```
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
hyperparameters = [{'interactions__degree': x, 'classifier__penalty': y}
                    for x in [0.05, 0.01, 0.1, 0.2]
                    for y in np.logspace(-3, 2, 6)]
scores = [pipe.set_params(**params)
              .fit(X_train, y_train)
              .score(X_val, y_val)
          for params in hyperparameters]
```



In [8]:
# Define some possible values for the most important hyperparameters
hyperparameters = {'interactions__degree': np.arange(3),
                   'classifier__penalty': ['none', 'l1', 'l2', 'elasticnet'],
                   'classifier__alpha': 10.0**np.arange(-5, +1),
                   'classifier__max_iter': 10.0**np.arange(3, 7)}

# Define the searching method for hyperparameters
gridsearch = GridSearchCV(pipe, hyperparameters, cv=CV_FOLDS, iid=True, verbose=1)


'''
Eventually (if CV is too computational onerous):

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
hyperparameters = [{'interactions__alpha': alpha, 'classifier__C': C}
                    for alpha in [0.05, 0.01, 0.1, 0.2]
                    for C in np.logspace(-3, 2, 6)]
scores = [pipe.set_params(**params)
              .fit(X_train, y_train)
              .score(X_val, y_val)
          for params in hyperparameters]
'''

"\nEventually (if CV is too computational onerous):\n\nX_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)\nhyperparameters = [{'interactions__alpha': alpha, 'classifier__C': C}\n                    for alpha in [0.05, 0.01, 0.1, 0.2]\n                    for C in np.logspace(-3, 2, 6)]\nscores = [pipe.set_params(**params)\n              .fit(X_train, y_train)\n              .score(X_val, y_val)\n          for params in hyperparameters]\n"

### Training

In [9]:
%%time
# Training
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: user 2.58 s, sys: 8.64 ms, total: 2.59 s
Wall time: 2.6 s


[Parallel(n_jobs=1)]: Done 864 out of 864 | elapsed:    2.6s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('interactions',
                                        PolynomialFeatures(degree=2,
                                                           include_bias=True,
                                                           interaction_only=False,
                                                           order='C')),
                                       ('classifier',
                                        SGDClassifier(alpha=0.0001,
                                                      average=False,
                                                      class_weight=None,
                      

In [10]:
print('Best training score: ', gridsearch.score(X_train, y_train))
print('Best hyperparameters combination', gridsearch.best_params_)

best_pipe = gridsearch.best_estimator_
model_path = '{}/{}.joblib'.format(LOG_DIR, '_'.join(best_pipe.named_steps.keys()))
dump(best_pipe, model_path);

Best training score:  1.0
Best hyperparameters combination {'classifier__alpha': 1e-05, 'classifier__max_iter': 1000.0, 'classifier__penalty': 'none', 'interactions__degree': 1}


## "Online" Testing Process 

### Blank paper

In [11]:
# Lets put us in blank paper condition
del best_pipe
print('Model Path : ', model_path)

Model Path :  ./logs/model_2019-07-05__20.21.54/scaler_interactions_classifier.joblib


### Prediction
We convert the test dataframe to a dictionary to simulate the information arriving from a serverless endpoint once the model is deployed.

In [0]:
data = test.drop(target_cols, axis=1).to_dict(orient='records', into=OrderedDict)

In [13]:
logger = logging.getLogger('iris_regr')


## Prediction 
def handle(event, **kwargs):
    # If data is received as json convert to pandas
    event = event['data'] if 'data' in event else event
    if not isinstance(event, pd.DataFrame):
        event = pd.DataFrame.from_dict(event, orient='columns')

    # Retrieve model from disk and use it for predictions
    model = load(model_path)
    return model.predict(event).tolist()

## Testing and liveness check
def test(data, **kwargs):
    pred = handle(data)

    logger.warning(f"predicted: {pred}")
    
    return True


test(data)

predicted: ['setosa', 'other', 'setosa', 'setosa', 'setosa', 'other', 'other', 'other', 'other', 'setosa', 'setosa', 'other', 'other', 'other', 'setosa', 'setosa', 'other', 'other', 'setosa', 'setosa', 'other', 'other', 'setosa', 'setosa', 'other', 'setosa', 'other', 'other', 'other', 'other']


True