## Objective: To pickle a sklearn model after ML run.

In [59]:
import numpy as np
import pandas as pd
#
from sklearn.model_selection import train_test_split

# Feature Scaling
from sklearn.preprocessing import StandardScaler
## model
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#Model perfromance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

In [3]:
df = pd.read_csv("data/winequality-red.csv")

In [4]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


### Data exploration: Its a clean data

Target col is `quality`.

In [6]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [7]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


### Modeling

In [9]:
feature_array = df.values
X = feature_array[:, 0:11]
Y = feature_array[:, 11]

In [14]:
val_size = 0.2
seed = 42
X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y, 
                                                                test_size = 0.2,
                                                               random_state = 42)

In [17]:
#list(X_validation[0:2])
X_validation[0:2]

array([[7.700e+00, 5.600e-01, 8.000e-02, 2.500e+00, 1.140e-01, 1.400e+01,
        4.600e+01, 9.971e-01, 3.240e+00, 6.600e-01, 9.600e+00],
       [7.800e+00, 5.000e-01, 1.700e-01, 1.600e+00, 8.200e-02, 2.100e+01,
        1.020e+02, 9.960e-01, 3.390e+00, 4.800e-01, 9.500e+00]])

### Tune scaled GBM

In [75]:
# Test and evaluate metrcs usng Root Mean Square error
num_folds = 10
seed = 42
RMS = "neg_mean_squared_error"
ACC = "accuracy"
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)

In [34]:
X_train


array([[ 8.7 ,  0.69,  0.31, ...,  3.48,  0.74, 11.6 ],
       [ 6.1 ,  0.21,  0.4 , ...,  3.25,  0.59, 11.9 ],
       [10.9 ,  0.39,  0.47, ...,  3.3 ,  0.75,  9.8 ],
       ...,
       [ 7.2 ,  0.62,  0.06, ...,  3.51,  0.54,  9.5 ],
       [ 7.9 ,  0.2 ,  0.35, ...,  3.32,  0.8 , 11.9 ],
       [ 5.8 ,  0.29,  0.26, ...,  3.39,  0.54, 13.5 ]])

In [35]:
rescaledX

array([[ 0.21833164,  0.88971201,  0.19209222, ...,  1.09349989,
         0.45822284,  1.12317723],
       [-1.29016623, -1.78878251,  0.65275338, ..., -0.40043872,
        -0.40119696,  1.40827174],
       [ 1.49475291, -0.78434707,  1.01104539, ..., -0.07566946,
         0.51551749, -0.58738978],
       ...,
       [-0.65195559,  0.49909822, -1.08752211, ...,  1.28836145,
        -0.68767023, -0.87248428],
       [-0.24582155, -1.84458448,  0.39683051, ...,  0.05423824,
         0.80199076,  1.40827174],
       [-1.46422367, -1.34236676, -0.06383064, ...,  0.50891521,
        -0.68767023,  2.92877575]])

In [76]:
param_grid = dict(n_estimators = np.array([50, 100, 200, 400]))
# model = GradientBoostingRegressor(random_state  = 42)
model = GradientBoostingClassifier(random_state  = 42)
kfold = KFold(n_splits = 10, random_state = 42)
grid = GridSearchCV(estimator = model, 
                    param_grid= param_grid,
                   #scoring=RMS,
                    scoring = ACC,
                   cv = kfold)
grid_result = grid.fit(rescaledX, Y_train)



In [77]:
grid_result.cv_results_

{'mean_fit_time': array([0.65668318, 1.53812444, 2.93075538, 4.89729855]),
 'std_fit_time': array([0.08071729, 0.24087252, 0.48353821, 0.57463092]),
 'mean_score_time': array([0.00098031, 0.00267043, 0.00230787, 0.00617981]),
 'std_score_time': array([4.95714656e-05, 2.63374365e-03, 1.08758886e-04, 4.96599628e-03]),
 'param_n_estimators': masked_array(data=[50, 100, 200, 400],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 50},
  {'n_estimators': 100},
  {'n_estimators': 200},
  {'n_estimators': 400}],
 'split0_test_score': array([0.6328125, 0.6796875, 0.6953125, 0.6640625]),
 'split1_test_score': array([0.59375  , 0.5703125, 0.5859375, 0.59375  ]),
 'split2_test_score': array([0.6328125, 0.6328125, 0.703125 , 0.6640625]),
 'split3_test_score': array([0.71875 , 0.734375, 0.734375, 0.71875 ]),
 'split4_test_score': array([0.6171875, 0.6171875, 0.59375  , 0.6328125]),
 'split5_test_score': array([0.6015625

In [80]:
print(f"Best Accuracy/rmse: {grid_result.best_score_} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean_accuracy, stdev, param in zip(means, stds, params):
    print(f"{mean_accuracy}, {stdev} with: {param}")



Best Accuracy/rmse: 0.6692728838582677 using {'n_estimators': 200}
0.6434793307086614, 0.03488605961919459 with: {'n_estimators': 50}
0.6567667322834645, 0.0454465773356473 with: {'n_estimators': 100}
0.6692728838582677, 0.047264011219868673 with: {'n_estimators': 200}
0.6606606791338583, 0.03217828803760272 with: {'n_estimators': 400}


### Fit Model

In [81]:
# prepare the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
#model = GradientBoostingRegressor(random_state=42, n_estimators = 400)
model = GradientBoostingClassifier(random_state=42, n_estimators = 200)

model.fit(rescaledX, Y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=200,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [61]:
## validation data

In [82]:
rescaledValidationX = scaler.transform(X_validation)
predictions  = model.predict(rescaledValidationX)
predictions_class = predictions.astype(int)
# print("Mean squared Error: \n")
# print(mean_squared_error(Y_validation, predictions))
print("Classification Accuracy: \n")
print(accuracy_score(Y_validation, predictions))



Classification Accuracy: 

0.671875


### Evaluate

In [83]:
predictons = predictions.astype(int)
evaluate = pd.DataFrame({
    "Original Quality": Y_validation,
    "Predicted Quality": predictions,
    "Predicted Quality class": predictions_class
})


In [84]:
evaluate["difference"] = evaluate["Original Quality"] - evaluate["Predicted Quality"]
evaluate.tail()

Unnamed: 0,Original Quality,Predicted Quality,Predicted Quality class,difference
315,6.0,6.0,6,0.0
316,5.0,5.0,5,0.0
317,5.0,5.0,5,0.0
318,6.0,6.0,6,0.0
319,4.0,5.0,5,-1.0


In [85]:
evaluate.describe()

Unnamed: 0,Original Quality,Predicted Quality,Predicted Quality class,difference
count,320.0,320.0,320.0,320.0
mean,5.684375,5.6875,5.6875,-0.003125
std,0.809663,0.727322,0.727322,0.65533
min,3.0,3.0,3.0,-2.0
25%,5.0,5.0,5.0,0.0
50%,6.0,6.0,6.0,0.0
75%,6.0,6.0,6.0,0.0
max,8.0,8.0,8.0,3.0


### Adhoc predict

In [101]:
actual_sample = df.head(1)
actual_sample

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [102]:
adhoc_predict = actual_sample[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']]
adhoc_predict.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


## JSON workflow

In [103]:
json_payload = adhoc_predict.to_json()
json_payload

'{"fixed acidity":{"0":7.4},"volatile acidity":{"0":0.7},"citric acid":{"0":0.0},"residual sugar":{"0":1.9},"chlorides":{"0":0.076},"free sulfur dioxide":{"0":11.0},"total sulfur dioxide":{"0":34.0},"density":{"0":0.9978},"pH":{"0":3.51},"sulphates":{"0":0.56},"alcohol":{"0":9.4}}'

#### scale the inputs

In [104]:
scaler = StandardScaler().fit(X_train) ## careful, you are using X_train to scale
scaled_adhoc_predict = scaler.transform(adhoc_predict)
scaled_adhoc_predict

array([[-0.5359173 ,  0.94551397, -1.39462955, -0.45670298, -0.25242785,
        -0.47296984, -0.38437616,  0.55304636,  1.28836145, -0.57308093,
        -0.96751578]])

In [105]:
list(model.predict(scaled_adhoc_predict))

[5.0]

## Pickle sklearn model

In [106]:
from sklearn.externals import joblib



In [107]:
joblib.dump(model, 'red_wine_quality_prediction.joblib')

['red_wine_quality_prediction.joblib']

## Unpickle and predict

In [112]:
clf = joblib.load("red_wine_quality_prediction.joblib")

In [113]:
actual_sample = df.head(5)
adhoc_predict2 = actual_sample[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol']]
adhoc_predict2.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [114]:
## scale input
sclaer = StandardScaler().fit(adhoc_predict2)
scaled_adhoc_predict2 = sclaer.transform(adhoc_predict2)
scaled_adhoc_predict2

array([[-0.63401552,  0.17739372, -0.54410719, -0.77015405, -0.76553545,
        -0.93138063, -1.16682248,  0.66208471,  1.19757578, -0.92669641,
        -1.22474487],
       [-0.3583566 ,  1.06436231, -0.54410719,  1.6803361 ,  1.51038075,
         1.78514621,  1.27021181, -1.40693001, -0.84225109,  1.49077248,
         0.81649658],
       [-0.3583566 ,  0.47304992, -0.36273813,  0.63012604,  0.88967633,
        -0.15523011,  0.310168  , -0.99312707, -0.44744589,  0.88640526,
         0.81649658],
       [ 1.98474425, -1.89219967,  1.99505969, -0.77015405, -0.86898618,
         0.23284516,  0.75326514,  1.07588766, -1.10545456, -0.52378493,
         0.81649658],
       [-0.63401552,  0.17739372, -0.54410719, -0.77015405, -0.76553545,
        -0.93138063, -1.16682248,  0.66208471,  1.19757578, -0.92669641,
        -1.22474487]])

In [115]:
# predict
list(clf.predict(scaled_adhoc_predict2))

[5.0, 6.0, 6.0, 3.0, 5.0]