In [59]:
#import all the required library
import numpy as np
import pandas as pd
#this module contains many utilities that will help us choose between models
from sklearn.model_selection import train_test_split
#we'll import the entire preprocessing module. This contains utilities for scaling, transforming, and wrangling data.
from sklearn import preprocessing
#We can import the random forest family like so
from sklearn.ensemble import RandomForestRegressor
#import CV pipeline
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
#import evaluation metrics
from sklearn.metrics import mean_squared_error, r2_score
#import a way to persist our models for the future
from sklearn.externals import joblib

In [10]:
#load the data
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
#add separateur
data = pd.read_csv(dataset_url, sep=';')

In [14]:
data.head()
#print (data.shape)
#(1599, 12)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [16]:
#look at the data
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [19]:
#look at the columns
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [17]:
#separate training from target feature
y = data.quality
X = data.drop('quality', axis=1)

In [21]:
#split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y)

In [41]:
print(X_train.mean(axis=0))
print(X_train.std(axis=0))
#the dataset needs to be standardized

fixed acidity            8.310008
volatile acidity         0.526892
citric acid              0.272181
residual sugar           2.547811
chlorides                0.087512
free sulfur dioxide     15.968335
total sulfur dioxide    46.625489
density                  0.996748
pH                       3.311259
sulphates                0.661955
alcohol                 10.420771
dtype: float64
fixed acidity            1.733566
volatile acidity         0.179015
citric acid              0.195788
residual sugar           1.434551
chlorides                0.048282
free sulfur dioxide     10.579362
total sulfur dioxide    32.652856
density                  0.001876
pH                       0.155704
sulphates                0.174623
alcohol                  1.059603
dtype: float64


In [28]:
#Fitting the transformer API - which is then applied on new dataset - the train and the test set for example
scaler = preprocessing.StandardScaler().fit(X_train)

In [34]:
# test that this transformer API has been succesfully fitted by apply it to the train set
X_train_scaled = scaler.transform(X_train)
#test if it has been well standardized
print(X_train_scaled.mean(axis=0))
#we can see that the mean is 0 for all columns
print(X_train_scaled.std(axis=0))
#the standard deviation is 1 also for all

[  1.16664562e-16  -3.05550043e-17  -8.47206937e-17  -2.22218213e-17
   2.22218213e-17  -6.38877362e-17  -4.16659149e-18  -2.54439854e-15
  -8.70817622e-16  -4.08325966e-16  -1.17220107e-15]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


In [43]:
#apply the same transformation on the test data
X_test_scaled = scaler.transform(X_test)

print(X_test_scaled.mean(axis=0))
print(X_test_scaled.std(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[ 1.02160495  1.00135689  0.97456598  0.91099054  0.86716698  0.94193125
  1.03673213  1.03145119  0.95734849  0.83829505  1.0286218 ]


In [44]:
#set up pipeline with preprocessing and model
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [45]:
#look at all the parameters:
pipeline.get_params()

{'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
 'randomforestregressor__bootstrap': True,
 'randomforestregressor__criterion': 'mse',
 'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'auto',
 'randomforestregressor__max_leaf_nodes': None,
 'randomforestregressor__min_impurity_split': 1e-07,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__min_weight_fraction_leaf': 0.0,
 'randomforestregressor__n_estimators': 100,
 'randomforestregressor__n_jobs': 1,
 'randomforestregressor__oob_score': False,
 'randomforestregressor__random_state': None,
 'rando

In [46]:
#declare hyperparmeters we want to tune thourgh the CV pipeline
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [49]:
#CV pipeline with SKlearn
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [50]:
#look at the best params defined through the CV pipeline
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'sqrt'}

In [53]:
#GridSearchCV from sklearn will automatically refit the model with the best set of hyperparameters using the entire training set.
#This functionality is ON by default, but you can confirm it:
print(clf.refit)
# True

True


In [54]:
#predict a new set of dat
y_pred = clf.predict(X_test)

In [60]:
#evalate the model performance
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.46725007567
0.343769375


In [61]:
#save model to a .pkl file
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']

In [62]:
#load the model from the save file
clf2 = joblib.load('rf_regressor.pkl')

In [63]:
#predict data using the save model
clf2.predict(X_test)

array([ 6.5 ,  5.79,  5.04,  5.49,  6.38,  5.59,  4.96,  4.84,  4.99,
        5.98,  5.26,  5.72,  5.83,  5.09,  5.76,  5.56,  6.53,  5.82,
        5.65,  6.97,  5.46,  5.66,  5.  ,  6.02,  5.98,  5.05,  5.54,
        5.2 ,  5.86,  6.  ,  5.86,  6.41,  6.  ,  4.97,  4.99,  5.97,
        5.04,  6.  ,  5.08,  5.86,  4.9 ,  5.86,  6.68,  5.1 ,  6.22,
        5.38,  5.48,  5.42,  5.12,  6.64,  5.93,  5.26,  5.8 ,  5.29,
        5.47,  5.68,  5.37,  5.35,  4.99,  5.23,  5.33,  5.07,  5.03,
        5.86,  5.97,  5.42,  6.34,  5.03,  5.17,  6.68,  5.82,  5.71,
        5.09,  5.03,  5.22,  5.98,  5.33,  5.08,  5.18,  5.16,  6.41,
        5.62,  6.01,  6.32,  5.08,  5.92,  6.33,  6.44,  5.79,  5.73,
        5.93,  5.37,  6.29,  5.64,  5.64,  5.89,  6.71,  6.64,  5.42,
        6.75,  5.1 ,  5.4 ,  5.13,  6.51,  5.07,  4.69,  5.7 ,  4.99,
        5.66,  5.97,  5.83,  5.41,  6.04,  5.39,  5.22,  5.32,  5.94,
        5.04,  5.19,  5.96,  5.82,  5.11,  5.72,  5.93,  5.16,  5.35,
        5.42,  5.98,