In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.externals import joblib 


In [3]:
from sklearn.ensemble import RandomForestRegressor

In [4]:
# tools that'll help us from cv
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [5]:
# importing some metrics we can use to evaluate our model performance
from sklearn.metrics import mean_squared_error, r2_score

In [6]:
# Load wine data from remote URLPython
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [8]:
data.head() # Notice how the data is seperated with the ';'. We need to use that as separtors

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5
1,7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5
2,7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...
3,11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...
4,7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5


In [9]:
data = pd.read_csv(dataset_url, sep=';') # Using the sep parameters

In [10]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [11]:
data.shape # There are 1599 observations with 12 features

(1599, 12)

In [12]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


We have the following
- quality (target)
- fixed acidity
- volatile acidity
- citric acid
- residual sugar
- chlorides
- free sulfur dioxide
- total sulfur dioxide
- density
- pH
- sulphates
- alcohol features


In [18]:
## Splitting the data into training and testing set ##

In [13]:

# Splitting the data into training and test sets at the beginning 
# of your modeling workflow is crucia l for getting a realistic
# estimate of your model's performance.
#

y_var = data.quality
X_fts = data.drop('quality', axis=1)

In [17]:
# Using the train_test_split function to create mult. testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X_fts, y_var, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=y_var)

# Info. on parameters
# test_size: The pct of the test_size, thus 20% will be test, 80% will be training
# random_state: We set up an arbitrary number so we can reproduce the results (we do with the seed)
# stratify: You are basically asking the model to take the training and test set 
#           such that the class proportion is same as of the whole dataset, which is the right thing to do.

In [None]:
## Standarizing our features ##

In [28]:
# Prepocessing 
# Here's what that process looks like:
#    - Fit the transformer on the training set (saving the means and standard deviations)
#    - Apply the transformer to the training set (scaling the training data)
#    - Apply the transformer to the test set (using the same means and standard deviations)

scaler = preprocessing.StandardScaler().fit(X_train) # This saves the scaler object with the appr. mean and std dev


array([  1.16664562e-16,  -3.05550043e-17,  -8.47206937e-17,
        -2.22218213e-17,   2.22218213e-17,  -6.38877362e-17,
        -4.16659149e-18,  -2.54439854e-15,  -8.70817622e-16,
        -4.08325966e-16,  -1.17220107e-15])

In [29]:
X_train_scaled = scaler.transform(X_train)
X_train_scaled.std(axis=0)

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.])

In [30]:
# Performing the normalization with the info. from the training set
# to the testing set
X_train_scaled.mean(axis=0) # This should be 0?

array([  1.16664562e-16,  -3.05550043e-17,  -8.47206937e-17,
        -2.22218213e-17,   2.22218213e-17,  -6.38877362e-17,
        -4.16659149e-18,  -2.54439854e-15,  -8.70817622e-16,
        -4.08325966e-16,  -1.17220107e-15])

In [31]:
X_test_scaled = scaler.transform(X_test)

In [32]:
X_test_scaled.mean(axis=0), X_test_scaled.std(axis=0) 
# Notice how the scaled features in the test set are not perfectly centered at zero 
# with unit variance! This is exactly what we'd expect, as we're transforming  
# the test set using the means from the training set, not from the test set itself.

(array([ 0.02776704,  0.02592492, -0.03078587, -0.03137977, -0.00471876,
        -0.04413827, -0.02414174, -0.00293273, -0.00467444, -0.10894663,
         0.01043391]),
 array([ 1.02160495,  1.00135689,  0.97456598,  0.91099054,  0.86716698,
         0.94193125,  1.03673213,  1.03145119,  0.95734849,  0.83829505,
         1.0286218 ]))

In [33]:
# Creating a pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestRegressor(n_estimators=100))

In [None]:
## Declare hyperparameters to tune ##

In [34]:
pipeline.get_params() # The different hyperparams. that can be motified

{'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
 'randomforestregressor__bootstrap': True,
 'randomforestregressor__criterion': 'mse',
 'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'auto',
 'randomforestregressor__max_leaf_nodes': None,
 'randomforestregressor__min_impurity_split': 1e-07,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__min_weight_fraction_leaf': 0.0,
 'randomforestregressor__n_estimators': 100,
 'randomforestregressor__n_jobs': 1,
 'randomforestregressor__oob_score': False,
 'randomforestregressor__random_state': None,
 'rando

In [36]:
# Using only your training set, you can use CV to evaluate 
# different hyperparameters and estimate their effectiveness.
# This allows you to keep your test set "untainted" and save 
# it for a true hold-out evaluation when you're finally ready to select a model.

# The best practice when performing CV is to include your data preprocessing 
# steps inside the cross-validation loop

hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [37]:
clf.best_params_

{'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 'sqrt'}

In [38]:
clf.refit # get a small performance improvement by refitting the model on the entire training set.

True

In [39]:
y_pred = clf.predict(X_test)

In [40]:
r2_score(y_test, y_pred)

0.46918288031963207

In [41]:
mean_squared_error(y_test, y_pred)

0.34252218750000002

In [44]:
joblib.dump(clf, 'rf_regressor.pkl')

['rf_regressor.pkl']