In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib

In [2]:
# Loading the data
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [3]:
# Lets check the data first
print( data.head())


  fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                                     
1   7.8;0.88;0;2.6;0.098;25;67;0.9968;3.2;0.68;9.8;5                                                                                                                     
2  7.8;0.76;0.04;2.3;0.092;15;54;0.997;3.26;0.65;...                                                                                                                     
3  11.2;0.28;0.56;1.9;0.075;17;60;0.998;3.16;0.58...                                                                                                                     
4   7.4;0.7;0;1.9;0.076;11;34;0.9978;3.51;0.56;9.4;5                                                                                                  

In [4]:
# Lets make it more understanable by :
data = pd.read_csv(dataset_url, sep=';')
print( data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8        5  
2      9.8        5 

In [5]:
# The above figure is more unerstanable since the file was csv(comma seperated file)

In [6]:
# Lets check the size of data
print( data.shape)

(1599, 12)


In [7]:
# Thus we have 1599 samples and 12 features

In [9]:
# Lets observe some summary statistics
print(data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1599.000000          1599.000000           1599.000000  1599.000000   
mean      0.087467            15.874922             46.467792     0.996747   
std       0.047065            10.460157             32.895324     0.001887   
min       0.012000             1.000000         

In [10]:
# Quality is target(i.e y)
# make a note that variables have different scales. Thus, there is a need to STANDARDIZE the data. 
# We will do that shortly

In [14]:
# Lets Split the data into train and test
# Initially seperate the target from features
y = data.quality
# we need to remove target from train features
X = data.drop('quality', axis=1)
# We will use 20% of the data for testing purpose
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123,
                                                   stratify=y)
# random_state is seed to reproduce our results
# stratify is needed to ensure that train and test set is similar, makes evaluation metric reliable

In [15]:
# We will make use of Transform API for scaling
scaler = preprocessing.StandardScaler().fit(X_train)
# Scaler has saved mean and sd for every feature present in X_train
# Lets check
X_train_scaled = scaler.transform(X_train)
print(X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))

[  1.16664562e-16  -3.05550043e-17  -8.47206937e-17  -2.22218213e-17
   2.22218213e-17  -6.38877362e-17  -4.16659149e-18  -2.54439854e-15
  -8.70817622e-16  -4.08325966e-16  -1.17220107e-15]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]


In [16]:
# mean is very very near to 0 or negligibly greater than 0
# standard deviation is 1

In [17]:
# Applying transformer to test data
X_test_scaled = scaler.transform(X_test)
print(X_test_scaled.mean(axis=0))
print(X_test_scaled.std(axis=0))

[ 0.02776704  0.02592492 -0.03078587 -0.03137977 -0.00471876 -0.04413827
 -0.02414174 -0.00293273 -0.00467444 -0.10894663  0.01043391]
[ 1.02160495  1.00135689  0.97456598  0.91099054  0.86716698  0.94193125
  1.03673213  1.03145119  0.95734849  0.83829505  1.0286218 ]


In [18]:
# Lets try cross validation pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(), RandomForestRegressor(n_estimators=100))
# in simple words initially we are transforming the data using StandardScaler() and 
# later we are fitting the model using RandomForest and n_estimators are no. of trees used 
# since it is an ensemble method

In [19]:
# Hyperparam section 
# Initially observe the list of tunable hyperparam
print(pipeline.get_params())

{'memory': None, 'steps': [('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))], 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True), 'randomforestregressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_star

In [20]:
# 'randomforestregressor__criterion': 'mse', 'randomforestregressor__max_depth':None,
# 'randomforestregressor__max_features': 'auto', 'randomforestregressor__max_leaf_nodes': None,

In [21]:
# Lets tune hyperparam
hyperparameters = {'randomforestregressor__max_features':['auto', 'sqrt', 'log2'],
                    'randomforestregressor__max_depth': [None, 5, 3, 1]}
# since we had opte for pipeline we are using __ 
# follow the documentation for more details on hyperparam : 

In [22]:
# Lets tune model using CV with pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
# cv =  is the no. of folds to create
# Fitting and tuning the model
clf.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...ors=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
# Now we are having the leisure of getting the best params 
print(clf.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}


In [24]:
# Follow the documentation and u will be surprised that the default param work well
# as in practise Random Forest doent require a lot of tuning

In [25]:
# GridSearchCV autoamtically  refits the model with best param but we can confirm this manually
print(clf.refit)

True


In [26]:
# Evaluating on model pipeline on test data
y_pred = clf.predict(X_test)

In [27]:
# Lets use the metrics
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.457735213996
0.3499090625


In [28]:
joblib.dump(clf, 'rf_regressor.pkl')
# saving a model to a .pkl file

['rf_regressor.pkl']

In [29]:
# you can load the .pkl file and use it
clf2 = joblib.load('rf_regressor.pkl')
# Predicting model using loaded data
clf2.predict(X_test)

array([ 6.48,  5.68,  4.98,  5.53,  6.25,  5.62,  4.9 ,  4.75,  5.01,
        5.97,  5.28,  5.71,  6.02,  5.07,  5.78,  5.6 ,  6.59,  5.66,
        5.66,  6.97,  5.5 ,  5.65,  5.01,  6.09,  5.97,  5.03,  5.42,
        5.13,  5.91,  6.01,  5.84,  6.47,  5.98,  5.02,  5.03,  5.97,
        5.07,  6.14,  5.19,  5.97,  4.91,  6.04,  6.68,  5.15,  6.22,
        5.4 ,  5.54,  5.54,  5.13,  6.54,  6.14,  5.27,  5.87,  5.08,
        5.67,  5.85,  5.42,  5.37,  5.04,  5.29,  5.18,  5.02,  5.06,
        5.81,  5.87,  5.24,  6.44,  5.03,  5.13,  6.66,  5.74,  5.72,
        5.07,  5.02,  5.33,  5.99,  5.4 ,  5.12,  5.26,  5.19,  6.25,
        5.58,  6.12,  6.26,  5.07,  6.01,  6.44,  6.4 ,  5.67,  5.88,
        5.86,  5.42,  6.4 ,  5.66,  5.71,  5.79,  6.61,  6.8 ,  5.54,
        6.73,  5.06,  5.48,  5.13,  6.48,  5.05,  4.69,  5.61,  5.05,
        5.65,  5.84,  5.94,  5.51,  5.95,  5.32,  5.07,  5.24,  5.97,
        5.03,  4.98,  5.97,  5.91,  5.1 ,  5.78,  6.1 ,  5.19,  5.42,
        5.31,  5.95,