In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.externals import joblib
from sklearn.metrics import accuracy_score

In [2]:
#load wine-datset
dataset_url = 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(dataset_url)

In [3]:
#read dataset by separating semicolon
data = pd.read_csv(dataset_url, sep=';')

In [4]:
#data analyze
print(data.head(1))
print(data.shape)
print(data.describe())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4               0.7          0.0             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
(1599, 12)
       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1599.000000       1599.000000  1599.000000     1599.000000   
mean        8.319637          0.527821     0.270976        2.538806   
std         1.741096          0.179060     0.194801        1.409928   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.390000     0.090000        1.900000   
50%         7.900000          0.520000     0.260000        2.200000   
75%         9.200000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   


In [5]:
#declare independet and dependent variabls

y = data.quality
x = data.drop('quality', axis=1)


#split data into training & testing

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123, stratify=y)

In [6]:
#pipeline with preprosessing model-parameters
pipeline = make_pipeline(preprocessing.StandardScaler(),RandomForestRegressor(n_estimators=100))


#declare random forest hyper parametrs to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]}

#sklearn cross-validaion pipeline
model = GridSearchCV(pipeline, hyperparameters, cv=10)
model.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestregressor', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decr...ors=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'], 'randomforestregressor__max_depth': [None, 5, 3, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
print(model.best_params_)

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'auto'}


In [8]:
#Conveniently, GridSearchCV from sklearn will automatically refit the model with the best set of hyperparameters using the entire training set.
print(model.refit)

True


In [9]:
#prediction

y_pred = model.predict(x_test)
y_pred

array([6.53, 5.77, 4.99, 5.58, 6.41, 5.6 , 4.83, 4.7 , 5.01, 6.05, 5.21,
       5.66, 5.81, 5.08, 5.79, 5.57, 6.7 , 5.72, 5.73, 6.98, 5.54, 5.74,
       4.99, 6.05, 5.91, 5.05, 5.54, 5.14, 6.03, 5.97, 5.92, 6.71, 6.02,
       5.01, 4.92, 5.96, 4.99, 5.91, 5.12, 5.72, 4.84, 5.86, 6.76, 5.13,
       6.21, 5.42, 5.59, 5.5 , 5.05, 6.61, 5.84, 5.2 , 5.91, 5.14, 5.61,
       5.74, 5.22, 5.3 , 4.99, 5.35, 5.36, 5.17, 5.06, 5.81, 5.89, 5.19,
       6.36, 5.08, 5.09, 6.69, 5.8 , 5.51, 5.04, 4.99, 5.33, 5.96, 5.24,
       5.1 , 5.29, 5.14, 6.51, 5.58, 6.21, 6.59, 5.11, 5.96, 6.53, 6.12,
       5.52, 5.57, 5.82, 5.29, 6.45, 5.71, 5.78, 5.82, 6.63, 6.67, 5.5 ,
       6.83, 5.02, 5.49, 5.07, 6.68, 5.02, 4.64, 5.72, 5.06, 5.55, 5.97,
       5.76, 5.61, 6.1 , 5.41, 5.21, 5.25, 5.89, 5.02, 4.94, 6.05, 5.82,
       5.06, 5.79, 5.89, 5.14, 5.29, 5.43, 5.88, 5.52, 5.47, 5.57, 6.32,
       5.1 , 5.29, 5.06, 6.53, 5.01, 5.15, 6.7 , 5.38, 5.18, 5.07, 5.64,
       6.12, 5.37, 5.44, 5.14, 6.71, 5.45, 5.08, 5.

In [10]:
#evaluating model

print(mean_squared_error(y_pred, y_test))
print(r2_score(y_pred, y_test))

0.335678125
-0.3829174541171618
