In [28]:
import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# import dataset
data = pd.read_csv("WineQT.csv")

In [4]:
# overall EDA
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64

In [7]:
data.shape

(1143, 13)

In [9]:
print(data.describe())

       fixed acidity  volatile acidity  citric acid  residual sugar  \
count    1143.000000       1143.000000  1143.000000     1143.000000   
mean        8.311111          0.531339     0.268364        2.532152   
std         1.747595          0.179633     0.196686        1.355917   
min         4.600000          0.120000     0.000000        0.900000   
25%         7.100000          0.392500     0.090000        1.900000   
50%         7.900000          0.520000     0.250000        2.200000   
75%         9.100000          0.640000     0.420000        2.600000   
max        15.900000          1.580000     1.000000       15.500000   

         chlorides  free sulfur dioxide  total sulfur dioxide      density  \
count  1143.000000          1143.000000           1143.000000  1143.000000   
mean      0.086933            15.615486             45.914698     0.996730   
std       0.047267            10.250486             32.782130     0.001925   
min       0.012000             1.000000         

In [10]:
y = data.quality
X = data.drop('quality', axis=1)

In [22]:
# split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=0, 
                                                    stratify=y)

In [44]:
# standardization
scaler = preprocessing.StandardScaler().fit(X_train)

In [47]:
X_train_scaled = scaler.transform(X_train)
 
print(X_train_scaled.mean(axis=0))
 
print(X_train_scaled.std(axis=0))

[ 0. -0. -0.  0.  0.  0. -0. -0.  0. -0. -0. -0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [50]:
X_test_scaled = scaler.transform(X_test)
 
print(X_test_scaled.mean(axis=0))

print(X_test_scaled.std(axis=0))

[ 0.0516844   0.06599752  0.05623154 -0.10855688  0.05412457  0.00850615
 -0.03251446  0.00909912 -0.03784137  0.03269031 -0.01708086  0.1008172 ]
[1.08145869 0.95532387 1.09518898 0.62770686 1.06051842 0.9703362
 0.96438469 0.96920281 1.01193127 1.11798924 0.98500764 1.02582719]


In [51]:
# declare data preprocessing steps
pipeline = make_pipeline(preprocessing.StandardScaler(),
                         RandomForestRegressor(n_estimators=100,
                                               random_state=123))

In [65]:
# declare hyperparameters to tune
hyperparameters = { 'randomforestregressor__max_features' : ['auto', 'sqrt', 'log2'],
                  'randomforestregressor__max_depth': [None, 5, 3, 1]}

In [66]:
# Tune model using cross-validation pipeline
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
clf.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [57]:
print( clf.best_params_ )

{'randomforestregressor__max_depth': None, 'randomforestregressor__max_features': 'sqrt'}


In [58]:
y_pred = clf.predict(X_test)

In [62]:
print((r2_score(y_test, y_pred)))
print((mean_squared_error(y_test, y_pred)))

0.436636581545191
0.3634720524017468


In [63]:
import joblib 

In [64]:
# save model for future use
joblib.dump(clf, 'rf_regressor.pkl')
# To load: clf2 = joblib.load('rf_regressor.pkl')

['rf_regressor.pkl']