# Wine Quality Regression

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
%matplotlib inline

In [2]:
from IPython.core.interactiveshell import InteractiveShell

# Show the value of every top-level expression in a cell, not just the last one.
# The option is spelled `ast_node_interactivity`; a misspelled attribute name is
# silently accepted by Python and leaves the display behaviour unchanged.
InteractiveShell.ast_node_interactivity = "all"

# Dataset
Wine Quality dataset from:
https://archive.ics.uci.edu/ml/datasets/Wine+Quality

We use only the red wine dataset (`winequality-red.csv`) for this model.

In [3]:
# Read the red-wine table from the CSV next to the notebook and preview its
# first five rows.
redwine = pd.read_csv(filepath_or_buffer="winequality-red.csv")
redwine.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
# The quartiles listed here are the ones describe() reports by default.
redwine.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [5]:
# Separate predictors from the target: every column except 'quality' is a
# feature, and 'quality' itself is the value the models try to predict.
x = redwine.drop(columns=['quality'])
y = redwine.loc[:, 'quality']
x.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [6]:
# Preview the first five target values.
y.head(n=5)

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

# Linear Regression
## K-fold Cross Validation
Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.
We use 5 folds.

In [7]:
# Evaluate ordinary least-squares regression with 5-fold cross-validation.
# The feature frame and the target are converted to NumPy arrays so that the
# positional indices yielded by KFold.split can be used directly for row selection.
X = np.array(x)
y = np.array(y)

acc = []  # NOTE(review): never filled in the visible cells - confirm it is still needed

# Folds are contiguous, unshuffled slices of the rows. `random_state` only has a
# meaning together with shuffle=True, and current scikit-learn raises a ValueError
# when it is combined with shuffle=False, so it is not passed.
kf = KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit a fresh model on the training folds and predict the held-out fold.
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    predict_test = lm.predict(X_test)

    print("Mean Square Error for Test data is")
    print(np.round(metrics.mean_squared_error(y_test, predict_test),2))
    print("Mean Absolute Error for Test data is")
    print(np.round(mean_absolute_error(y_test, predict_test),2))

    print("")

# After the loop, lm, X_train/X_test, y_train/y_test and predict_test all refer
# to the last fold; the following cells read them.

Mean Square Error for Test data is
0.45
Mean Absolute Error for Test data is
0.52

Mean Square Error for Test data is
0.45
Mean Absolute Error for Test data is
0.53

Mean Square Error for Test data is
0.44
Mean Absolute Error for Test data is
0.5

Mean Square Error for Test data is
0.41
Mean Absolute Error for Test data is
0.51

Mean Square Error for Test data is
0.43
Mean Absolute Error for Test data is
0.5



# Model coefficients

In [8]:
# Refit the linear model on the training split left over from the last
# cross-validation fold, then pull out its goodness of fit and parameters.
model = lm.fit(X_train, y_train)
r_sq = model.score(X_train, y_train)

intercept = model.intercept_
coefficients = model.coef_

# One weight per feature column.
coefficients

array([ 2.34931933e-02, -1.05500265e+00, -1.82869112e-01,  1.06292388e-02,
       -1.75741863e+00,  3.26738029e-03, -3.68402556e-03, -2.00895607e+01,
       -3.43003188e-01,  8.05254545e-01,  2.83320154e-01])

In [9]:
# Build a comparison table for the held-out rows of the last CV fold: the
# feature columns, the true target, the rounded prediction and the signed error.
# `axis` is passed by keyword because pandas 2.x no longer accepts it positionally.
fdf = pd.concat([pd.DataFrame(X_test), pd.DataFrame(y_test)], axis=1)
fdf['Predicted'] = np.round(predict_test, 1)

# The true target sits in the column right after the features, so its position
# is derived from the feature count instead of being hard-coded.
target_pos = X_test.shape[1]
fdf['Prediction_Error'] = fdf.iloc[:, target_pos] - fdf['Predicted']
fdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,0.1,Predicted,Prediction_Error
0,7.1,0.460,0.20,1.90,0.077,28.0,54.0,0.99560,3.37,0.64,10.4,6,5.7,0.3
1,7.1,0.460,0.20,1.90,0.077,28.0,54.0,0.99560,3.37,0.64,10.4,6,5.7,0.3
2,7.9,0.765,0.00,2.00,0.084,9.0,22.0,0.99619,3.33,0.68,10.9,6,5.7,0.3
3,8.7,0.630,0.28,2.70,0.096,17.0,69.0,0.99734,3.26,0.63,10.2,6,5.4,0.6
4,7.0,0.420,0.19,2.30,0.071,18.0,36.0,0.99476,3.39,0.56,10.9,5,5.9,-0.9
5,11.3,0.370,0.50,1.80,0.090,20.0,47.0,0.99734,3.15,0.57,10.5,5,5.8,-0.8
6,7.1,0.160,0.44,2.50,0.068,17.0,31.0,0.99328,3.35,0.54,12.4,6,6.6,-0.6
7,8.0,0.600,0.08,2.60,0.056,3.0,7.0,0.99286,3.22,0.37,13.0,5,6.4,-1.4
8,7.0,0.600,0.30,4.50,0.068,20.0,110.0,0.99914,3.30,1.17,10.2,5,5.7,-0.7
9,7.0,0.600,0.30,4.50,0.068,20.0,110.0,0.99914,3.30,1.17,10.2,5,5.7,-0.7


# Logistic Regression
## K-fold Cross Validation
Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.
We use 5 folds. Here the integer quality scores are treated as discrete class labels.

In [10]:
# Evaluate logistic regression with the same 5-fold cross-validation scheme.
# LogisticRegression is imported in the notebook's import cell.
X = np.array(x)
y = np.array(y)

acc = []  # NOTE(review): never filled in the visible cells - confirm it is still needed

# Folds are contiguous, unshuffled slices of the rows. `random_state` only has a
# meaning together with shuffle=True, and current scikit-learn raises a ValueError
# when it is combined with shuffle=False, so it is not passed.
kf = KFold(n_splits=5, shuffle=False)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit a fresh classifier on the training folds and predict the held-out fold.
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    predict_test = lr.predict(X_test)

    print("Mean Square Error for Test data is")
    print(np.round(metrics.mean_squared_error(y_test, predict_test),2))
    print("Mean Absolute Error for Test data is")
    print(np.round(mean_absolute_error(y_test, predict_test),2))

    print("")

# After the loop, lr, X_train/X_test, y_train/y_test and predict_test all refer
# to the last fold; the following cells read them.



Mean Square Error for Test data is
0.47
Mean Absolute Error for Test data is
0.42

Mean Square Error for Test data is
0.62
Mean Absolute Error for Test data is
0.54

Mean Square Error for Test data is
0.59
Mean Absolute Error for Test data is
0.48

Mean Square Error for Test data is
0.53
Mean Absolute Error for Test data is
0.48

Mean Square Error for Test data is
0.5
Mean Absolute Error for Test data is
0.42





In [11]:
# Build a comparison table for the held-out rows of the last CV fold: the
# feature columns, the true class, the predicted class and the signed error.
# `axis` is passed by keyword because pandas 2.x no longer accepts it positionally.
fdf = pd.concat([pd.DataFrame(X_test), pd.DataFrame(y_test)], axis=1)
fdf['Predicted'] = np.round(predict_test, 1)

# The true target sits in the column right after the features, so its position
# is derived from the feature count instead of being hard-coded.
target_pos = X_test.shape[1]
fdf['Prediction_Error'] = fdf.iloc[:, target_pos] - fdf['Predicted']
fdf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,0.1,Predicted,Prediction_Error
0,7.1,0.460,0.20,1.90,0.077,28.0,54.0,0.99560,3.37,0.64,10.4,6,6,0
1,7.1,0.460,0.20,1.90,0.077,28.0,54.0,0.99560,3.37,0.64,10.4,6,6,0
2,7.9,0.765,0.00,2.00,0.084,9.0,22.0,0.99619,3.33,0.68,10.9,6,6,0
3,8.7,0.630,0.28,2.70,0.096,17.0,69.0,0.99734,3.26,0.63,10.2,6,5,1
4,7.0,0.420,0.19,2.30,0.071,18.0,36.0,0.99476,3.39,0.56,10.9,5,6,-1
5,11.3,0.370,0.50,1.80,0.090,20.0,47.0,0.99734,3.15,0.57,10.5,5,6,-1
6,7.1,0.160,0.44,2.50,0.068,17.0,31.0,0.99328,3.35,0.54,12.4,6,6,0
7,8.0,0.600,0.08,2.60,0.056,3.0,7.0,0.99286,3.22,0.37,13.0,5,6,-1
8,7.0,0.600,0.30,4.50,0.068,20.0,110.0,0.99914,3.30,1.17,10.2,5,5,0
9,7.0,0.600,0.30,4.50,0.068,20.0,110.0,0.99914,3.30,1.17,10.2,5,5,0


In [13]:
# Refit the logistic model on the training split left over from the last
# cross-validation fold, then pull out its score and fitted parameters.
model = lr.fit(X_train, y_train)

# NOTE: for LogisticRegression, `score` returns mean accuracy rather than R^2;
# the variable name is kept unchanged.
r_sq = model.score(X_train, y_train)

intercept = model.intercept_
coefficient = model.coef_

# One row of weights per quality class, one column per feature.
coefficient



array([[ 2.65589559e-01,  1.15639586e+00,  4.37796966e-02,
         1.54071022e-01,  1.46855451e-01,  1.13352685e-01,
        -8.42760310e-02, -4.13444669e-02,  4.96400955e-01,
        -1.76349710e-01, -8.95350253e-01],
       [-1.91143732e-01,  2.41570251e+00, -2.38438502e-01,
         1.83811551e-01,  1.55417147e-01, -1.37728729e-02,
        -8.93582692e-03, -2.63930058e-01, -2.55427417e-02,
        -4.18973199e-01, -2.09686378e-01],
       [ 2.17065756e-02,  2.00840259e+00,  5.76225775e-01,
        -2.15682072e-02,  1.33604241e+00, -2.46114640e-02,
         2.14703542e-02,  1.39983131e+00,  1.43627390e+00,
        -1.44332853e+00, -8.77353454e-01],
       [ 6.53770898e-02, -1.57802003e+00, -1.18385897e+00,
        -7.47262823e-02, -2.14068609e-01,  2.72328347e-02,
        -1.41529792e-02, -2.05935265e-01, -2.55600627e-01,
         5.19879954e-01,  1.45550330e-01],
       [-8.76671067e-02, -2.61818252e+00,  3.54209484e-01,
         1.31819855e-01, -1.26542034e+00,  1.35345323e-02,
  