In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [2]:

from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge, ElasticNet, HuberRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [3]:
df=pd.read_csv('winequality.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
x=df.drop(['quality'],axis=1)
y=df['quality']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [6]:
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

In [7]:
y_pred=model.predict(x_test)

In [8]:
pd.DataFrame(data=[metrics.mean_absolute_error(y_test, y_pred),metrics.mean_squared_error(y_test, y_pred), np.sqrt(metrics.mean_squared_error(y_test, y_pred)), round(metrics.r2_score(y_test, y_pred), 2)], index=['MAE','MSE','RMSE','R2 Score'], columns=['Evaluation Score'])

Unnamed: 0,Evaluation Score
MAE,0.469633
MSE,0.384471
RMSE,0.620057
R2 Score,0.33


In [9]:
X_stat = sm.add_constant(x)

model_OLS = sm.OLS(y, x).fit()
print(model_OLS.summary())

OLS Regression Results                                
Dep. Variable:                quality   R-squared (uncentered):                   0.987
Model:                            OLS   Adj. R-squared (uncentered):              0.987
Method:                 Least Squares   F-statistic:                          1.108e+04
Date:                Mon, 10 Aug 2020   Prob (F-statistic):                        0.00
Time:                        09:06:19   Log-Likelihood:                         -1569.7
No. Observations:                1599   AIC:                                      3161.
Df Residuals:                    1588   BIC:                                      3221.
Df Model:                          11                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------

In [10]:
models = [
    ['Linear Regression', LinearRegression()],
    ['ElasticNet', ElasticNet()],
    ['Lasso', Lasso()],
    ['Ridge', Ridge()],
    ['DecisionTree', DecisionTreeRegressor()],
    ['RandomForest', RandomForestRegressor()],
    
    ['KNeighbours', KNeighborsRegressor(n_neighbors = 2)],
    ['SVM', SVR()],
    ['AdaBoost', AdaBoostRegressor()],
    ['GradientBoosting', GradientBoostingRegressor()],
    ['BayesianRidge', BayesianRidge()],
    ['Huber', HuberRegressor()]
]

In [15]:
RMSE_score=[]
for name, model in models:
    model = model
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    RMSE_score.append([name,(np.sqrt(mean_squared_error(y_test, predictions)))/y_test.mean()*100])

In [16]:
pd.DataFrame(RMSE_score, columns=['name','score'])

Unnamed: 0,name,score
0,Linear Regression,11.084825
1,ElasticNet,13.318852
2,Lasso,13.328164
3,Ridge,11.058347
4,DecisionTree,14.482102
5,RandomForest,10.169744
6,KNeighbours,14.168373
7,SVM,12.311275
8,AdaBoost,11.222407
9,GradientBoosting,10.844303


In [13]:
x=df[['alcohol','sulphates','pH','density','chlorides']]
y=df['quality']

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [15]:
model = LinearRegression()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)

In [16]:
pd.DataFrame(data=[metrics.mean_absolute_error(y_test, y_pred),metrics.mean_squared_error(y_test, y_pred), np.sqrt(metrics.mean_squared_error(y_test, y_pred)), round(metrics.r2_score(y_test, y_pred), 2)], index=['MAE','MSE','RMSE','R2 Score'], columns=['Evaluation Score'])

Unnamed: 0,Evaluation Score
MAE,0.487705
MSE,0.396327
RMSE,0.629545
R2 Score,0.31


In [17]:
X_stat = sm.add_constant(x)

model_OLS = sm.OLS(y, x).fit()
print(model_OLS.summary())

OLS Regression Results                                
Dep. Variable:                quality   R-squared (uncentered):                   0.986
Model:                            OLS   Adj. R-squared (uncentered):              0.986
Method:                 Least Squares   F-statistic:                          2.241e+04
Date:                Fri, 07 Aug 2020   Prob (F-statistic):                        0.00
Time:                        10:58:25   Log-Likelihood:                         -1638.9
No. Observations:                1599   AIC:                                      3288.
Df Residuals:                    1594   BIC:                                      3315.
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [17]:
RMSE_score=[]
for name, model in models:
    model = model
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    RMSE_score.append([name,np.sqrt(mean_squared_error(y_test, predictions))/y_test.mean()*100])

In [18]:
pd.DataFrame(RMSE_score, columns=['name','score'])

Unnamed: 0,name,score
0,Linear Regression,11.084825
1,ElasticNet,13.318852
2,Lasso,13.328164
3,Ridge,11.058347
4,DecisionTree,14.822904
5,RandomForest,10.11995
6,KNeighbours,14.168373
7,SVM,12.311275
8,AdaBoost,11.353547
9,GradientBoosting,10.853362
