In [None]:
from __future__ import print_function
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.formula.api import ols
try:
    # scikit-learn >= 0.18: the splitters live in `model_selection`.
    from sklearn.model_selection import KFold, train_test_split
except ImportError:
    # Legacy scikit-learn (< 0.18). NOTE: the legacy KFold constructor takes
    # different arguments than the `model_selection` one.
    from sklearn.cross_validation import KFold
    from sklearn.cross_validation import train_test_split

import warnings
warnings.filterwarnings('ignore')

# Read the semicolon-delimited wine table into `df` and preview the first rows.
df = pd.read_csv(
    "VinhoVerde_WhiteWineQuality.csv",
    sep=';',
)
df.head()



Loaded the wine data in 'df'

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the columns in `df`.
df.describe()

In [None]:
# Pairwise scatter plots of every column against every other, coloured by
# the wine quality score.
sns.pairplot(df, hue="quality")
# `sns.plt` is not available in current seaborn releases; use the pyplot
# module imported at the top of the notebook instead.
plt.show()

The pairplot shows the relationship of every column with every other column. Wine quality is known to depend strongly on alcohol and density, and the pairplot above shows the same result; therefore we will move forward with these two variables.

### Single Variable Linear Regression

In [None]:
# Alcohol against density with one fitted regression line per quality score.
sns.lmplot(y="alcohol", x="density", data=df,  hue="quality")
# `sns.plt` is not available in current seaborn releases; use the pyplot
# module imported at the top of the notebook instead.
plt.show()

Now that we have seen the correlation and the significant relationship, we will move forward and split our dataset to fit it with a linear regression model.

As we can see from the above stats, the R-squared value of 0.98 means the model fits our data very well, whereas a p-value of 0 means we can reject the null hypothesis (H0: the coefficient is zero, i.e. density has no effect on alcohol).

In [None]:
# In the original cell order `results` is first assigned much further down
# the notebook, so this cell raised NameError on a fresh kernel. Fit the same
# statsmodels OLS (alcohol on density, no added constant column) right here.
# NOTE(review): without a constant, statsmodels reports the uncentered R2 —
# confirm that a no-intercept model is what is intended.
results = sm.OLS(df[["alcohol"]], df[["density"]]).fit()
print('Parameters: ', results.params)
print('R2: ', results.rsquared)

In [None]:
# Single-feature design matrix (density) and target (alcohol), both kept as
# one-column DataFrames, followed by a marker-only scatter of the raw data.
X = df.loc[:, ["density"]]
y = df.loc[:, ["alcohol"]]
plt.plot(X, y, "o")

The plot shows a negative relation between alcohol and density.

In [None]:
# Our model needs an intercept, so add a column of 1s (named `const`) to X.
X = sm.add_constant(X)

# Plot only the density column against alcohol; plotting the whole of X would
# also draw the all-ones `const` column as a second, meaningless series.
plt.plot(X["density"], y, "o")

# Red line: intercept + slope * density.
# NOTE(review): the coefficients are hard-coded — presumably copied from an
# earlier OLS fit with a constant; verify against the fitted parameters.
x2 = np.linspace(0.98, 1.05, 51)
y2 = 329.588 + (-320.991*x2)
plt.plot(x2, y2, lw=2, color="r")

# Green line: slope-only line through the origin (hard-coded coefficient).
x2 = np.linspace(0.99, 1.05, 31)
y2 = (10.574*x2)
plt.plot(x2, y2, lw=2, color="g")

plt.xlabel("density")
plt.ylabel("alcohol")
plt.show()

This plot shows us the regression line and our mean line.
Now, after consulting the pairplot above again, we have picked chlorides, pH and density to model alcohol with a multiple linear regression.

### Multiple Variable Linear Regression

In [None]:
# `sklearn.cross_validation` was removed from scikit-learn; the splitter now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Features (density, chlorides, pH) and target (alcohol) as NumPy arrays.
Xpoly = df[["density","chlorides","pH"]].values
Ypoly = df[["alcohol"]].values

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_trainpoly, X_testpoly, Y_trainpoly, Y_testpoly = train_test_split(Xpoly, Ypoly, test_size = .20 , random_state = 0)
print(X_trainpoly)

In [None]:
# `sklearn.cross_validation` was removed from scikit-learn; the splitter now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Fit an ordinary least-squares model on the three-feature training split.
# `fit` returns the estimator itself, so `polyresults` is the fitted model.
modelpoly = LinearRegression()
polyresults = modelpoly.fit(X_trainpoly, Y_trainpoly)

# In-sample predictions (on the training rows).
Y_predictorpoly=modelpoly.predict(X_trainpoly)
print (Y_predictorpoly)

In [None]:
# Statsmodels OLS for the *multiple* regression of this section: alcohol on
# density, chlorides and pH (training split), with an explicit intercept.
# The previous call used the single-variable `y`/`X`, so the summary did not
# describe the multi-feature model discussed here.
# In the summary, `const` is the intercept and x1, x2, x3 are density,
# chlorides and pH (the column order used to build Xpoly).
modelpoly = sm.OLS(Y_trainpoly, sm.add_constant(X_trainpoly))
polyresults = modelpoly.fit()
polyresults.summary()

The R-squared value indicates a pretty good fit for our multiple linear regression model.
Here, too, the null hypothesis is rejected, with a p-value of 0.

Now we shall check the accuracy of our model, by root mean square error value (rmse).

### Accuracy Check : RMSE

In [None]:
# Null (baseline) model: predict the mean of the test targets for every row.
# Built as a float array with the same shape as Y_testpoly.
y_null = np.full_like(Y_testpoly, Y_testpoly.mean(), dtype=float)
y_null[:33]

In [None]:
from sklearn import metrics

# Root-mean-square error of the mean-only baseline against the test targets.
null_mse = metrics.mean_squared_error(Y_testpoly, y_null)
y_null_rmse = np.sqrt(null_mse)
y_null_rmse

In [None]:
# Baseline RMSE expressed as a fraction of the mean test target.
y_null_rmse / np.mean(Y_testpoly)

We have an RMSE value of 1.2; let's apply regularization via ridge regression to improve our model.

### Regularization : Ridge Regression

In [None]:
# In the original cell order `ridge` and `y_pred` are first assigned further
# down the notebook, so this cell raised NameError on a fresh kernel. Fit the
# ridge model (same hyper-parameters as used later) and predict here.
ridge = Ridge(fit_intercept=True, alpha=0.5)
ridge.fit(X_trainpoly, Y_trainpoly)
y_pred = ridge.predict(X_testpoly)

# Root-mean-square error of the ridge predictions on the held-out test rows.
rmse = np.sqrt(metrics.mean_squared_error(Y_testpoly, y_pred))
rmse

The RMSE is reduced; therefore regularization did have an effect on our model.

Now let's begin with logistic regression.
For that, we recode the quality column as 0 or 1: wines with quality 0-5 are classified as bad (0) and wines with quality 6-10 as good (1), making our target variable categorical.

### Logistic Regression

In [None]:
# Recode quality into a binary label: scores 1-5 -> 0 (bad), 6-10 -> 1 (good).
quality_to_label = {score: int(score >= 6) for score in range(1, 11)}

# Guard against re-running the cell: once the column is already 0/1, applying
# the mapping again would send every 1 to 0 and wipe out the "good" class.
if df["quality"].max() > 1:
    df["quality"] = df["quality"].replace(quality_to_label)

In [None]:
# Binary quality label against alcohol with a fitted line and no
# confidence band.
sns.lmplot(data=df, x="alcohol", y="quality", ci=None)

In [None]:
# Remove the columns that are not used for the classification step.
# `axis` is passed by keyword: the positional form `drop(label, 1)` is no
# longer accepted by current pandas releases.
df = df.drop(["free_sulfur_dioxide", "citric_acid", "density"], axis=1)
df.head()

We drop free sulfur dioxide, citric acid and density because they have an insignificant effect on our classification variable, quality; the relationships among the remaining columns are displayed in the heat map below.

In [None]:
# Pairwise correlation matrix of the remaining columns, drawn as a heat map.
correlations = df.corr()
sns.heatmap(correlations)
plt.show()

The above heat map shows independence among the independent variables, i.e. a lack of multicollinearity.

In [None]:
from sklearn.linear_model import LogisticRegression

# Features for the classifier and the binary quality label.
X = df[["residual_sugar","pH","alcohol"]].values
# Select the label as a 1-D array: scikit-learn classifiers expect `y` of
# shape (n_samples,), and an (n, 1) column vector triggers a conversion
# warning (hidden here because warnings are silenced).
y = df["quality"].values

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = .20, random_state=0)

classifier=LogisticRegression()
classifier.fit(X_train, y_train)


In [None]:
# Mean accuracy of the fitted classifier on the held-out test split.
classifier.score(X_test,y_test)

The logistic regression model was fit to the dataset; its accuracy (score) on the test set is 0.67.

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted quality labels for the held-out test rows.
y_pred=classifier.predict(X_test)

# Store the result under its own name: assigning it to `confusion_matrix`
# would overwrite the imported function with an array.
quality_confusion = confusion_matrix(y_test, y_pred)
print(quality_confusion)

The confusion matrix result is telling us that we have 119+539 correct predictions and 86+236 incorrect predictions.

In [None]:
# RMSE of the logistic predictions against the true quality labels.
# The previous version compared against `Y_testpoly` (the alcohol targets of
# the regression section), i.e. against a different variable altogether; the
# RMSE quoted in the text that follows presumably stems from that mismatch.
log_rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
log_rmse

The RMSE is 9.7; let's perform a K-fold validation with 5 folds to test our model.

### Accuracy and Cross Validation

In [None]:
# `sklearn.cross_validation` was removed from scikit-learn; the splitter now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# An earlier cell drops the `density` column from `df`, so in top-to-bottom
# execution `df[["density"]]` raises KeyError. Fall back to re-reading the
# source file when the column is no longer present.
if "density" in df.columns:
    wine = df
else:
    wine = pd.read_csv("VinhoVerde_WhiteWineQuality.csv", sep=';')

X = wine[["density"]]
Y = wine[["alcohol"]]

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .20 , random_state = 0)

In [None]:
# Training features as an explicit (n_samples, 1) array, built once and used
# for both fitting and in-sample prediction.
train_features = X_train.values.reshape(len(X_train), 1)

# `fit` returns the estimator, so `results` refers to the fitted model.
model = LinearRegression()
results = model.fit(train_features, Y_train)

Y_predictor = model.predict(train_features)
print (Y_predictor)

In [None]:
# Statsmodels OLS of Y on X exactly as they stand — no constant column is
# added in this cell — followed by the full regression summary.
model = sm.OLS(endog=Y, exog=X)
results = model.fit()
results.summary()

In [None]:

from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet,
    SGDRegressor,
)

# Ridge regression (L2 penalty strength 0.5, with intercept) fitted on the
# three-feature training split.
ridge = Ridge(alpha=0.5, fit_intercept=True)
ridge.fit(X_trainpoly, Y_trainpoly)

In [None]:
# Ridge predictions for the held-out test features.
y_pred = ridge.predict(X_testpoly)

In [None]:
# Actual vs. predicted alcohol for the ridge model on the test split.
plt.scatter(Y_testpoly, y_pred)
plt.xlabel("Alcohol level: $Y_i$")
# Raw strings: "\h" is not a valid escape sequence in a normal string literal
# and triggers a warning on recent Python versions. The text is unchanged.
plt.ylabel(r"Predicted alcohol levels: $\hat{y}_i$")
plt.title(r"Ridge Regression - Alcohol level vs Predicted Alcohol levels: $Y_i$ vs $\hat{y}_i$")