In [1]:
# Linear Regression With SGD for Wine Quality
from math import sqrt
import numpy as np

In [2]:
from numpy import genfromtxt
# Load the semicolon-separated dataset, skipping the single header row.
# The dtype is set at load time, so no separate astype() conversion is needed
# (astype returns a new array; calling it without using the result does nothing).
data = genfromtxt('winequality-white.csv', delimiter=';', skip_header=1, dtype=float)
print (len(data))

4898


In [3]:
def minmax_norm(dataset):
    """Rescale every column of a 2-D NumPy array to the range [0, 1], in place.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to
    0.0 instead of dividing by zero.

    Args:
        dataset: 2-D ``numpy.ndarray``; normalization is done per column.
            The array is overwritten with the normalized values, so it
            should have a float dtype (writing the results into an integer
            array would truncate them).

    Returns:
        None. The array is modified in place.
    """
    # Nothing to scale (and no first row to inspect) in an empty dataset.
    if len(dataset) == 0:
        return

    col_min = dataset.min(axis=0)
    col_range = dataset.max(axis=0) - col_min
    # Replace zero ranges by 1 so constant columns yield (x - min) / 1 == 0.
    safe_range = np.where(col_range == 0, 1.0, col_range)

    # Slice assignment keeps the caller's array object and updates its contents.
    dataset[:] = (dataset - col_min) / safe_range
            
# Normalize every column of `data` in place (minmax_norm returns nothing).
# NOTE(review): this runs before the train/test split and also rescales the
# last column, which later cells read as the target (row[-1]). The test rows
# therefore contribute to the min/max, and errors are in normalized units.
minmax_norm(data)

In [4]:
def dataset_split(dataset, n_train=3500):
    """Split a 2-D NumPy array into leading training rows and trailing test rows.

    The rows are taken in their existing order; no shuffling is performed.

    Args:
        dataset: 2-D ``numpy.ndarray`` to split by rows.
        n_train: Number of leading rows assigned to the training set
            (default 3500). All remaining rows form the test set.

    Returns:
        Tuple ``(train_set, test_set, y_test)``: the two row slices of
        ``dataset`` and a list holding the last-column value of each test row.
    """
    train_set, test_set = dataset[:n_train, :], dataset[n_train:, :]
    y_test = list(test_set[:, -1])
    return train_set, test_set, y_test

# Split dataset: the first 3500 rows become the training set and the remaining
# rows the test set; y_test holds the last-column value of every test row.
train_set, test_set, y_test=dataset_split(data)

In [5]:
# Make a prediction with coeffs
def predict(row, coeff):
    """Return the linear-model prediction for a single data row.

    ``coeff[0]`` is the intercept and ``coeff[k + 1]`` multiplies ``row[k]``.
    The last element of ``row`` is skipped, so it does not take part in the
    prediction.
    """
    total = coeff[0]
    inputs = row[:-1]  # everything except the trailing element
    for position, value in enumerate(inputs, start=1):
        total += coeff[position] * value
    return total

#stochastic gradient descent
def sgd(train, l_rate, n_epoch):
    """Fit linear-regression coefficients with stochastic gradient descent.

    Every coefficient starts at 0.0. For ``n_epoch`` passes over ``train``,
    each row triggers one update: the prediction error (prediction minus the
    row's last element) scaled by ``l_rate`` is subtracted from the intercept,
    and that same quantity times each input value is subtracted from the
    matching weight.

    Returns a list with one coefficient per element of ``train[0]``: the
    intercept first, followed by one weight per non-final element.
    """
    n_coef = len(train[0])
    weights = [0.0 for _ in range(n_coef)]
    for _ in range(n_epoch):
        for sample in train:
            residual = predict(sample, weights) - sample[-1]
            # Shared factor of every update triggered by this sample.
            step = l_rate * residual
            weights[0] -= step
            for k, value in enumerate(sample[:-1], start=1):
                weights[k] -= step * value
    return weights
 
# Run SGD on the training rows to calculate the coefficients.
# l_rate scales every coefficient update inside sgd; n_epoch is the number of
# full passes over the training rows.
l_rate = 0.01
n_epoch = 70
coef = sgd(train_set, l_rate, n_epoch)

In [6]:
# Calculate root mean squared error
def find_error(y_test, y_pred):
    """Return the root mean squared error between two equally indexed sequences.

    For every index of ``y_test`` the squared difference to ``y_pred`` is
    accumulated; the sum is divided by ``len(y_test)`` and square-rooted.
    """
    n_samples = len(y_test)
    squared_sum = 0.0
    for idx, actual in enumerate(y_test):
        residual = y_pred[idx] - actual
        squared_sum += residual ** 2
    return sqrt(squared_sum / float(n_samples))

# Find predictions: apply the fitted coefficients to every test row.
# predict() skips the last element of each row, so the stored target value
# does not influence the prediction.
y_pred=[]
for row in test_set:
    pred = predict(row, coef)
    y_pred.append(pred)

# Find rmse between the held-out targets and the predictions.
# NOTE(review): both sides come from min-max normalized data, so this value is
# in normalized units; it is a single RMSE over the test rows, not a mean of
# several RMSE values as the printed label suggests.
rmse = find_error(y_test, y_pred)
print('Mean RMSE: %.3f' % (rmse))

Mean RMSE: 0.124
