## Linear Regression on Wine Quality Prediction

In this practice session we are going to train Linear regression model using stochastic gradient descent on wine quality dataset

Reference : https://machinelearningmastery.com/implement-linear-regression-stochastic-gradient-descent-scratch-python/

### Importing required packages

In [20]:
from random import seed
from random import randrange
from csv import reader
from math import sqrt



### Loading CV dataset

In [21]:
def loadingCSV(filename):
    dataset = list()
    with open(filename, 'r') as file :
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset

### Converting tring column to float

In [22]:
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column].strip())

### Finding minimum and maximum values for each column

In [23]:
def dataset_minmax(dataset):
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax


        

### Rescale dataset columns to range of 0 - 1

In [24]:
def normalize_dataset(dataset, minmax):
    for row in dataset:
        for i in range(len(row)):
            row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
        

### Split a dataset into k-folds 

In [25]:
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
        
    return dataset_split
        

### Calculate root mean squared error

In [26]:
def rmse_metric(actual, predicted):
    sum_error = 0.0
    for i in range(len(actual)):
        prediction_error = predicted[i] - actual[i]
        sum_error += (prediction_error** 2)
    mean_error = sum_error/ float(len(actual))
    
    return sqrt(mean_error)

### Evaluating the algorithm using cross validation split

In [27]:
def evaluate_algorithm(dataset, algorithm, n_fold, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        train_set = list()
        
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            row_copy[-1] = None
        
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        rmse = rmse_metric(actual, predicted)
        scores.append(rmse)
    return scores
    

### Make prediction with coefficients 

In [28]:
def predict(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row) -1):
        yhat += coefficients[i + 1] * row[i]
        
    return yhat

### Estimate Linear Regression using Stochastic gradiend descent



In [29]:
def coefficients_sgd(train, l_rate, n_epoch):
    coef = [ 0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        for row in train:
            yhat = predict(row, coef)
            error = yhat - row[-1]
            coef[0] = coef[0] - l_rate * error
            for i in range(len(row) -1):
                coef[i + 1] = coef[i+1] - l_rate * error * row[i]
                
            print("From Coefficients_sgd function ", l_rate, n_epoch, error)
    return coef       

### Linear Regression Algorithm using SGD

In [30]:
def linear_regression_sgd(train, test, l_rate, n_epoch):
    predictions = list()
    coef = coefficients_sgd(train, l_rate, n_epoch)
    for row in test:
        yhat = predict(row, coef)
        predictions.append(yhat)
    return(predictions)
                       

In [32]:
import numpy as np
import pandas as pd
import sklearn

### Linear Regression on wine quality dataset

In [31]:
seed(1)
## Loading and preparing dataset
filename = "winequality-white.csv"
dataset = loadingCSV(filename)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
    
## Normalize
minmax = datset_minmax(dataset)
normalize_datset(dataset, minmax)

## Evaluate algorithm
n_folds = 5
l_rate = 0.01
n_epoch = 50

scores = evaluate_algorithm(dataset, linear_regression_sgd, n_folds, l_rate, n_epoch)

print("Scores : %s" %scores)
print("Mean RMSE: %.3f" %(sum(scores)/float(len(scores))))

ValueError: could not convert string to float: 'fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'

In [33]:
dataset = pd.read_csv("winequality-white.csv")

In [34]:
dataset.head()

Unnamed: 0,"fixed acidity;""volatile acidity"";""citric acid"";""residual sugar"";""chlorides"";""free sulfur dioxide"";""total sulfur dioxide"";""density"";""pH"";""sulphates"";""alcohol"";""quality"""
0,7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6
1,6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...
2,8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...
3,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...
4,7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...


In [36]:
dataset.shape

(4898, 1)

In [38]:
dataset.describe

<bound method NDFrame.describe of      fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"
0      7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6                                                                                                                     
1     6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9...                                                                                                                     
2     8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;1...                                                                                                                     
3     7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...                                                                                                                     
4     7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4...                                              

In [39]:
dataset.columns

Index(['fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'], dtype='object')

In [40]:
from sklearn.feature_extraction.text import CountVectorizer