In [67]:
# Building the multiple linear regression model using stochastic gradient descent
# Importing the libraries
from csv import reader
from random import randrange
from random import seed
from math import sqrt

In [68]:
# Loading the csv file
def load_csv(fileName):
    dataset=list()
    openfile=open(fileName)
    readfile=reader(openfile)
    for row in readfile:
        if not row:
            continue
        dataset.append(row)
    print("-----------------load_csv funtion dataset---------------------")
    print(dataset[0])
    return dataset

In [69]:
# Converting the column string value to float value
def convert_string_to_float(dataset,column):
    for row in dataset:
        row[column]=float(row[column])
    print("-----------------convert string to float value ---------------------")
    print(dataset[0])
        

In [70]:
# dataset values are not in same scale.
# Applying the normalization technique
def minmax(dataset):
    minmax=list()
    for i in range(len(dataset[0])):
        col_value=[row[i] for row in dataset]
        value_min=min(col_value)
        value_max=max(col_value)
        minmax.append([value_min,value_max])
    print("-----------------min and max values for all columns--------------------")
    print("minimum and maximum value :",minmax)
    return minmax

In [71]:
# Normalizing the dataset rescale the dataset to 0-1
def normalize_datset(dataset,minmax):
    for row in dataset:
        for i in range(len(dataset[0])):
            row[i]=(row[i]-minmax[i][0])/(minmax[i][1]-minmax[i][0])
    print("-----------rescaled dataset---------------------------------")
    print(dataset[0])

In [72]:
# Splitting the dataset into k-fold cross validation
def k_fold_cross_validation(dataset,n_fold):
    dataset_split=list()
    dataset_copy=list(dataset)
    fold_size=int(len(dataset)/n_fold)
    for _ in range(n_fold):
        fold=list()
        while len(fold)<fold_size:
            index=randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

In [73]:
# Calculating the root mean square error
# Statical formula rmse=sqrt((sum_error+(predicted x[i]-actual x[i]))/length of actual values)
def rmse_metrics(actual,predicted):
    sum_error=0
    for i in range(len(actual)):
        prediction_error=predicted[i]-actual[i]
        sum_error+=(prediction_error **2)
    mean_error=sum_error/float(len(actual))
    return sqrt(mean_error)

In [74]:
# evaluate an algorithm using cross validation
def evaluate_algorithm(dataset,algorithm,n_fold,*args):
    folds=k_fold_cross_validation(dataset,n_fold)
    scores=list()
    for fold in folds:
        train_set=list(folds)
        train_set.remove(fold)
        train_set=sum(train_set, [])
        test_set=list()
        for row in fold:
            row_copy=list(row)
            test_set.append(row_copy)
            row_copy[-1]=None
        predicted=algorithm(train_set,test_set,*args)
        actual=[row[-1] for row in fold]
        rmse=rmse_metrics(actual,predicted)
        scores.append(rmse)
        return scores

In [75]:
# Make prediction with coefficients
def predict(row,coefficient):
    yhat=coefficient[0]
    for i in range(len(row)-1):
        yhat+=coefficient[i+1]*row[i]
    return yhat

In [76]:
# Estimate linear regression coefficient using stochastic gradient descent
def coefficient_sgd(train,l_rate,n_epoch):
    coef=[0.0 for i in range(len(train[0]))]
    for _ in range(n_epoch):
        for row in train:
            yhat=predict(row,coef)
            error=yhat-row[-1]
            coef[0]=coef[0]-l_rate*error
            for i in range(len(row)-1):
                coef[i+1]=coef[i+1]-l_rate*error*row[i]
    return coef

In [77]:
# Linear regression algorithm with stochastic gradient descent
def linear_regression_sgd(train,test,l_rate,n_epoch):
    predictions=list()
    coef=coefficient_sgd(train,l_rate,n_epoch)
    for row in test:
        yhat=predict(row,coef)
        predictions.append(yhat)
    return(predictions)

In [78]:
# Load dataset
fileName='winequality-white.csv'
dataset=load_csv(fileName)
# convert string to float value
for i in range(len(dataset[0])):
    convert_string_to_float(dataset,i)
    

-----------------load_csv funtion dataset---------------------
['7', '0.27', '0.36', '20.7', '0.045', '45', '170', '1.001', '3', '0.45', '8.8', '6']
-----------------convert string to float value ---------------------
[7.0, '0.27', '0.36', '20.7', '0.045', '45', '170', '1.001', '3', '0.45', '8.8', '6']
-----------------convert string to float value ---------------------
[7.0, 0.27, '0.36', '20.7', '0.045', '45', '170', '1.001', '3', '0.45', '8.8', '6']
-----------------convert string to float value ---------------------
[7.0, 0.27, 0.36, '20.7', '0.045', '45', '170', '1.001', '3', '0.45', '8.8', '6']
-----------------convert string to float value ---------------------
[7.0, 0.27, 0.36, 20.7, '0.045', '45', '170', '1.001', '3', '0.45', '8.8', '6']
-----------------convert string to float value ---------------------
[7.0, 0.27, 0.36, 20.7, 0.045, '45', '170', '1.001', '3', '0.45', '8.8', '6']
-----------------convert string to float value ---------------------
[7.0, 0.27, 0.36, 20.7, 0.0

In [79]:
#Normalize
minmax=minmax(dataset)
normalize_datset(dataset,minmax)

-----------------min and max values for all columns--------------------
('minimum and maximum value :', [[3.8, 14.2], [0.08, 1.1], [0.0, 1.66], [0.6, 65.8], [0.009, 0.346], [2.0, 289.0], [9.0, 440.0], [0.98711, 1.03898], [2.72, 3.82], [0.22, 1.08], [8.0, 14.2], [3.0, 9.0]])
-----------rescaled dataset---------------------------------
[0.30769230769230776, 0.18627450980392157, 0.21686746987951808, 0.308282208588957, 0.10682492581602374, 0.14982578397212543, 0.37354988399071926, 0.26778484673221237, 0.25454545454545446, 0.26744186046511625, 0.12903225806451626, 0.5]


In [80]:
# Evaluate algorithm
n_fold=5
l_rate=0.01
n_epoch=100
scores=evaluate_algorithm(dataset,linear_regression_sgd,n_fold,l_rate,n_epoch)
print('scores: %s'%scores)
print('mean RMSE : %.3f'%(sum(scores)/float(len(scores))))

scores: [0.12829829886675384]
mean RMSE : 0.128
