In [1]:
# Multiple Regression

In [2]:
def predict(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row)-1):
        yhat += coefficients[i+1] * row[i] 
    return yhat    

In [3]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
coef = [0.4, 0.8]

In [4]:
for row in dataset:
    yhat = predict(row, coef)
    print("Expected=%.3f, Predicted=%.3f" % (row[-1], yhat))

Expected=1.000, Predicted=1.200
Expected=3.000, Predicted=2.000
Expected=3.000, Predicted=3.600
Expected=2.000, Predicted=2.800
Expected=5.000, Predicted=4.400


# Estimating the Coefiicents
- Learning-Rate:
- epochs: to update the coeff for each iteration

In [5]:
# prediction
def predict(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row)-1):
        yhat += coefficients[i+1] * row[i]      
    return yhat

# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    coef =  [0.0 for i  in range(len(train[0]))] # intial coef zero
    print("intial coef", coef)
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            yhat = predict(row, coef)
            error = yhat - row[-1]
            sum_error += error**2
            coef[0] = coef[0] - l_rate * error # intia slope or bias
            for i in range(len(row)-1):
                coef[i + 1] = coef[i + 1] - l_rate * error * row[i] # continously updating the coef
                print("coef updated",coef)
                print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
    return coef           


In [6]:
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
l_rate = 0.001
n_epoch = 1
coef = coefficients_sgd(dataset, l_rate, n_epoch)
coef

intial coef [0.0, 0.0]
coef updated [0.001, 0.001]
>epoch=0, lrate=0.001, error=1.000
coef updated [0.0039970000000000006, 0.006994]
>epoch=0, lrate=0.001, error=9.982
coef updated [0.006965027, 0.018866108]
>epoch=0, lrate=0.001, error=18.791
coef updated [0.008901463649000001, 0.024675417947]
>epoch=0, lrate=0.001, error=22.541
coef updated [0.013769185095616001, 0.049014025180079995]
>epoch=0, lrate=0.001, error=46.236


[0.013769185095616001, 0.049014025180079995]

# Wine case study

In [23]:
# Linear Regression With Stochastic Gradient Descent for Wine Quality
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Load a CSV file
def load_csv(filename):
	dataset = list()
	with open(filename, 'r') as file:
		csv_reader = reader(file)
		for row in csv_reader:
			if not row:
				continue
			dataset.append(row)
	return dataset


# Convert string column to float
def str_column_to_float(dataset, column):
	for row in dataset:
		row[column] = float(row[column].strip())
        
# Find the min and max values for each column
def dataset_minmax(dataset):
	minmax = list()
	for i in range(len(dataset[0])):
		col_values = [row[i] for row in dataset]
		value_min = min(col_values)
		value_max = max(col_values)
		minmax.append([value_min, value_max])
	return minmax


# Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
	for row in dataset:
		for i in range(len(row)):
			row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

# split dataset
def cross_validation(dataset, n_folds):
    dataset_split= []
    dataset_copy = list(dataset)
    fold_size = int(len(dataset)/ n_folds)
    for i in range(n_folds):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split  


# Calculate root mean squared error
def rmse_metric(actual, predicted):
	sum_error = 0.0
	for i in range(len(actual)):
		prediction_error = predicted[i] - actual[i]
		sum_error += (prediction_error ** 2)
	mean_error = sum_error / float(len(actual))
	return sqrt(mean_error)

# evaluate the algorithm
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation(dataset, n_folds)
    scores = []
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])
        test_set = []
        for row in fold:
            row_copy= list(row)
            test_set.append(row_copy)
            predicted = algorithm(train_set, test_set, * args)
            actual =  [row[-1] for row in fold]
            rmse = rmse_metric(actual, predicted)
            scores.append(rmse)
    return scores       
        
# make prediction
def predict(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row)-1):
        yhat += coefficients[i+1] * row[i]
    return yhat 


# estimate the GDecent
def coefficients_sgd(train , l_rate, n_epoch):
    coef = [0.0 for row in range(len(train))]
    for epoch in range((n_epoch)):
        for row in train:
            yhat = predict(row, coef)
            error = yhat - row[-1]
            coef[0] = coef[0] -  l_rate* error
            for i in range(len(row)-1):
                coef[i+1] = coef[i+1] - l_rate * error
    return coef           
   
# define linear regreesion  
def linear_regression_sgd(train, test, l_rate, n_epoch):
    predictions = []
    coef = coefficients_sgd(train, l_rate, n_epoch)
    for row in test:
        yhat = predict(row, coef)
        predictions.append(yhat)
    return predictions    
       

In [29]:
seed(1)
filename = "winequality-white.csv"
# evaluate algorithm
n_folds = 5
l_rate = 0.01
n_epoch = 50
scores = evaluate_algorithm(dataset, linear_regression_sgd, n_folds, l_rate, n_epoch)
scores
print('Scores: %s' % scores)
print('Mean RMSE: %.3f' % (sum(scores)/float(len(scores))))

Scores: [1.0368095938244113, 0.4458267150634163, 1.000453796064074, 0.6681257406730183, 1.1730053381876813]
Mean RMSE: 0.865
