# Chapter 07 - Simple Linear Regression

In [12]:
from random import seed
from Codes.ch01_load_and_convert_data import load_csv, str_column_to_float
from Codes.ch03_resampling_methods import train_test_split
from Codes.ch04_evaluation_metrics import rmse_metric
from Codes.ch06_algorithm_test_harnesses import evaluate_algorithm_train_test_reg

* Hypothesis:

    $y = b_0 + b_1 * x$

* Coefficients:

    $b_1 = \frac{\sum{((x_i - mean(x)) * (y_i - mean(y)))}}{\sum{(x_i - mean(x))^2}}$

    $b_0 = mean(y) - b_1 * mean(x)$


### Calculate mean and variance

$mean = \frac{\sum{x_i}}{count(x)}$

In [2]:
# Calculate the mean value of a list of numbers
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    The total of the values is divided by their count, so an empty
    ``values`` raises ``ZeroDivisionError``.
    """
    total = sum(values)
    count = float(len(values))
    return total / count

$variance = \sum{(x_i - mean(x))^2}$

In [3]:
# Calculate the variance of a list of numbers
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    Note: the total is NOT divided by the number of values, so this is the
    unnormalised sum of squares rather than a per-sample variance.
    The ``mean`` parameter is the precomputed centre to subtract from every value.
    """
    squared_deviations = ((value - mean) ** 2 for value in values)
    return sum(squared_deviations)

In [4]:
# Calculate mean and variance
dataset = [
    [1, 1],
    [2, 3],
    [4, 3],
    [3, 2],
    [5, 5],
]
# Split the two-column rows into the input (x) and output (y) columns
x = [x_i for x_i, _ in dataset]
y = [y_i for _, y_i in dataset]
mean_x = mean(x)
mean_y = mean(y)
var_x = variance(x, mean_x)
var_y = variance(y, mean_y)
print('x stats: mean=%.3f variance=%.3f' % (mean_x, var_x))
print('y stats: mean=%.3f variance=%.3f' % (mean_y, var_y))

x stats: mean=3.000 variance=10.000
y stats: mean=2.800 variance=8.800


### Calculate Covariance

$covariance = \sum{((x_i - mean(x)) * (y_i - mean(y)))}$

In [5]:
# Calculate covariance between x and y
def covariance(x, mean_x, y, mean_y):
    """Return the sum of products of the paired deviations of ``x`` and ``y``.

    Each ``x`` value is paired with the ``y`` value at the same index, and
    ``(x_i - mean_x) * (y_i - mean_y)`` is accumulated over every index of
    ``x``. The total is not divided by the number of pairs. A ``y`` shorter
    than ``x`` raises ``IndexError``; extra trailing ``y`` values are ignored.
    """
    total = 0.0
    for idx, x_val in enumerate(x):
        total += (x_val - mean_x) * (y[idx] - mean_y)
    return total

In [6]:
# Calculate covariance
dataset = [
    [1, 1],
    [2, 3],
    [4, 3],
    [3, 2],
    [5, 5],
]
# Split the two-column rows into the input (x) and output (y) columns
x = [x_i for x_i, _ in dataset]
y = [y_i for _, y_i in dataset]
mean_x = mean(x)
mean_y = mean(y)
covar = covariance(x, mean_x, y, mean_y)
print(' Covariance: %.3f ' % covar)

 Covariance: 8.000 


### Estimate Coefficients

#### Estimating $b_1$
$b_1 = \frac{\sum{((x_i - mean(x)) * (y_i - mean(y)))}}{\sum{(x_i - mean(x))^2}}$

Simplify $b_1$

$b_1 = \frac{covariance(x, y)}{variance(x)}$

#### Estimating $b_0$

$b_0 = mean(y) - b_1 * mean(x)$

In [7]:
# Calculate coefficients
def coefficients(dataset):
    """Estimate the intercept and slope of the line ``y = b0 + b1 * x``.

    The first element of every row in ``dataset`` is used as x and the
    second as y. The slope is ``covariance(x, y) / variance(x)`` and the
    intercept is ``mean(y) - slope * mean(x)``.

    Returns a two-element list ``[b0, b1]`` (intercept first, then slope).
    """
    x_values = [row[0] for row in dataset]
    y_values = [row[1] for row in dataset]
    x_bar = mean(x_values)
    y_bar = mean(y_values)
    slope = covariance(x_values, x_bar, y_values, y_bar) / variance(x_values, x_bar)
    intercept = y_bar - slope * x_bar
    return [intercept, slope]

In [8]:
# Estimate the coefficients for the small contrived dataset
dataset = [
    [1, 1],
    [2, 3],
    [4, 3],
    [3, 2],
    [5, 5],
]
[b0, b1] = coefficients(dataset)
print('Coefficients: B0=%.3f, B1=%.3f' % (b0, b1))

Coefficients: B0=0.400, B1=0.800


### Make predictions

Simple linear regression equation:
    
$y = b_0 + b_1 * x$

In [9]:
# Evaluate regression algorithm on training dataset
def evaluate_algorithm(dataset, algorithm):
    """Score ``algorithm`` by predicting the same rows it is trained on.

    A copy of every row is made with its last element replaced by ``None``.
    ``algorithm(dataset, test_set)`` is called with the original rows and
    those copies, the returned predictions are printed, and they are
    compared with the last element of each original row via ``rmse_metric``.

    Returns whatever ``rmse_metric(actual, predicted)`` returns.
    """
    def _hide_target(row):
        # Copy first so the caller's rows are left untouched.
        masked = list(row)
        masked[-1] = None
        return masked

    test_set = [_hide_target(row) for row in dataset]
    predicted = algorithm(dataset, test_set)
    print(predicted)
    actual = [row[-1] for row in dataset]
    return rmse_metric(actual, predicted)

# Simple linear regression algorithm
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict a value for every row of ``test``.

    The coefficients come from ``coefficients(train)``; each prediction is
    ``b0 + b1 * row[0]``, so only the first element of a test row is read.

    Returns the predictions as a list, in the order of ``test``.
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * row[0] for row in test]

In [10]:
# Test simple linear regression
dataset = [
    [1, 1],
    [2, 3],
    [4, 3],
    [3, 2],
    [5, 5],
]
# Train and score on the same rows, then report the error
rmse = evaluate_algorithm(dataset, simple_linear_regression)
print('RMSE: %.3f' % rmse)

[1.1999999999999995, 1.9999999999999996, 3.5999999999999996, 2.8, 4.3999999999999995]
RMSE: 0.693


### Swedish Auto Insurance Case Study

In [13]:
# Simple linear regression on insurance dataset
# Seed Python's `random` module before anything else runs
# (presumably so the train/test split is repeatable — confirm in train_test_split).
seed(1)

# load and prepare data
filename = './data/insurance.csv'  # relative path: resolved against the current working directory
dataset = load_csv(filename)
# Call str_column_to_float once for every column index found in the first row
# (presumably converts that column from strings to floats in place — confirm in the ch01 helper).
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# evaluate algorithm
split = 0.6  # handed to the harness; presumably the fraction of rows used for training — confirm
rmse = evaluate_algorithm_train_test_reg(dataset, simple_linear_regression, split)
print('RMSE: %.3f'%(rmse))

RMSE: 33.630
