### 1. Calculate Mean and Variance

In [1]:
# Arithmetic mean of a collection of numbers
# mean(x) = sum(x) / count(x)
def mean(values):
    """Return the sum of ``values`` divided by how many values there are.

    Raises ZeroDivisionError when ``values`` is empty.
    """
    total = sum(values)
    count = float(len(values))
    return total / count

In [2]:
# Sum of squared deviations of a list of numbers about a given mean
# variance(x) = sum( (x - mean(x))^2 )   -- the total is NOT divided by the count
def variance(values, mean):
    """Return the sum of squared differences between each value and ``mean``.

    The total is not normalised by the number of values.
    """
    squared_deviations = ((value - mean) ** 2 for value in values)
    return sum(squared_deviations)

In [3]:
# Toy dataset: each row is an [x, y] pair
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
# Split the rows into separate x and y columns
x = [row[0] for row in dataset]
y = [row[1] for row in dataset]
mean_x, mean_y = mean(x), mean(y)
# variance() returns the sum of squared deviations (it does not divide by the count)
var_x, var_y = variance(x, mean_x), variance(y, mean_y)
# Report both statistics rounded to two decimals
print(f"x stats: mean={round(mean_x,2)}, variance={round(var_x,2)}")
print(f"y stats: mean={round(mean_y,2)}, variance={round(var_y,2)}")

x stats: mean=3.0, variance=10.0
y stats: mean=2.8, variance=8.8


### 2. Calculate Covariance

In [4]:
# Sum of products of paired deviations of x and y about their means
def covariance(x, mean_x, y, mean_y):
    """Return the sum over i of (x[i] - mean_x) * (y[i] - mean_y).

    The total is not divided by the number of pairs. ``y`` is indexed by
    position, so it must have at least as many elements as ``x``.
    """
    covar = 0.0
    for position, x_value in enumerate(x):
        covar += (x_value - mean_x) * (y[position] - mean_y)
    return covar

In [5]:
# Co-deviation total of x and y about their means (covariance() does not divide by the count)
cov = covariance(x, mean_x, y, mean_y)
print(f"Covariance={round(cov,2)}")

Covariance=8.0


### 3. Estimate Coefficients
**slope(θ1) = sum((x(i) - mean(x)) * (y(i) - mean(y))) / sum( (x(i) - mean(x))^2 )**  
**slope(θ1) = covariance(x, y) / variance(x)**

**intercept(θ0) = mean(y) - θ1 * mean(x)**

In [6]:
# Estimate the intercept and slope of the least-squares line
def coefficients(dataset):
    """Return ``[b0, b1]`` for the line y = b0 + b1 * x fitted to ``dataset``.

    Column 0 of each row is read as x and column 1 as y. The slope is the
    ratio covariance(x, y) / variance(x); the intercept follows from the
    two column means.
    """
    xs = [record[0] for record in dataset]
    ys = [record[1] for record in dataset]
    xs_avg = mean(xs)
    ys_avg = mean(ys)
    slope = covariance(xs, xs_avg, ys, ys_avg) / variance(xs, xs_avg)
    intercept = ys_avg - slope * xs_avg
    return [intercept, slope]

In [9]:
# Fit the line to the toy dataset and report intercept (B0) and slope (B1)
dataset = [[1, 1], [2, 3], [4, 3], [3, 2], [5, 5]]
b0, b1 = coefficients(dataset)
print(f'Coefficients: B0={round(b0,2)}, B1={round(b1,2)}')

Coefficients: B0=0.4, B1=0.8


In [326]:
from random import seed
from random import randrange
from csv import reader
from math import sqrt

In [327]:
def load_my_csv(filename):
    """Read a CSV file and return its data rows as lists of strings.

    The first record is treated as a header and discarded, and blank
    lines are skipped. An empty file yields an empty list.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for file objects passed to reader().
    with open(filename, 'r', newline='') as file:
        csv_reader = reader(file)
        # Skip the header through the reader; the None default avoids a
        # StopIteration when the file is empty.
        next(csv_reader, None)
        return [row for row in csv_reader if row]

In [328]:
# Load the insurance rows (header skipped); every field is still a string here
data = load_my_csv('insurance.csv')

In [329]:
data

[['108', '392.5'],
 ['19', '46.2'],
 ['13', '15.7'],
 ['124', '422.2'],
 ['40', '119.4'],
 ['57', '170.9'],
 ['23', '56.9'],
 ['14', '77.5'],
 ['45', '214'],
 ['10', '65.3'],
 ['5', '20.9'],
 ['48', '248.1'],
 ['11', '23.5'],
 ['23', '39.6'],
 ['7', '48.8'],
 ['2', '6.6'],
 ['24', '134.9'],
 ['6', '50.9'],
 ['3', '4.4'],
 ['23', '113'],
 ['6', '14.8'],
 ['9', '48.7'],
 ['9', '52.1'],
 ['3', '13.2'],
 ['29', '103.9'],
 ['7', '77.5'],
 ['4', '11.8'],
 ['20', '98.1'],
 ['7', '27.9'],
 ['4', '38.1'],
 ['0', '0'],
 ['25', '69.2'],
 ['6', '14.6'],
 ['5', '40.3'],
 ['22', '161.5'],
 ['11', '57.2'],
 ['61', '217.6'],
 ['12', '58.1'],
 ['4', '12.6'],
 ['16', '59.6'],
 ['13', '89.9'],
 ['60', '202.4'],
 ['41', '181.3'],
 ['37', '152.8'],
 ['55', '162.8'],
 ['41', '73.4'],
 ['11', '21.3'],
 ['27', '92.6'],
 ['8', '76.1'],
 ['3', '39.9'],
 ['17', '142.1'],
 ['13', '93'],
 ['13', '31.9'],
 ['15', '32.1'],
 ['8', '55.6'],
 ['29', '133.3'],
 ['30', '194.5'],
 ['24', '137.9'],
 ['9', '87.4'],
 ['31', 

In [330]:
# Convert one column of every row from string to float, in place
def str_column_to_float_colummn(dataset, column):
    """Overwrite ``row[column]`` in each row of ``dataset`` with its float value.

    Surrounding whitespace is stripped before conversion. The rows are
    modified in place and nothing is returned.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

In [331]:
# Convert every column to float; the column count is taken from the first row
for i in range(len(data[0])):
    str_column_to_float_colummn(data, i)

In [332]:
data

[[108.0, 392.5],
 [19.0, 46.2],
 [13.0, 15.7],
 [124.0, 422.2],
 [40.0, 119.4],
 [57.0, 170.9],
 [23.0, 56.9],
 [14.0, 77.5],
 [45.0, 214.0],
 [10.0, 65.3],
 [5.0, 20.9],
 [48.0, 248.1],
 [11.0, 23.5],
 [23.0, 39.6],
 [7.0, 48.8],
 [2.0, 6.6],
 [24.0, 134.9],
 [6.0, 50.9],
 [3.0, 4.4],
 [23.0, 113.0],
 [6.0, 14.8],
 [9.0, 48.7],
 [9.0, 52.1],
 [3.0, 13.2],
 [29.0, 103.9],
 [7.0, 77.5],
 [4.0, 11.8],
 [20.0, 98.1],
 [7.0, 27.9],
 [4.0, 38.1],
 [0.0, 0.0],
 [25.0, 69.2],
 [6.0, 14.6],
 [5.0, 40.3],
 [22.0, 161.5],
 [11.0, 57.2],
 [61.0, 217.6],
 [12.0, 58.1],
 [4.0, 12.6],
 [16.0, 59.6],
 [13.0, 89.9],
 [60.0, 202.4],
 [41.0, 181.3],
 [37.0, 152.8],
 [55.0, 162.8],
 [41.0, 73.4],
 [11.0, 21.3],
 [27.0, 92.6],
 [8.0, 76.1],
 [3.0, 39.9],
 [17.0, 142.1],
 [13.0, 93.0],
 [13.0, 31.9],
 [15.0, 32.1],
 [8.0, 55.6],
 [29.0, 133.3],
 [30.0, 194.5],
 [24.0, 137.9],
 [9.0, 87.4],
 [31.0, 209.8],
 [14.0, 95.5],
 [53.0, 244.6],
 [26.0, 187.5]]

In [334]:
# Shallow copy: a new outer list that still references the same row lists as `data`
dataset_copy = list(data)
dataset_copy

[[108.0, 392.5],
 [19.0, 46.2],
 [13.0, 15.7],
 [124.0, 422.2],
 [40.0, 119.4],
 [57.0, 170.9],
 [23.0, 56.9],
 [14.0, 77.5],
 [45.0, 214.0],
 [10.0, 65.3],
 [5.0, 20.9],
 [48.0, 248.1],
 [11.0, 23.5],
 [23.0, 39.6],
 [7.0, 48.8],
 [2.0, 6.6],
 [24.0, 134.9],
 [6.0, 50.9],
 [3.0, 4.4],
 [23.0, 113.0],
 [6.0, 14.8],
 [9.0, 48.7],
 [9.0, 52.1],
 [3.0, 13.2],
 [29.0, 103.9],
 [7.0, 77.5],
 [4.0, 11.8],
 [20.0, 98.1],
 [7.0, 27.9],
 [4.0, 38.1],
 [0.0, 0.0],
 [25.0, 69.2],
 [6.0, 14.6],
 [5.0, 40.3],
 [22.0, 161.5],
 [11.0, 57.2],
 [61.0, 217.6],
 [12.0, 58.1],
 [4.0, 12.6],
 [16.0, 59.6],
 [13.0, 89.9],
 [60.0, 202.4],
 [41.0, 181.3],
 [37.0, 152.8],
 [55.0, 162.8],
 [41.0, 73.4],
 [11.0, 21.3],
 [27.0, 92.6],
 [8.0, 76.1],
 [3.0, 39.9],
 [17.0, 142.1],
 [13.0, 93.0],
 [13.0, 31.9],
 [15.0, 32.1],
 [8.0, 55.6],
 [29.0, 133.3],
 [30.0, 194.5],
 [24.0, 137.9],
 [9.0, 87.4],
 [31.0, 209.8],
 [14.0, 95.5],
 [53.0, 244.6],
 [26.0, 187.5]]

In [338]:
# Number of rows loaded from insurance.csv. Uses `data`, the list built in
# this section; `dataset` is bound to the 5-row toy list by the cells above.
len(data)

63

In [348]:
# Preview the (fractional) number of training rows for a 95% split of the
# insurance rows held in `data`
train_size = .95 * len(data)
train_size

59.849999999999994

In [349]:
def train_test_split_data(dataset, split):
    """Randomly partition ``dataset`` into a train list and a test list.

    ``split`` is the fraction of rows (0 to 1) placed in the train list; a
    fractional row count is rounded up. Rows are drawn without replacement
    using ``randrange``, so the result depends on the ``random`` module's
    state. ``dataset`` itself is not modified.

    Returns ``(train, test)``. Raises ValueError if ``split`` is outside
    the range 0 to 1.
    """
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got %r" % (split,))
    train = list()
    # Round away binary floating-point noise so that e.g. 0.55 * 100
    # (55.00000000000001) asks for 55 rows rather than 56.
    train_size = round(split * len(dataset), 9)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

In [350]:
# Put 95% of the rows in the training set; the remainder is the test set
train_data,test_data = train_test_split_data(data,.95)

In [351]:
train_data

[[45.0, 214.0],
 [48.0, 248.1],
 [11.0, 23.5],
 [53.0, 244.6],
 [12.0, 58.1],
 [24.0, 134.9],
 [9.0, 48.7],
 [8.0, 55.6],
 [4.0, 11.8],
 [41.0, 73.4],
 [16.0, 59.6],
 [6.0, 14.8],
 [0.0, 0.0],
 [7.0, 27.9],
 [4.0, 38.1],
 [14.0, 77.5],
 [7.0, 77.5],
 [9.0, 52.1],
 [29.0, 133.3],
 [27.0, 92.6],
 [5.0, 20.9],
 [24.0, 137.9],
 [15.0, 32.1],
 [23.0, 56.9],
 [11.0, 57.2],
 [13.0, 15.7],
 [11.0, 21.3],
 [57.0, 170.9],
 [55.0, 162.8],
 [3.0, 4.4],
 [6.0, 50.9],
 [37.0, 152.8],
 [40.0, 119.4],
 [41.0, 181.3],
 [60.0, 202.4],
 [31.0, 209.8],
 [6.0, 14.6],
 [124.0, 422.2],
 [17.0, 142.1],
 [3.0, 39.9],
 [3.0, 13.2],
 [30.0, 194.5],
 [10.0, 65.3],
 [25.0, 69.2],
 [4.0, 12.6],
 [22.0, 161.5],
 [7.0, 48.8],
 [14.0, 95.5],
 [26.0, 187.5],
 [23.0, 113.0],
 [19.0, 46.2],
 [108.0, 392.5],
 [5.0, 40.3],
 [23.0, 39.6],
 [2.0, 6.6],
 [20.0, 98.1],
 [8.0, 76.1],
 [29.0, 103.9],
 [61.0, 217.6],
 [13.0, 93.0]]

In [352]:
test_data

[[13.0, 89.9], [13.0, 31.9], [9.0, 87.4]]

In [353]:
# Fit a line on the training rows and predict y for each test row
def simple_linear_regression_data(train, test):
    """Return a list of predictions, one per row of ``test``.

    The intercept and slope come from ``coefficients(train)``; each
    prediction is intercept + slope * row[0].
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * row[0] for row in test]

In [354]:
# Fit on train_data and predict the y value of every row in test_data
prediction = simple_linear_regression_data(train_data,test_data)

In [355]:
prediction

[63.73514871839914, 63.73514871839914, 50.02501447065356]

In [356]:
# True target values: the last column of each test row
actual = [row[-1] for row in test_data]

In [357]:
actual

[89.9, 31.9, 87.4]

### Calculating RMSE (Root Mean Squared Error)

In [358]:
# Root mean squared error between actual and predicted values
def rmse_metric_data(actual, predicted):
    """Return the square root of the mean squared prediction error.

    Errors are computed pairwise by position over the length of
    ``actual``; ``predicted`` is indexed with the same positions.
    """
    squared_total = 0.0
    for position, observed in enumerate(actual):
        residual = predicted[position] - observed
        squared_total += residual ** 2
    return sqrt(squared_total / float(len(actual)))

In [359]:
# Score the predictions against the true test values
rmse = rmse_metric_data(actual,prediction)

In [360]:
rmse

32.11939019673183

In [263]:
# Simple Linear Regression on the Swedish Insurance Dataset
from random import seed
from random import randrange
from csv import reader
from math import sqrt

# Load a CSV file
def load_csv(filename):
    """Read a CSV file and return its data rows as lists of strings.

    The first record is treated as a header and discarded, and blank
    lines are skipped. An empty file yields an empty list.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires for file objects passed to reader().
    with open(filename, 'r', newline='') as file:
        csv_reader = reader(file)
        # Skip the header through the reader; the None default avoids a
        # StopIteration when the file is empty.
        next(csv_reader, None)
        return [row for row in csv_reader if row]

# Convert one column of every row from string to float, in place
def str_column_to_float(dataset, column):
    """Overwrite ``row[column]`` in each row of ``dataset`` with its float value.

    Surrounding whitespace is stripped before conversion. The rows are
    modified in place and nothing is returned.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

# Split a dataset into a train and test set
def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into a train list and a test list.

    ``split`` is the fraction of rows (0 to 1) placed in the train list; a
    fractional row count is rounded up. Rows are drawn without replacement
    using ``randrange``, so the result depends on the ``random`` module's
    state. ``dataset`` itself is not modified.

    Returns ``(train, test)``. Raises ValueError if ``split`` is outside
    the range 0 to 1.
    """
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got %r" % (split,))
    train = list()
    # Round away binary floating-point noise so that e.g. 0.55 * 100
    # (55.00000000000001) asks for 55 rows rather than 56.
    train_size = round(split * len(dataset), 9)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy

# Root mean squared error between actual and predicted values
def rmse_metric(actual, predicted):
    """Return the square root of the mean squared prediction error.

    Errors are computed pairwise by position over the length of
    ``actual``; ``predicted`` is indexed with the same positions.
    """
    squared_total = 0.0
    for position, observed in enumerate(actual):
        residual = predicted[position] - observed
        squared_total += residual ** 2
    return sqrt(squared_total / float(len(actual)))

# Score an algorithm with a single random train/test split
def evaluate_algorithm(dataset, algorithm, split, *args):
    """Return the RMSE of ``algorithm`` on a held-out part of ``dataset``.

    The dataset is divided with ``train_test_split(dataset, split)``. The
    algorithm is called as ``algorithm(train, test_set, *args)``, where
    ``test_set`` holds copies of the test rows whose last column is set to
    None so the true target is not visible to it. Its predictions are
    compared against the last column of the original test rows.
    """
    def _hide_target(row):
        # Copy the row so the original keeps its true target value
        masked = list(row)
        masked[-1] = None
        return masked

    train, test = train_test_split(dataset, split)
    test_set = [_hide_target(row) for row in test]
    predicted = algorithm(train, test_set, *args)
    actual = [row[-1] for row in test]
    return rmse_metric(actual, predicted)

# Arithmetic mean of a collection of numbers
def mean(values):
    """Return the sum of ``values`` divided by how many values there are.

    Raises ZeroDivisionError when ``values`` is empty.
    """
    total = sum(values)
    count = float(len(values))
    return total / count

# Sum of products of paired deviations of x and y about their means
def covariance(x, mean_x, y, mean_y):
    """Return the sum over i of (x[i] - mean_x) * (y[i] - mean_y).

    The total is not divided by the number of pairs. ``y`` is indexed by
    position, so it must have at least as many elements as ``x``.
    """
    covar = 0.0
    for position, x_value in enumerate(x):
        covar += (x_value - mean_x) * (y[position] - mean_y)
    return covar

# Sum of squared deviations of a list of numbers about a given mean
def variance(values, mean):
    """Return the sum of squared differences between each value and ``mean``.

    The total is not normalised by the number of values.
    """
    squared_deviations = ((value - mean) ** 2 for value in values)
    return sum(squared_deviations)

# Estimate the intercept and slope of the least-squares line
def coefficients(dataset):
    """Return ``[b0, b1]`` for the line y = b0 + b1 * x fitted to ``dataset``.

    Column 0 of each row is read as x and column 1 as y. The slope is the
    ratio covariance(x, y) / variance(x); the intercept follows from the
    two column means.
    """
    xs = [record[0] for record in dataset]
    ys = [record[1] for record in dataset]
    xs_avg = mean(xs)
    ys_avg = mean(ys)
    slope = covariance(xs, xs_avg, ys, ys_avg) / variance(xs, xs_avg)
    intercept = ys_avg - slope * xs_avg
    return [intercept, slope]

# Fit a line on the training rows and predict y for each test row
def simple_linear_regression(train, test):
    """Return a list of predictions, one per row of ``test``.

    The intercept and slope come from ``coefficients(train)``; each
    prediction is intercept + slope * row[0].
    """
    intercept, slope = coefficients(train)
    return [intercept + slope * row[0] for row in test]

# Simple linear regression on insurance dataset
# Seed the random module so the random train/test split is repeatable
seed(1)
# load and prepare data
filename = 'insurance.csv'
dataset = load_csv(filename)
# Convert every column from string to float; the column count is taken from the first row
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
# evaluate algorithm
# Fraction of rows used for training; the remainder is held out for testing
split = 0.8
rmse = evaluate_algorithm(dataset, simple_linear_regression, split)
print('RMSE: %.3f' % (rmse))

RMSE: 33.619


In [361]:
import pandas as pd

# Load the same CSV file into a pandas DataFrame
df = pd.read_csv("insurance.csv")

In [362]:
df.head(3)

Unnamed: 0,X,Y
0,108,392.5
1,19,46.2
2,13,15.7


In [363]:
# Features: every column except the last, kept as a 2-D array
X = df.iloc[:,:-1].values
# Target: the column at position 1
# NOTE(review): assumes the frame has exactly two columns so that position 1 is the last one - confirm
y = df.iloc[:,1].values

In [364]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; random_state makes the shuffle repeatable.
# NOTE: this train_test_split is the scikit-learn function imported in this cell,
# which rebinds the name of the hand-written train_test_split defined earlier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

In [365]:
from sklearn.linear_model import LinearRegression

# Fit scikit-learn's linear regression on the training rows
regression = LinearRegression()
regression.fit(X_train,y_train)

In [366]:
# Predict the target for the held-out rows and display the predictions
y_pred = regression.predict(X_test) 
y_pred

array([154.70477387, 104.12562553,  87.26590942,  50.17453398,
        33.31481787, 124.35728487, 222.14363831,  43.43064754,
        43.43064754, 117.61339842,  97.38173909,  40.05870431,
        67.03425009,  56.91842043,  33.31481787, 181.68031964,
        29.94287465, 100.75368231,  40.05870431,  36.68676109,
        94.00979587,  26.57093143,  46.80259076,  29.94287465,
       110.86951198,  43.43064754])

In [367]:
from sklearn.metrics import mean_squared_error
import numpy as np

# RMSE: the square root of the mean squared error on the held-out rows
print(np.sqrt(mean_squared_error(y_test,y_pred)))

33.10290863044847
