# Linear Regression

### Q1

In [None]:
import numpy as np
import math
import operator
from scipy.linalg import svd
import scipy.stats as ss
import matplotlib.pyplot as plt
import scipy.linalg as la

x = np.array([[-2], [-5], [-3], [0], [-8], [-2], [1], [5], [-1], [6]])
y = np.array([[1], [-4], [1], [3], [11], [5], [0], [-1], [-3], [1]])

def lse_1(x,y):
    biasF = np.ones(((len(x)), 1))
    X = np.hstack((biasF, x))

    dotX = np.linalg.inv(np.dot(X.T,X))
    dotY = np.dot(X.T,y)

    w = np.dot(dotX, dotY)

    return w
    

w = lse_1(x,y)

Y = np.zeros(shape=(10,1))

for j in range(len(y)):
    Y[j] = (w[1]*x[j] + w[0])

print("Y_pred: \n", Y)

def rmse(predictions, targets):
    return np.sqrt(np.mean((predictions-targets)**2))

rootmean = rmse(Y, y)
print("RMSE: ", rmse(Y, y))


### Q2

In [None]:
# to do

### Q3

In [361]:
import cv2
from cv2 import VideoWriter, VideoWriter_fourcc
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.cm as cm
from collections import Counter
import glob
from matplotlib.image import imread
from enum import Enum


class Sex(Enum):
    male = 1
    female = 2

class Smoker(Enum):
    no = 1
    yes = 2
    
class Region(Enum):
    southwest = 1
    southeast = 2
    northwest = 3
    northeast = 4


np.random.seed(0)
imp_data = np.genfromtxt('insurance.csv', delimiter=',', encoding='utf8', dtype=np.str)
# dataN = data.astype(np.float)

# Separates header and data
feature_name, data = np.vsplit(imp_data, [1])

# x, y = np.hsplit(data, [-1])

n = len(data)

# Rows shuffled
np.random.shuffle(data)

# Calculates array index for splitting
spltIdx = int((2/3)*n)

# Training-validation data split
data_train, data_test = data[:spltIdx,:], data[spltIdx:,:]

# Training data
x_tr, y_tr = np.hsplit(data_train, [-1])
# Testing Data
x_tt, y_tt = np.hsplit(data_test, [-1])

def preprocessCat(dataMain):
    '''Changes categorical features to enumerated ones'''
    
    data = np.copy(dataMain)

    for i in range(len(data)):
        for s in Sex:
            if s.name == data[i][1]:
                data[i][1] = s.value
                break
        
        for sm in Smoker:
            if sm.name == data[i][4]:
                data[i][4] = sm.value
                break

        for r in Region:
            if r.name == data[i][5]:
                data[i][5] = r.value
                break

    return data

def preprocessBin(dataMain):
    '''Changes categorical features to binary features'''
    
    data = np.copy(dataMain)
    binCols = np.ones(((len(data)), 4))
    data = np.hstack((data, binCols))

    for i in range(len(data)):
        for s in Sex:
            if s.name == data[i][1]:
                data[i][1] = s.value
                break
        
        for sm in Smoker:
            if sm.name == data[i][4]:
                data[i][4] = sm.value
                break


        if data[i][5] == 'southwest':
            data[i][6] = 2
        elif data[i][5] == 'southeast':
            data[i][7] = 2
        elif data[i][5] == 'northwest':
            data[i][8] = 2
        elif data[i][5] == 'northeast':
            data[i][9] = 2

    data = np.delete(data, 5, 1)
    return data


def lse(x,y, addBias, isBin):
    '''Calculates weights using LSE'''

    x = x.astype(np.float64)
    y = y.astype(np.float64)

    if addBias:
        biasF = np.ones(((len(x)), 1))
        x = np.hstack((biasF, x))

    if isBin:
        w = np.linalg.pinv(x.transpose().dot(x)).dot(x.transpose()).dot(y)
    else:
        w = np.linalg.inv(x.transpose().dot(x)).dot(x.transpose()).dot(y)

    return w

def noiseGen(data):
    '''Adds noise to data'''
    
    data = data.astype(np.float64)
    noise = np.random.normal(0, .01, data.shape)

    data = data + noise
    return data


def rmse(w, x, y):
    x = x.astype(np.float64)
    y = y.astype(np.float64)

    b0 = w[0]
    other = w[1:]
    prediction = b0

    Y = np.zeros(shape=(y.shape))

    for j in range(len(y)):
        prediction = 0
        prediction += b0

        for i in range(len(other)):
            prediction += other[i] * x[j][i]

        Y[j] = prediction

    return np.sqrt(np.mean((Y-y)**2))




# Change categorical features to enumerated ones (NO BIAS)
d_tr_preC = preprocessCat(x_tr)
w_tr_preC = lse(d_tr_preC, y_tr, False, False)
rmse_tr_preC = rmse(w_tr_preC, d_tr_preC, y_tr)
print("RMSE (TRAINING - ENUM - NO BIAS): ", rmse_tr_preC)

d_tt_preC = preprocessCat(x_tt)
w_tt_preC = lse(d_tt_preC, y_tt, False, False)
rmse_tt_preC = rmse(w_tt_preC, d_tt_preC, y_tt)
print("RMSE (TESTING - ENUM - NO BIAS): ", rmse_tt_preC)

print()

# Change categorical features to enumerated ones (BIAS)
d_tr_preC = preprocessCat(x_tr)
w_tr_preC = lse(d_tr_preC, y_tr, True, False)
rmse_tr_preC = rmse(w_tr_preC, d_tr_preC, y_tr)
print("RMSE (TRAINING - ENUM - BIAS): ", rmse_tr_preC)

d_tt_preC = preprocessCat(x_tt)
w_tt_preC = lse(d_tt_preC, y_tt, True, False)
rmse_tt_preC = rmse(w_tt_preC, d_tt_preC, y_tt)
print("RMSE (TESTING - ENUM - BIAS): ", rmse_tt_preC)

print()
print()

# Change categorical features to binary features (NO BIAS)
d_tr_preC = preprocessBin(x_tr)
d_tr_preC = noiseGen(d_tr_preC)
w_tr_preC = lse(d_tr_preC, y_tr, False, True)
rmse_tr_preC = rmse(w_tr_preC, d_tr_preC, y_tr)
print("RMSE (TRAINING - BIN - NO BIAS): ", rmse_tr_preC)


d_tt_preC = preprocessBin(x_tt)
d_tt_preC = noiseGen(d_tt_preC)
w_tt_preC = lse(d_tt_preC, y_tt, False, True)
rmse_tt_preC = rmse(w_tt_preC, d_tt_preC, y_tt)
print("RMSE (TESTING - BIN - NO BIAS): ", rmse_tt_preC)

print()

# # Change categorical features to binary features (BIAS)
d_tr_preC = preprocessBin(x_tr)
d_tr_preC = noiseGen(d_tr_preC)
w_tr_preC = lse(d_tr_preC, y_tr, True, True)
rmse_tr_preC = rmse(w_tr_preC, d_tr_preC, y_tr)
print("RMSE (TRAINING - BIN - BIAS): ", rmse_tr_preC)


d_tt_preC = preprocessBin(x_tt)
d_tt_preC = noiseGen(d_tt_preC)
w_tt_preC = lse(d_tt_preC, y_tt, True, True)
rmse_tt_preC = rmse(w_tt_preC, d_tt_preC, y_tt)
print("RMSE (TESTING - BIN - BIAS): ", rmse_tt_preC)




RMSE (TRAINING - ENUM - NO BIAS):  151357.23969016664
RMSE (TESTING - ENUM - NO BIAS):  181121.86523476854

RMSE (TRAINING - ENUM - BIAS):  5757.954440690525
RMSE (TESTING - ENUM - BIAS):  6519.373997851638


RMSE (TRAINING - BIN - NO BIAS):  31707.42718070179
RMSE (TESTING - BIN - NO BIAS):  31434.144622561682

RMSE (TRAINING - BIN - BIAS):  5763.9397804300825
RMSE (TESTING - BIN - BIAS):  6502.067257403235
