In [8]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge

In [12]:
def rmse(pred, actual):
    return np.sqrt(mean_squared_error(pred, actual))

### Import our train data

In [6]:
train = pd.read_csv('data/train.csv')

### Simple Linear Regression

In [19]:
kfold = KFold(n_splits=5)
err = []

for train_idx, test_idx in kfold.split(train):
    train_x = train.iloc[train_idx, :-1]
    train_y = train.iloc[train_idx, -1]
    test_x = train.iloc[test_idx, :-1]
    test_y = train.iloc[test_idx, -1]
    
    linreg = LinearRegression()
    linreg.fit(train_x, train_y)
    y_pred = linreg.predict(test_x)
    err.append(rmse(y_pred, test_y))

print("RMSE: ", err)
print("Average RMSE: ", np.mean(err))

RMSE:  [183.51993345808003, 190.8911706815273, 294.9018827362626, 303.3079108752656, 306.1510613309703]
Average RMSE:  255.7543918164212


### Now, we try higher degrees to improve our accuracy

In [20]:
deg = [2,3,4,5,6]

for d in deg:
    
    err = []

    for train_idx, test_idx in kfold.split(train):
        
        poly = PolynomialFeatures(degree=d)
        
        train_x = train.iloc[train_idx, :-1]
        train_y = train.iloc[train_idx, -1]
        test_x = train.iloc[test_idx, :-1]
        test_y = train.iloc[test_idx, -1]
        
        train_x = poly.fit_transform(train_x)
        test_x = poly.transform(test_x)

        linreg = LinearRegression()
        linreg.fit(train_x, train_y)
        y_pred = linreg.predict(test_x)
        err.append(rmse(y_pred, test_y))

    print("RMSE for degree ", d, ": ", err)
    print("Average RMSE for degree ", d, ": ", np.mean(err))
    print()

RMSE for degree  2 :  [88.41748455524154, 89.21247619975938, 251.24691769466733, 266.33273255144263, 268.1805491121271]
Average RMSE for degree  2 :  192.6780320226476

RMSE for degree  3 :  [103.28033356035864, 101.12303204721815, 241.87885584201877, 257.99370919548073, 267.2928094051513]
Average RMSE for degree  3 :  194.31374801004552

RMSE for degree  4 :  [4906.892975863304, 192.4328681635377, 299.20256565444356, 2948.953166201464, 330.4197492023406]
Average RMSE for degree  4 :  1735.5802650170183

RMSE for degree  5 :  [1531.5930070275613, 759.5850068539391, 475.9962488494986, 6459.83894257086, 643.4994768632502]
Average RMSE for degree  5 :  1974.1025364330217

RMSE for degree  6 :  [8708.351399615136, 531.4982330515871, 524.6226320234839, 2429.6345017642293, 1652.9240092441362]
Average RMSE for degree  6 :  2769.4061551397144



<p>The RMSE for lower degrees is decent. However, it exploded from 4 onwards due to high variance.</p>
<p>To prevent overfitting while lowering bias, we can include a regularisation parameter. This can be achieved with the Ridge Regresssion.</p>

### Ridge Regression with GridSearch

In [28]:
x = np.eye(5,4,3)
x.put(np.array([1,2,3]), values=(0,1))
print(x[0,1])
print(np.mean(x, axis=2))

0.0


IndexError: tuple index out of range