In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge

In [2]:
def rmse(pred, actual):
    """Return the root-mean-squared error between two equally sized value sequences.

    Both arguments are forwarded unchanged to ``mean_squared_error`` and the
    square root of that result is returned.
    """
    squared_error = mean_squared_error(pred, actual)
    return np.sqrt(squared_error)

### Import our train data

In [3]:
# Read the training set from a CSV path given relative to the working directory.
# The cells below use the last column as the target and every other column as a feature.
train = pd.read_csv('../data/train.csv')

In [7]:
# 5-fold cross-validation splitter shared by every experiment in this notebook.
# NOTE(review): no shuffle/random_state is passed, so folds follow row order. The recorded
# per-fold RMSEs differ markedly between the first two and the last three folds; if row
# order carries no meaning, consider shuffle=True with a fixed random_state — confirm.
kfold = KFold(n_splits=5)

### Simple Linear Regression

In [19]:
# Baseline: ordinary least squares on the raw features, scored once per CV fold.
err = []

for fit_rows, holdout_rows in kfold.split(train):
    # Last column is the target; everything before it is a feature.
    fit_x, fit_y = train.iloc[fit_rows, :-1], train.iloc[fit_rows, -1]
    holdout_x, holdout_y = train.iloc[holdout_rows, :-1], train.iloc[holdout_rows, -1]

    model = LinearRegression().fit(fit_x, fit_y)
    err.append(rmse(model.predict(holdout_x), holdout_y))

print("RMSE: ", err)
print("Average RMSE: ", np.mean(err))

RMSE:  [183.51993345808003, 190.8911706815273, 294.9018827362626, 303.3079108752656, 306.1510613309703]
Average RMSE:  255.7543918164212


### Now, we try higher degrees to improve our accuracy

In [20]:
# Sweep polynomial degrees with plain least squares, reporting per-fold and mean RMSE.
deg = list(range(2, 7))

for d in deg:
    err = []

    for fit_rows, holdout_rows in kfold.split(train):
        fit_y = train.iloc[fit_rows, -1]
        holdout_y = train.iloc[holdout_rows, -1]

        # The expansion is fitted on the training fold and then reused on the held-out fold.
        poly = PolynomialFeatures(degree=d)
        fit_x = poly.fit_transform(train.iloc[fit_rows, :-1])
        holdout_x = poly.transform(train.iloc[holdout_rows, :-1])

        model = LinearRegression().fit(fit_x, fit_y)
        err.append(rmse(model.predict(holdout_x), holdout_y))

    print("RMSE for degree ", d, ": ", err)
    print("Average RMSE for degree ", d, ": ", np.mean(err))
    print()

RMSE for degree  2 :  [88.41748455524154, 89.21247619975938, 251.24691769466733, 266.33273255144263, 268.1805491121271]
Average RMSE for degree  2 :  192.6780320226476

RMSE for degree  3 :  [103.28033356035864, 101.12303204721815, 241.87885584201877, 257.99370919548073, 267.2928094051513]
Average RMSE for degree  3 :  194.31374801004552

RMSE for degree  4 :  [4906.892975863304, 192.4328681635377, 299.20256565444356, 2948.953166201464, 330.4197492023406]
Average RMSE for degree  4 :  1735.5802650170183

RMSE for degree  5 :  [1531.5930070275613, 759.5850068539391, 475.9962488494986, 6459.83894257086, 643.4994768632502]
Average RMSE for degree  5 :  1974.1025364330217

RMSE for degree  6 :  [8708.351399615136, 531.4982330515871, 524.6226320234839, 2429.6345017642293, 1652.9240092441362]
Average RMSE for degree  6 :  2769.4061551397144



<p>The RMSE for the lower degrees (2 and 3) is decent. However, it explodes from degree 4 onwards due to high variance.</p>
<p>To prevent overfitting while lowering bias, we can include a regularisation parameter. This can be achieved with Ridge Regression.</p>

### Ridge Regression with GridSearch

In [12]:
# Grid search over polynomial degree and Ridge penalty strength, scored with k-fold CV.
deg = [2,3,4,5]
logalpha = np.arange(-3, 4, 1.0)
alpha = 10.0 ** logalpha

m = len(alpha)
n = len(deg)
# Rows index alpha, columns index degree. The grid must be a float array: an integer
# array silently truncates every mean RMSE assigned into it, which loses precision and
# can create false ties when the minimum is looked up afterwards.
grid_err = np.zeros((m, n))

for i, a in enumerate(alpha):
    for j, d in enumerate(deg):

        err = []
        # The expansion is applied to the whole training frame before splitting into folds.
        poly = PolynomialFeatures(degree=d)
        X = poly.fit_transform(train.iloc[:,:-1])

        for train_idx, test_idx in kfold.split(X):

            train_x = X[train_idx]
            train_y = train.iloc[train_idx, -1]
            test_x = X[test_idx]
            test_y = train.iloc[test_idx, -1]

            linreg = Ridge(alpha=a)
            linreg.fit(train_x, train_y)
            y_pred = linreg.predict(test_x)
            err.append(rmse(y_pred, test_y))

        # Store the fold-averaged RMSE for this (alpha, degree) combination.
        grid_err[i, j] = np.mean(err)
        print("RMSE for degree {}, alpha {}: {}".format(d, a, err))
        print("Average RMSE for degree {}, alpha {}: {}".format(d, a, np.mean(err)))
        print()

RMSE for degree 2, alpha 0.001: [117.51086177017918, 85.3749667330402, 223.4577987928067, 281.9767657021247, 236.83143214207678]
Average RMSE for degree 2, alpha 0.001: 189.0303650280455

RMSE for degree 3, alpha 0.001: [288.7237866064351, 95.43917540462675, 210.72844534424766, 786.603823811369, 225.7566316968845]
Average RMSE for degree 3, alpha 0.001: 321.4503725727126

RMSE for degree 4, alpha 0.001: [78212.02853878315, 869.5221092148257, 3456.625311208309, 12249.999134539996, 4872.002513735793]
Average RMSE for degree 4, alpha 0.001: 19932.035521496415

RMSE for degree 5, alpha 0.001: [254326.09481663664, 29616.00876609183, 20394.348306110507, 17900.74215331261, 59239.016289980835]
Average RMSE for degree 5, alpha 0.001: 76295.2420664265

RMSE for degree 2, alpha 0.01: [117.32332137000499, 85.36482375499963, 223.45309907635817, 280.51507306086745, 236.83283221092054]
Average RMSE for degree 2, alpha 0.01: 188.69782989463016

RMSE for degree 3, alpha 0.01: [236.60983234963524, 94.17

In [13]:
# Locate the (alpha row, degree column) grid cell holding the lowest average RMSE.
flat_best = np.argmin(grid_err)
min_index = np.unravel_index(flat_best, grid_err.shape)
alpha_pos, deg_pos = min_index

print(min_index)
print("Best alpha: ", alpha[alpha_pos])
print("Best degree: ", deg[deg_pos])
print("Min error: ", grid_err[min_index])

(5, 1)
Best alpha:  100.0
Best degree:  3
Min error:  170


<p>The best hyperparameters for Ridge Regression are alpha=100 and degree=3.</p>

<h2>Accuracy of Ridge Regression on Test set</h2>

In [None]:
# Score the selected Ridge model on the held-out test set (last column is the target).
test = pd.read_csv('../data/test.csv')

# Fit the final model in this cell, on the full training set, so the evaluation does not
# depend on a `ridge_a` object left over from an earlier kernel session.
best_alpha = alpha[min_index[0]]
best_degree = deg[min_index[1]]

poly_best = PolynomialFeatures(degree=best_degree)
ridge_a = Ridge(alpha=best_alpha)
ridge_a.fit(poly_best.fit_transform(train.iloc[:, :-1]), train.iloc[:, -1])

# Reuse the transformer fitted on the training features rather than refitting on the test set.
test_features = poly_best.transform(test.iloc[:, :-1])
print("RMSE for Ridge: ", rmse(ridge_a.predict(test_features), test.iloc[:, -1]))

## Conclusion

<p>By using polynomial terms and a regularisation parameter to reduce overfitting, the average cross-validation RMSE is reduced from 255.754 to 170. However, this still means that for an apartment of 1000 sqft, the error in price can be as large as 170,000. Hence, we need to look for models with much lower bias while keeping variance low.</p>