# Evaluating linear and polynomial regression with elastic net regularization

In [1]:
import pandas
import numpy as np
from sklearn import preprocessing
from scipy import stats
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold
import sklearn.linear_model as lm
import random
import DataPrepUtil
import Impute

## Reading data

In [2]:
housing = pandas.read_csv('./housing.csv')

## Preprocessing data

In [3]:
DataPrepUtil.transform_ocean_proximity(housing)
Impute.fill_lr_prediction_from_other_column(housing, 'total_rooms')

standard = preprocessing.StandardScaler().fit(housing)
df = standard.transform(housing)

median_house_value_bc, maxlog, interval = stats.boxcox(housing.median_house_value, alpha=0.05)
population_bc, maxlog, interval = stats.boxcox(housing.population, alpha=0.05)
housing_median_age_bc, maxlog, interval = stats.boxcox(housing.housing_median_age, alpha=0.05)
total_rooms_bc, maxlog, interval = stats.boxcox(housing.total_rooms, alpha=0.05)
total_bedrooms_bc, maxlog, interval = stats.boxcox(housing.total_bedrooms, alpha=0.05)
households_bc, maxlog, interval = stats.boxcox(housing.households, alpha=0.05)
median_income_bc, maxlog, interval = stats.boxcox(housing.median_income, alpha=0.05)

housing_boxcox = housing.copy()

housing_boxcox.drop(columns=['housing_median_age'], inplace=True)
housing_boxcox.drop(columns=['total_rooms'], inplace=True)
housing_boxcox.drop(columns=['total_bedrooms'], inplace=True)
housing_boxcox.drop(columns=['population'], inplace=True)
housing_boxcox.drop(columns=['households'], inplace=True)
housing_boxcox.drop(columns=['median_income'], inplace=True)
housing_boxcox.drop(columns=['median_house_value'], inplace=True)

housing_boxcox['housing_median_age'] = housing_median_age_bc
housing_boxcox['total_rooms'] = total_rooms_bc
housing_boxcox['total_bedrooms'] = total_bedrooms_bc
housing_boxcox['population'] = population_bc
housing_boxcox['households'] = households_bc
housing_boxcox['median_income'] = median_income_bc
housing_boxcox['median_house_value'] = median_house_value_bc

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
  return self.partial_fit(X, y)
  """


## Splitting input and output

In [4]:
y = housing_boxcox.median_house_value.values.reshape(-1,1)
X = housing_boxcox.drop(columns=['median_house_value'], inplace=False).values

## Create polynomial features from the original input features (for polynomial regression)

In [5]:
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(housing_boxcox.drop(columns=['median_house_value'], inplace=False).values)
print(X_poly.shape)

(20640, 105)


## Training and test set split

In [6]:
test = random.sample(range(0,20640),5000)
X_test = X[test]
X_poly_test = X_poly[test]
y_test = y[test]
X_train_valid = np.delete(X, test, 0)
X_poly_train_valid = np.delete(X_poly, test, 0)
y_train_valid = np.delete(y, test, 0)

## Variables for holding the optimal models and the corresponding validation score

In [7]:
ChosenLinearModel = None
current_score_linear = -1
current_paramater_linear = -1

ChosenPolyModel = None
current_score_poly = -1
current_paramater_poly = -1

## Performance of linear regression on different value of regularization parameter

In [8]:
for a in np.arange(0.01, 0.2, 0.02):
    print("a = " + str(a))
    
    Model = lm.ElasticNet(alpha=a, max_iter=3000, tol=0.13)
    
    # Have to shuffle the data because it is grouped.
    kf = KFold(n_splits=5, shuffle=True)
    total_train_score = 0
    total_valid_score = 0
    
    for train_index, valid_index in kf.split(X_train_valid):
        X_train, X_valid = X_train_valid[train_index], X_train_valid[valid_index]
        y_train, y_valid = y_train_valid[train_index], y_train_valid[valid_index]
        Model.fit(X_train, y_train)
        total_train_score += Model.score(X_train, y_train)
        total_valid_score += Model.score(X_valid, y_valid)
        
    avg_train_score = total_train_score / 5
    avg_valid_score = total_valid_score / 5
    avg_train_valid_score = (avg_train_score + avg_valid_score) / 2
    print("Average training r2 score: " + str(avg_train_score))
    print("Average validation r2 score: " + str(avg_valid_score))
    
    if (ChosenLinearModel is None) or (avg_train_valid_score > current_score_linear):
        ChosenLinearModel = Model
        current_score_linear = avg_train_valid_score
        current_paramater_linear = a

print("Optimal model (linear): a = " + str(current_paramater_linear))

a = 0.01
Average training r2 score: 0.6881225793522856
Average validation r2 score: 0.6873224625524761
a = 0.03
Average training r2 score: 0.6800257797295705
Average validation r2 score: 0.6795100973096602
a = 0.049999999999999996
Average training r2 score: 0.6743401255415954
Average validation r2 score: 0.6737976752449859
a = 0.06999999999999999
Average training r2 score: 0.6675198973577744
Average validation r2 score: 0.6666471727749059
a = 0.08999999999999998
Average training r2 score: 0.6593730002589011
Average validation r2 score: 0.6588380561080109
a = 0.10999999999999997
Average training r2 score: 0.6511796472267556
Average validation r2 score: 0.6503098463146763
a = 0.12999999999999998
Average training r2 score: 0.6426981086520334
Average validation r2 score: 0.6417339954806979
a = 0.15
Average training r2 score: 0.6318938700964627
Average validation r2 score: 0.6311352082365342
a = 0.16999999999999998
Average training r2 score: 0.6225752664018785
Average validation r2 score: 0

## Performance of polynomial regression on different value of regularization parameter

In [9]:
for a in np.arange(0.01, 0.5, 0.02):
    print("a = " + str(a))
    
    Model = lm.ElasticNet(alpha=a, max_iter=3000, tol=0.13)
    
    # Have to shuffle the data because it is grouped.
    kf = KFold(n_splits=5, shuffle=True)
    total_train_score = 0
    total_valid_score = 0
    
    for train_index, valid_index in kf.split(X_train_valid):
        X_train, X_valid = X_poly_train_valid[train_index], X_poly_train_valid[valid_index]
        y_train, y_valid = y_train_valid[train_index], y_train_valid[valid_index]
        Model.fit(X_train, y_train)
        total_train_score += Model.score(X_train, y_train)
        total_valid_score += Model.score(X_valid, y_valid)
        
    avg_train_score = total_train_score / 5
    avg_valid_score = total_valid_score / 5
    avg_train_valid_score = (avg_train_score + avg_valid_score) / 2
    print("Average training r2 score: " + str(avg_train_score))
    print("Average validation r2 score: " + str(avg_valid_score))
    
    if (ChosenPolyModel is None) or (avg_train_valid_score > current_score_poly):
        ChosenPolyModel = Model
        current_score_poly = avg_train_valid_score
        current_paramater_poly = a

print("Optimal model (polynomial): a = " + str(current_paramater_poly))

a = 0.01
Average training r2 score: 0.7406833610278759
Average validation r2 score: 0.7376190214612475
a = 0.03
Average training r2 score: 0.7384987408266317
Average validation r2 score: 0.7350819239674405
a = 0.049999999999999996
Average training r2 score: 0.7357274262616826
Average validation r2 score: 0.7328390275076738
a = 0.06999999999999999
Average training r2 score: 0.7336404154486871
Average validation r2 score: 0.7311215571066219
a = 0.08999999999999998
Average training r2 score: 0.7277558990921683
Average validation r2 score: 0.7251235322462418
a = 0.10999999999999997
Average training r2 score: 0.7260841919320792
Average validation r2 score: 0.72330276585228
a = 0.12999999999999998
Average training r2 score: 0.7181357945397442
Average validation r2 score: 0.7163262530589753
a = 0.15
Average training r2 score: 0.7200909557005497
Average validation r2 score: 0.7173958710973505
a = 0.16999999999999998
Average training r2 score: 0.7152670499125772
Average validation r2 score: 0.7

## Final performance score of the optimal models evaluated on test set

In [10]:
print("R2 score of optimal linear model: " + str (ChosenLinearModel.score(X_test, y_test)))
print("R2 score of optimal polynomial model: " + str(ChosenPolyModel.score(X_poly_test, y_test)))

R2 score of optimal linear model: 0.6885040824911826
R2 score of optimal polynomial model: 0.7404239330189452
