In [1]:
import pandas as pd
import numpy as np
import copy
from scipy.stats import skew

In [2]:
dataset = pd.read_csv('train_housing.csv')

MSZoning_types = ['A', 'C', 'FV', 'I', 'RH', 'RL', 'RP', 'RM']
dataset['MSZoning'] = dataset['MSZoning'].astype("category", categories=MSZoning_types).cat.codes

LotShape_types = ['Reg', 'IR1', 'IR2', 'IR3']
dataset['LotShape'] = dataset['LotShape'].astype("category", categories=LotShape_types).cat.codes

Neighborhood_types = ['Blmngtn','Blueste','BrDale','BrkSide','ClearCr','CollgCr','Crawfor','Edwards','Gilbert','IDOTRR','MeadowV','Mitchel','Names','NoRidge','NPkVill','NridgHt','NWAmes','OldTown','SWISU','Sawyer','SawyerW','Somerst','StoneBr','Timber','Veenker']
dataset['Neighborhood'] = dataset['Neighborhood'].astype("category", categories=Neighborhood_types).cat.codes

dataset["SalePrice"] = np.log1p(dataset["SalePrice"])
dataset["LotArea"] = np.log1p(dataset["LotArea"])

dataset['Age'] = dataset['YrSold'] - dataset['YearBuilt']
dataset['MSSubClass'] /= 10 

dataset.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Age
0,1,6.0,5,65.0,9.04204,Pave,,0,Lvl,AllPub,...,,,,0,2,2008,WD,Normal,12.247699,5
1,2,2.0,5,80.0,9.169623,Pave,,0,Lvl,AllPub,...,,,,0,5,2007,WD,Normal,12.109016,31
2,3,6.0,5,68.0,9.328212,Pave,,1,Lvl,AllPub,...,,,,0,9,2008,WD,Normal,12.317171,7
3,4,7.0,5,60.0,9.164401,Pave,,1,Lvl,AllPub,...,,,,0,2,2006,WD,Abnorml,11.849405,91
4,5,6.0,5,84.0,9.565284,Pave,,1,Lvl,AllPub,...,,,,0,12,2008,WD,Normal,12.42922,8


In [19]:
dataset['LotFrontage'] = dataset['LotFrontage'].fillna(int(dataset['LotFrontage'].mean()))

numeric_datapoints = dataset[['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond','Neighborhood', 'Age']]
targets = dataset['SalePrice']

training_dataset_size = int(numeric_datapoints.shape[0] * 0.9)
training_points = numeric_datapoints.head(training_dataset_size)
training_labels = targets.head(training_dataset_size)

validation_dataset_size = int(numeric_datapoints.shape[0] - training_dataset_size)
validation_points = numeric_datapoints.tail(validation_dataset_size).reset_index(drop = True)
validation_labels = targets.tail(validation_dataset_size).reset_index(drop = True)

In [4]:
def fitOLS(X, y):
    """
    Fit ordinary least squares model to the data.
    """
    return np.linalg.pinv(X) @ y

def fitRidge(X, y, reg_strength):
    """
    Fit ridge regression model to the data.
    """
    return np.linalg.inv(X.T @ X  + reg_strength * np.identity(len(X.T))) @ (X.T @ y)

def RMSError(y_true, y_pred):
    """
    Compute root mean squared error between true and predicted regression targets.
    """
    return np.sqrt(((y_true - y_pred) ** 2).mean())

def predict(X, w):
    """
    Generate predictions for the given samples.
    """
    return X @ w

In [104]:
# Ordinary least squares regression
#w_ls = fitOLS(training_points.values, training_labels.values)
w_ls = fitRidge(training_points.values, training_labels.values, 1)
y_pred_ls = predict(validation_points.values, w_ls)
mse_ls = RMSError(validation_labels.values, y_pred_ls)
print('MSE for Least squares = {0}'.format(np.exp(mse_ls)))

MSE for Least squares = 1.4915845628516946


In [17]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

poly = make_pipeline(PolynomialFeatures(3), Ridge())
poly.fit(training_points.values, training_labels.values)
y_pred = poly.predict(validation_points.values)
mse = RMSError(validation_labels.values, y_pred)
print('MSE for Least squares = {0}'.format(np.exp(mse)))

MSE for Least squares = 1.2273489256486134
