In [1]:
#import required libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [3]:
#load the training and test tables; the literal string "NA" is parsed as missing
train, test = (
    pd.read_csv(csv_path, na_values="NA")
    for csv_path in ("train.csv", "test.csv")
)

In [5]:
#separate the output column from rest of data
prices = train['SalePrice']
#drop into a new frame instead of in place: `train` stays untouched, so this
#cell can be re-run without a KeyError on an already-removed column
train_features = train.drop('SalePrice', axis=1)
#concat data to get all columns: the test rows are stacked under the training
#rows so both sets share one set of one-hot columns, and every row of X from
#index train.shape[0] onward is a test row
all_data = pd.concat([train_features, test], ignore_index=True)
#MSSubClass is cast to a categorical so get_dummies one-hot encodes it too
all_data['MSSubClass'] = all_data['MSSubClass'].astype('category')
#convert categorical columns into one-hot encoding
#NOTE(review): every remaining column, including any row-identifier column,
#is kept as a feature - confirm that is intended
all_data = pd.get_dummies(all_data)
#`.values` replaces DataFrame.as_matrix(), which newer pandas no longer has;
#the float cast keeps X numeric even when the dummy columns are boolean
X = all_data.values.astype(float)
#handle NA values: np.nan_to_num replaces every NaN with 0.0
X = np.nan_to_num(X)

In [6]:
#split data into training, development and test set
#the first 80% of the labelled rows train the model, the remaining labelled
#rows form the development set, and any rows of X past the labelled ones are
#treated as the test set
n_labelled = train.shape[0]
split_at = int(n_labelled * 0.8)

X_train, X_dev, X_test = X[:split_at], X[split_at:n_labelled], X[n_labelled:]
prices_train, prices_dev = prices[:split_at], prices[split_at:]
prices_train.shape

(1168,)

In [5]:
#fit a ridge regression (alpha = 1.0) on the training split
#fit() returns the estimator itself, so the trailing `clf` shows the same repr
clf = Ridge(alpha=1.0).fit(X_train, prices_train)
clf

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [6]:
#evaluate on development set
#metric: root of the mean squared difference between log(actual) and
#log(predicted) prices
Y = clf.predict(X_dev)
log_gap = np.log(prices_dev) - np.log(Y)
sq_diff = np.square(log_gap)
n_dev = prices_dev.shape[0]
error = np.sqrt(np.sum(sq_diff) / n_dev)
error

0.16635065269121163

In [7]:
#prepare output for submission
Y = clf.predict(X_test)
#ids continue after ALL labelled rows (train.shape[0]), not after the 80%
#training split (X_train.shape[0]), which would reuse ids of held-out rows
#assumes ids are consecutive, 1-based, with the training rows first - TODO
#confirm against the Id column of test.csv
first_id = train.shape[0] + 1
out = pd.DataFrame()
out['Id'] = list(range(first_id, first_id + X_test.shape[0]))
out['SalePrice'] = Y
out.to_csv('output_ridge.csv', index=False)

In [13]:
#evaluate on development set
#root-mean-square error measured in log-price space, for whatever model is
#currently bound to `clf`
Y = clf.predict(X_dev)
dev_count = prices_dev.shape[0]
sq_diff = np.square(np.log(prices_dev) - np.log(Y))
mean_sq_diff = np.sum(sq_diff) / dev_count
error = np.sqrt(mean_sq_diff)
error

0.16635065269121163

In [14]:
#fit a lasso regression (alpha = 1.0) on the training split
#fit() returns the estimator itself, so the trailing `clf` shows the same repr
clf = Lasso(alpha=1.0).fit(X_train, prices_train)
clf



Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [15]:
#evaluate on development set
#same log-space RMSE as used for the ridge model, so the scores are comparable
Y = clf.predict(X_dev)
log_actual = np.log(prices_dev)
log_predicted = np.log(Y)
sq_diff = np.square(log_actual - log_predicted)
error = np.sqrt(np.sum(sq_diff) / prices_dev.shape[0])
error

0.16665768999429451

In [16]:
#test different values of alpha to get the best model
#each candidate alpha gets a ridge model fitted on the training split and is
#scored on the development split; `errors` maps alpha -> log-space RMSE
alphas = [0.5, 1, 10, 100, 1000]
errors = {}
#these do not depend on alpha, so compute them once outside the loop
n_dev = prices_dev.shape[0]
log_prices_dev = np.log(prices_dev)
for alpha in alphas:
    clf = Ridge(alpha=alpha).fit(X_train, prices_train)
    Y = clf.predict(X_dev)
    sq_diff = np.square(log_prices_dev - np.log(Y))
    error = np.sqrt(np.sum(sq_diff) / n_dev)
    errors[alpha] = error
errors

{0.5: 0.16747693528743574,
 1: 0.16635065269121163,
 10: 0.15979498368925382,
 100: 0.16472596678921295,
 1000: 0.18909386389979971}

In [17]:
#prepare output for submission
#after the alpha search `clf` is only the model for the LAST alpha tried, so
#refit with the alpha that achieved the lowest development error
best_alpha = min(errors, key=errors.get)
clf = Ridge(alpha = best_alpha)
clf.fit(X_train, prices_train)
Y = clf.predict(X_test)
#ids continue after ALL labelled rows (train.shape[0]), not after the 80%
#training split (X_train.shape[0]), which would reuse ids of held-out rows
#assumes ids are consecutive, 1-based, with the training rows first - TODO
#confirm against the Id column of test.csv
first_id = train.shape[0] + 1
out = pd.DataFrame()
out['Id'] = list(range(first_id, first_id + X_test.shape[0]))
out['SalePrice'] = Y
out.to_csv('output_ridge.csv', index=False)