In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# load data which is stored in the /data folder of the project
train_data = pd.read_csv('../data/train.csv', sep=',', header=0)
test_data = pd.read_csv('../data/test.csv', sep=',', header=0)

In [3]:
target_variable = train_data["SalePrice"]
train_features = train_data.drop(["SalePrice"], axis=1)

In [4]:
# concat test and train data. List all train records first, attach the test data second
all_data = pd.concat((train_features, test_data), axis=0)

# convert categorical variables into dummy/indicator variable. 
# For missing values an additional column will be created - dummy_na
# The original feature will be dropped - drop_first 
all_dummies = pd.get_dummies(all_data, dummy_na=True, drop_first=True)

all_dummies = all_dummies.fillna(all_dummies.mean())

# split test and train sets again
dummies_train = all_dummies.iloc[:train_features.shape[0],:]
dummies_test = all_dummies.iloc[train_features.shape[0]:,:]

In [5]:
dummies_train = dummies_train.drop(['Id'], axis=1)

In [6]:
print(dummies_train.shape)

(1460, 288)


In [7]:
target_variable = np.log(target_variable)

In [8]:
dummies_train['GrLivArea'] = np.log(dummies_train['GrLivArea'])

In [9]:
dummies_train['HasBsmt'] = pd.Series(len(dummies_train['TotalBsmtSF']), index=dummies_train.index)
dummies_train['HasBsmt'] = 0 
dummies_train.loc[dummies_train['TotalBsmtSF']>0,'HasBsmt'] = 1

In [10]:
dummies_train.loc[dummies_train['HasBsmt']==1,'TotalBsmtSF'] = np.log(dummies_train['TotalBsmtSF'])

  if __name__ == '__main__':


In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(dummies_train, target_variable, test_size=0.2, random_state=0)

In [None]:
reg_models = [RandomForestRegressor(n_estimators=200),
              LinearRegression(),
              Lasso(),
              ElasticNet(),
              Ridge(alpha=2.5),
              SVR(kernel='poly'),
              NuSVR(),
              LinearSVR()]

log_cols = ["RegressionModel", "RMSE", "Score"]
log = pd.DataFrame(columns=log_cols)

for reg in reg_models:
    reg.fit(X_train, y_train)

    name = reg.__class__.__name__

    print("=" * 30)
    print(name)

    train_predictions = reg.predict(X_valid)
    rmse = sqrt(mean_squared_error(y_valid, train_predictions))
    print("Root mean squared error: {}".format(rmse))
    
    score = reg.score(X_valid, y_valid)
    print("Score: {}".format(score))

    log_entry = pd.DataFrame([[name, rmse, score]], columns=log_cols)
    log = log.append(log_entry)

print("="*30)

RandomForestRegressor
Root mean squared error: 0.13714811063827823
Score: 0.8757620614569145
LinearRegression
Root mean squared error: 0.2068140765896276
Score: 0.7174893860994671
Lasso
Root mean squared error: 0.2623462045890714
Score: 0.5454055287972202
ElasticNet
Root mean squared error: 0.2532652078779269
Score: 0.5763320177718327
Ridge
Root mean squared error: 0.17288270933865404
Score: 0.8025861135918182


In [None]:
# train best model
rf_reg = RandomForestRegressor(n_estimators=200)
rf_reg.fit(X_train, y_train)

In [None]:
test_ids = dummies_test['Id']
dummies_test = dummies_test.drop(['Id'], axis=1)

In [None]:
dummies_test['GrLivArea'] = np.log(dummies_test['GrLivArea'])

In [None]:
dummies_test['HasBsmt'] = pd.Series(len(dummies_test['TotalBsmtSF']), index=dummies_test.index)
dummies_test['HasBsmt'] = 0 
dummies_test.loc[dummies_test['TotalBsmtSF']>0,'HasBsmt'] = 1

In [None]:
dummies_test.loc[dummies_test['HasBsmt']==1,'TotalBsmtSF'] = np.log(dummies_test['TotalBsmtSF'])

In [None]:
predictions = rf_reg.predict(dummies_test)

# prepare submission as outlined in the submission_sample from Kaggle
submission = pd.DataFrame({"Id": test_ids,"SalePrice": predictions})

In [None]:
submission.to_csv("../data/submission.csv", index=False)