### Relevant imports and data load

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from math import sqrt

%matplotlib inline

In [3]:
# Read the pre-filled train and test sets from the project's /data folder.
# (comma separator and header row 0 are the pandas defaults)
train_data = pd.read_csv('../data/train_filled_up.csv')
test_data = pd.read_csv('../data/test_filled_up.csv')

# Regression target used by the experiments below.
target_variable = train_data["SalePrice"]

### train model on quantitative (numeric) values only

In [8]:
# Names of the quantitative (numeric) columns used to select features from train_data.
# Each name is listed once; the original list repeated "TotalBsmtSF" and "2ndFlrSF",
# which had no effect on the isin() based column selection.
# NOTE(review): the repeated pair may have been a typo for another column
# (e.g. "1stFlrSF") - confirm against the data analysis before adding anything.
quantitative_features = ["LotFrontage", "LotArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "2ndFlrSF",
                        "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath",
                        "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "Fireplaces", "GarageCars", "WoodDeckSF", "OpenPorchSF",
                        "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal"]

In [4]:
# Quantitative, numeric features that survived the per-feature analysis
# (i.e. were not deleted because of it).
selected_num_columns = [
    'LotFrontage', 'LotArea',
    'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF',
    '2ndFlrSF', 'GrLivArea',
    'BsmtFullBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr',
    'Fireplaces', 'GarageCars',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal',
]

In [9]:
# Boolean mask over train_data's columns: True where the column name is
# listed in quantitative_features. Column order of train_data is preserved.
is_quantitative_column = train_data.columns.isin(quantitative_features)
train_quantitative_features = train_data.loc[:, is_quantitative_column]

In [10]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_quantitative_features,
    target_variable,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Candidate regressors: each one is fitted on the training split and
# evaluated on the held-out validation split.
reg_models = [RandomForestRegressor(n_estimators=300),
              DecisionTreeRegressor(),
              LinearRegression(),
              Lasso(),
              ElasticNet(),
              Ridge(alpha=2.5),
              SVR(),
              NuSVR(),
              LinearSVR()]

log_cols = ["RegressionModel", "RMSE", "Score"]
# Rows are collected in a plain list and turned into a DataFrame once after
# the loop: DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0, and appending inside a loop copies the frame on every iteration.
log_rows = []

for reg in reg_models:
    reg.fit(X_train, y_train)

    name = reg.__class__.__name__

    print("=" * 30)
    print(name)

    # Despite the variable name, these are predictions for the validation split.
    train_predictions = reg.predict(X_valid)
    rmse = sqrt(mean_squared_error(y_valid, train_predictions))
    print("Root mean squared error: {}".format(rmse))

    # For scikit-learn regressors, score() is the R^2 coefficient of determination.
    score = reg.score(X_valid, y_valid)
    print("Score: {}".format(score))

    log_rows.append([name, rmse, score])

print("="*30)

# One row per model, in the order of reg_models.
log = pd.DataFrame(log_rows, columns=log_cols)

RandomForestRegressor
Root mean squared error: 31197.165160176726
Score: 0.8398719576036158
DecisionTreeRegressor
Root mean squared error: 43309.52940134253
Score: 0.6913942173796107
LinearRegression
Root mean squared error: 36729.009531807664
Score: 0.7780497881329239
Lasso
Root mean squared error: 36728.06782839031
Score: 0.7780611692508043
ElasticNet
Root mean squared error: 37834.89026561382
Score: 0.7644830939212444
Ridge
Root mean squared error: 36685.49816192902
Score: 0.778575347495724
SVR




Root mean squared error: 79200.80044291327
Score: -0.03203938599815448
NuSVR
Root mean squared error: 78155.08555421587
Score: -0.004966571084560779
LinearSVR
Root mean squared error: 40518.61520534959
Score: 0.7298864825278706


### train model on transformed quantitative features (where useful)

Whether a transformation is useful is decided per feature, based on the analyze_data outcome.
Predictions made on a transformed target must be transformed back to the original scale afterwards.

In [11]:
# Natural-log transform of the target; models trained on it predict log(SalePrice),
# so np.exp has to be applied to their predictions to get prices back.
# NOTE(review): np.log assumes strictly positive prices - confirm for this data set.
transformed_target_variable = np.log(target_variable)

In [None]:
# Same 80/20 split with a fixed seed, this time against the log-transformed target.
# NOTE(review): train_quantitative_transformed_features is not defined in any
# visible cell - the cell that builds it must exist and run before this one.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_quantitative_transformed_features,
    transformed_target_variable,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Candidate regressors: each one is fitted on the training split and
# evaluated on the held-out validation split.
reg_models = [RandomForestRegressor(n_estimators=300),
              DecisionTreeRegressor(),
              LinearRegression(),
              Lasso(),
              ElasticNet(),
              Ridge(alpha=2.5),
              SVR(),
              NuSVR(),
              LinearSVR()]

log_cols = ["RegressionModel", "RMSE", "Score"]
# Rows are collected in a plain list and turned into a DataFrame once after
# the loop: DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0, and appending inside a loop copies the frame on every iteration.
log_rows = []

for reg in reg_models:
    reg.fit(X_train, y_train)

    name = reg.__class__.__name__

    print("=" * 30)
    print(name)

    # Predictions for the validation split, in the units of y_train / y_valid.
    # The name `predictions` is kept because it is overwritten each iteration
    # and read again after the loop.
    predictions = reg.predict(X_valid)
    rmse = sqrt(mean_squared_error(y_valid, predictions))
    print("Root mean squared error: {}".format(rmse))

    # For scikit-learn regressors, score() is the R^2 coefficient of determination.
    score = reg.score(X_valid, y_valid)
    print("Score: {}".format(score))

    log_rows.append([name, rmse, score])

print("="*30)

# One row per model, in the order of reg_models.
log = pd.DataFrame(log_rows, columns=log_cols)

In [None]:
# Undo the log transform of the target to get predictions on the original scale.
# NOTE(review): `predictions` is overwritten on every iteration of the model
# loop, so at this point it holds only the output of the last model in
# reg_models (LinearSVR) - confirm that this is the model intended here.
final_predictions = np.exp(predictions)

### train model

In [24]:
# Remove the identifier column so it is not passed to the models as a feature.
# errors='ignore' makes the cell idempotent: re-running it after 'Id' has
# already been dropped no longer raises KeyError.
train_dummies = train_dummies.drop(columns=['Id'], errors='ignore')

In [25]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_dummies,
    target_variable,
    test_size=0.2,
    random_state=0,
)

In [26]:
# Candidate regressors: each one is fitted on the training split and
# evaluated on the held-out validation split.
reg_models = [RandomForestRegressor(n_estimators=300),
              DecisionTreeRegressor(),
              LinearRegression(),
              Lasso(),
              ElasticNet(),
              Ridge(alpha=2.5),
              SVR(),
              NuSVR(),
              LinearSVR()]

log_cols = ["RegressionModel", "RMSE", "Score"]
# Rows are collected in a plain list and turned into a DataFrame once after
# the loop: DataFrame.append was deprecated in pandas 1.4 and removed in
# pandas 2.0, and appending inside a loop copies the frame on every iteration.
log_rows = []

for reg in reg_models:
    reg.fit(X_train, y_train)

    name = reg.__class__.__name__

    print("=" * 30)
    print(name)

    # Despite the variable name, these are predictions for the validation split.
    train_predictions = reg.predict(X_valid)
    rmse = sqrt(mean_squared_error(y_valid, train_predictions))
    print("Root mean squared error: {}".format(rmse))

    # For scikit-learn regressors, score() is the R^2 coefficient of determination.
    score = reg.score(X_valid, y_valid)
    print("Score: {}".format(score))

    log_rows.append([name, rmse, score])

print("="*30)

# One row per model, in the order of reg_models.
log = pd.DataFrame(log_rows, columns=log_cols)

RandomForestRegressor
Root mean squared error: 32844.43694626833
Score: 0.8437908376318094
DecisionTreeRegressor
Root mean squared error: 40507.94309609798
Score: 0.7623908045454512
LinearRegression
Root mean squared error: 54780.61989222654
Score: 0.5654530099236519




Lasso
Root mean squared error: 53479.31327065549
Score: 0.5858530140665199
ElasticNet
Root mean squared error: 50689.86456913379
Score: 0.6279295983216646
Ridge
Root mean squared error: 45995.969883387144
Score: 0.6936467985054584
SVR
Root mean squared error: 85107.82593096742
Score: -0.04887046090654246
NuSVR
Root mean squared error: 83573.52824369336
Score: -0.011393913784108811
LinearSVR
Root mean squared error: 118573.85166649462
Score: -1.0359200347985738


In [27]:
# train best model for prediction of test data
rf_reg = RandomForestRegressor(n_estimators=200)
rf_reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

### predict for submission

In [14]:
# Keep the ids for the submission file and remove them from the feature frame.
# The guard makes the cell safe to re-run: once 'Id' has been dropped, the
# previously captured test_ids is kept instead of raising KeyError.
if 'Id' in test_dummies.columns:
    test_ids = test_dummies['Id']
    test_dummies = test_dummies.drop(columns=['Id'])

In [18]:
# Predict on the test feature frame prepared above (the one with 'Id' removed).
# The original cell referenced `dummies_test`, a name that is not defined in
# any visible cell and would raise NameError on a fresh kernel.
predictions = rf_reg.predict(test_dummies)

# prepare submission as outlined in the submission_sample from Kaggle
submission = pd.DataFrame({"Id": test_ids, "SalePrice": predictions})

In [19]:
# Write the submission without the DataFrame index so the file only has the
# Id and SalePrice columns.
submission.to_csv(path_or_buf="../data/submission.csv", index=False)