In [1]:
#Importing the libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#Visualisations
import seaborn as sns
import matplotlib.pyplot as plt

# Data Preprocessing

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline

Importing the dataset

In [3]:
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")
df = pd.concat([train.iloc[:,:-1],test],axis=0)
target = train["SalePrice"]
features = train.iloc[:,:-1].copy()

In [4]:
#Removing Skew from SalesPrice data
target_log = np.log(target)
X_train, X_test, y_train, y_test = train_test_split(train, target_log, test_size=0.2, random_state=0)
print(train.shape, target_log.shape)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1460, 81) (1460,)
(1168, 81) (1168,)
(292, 81) (292,)


Creating the preprocessing pipelines for both numeric and categorical data.

In [5]:
def features_split(df, cardinality = 10): 
    categorical_features_with_low_cardinality = (df.dtypes == "object") & (df.nunique() < cardinality)
    categorical_features_with_high_cardinality = (df.dtypes == "object") & (df.nunique() >= cardinality)
    numerical_features = (df.dtypes == "int") | (df.dtypes == "float")
    selected_categorical_features = df.columns[categorical_features_with_low_cardinality].values.tolist()
    selected_numerical_features = df.columns[numerical_features].values.tolist()
    return selected_categorical_features,selected_numerical_features

In [6]:
categorical_features, numerical_features = features_split(X_train)

In [7]:
categorical_features

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [8]:
categorical_transformer = make_pipeline(SimpleImputer(strategy='constant', fill_value = 'missing'),
                                        OneHotEncoder(handle_unknown='ignore',sparse = False))
numerical_transformer = make_pipeline(SimpleImputer(strategy='median'))
preprocess = make_column_transformer( 
    (categorical_transformer, categorical_features),
    (numerical_transformer, numerical_features))

Combining preprocessing step based on the ColumnTransformer with a classifier
in order to get a full prediction pipeline:

In [9]:
from sklearn.ensemble import RandomForestRegressor
model = make_pipeline(preprocess, RandomForestRegressor(n_estimators=10))

In [10]:
model.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('pipeline-1', Pipeline(memory=None,
     steps=[('simpleimputer', SimpleImputer(copy=True, fill_value='missing', missing_values=nan,
       s...ators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [11]:
model.score(X_test, y_test)

0.7320880048607918

# Model validation

In [12]:
#model validation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score

In [13]:
#importing classes of models
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [14]:
def evaluation(y_test, predicted_values):
    print(" MAE score: "+ str(mean_absolute_error(y_test,predicted_values))+ 
          " R^2 score: " + str(r2_score(y_test,predicted_values)) + 
          " MSE score: " + str(mean_squared_error(y_test,predicted_values)))

In [15]:
predicted_values = model.predict(X_test)
evaluation(y_test, predicted_values)

 MAE score: 0.14338295500259457 R^2 score: 0.7320880048607918 MSE score: 0.040561833703403716


In [16]:
def model_validation(estimator, parameters, X_train, y_train, X_test, y_test):
    model_pipe = make_pipeline(preprocess, estimator)
    model_grid = GridSearchCV(model_pipe, parameters, 
                                      verbose=1 , scoring = "r2", cv=5)
    model_grid.fit(X_train, y_train)
   # print("Best model: " + str(model_grid.best_estimator_))
   # print("Best score: " + str(model_grid.best_score_))
    predicted_values = model_grid.best_estimator_.fit(X_train, y_train).predict(X_test)
    evaluation(y_test, predicted_values)
    print(" cross validation score: " +str(np.mean(cross_val_score(model_grid.best_estimator_,X_train,y_train,cv=10, scoring="r2"))))

# Model Tuning

XGBRegressor

In [17]:
parameters = {"xgbregressor__max_depth": [3,5]}
model_validation(XGBRegressor(), parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.8s finished


 MAE score: 0.12823551956099039 R^2 score: 0.7914360542938411 MSE score: 0.03157654840300533
 cross validation score: 0.7852245057013055


Linear Regression Model

In [18]:
linear_regression_parameters = {'linearregression__fit_intercept':[True,False],
                               'linearregression__normalize':[True,False], 
                               'linearregression__copy_X':[True, False],
                               }
model_validation(LinearRegression(), linear_regression_parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    6.1s finished


 MAE score: 0.1541659335128122 R^2 score: 0.647079809853667 MSE score: 0.05343206098648551
 cross validation score: 0.7646967759586397


Lasso Model

In [19]:
lasso_parameters = {"lasso__fit_intercept":[True,False],
                    "lasso__normalize":[True,False],
                    "lasso__copy_X":[True, False],
                    "lasso__precompute" : [True, False],
                   }
model_validation(Lasso(), lasso_parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    6.8s finished


 MAE score: 0.22634001697632422 R^2 score: 0.3577103120826879 MSE score: 0.09724255719560494
 cross validation score: 0.4003280803272813


Ridge Model

In [20]:
ridge_pipe = make_pipeline(preprocess, Ridge())
ridge_parameters = {"ridge__fit_intercept":[True,False],
                    "ridge__normalize":[True,False],
                    "ridge__copy_X":[True, False],
                    "ridge__solver" : ["auto"],
                   }
ridge_grid = GridSearchCV(ridge_pipe, ridge_parameters, 
                                      verbose=1 , scoring = "r2", cv=3)
ridge_grid.fit(X_train, y_train)
print("Best lasso model: " + str(ridge_grid.best_estimator_))
print("Best lasso score: " + str(ridge_grid.best_score_))

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Best lasso model: Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('pipeline-1', Pipeline(memory=None,
     steps=[('simpleimputer', SimpleImputer(copy=True, fill_value='missing', missing_values=nan,
       s...it_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])
Best lasso score: 0.7684938219257356


[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:    2.0s finished


In [21]:
predicted_values = ridge_grid.best_estimator_.fit(X_train, y_train).predict(X_test)
evaluation(y_test, predicted_values)
#print(ridge_grid.get_params().keys())

 MAE score: 0.14339839694456882 R^2 score: 0.7050774785468363 MSE score: 0.04465122311659067


Decision Tree Model

In [22]:
decision_tree_parameters = {"decisiontreeregressor__criterion" : ["mse", "mae"], 
                            "decisiontreeregressor__splitter" : ["best", "random"], 
                            "decisiontreeregressor__min_samples_split" : [2, 3], 
                            "decisiontreeregressor__max_features" : ["auto", "log2"],
                            "decisiontreeregressor__max_depth" : [5]
                           }
model_validation(DecisionTreeRegressor(), decision_tree_parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:   14.4s finished


 MAE score: 0.186919460287843 R^2 score: 0.5811652363740254 MSE score: 0.06341151698927772
 cross validation score: 0.609741088240149


Random Forest Model

In [23]:
random_forest_parameters = {"randomforestregressor__max_depth" : [3], 
                            "randomforestregressor__min_samples_split" : [2, 3], 
                            "randomforestregressor__max_features" : ["auto", "log2"]}
model_validation(RandomForestRegressor(n_estimators=10, n_jobs=-1), 
                 random_forest_parameters, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    9.2s finished


 MAE score: 0.16142682111624385 R^2 score: 0.6822808859416195 MSE score: 0.04810262363255706
 cross validation score: 0.6212998590295692


In conclusion, XGBoost perform the best based on statistics showed above.

In [26]:
model_pipe = make_pipeline(preprocess, XGBRegressor())
model_grid = GridSearchCV(model_pipe, parameters, 
                                      verbose=1 , scoring = "r2", cv=5)
model_grid.fit(X_train, y_train)
log_predicted_values = model_grid.best_estimator_.fit(X_train, y_train).predict(test)
submission_predictions = np.exp(log_predicted_values)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    5.8s finished
