In [184]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from scipy.stats import skew
from sklearn import preprocessing
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import metrics
from xgboost import XGBRegressor

# Read Testing and Training Data

In [185]:
# Directory that holds train.csv and test.csv. Kept in a single constant so the
# location only has to be edited in one place when the notebook is moved.
DATA_DIR = "/Users/christopherkim/Desktop/Housing Price Machine Learning/house-prices-advanced-regression-techniques"

housing = pd.read_csv(f"{DATA_DIR}/train.csv")
housingTest = pd.read_csv(f"{DATA_DIR}/test.csv")

# Untouched copy of the test frame (its "Id" column is needed for the
# submission). Copying the frame that was just loaded avoids parsing test.csv
# a second time and yields the same content.
housingTestID = housingTest.copy()

# Preprocessing/Data Cleaning Training Data

In [186]:
# --- Missing values ---------------------------------------------------------
# "Electrical" is filled with "SBrkr" first so the generic fill below does not
# turn its gaps into "NA".
housing["Electrical"] = housing["Electrical"].fillna("SBrkr")

# Generic fill: columns whose dtype kind is bool/int/uint/float/complex get 0,
# every other column gets the literal string "NA".
housing = housing.apply(
    lambda col: col.fillna(0 if col.dtype.kind in "biufc" else "NA")
)

# Manual overrides for two individual rows (row labels of the loaded frame).
housing.loc[946, "BsmtExposure"] = "No"
housing.loc[332, "BsmtFinType2"] = "Unf"

# --- Engineered features ----------------------------------------------------
year_built = housing["YearBuilt"]
basement_sf = housing["TotalBsmtSF"]

# NOTE(review): this flag is 1 when YearBuilt == YearRemodAdd, which by its
# name looks inverted; the test-set cell builds it the same way, so the two
# stay consistent - confirm the intended meaning before renaming/flipping.
housing["Remod_Yes"] = (year_built == housing["YearRemodAdd"]).astype(int)
housing["House_Age"] = 2010 - year_built
housing["TotalSF"] = housing["GrLivArea"] + basement_sf
housing["Bsmt_Yes"] = (basement_sf > 0).astype(int)
housing["2ndFl_Yes"] = (housing["2ndFlrSF"] > 0).astype(int)
housing["Garage_Yes"] = (housing["GarageArea"] > 0).astype(int)

# Combined area of the four porch columns (no gaps remain after the fill above).
porch_columns = ["OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]
housing["TotalPorchSF"] = housing[porch_columns].sum(axis=1)
housing["Porch_Yes"] = (housing["TotalPorchSF"] > 0).astype(int)
housing["Deck_Yes"] = (housing["WoodDeckSF"] > 0).astype(int)

# Full baths count as 1, half baths as 0.5.
housing["Baths_Total"] = (
    housing["BsmtFullBath"]
    + housing["FullBath"]
    + 0.5 * (housing["BsmtHalfBath"] + housing["HalfBath"])
)

# --- Row / column removal ---------------------------------------------------
# Drop four hand-picked rows, renumber the index, and discard the "Id" column.
housing = (
    housing
    .drop(index=[523, 691, 1182, 1298])
    .reset_index(drop=True)
    .drop(columns=["Id"])
)

In [187]:
# Column groups used to split the training frame by variable type.
ordinal_cols = [
    "OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
    "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "HeatingQC", "KitchenQual", "FireplaceQu", "GarageFinish",
    "GarageQual", "GarageCond", "PoolQC", "Fence",
    "Electrical", "Functional", "LandSlope", "PavedDrive",
    "LotShape", "LandContour",
]

nominal_cols = [
    "MSSubClass", "MSZoning", "Street", "Alley", "Utilities", "LotConfig",
    "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature",
    "SaleType", "SaleCondition",
]

# The target "SalePrice" travels with the continuous group.
continuous_cols = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "LowQualFinSF", "TotalPorchSF",
    "GarageArea", "WoodDeckSF", "PoolArea", "MiscVal", "SalePrice",
    "TotalSF", "GrLivArea", "1stFlrSF", "2ndFlrSF",
    "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
    "ScreenPorch", "TotalBsmtSF",
]

binary_cols = ["Remod_Yes", "Bsmt_Yes", "2ndFl_Yes", "Porch_Yes", "Deck_Yes", "Garage_Yes"]

housingOrdinal = housing[ordinal_cols].copy()
housingNominal = housing[nominal_cols].copy()
housingContinuous = housing[continuous_cols].copy()

# "Discrete" is whatever is left once the four named groups are removed.
housingDiscrete = housing.drop(
    columns=continuous_cols + ordinal_cols + nominal_cols + binary_cols
).copy()

housingBinary = housing[binary_cols].copy()

In [188]:
# MSSubClass holds numeric codes; casting it to object makes get_dummies
# one-hot encode it together with the other nominal columns.
housingNominal = housingNominal.astype({"MSSubClass": "object"})
dummy = pd.get_dummies(housingNominal).copy()

# Each ordinal column is replaced by integer codes. LabelEncoder is refit per
# column and numbers the labels in sorted order.
ordinal_encoder = preprocessing.LabelEncoder()
housingOrdinal = housingOrdinal.apply(ordinal_encoder.fit_transform)

In [189]:
# Skewness of every continuous column (this group includes SalePrice).
column_skewness = housingContinuous.apply(skew)

# Names of the columns whose absolute skewness exceeds 0.5 ...
skewed_var = column_skewness[column_skewness.abs() > 0.5].index

# ... which are replaced by log(1 + x).
housingContinuous[skewed_var] = np.log1p(housingContinuous[skewed_var])

In [190]:
# Stitch the per-type frames back together side by side.
training_parts = [housingContinuous, housingDiscrete, housingOrdinal, housingBinary, dummy]
housing1 = pd.concat(training_parts, axis="columns").copy()

In [191]:
# Target as a one-column DataFrame; everything else is the feature matrix.
Y = housing1[["SalePrice"]].copy()
X = housing1.drop(columns="SalePrice").copy()

# Preprocessing/Data Cleaning Test Data

In [192]:
# Columns whose gaps are replaced by the literal string "NA".
imputeNA = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"]
for na_column in imputeNA:
    housingTest[na_column] = housingTest[na_column].fillna("NA")

In [193]:
# Missing total basement area is treated as "no basement".
housingTest["TotalBsmtSF"] = housingTest["TotalBsmtSF"].fillna(0)

# Zero the three basement area columns of row 660. `.loc` is used because
# `.at` only accepts a single row/column label pair, not a list of columns.
housingTest.loc[660, ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF"]] = 0

basements = ["TotalBsmtSF","BsmtUnfSF", "BsmtFinSF2", "BsmtFinSF1","BsmtCond","BsmtQual","BsmtExposure","BsmtFinType1","BsmtFinType2"]
nobasements = 0

# For rows without a basement, remaining gaps in the basement columns become "NA".
indices = housingTest["TotalBsmtSF"] == nobasements
housingTest.loc[indices, basements] = housingTest.loc[indices, basements].fillna("NA")

In [194]:
garagesInt = ["GarageCars", "GarageArea", "GarageYrBlt"]
garagesOrd = ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]
nogarages = 0

# Rows whose garage area equals zero.
indices = housingTest["GarageArea"].eq(nogarages)

# In those rows, gaps in the numeric garage columns become 0 and gaps in the
# categorical garage columns become "NA".
for garage_columns, garage_fill in ((garagesInt, 0), (garagesOrd, "NA")):
    housingTest.loc[indices, garage_columns] = (
        housingTest.loc[indices, garage_columns].fillna(garage_fill)
    )

In [195]:
# Hand-assigned MSZoning values for individual rows. `.loc` is required for the
# multi-row assignments: `.at` accepts exactly one row label and one column label.
housingTest.loc[[455, 756, 790], "MSZoning"] = "RM"
housingTest.loc[1444, "MSZoning"] = "RL"

# Missing basement bath counts are treated as zero.
housingTest["BsmtFullBath"] = housingTest["BsmtFullBath"].fillna(0)
housingTest["BsmtHalfBath"] = housingTest["BsmtHalfBath"].fillna(0)

# Hand-assigned garage build year for two rows.
housingTest.loc[[666, 1116], "GarageYrBlt"] = 1978

In [196]:
# Remaining numeric gaps: fixed values for the garage columns and the column
# mean for LotFrontage.
# NOTE(review): 473 and 2 are hard-coded; presumably typical garage values -
# confirm where they come from.
housingTest = housingTest.fillna({
    "GarageArea": 473,
    "GarageCars": 2,
    "LotFrontage": housingTest["LotFrontage"].mean(),
})

In [197]:
def _fill_with_most_frequent(column):
    """Fill the gaps of ``column`` with the first entry of its ``value_counts()``."""
    most_frequent = column.value_counts().index[0]
    return column.fillna(most_frequent)


# Every column that still has gaps gets its most frequent value.
housingTest = housingTest.apply(_fill_with_most_frequent)

In [198]:
# Same engineered features as the training cell builds, applied to the test frame.
test_year_built = housingTest["YearBuilt"]
test_basement_sf = housingTest["TotalBsmtSF"]

# NOTE(review): 1 when YearBuilt == YearRemodAdd, mirroring the training cell.
housingTest["Remod_Yes"] = (test_year_built == housingTest["YearRemodAdd"]).astype(int)
housingTest["House_Age"] = 2010 - test_year_built
housingTest["TotalSF"] = housingTest["GrLivArea"] + test_basement_sf
housingTest["Bsmt_Yes"] = (test_basement_sf > 0).astype(int)
housingTest["2ndFl_Yes"] = (housingTest["2ndFlrSF"] > 0).astype(int)
housingTest["Garage_Yes"] = (housingTest["GarageArea"] > 0).astype(int)

# Combined area of the four porch columns.
housingTest["TotalPorchSF"] = (
    housingTest["OpenPorchSF"]
    + housingTest["EnclosedPorch"]
    + housingTest["3SsnPorch"]
    + housingTest["ScreenPorch"]
)
housingTest["Porch_Yes"] = (housingTest["TotalPorchSF"] > 0).astype(int)
housingTest["Deck_Yes"] = (housingTest["WoodDeckSF"] > 0).astype(int)

# Full baths count as 1, half baths as 0.5.
housingTest["Baths_Total"] = (
    housingTest["BsmtFullBath"]
    + housingTest["FullBath"]
    + 0.5 * (housingTest["BsmtHalfBath"] + housingTest["HalfBath"])
)

# "Id" is not a feature.
housingTest = housingTest.drop(columns="Id")

In [199]:
# Column groups used to split the test frame by variable type.
ordinal_cols_test = [
    "OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
    "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "HeatingQC", "KitchenQual", "FireplaceQu", "GarageFinish",
    "GarageQual", "GarageCond", "PoolQC", "Fence",
    "Electrical", "Functional", "LandSlope", "PavedDrive",
    "LotShape", "LandContour",
]

nominal_cols_test = [
    "MSSubClass", "MSZoning", "Street", "Alley", "Utilities", "LotConfig",
    "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature",
    "SaleType", "SaleCondition",
]

# No "SalePrice" here: the test frame has no target column in this list.
continuous_cols_test = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "LowQualFinSF", "TotalPorchSF",
    "GarageArea", "WoodDeckSF", "PoolArea", "MiscVal",
    "TotalSF", "GrLivArea", "1stFlrSF", "2ndFlrSF",
    "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
    "ScreenPorch", "TotalBsmtSF",
]

binary_cols_test = ["Remod_Yes", "Bsmt_Yes", "2ndFl_Yes", "Porch_Yes", "Deck_Yes", "Garage_Yes"]

housingOrdinalTest = housingTest[ordinal_cols_test].copy()
housingNominalTest = housingTest[nominal_cols_test].copy()
housingContinuousTest = housingTest[continuous_cols_test].copy()

# "Discrete" is whatever is left once the four named groups are removed.
housingDiscreteTest = housingTest.drop(
    columns=continuous_cols_test + ordinal_cols_test + nominal_cols_test + binary_cols_test
).copy()

housingBinaryTest = housingTest[binary_cols_test].copy()

In [200]:
# MSSubClass holds numeric codes; casting it to object makes get_dummies
# one-hot encode it together with the other nominal columns.
housingNominalTest["MSSubClass"] = housingNominalTest["MSSubClass"].astype("object")
dummyTest = pd.get_dummies(housingNominalTest).copy()


def _encode_with_training_labels(test_column):
    """Encode one ordinal test column with the integer codes used for training.

    The training cell label-encodes each ordinal column of the cleaned
    ``housing`` frame, which numbers that column's distinct values in sorted
    order. Fitting a fresh encoder on the test column would number the *test*
    values instead, so the same label could receive different integers in the
    two sets whenever a label is absent from one of them. Here the code is
    always the label's position among the sorted training labels.

    Labels that never occur in the training column are encoded as -1.
    """
    training_labels = sorted(housing[test_column.name].unique())
    codes = pd.Categorical(test_column, categories=training_labels).codes
    return pd.Series(codes, index=test_column.index).astype("int64")


housingOrdinalTest = housingOrdinalTest.apply(_encode_with_training_labels)

In [201]:
# Apply log(1 + x) to exactly the columns that were log-transformed in the
# training data (`skewed_var`), restricted to those present in the test frame
# (this leaves out the target). Re-deriving the skewed set from the test data
# could transform a feature in one set but not in the other, putting the same
# column on different scales for fitting and for prediction.
skewed_varT = skewed_var.intersection(housingContinuousTest.columns)
housingContinuousTest[skewed_varT] = np.log1p(housingContinuousTest[skewed_varT])

In [202]:
# Stitch the per-type test frames back together side by side.
test_parts = [housingContinuousTest, housingDiscreteTest, housingOrdinalTest, housingBinaryTest, dummyTest]
Test_X = pd.concat(test_parts, axis="columns").copy()

In [208]:
# Give both matrices the same set of columns; a column that exists in only one
# of them is created in the other and filled with 0.
aligned_test, _ = Test_X.align(X, axis=1)
Test_X = aligned_test.fillna(0)

aligned_train, _ = X.align(Test_X, axis=1)
X = aligned_train.fillna(0)

# Training Lasso Model 

In [209]:
# Standardise every training feature.
scaler = StandardScaler()
scaler.fit(X)
X1 = scaler.transform(X).copy()

In [210]:
# Lasso with its regularisation strength chosen by 10-fold cross-validation.
lassoNCV = LassoCV(alphas=None, cv=10, max_iter=10000)
lassoNCV.fit(X1, Y.values.ravel())

# Per-fold RMSE: the scorer returns negative MSE, so flip the sign before the root.
lasso_fold_mse = -cross_val_score(
    lassoNCV, X1, Y.values.ravel(), scoring='neg_mean_squared_error', cv=10
)
scoreLCV = np.sqrt(lasso_fold_mse)

print("RMSE from 10 Folds: ",scoreLCV,"\nRMSE mean: ",scoreLCV.mean())

RMSE from 10 Folds:  [0.10675396 0.0910505  0.10535142 0.11933677 0.14363219 0.09878774
 0.11854999 0.1000029  0.092194   0.12022792] 
RMSE mean:  0.10958873801016504


In [211]:
# Regularisation strength selected by the cross-validated Lasso fit.
lassoNCV.alpha_

0.0028212135387946985

In [212]:
# Coefficients labelled by feature name; keep the features Lasso did not zero out.
lassoCoef = pd.Series(lassoNCV.coef_, index=X.columns)
featLasso = lassoCoef.loc[lassoCoef.ne(0)].index

In [214]:
# Refit the scaler on the Lasso-selected features only; the refitted `scaler`
# is the one applied to the test features later on.
selected_training_features = X[featLasso].copy()
X2 = scaler.fit_transform(selected_training_features)

In [215]:
# Second Lasso, fitted on the selected features only.
lassoNCV1 = LassoCV(alphas=None, cv=10, max_iter=10000)
lassoNCV1.fit(X2, Y.values.ravel())

# Per-fold RMSE: the scorer returns negative MSE, so flip the sign before the root.
lasso1_fold_mse = -cross_val_score(
    lassoNCV1, X2, Y.values.ravel(), scoring='neg_mean_squared_error', cv=10
)
scoreLCV1 = np.sqrt(lasso1_fold_mse)

print("RMSE from 10 Folds: ",scoreLCV1,"\nRMSE mean: ",scoreLCV1.mean())

RMSE from 10 Folds:  [0.1031369  0.09205337 0.10129594 0.11751705 0.13837933 0.09701912
 0.11666029 0.0990787  0.08926642 0.11823225] 
RMSE mean:  0.10726393650127071


In [216]:
# Regularisation strength selected by the second cross-validated Lasso fit.
lassoNCV1.alpha_

0.0006517369186246348

In [217]:
# Coefficients of the second Lasso and the features it kept.
# NOTE(review): `featLasso1` is not used by any later cell shown here.
lassoCoef1 = pd.Series(lassoNCV1.coef_, index=featLasso)
featLasso1 = lassoCoef1.loc[lassoCoef1.ne(0)].index

# Predicting with Lasso Model

In [218]:
# Test rows restricted to the Lasso-selected feature columns.
X3 = Test_X.loc[:, featLasso].copy()

In [219]:
# Scale the test features with the scaler fitted on the selected training features.
# Overwrites X3, so this cell is not idempotent: run it once after the cell above.
X3 = scaler.transform(X3)

In [220]:
# Lasso predictions for the test rows (on the same scale as the training target Y).
predicted_pricesLasso = lassoNCV1.predict(X3)

# Training the Ridge Model

In [223]:
# Robust scaling of the Lasso-selected features.
rscaler = RobustScaler()

X4 = rscaler.fit_transform(X[featLasso]).copy()

# Candidate regularisation strengths for RidgeCV. No earlier cell defines
# `alphas`, so it is defined here to keep the notebook runnable from a fresh
# kernel.
# NOTE(review): the grid used for the printed results is unknown; this
# log-spaced grid is a replacement - adjust it if the original values are known.
alphas = np.logspace(-3, 3, 50)

ridgeRNCV = RidgeCV(alphas=alphas, cv=10, scoring="neg_mean_squared_error").fit(X4, Y)

# Per-fold RMSE: the scorer returns negative MSE, so flip the sign before the root.
scoreRRCV = np.sqrt(-cross_val_score(ridgeRNCV, X4, Y, scoring='neg_mean_squared_error', cv=10))

print("RMSE from 10 Folds: ",scoreRRCV,"\nRMSE mean: ",scoreRRCV.mean())

RMSE from 10 Folds:  [0.10451606 0.09227485 0.09698209 0.11800139 0.13530308 0.09526915
 0.11160016 0.09506654 0.08850884 0.11516252] 
RMSE mean:  0.10526846807183762


# Predicting with Ridge Model

In [226]:
# Robust-scale the selected test features with the scaler fitted on training
# data, then predict with the Ridge model.
selected_test_features = Test_X[featLasso]
X5 = rscaler.transform(selected_test_features).copy()
predicted_pricesRidge = ridgeRNCV.predict(X5)

# Training the ElasticNet Model 

In [225]:
# ElasticNet on the standard-scaled selected features; l1_ratio and alpha are
# both chosen by 10-fold cross-validation.
# NOTE(review): no later cell shown here predicts with `enetFNCV`.
enetFNCV = ElasticNetCV(l1_ratio=np.linspace(0.01, 1), alphas=None, cv=10)
enetFNCV.fit(X2, Y.values.ravel())

# Per-fold RMSE: the scorer returns negative MSE, so flip the sign before the root.
enet_fold_mse = -cross_val_score(
    enetFNCV, X2, Y.values.ravel(), scoring='neg_mean_squared_error', cv=10
)
scoreFECV = np.sqrt(enet_fold_mse)

print("RMSE from 10 Folds: ",scoreFECV,"\nRMSE mean: ",scoreFECV.mean())

RMSE from 10 Folds:  [0.09940281 0.09251575 0.10264105 0.11851769 0.13599476 0.09449991
 0.11520254 0.10014577 0.08956788 0.11769558] 
RMSE mean:  0.10661837358805966


# Training ElasticNet on Robust-Scaled Features and Predicting

In [227]:
# ElasticNet on the robust-scaled selected features; l1_ratio and alpha are
# both chosen by 10-fold cross-validation.
enetFNCV1 = ElasticNetCV(l1_ratio=np.linspace(0.01, 1), alphas=None, cv=10)
enetFNCV1.fit(X4, Y.values.ravel())

# Per-fold RMSE: the scorer returns negative MSE, so flip the sign before the root.
enet1_fold_mse = -cross_val_score(
    enetFNCV1, X4, Y.values.ravel(), scoring='neg_mean_squared_error', cv=10
)
scoreFECV1 = np.sqrt(enet1_fold_mse)

print("RMSE from 10 Folds: ",scoreFECV1,"\nRMSE mean: ",scoreFECV1.mean())

RMSE from 10 Folds:  [0.10603704 0.09170937 0.09843715 0.11762266 0.13873392 0.09693277
 0.11271751 0.09523255 0.09068335 0.11689965] 
RMSE mean:  0.10650059621313386


In [228]:
# ElasticNet predictions on the robust-scaled test features (X5).
predicted_pricesENet = enetFNCV1.predict(X5)

# Training the XGBoost model

In [278]:
# Training matrix for XGBoost. No earlier cell defines `X6`, so it is built
# here from the unscaled Lasso-selected columns - the same columns the
# prediction cell takes from Test_X - to keep the notebook runnable from a
# fresh kernel.
X6 = X[featLasso].copy()

xgbModel = XGBRegressor(
    learning_rate=0.07,
    n_estimators=500,
    max_depth=2,
    min_child_weight=7,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    reg_lambda=1,
    reg_alpha=0.0009
)
xgbModel.fit(X6, Y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.07, max_delta_step=0,
             max_depth=2, min_child_weight=7, missing=None, n_estimators=500,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0.0009, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

# Predicting with XGBoost Model

In [279]:
# XGBoost predictions on the unscaled Lasso-selected test features.
predicted_pricesXGB = xgbModel.predict(Test_X[featLasso])

# Final Predictions

In [286]:
# Simple ensemble: unweighted mean of the four models' predictions per test row.
# The Ridge output is flattened so that all four arrays are one-dimensional.
model_predictions = [
    predicted_pricesXGB,
    predicted_pricesENet,
    predicted_pricesLasso,
    predicted_pricesRidge.flatten(),
]
predictionsMean = np.mean(model_predictions, axis=0)

In [316]:
# Undo the target transform. The skewed continuous columns were transformed
# with np.log1p, i.e. log(1 + price) - this assumes SalePrice was among them,
# as the log-scale RMSE values printed above indicate. The exact inverse is
# np.expm1; plain np.exp would overstate every price by 1.
my_submission = pd.DataFrame({"Id": housingTestID.Id, "SalePrice": np.expm1(predictionsMean)})

In [317]:
# Write the submission to the current working directory, without the index column.
my_submission.to_csv("submission.csv", index=False)