In [55]:
import numpy as np 
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from pylab import *
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import itertools
from sklearn.metrics import confusion_matrix, r2_score
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor,ExtraTreesRegressor, RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Location of the training CSV, given relative to the current working directory.
path_to_train_data = "./train.csv"

In [2]:
# Load the raw training table from the path configured in the first cell.
df = pd.read_csv(filepath_or_buffer=path_to_train_data)

def cleanup(frame=None):
    """Fill missing values and ordinal-encode the quality-style columns.

    The frame is modified in place (columns are reassigned on the same
    object) and also returned for convenience.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to clean. When omitted, the notebook-level ``df`` is used, so
        the original zero-argument call ``cleanup()`` keeps working.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (or ``df``), after cleaning.

    Raises
    ------
    ValueError
        Raised by ``astype(int)`` when an encoded column still contains a
        label that is absent from its mapping (``map`` yields NaN for it).
    KeyError
        If one of the expected columns is missing from the frame.
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook-level frame.
        frame = df

    # Columns are assigned back with ``frame[col] = ...`` instead of calling
    # ``frame.col.fillna(..., inplace=True)``: the latter is chained in-place
    # assignment, which does not update the parent frame under pandas
    # Copy-on-Write.
    frame["LotFrontage"] = frame["LotFrontage"].fillna(frame["LotFrontage"].median())
    frame["MasVnrArea"] = frame["MasVnrArea"].fillna(0.0)
    frame["Electrical"] = frame["Electrical"].fillna("SBrkr")

    # Missing entries in these columns are replaced by the literal label "None".
    # NOTE(review): GarageYrBlt presumably holds numeric years; filling it with
    # the string "None" turns it into a mixed-type column. Kept as in the
    # original because no later cell reads it, but confirm before using it.
    none_filled_columns = [
        "Alley", "MasVnrType",
        "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
        "FireplaceQu",
        "GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond",
        "PoolQC", "Fence", "MiscFeature",
    ]
    for column in none_filled_columns:
        frame[column] = frame[column].fillna("None")

    # Shared Po..Ex scale; "None" (feature absent) maps to 0.
    quality_scale = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    basement_finish_scale = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3,
                             'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

    # Column -> label-to-integer mapping, in the original encoding order.
    ordinal_maps = {
        'ExterQual': quality_scale,
        'ExterCond': quality_scale,
        'BsmtQual': quality_scale,
        'BsmtCond': quality_scale,
        'HeatingQC': quality_scale,
        'KitchenQual': quality_scale,
        'FireplaceQu': quality_scale,
        'GarageQual': quality_scale,
        'GarageCond': quality_scale,
        'PoolQC': quality_scale,
        'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
        'BsmtFinType1': basement_finish_scale,
        'BsmtFinType2': basement_finish_scale,
        'Functional': {'None': 0, 'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4,
                       'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8},
        'GarageFinish': {'None': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
        'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
    }
    for column, mapping in ordinal_maps.items():
        # astype(int) fails loudly if any label was not covered by the mapping.
        frame[column] = frame[column].map(mapping).astype(int)

    return frame

# Clean the loaded frame, then report its dimensions.
cleanup()
n_houses, n_columns = df.shape
print('Number of houses:', format(n_houses))
# Two columns are left out of the feature count
# (presumably Id and SalePrice — confirm).
print('Number of features:', format(n_columns - 2))



Number of houses: 1460
Number of features: 79


In [3]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,0,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,0,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,0,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,0,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,0,,0,12,2008,WD,Normal,250000


In [4]:
# Predictors used by every model in this notebook; SalePrice is the target.
feature_columns = [
    'OverallQual', 'GrLivArea', 'ExterQual',
    'KitchenQual', 'GarageCars', 'GarageArea',
]
X = df[feature_columns]
y = df['SalePrice']

# Hold out 33% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [5]:
# Fold count handed to cross_val_score through `cv=` in the model cells.
n_folds = 25

## Linear Model

In [56]:
# Fit ordinary least squares on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Test-set predictions; reused later for RMSE and for the averaged ensemble.
predictions = model.predict(X_test)

In [57]:
# Mean cross-validated score of the linear model on the training split.
# With the default scorer this is the estimator's own `score` (R^2 for regressors).
# An integer `cv` requests that many folds.
this_scores = np.mean(cross_val_score(model, X_train, y_train, cv=n_folds))

# Explicit 5-fold splitter. `random_state` is deliberately not passed: it has no
# effect without shuffle=True, and current scikit-learn raises ValueError for that
# combination. The unshuffled folds are the same ones the original call produced.
this_scores2 = np.mean(cross_val_score(model, X_train, y_train, cv=KFold(n_splits=5)))
this_scores2

0.73534964404269643

In [58]:
# Mean score of the `cv=n_folds` run computed in the previous cell.
this_scores

0.73309194944336331

In [18]:
# Test-set RMSE of the linear model's predictions.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))

# Constant-prediction baseline, sized from y_test instead of a hard-coded 482 so
# the cell keeps working when the split size changes.
# NOTE(review): 180921.195890 is presumably a precomputed mean SalePrice — confirm,
# and consider y_train.mean() so the baseline is derived from training data only.
baseline = np.full(len(y_test), 180921.195890)

RMSE_Baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=baseline))

In [19]:
# Fraction by which the model's RMSE undercuts the constant baseline's RMSE.
relative_accuracy = 1 - (RMSE / RMSE_Baseline)
print("Relative Accuracy of our Model: {}".format(relative_accuracy))

Relative Accuracy of our Model: 0.535326174237118


## Bagging

In [95]:
n_folds = 25
bags = BaggingRegressor(n_estimators=1000, random_state=42)

# Mean cross-validated score with `n_folds` folds.
this_scores = np.mean(cross_val_score(bags, X_train, y_train, cv=n_folds))

# Explicit 5-fold splitter. `random_state` is deliberately not passed: it has no
# effect without shuffle=True, and current scikit-learn raises ValueError for that
# combination. The unshuffled folds are the same ones the original call produced.
this_scores2 = np.mean(cross_val_score(bags, X_train, y_train, cv=KFold(n_splits=5)))
this_scores2

0.77328365183939307

In [96]:
# Mean score of the `cv=n_folds` run computed in the previous cell.
this_scores

0.76714904279012752

## ExtraTrees

In [97]:
n_folds = 25
etree = ExtraTreesRegressor(n_estimators=1000, random_state=42)

# Mean cross-validated score with `n_folds` folds.
this_scores = np.mean(cross_val_score(etree, X_train, y_train, cv=n_folds))

# Explicit 5-fold splitter. `random_state` is deliberately not passed: it has no
# effect without shuffle=True, and current scikit-learn raises ValueError for that
# combination. The unshuffled folds are the same ones the original call produced.
this_scores2 = np.mean(cross_val_score(etree, X_train, y_train, cv=KFold(n_splits=5)))
this_scores2

0.76661874697285004

In [98]:
# Mean score of the `cv=n_folds` run computed in the previous cell.
this_scores

0.75287272125813065

## GradientBoosting

In [99]:
n_folds = 25
# Despite its name, `xgbooster` is scikit-learn's GradientBoostingRegressor.
xgbooster = GradientBoostingRegressor(n_estimators=1000, random_state=42)

# Mean cross-validated score with `n_folds` folds.
this_scores = np.mean(cross_val_score(xgbooster, X_train, y_train, cv=n_folds))

# Explicit 5-fold splitter. `random_state` is deliberately not passed: it has no
# effect without shuffle=True, and current scikit-learn raises ValueError for that
# combination. The unshuffled folds are the same ones the original call produced.
this_scores2 = np.mean(cross_val_score(xgbooster, X_train, y_train, cv=KFold(n_splits=5)))
this_scores2

0.74143594420045811

In [100]:
# Mean score of the `cv=n_folds` run computed in the previous cell.
this_scores

0.73418670102185291

## RandomForest

In [101]:
n_folds = 25
# NOTE(review): 100 trees here, while the other ensembles use 1000 — confirm intentional.
randforest = RandomForestRegressor(n_estimators=100, random_state=42)

# Mean cross-validated score with `n_folds` folds.
this_scores = np.mean(cross_val_score(randforest, X_train, y_train, cv=n_folds))

# Explicit 5-fold splitter. `random_state` is deliberately not passed: it has no
# effect without shuffle=True, and current scikit-learn raises ValueError for that
# combination. The unshuffled folds are the same ones the original call produced.
this_scores2 = np.mean(cross_val_score(randforest, X_train, y_train, cv=KFold(n_splits=5)))
this_scores2

0.76818044958969856

In [102]:
# Mean score of the `cv=n_folds` run computed in the previous cell.
this_scores

0.76378119704064029

## AdaBoost 

In [103]:
n_folds = 25
aboost = AdaBoostRegressor(n_estimators=1000, random_state=42)

# Mean cross-validated score with `n_folds` folds.
this_scores = np.mean(cross_val_score(aboost, X_train, y_train, cv=n_folds))

# Explicit 5-fold splitter. `random_state` is deliberately not passed: it has no
# effect without shuffle=True, and current scikit-learn raises ValueError for that
# combination. The unshuffled folds are the same ones the original call produced.
this_scores2 = np.mean(cross_val_score(aboost, X_train, y_train, cv=KFold(n_splits=5)))
this_scores2

0.69011628245890544

In [104]:
# Mean score of the `cv=n_folds` run computed in the previous cell.
this_scores

0.6752436062569942

## Cross Validation

Two cross-validation configurations were used above, and both are k-fold: passing the integer `n_folds` as `cv` (25 folds) and passing an explicit 5-fold `KFold` splitter. The rest of this section discusses k-fold cross-validation.

First, it matters less how the data is divided, because every sample is used exactly once for testing and k-1 times for training. As k increases, each training set gets closer to the full dataset, so the performance estimate becomes less pessimistically biased; however, the training sets overlap more, which tends to increase the variance of the estimate. Another drawback is that the proportion of the train-test split depends on the number of folds (each test fold holds 1/k of the data). And, obviously, a clear disadvantage is that the train-test procedure has to be repeated k times.

Finally, note that the k-fold method is also non-exhaustive: although the train-test procedure is run several times, it does not explore all possible ways of splitting the data.

# Stacking!

In [70]:
# Fit each ensemble on the full training split. The estimators are fitted in
# place, so the names keep pointing at the same (now fitted) objects.
for ensemble_member in (bags, etree, xgbooster, randforest, aboost):
    ensemble_member.fit(X_train, y_train)

In [71]:
# Test-set predictions of the bagging ensemble.
pred1 = bags.predict(X=X_test)

In [72]:
# Test-set RMSE of the bagging ensemble's predictions.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred1))

# Constant-prediction baseline, sized from y_test instead of a hard-coded 482 so
# the cell keeps working when the split size changes.
# NOTE(review): 180921.195890 is presumably a precomputed mean SalePrice — confirm,
# and consider y_train.mean() so the baseline is derived from training data only.
baseline = np.full(len(y_test), 180921.195890)

RMSE_Baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=baseline))

In [73]:
# R^2 of the bagging ensemble on the held-out test split.
r2_score(y_true=y_test, y_pred=pred1)

0.77457719632136612

In [74]:
# Fraction by which the model's RMSE undercuts the constant baseline's RMSE.
relative_accuracy = 1 - (RMSE / RMSE_Baseline)
print("Relative Accuracy of our Model: {}".format(relative_accuracy))

Relative Accuracy of our Model: 0.5252149914146144


In [75]:
# Test-set predictions of the extra-trees ensemble.
pred2 = etree.predict(X=X_test)

In [76]:
# Test-set RMSE of the extra-trees ensemble's predictions.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred2))

# Constant-prediction baseline, sized from y_test instead of a hard-coded 482 so
# the cell keeps working when the split size changes.
# NOTE(review): 180921.195890 is presumably a precomputed mean SalePrice — confirm,
# and consider y_train.mean() so the baseline is derived from training data only.
baseline = np.full(len(y_test), 180921.195890)

RMSE_Baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=baseline))

In [77]:
# R^2 of the extra-trees ensemble on the held-out test split.
r2_score(y_true=y_test, y_pred=pred2)

0.7555320756751035

In [78]:
# Fraction by which the model's RMSE undercuts the constant baseline's RMSE.
relative_accuracy = 1 - (RMSE / RMSE_Baseline)
print("Relative Accuracy of our Model: {}".format(relative_accuracy))

Relative Accuracy of our Model: 0.5055652155134258


In [79]:
# Test-set predictions of the gradient-boosting model.
pred3 = xgbooster.predict(X=X_test)

In [80]:
# Test-set RMSE of the gradient-boosting model's predictions.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred3))

# Constant-prediction baseline, sized from y_test instead of a hard-coded 482 so
# the cell keeps working when the split size changes.
# NOTE(review): 180921.195890 is presumably a precomputed mean SalePrice — confirm,
# and consider y_train.mean() so the baseline is derived from training data only.
baseline = np.full(len(y_test), 180921.195890)

RMSE_Baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=baseline))

In [81]:
# R^2 of the gradient-boosting model on the held-out test split.
r2_score(y_true=y_test, y_pred=pred3)

0.71670525699342158

In [82]:
# Fraction by which the model's RMSE undercuts the constant baseline's RMSE.
relative_accuracy = 1 - (RMSE / RMSE_Baseline)
print("Relative Accuracy of our Model: {}".format(relative_accuracy))

Relative Accuracy of our Model: 0.4677479634305085


In [83]:
# Test-set predictions of the random forest.
pred4 = randforest.predict(X=X_test)

In [84]:
# Test-set RMSE of the random forest's predictions.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred4))

# Constant-prediction baseline, sized from y_test instead of a hard-coded 482 so
# the cell keeps working when the split size changes.
# NOTE(review): 180921.195890 is presumably a precomputed mean SalePrice — confirm,
# and consider y_train.mean() so the baseline is derived from training data only.
baseline = np.full(len(y_test), 180921.195890)

RMSE_Baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=baseline))

In [85]:
# R^2 of the random forest on the held-out test split.
r2_score(y_true=y_test, y_pred=pred4)

0.76523089085999718

In [86]:
# Fraction by which the model's RMSE undercuts the constant baseline's RMSE.
relative_accuracy = 1 - (RMSE / RMSE_Baseline)
print("Relative Accuracy of our Model: {}".format(relative_accuracy))

Relative Accuracy of our Model: 0.5154723672015006


In [87]:
# Test-set predictions of the AdaBoost model.
pred5 = aboost.predict(X=X_test)

In [88]:
# Test-set RMSE of the AdaBoost model's predictions.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=pred5))

# Constant-prediction baseline, sized from y_test instead of a hard-coded 482 so
# the cell keeps working when the split size changes.
# NOTE(review): 180921.195890 is presumably a precomputed mean SalePrice — confirm,
# and consider y_train.mean() so the baseline is derived from training data only.
baseline = np.full(len(y_test), 180921.195890)

RMSE_Baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=baseline))

In [89]:
# R^2 of the AdaBoost model on the held-out test split.
r2_score(y_true=y_test, y_pred=pred5)

0.66313181156421397

In [90]:
# Fraction by which the model's RMSE undercuts the constant baseline's RMSE.
relative_accuracy = 1 - (RMSE / RMSE_Baseline)
print("Relative Accuracy of our Model: {}".format(relative_accuracy))

Relative Accuracy of our Model: 0.4195991034531167


## Stacked

In [91]:
# Unweighted mean of the six models' test-set predictions (five ensembles plus
# the linear model). This is plain averaging; no meta-model is trained.
# Accumulated with `+` rather than `sum(...)`, because `from pylab import *`
# replaces the builtin `sum` with numpy's, which would collapse the list.
member_predictions = (pred1, pred2, pred3, pred4, pred5, predictions)
Stacked = member_predictions[0]
for member in member_predictions[1:]:
    Stacked = Stacked + member
Stacked = Stacked / 6

In [92]:
# Test-set RMSE of the averaged predictions.
RMSE = np.sqrt(mean_squared_error(y_true=y_test, y_pred=Stacked))

# Constant-prediction baseline, sized from y_test instead of a hard-coded 482 so
# the cell keeps working when the split size changes.
# NOTE(review): 180921.195890 is presumably a precomputed mean SalePrice — confirm,
# and consider y_train.mean() so the baseline is derived from training data only.
baseline = np.full(len(y_test), 180921.195890)

RMSE_Baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=baseline))

In [93]:
# Percentage by which the averaged model's RMSE undercuts the baseline's RMSE.
relative_accuracy_pct = 100 * (1 - (RMSE / RMSE_Baseline))
print("Relative Accuracy of our Model: {}%".format(relative_accuracy_pct))

Relative Accuracy of our Model: 52.15385868961666%


In [94]:
# R^2 of the averaged predictions on the held-out test split.
r2_score(y_true=y_test, y_pred=Stacked)

0.77107264578720514