In [1]:
from IPython.display import display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pprint
%load_ext autoreload
%autoreload 2
pd.options.display.max_columns = 300
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV, BayesianRidge
import xgboost as xgb
import re
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
# Choose model. MLR, Ridge, Lasso, ElasticNet, RandomForest, XGBoost
model_list = ['MLR']

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
print("Train data shape:", train.shape)
print("Test data shape:", test.shape)

Train data shape: (1460, 81)
Test data shape: (1459, 80)


In [5]:
# Remove outliers
train = train[train['GrLivArea']<4000]

# Estimate price per SF of Living Area. Remove the high and low end outliers 
# (This did not give better results but still can be used to provide a story for outlier detection and removal part)
#myarr = (train['SalePrice'] / train['GrLivArea']).sort_values(ascending=False)
#drop_index_high = myarr[myarr>250].index
#drop_index_low = myarr[myarr<40].index
#train.drop(drop_index_low, inplace=True)
#train.drop(drop_index_high, inplace=True)

In [6]:
# Log transform target
train['SalePrice'] = np.log(train['SalePrice'])

In [7]:
df_concat = pd.concat([train, test], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [8]:
# Drop columns
# Bsmt
BsmtSumSF = df_concat[['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF']].sum(axis=1)
print((df_concat['TotalBsmtSF'] == BsmtSumSF).all())
df_concat.drop(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'], axis=1, inplace=True)

house_sqft = df_concat[['1stFlrSF','2ndFlrSF','LowQualFinSF']].sum(axis=1)
print((house_sqft == df_concat['GrLivArea']).all())
df_concat.drop(['1stFlrSF','2ndFlrSF','LowQualFinSF'], axis=1, inplace=True)

df_concat['TotalPorchSF'] = df_concat[['OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','WoodDeckSF']].sum(axis=1)
df_concat.drop(['OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','WoodDeckSF'], axis=1, inplace=True)

# MiscVal holds no value. Drop!
df_concat.drop(['MiscVal'], axis=1, inplace=True)

False
True


Note that some numerical variables are not continuous. They must be regarded as categorical variables:

- MSSubClass
- OverallQual
- OverallCond
- BsmtFullBath	
- BsmtHalfBath	
- FullBath
- HalfBath
- BedroomAbvGr
- KitchenAbvGr
- TotRmsAbvGrd
- Fireplaces
- GarageCars
- YrSold

Let's add them to the **categorical_features** dataframe, remove from the **numerical_features** dataframe and see the distributions of classes for each variable.

Below columns need special attention. I will first bin them and then add to the categorical variables.
- MoSold
- YearBuilt
- YearRemodAdd
- GarageYrBlt

In [9]:
# In test dataset GarageYrBlt has a column with value of 2207. Update it to 2010 which is the max after 2207.
df_concat.loc[df_concat['GarageYrBlt'] == 2207,'GarageYrBlt'] = 2010

In [10]:
# Houses without a garage have NAs. Fill them with year 0. Then, bin.
df_concat['GarageYrBlt'].fillna(0, inplace = True)

In [11]:
# GarageCars, BsmtFullBath, BsmtHalfBath column has very few nulls. Impute them with median before casting to category.
med_GarageCars = df_concat['GarageCars'].median()
df_concat['GarageCars'].fillna(med_GarageCars, inplace = True)

med_BsmtFullBath = df_concat['BsmtFullBath'].median()
df_concat['BsmtFullBath'].fillna(med_BsmtFullBath, inplace = True)

med_BsmtHalfBath = df_concat['BsmtHalfBath'].median()
df_concat['BsmtHalfBath'].fillna(med_BsmtHalfBath, inplace = True)

In [12]:
# Fill NA randomly using non-NA values of that column
def fill_with_random(df2, column):
    df = df2.copy()
    df[column] = df[column].apply(lambda x: np.random.choice(df[column].dropna().values) if np.isnan(x) else x)
    return df

#med_GarageArea = df_concat['GarageArea'].median()
#med_TotalBsmtSF = df_concat['TotalBsmtSF'].median()
#med_MasVnrArea = df_concat['MasVnrArea'].median()
#med_LotFrontage = df_concat['LotFrontage'].median()
#df_concat['GarageArea'].fillna(med_GarageArea, inplace = True)
#df_concat['TotalBsmtSF'].fillna(med_TotalBsmtSF, inplace = True)
#df_concat['MasVnrArea'].fillna(med_MasVnrArea, inplace = True)
#df_concat['LotFrontage'].fillna(med_LotFrontage, inplace = True)

df_concat = fill_with_random(df_concat, 'GarageArea')
df_concat = fill_with_random(df_concat, 'TotalBsmtSF')
df_concat = fill_with_random(df_concat, 'MasVnrArea')
df_concat = fill_with_random(df_concat, 'LotFrontage')

# Extra null columns in the df_concat dataset. Impute with the most common class.
df_concat['KitchenQual'].fillna(df_concat['KitchenQual'].value_counts().index[0], inplace = True)
df_concat['Utilities'].fillna(df_concat['Utilities'].value_counts().index[0], inplace = True)
df_concat['Functional'].fillna(df_concat['Functional'].value_counts().index[0], inplace = True)
df_concat['SaleType'].fillna(df_concat['SaleType'].value_counts().index[0], inplace = True)
df_concat['Exterior1st'].fillna(df_concat['Exterior1st'].value_counts().index[0], inplace = True)
df_concat['Exterior2nd'].fillna(df_concat['Exterior2nd'].value_counts().index[0], inplace = True)
df_concat['MSZoning'].fillna(df_concat['MSZoning'].value_counts().index[0], inplace = True)
df_concat['Electrical'].fillna(df_concat['Electrical'].value_counts().index[0], inplace = True)

In [13]:
# Instead of binning, normalize
# Note: Normalization did not help. Use them as they are.
#df_concat['MasVnrArea'] = (df_concat['MasVnrArea'] - np.mean(df_concat['MasVnrArea'])) / np.std(df_concat['MasVnrArea'])
#df_concat['TotalPorchSF'] = (df_concat['TotalPorchSF'] - np.mean(df_concat['TotalPorchSF'])) / np.std(df_concat['TotalPorchSF'])
#df_concat['GarageArea'] = (df_concat['GarageArea'] - np.mean(df_concat['GarageArea'])) / np.std(df_concat['GarageArea'])

In [14]:
# YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)
# Set YearRemodAdd = 0 if YearRemodAdd == YearBuilt
df_concat.loc[df_concat['YearRemodAdd'] == df_concat['YearBuilt'], 'YearRemodAdd'] = 0

In [15]:
#sns.catplot(x='YearRemodAdd', y='SalePrice', data=df_concat, kind='bar')

In [16]:
# Binning
df_concat['MoSold'] = pd.cut(df_concat['MoSold'], [1,4,8,12], include_lowest=True)
df_concat['YearBuilt'] = pd.cut(df_concat['YearBuilt'], [1872,1950,1980,2000,2010], include_lowest=True)
df_concat['YearRemodAdd'] = pd.cut(df_concat['YearRemodAdd'], [0,1,1980,2000,2010], include_lowest=True)
#df_concat['YearRemodAdd'] = pd.cut(df_concat['YearRemodAdd'], [1950,1980,2000,2010], include_lowest=True)
df_concat['GarageYrBlt'] = pd.cut(df_concat['GarageYrBlt'], [0,1894,1950,1980,2000,2010], include_lowest=True)
#df_concat['MasVnrArea'] = pd.cut(df_concat['MasVnrArea'], [0,100,300,1600], include_lowest=True)
#df_concat['TotalPorchSF'] = pd.cut(df_concat['TotalPorchSF'], [0,100,300,1424], include_lowest=True)
#df_concat['GarageArea'] = pd.cut(df_concat['GarageArea'], [0,200,400,600,800,1488], include_lowest=True)
df_concat['PoolArea'] = pd.cut(df_concat['PoolArea'], [0,1,800], include_lowest=True)

In [17]:
#sns.catplot(x='PoolArea', y='SalePrice', kind='box', data=df_concat)

In [18]:
from_num_to_cat = ['MSSubClass','OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars','MoSold','YearBuilt','YearRemodAdd','YrSold','GarageYrBlt','PoolArea']

In [19]:
df_concat[from_num_to_cat] = df_concat[from_num_to_cat].apply(lambda x: x.astype('category'))

In [20]:
# Numeric variables
numerical_features = df_concat.select_dtypes(include=[np.number])

# Categorical variables
categorical_features = df_concat.select_dtypes(exclude=[np.number])

In [21]:
categorical_features.sample(5)

Unnamed: 0,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,CentralAir,Condition1,Condition2,Electrical,ExterCond,ExterQual,Exterior1st,Exterior2nd,Fence,FireplaceQu,Fireplaces,Foundation,FullBath,Functional,GarageCars,GarageCond,GarageFinish,GarageQual,GarageType,GarageYrBlt,HalfBath,Heating,HeatingQC,HouseStyle,KitchenAbvGr,KitchenQual,LandContour,LandSlope,LotConfig,LotShape,MSSubClass,MSZoning,MasVnrType,MiscFeature,MoSold,Neighborhood,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,RoofMatl,RoofStyle,SaleCondition,SaleType,Street,TotRmsAbvGrd,Utilities,YearBuilt,YearRemodAdd,YrSold
87,Pave,2,TwnhsE,TA,Av,Unf,Unf,0.0,0.0,Gd,Y,Norm,Norm,SBrkr,TA,Gd,VinylSd,VinylSd,,,0,PConc,2,Typ,2.0,TA,RFn,TA,Detchd,"(2000.0, 2010.0]",1,GasA,Ex,2Story,1,Gd,Lvl,Gtl,Corner,Reg,160,FV,Stone,,"(4.0, 8.0]",Somerst,5,6,Y,"(-0.001, 1.0]",,CompShg,Gable,Partial,New,Pave,4,AllPub,"(2000.0, 2010.0]","(-0.001, 1.0]",2009
335,,3,2fmCon,TA,Gd,ALQ,BLQ,2.0,0.0,TA,Y,Norm,Norm,SBrkr,TA,TA,Plywood,Plywood,,Gd,2,CBlock,2,Typ,2.0,TA,Fin,TA,Attchd,"(1950.0, 1980.0]",0,GasA,Ex,1.5Fin,1,TA,HLS,Sev,Corner,IR1,190,RL,,Shed,"(4.0, 8.0]",Timber,6,5,Y,"(-0.001, 1.0]",,CompShg,Gable,Normal,WD,Grvl,7,AllPub,"(1950.0, 1980.0]","(-0.001, 1.0]",2008
2790,,3,1Fam,TA,No,Unf,Unf,0.0,0.0,TA,Y,Norm,Norm,SBrkr,Gd,TA,MetalSd,MetalSd,,,0,BrkTil,1,Typ,1.0,TA,Unf,Fa,Detchd,"(1894.0, 1950.0]",0,GasA,Gd,1.5Fin,1,Gd,Lvl,Gtl,Inside,Reg,50,RM,,,"(0.999, 4.0]",BrkSide,7,5,Y,"(-0.001, 1.0]",,CompShg,Gable,Normal,WD,Pave,7,AllPub,"(1871.999, 1950.0]","(1980.0, 2000.0]",2006
2475,,3,1Fam,TA,Gd,LwQ,BLQ,0.0,1.0,TA,Y,Norm,Norm,SBrkr,Gd,Gd,HdBoard,HdBoard,,Po,1,CBlock,2,Typ,2.0,TA,Unf,TA,Detchd,"(1950.0, 1980.0]",0,GasA,TA,SLvl,1,TA,Lvl,Gtl,Inside,IR1,80,RL,BrkFace,,"(4.0, 8.0]",Sawyer,8,5,Y,"(-0.001, 1.0]",,CompShg,Hip,Normal,WD,Pave,5,AllPub,"(1950.0, 1980.0]","(2000.0, 2010.0]",2007
1240,,2,1Fam,Gd,Gd,GLQ,Unf,1.0,0.0,Ex,Y,Norm,Norm,SBrkr,TA,Ex,VinylSd,VinylSd,,Gd,1,PConc,2,Typ,3.0,TA,Fin,TA,Attchd,"(2000.0, 2010.0]",1,GasA,Ex,1Story,1,Ex,Lvl,Gtl,Inside,Reg,20,RL,,,"(8.0, 12.0]",NridgHt,5,10,Y,"(-0.001, 1.0]",,CompShg,Gable,Partial,New,Pave,7,AllPub,"(2000.0, 2010.0]","(-0.001, 1.0]",2006


In [22]:
numerical_features.sample(5)

Unnamed: 0,GarageArea,GrLivArea,Id,LotArea,LotFrontage,MasVnrArea,SalePrice,TotalBsmtSF,TotalPorchSF
1745,336.0,1216,1750,9000,75.0,200.0,,1216.0,0
1864,662.0,1290,1869,6762,64.0,0.0,,1282.0,168
1653,440.0,1456,1658,2364,24.0,0.0,,855.0,147
2013,384.0,984,2018,9100,70.0,0.0,,984.0,201
2611,313.0,1024,2616,10533,85.0,244.0,,1008.0,280


In [23]:
# Log transform numerical predictors with the highest correlation with target 
df_concat['GrLivArea'] = np.log(df_concat['GrLivArea'])
df_concat['LotArea'] = np.log(df_concat['LotArea'])
df_concat['LotFrontage'] = np.log(df_concat['LotFrontage'])

# has zeros
#df_concat['TotalBsmtSF'] = np.log(df_concat['TotalBsmtSF'])

In [24]:
df_concat['Alley'].fillna('NoAlley', inplace = True)
df_concat['BsmtQual'].fillna('NoBsmt', inplace = True)
df_concat['BsmtCond'].fillna('NoBsmt', inplace = True)
df_concat['BsmtExposure'].fillna('NoBsmt', inplace = True)
df_concat['BsmtFinType1'].fillna('NoBsmt', inplace = True)
df_concat['BsmtFinType2'].fillna('NoBsmt', inplace = True)
df_concat['FireplaceQu'].fillna('NoFirePl', inplace = True)
df_concat['GarageType'].fillna('NoGarage', inplace = True)
df_concat['GarageFinish'].fillna('NoGarage', inplace = True)
df_concat['GarageQual'].fillna('NoGarage', inplace = True)
df_concat['GarageCond'].fillna('NoGarage', inplace = True)
df_concat['PoolQC'].fillna('NoPool', inplace = True)
df_concat['Fence'].fillna('NoFence', inplace = True)
df_concat['MiscFeature'].fillna('NoMiscFeature', inplace = True)
df_concat['MasVnrType'].fillna('NoVeneer', inplace = True)
df_concat['MasVnrType'].replace('None','NoVeneer', inplace=True) # None means there is no veneer. Pass None to category NoVeneer

In [25]:
df_concat.isnull().sum().sort_values(ascending=False).head(20)

SalePrice       1459
Exterior1st        0
Fence              0
FireplaceQu        0
Fireplaces         0
Foundation         0
FullBath           0
Functional         0
GarageArea         0
TotalPorchSF       0
GarageCars         0
GarageCond         0
GarageFinish       0
GarageQual         0
GarageType         0
GarageYrBlt        0
GrLivArea          0
Exterior2nd        0
ExterQual          0
Heating            0
dtype: int64

In [26]:
df_concat_dummified = pd.get_dummies(df_concat, drop_first=True)

In [27]:
# Now, we do not have any nulls. Time to split between train and test.
test = df_concat_dummified.loc[df_concat.SalePrice.isnull(),]
train = df_concat_dummified.loc[~df_concat.SalePrice.isnull(),]
test.drop('SalePrice', inplace=True, axis=1)
test.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [28]:
print(test.shape)
print(train.shape)

(1459, 322)
(1456, 323)


In [29]:
submission = pd.DataFrame()
submission['Id'] = test.Id

Y=train['SalePrice']
X=train.loc[:, ~train.columns.isin(['Id','SalePrice'])]
test2=test.loc[:, ~test.columns.isin(['Id'])]

In [30]:
# Before fitting to models, check shape of datasets.
print(X.shape)
print(test2.shape)
print(set(X.columns) - set(test2.columns))
print(len(set(X.columns) - set(test2.columns)))
print(set(test2.columns) - set(X.columns))

(1456, 321)
(1459, 321)
set()
0
set()


In [31]:
# We use cross_val_score for its simplicity to estimate MSE for different models.
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=.33)

In [38]:
# MLR
if 'MLR' in model_list:
    
# looking at the results, we choose to run Ridge regression with below settings:
#mlr_selected = RidgeCV(alphas=[0], cv=10, normalize=True, fit_intercept=True)
#mlr_selected.fit(X, Y)

#y_pred_ridge = mlr_selected.predict(test2)

    lm = Ridge(alpha = 0)
    lm.fit(X, Y)
    scores = cross_val_score(estimator=lm, X=X_train, y=Y_train, cv=10)
    #cv_list = [3, 5, 10]
    print('Best R^2 for MLR is: %.5f' %(lm.score(X,Y)))
    print('Test Scores: {}'.format(scores))
    print('Mean of Test Scores: {}'.format(scores.mean()))
    print('Std. Deviation of Test Scores: {}'.format(scores.std()))

    

    
    
    

    

Best R^2 for MLR is: 0.95154
Test Scores: [-3.97566024e+20 -1.96543810e+21 -7.48503343e+23 -3.85233276e+21
 -2.11994905e+21 -5.54291119e+21 -6.98714005e+20 -1.84971640e+21
 -2.14739602e+21 -2.02335883e+21]
Mean of Test Scores: -7.691007255023796e+22
Std. Deviation of Test Scores: 2.23868791151652e+23


In [None]:
# Ridge with Cross Validation

# String to Boolean
def str2bool(v):
    return v.lower() in ("true")


if 'Ridge' in model_list:
    
    # Alpha range
    alphas = np.linspace(0.01, 5, 200)
    
    # CV range
    sqrt_n = int(np.sqrt(len(X_train))) # Square root of # of obs. (CV = 31)
    cv_list = [None, 3, 5, 10, sqrt_n]
    CV_index = []
    
    # Normalization
    norm_list = ['False', 'True']
    norm_list2 = []
    
    # Intercept
    intercept_list = ['False', 'True']
    intercept_list2 = []
    
    # Initialization of lists to be added to dict_
    best_alphas = []
    best_R2s = []
    best_MSEs = []
    mean_CVscores = []
    std_CVscores = []
    
    # Initialization of dictionary object to cast to df
    dict_ = {}
    
    # Loop for different values of CV and normalization type using the alpha range specified
    for cv in cv_list:
        for norm in norm_list:
            for intercept in intercept_list:
            
                if cv == None:
                    cv_name = 'LOOCV' # Leave one out CV (Generalized Cross Validation)
                else:
                    cv_name = str(cv)
                    
                print('CV: %s' %cv_name)
                print('Normalize: %s' %norm)
                print('Intercept: %s' %intercept)

                ridgeCV = RidgeCV(alphas=alphas, cv=cv, normalize=str2bool(norm), fit_intercept=str2bool(intercept), scoring = 'neg_mean_squared_error')
                ridgeCV.fit(X_train,Y_train)
                y_pred_ridge = ridgeCV.predict(test2)
                print('Best alpha for Ridge is: %.2f' %(ridgeCV.alpha_))
                print('Best R^2 for Ridge is: %.5f' %(ridgeCV.score(X_train,Y_train)))

                best_alphas.append(round(ridgeCV.alpha_, 2))
                best_R2s.append(round(ridgeCV.score(X_train,Y_train), 5))

                ridge = Ridge(alpha = ridgeCV.alpha_, normalize = str2bool(norm), fit_intercept=str2bool(intercept))
                ridge.fit(X_train, Y_train)
                mse = round(mean_squared_error(Y_test, ridge.predict(X_test)), 5)
                print('MSE for Ridge: %.5f' %mse)

                best_MSEs.append(mse)
                
                if cv == None:
                    cv_new = len(X_train)
                    scores = cross_val_score(estimator=ridge, X=X_train, y=Y_train, cv=cv_new) # Estimate score using cv = (# of obs.)
                else:
                    scores = cross_val_score(estimator=ridge, X=X_train, y=Y_train, cv=cv) # Estimate score using cv = (# of obs.)
                
                # Estimate test scores using cross_val_score
                scores = np.array(list(map(lambda x: round(x,5), scores)))
                avg_score = round(scores.mean(), 5)
                std_score = round(scores.std(), 5)
                print('Test Scores: {}'.format(scores))
                print('Mean of Test Scores: {}'.format(avg_score))
                print('Std. Deviation of Test Scores: {}'.format(std_score))
                print('*' * 50, '\n')

                mean_CVscores.append(avg_score)
                std_CVscores.append(std_score)
                
                CV_index.append('CV_' + cv_name)
                norm_list2.append(norm)
                intercept_list2.append(intercept)

                dict_['Norm_Type'] = norm_list2
                dict_['Intercept_Type'] = intercept_list2
                dict_['best_alphas'] = best_alphas
                dict_['best_R2s'] = best_R2s
                dict_['best_MSEs'] = best_MSEs
                dict_['mean_CVscores'] = mean_CVscores
                dict_['std_CVscores'] = std_CVscores

                #print(dict_)

    df_results = pd.DataFrame(dict_, index=CV_index)
    df_results.reset_index(inplace=True)
    df_results.rename({'index':'CV'}, axis=1, inplace=True) #rename index as CV
    #df_results.sort_values(by=['best_MSEs', 'mean_CVscores'], ascending=[True,False], inplace=True)
    print(df_results)

    # looking at the results, we choose to run Ridge regression with below settings:
    ridge_selected = RidgeCV(alphas=[0.16], cv=10, normalize=True, fit_intercept=True)
    ridge_selected.fit(X, Y)
    y_pred_ridge = ridge_selected.predict(test2)

In [None]:
# Lasso with Cross Validation

# String to Boolean
def str2bool(v):
    return v.lower() in ("true")


if 'Lasso' in model_list:
    
    # Alpha range
    alphas = np.logspace(-1, 1, 200)
    
    # CV range
    sqrt_n = int(np.sqrt(len(X_train))) # Square root of # of obs. (CV = 31)
    cv_list = [None, 3, 5, 10, sqrt_n]
    CV_index = []
    
    # Normalization
    norm_list = ['False', 'True']
    norm_list2 = []
    
    # Intercept
    intercept_list = ['False', 'True']
    intercept_list2 = []
    
    # Initialization of lists to be added to dict_
    best_alphas = []
    best_R2s = []
    best_MSEs = []
    mean_CVscores = []
    std_CVscores = []
    
    # Initialization of dictionary object to cast to df
    dict_ = {}
    
    # Loop for different values of CV and normalization type using the alpha range specified
    for cv in cv_list:
        for norm in norm_list:
            for intercept in intercept_list:
            
                if cv == None:
                    cv_name = 'LOOCV' # Leave one out CV (Generalized Cross Validation)
                else:
                    cv_name = str(cv)
                    
                print('CV: %s' %cv_name)
                print('Normalize: %s' %norm)
                print('Intercept: %s' %intercept)

                lassoCV = LassoCV(alphas=alphas, cv=cv, normalize=str2bool(norm), fit_intercept=str2bool(intercept))
                lassoCV.fit(X_train,Y_train)
                #y_pred_Lasso = lassoCV.predict(test2)
                print('Best alpha for Lasso is: %.2f' %(lassoCV.alpha_))
                print('Best R^2 for Lasso is: %.5f' %(lassoCV.score(X_train,Y_train)))

                best_alphas.append(round(lassoCV.alpha_, 2))
                best_R2s.append(round(lassoCV.score(X_train,Y_train), 5))

                lasso = Lasso(alpha = lassoCV.alpha_, normalize = str2bool(norm), fit_intercept=str2bool(intercept))
                lasso.fit(X_train, Y_train)
                mse = round(mean_squared_error(Y_test, lasso.predict(X_test)), 5)
                print('MSE for Lasso: %.5f' %mse)

                best_MSEs.append(mse)
                
                if cv == None:
                    cv_new = len(X_train)
                    scores = cross_val_score(estimator=lasso, X=X_train, y=Y_train, cv=cv_new) # Estimate score using cv = (# of obs.)
                else:
                    scores = cross_val_score(estimator=lasso, X=X_train, y=Y_train, cv=cv) # Estimate score using cv = (# of obs.)
                
                # Estimate test scores using cross_val_score
                scores = np.array(list(map(lambda x: round(x,5), scores)))
                avg_score = round(scores.mean(), 5)
                std_score = round(scores.std(), 5)
                print('Test Scores: {}'.format(scores))
                print('Mean of Test Scores: {}'.format(avg_score))
                print('Std. Deviation of Test Scores: {}'.format(std_score))
                print('*' * 50, '\n')

                mean_CVscores.append(avg_score)
                std_CVscores.append(std_score)
                
                CV_index.append('CV_' + cv_name)
                norm_list2.append(norm)
                intercept_list2.append(intercept)

                dict_['Norm_Type'] = norm_list2
                dict_['Intercept_Type'] = intercept_list2
                dict_['best_alphas'] = best_alphas
                dict_['best_R2s'] = best_R2s
                dict_['best_MSEs'] = best_MSEs
                dict_['mean_CVscores'] = mean_CVscores
                dict_['std_CVscores'] = std_CVscores

                #print(dict_)

    df_results = pd.DataFrame(dict_, index=CV_index)
    df_results.reset_index(inplace=True)
    df_results.rename({'index':'CV'}, axis=1, inplace=True) #rename index as CV
    #df_results.sort_values(by=['best_MSEs', 'mean_CVscores'], ascending=[True,False], inplace=True)
    print(df_results)

    # looking at the results, we choose to run Lasso regression with below settings:
    lasso_selected = LassoCV(alphas=[0.1], cv=5, normalize=False, fit_intercept=True)
    lasso_selected.fit(X, Y)
    y_pred_lasso = lasso_selected.predict(test2)

In [None]:
# ElasticNetCV with Cross Validation

# String to Boolean
def str2bool(v):
    return v.lower() in ("true")


if 'ElasticNet' in model_list:
    
    # Alpha range
    alphas = np.linspace(0.1, 10, 100)

    # Rho ranges
    rhos   = np.linspace(0.01, 1, 100)
    
    # CV range
    sqrt_n = int(np.sqrt(len(X_train))) # Square root of # of obs. (CV = 31)
    cv_list = [3, 5, 10, sqrt_n] # If None alphas are set automatically. So, drop it!
    CV_index = []
    
    # Normalization
    norm_list = ['False', 'True']
    norm_list2 = []
    
    # Intercept
    intercept_list = ['False', 'True']
    intercept_list2 = []
    
    # Initialization of lists to be added to dict_
    best_alphas = []
    best_rhos = []
    best_R2s = []
    best_MSEs = []
    mean_CVscores = []
    std_CVscores = []
    
    # Initialization of dictionary object to cast to df
    dict_ = {}
    
    # Loop for different values of CV and normalization type using the alpha range specified
    for cv in cv_list:
        for norm in norm_list:
            for intercept in intercept_list:
            
                if cv == None:
                    cv_name = 'LOOCV' # Leave one out CV (Generalized Cross Validation)
                else:
                    cv_name = str(cv)
                    
                print('CV: %s' %cv_name)
                print('Normalize: %s' %norm)
                print('Intercept: %s' %intercept)

                elasticNetCV = ElasticNetCV(l1_ratio=rhos, alphas=alphas, cv=cv, normalize=str2bool(norm), fit_intercept=str2bool(intercept), max_iter=10000)
                elasticNetCV.fit(X_train,Y_train)
                #y_pred_elastic = elasticNetCV.predict(test2)
                print('Best alpha for ElasticNetCV is: %.2f' %(elasticNetCV.alpha_))
                print('Best rho for ElasticNetCV is: %.2f' %(elasticNetCV.l1_ratio_))
                print('Best R^2 for ElasticNetCV is: %.5f' %(elasticNetCV.score(X_train,Y_train)))

                best_alphas.append(round(elasticNetCV.alpha_, 2))
                best_rhos.append(round(elasticNetCV.l1_ratio_, 2))
                best_R2s.append(round(elasticNetCV.score(X_train,Y_train), 5))

                elasticnet = ElasticNet(l1_ratio=elasticNetCV.l1_ratio_, alpha = elasticNetCV.alpha_, normalize = str2bool(norm), fit_intercept=str2bool(intercept), max_iter=10000)
                elasticnet.fit(X_train, Y_train)
                mse = round(mean_squared_error(Y_test, elasticnet.predict(X_test)), 5)
                print('MSE for ElasticNet: %.5f' %mse)

                best_MSEs.append(mse)
                
                if cv == None:
                    cv_new = len(X_train)
                    scores = cross_val_score(estimator=elasticnet, X=X_train, y=Y_train, cv=cv_new) # Estimate score using cv = (# of obs.)
                else:
                    scores = cross_val_score(estimator=elasticnet, X=X_train, y=Y_train, cv=cv) # Estimate score using cv = (# of obs.)
                
                # Estimate test scores using cross_val_score
                scores = np.array(list(map(lambda x: round(x,5), scores)))
                avg_score = round(scores.mean(), 5)
                std_score = round(scores.std(), 5)
                print('Test Scores: {}'.format(scores))
                print('Mean of Test Scores: {}'.format(avg_score))
                print('Std. Deviation of Test Scores: {}'.format(std_score))
                print('*' * 50, '\n')

                mean_CVscores.append(avg_score)
                std_CVscores.append(std_score)
                
                CV_index.append('CV_' + cv_name)
                norm_list2.append(norm)
                intercept_list2.append(intercept)

                dict_['Norm_Type'] = norm_list2
                dict_['Intercept_Type'] = intercept_list2
                dict_['best_alphas'] = best_alphas
                dict_['best_rhos'] = best_rhos
                dict_['best_R2s'] = best_R2s
                dict_['best_MSEs'] = best_MSEs
                dict_['mean_CVscores'] = mean_CVscores
                dict_['std_CVscores'] = std_CVscores

                #print(dict_)

    df_results = pd.DataFrame(dict_, index=CV_index)
    df_results.reset_index(inplace=True)
    df_results.rename({'index':'CV'}, axis=1, inplace=True) #rename index as CV
    #df_results.sort_values(by=['best_MSEs', 'mean_CVscores'], ascending=[True,False], inplace=True)
    print(df_results)

    # looking at the results, we choose to run ElasticNet regression with below settings:
    elas_selected = ElasticNetCV(l1_ratio=[0.1], alphas=[0.1], cv=5, normalize=False, fit_intercept=True)
    elas_selected.fit(X, Y)
    y_pred_elastic = elas_selected.predict(test2)

In [None]:
# Random Forest
if 'RandomForest' in model_list:
    
    from sklearn import ensemble
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RandomizedSearchCV
    
    
    best_R2s = []
    mean_CVscores = []
    std_CVscores = []
    best_MSEs = []
    
    # Initialization of dictionary object to cast to df
    dict_ = {}

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    pprint.pprint(random_grid)

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestRegressor()
    # Random search of parameters, using 10 fold cross validation, 
    # search across 10 different combinations, and use all available cores
    cv_list = [3, 5, 10]
    CV_index = []
    
    for cv in cv_list:
        rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = cv, verbose=2, random_state=42, n_jobs = -1)
        # Test scores
        scores = cross_val_score(estimator=rf, X=X_train, y=Y_train, cv=cv)
        avg_score = round(scores.mean(), 5)
        std_score = round(scores.std(), 5)
        print('Test Scores: {}'.format(scores))
        print('Mean of Test Scores: {}'.format(scores.mean()))
        print('Std. Deviation of Test Scores: {}'.format(scores.std()))
        
        rf_random.fit(X_train, Y_train)
        print('Best R^2 for RandomForest is: %.5f' %(rf_random.score(X_train,Y_train)))
        
        
        #rf = RandomForestRegressor()
        #rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = 5, verbose=2, random_state=42, n_jobs = -1)
        #rf_random.fit(X_train, Y_train)
        mse = round(mean_squared_error(Y_test, rf_random.predict(X_test)), 5)
        print('MSE for RF: %.5f' %mse)
        best_MSEs.append(mse)
        
        best_R2s.append(round(rf_random.score(X_train,Y_train), 5))
        mean_CVscores.append(avg_score)
        std_CVscores.append(std_score)
    
        dict_['best_R2s'] = best_R2s
        dict_['mean_CVscores'] = mean_CVscores
        dict_['std_CVscores'] = std_CVscores
        dict_['best_MSEs'] = best_MSEs
        
        CV_index.append('CV_' + str(cv))
    
    df_results = pd.DataFrame(dict_, index=CV_index)
    df_results.reset_index(inplace=True)
    df_results.rename({'index':'CV'}, axis=1, inplace=True) #rename index as CV
    #df_results.sort_values(by=['best_MSEs', 'mean_CVscores'], ascending=[True,False], inplace=True)
    print(df_results)
    
    # looking at the results, we choose to run RandomForest regression using below settings:
    rf = RandomForestRegressor()
    rf_selected = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = 10, verbose=2, random_state=42, n_jobs = -1)
    rf_selected.fit(X,Y)
    y_pred_rf = rf_selected.predict(test2)
    
    
    mse = round(mean_squared_error(Y_test, rf_random.predict(X_test)), 5)
    print('MSE for RF: %.5f' %mse)
    #best_MSEs.append(mse)


In [None]:
df_results

In [None]:
# XGBoost
if 'XGBoost' in model_list:

    regex = re.compile(r"\[|\]|<", re.IGNORECASE)
    X.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X.columns.values]
    test2.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in test2.columns.values]    

    #xgb = xgb.XGBRegressor()
    #xgb.fit(X, Y)
    #y_pred_xgb = xgb.predict(test2)
    #mse=mean_squared_error(Y, xgb.predict(X))
    #print('Best R^2 for XGBoost is: %.5f' %(xgb.score(X, Y)))
    #print('RMSE is: ', np.sqrt(mse))

    params = {
    "colsample_bytree": [0.5,1],
    "gamma": [0,0.1,0.2,0.3,0.4,0.5],
    "learning_rate": [0.05, 0.1, 0.2, 0.3], # default 0.1 
    "max_depth": [3,4,5,6,7,8,9,10], # default 3
    "n_estimators": [100,110,120,130,140,150], # default 100
    "subsample": [0.5,1]
    }

    xgb_search = RandomizedSearchCV(xgb, param_distributions=params, random_state=42, n_iter=10, cv=3, verbose=1, n_jobs=1, return_train_score=True)
    xgb_search.fit(X, Y)
    y_pred_xgb = xgb_search.predict(test2)
    print(xgb_search.cv_results_, 1)
    print(xgb_search.best_params_)
    print("best score: {0}".format(xgb_search.best_score_))

In [None]:
# Output to CSV
for model in model_list:
    
    if model == 'MLR':
        y_pred = y_pred_mlr
    elif model == 'Ridge':
        y_pred = y_pred_ridge
    elif model == 'BayesianRidge':
        y_pred = y_pred_bridge
    elif model == 'Lasso':
        y_pred = y_pred_lasso
    elif model == 'ElasticNet':
        y_pred = y_pred_elastic
    elif model == 'RandomForest':
        y_pred = y_pred_rf
#    else:
#        y_pred = y_pred_xgb

    if submission.columns.isin(['SalePrice']).any():
        submission.drop('SalePrice', inplace=True, axis=1)
    
    submission['SalePrice'] = pd.Series(np.exp(y_pred))
    filename = 'SalePrice_Prediction_' + model + '.csv'
    submission.to_csv(filename, index=False)
    
    fname_cv_results = 'Results_' + model + '.csv'
    df_results.to_csv(fname_cv_results, index=False)

In [None]:
submission.head()