In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from mlxtend.regressor import StackingCVRegressor
from scipy.stats import skew,randint
import numpy as np
from time import time
import preprocess

In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs, so edits to locally imported modules (such as the
# `preprocess` module imported above) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Model Stacking for better performance

### Stacking Model

In [None]:
# One seed shared by the fold splitter, the hyper-parameter samplers and the
# stochastic estimators, so a Restart & Run All draws the same candidates and
# fits the same models every time.
SEED = 42

# Gradient-boosting search space. scipy's randint(low, high) samples integers
# from [low, high), so max_features ranges over 100..247.
# NOTE(review): this assumes the preprocessed feature matrix has at least 247
# columns -- TODO confirm against the output of `preprocess.preprocess_df`.
params = {
    'n_estimators': [100, 200, 300, 400, 500, 1500],
    "max_features": randint(100, 248),
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 11),
    "subsample": [0.6, 0.7, 0.75, 0.8, 0.9],
}

# 10-fold shuffled splitter reused by all three searches below.
kfolds = KFold(n_splits=10, shuffle=True, random_state=SEED)

# Timestamp taken before the searches are constructed; nothing visible in this
# notebook reads it back.
start = time()

# warm_start is intentionally not set on the base estimators: the search clones
# its estimator before every fit, so state is never carried between fits.
randomSearch_gb = RandomizedSearchCV(
    GradientBoostingRegressor(random_state=SEED),
    param_distributions=params,
    n_iter=20,
    cv=kfolds,
    n_jobs=6,
    random_state=SEED,
)

# Random-forest search space.
params_rf = {
    'n_estimators': [300, 500, 800, 1100, 1500, 1800],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 11),
}

randomSearch_rf = RandomizedSearchCV(
    RandomForestRegressor(random_state=SEED),
    param_distributions=params_rf,
    cv=kfolds,
    n_jobs=-1,
    n_iter=20,
    random_state=SEED,
)

# Elastic-net search space (all discrete lists, so candidates are drawn without
# replacement from the grid).
params_elastic = {
    'alpha': [0.001, 0.01, 0.1, 1.],
    'l1_ratio': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'max_iter': [1000, 2000, 5000, 10000],
    'selection': ['cyclic', 'random'],
}

# random_state on ElasticNet matters when the sampled selection is 'random'.
randomSearch_elastic = RandomizedSearchCV(
    ElasticNet(random_state=SEED),
    param_distributions=params_elastic,
    cv=kfolds,
    n_jobs=-1,
    n_iter=20,
    random_state=SEED,
)


In [None]:
# Full roster of (label, estimator) pairs: the three randomized searches plus
# fixed-configuration tree and linear baselines.
# NOTE: the following cell rebinds `models` and `stacked_models` to a shorter
# roster, so whichever of the two cells ran last determines what gets stacked.
models = [
    ('RF', randomSearch_rf),
    ('GB', randomSearch_gb),
    ('ET', ExtraTreesRegressor(n_estimators=500, random_state=100)),
    ('LR', LinearRegression()),
    ('Lasso', Lasso(max_iter=10000, random_state=100)),
    ('Ridge', Ridge(random_state=100)),
    ('Elastic', randomSearch_elastic),
]

# Only the estimator objects (without their labels), in roster order.
stacked_models = tuple(estimator for _label, estimator in models)

In [None]:
# Reduced roster: only the three randomized-search estimators are stacked.
# This rebinds the `models` / `stacked_models` names set by the previous cell.
models = [
    ('RF', randomSearch_rf),
    ('GB', randomSearch_gb),
    ('Elastic', randomSearch_elastic),
]

# Only the estimator objects (without their labels), in roster order.
stacked_models = tuple(estimator for _label, estimator in models)

In [None]:
# Stacked ensemble: the base regressors' out-of-fold predictions are fed to a
# gradient-boosting search acting as the meta-regressor. With
# use_features_in_secondary=True the meta-regressor also sees the original
# features alongside those predictions.
#
# The meta-regressor is referenced by name rather than by position in `models`
# (previously models[1][1], i.e. the 'GB' entry) so reordering the roster cannot
# silently swap in a different second-level learner.
#
# random_state seeds the ensemble's internal shuffled CV split so the
# out-of-fold meta-features are repeatable; 42 matches the seed used for the
# KFold splitter defined earlier in this notebook.
stack_gen = StackingCVRegressor(
    regressors=stacked_models,
    meta_regressor=randomSearch_gb,
    use_features_in_secondary=True,
    random_state=42,
)


In [None]:
def advanced_feature_eng(df):
    """Return a copy of ``df`` extended with aggregate and indicator features.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same raw data.

    Added columns:
        Total_porch_SF   -- sum of the five porch/deck area columns.
        Total_SF         -- basement + first floor + second floor area.
        Total_SF_2       -- finished basement (types 1 and 2) + both floor areas.
        Total_Bathrooms  -- full baths count 1, half baths count 0.5
                            (above ground and basement).
        haspool, has2ndfloor, hasgarage, hasbsmt, hasfireplace
                         -- 1 where the source column is > 0, else 0.

    Missing values propagate through the summed columns (plain ``+`` is used),
    while the indicator columns treat a missing source value as 0 because
    ``NaN > 0`` is False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the source columns referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the added ones.

    Raises
    ------
    KeyError
        If any required source column is absent from ``df``.
    """
    out = df.copy()

    out['Total_porch_SF'] = (out['OpenPorchSF'] + out['3SsnPorch'] +
                             out['EnclosedPorch'] + out['ScreenPorch'] +
                             out['WoodDeckSF'])
    out['Total_SF'] = out['TotalBsmtSF'] + out['1stFlrSF'] + out['2ndFlrSF']
    out['Total_SF_2'] = (out['BsmtFinSF1'] + out['BsmtFinSF2'] +
                         out['1stFlrSF'] + out['2ndFlrSF'])
    out['Total_Bathrooms'] = (out['FullBath'] + (0.5 * out['HalfBath']) +
                              out['BsmtFullBath'] + (0.5 * out['BsmtHalfBath']))

    # Presence flags: vectorised comparison instead of a per-row lambda.
    # Insertion order of this mapping fixes the order of the new columns.
    flag_sources = {
        'haspool': 'PoolArea',
        'has2ndfloor': '2ndFlrSF',
        'hasgarage': 'GarageArea',
        'hasbsmt': 'TotalBsmtSF',
        'hasfireplace': 'Fireplaces',
    }
    for flag_name, source_col in flag_sources.items():
        out[flag_name] = (out[source_col] > 0).astype('int64')

    return out

In [None]:
# Read each CSV with the Id column as index and add the engineered features.
X = advanced_feature_eng(
    pd.read_csv('../datasets/iowa_housing/train.csv', index_col='Id')
)
X_test = advanced_feature_eng(
    pd.read_csv('../datasets/iowa_housing/test.csv', index_col='Id')
)

# Hand both frames to the project's preprocessing helper, with 'SalePrice' as
# the target, one-hot encoding enabled and RobustScaler as the scaler class.
pre_processed = preprocess.preprocess_df(
    X, 'SalePrice', X_test, one_hot=True, scaler=RobustScaler
)

# Split the preprocessed training data, stratifying on its OverallQual column.
splits = pre_processed.split_df(stratify=pre_processed.X['OverallQual'])

# The split's 'X_test' / 'y_test' entries are bound as the validation set here.
X_train, X_valid, y_train, y_valid = (
    splits[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
)


In [None]:
# Fit the stacked ensemble on plain arrays, then score the validation split.
stack_gen_model = stack_gen.fit(X=np.array(X_train), y=np.array(y_train))
preds = stack_gen_model.predict(X=np.array(X_valid))

# Cell output: mean absolute error on the validation split.
mean_absolute_error(y_true=y_valid, y_pred=preds)

In [None]:
# Predict on the preprocessed test features. They are converted with np.array
# exactly as the training and validation inputs were, so the fitted ensemble
# receives the same input type at predict time as at fit time.
preds_test = stack_gen_model.predict(np.array(pre_processed.X_test))

# NOTE(review): assumes the rows of pre_processed.X_test are in the same order
# as the raw X_test frame whose Id index is used here -- TODO confirm in
# `preprocess.preprocess_df`.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})

# Write the submission file (Id, SalePrice) without the DataFrame's own index.
output.to_csv('submission4.csv', index=False)
