In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.impute import SimpleImputer
from scipy.stats import skew,randint
import numpy as np
from time import time
import preprocess
%load_ext autoreload
%autoreload 2

## Best Hyper-Parameters Search

### GridSearchCV and RandomizedSearchCV

In [None]:
def get_regression_scores(X_train,X_test,y_train,y_test,error_fn,models=None):
    """Fit several regressors and return the one with the lowest hold-out error.

    Every candidate is fitted on ``(X_train, y_train)``, asked to predict
    ``X_test`` and scored with ``error_fn(y_test, preds)``. Each score is
    printed as ``"<name> error: <value>"``.

    Parameters
    ----------
    X_train, y_train
        Features and targets handed to each estimator's ``fit``.
    X_test, y_test
        Hold-out features and targets used for scoring.
    error_fn : callable
        ``error_fn(y_true, y_pred)`` returning a number; lower is better.
    models : iterable of (name, estimator) pairs, optional
        Candidates to compare. When omitted, the built-in line-up
        (RF, GB, ET, LR, Lasso, Ridge, Elastic) is used.

    Returns
    -------
    The fitted estimator with the smallest error, or ``None`` if no candidate
    scored below ``np.inf`` (empty ``models``, or every error was NaN/inf).
    """
    if models is None:
        # Default candidates; built lazily so callers supplying their own
        # list do not pay for (or depend on) these estimators.
        models = [('RF',RandomForestRegressor(n_estimators=100, random_state=100)),
                  ('GB',GradientBoostingRegressor(n_estimators=100, random_state=100)),
                  ('ET',ExtraTreesRegressor(n_estimators=100, random_state=100)),
                  ('LR',LinearRegression()),
                  ('Lasso',Lasso(max_iter=10000,random_state=100)),
                  ('Ridge',Ridge(random_state=100)),
                  ('Elastic',ElasticNet(max_iter=10000,random_state=100))
                 ]

    best_error = np.inf
    best_model = None
    for name, estimator in models:
        estimator.fit(X_train,y_train)
        error = error_fn(y_test, estimator.predict(X_test))
        print("{} error: {:.2f}".format(name,error))
        # Strict comparison: on a tie the earlier candidate is kept, and a
        # NaN error can never be selected.
        if best_error > error:
            best_error = error
            best_model = estimator

    return best_model
    

In [None]:
def add_new_features4(df):
    """Return a copy of ``df`` extended with engineered area/quality features.

    Added columns:
      * ``TotalSF``       - ``TotalBsmtSF + 1stFlrSF + 2ndFlrSF``
      * ``OverallGrade``  - ``OverallQual * OverallCond``
      * ``TotalLivArea``  - ``GrLivArea + GarageArea + LotArea``
      * ``<col>-2``, ``<col>-3``, ``<col>-Sq`` - square, cube and square root
        of ``GrLivArea`` and of ``GarageArea``

    The input frame is not modified; the enriched copy is returned. A
    ``KeyError`` is raised if any of the source columns is missing.
    """
    # Work on a copy so the caller's frame (and anything already displayed
    # from it) is not silently altered.
    out = df.copy()

    out['TotalSF'] = out['TotalBsmtSF'] + out['1stFlrSF'] + out['2ndFlrSF']
    out["OverallGrade"] = out["OverallQual"] * out["OverallCond"]
    out['TotalLivArea'] = out['GrLivArea'] + out['GarageArea'] + out['LotArea']

    # Polynomial / root expansions, in the same column order for both sources.
    for col in ("GrLivArea", "GarageArea"):
        out[col + "-2"] = out[col] ** 2
        out[col + "-3"] = out[col] ** 3
        out[col + "-Sq"] = np.sqrt(out[col])
    return out

    

In [None]:
# Read the train and test CSVs, using each file's 'Id' column as the index.
X = pd.read_csv(
    '../datasets/iowa_housing/train.csv',
    index_col='Id',
)
X_test = pd.read_csv(
    '../datasets/iowa_housing/test.csv',
    index_col='Id',
)


In [None]:
# Number of columns in the training frame at this point in the notebook.
X.shape[1]

In [None]:
# Run the same feature engineering over the train and test frames.
X, X_test = (add_new_features4(frame) for frame in (X, X_test))


In [None]:
# Number of columns in the training frame at this point in the notebook.
X.shape[1]

In [None]:
# Hand both frames to the project's preprocessing helper, naming 'SalePrice'
# as the target column, with one-hot encoding off and RobustScaler as scaler.
pre_processed = preprocess.preprocess_df(
    X,
    'SalePrice',
    X_test,
    one_hot=False,
    scaler=RobustScaler,
)
splits = pre_processed.split_df()
# The helper labels its hold-out part 'X_test'/'y_test'; it is bound to the
# *_valid names here.
X_train, X_valid, y_train, y_valid = (
    splits[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
)


In [None]:
# Helper that prints the best-ranked candidates of a hyper-parameter search
def report_hyper_param_search(results, n_top=3):
    """Print the candidates ranked 1..``n_top`` from a search's ``cv_results_``.

    ``results`` must provide ``'rank_test_score'``, ``'mean_test_score'``,
    ``'std_test_score'`` and ``'params'``. All candidates sharing a rank are
    printed; a rank with no candidate prints nothing. Returns ``None``.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate holding this rank (ties included).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [None]:
# 5-fold splitter, shuffled with a fixed seed, passed as `cv` to both searches below.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=0,
)

In [None]:
# Search space for the randomized search. List entries are sampled uniformly;
# scipy's randint(low, high) draws integers from the half-open range [low, high).
params = {'n_estimators':[100,200,300,400,500,1500],
          # An integer max_features must lie in 1..n_features. A lower bound
          # of 0 yields candidates the estimator rejects, and a hard-coded
          # upper bound breaks if the column count changes, so both ends are
          # derived from the training matrix.
          "max_features": randint(1, np.shape(X_train)[1] + 1),
          "min_samples_split": randint(2, 11),
          "min_samples_leaf": randint(1, 11),
          "subsample":[0.6,0.7,0.75,0.8,0.9]
         }

start = time()
# random_state fixes which 20 candidates are drawn, so re-running the cell
# evaluates the same configurations.
randomSearch_gb = RandomizedSearchCV(GradientBoostingRegressor(warm_start=True,random_state=100),
                                     param_distributions=params,n_iter=20,
                                     cv=kfold,n_jobs=6,random_state=100)
randomSearch_gb.fit(X_train,y_train)

print('training took {} minutes'.format((time() - start)/60.))

In [None]:
# Exhaustive grid over GradientBoostingRegressor settings.
params = {'n_estimators':[50,100,200,300],
          # None means "consider all features". It replaces the 'auto' alias,
          # which meant the same for this estimator but was deprecated in
          # scikit-learn 1.1 and removed in 1.3.
          'max_features': [0.5,0.7,0.9,None],
          'min_samples_split': [2,3,10],
          'min_samples_leaf': [1,3,10],
          "subsample":[0.7,0.8,0.9]}

start = time()
gridSearch_gb = GridSearchCV(GradientBoostingRegressor(warm_start=True,random_state=100),
                             param_grid=params,cv=kfold,n_jobs=6)
gridSearch_gb.fit(X_train,y_train)

print('training took {} minutes'.format((time() - start)/60.))

In [None]:
# Best-ranked candidates found by the randomized search.
report_hyper_param_search(results=randomSearch_gb.cv_results_)

In [None]:
# Best-ranked candidates found by the grid search.
report_hyper_param_search(results=gridSearch_gb.cv_results_)

In [None]:
def score_dataset(model,X_train, X_valid, y_train, y_valid,error_fn=mean_absolute_error):
    """Return ``error_fn(y_valid, model.predict(X_valid))``.

    The model is used as given; ``fit`` is not called here. ``X_train`` and
    ``y_train`` are accepted but not used by this function.
    """
    return error_fn(y_valid, model.predict(X_valid))

In [None]:
# Validation-split MAE of the randomized search's best estimator.
score_dataset(
    model=randomSearch_gb.best_estimator_,
    X_train=X_train,
    X_valid=X_valid,
    y_train=y_train,
    y_valid=y_valid,
    error_fn=mean_absolute_error,
)

In [None]:
# Validation-split MAE of the grid search's best estimator.
score_dataset(
    model=gridSearch_gb.best_estimator_,
    X_train=X_train,
    X_valid=X_valid,
    y_train=y_train,
    y_valid=y_valid,
    error_fn=mean_absolute_error,
)

In [None]:
# 4-fold cross-validation of the randomized search's best estimator on
# pre_processed.X / pre_processed.y.
scores = cross_val_score(
    estimator=randomSearch_gb.best_estimator_,
    X=pre_processed.X,
    y=pre_processed.y,
    cv=4,
)
print("Cross-validation scores: {}, mean score = {}".format(scores,scores.mean()))

In [None]:
# 4-fold cross-validation of the grid search's best estimator on
# pre_processed.X / pre_processed.y.
scores = cross_val_score(
    estimator=gridSearch_gb.best_estimator_,
    X=pre_processed.X,
    y=pre_processed.y,
    cv=4,
)
print("Cross-validation scores: {}, mean score = {}".format(scores,scores.mean()))

In [None]:
# Predict the preprocessed test set with the randomized-search model and
# write the predictions next to the test frame's Id index.
preds_test = randomSearch_gb.predict(pre_processed.X_test)
output = pd.DataFrame(
    data={'Id': X_test.index, 'SalePrice': preds_test},
)
# index=False: only the Id and SalePrice columns are written.
output.to_csv('submission3.csv', index=False)
