In [1]:
import pandas as pd
import numpy as np

from sklearn.base import RegressorMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK


In [2]:
# Functions



def eval_model(y_train_true, y_train_pred, y_test_true, y_test_pred, metric, **kwargs):
    """Calculate metrics for train and test sets"""
    try:
        return {
            f"{metric.__name__}_train": metric(y_train_true, y_train_pred, **kwargs),
            f"{metric.__name__}_test": metric(y_test_true, y_test_pred, **kwargs),
        }
    except:
        import pdb; pdb.set_trace()


        
def create_predictions(model, X_train, X_test, per_m2: bool = False):
    """Create predictions for train and test"""
    if per_m2:
        res = (
            model.predict(X_train) * X_train["GrLivArea"],
            model.predict(X_test) * X_test["GrLivArea"],
        )
         
     
    else:
        res = ( model.predict(X_train), model.predict(X_test))
    try:
        assert np.isnan(res[0]).sum()==0

    except:
        import pdb; pdb.set_trace()
    return res


def setup_regression(
    train, train_features, core_model: RegressorMixin, per_m2=False, log_y=False
):
    """Create a model and separate the data into X, y train and test"""

    if per_m2:
        y_name = "SalePrice_per_GrLivArea"
    else:
        y_name = "SalePrice"

    X_train, X_test, y_train, y_test = train_test_split(
        train[train_features], train[y_name], test_size=0.33, random_state=42
    )

    if log_y:
        model = TransformedTargetRegressor(
            regressor=core_model, func=np.log, inverse_func=np.exp
        )
    else:
        model = core_model
    model_pipeline = Pipeline(
        [("OHE", OneHotEncoder(handle_unknown="ignore")), ("model", model)]
    )
    model_pipeline.fit(X_train, y_train)
    if per_m2:
        return (
            model_pipeline,
            X_train,
            X_test,
            y_train * X_train["GrLivArea"],
            y_test * X_test["GrLivArea"],
        )
    else:
        return model_pipeline, X_train, X_test, y_train, y_test
    

def calculate_metrics_for_pipeline(train, train_features, core_model, per_m2, log_y):
    model_pipeline, X_train, X_test, y_train, y_test = setup_regression(train.fillna(0),train_features,
                                                               core_model= core_model,
                                                                per_m2=per_m2, log_y=log_y)

    y_train_pred, y_test_pred = create_predictions(model_pipeline, X_train, X_test, per_m2=per_m2)
    
    
    return {**eval_model(y_train, y_train_pred,y_test, y_test_pred, r2_score),
              **eval_model(y_train, y_train_pred,y_test, y_test_pred, mean_squared_error, squared=False)}

In [3]:
# Saját függvények

def tree_based_prediction (X, y, core_model = RandomForestRegressor()):

    """Docstring-et írni!"""

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    model_pipeline = Pipeline(
        [("OHE", OneHotEncoder(handle_unknown="ignore")), ("model", core_model)])
    
    SalePrice_prediction = model_pipeline.fit(X_train, y_train)
    
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_test_pred)
    
    return mae

In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [5]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
X_train = train.drop(['SalePrice'], axis = 1)
y_train = train['SalePrice']

In [7]:
X_train
y_train

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [8]:
# Kidobom az oszlopokat, ahol NaN értékek szerepelnek.

nan_columns = [col for col in train.columns if train[col].isnull().sum() > 0]
train = train.drop(nan_columns, axis = 1)

In [9]:
# A 81 oszlopból már csak 61 maradt meg.

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [10]:
X = train.drop('SalePrice', axis = 1)
y = train['SalePrice']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1095, 61) (365, 61) (1095,) (365,)


### Random Forest

In [40]:
tree_based_prediction(X,y, core_model = RandomForestRegressor())

22766.27920547945

### Gradient Boosted Tree

In [14]:
tree_based_prediction(X,y, core_model = GradientBoostingRegressor())

22457.543186849678

### AdaBoost

In [15]:
tree_based_prediction(X,y, core_model = AdaBoostRegressor())

31162.243009518108

### XGBoost

In [16]:
tree_based_prediction(X,y, core_model = XGBRegressor())

22855.914897260274

## Bayesian Optimization - Random Forest

In [28]:
rf_search_space = {
    "n_estimators": hp.randint("n_estimators", 200, 1000),
    "max_depth": hp.randint("max_depth", 1, 8),
    "min_samples_split": hp.randint("min_samples_split", 20, 100),
    "min_samples_leaf": hp.uniform("min_samples_leaf", 0, 0.5),
    "max_features": hp.uniform("max_features", 0.05, 0.6),
    "min_impurity_decrease": hp.uniform("min_impurity_decrease", 0, 0.3),
    "bootstrap": hp.choice("bootstrap", [True, False])

}

In [29]:
def black_box_fn(search_space_point):
    return tree_based_prediction(
        X,
        y,
        core_model=RandomForestRegressor(**search_space_point)
    )

In [30]:
best_params_rf = fmin(
  fn=black_box_fn,
  space=rf_search_space,
  algo=tpe.suggest,
  max_evals=50)

100%|█████████| 50/50 [00:42<00:00,  1.18trial/s, best loss: 27666.072328145536]


In [31]:
best_params_rf

{'bootstrap': 1,
 'max_depth': 5,
 'max_features': 0.24560896063009216,
 'min_impurity_decrease': 0.24357551910242103,
 'min_samples_leaf': 0.09129141203594165,
 'min_samples_split': 25,
 'n_estimators': 618}

In [42]:
tree_based_prediction(X,y, core_model = RandomForestRegressor(**best_params_rf))

35655.33760986302

## Bayesian Optimization - Gradien Boosted Tree

In [49]:
gbt_search_space = {
    "learning_rate": hp.loguniform('learning_rate',0.1, 1),
    "n_estimators": hp.randint("n_estimators", 200, 1000),  
    "max_depth": hp.randint("max_depth", 1, 8),
    "min_samples_split": hp.randint("min_samples_split", 20, 100),
    "min_samples_leaf": hp.uniform("min_samples_leaf", 0, 0.5),
    "max_features": hp.uniform("max_features", 0.05, 0.6),
    "min_impurity_decrease": hp.uniform("min_impurity_decrease", 0, 0.3)
}

In [50]:
def black_box_fn(search_space_point):
    return tree_based_prediction(
        X,
        y,
        core_model=GradientBoostingRegressor(**search_space_point)
    )

In [55]:
best_params_gbt = fmin(
  fn=black_box_fn,
  space=gbt_search_space,
  algo=tpe.suggest,
  max_evals=50)

 62%|██████▏   | 31/50 [00:17<00:06,  2.86trial/s, best loss: 22587.10587036188]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))



 92%|█████████▏| 46/50 [00:28<00:02,  1.42trial/s, best loss: 22587.10587036188]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))

  sample_weight * ((y - raw_predictions.ravel()) ** 2)))



100%|██████████| 50/50 [00:33<00:00,  1.51trial/s, best loss: 22587.10587036188]


In [56]:
best_params_gbt

{'learning_rate': 1.1128358602458417,
 'max_depth': 1,
 'max_features': 0.0585287634518,
 'min_impurity_decrease': 0.19081529368401368,
 'min_samples_leaf': 0.04787261639196187,
 'min_samples_split': 30,
 'n_estimators': 868}

In [57]:
tree_based_prediction(X,y, core_model = GradientBoostingRegressor(**best_params_gbt))

25968.82036234169

## Bayesian Optimization - AdaBoosted Tree

In [67]:
abt_search_space = {
    "learning_rate": hp.loguniform('learning_rate',0.1, 1),
    "n_estimators": hp.randint("n_estimators", 200, 1000),
    "loss": hp.choice("loss", ['linear', 'square', 'exponential'])
}

In [59]:
def black_box_fn(search_space_point):
    return tree_based_prediction(
        X,
        y,
        core_model=AdaBoostRegressor(**search_space_point)
    )

In [61]:
best_params_abt = fmin(
  fn=black_box_fn,
  space=abt_search_space,
  algo=tpe.suggest,
  max_evals=50)

best_params_abt

100%|█████████| 50/50 [02:03<00:00,  2.47s/trial, best loss: 29364.034759040343]


{'learning_rate': 1.9196999503614776, 'loss': 2, 'n_estimators': 297}

In [65]:
best_params_abt = {'learning_rate': 1.9196999503614776, 'loss': 'exponential', 'n_estimators': 297}

In [66]:
tree_based_prediction(X,y, core_model = AdaBoostRegressor(**best_params_abt))

32958.888993735716

## Bayesian Optimization - XGBoost

In [74]:
xgb_search_space = {
    "eta": hp.uniform("eta", 0, 1),
    "gamma": hp.loguniform('gamma',0.1, 1),
    "max_depth": hp.randint("max_depth", 1, 8),
    "min_child_weight": hp.loguniform('min_child_weight',0.1, 1),
    "max_delta_step": hp.randint("max_delta_step", 1, 10),
#    "num_feature": hp.quniform("num_feature", 1, 15, 1),
#    "num_parallel_tree": hp.quniform("num_parallel_tree", 1, 8, 1)
}

In [75]:
def black_box_fn(search_space_point):
    return tree_based_prediction(
        X,
        y,
        core_model=XGBRegressor(**search_space_point)
    )

In [76]:
best_params_xgb = fmin(
  fn=black_box_fn,
  space=xgb_search_space,
  algo=tpe.suggest,
  max_evals=50)

best_params_xgb

100%|██████████| 50/50 [00:05<00:00,  9.52trial/s, best loss: 174740.7778000714]


{'eta': 0.35248003272532086,
 'gamma': 2.233132387198671,
 'max_delta_step': 3,
 'max_depth': 1,
 'min_child_weight': 1.2838115532140115}

In [78]:
tree_based_prediction(X,y, core_model = XGBRegressor(**best_params_xgb))

181815.49560829057