In [10]:
import pandas as pd
import wandb
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [11]:
# Authenticate this session with Weights & Biases; the sweeps below are created after this call.
wandb.login()

True

In [12]:
# Datasets: relative paths to the train/validation CSV splits used by each section.
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
TRUNCATED_BASELINE_TRAIN = "data/train/baseline_truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline_truncated_val.csv"
# Misspelled legacy name, kept as an alias because later cells still reference it.
TRUNCATED_BASLINE_VAL = TRUNCATED_BASELINE_VAL
BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline_w_feature_eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline_w_feature_eng_val.csv"
PPSM_BASELINE_TRAIN = "data/train/ppsm_baseline_train.csv"
PPSM_BASELINE_VAL = "data/train/ppsm_baseline_val.csv"
PPSM_FEAT_ENG_TRAIN = "data/train/feature_eng_ppsm_train.csv"
PPSM_FEAT_ENG_VAL = "data/train/feature_eng_ppsm_val.csv"

### Model for the Baseline Dataset

In [13]:
# Bayesian sweep whose objective is to minimise the logged `val_rmse` value.
metric = dict(name='val_rmse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)


In [14]:
# Search space: every hyperparameter is a discrete list of candidate values,
# wrapped in the {'values': [...]} form that the sweep configuration expects.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.1],
        'max_iter': [100, 200, 300, 500, 1000, 2000],
        'max_leaf_nodes': [10, 31, 50, 100],
        'max_depth': [3, 4, 5, 6, 7, 8, 9],
        'min_samples_leaf': [20, 40, 60],
        'l2_regularization': [0, 0.1, 0.2, 0.3],
        'max_bins': [255, 128],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [15]:
# Select the baseline train/validation files for this section.
TRAIN_DATA, VAL_DATA = BASELINE_TRAIN, BASELINE_VAL

In [16]:
# Read the selected train and validation splits.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

# Split each frame into the `monthly_rent` target and the remaining feature columns.
X_train, y_train = train_set.drop(columns="monthly_rent"), train_set["monthly_rent"]
X_test, y_test = val_set.drop(columns="monthly_rent"), val_set["monthly_rent"]


In [17]:
def train():
    """Run one sweep trial: fit a HistGradientBoostingRegressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a fresh W&B run.
    The data comes from the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` objects. Logs ``train_rmse``, ``train_score``,
    ``val_rmse`` and ``val_score`` (the scores are ``regressor.score`` values).
    """
    with wandb.init():
        config = wandb.config

        regressor = HistGradientBoostingRegressor(
            random_state=42,
            learning_rate=config.learning_rate,
            max_iter=config.max_iter,
            max_depth=config.max_depth,
            max_leaf_nodes=config.max_leaf_nodes,
            min_samples_leaf=config.min_samples_leaf,
            l2_regularization=config.l2_regularization,
            max_bins=config.max_bins)
        regressor.fit(X_train, y_train)

        train_score = regressor.score(X_train, y_train)
        val_score = regressor.score(X_test, y_test)

        # RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        y_train_pred = regressor.predict(X_train)
        train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
        y_test_pred = regressor.predict(X_test)
        val_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

        wandb.log(data={"train_rmse": train_rmse, "train_score": train_score, "val_rmse": val_rmse, "val_score": val_score})



In [18]:
# Register the sweep under the baseline project, then let a local agent run 20 trials of `train`.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_baseline_rmse_histgb")
wandb.agent(sweep_id, function=train, count=20)

Create sweep with ID: aq7um8de
Sweep URL: https://wandb.ai/minjabenho/hdb_rental_prices_baseline_rmse_histgb/sweeps/aq7um8de


[34m[1mwandb[0m: Agent Starting Run: i04ewzf2 with config:
[34m[1mwandb[0m: 	l2_regularization: 0.1
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 255
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	max_iter: 2000
[34m[1mwandb[0m: 	max_leaf_nodes: 100
[34m[1mwandb[0m: 	min_samples_leaf: 40
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,465.22262
train_score,0.57842
val_rmse,474.99006
val_score,0.53978


[34m[1mwandb[0m: Agent Starting Run: joudmmrc with config:
[34m[1mwandb[0m: 	l2_regularization: 0.3
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 255
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	max_iter: 500
[34m[1mwandb[0m: 	max_leaf_nodes: 31
[34m[1mwandb[0m: 	min_samples_leaf: 40
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,472.2943
train_score,0.56551
val_rmse,474.82777
val_score,0.5401


[34m[1mwandb[0m: Agent Starting Run: apk40rro with config:
[34m[1mwandb[0m: 	l2_regularization: 0.1
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 255
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	max_iter: 500
[34m[1mwandb[0m: 	max_leaf_nodes: 31
[34m[1mwandb[0m: 	min_samples_leaf: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,482.78497
train_score,0.54599
val_rmse,477.15122
val_score,0.53559


[34m[1mwandb[0m: Agent Starting Run: wnq1mdas with config:
[34m[1mwandb[0m: 	l2_regularization: 0.3
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 255
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	max_iter: 1000
[34m[1mwandb[0m: 	max_leaf_nodes: 50
[34m[1mwandb[0m: 	min_samples_leaf: 40
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,468.13616
train_score,0.57313
val_rmse,473.89466
val_score,0.5419


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7byz92ai with config:
[34m[1mwandb[0m: 	l2_regularization: 0.3
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 128
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	max_iter: 2000
[34m[1mwandb[0m: 	max_leaf_nodes: 100
[34m[1mwandb[0m: 	min_samples_leaf: 60
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,464.91286
train_score,0.57899
val_rmse,474.63443
val_score,0.54047


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: a1k4thlk with config:
[34m[1mwandb[0m: 	l2_regularization: 0.3
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 255
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	max_iter: 2000
[34m[1mwandb[0m: 	max_leaf_nodes: 50
[34m[1mwandb[0m: 	min_samples_leaf: 60
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,469.58732
train_score,0.57048
val_rmse,474.8252
val_score,0.5401


[34m[1mwandb[0m: Agent Starting Run: wswo3kar with config:
[34m[1mwandb[0m: 	l2_regularization: 0.3
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 255
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	max_iter: 2000
[34m[1mwandb[0m: 	max_leaf_nodes: 100
[34m[1mwandb[0m: 	min_samples_leaf: 40
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,462.10597
train_score,0.58405
val_rmse,475.23986
val_score,0.5393


[34m[1mwandb[0m: Agent Starting Run: cvl01fxr with config:
[34m[1mwandb[0m: 	l2_regularization: 0.2
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 128
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	max_iter: 2000
[34m[1mwandb[0m: 	max_leaf_nodes: 50
[34m[1mwandb[0m: 	min_samples_leaf: 20
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,467.13164
train_score,0.57496
val_rmse,474.26316
val_score,0.54119


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lf2n4ui0 with config:
[34m[1mwandb[0m: 	l2_regularization: 0.2
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 255
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	max_iter: 300
[34m[1mwandb[0m: 	max_leaf_nodes: 31
[34m[1mwandb[0m: 	min_samples_leaf: 40
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
train_rmse,▁
train_score,▁
val_rmse,▁
val_score,▁

0,1
train_rmse,470.65111
train_score,0.56853
val_rmse,474.7594
val_score,0.54023


[34m[1mwandb[0m: Agent Starting Run: bxuzi789 with config:
[34m[1mwandb[0m: 	l2_regularization: 0.1
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_bins: 255
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	max_iter: 100
[34m[1mwandb[0m: 	max_leaf_nodes: 50
[34m[1mwandb[0m: 	min_samples_leaf: 40
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


### Model for the Truncated Baseline Set

In [None]:
# Bayesian sweep whose objective is to minimise the logged `mse` value.
metric = dict(name='mse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)

In [None]:
# Search space: every hyperparameter is a discrete list of candidate values,
# wrapped in the {'values': [...]} form that the sweep configuration expects.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.1],
        'max_iter': [100, 200, 300, 500, 1000, 2000],
        'max_leaf_nodes': [10, 31, 50, 100],
        'max_depth': [3, 4, 5, 6, 7, 8, 9],
        'min_samples_leaf': [20, 40, 60],
        'l2_regularization': [0, 0.1, 0.2, 0.3],
        'max_bins': [255, 128],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Select the truncated-baseline train/validation files for this section.
TRAIN_DATA, VAL_DATA = TRUNCATED_BASELINE_TRAIN, TRUNCATED_BASLINE_VAL

In [None]:
# Read the selected train and validation splits.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

# Split each frame into the `monthly_rent` target and the remaining feature columns.
X_train, y_train = train_set.drop(columns="monthly_rent"), train_set["monthly_rent"]
X_test, y_test = val_set.drop(columns="monthly_rent"), val_set["monthly_rent"]

In [None]:
def train():
    """Run one sweep trial: fit a HistGradientBoostingRegressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a fresh W&B run.
    The data comes from the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` objects. Logs ``mse`` (the metric this section's
    sweep minimises) and ``coefficient_of_determination`` on the validation split.
    """
    with wandb.init():
        config = wandb.config

        # Only hyperparameters defined in this section's search space are read.
        # The previous version read n_estimators, subsample, min_samples_split,
        # etc., which the sweep never sets and HistGradientBoostingRegressor
        # does not accept.
        regressor = HistGradientBoostingRegressor(
            random_state=42,
            learning_rate=config.learning_rate,
            max_iter=config.max_iter,
            max_depth=config.max_depth,
            max_leaf_nodes=config.max_leaf_nodes,
            min_samples_leaf=config.min_samples_leaf,
            l2_regularization=config.l2_regularization,
            max_bins=config.max_bins)
        regressor.fit(X_train, y_train)

        score = regressor.score(X_test, y_test)
        y_pred = regressor.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        wandb.log(data={"mse": mse, "coefficient_of_determination": score})

In [None]:
# Register the sweep under the truncated-baseline project, then let a local agent run 20 trials of `train`.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_truncated_baseline")
wandb.agent(sweep_id, function=train, count=20)

### Model for the Feature Engineering Set

In [None]:
# Bayesian sweep for the feature-engineering dataset.
sweep_config = {
    'method': 'bayes'
}
# The objective name must match a key that `train` passes to `wandb.log`.
# This section's `train` logs `val_rmse` (not `mse`), so optimise that.
metric = {
    'name': 'val_rmse',
    'goal': 'minimize'
}

sweep_config['metric'] = metric

In [None]:
# Search space: every hyperparameter is a discrete list of candidate values,
# wrapped in the {'values': [...]} form that the sweep configuration expects.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.1],
        'max_iter': [100, 200, 300, 500, 1000, 2000],
        'max_leaf_nodes': [10, 31, 50, 100],
        'max_depth': [3, 4, 5, 6, 7, 8, 9],
        'min_samples_leaf': [20, 40, 60],
        'l2_regularization': [0, 0.1, 0.2, 0.3],
        'max_bins': [255, 128],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Select the feature-engineered baseline train/validation files for this section.
TRAIN_DATA, VAL_DATA = BASELINE_W_FEAT_ENG_TRAIN, BASELINE_W_FEAT_ENG_VAL

In [None]:
# Read the selected train and validation splits.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

# Split each frame into the `monthly_rent` target and the remaining feature columns.
X_train, y_train = train_set.drop(columns="monthly_rent"), train_set["monthly_rent"]
X_test, y_test = val_set.drop(columns="monthly_rent"), val_set["monthly_rent"]

In [None]:
def train():
    """Run one sweep trial: fit a HistGradientBoostingRegressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a fresh W&B run.
    The data comes from the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` objects. Logs ``train_rmse``, ``train_score``,
    ``val_rmse`` and ``val_score`` (the scores are ``regressor.score`` values).
    """
    with wandb.init():
        config = wandb.config

        regressor = HistGradientBoostingRegressor(
            random_state=42,
            learning_rate=config.learning_rate,
            max_iter=config.max_iter,
            max_depth=config.max_depth,
            max_leaf_nodes=config.max_leaf_nodes,
            min_samples_leaf=config.min_samples_leaf,
            l2_regularization=config.l2_regularization,
            max_bins=config.max_bins)
        regressor.fit(X_train, y_train)

        train_score = regressor.score(X_train, y_train)
        val_score = regressor.score(X_test, y_test)

        # RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        y_train_pred = regressor.predict(X_train)
        train_rmse = mean_squared_error(y_train, y_train_pred) ** 0.5
        y_test_pred = regressor.predict(X_test)
        val_rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

        wandb.log(data={"train_rmse": train_rmse, "train_score": train_score, "val_rmse": val_rmse, "val_score": val_score})

In [None]:
# Register the sweep under the feature-engineering project, then let a local agent run 20 trials of `train`.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_feature_eng_rmse_histgb")
wandb.agent(sweep_id, function=train, count=20)

### Model for the Price Per Square Metre Set

In [None]:
# Bayesian sweep whose objective is to minimise the logged `val_rmse` value.
metric = dict(name='val_rmse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)

In [None]:
# Search space: every hyperparameter is a discrete list of candidate values,
# wrapped in the {'values': [...]} form that the sweep configuration expects.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.1],
        'n_estimators': [100, 200, 300, 500],
        'min_samples_split': [4, 16, 32],
        'min_samples_leaf': [20],
        'max_depth': [3, 6],
        'min_impurity_decrease': [0.1, 0.2],
        'min_weight_fraction_leaf': [0.0],
        'subsample': [0.75, 0.5],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Select the price-per-square-metre baseline train/validation files for this section.
TRAIN_DATA, VAL_DATA = PPSM_BASELINE_TRAIN, PPSM_BASELINE_VAL

In [None]:
# Read the selected train and validation splits.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

# Split each frame into the `rent_per_sqm` target and the remaining feature columns.
X_train, y_train = train_set.drop(columns="rent_per_sqm"), train_set["rent_per_sqm"]
X_test, y_test = val_set.drop(columns="rent_per_sqm"), val_set["rent_per_sqm"]

In [None]:
def train():
    """Run one sweep trial: fit a gradient-boosting regressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a fresh W&B run.
    The data comes from the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` objects. Logs ``mse``, ``val_rmse`` (the metric
    this section's sweep minimises) and ``coefficient_of_determination``,
    all computed on the validation split.
    """
    # This section's search space (n_estimators, subsample, min_samples_split,
    # min_impurity_decrease, min_weight_fraction_leaf) consists of
    # GradientBoostingRegressor parameters. HistGradientBoostingRegressor
    # rejects them with a TypeError, so the matching estimator is used here.
    # NOTE(review): if HistGradientBoosting was intended, the search space must
    # be rewritten with its parameters (max_iter, max_leaf_nodes, ...) instead.
    from sklearn.ensemble import GradientBoostingRegressor

    with wandb.init():
        config = wandb.config

        regressor = GradientBoostingRegressor(
            random_state=42,
            learning_rate=config.learning_rate,
            n_estimators=config.n_estimators,
            max_depth=config.max_depth,
            subsample=config.subsample,
            min_samples_leaf=config.min_samples_leaf,
            min_samples_split=config.min_samples_split,
            min_impurity_decrease=config.min_impurity_decrease,
            min_weight_fraction_leaf=config.min_weight_fraction_leaf)
        regressor.fit(X_train, y_train)

        score = regressor.score(X_test, y_test)
        y_pred = regressor.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        # The sweep objective is named `val_rmse`; without this key the
        # Bayesian search would have nothing to optimise.
        val_rmse = mse ** 0.5

        wandb.log(data={"mse": mse, "val_rmse": val_rmse, "coefficient_of_determination": score})

In [None]:
# Register the sweep under the PPSM baseline project, then let a local agent run 20 trials of `train`.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_ppsm_baseline")
wandb.agent(sweep_id, function=train, count=20)

### Model for the Price Per Square Metre Set with Feature Engineering

In [None]:
# Bayesian sweep whose objective is to minimise the logged `mse` value.
metric = dict(name='mse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)

In [None]:
# Search space: every hyperparameter is a discrete list of candidate values,
# wrapped in the {'values': [...]} form that the sweep configuration expects.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 200, 300, 500],
        'min_samples_split': [4, 16, 32],
        'min_samples_leaf': [10, 20, 30],
        'max_depth': [3, 6, 9],
        'min_impurity_decrease': [0.1, 0.2],
        'min_weight_fraction_leaf': [0.0, 0.1],
        'subsample': [0.75, 0.5],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Select the feature-engineered price-per-square-metre train/validation files for this section.
TRAIN_DATA, VAL_DATA = PPSM_FEAT_ENG_TRAIN, PPSM_FEAT_ENG_VAL

In [None]:
# Read the selected train and validation splits.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

# Split each frame into the `rent_per_sqm` target and the remaining feature columns.
X_train, y_train = train_set.drop(columns="rent_per_sqm"), train_set["rent_per_sqm"]
X_test, y_test = val_set.drop(columns="rent_per_sqm"), val_set["rent_per_sqm"]

In [None]:
def train():
    """Run one sweep trial: fit a gradient-boosting regressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a fresh W&B run.
    The data comes from the notebook-level ``X_train``/``y_train`` and
    ``X_test``/``y_test`` objects. Logs ``mse`` (the metric this section's
    sweep minimises) and ``coefficient_of_determination`` on the validation split.
    """
    # This section's search space (n_estimators, subsample, min_samples_split,
    # min_impurity_decrease, min_weight_fraction_leaf) consists of
    # GradientBoostingRegressor parameters. HistGradientBoostingRegressor
    # rejects them with a TypeError, so the matching estimator is used here.
    # NOTE(review): if HistGradientBoosting was intended, the search space must
    # be rewritten with its parameters (max_iter, max_leaf_nodes, ...) instead.
    from sklearn.ensemble import GradientBoostingRegressor

    with wandb.init():
        config = wandb.config

        regressor = GradientBoostingRegressor(
            random_state=42,
            learning_rate=config.learning_rate,
            n_estimators=config.n_estimators,
            max_depth=config.max_depth,
            subsample=config.subsample,
            min_samples_leaf=config.min_samples_leaf,
            min_samples_split=config.min_samples_split,
            min_impurity_decrease=config.min_impurity_decrease,
            min_weight_fraction_leaf=config.min_weight_fraction_leaf)
        regressor.fit(X_train, y_train)

        score = regressor.score(X_test, y_test)
        y_pred = regressor.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        wandb.log(data={"mse": mse, "coefficient_of_determination": score})

In [None]:
# Register the sweep under the PPSM feature-engineering project, then let a local agent run 20 trials of `train`.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_ppsm_feat_eng")
wandb.agent(sweep_id, function=train, count=20)