In [1]:
import pandas as pd
import wandb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
# Authenticate this session with Weights & Biases before any sweep is created.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mminjabenho[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Datasets
# One (train, validation) CSV pair per experiment section below.
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
TRUNCATED_BASELINE_TRAIN = "data/train/baseline_truncated_train.csv"
TRUNCATED_BASELINE_VAL = "data/train/baseline_truncated_val.csv"
# Backward-compatible alias for the original misspelled name ("BASLINE"),
# which the truncated-baseline section still references.
TRUNCATED_BASLINE_VAL = TRUNCATED_BASELINE_VAL
BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline_w_feature_eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline_w_feature_eng_val.csv"
PPSM_BASELINE_TRAIN = "data/train/ppsm_baseline_train.csv"
PPSM_BASELINE_VAL = "data/train/ppsm_baseline_val.csv"
PPSM_FEAT_ENG_TRAIN = "data/train/feature_eng_ppsm_train.csv"
PPSM_FEAT_ENG_VAL = "data/train/feature_eng_ppsm_val.csv"

### Model for the Baseline Dataset

In [None]:
# Bayesian sweep whose objective is to minimise the logged `mse` value.
metric = dict(name='mse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)


In [None]:
# Discrete candidate values per GradientBoostingRegressor hyperparameter,
# wrapped in the {'values': [...]} form used by the sweep configuration.
# NOTE(review): min_samples_leaf is fixed at 20, so a node needs at least 40
# samples to be split; the min_samples_split candidates (all below 40) may
# therefore have no effect on the fitted model — confirm.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.1],
        'n_estimators': [100, 200, 300, 500],
        'min_samples_split': [4, 16, 32],
        'min_samples_leaf': [20],
        'max_depth': [3, 6],
        'min_impurity_decrease': [0.1, 0.2],
        'min_weight_fraction_leaf': [0.0],
        'subsample': [0.75, 0.5],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Select the baseline train/validation CSVs for the cells that follow.
TRAIN_DATA, VAL_DATA = BASELINE_TRAIN, BASELINE_VAL

In [None]:
# Read both splits, then separate the `monthly_rent` target from the features.
# Note: X_test / y_test hold the *validation* split read from VAL_DATA.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

y_train, X_train = train_set["monthly_rent"], train_set.drop(columns=["monthly_rent"])
y_test, X_test = val_set["monthly_rent"], val_set.drop(columns=["monthly_rent"])


In [None]:
def train():
    """Run one sweep trial: fit a gradient-boosting regressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a new W&B run. The
    model is fitted on the notebook-level ``X_train``/``y_train`` and evaluated
    on ``X_test``/``y_test``. Logs ``mse`` and ``coefficient_of_determination``
    (the value returned by ``regressor.score``).
    """
    tuned_names = (
        "learning_rate",
        "n_estimators",
        "max_depth",
        "subsample",
        "min_samples_leaf",
        "min_samples_split",
        "min_impurity_decrease",
        "min_weight_fraction_leaf",
    )
    with wandb.init():
        config = wandb.config
        hyperparams = {name: getattr(config, name) for name in tuned_names}

        # Fixed seed so repeated trials with the same hyperparameters match.
        regressor = GradientBoostingRegressor(random_state=42, **hyperparams)
        regressor.fit(X_train, y_train)

        score = regressor.score(X_test, y_test)
        predictions = regressor.predict(X_test)

        # The feature-importance plot and wandb.sklearn.plot_regressor
        # diagnostics remain disabled.
        wandb.log(data={
            "mse": mean_squared_error(y_test, predictions),
            "coefficient_of_determination": score,
        })



In [None]:
# Register the sweep under the baseline project, then run up to 20 trials locally.
sweep_id = wandb.sweep(sweep_config, project="hdb_rental_prices_baseline")
# Pass the function object itself: `train` takes no arguments and the agent
# invokes it once per trial. Calling it here would raise a TypeError.
wandb.agent(sweep_id, train, count=20)

### Model for the Truncated Baseline Set

In [None]:
# Bayesian sweep whose objective is to minimise the logged `mse` value.
metric = dict(name='mse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)

In [None]:
# Discrete candidate values per GradientBoostingRegressor hyperparameter,
# wrapped in the {'values': [...]} form used by the sweep configuration.
# NOTE(review): min_samples_leaf is fixed at 20, so a node needs at least 40
# samples to be split; the min_samples_split candidates (all below 40) may
# therefore have no effect on the fitted model — confirm.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.1],
        'n_estimators': [100, 200, 300, 500],
        'min_samples_split': [4, 16, 32],
        'min_samples_leaf': [20],
        'max_depth': [3, 6],
        'min_impurity_decrease': [0.1, 0.2],
        'min_weight_fraction_leaf': [0.0],
        'subsample': [0.75, 0.5],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Select the truncated-baseline train/validation CSVs for the cells that follow.
TRAIN_DATA, VAL_DATA = TRUNCATED_BASELINE_TRAIN, TRUNCATED_BASLINE_VAL

In [None]:
# Read both splits, then separate the `monthly_rent` target from the features.
# Note: X_test / y_test hold the *validation* split read from VAL_DATA.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

y_train, X_train = train_set["monthly_rent"], train_set.drop(columns=["monthly_rent"])
y_test, X_test = val_set["monthly_rent"], val_set.drop(columns=["monthly_rent"])

In [None]:
def train():
    """Run one sweep trial: fit a gradient-boosting regressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a new W&B run. The
    model is fitted on the notebook-level ``X_train``/``y_train`` and evaluated
    on ``X_test``/``y_test``. Logs ``mse`` and ``coefficient_of_determination``
    (the value returned by ``regressor.score``).
    """
    tuned_names = (
        "learning_rate",
        "n_estimators",
        "max_depth",
        "subsample",
        "min_samples_leaf",
        "min_samples_split",
        "min_impurity_decrease",
        "min_weight_fraction_leaf",
    )
    with wandb.init():
        config = wandb.config
        hyperparams = {name: getattr(config, name) for name in tuned_names}

        # Fixed seed so repeated trials with the same hyperparameters match.
        regressor = GradientBoostingRegressor(random_state=42, **hyperparams)
        regressor.fit(X_train, y_train)

        score = regressor.score(X_test, y_test)
        predictions = regressor.predict(X_test)

        # The feature-importance plot and wandb.sklearn.plot_regressor
        # diagnostics remain disabled.
        wandb.log(data={
            "mse": mean_squared_error(y_test, predictions),
            "coefficient_of_determination": score,
        })

In [None]:
# Register the sweep under the truncated-baseline project, then let a local
# agent call `train` for up to 20 trials.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_truncated_baseline")
wandb.agent(sweep_id, function=train, count=20)

### Model for the Feature Engineering Set

In [4]:
# Bayesian sweep whose objective is to minimise the logged `mse` value.
metric = dict(name='mse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)

In [5]:
# Discrete candidate values per GradientBoostingRegressor hyperparameter,
# wrapped in the {'values': [...]} form used by the sweep configuration.
# NOTE(review): min_samples_leaf is fixed at 20, so a node needs at least 40
# samples to be split; the min_samples_split candidates (all below 40) may
# therefore have no effect. The recorded runs for this sweep report identical
# mse for min_samples_split 4, 16 and 32 with other settings equal — confirm.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.1],
        'n_estimators': [100, 200, 300, 500],
        'min_samples_split': [4, 16, 32],
        'min_samples_leaf': [20],
        'max_depth': [3, 6],
        'min_impurity_decrease': [0.1, 0.2],
        'min_weight_fraction_leaf': [0.0],
        'subsample': [0.75, 0.5],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [6]:
# Select the feature-engineered train/validation CSVs for the cells that follow.
TRAIN_DATA, VAL_DATA = BASELINE_W_FEAT_ENG_TRAIN, BASELINE_W_FEAT_ENG_VAL

In [7]:
# Read both splits, then separate the `monthly_rent` target from the features.
# Note: X_test / y_test hold the *validation* split read from VAL_DATA.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

y_train, X_train = train_set["monthly_rent"], train_set.drop(columns=["monthly_rent"])
y_test, X_test = val_set["monthly_rent"], val_set.drop(columns=["monthly_rent"])

In [8]:
def train():
    """Run one sweep trial: fit a gradient-boosting regressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a new W&B run. The
    model is fitted on the notebook-level ``X_train``/``y_train`` and evaluated
    on ``X_test``/``y_test``. Logs ``mse`` and ``coefficient_of_determination``
    (the value returned by ``regressor.score``).
    """
    tuned_names = (
        "learning_rate",
        "n_estimators",
        "max_depth",
        "subsample",
        "min_samples_leaf",
        "min_samples_split",
        "min_impurity_decrease",
        "min_weight_fraction_leaf",
    )
    with wandb.init():
        config = wandb.config
        hyperparams = {name: getattr(config, name) for name in tuned_names}

        # Fixed seed so repeated trials with the same hyperparameters match.
        regressor = GradientBoostingRegressor(random_state=42, **hyperparams)
        regressor.fit(X_train, y_train)

        score = regressor.score(X_test, y_test)
        predictions = regressor.predict(X_test)

        # The feature-importance plot and wandb.sklearn.plot_regressor
        # diagnostics remain disabled.
        wandb.log(data={
            "mse": mean_squared_error(y_test, predictions),
            "coefficient_of_determination": score,
        })

In [9]:
# Register the sweep under the feature-engineering project, then let a local
# agent call `train` for up to 20 trials.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_feature_eng")
wandb.agent(sweep_id, function=train, count=20)

Create sweep with ID: 0jp3waxw
Sweep URL: https://wandb.ai/minjabenho/hdb_rental_prices_feature_eng/sweeps/0jp3waxw


[34m[1mwandb[0m: Agent Starting Run: 6c77sxc1 with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	min_impurity_decrease: 0.1
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 16
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 500
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.53936
mse,225881.99885


[34m[1mwandb[0m: Agent Starting Run: fqyvoktc with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	min_impurity_decrease: 0.1
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 200
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.53667
mse,227201.26337


[34m[1mwandb[0m: Agent Starting Run: poa1jh9e with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 32
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 500
[34m[1mwandb[0m: 	subsample: 0.75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54034
mse,225401.31931


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vbdtmr4j with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.1
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: 7mi0unlz with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	min_impurity_decrease: 0.1
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 32
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 200
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.53667
mse,227201.26337


[34m[1mwandb[0m: Agent Starting Run: wcb64ijp with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 200
[34m[1mwandb[0m: 	subsample: 0.75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.5398
mse,225667.87298


[34m[1mwandb[0m: Agent Starting Run: 3eclldko with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 16
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: cm3lvoe9 with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	min_impurity_decrease: 0.1
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 16
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 200
[34m[1mwandb[0m: 	subsample: 0.75
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.53746
mse,226814.02398


[34m[1mwandb[0m: Agent Starting Run: hoa23249 with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 16
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 200
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.5407
mse,225225.1865


[34m[1mwandb[0m: Agent Starting Run: 74bu9vuc with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 16
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: h6kv9m2a with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.1
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: foa9itso with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 32
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: o9mcgkdi with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6piyz98x with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: bx1bgs1p with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 32
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: pxr3jfnx with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 16
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: mvijeccq with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: mfei53hs with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: ylwwqvey with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 16
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


[34m[1mwandb[0m: Agent Starting Run: 56cqnbr4 with config:
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	min_impurity_decrease: 0.2
[34m[1mwandb[0m: 	min_samples_leaf: 20
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	min_weight_fraction_leaf: 0
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	subsample: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
coefficient_of_determination,▁
mse,▁

0,1
coefficient_of_determination,0.54095
mse,225100.69559


### Model for the Price Per Square Metre Set

In [None]:
# Bayesian sweep whose objective is to minimise the logged `mse` value.
metric = dict(name='mse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)

In [None]:
# Discrete candidate values per GradientBoostingRegressor hyperparameter,
# wrapped in the {'values': [...]} form used by the sweep configuration.
# NOTE(review): min_samples_leaf is fixed at 20, so a node needs at least 40
# samples to be split; the min_samples_split candidates (all below 40) may
# therefore have no effect on the fitted model — confirm.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.1],
        'n_estimators': [100, 200, 300, 500],
        'min_samples_split': [4, 16, 32],
        'min_samples_leaf': [20],
        'max_depth': [3, 6],
        'min_impurity_decrease': [0.1, 0.2],
        'min_weight_fraction_leaf': [0.0],
        'subsample': [0.75, 0.5],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Select the price-per-square-metre baseline CSVs for the cells that follow.
TRAIN_DATA, VAL_DATA = PPSM_BASELINE_TRAIN, PPSM_BASELINE_VAL

In [None]:
# Read both splits, then separate the `rent_per_sqm` target from the features.
# Note: X_test / y_test hold the *validation* split read from VAL_DATA.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

y_train, X_train = train_set["rent_per_sqm"], train_set.drop(columns=["rent_per_sqm"])
y_test, X_test = val_set["rent_per_sqm"], val_set.drop(columns=["rent_per_sqm"])

In [None]:
def train():
    """Run one sweep trial: fit a gradient-boosting regressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a new W&B run. The
    model is fitted on the notebook-level ``X_train``/``y_train`` and evaluated
    on ``X_test``/``y_test``. Logs ``mse`` and ``coefficient_of_determination``
    (the value returned by ``regressor.score``).
    """
    tuned_names = (
        "learning_rate",
        "n_estimators",
        "max_depth",
        "subsample",
        "min_samples_leaf",
        "min_samples_split",
        "min_impurity_decrease",
        "min_weight_fraction_leaf",
    )
    with wandb.init():
        config = wandb.config
        hyperparams = {name: getattr(config, name) for name in tuned_names}

        # Fixed seed so repeated trials with the same hyperparameters match.
        regressor = GradientBoostingRegressor(random_state=42, **hyperparams)
        regressor.fit(X_train, y_train)

        score = regressor.score(X_test, y_test)
        predictions = regressor.predict(X_test)

        # The feature-importance plot and wandb.sklearn.plot_regressor
        # diagnostics remain disabled.
        wandb.log(data={
            "mse": mean_squared_error(y_test, predictions),
            "coefficient_of_determination": score,
        })

In [None]:
# Register the sweep under the price-per-square-metre baseline project, then
# let a local agent call `train` for up to 20 trials.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_ppsm_baseline")
wandb.agent(sweep_id, function=train, count=20)

### Model for the Price Per Square Metre Set with Feature Engineering

In [None]:
# Bayesian sweep whose objective is to minimise the logged `mse` value.
metric = dict(name='mse', goal='minimize')
sweep_config = dict(method='bayes', metric=metric)

In [None]:
# Discrete candidate values per GradientBoostingRegressor hyperparameter,
# wrapped in the {'values': [...]} form used by the sweep configuration.
# This section searches a wider grid than the earlier ones: learning rate,
# leaf size, depth and leaf weight fraction each have several candidates.
parameters_dict = {
    name: {'values': candidates}
    for name, candidates in {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 200, 300, 500],
        'min_samples_split': [4, 16, 32],
        'min_samples_leaf': [10, 20, 30],
        'max_depth': [3, 6, 9],
        'min_impurity_decrease': [0.1, 0.2],
        'min_weight_fraction_leaf': [0.0, 0.1],
        'subsample': [0.75, 0.5],
    }.items()
}
sweep_config['parameters'] = parameters_dict

In [None]:
# Select the feature-engineered price-per-square-metre CSVs for the cells that follow.
TRAIN_DATA, VAL_DATA = PPSM_FEAT_ENG_TRAIN, PPSM_FEAT_ENG_VAL

In [None]:
# Read both splits, then separate the `rent_per_sqm` target from the features.
# Note: X_test / y_test hold the *validation* split read from VAL_DATA.
train_set, val_set = pd.read_csv(TRAIN_DATA), pd.read_csv(VAL_DATA)

y_train, X_train = train_set["rent_per_sqm"], train_set.drop(columns=["rent_per_sqm"])
y_test, X_test = val_set["rent_per_sqm"], val_set.drop(columns=["rent_per_sqm"])

In [None]:
def train():
    """Run one sweep trial: fit a gradient-boosting regressor and log its metrics.

    Hyperparameters are read from ``wandb.config`` inside a new W&B run. The
    model is fitted on the notebook-level ``X_train``/``y_train`` and evaluated
    on ``X_test``/``y_test``. Logs ``mse`` and ``coefficient_of_determination``
    (the value returned by ``regressor.score``).
    """
    tuned_names = (
        "learning_rate",
        "n_estimators",
        "max_depth",
        "subsample",
        "min_samples_leaf",
        "min_samples_split",
        "min_impurity_decrease",
        "min_weight_fraction_leaf",
    )
    with wandb.init():
        config = wandb.config
        hyperparams = {name: getattr(config, name) for name in tuned_names}

        # Fixed seed so repeated trials with the same hyperparameters match.
        regressor = GradientBoostingRegressor(random_state=42, **hyperparams)
        regressor.fit(X_train, y_train)

        score = regressor.score(X_test, y_test)
        predictions = regressor.predict(X_test)

        # The feature-importance plot and wandb.sklearn.plot_regressor
        # diagnostics remain disabled.
        wandb.log(data={
            "mse": mean_squared_error(y_test, predictions),
            "coefficient_of_determination": score,
        })

In [None]:
# Register the sweep under the feature-engineered price-per-square-metre
# project, then let a local agent call `train` for up to 20 trials.
sweep_id = wandb.sweep(sweep=sweep_config, project="hdb_rental_prices_ppsm_feat_eng")
wandb.agent(sweep_id, function=train, count=20)