In [1]:
import pandas as pd
import wandb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [2]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mminjabenho[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Datasets
BASELINE_TRAIN = "data/train/baseline_train.csv"
BASELINE_VAL = "data/train/baseline_val.csv"
TRUNCATED_BASELINE_TRAIN = "data/train/baseline_truncated_train.csv"
TRUNCATED_BASLINE_VAL = "data/train/baseline_truncated_val.csv"
BASELINE_W_FEAT_ENG_TRAIN = "data/train/baseline_w_feature_eng_train.csv"
BASELINE_W_FEAT_ENG_VAL = "data/train/baseline_w_feature_eng_val.csv"
PPSM_BASELINE_TRAIN = "data/train/ppsm_baseline_train.csv"
PPSM_BASELINE_VAL = "data/train/ppsm_baseline_val.csv"

In [5]:
def train():
    with wandb.init():
        config = wandb.config

        regressor = GradientBoostingRegressor(
            random_state=42,
            learning_rate=config.learning_rate,
            n_estimators=config.n_estimators,
            max_depth=config.max_depth,
            subsample=config.subsample,
            min_samples_leaf=config.min_samples_leaf,
            min_samples_split=config.min_samples_split,
            min_impurity_decrease=config.min_impurity_decrease,
            min_weight_fraction_leaf=config.min_weight_fraction_leaf)
        regressor.fit(X_train, y_train)
        score = regressor.score(X_test, y_test)
        y_pred = regressor.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

    
        # feat_importances = pd.Series(regressor.feature_importances_, index=X_train.columns)
        # plt.barh(X_train.columns, feat_importances.nlargest(10))
        # wandb.log({'feature_importances': plt})

        # wandb.sklearn.plot_regressor(regressor, X_train, X_test, y_train, y_test, 'GradientBoost')
        wandb.log(data={"mse": mse, "coefficient_of_determination": score})



### Model For the Baseline Dataset

In [3]:
sweep_config = {
    'method': 'bayes'
}
metric = {
    'name': 'mse',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric


In [None]:
parameters_dict = {
    'learning_rate': {
        'values': [0.1]
    },
    'n_estimators': {
        'values': [100, 200, 300, 500]
    },
    'min_samples_split': {
        'values': [4, 16, 32]
    },
    'min_samples_leaf': {
        'values': [20]
    },
    'max_depth': {
        'values': [3, 6]
    },
    'min_impurity_decrease': {
        'values': [0.1, 0.2]
    },
    'min_weight_fraction_leaf': {
        'values': [0.0]
    },
    'subsample': {
        'values': [0.75, 0.5]
    }
}
sweep_config['parameters'] = parameters_dict

In [None]:
TRAIN_DATA = BASELINE_TRAIN
VAL_DATA = BASELINE_VAL

In [None]:
train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

y_train = train_set["monthly_rent"]
X_train = train_set.drop(columns=['monthly_rent'])

y_test = val_set['monthly_rent']
X_test = val_set.drop(columns=['monthly_rent'])


In [None]:
sweep_id = wandb.sweep(sweep_config, project="hdb_rental_prices_baseline")
wandb.agent(sweep_id, train(X_train, y_train, X_test, y_test), count=20)

### Model for the Truncated Baseline Set

In [None]:
TRAIN_DATA = TRUNCATED_BASELINE_TRAIN
VAL_DATA = TRUNCATED_BASLINE_VAL

In [None]:
train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

y_train = train_set["monthly_rent"]
X_train = train_set.drop(columns=['monthly_rent'])

y_test = val_set['monthly_rent']
X_test = val_set.drop(columns=['monthly_rent'])

### Model for the Feature Engineering Set

In [None]:
TRAIN_DATA = BASELINE_W_FEAT_ENG_TRAIN
VAL_DATA = BASELINE_W_FEAT_ENG_VAL

In [None]:
train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

y_train = train_set["monthly_rent"]
X_train = train_set.drop(columns=['monthly_rent'])

y_test = val_set['monthly_rent']
X_test = val_set.drop(columns=['monthly_rent'])

### Model for the Price Per Square Metre Set

In [10]:
sweep_config = {
    'method': 'bayes'
}
metric = {
    'name': 'mse',
    'goal': 'minimize'   
    }

sweep_config['metric'] = metric

In [11]:
parameters_dict = {
    'learning_rate': {
        'values': [0.1]
    },
    'n_estimators': {
        'values': [100, 200, 300, 500]
    },
    'min_samples_split': {
        'values': [4, 16, 32]
    },
    'min_samples_leaf': {
        'values': [20]
    },
    'max_depth': {
        'values': [3, 6]
    },
    'min_impurity_decrease': {
        'values': [0.1, 0.2]
    },
    'min_weight_fraction_leaf': {
        'values': [0.0]
    },
    'subsample': {
        'values': [0.75, 0.5]
    }
}
sweep_config['parameters'] = parameters_dict

In [12]:
TRAIN_DATA = PPSM_BASELINE_TRAIN
VAL_DATA = PPSM_BASELINE_VAL

In [13]:
train_set = pd.read_csv(TRAIN_DATA)
val_set = pd.read_csv(VAL_DATA)

y_train = train_set["rent_per_sqm"]
X_train = train_set.drop(columns=['rent_per_sqm'])

y_test = val_set['rent_per_sqm']
X_test = val_set.drop(columns=['rent_per_sqm'])

In [14]:
sweep_id = wandb.sweep(sweep_config, project="hdb_rental_prices_ppsm_baseline")
wandb.agent(sweep_id, train(X_train, y_train, X_test, y_test), count=20)

Create sweep with ID: w6iqpm1j
Sweep URL: https://wandb.ai/minjabenho/hdb_rental_prices_ppsm_baseline/sweeps/w6iqpm1j


Traceback (most recent call last):
  File "/home/benhgm/miniconda3/envs/agents/lib/python3.10/site-packages/wandb/sdk/wandb_config.py", line 162, in __getattr__
    return self.__getitem__(key)
  File "/home/benhgm/miniconda3/envs/agents/lib/python3.10/site-packages/wandb/sdk/wandb_config.py", line 130, in __getitem__
    return self._items[key]
KeyError: 'learning_rate'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/tmp/ipykernel_700758/4271729088.py", line 7, in train
    learning_rate=config.learning_rate,
  File "/home/benhgm/miniconda3/envs/agents/lib/python3.10/site-packages/wandb/sdk/wandb_config.py", line 164, in __getattr__
    raise AttributeError(
AttributeError: <class 'wandb.sdk.wandb_config.Config'> object has no attribute 'learning_rate'




AttributeError: <class 'wandb.sdk.wandb_config.Config'> object has no attribute 'learning_rate'