From 0cb88b2ba7b039552da4ac8d779f708993c1d072 Mon Sep 17 00:00:00 2001 From: Vladimir Iglovikov Date: Tue, 25 Oct 2016 11:56:34 -0700 Subject: [PATCH 1/3] Added method points_to_csv that saves known data points to csv file --- bayes_opt/bayesian_optimization.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/bayes_opt/bayesian_optimization.py b/bayes_opt/bayesian_optimization.py index 4c74334e2..06c7eb128 100644 --- a/bayes_opt/bayesian_optimization.py +++ b/bayes_opt/bayesian_optimization.py @@ -327,3 +327,17 @@ def maximize(self, # Print a final report if verbose active. if self.verbose: self.plog.print_summary() + + def points_to_csv(self, file_name): + """ + After training all points for which we know target variable + (both from initialization and optimization) are saved + + :param file_name: name of the file where points will be saved in the csv format + + :return: None + """ + import pandas as pd + points_df = pd.DataFrame(self.X, columns=self.keys) + points_df['target'] = self.Y + points_df.to_csv(file_name, index=False) \ No newline at end of file From eda6bed4ee3970c5c927bf24332fc737deb7d512 Mon Sep 17 00:00:00 2001 From: Vladimir Iglovikov Date: Tue, 25 Oct 2016 17:33:16 -0700 Subject: [PATCH 2/3] Added xgboost example --- examples/xgb_example.py | 79 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 examples/xgb_example.py diff --git a/examples/xgb_example.py b/examples/xgb_example.py new file mode 100644 index 000000000..b638cf756 --- /dev/null +++ b/examples/xgb_example.py @@ -0,0 +1,79 @@ +""" +Bayesian hyperparameter optimization [https://github.com/fmfn/BayesianOptimization] +for Mean Absolute Error objective +on default features for https://www.kaggle.com/c/allstate-claims-severity +""" + +__author__ = "Vladimir Iglovikov" + +import pandas as pd +import xgboost as xgb +from sklearn.preprocessing import LabelEncoder +from sklearn.metrics import mean_absolute_error +from bayes_opt import 
BayesianOptimization +from tqdm import tqdm + + +def xgb_evaluate(min_child_weight, + colsample_bytree, + max_depth, + subsample, + gamma, + alpha): + + params['min_child_weight'] = int(min_child_weight) + params['colsample_bytree'] = max(min(colsample_bytree, 1), 0) + params['max_depth'] = int(max_depth) + params['subsample'] = max(min(subsample, 1), 0) + params['gamma'] = max(gamma, 0) + params['alpha'] = max(alpha, 0) + + + cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5, + seed=random_state, + callbacks=[xgb.callback.early_stop(50)]) + + return -cv_result['test-mae-mean'].values[-1] + + +def prepare_data(): + train = pd.read_csv('../input/train.csv') + categorical_columns = train.select_dtypes(include=['object']).columns + + for column in tqdm(categorical_columns): + le = LabelEncoder() + train[column] = le.fit_transform(train[column]) + + y = train['loss'] + + X = train.drop(['loss', 'id'], axis=1) + xgtrain = xgb.DMatrix(X, label=y) + + return xgtrain + + +if __name__ == '__main__': + xgtrain = prepare_data() + + num_rounds = 3000 + random_state = 2016 + num_iter = 25 + init_points = 5 + params = { + 'eta': 0.1, + 'silent': 1, + 'eval_metric': 'mae', + 'verbose_eval': True, + 'seed': random_state + } + + xgbBO = BayesianOptimization(xgb_evaluate, {'min_child_weight': (1, 20), + 'colsample_bytree': (0.5, 1), + 'max_depth': (5, 15), + 'subsample': (0.5, 1), + 'gamma': (0, 10), + 'alpha': (0, 10), + }) + + xgbBO.maximize(init_points=init_points, n_iter=num_iter) + From 0e95b604efae17157786f9980b912a34633791c4 Mon Sep 17 00:00:00 2001 From: Vladimir Iglovikov Date: Tue, 25 Oct 2016 11:56:34 -0700 Subject: [PATCH 3/3] Added method points_to_csv that saves known data points to csv file --- bayes_opt/bayesian_optimization.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bayes_opt/bayesian_optimization.py b/bayes_opt/bayesian_optimization.py index 06c7eb128..eda86e139 100644 --- a/bayes_opt/bayesian_optimization.py +++ 
b/bayes_opt/bayesian_optimization.py @@ -337,7 +337,7 @@ def points_to_csv(self, file_name): :return: None """ - import pandas as pd - points_df = pd.DataFrame(self.X, columns=self.keys) - points_df['target'] = self.Y - points_df.to_csv(file_name, index=False) \ No newline at end of file + + points = np.hstack((self.X, np.expand_dims(self.Y, axis=1))) + header = ', '.join(self.keys + ['target']) + np.savetxt(file_name, points, header=header, delimiter=',') \ No newline at end of file