In [1]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

import numpy as np
import time
import pickle
from h5py import File
import pandas as pd

In [2]:
params = {
        'min_child_weight': [1, 2, 4, 5, 8, 10, 12, ],
        'gamma': list(0.1 * np.arange(25) + 0.1),
        'subsample': list(0.01 * np.arange(100) + 0.01),
        'colsample_bytree': list(0.01 * np.arange(100) + 0.01),
        'max_depth': [3, 4, 5]
        }

xgb = XGBRegressor(objective='reg:linear', learning_rate=0.1,
                   alpha=10, n_estimators=512)

In [3]:
random_search = RandomizedSearchCV(xgb,
                                   param_distributions=params,
                                   n_iter=256,
                                   scoring='neg_mean_squared_error',
                                   n_jobs=36,
                                   cv=3, verbose=3, random_state=42)

In [4]:
t1 = time.time()
with File('mixed_values.h5', 'r') as hf:
    X = hf['mixed_values'][:]
#with open('raw_values.pkl', 'rb') as f:
#    X = pickle.load(f)


#raw_values = pickl
df = pd.read_pickle('for_y.pkl')
y = df['LowGap_SOHSE_Approx'].values
t2 = time.time()

print(f'X.shape is {X.shape}')
print(f'y.shape is {y.shape}')
print(f'... load time is {t2 - t1:.2f} seconds')

t1 = time.time()
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.15, random_state=42)
t2 = time.time()

print("Number of training examples: %i \t number of features: %i" %
      (Xtr.shape[0], Xtr.shape[1]))

print("Number of testing examples:  %i \t number of features: %i" %
      (Xte.shape[0], Xtr.shape[1]))
print(f'... load time is {t2 - t1:.2f} seconds')

X.shape is (1921, 23436)
y.shape is (1921,)
... load time is 0.16 seconds
Number of training examples: 1632 	 number of features: 23436
Number of testing examples:  289 	 number of features: 23436
... load time is 0.11 seconds


In [5]:
random_search.fit(Xtr, ytr)

Fitting 3 folds for each of 256 candidates, totalling 768 fits


[Parallel(n_jobs=36)]: Using backend LokyBackend with 36 concurrent workers.
[Parallel(n_jobs=36)]: Done  56 tasks      | elapsed: 11.0min
[Parallel(n_jobs=36)]: Done 216 tasks      | elapsed: 39.4min
[Parallel(n_jobs=36)]: Done 440 tasks      | elapsed: 69.8min
[Parallel(n_jobs=36)]: Done 768 out of 768 | elapsed: 121.7min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=512, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1),
          fit_params=None, iid='warn', n_iter=256, n_jobs=36,
          param_distributions={'min_child_weight': [1, 2, 4, 5, 8, 10, 12], 'gamma': [0.1, 0.2, 0.30000000000000004, 0.4, 0.5, 0.6, 0.7000000000000001, 0.8, 0.9, 1.0, 1.1, 1.2000000000000002, 1.3000000000000003, 1.4000000000000001, 1.5000000000000002, 1.6, 1.7000000000000002, 1.8000000000000003, 1.90000000000...00000000001, 0.9500000000000001, 0.9600000000000001, 0.97, 0.98, 0.99, 1.0], 'max_depth': [3, 4, 5]},
          pre_dispatc

In [6]:
mse = np.sqrt(mean_squared_error(
    yte, random_search.best_estimator_.predict(Xte)
))

mae = mean_absolute_error(
    yte, random_search.best_estimator_.predict(Xte)
)

print(f'MSE = {mse:.3f}, MAE = {mae:.3f}')

MSE = 0.454, MAE = 0.319


In [7]:
random_search.best_estimator_

XGBRegressor(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.59, gamma=0.2, importance_type='gain',
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=2, missing=None, n_estimators=512, n_jobs=1,
       nthread=None, objective='reg:linear', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.63)