In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
from typing import List #for type hinting

In [12]:

cols = [    
  'class_label', 'lepton_pt', 'lepton_eta', 'lepton_phi', 'missing_energy_magnitude',
  'missing_energy_phi', 'jet_1_pt', 'jet_1_eta', 'jet_1_phi', 'jet_1_b-tag',
  'jet_2_pt', 'jet_2_eta', 'jet_2_phi', 'jet_2_b-tag', 'jet_3_pt',
  'jet_3_eta', 'jet_3_phi', 'jet_3_b-tag', 'jet_4_pt', 'jet_4_eta',
  'jet_4_phi', 'jet_4_b-tag', 'm_jj', 'm_jjj', 'm_lv', 'm_jlv',
  'm_bb', 'm_wbb', 'm_wwbb'
] 
filename = "./drive/MyDrive/HIGGS_train.csv"

df = pd.read_csv(filename, header=None, names=cols)
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df.isnull().sum()
#remove rows with missing values
df.dropna(inplace=True)

  df = pd.read_csv(filename, header=None, names=cols)


In [110]:
scaler = StandardScaler()
    
cols_to_scale = df.columns[1:]

scaler.fit(df[cols_to_scale])

df[cols_to_scale] = scaler.transform(df[cols_to_scale])

X = df.iloc[:, 1:].values 
y = df.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)        
X_test, X_val, y_test, y_val = train_test_split(X, y, test_size=0.5, random_state=0) 

train_matrix = xgb.DMatrix(X_train, label=y_train)
val_matrix = xgb.DMatrix(X_val, label=y_val)
test_matrix = xgb.DMatrix(X_test, y_test)

In [72]:
params = {
  'learning_rate': hp.choice('learning_rate', [0.01, 0.1, 0.15, 0.2, 0.3]),
  'max_depth' : hp.choice('max_depth', range(3,15,3)),
  'gamma' : hp.choice('gamma', [i/10.0 for i in range(0,5)]),
  'colsample_bytree' : hp.choice('colsample_bytree', [i/10.0 for i in range(3,10)]),     
  'reg_alpha' : hp.choice('reg_alpha', [1e-5, 1e-2, 0.1, 1, 10, 100]), 
  'reg_lambda' : hp.choice('reg_lambda', [1e-5, 1e-2, 0.1, 1, 10, 100]),
  'min_child_weight' : hp.choice('min_child_weight', [1, 3, 5, 7, 9]),
  'subsample' : hp.choice('subsample', [i/10.0 for i in range(3,10)]),
  'objective' : 'binary:logistic',
  'tree_method' : 'gpu_hist',
}

In [73]:
def objective(params):
  clf = xgb.train(params=params, dtrain=train_matrix, num_boost_round=100)
  
  y_pred = clf.predict(val_matrix)
  y_pred_labels = np.array([1 if p > 0.5 else 0 for p in y_pred])

  acc = np.mean(y_pred_labels == y_val)
  # dictionary with information for evaluation
  return {'loss': -acc, 'params': params, 'status': STATUS_OK}

In [74]:
trials = Trials()
best = fmin(fn = objective, space = params, algo = tpe.suggest, max_evals = 300, trials = trials)

100%|██████████| 300/300 [23:53<00:00,  4.78s/trial, best loss: -0.8236721578143854]


In [75]:
print(best)

{'colsample_bytree': 4, 'gamma': 1, 'learning_rate': 4, 'max_depth': 3, 'min_child_weight': 0, 'reg_alpha': 2, 'reg_lambda': 0, 'subsample': 6}


In [111]:
best_params = space_eval(params, best)

In [112]:
xgb_best =  xgb.train(params=best_params, dtrain=train_matrix, num_boost_round=100)

In [115]:
print(f'train acc: {np.mean(np.array([1 if p > 0.5 else 0 for p in xgb_best.predict(train_matrix)]) == y_train)}')
print(f'val acc: {np.mean(np.array([1 if p > 0.5 else 0 for p in xgb_best.predict(val_matrix)]) == y_val)}') 
print(f'test acc: {np.mean(np.array([1 if p > 0.5 else 0 for p in xgb_best.predict(test_matrix)])  == y_test)}')

train acc: 0.9713783669883357
val acc: 0.8236721578143854
test acc: 0.9711564743764959


In [116]:
xgb_best.save_model('xgb_model.bin')

In [117]:
from google.colab import files
files.download('xgb_model.bin')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>