In [1]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupShuffleSplit
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import joblib
from sklearn.metrics import classification_report



from util import multiclass_roc_auc_score

In [2]:
npzfile = np.load('data/data_d1_d2_data.npz')
features = npzfile['features']
labels = npzfile['labels']
groups = npzfile['groups']
features = np.nan_to_num(features)

In [3]:
# GroupShuffleSplit for train-test split
gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=42)
idx_trn, idx_test = next(gss.split(labels, groups=groups))
X_train = features[idx_trn, :]
X_test = features[idx_test, :]
y_train = labels[idx_trn, 1]
y_test = labels[idx_test, 1]

In [4]:
estimators = [
    ('clf', XGBClassifier(objective='multi:softmax', num_class=len(np.unique(3)), random_state=8))
]
pipe = Pipeline(steps=estimators)
search_space = {
    'clf__max_depth': [2, 4, 6, 8],
    'clf__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'clf__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'clf__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'clf__colsample_bylevel': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'clf__colsample_bynode': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'clf__reg_alpha': [0.0, 0.1, 0.5, 1.0, 5.0, 10.0],
    'clf__reg_lambda': [0.0, 0.1, 0.5, 1.0, 5.0, 10.0],
    'clf__gamma': [0.0, 0.1, 0.5, 1.0, 5.0, 10.0],
    'clf__objective': ['multi:softmax', 'multi:softprob'],
    'clf__num_class': [0, 1, 2, 3, 4, 5, 6, 7, 8],
}

# Create RandomizedSearchCV object
opt = RandomizedSearchCV(pipe, search_space, n_iter=10,cv=5, scoring='accuracy', random_state=8, n_jobs=-1)

In [5]:
opt

In [6]:
opt.fit(X_train, y_train)
joblib.dump(opt, 'model/model_xgboost_data_d1_d2.pkl')

y_pred = opt.predict(X_test)
print('Accuracy for 3 class problem using XGBoost classifier:', accuracy_score(y_test, y_pred, normalize=True))
# Assuming you have the implementation of multiclass_roc_auc_score function
print('AUC for 3 class problem using XGBoost classifier:', multiclass_roc_auc_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy for 3 class problem using XGBoost classifier: 0.6882640586797066
AUC for 3 class problem using XGBoost classifier: 0.7527385116151324
              precision    recall  f1-score   support

         0.0       0.73      0.65      0.69       950
         1.0       0.61      0.78      0.69      1378
         2.0       0.85      0.59      0.70       944

    accuracy                           0.69      3272
   macro avg       0.73      0.67      0.69      3272
weighted avg       0.71      0.69      0.69      3272



In [2]:
opt = joblib.load('model/model_xgboost_data_d1_d2.pkl')
best_model = opt.best_estimator_.steps[0][1]

In [3]:
opt.best_params_

{'clf__subsample': 0.5,
 'clf__reg_lambda': 0.5,
 'clf__reg_alpha': 1.0,
 'clf__objective': 'multi:softmax',
 'clf__num_class': 4,
 'clf__max_depth': 4,
 'clf__learning_rate': 0.001,
 'clf__gamma': 5.0,
 'clf__colsample_bytree': 0.7,
 'clf__colsample_bynode': 0.8,
 'clf__colsample_bylevel': 0.8}

70-30  
Accuracy for 3 class problem using XGBoost classifier: 0.68071000855432  
AUC for 3 class problem using XGBoost classifier: 0.7488851164032369  
```
              precision    recall  f1-score   support

         0.0       0.73      0.67      0.70      1427
         1.0       0.60      0.75      0.67      1946
         2.0       0.81      0.59      0.68      1303

    accuracy                           0.68      4676
   macro avg       0.71      0.67      0.68      4676
weighted avg       0.70      0.68      0.68      4676   ```