## Multiclassification

In [11]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
OMP_NUM_THREADS=1 
import hyperopt
from hyperopt import hp
from hpsklearn import HyperoptEstimator, xgboost_classification, sgd, any_classifier
import xgboost as xgb
from sklearn import datasets, linear_model
import matplotlib.pyplot as plt

from yellowbrick.target import class_balance, ClassBalance
from yellowbrick.classifier import ConfusionMatrix

In [12]:
RANDOM_STATE = 42

In [13]:
def timer(start_time=None):
    # this is customized function to print the time of performance
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

## Data
Dataset used for numbers classification problems

In [14]:
mnist = fetch_openml('mnist_784', version=1)
# split for test, train and validation

X_train, X_test, y_train, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, train_size=50000, random_state=RANDOM_STATE)

#X_train_val, X_test, y_train_val, y_test = train_test_split(
#    mnist.data, mnist.target, test_size=10000, random_state=RANDOM_STATE)
#X_train, X_val, y_train, y_val = train_test_split(
#    X_train_val, y_train_val, test_size=10000, random_state=RANDOM_STATE)

In [15]:
labels = np.unique(y_train)

## Multiclass

Część metod modelarskich obługuje też multiclass classification. Lista tutaj: https://scikit-learn.org/stable/modules/multiclass.html#
Poniżej przykład xgboost z użyciem multiclass ('objective':"multi:softmax" zamiast "binary").

In [None]:
start_time = timer(start_time=None)

param = {'objective':"multi:softmax",
         'random_state':RANDOM_STATE}

xgb_cl = XGBClassifier(**param)
xgb_cl.fit(X_train, y_train)

timer(start_time)

In [None]:
xgb_cl.params

In [16]:
accuracy_score(xgb_cl.predict(X_test), y_test)

0.976

## Optimized hiperparameters

In [67]:
# parameters optimization

# A parameter grid for XGBoost
params_grid = {
        'min_child_weight': [1, 5, 10],
        'learning_rate': [0.1, 0.5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 7, 10],
        'objective': ["multi:softmax"]
        }

In [41]:
# hyperopt

search_space = { 
 'min_child_weight': hp.choice('min_child_weight', np.arange(1, 10, dtype=int)),
     'learning_rate': hp.uniform('learning_rate', 0.1, 0.5),
     'subsample': hp.uniform('subsample', 0.6, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1),
     'max_depth': hp.choice( 'max_depth', np.arange(5, 10, dtype=int))
}

# define objective function
def hyperparameter_tuning(params):
    bst = XGBClassifier(**param)
    bst.fit(X_train, y_train)
    acc = accuracy_score(bst.predict(X_val), y_val)
    return -acc

start_time = timer(None)
best = hyperopt.fmin(fn=hyperparameter_tuning, space=search_space, algo=hyperopt.tpe.suggest, max_evals=5)
timer(start_time) 

100%|██████████| 5/5 [36:42<00:00, 440.53s/trial, best loss: -0.9966]

 Time taken: 0 hours 36 minutes and 42.68 seconds.


In [69]:
# randomized search
folds = 3

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = RANDOM_STATE)

random_search = RandomizedSearchCV(xgb_cl, param_distributions=params_grid, 
                                   n_iter=5, 
                                   n_jobs=3, cv=skf.split(X_train, y_train),
                                   verbose=10, 
                                   random_state=RANDOM_STATE)

# optimize
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed: 15.0min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 50.6min
[Parallel(n_jobs=3)]: Done  12 out of  15 | elapsed: 64.4min remaining: 16.1min
[Parallel(n_jobs=3)]: Done  15 out of  15 | elapsed: 75.6min finished



 Time taken: 1 hours 21 minutes and 25.15 seconds.


In [70]:
random_search.best_params_

{'subsample': 0.8,
 'objective': 'multi:softmax',
 'min_child_weight': 5,
 'max_depth': 7,
 'learning_rate': 0.5,
 'colsample_bytree': 0.8}

In [71]:
xgb_cl = XGBClassifier(**random_search.best_params_)
xgb_cl.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.5, max_delta_step=0, max_depth=7,
              min_child_weight=5, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [72]:
accuracy_score(xgb_cl.predict(X_test), y_test)

0.9733

## multiclass with OneVsRest (scikit learn method)
OneVsRest buduje niezależnie osobny model dla kazdej klasy. Może zostać uzyte z dowolnym klasyfikatorem binarnym. Dokumentacja: https://scikit-learn.org/stable/modules/multiclass.html#one-vs-the-rest.
Uwaga: modele budowane są z takim samym zestawem hiperparametrów i nie znalazłam implementacji, która pozwalałaby zautomatyzować w pewnym stopniu optymalizację tych hiperparametrów dla każdego modelu. Aczkolwiek wydaje się, że takiego rozwiązanie to kwestia dopisania kilku linijek. (Przykłąd w kolejnym akapicie)

In [None]:
def compute_acc_per_class(y_pred_class, y_true_class, classes):
    acc = {}
    for label in classes:
        y_pred_class_tmp = y_pred_class == label
        y_true_class_tmp = y_true_class == y_true_class
        acc[label] = sum(y_pred_class_tmp == y_true_class_tmp)/len(y_true_class_tmp)
    return acc

In [19]:
start_time = timer(start_time=None)

param = {'objective':"binary:logistic",
         'random_state':RANDOM_STATE}
ovr_clf = OneVsRestClassifier(XGBClassifier(**param))
ovr_clf.fit(X_train, y_train)

timer(start_time)



 Time taken: 0 hours 7 minutes and 23.8 seconds.


In [95]:
y_pred_class = ovr_clf.predict(X_test)

In [None]:
compute_acc_per_class(y_pred_class, y_true_class = y_test, classes = labels)

In [20]:
accuracy_score(ovr_clf.predict(X_test), y_test)

0.9776

## One vs Rest with optimized parameters for each class


In [85]:
def optimize_hiperparams(xgb_cl, X_train, y_train, params_grid, RANDOM_STATE):
    # randomized search
    folds = 3

    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = RANDOM_STATE)

    random_search = RandomizedSearchCV(xgb_cl, param_distributions=params_grid, 
                                       n_iter=5, 
                                       n_jobs=3, cv=skf.split(X_train, y_train),
                                       verbose=10, 
                                       random_state=RANDOM_STATE)

    # optimize
    start_time = timer(None) # timing starts from this point for "start_time" variable
    random_search.fit(X_train, y_train)
    return random_search.best_params_

In [None]:
params_grid = {
        'min_child_weight': [1, 5, 10],
        'learning_rate': [0.1, 0.5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5, 7, 10],
        'objective': ["binary:logistic"]
        }

X_test_predict = []
columns = []

for label in labels:
    print("Building model for label: ", label)
    y_train_tmp = [1  if x == label else 0 for x in y_train]
    best_params = optimize_hiperparams(xgb_cl, X_train, y_train_tmp, params_grid, RANDOM_STATE)
    xgb_bin_tmp = XGBClassifier(**best_params)
    xgb_bin_tmp.fit(X_train, y_train_tmp)
    
    idx = 1
    X_test_predict.append(xgb_bin_tmp.predict_proba(X_test)[:, idx])
    columns.append(label + "_prob")


In [39]:
X_test_predict1 = pd.DataFrame(X_test_predict).T


In [40]:
X_test_predict1['pred_class'] = X_test_predict1.idxmax(axis="columns")
X_test_predict1['true_class'] = y_test

In [41]:
X_test_predict1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,pred_class,true_class
0,0.000007,6.486103e-07,5.114993e-05,0.001260,1.610277e-06,0.000005,5.220033e-07,1.228419e-06,9.999697e-01,0.000001,8,8
1,0.000003,2.875402e-07,8.070029e-07,0.000004,9.999899e-01,0.000012,1.052732e-05,1.254222e-03,5.400230e-06,0.000144,4,4
2,0.000002,3.207768e-06,1.239824e-05,0.001151,1.431903e-05,0.001840,4.519127e-03,3.908603e-06,7.310153e-01,0.000077,8,8
3,0.000003,2.413362e-06,2.115519e-04,0.000382,3.053685e-07,0.000001,1.038110e-07,9.999633e-01,1.290018e-06,0.000009,7,7
4,0.000018,4.146792e-06,1.155458e-05,0.000019,4.646954e-06,0.000015,1.436491e-07,9.998953e-01,2.375720e-06,0.000591,7,7
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.000002,7.224315e-06,9.999481e-01,0.000006,5.716324e-06,0.000005,1.369874e-06,1.483103e-05,9.785467e-07,0.000016,2,2
9996,0.000003,1.115329e-06,1.526240e-05,0.000270,3.116885e-04,0.000124,1.663161e-04,3.954419e-06,9.938950e-01,0.000715,8,8
9997,0.000007,1.636114e-05,8.965385e-04,0.921059,7.141514e-08,0.000235,2.892266e-07,1.270499e-06,3.954600e-06,0.000058,3,3
9998,0.000276,9.710186e-07,3.512192e-05,0.004201,1.759610e-06,0.000060,8.065166e-06,6.780021e-08,9.568717e-01,0.017933,8,8
