<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-the-libraries" data-toc-modified-id="Load-the-libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load the libraries</a></span></li><li><span><a href="#Load-the-data" data-toc-modified-id="Load-the-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load the data</a></span></li><li><span><a href="#Train-test-split" data-toc-modified-id="Train-test-split-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train test split</a></span></li><li><span><a href="#Modelling:-lgb" data-toc-modified-id="Modelling:-lgb-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Modelling: lgb</a></span><ul class="toc-item"><li><span><a href="#default" data-toc-modified-id="default-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>default</a></span></li><li><span><a href="#Trial-1" data-toc-modified-id="Trial-1-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Trial 1</a></span></li><li><span><a href="#Few-Trials" data-toc-modified-id="Few-Trials-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Few Trials</a></span></li></ul></li><li><span><a href="#Hyperparameter-optimization" data-toc-modified-id="Hyperparameter-optimization-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Hyperparameter optimization</a></span><ul class="toc-item"><li><span><a href="#HPO-using-RandomizedSearchCV" data-toc-modified-id="HPO-using-RandomizedSearchCV-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>HPO using RandomizedSearchCV</a></span></li><li><span><a href="#HPO-using-optuna" data-toc-modified-id="HPO-using-optuna-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>HPO using optuna</a></span></li></ul></li></ul></div>

# Load the libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

pd.options.plotting.backend = "matplotlib"
pd.set_option('max_columns',100)

import time,os,json
time_start_notebook = time.time()
home = os.path.expanduser('~')
SEED=100

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import sklearn
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

import joblib
import lightgbm as lgb

[(x.__name__,x.__version__) for x in [np,pd,sns,sklearn,lgb]]

[('numpy', '1.18.4'),
 ('pandas', '1.1.0'),
 ('seaborn', '0.10.1'),
 ('sklearn', '0.23.1'),
 ('lightgbm', '2.3.1')]

In [2]:
def print_scores(ytest,ypreds):
    # for auc score we need to binarize
    labels = [0, 1, 2, 3]
    ytest_bin = label_binarize(ytest, classes=labels)
    ypreds_bin = label_binarize(ypreds, classes=labels)
    a = roc_auc_score(ytest_bin,ypreds_bin,
                      average='macro',multi_class='ovo')
    
    # precision recall
    p = precision_score(ytest,ypreds,average='macro')
    r = recall_score(ytest,ypreds,average='macro')
    f = f1_score(ytest,ypreds,average='macro')
    print(f'Precision: {p: .2f}')
    print(f'Recall   : {r: .2f}')
    print(f'F1-score : {f: .2f}')
    print(f'AUC      : {f: .2f}')
    

    c = classification_report(ytest, ypreds)
    print(c)

    cm = confusion_matrix(ytest,ypreds)
    names = list('ABCD')
    df_cm = pd.DataFrame(cm,index=names,columns=names)
    df_cm = df_cm.style.background_gradient()
    display(df_cm)

In [3]:
df_eval = pd.DataFrame({'Model': [],
                        'Description':[],
                        'Accuracy':[],
                        'Precision':[],
                        'Recall':[],
                        'F1':[],
                        'AUC': []
                    })

# Load the data

In [4]:
df_raw = pd.read_csv('../data/raw/train.csv')
print(df_raw.shape)

df_raw.head(2).append(df_raw.tail(2))

(8068, 11)


Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
8066,467299,Female,No,27,Yes,Healthcare,1.0,Low,4.0,Cat_6,B
8067,461879,Male,Yes,37,Yes,Executive,0.0,Average,3.0,Cat_4,B


In [5]:
df = pd.read_csv('../data/processed/clean_data.csv')
print(df.shape)

df.head(2).append(df.tail(2))

(8068, 38)


Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Segmentation,Ever_Married_NA,Graduated_NA,Profession_NA,Work_Experience_NA,Family_Size_NA,Var_1_NA,Age_cat,Family_Size_cat,Work_Experience_cat,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Var_1_Cat_1,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,gen_mar,gen_grad,gen_spend,grad_spend,grad_spend_gen
0,0,0,22,0,1.0,0,4.0,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,1,1,38,1,3.0,1,3.0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,3,3,4,4,13
8066,1,0,27,1,1.0,0,4.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,3,1,1,10
8067,0,1,37,1,0.0,1,3.0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2,2,3,4,4


# Train test split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
target = 'Segmentation'

df_Xtrain_orig, df_Xtest, ser_ytrain_orig, ser_ytest = train_test_split(
    df.drop(target,axis=1), df[target],shuffle=True,
    test_size=0.2, random_state=SEED, stratify=df[target])

df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
    df_Xtrain_orig, ser_ytrain_orig,
    test_size=0.2, random_state=SEED, stratify=ser_ytrain_orig)

ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
ytest = ser_ytest.to_numpy().ravel()

print(f"df             : {df.shape}")

print(f"\ndf_Xtrain_orig : {df_Xtrain_orig.shape}")
print(f"ser_ytrain_orig: {ser_ytrain_orig.shape}")

print(f"\ndf_Xtrain      : {df_Xtrain.shape}")
print(f"ser_ytrain     : {ser_ytrain.shape}")

print(f"\ndf_Xvalid      : {df_Xvalid.shape}")
print(f"ser_yvalid     : {ser_yvalid.shape}")

print(f"\ndf_Xtest       : {df_Xtest.shape}")
print(f"ser_ytest      : {ser_ytest.shape}")

df_Xtrain_orig.head(2)

df             : (8068, 38)

df_Xtrain_orig : (6454, 37)
ser_ytrain_orig: (6454,)

df_Xtrain      : (5163, 37)
ser_ytrain     : (5163,)

df_Xvalid      : (1291, 37)
ser_yvalid     : (1291,)

df_Xtest       : (1614, 37)
ser_ytest      : (1614,)


Unnamed: 0,Gender,Ever_Married,Age,Graduated,Work_Experience,Spending_Score,Family_Size,Ever_Married_NA,Graduated_NA,Profession_NA,Work_Experience_NA,Family_Size_NA,Var_1_NA,Age_cat,Family_Size_cat,Work_Experience_cat,Profession_Artist,Profession_Doctor,Profession_Engineer,Profession_Entertainment,Profession_Executive,Profession_Healthcare,Profession_Homemaker,Profession_Lawyer,Profession_Marketing,Var_1_Cat_1,Var_1_Cat_2,Var_1_Cat_3,Var_1_Cat_4,Var_1_Cat_5,Var_1_Cat_6,Var_1_Cat_7,gen_mar,gen_grad,gen_spend,grad_spend,grad_spend_gen
3582,0,1,27,0,2.0,0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0
6827,0,0,41,1,3.0,0,1.0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,1,1


In [8]:
cols_all = df_Xtrain_orig.columns.to_list()
cols_no_na = [i for i in cols_all if not i.endswith('_NA')]


cross = ['gen_mar', 'gen_grad', 'gen_spend',
         'grad_spend', 'grad_spend_gen']
cols_no_na_no_cross = [i for i in cols_no_na if i not in cross]
cols_no_na_no_cross

['Gender',
 'Ever_Married',
 'Age',
 'Graduated',
 'Work_Experience',
 'Spending_Score',
 'Family_Size',
 'Age_cat',
 'Family_Size_cat',
 'Work_Experience_cat',
 'Profession_Artist',
 'Profession_Doctor',
 'Profession_Engineer',
 'Profession_Entertainment',
 'Profession_Executive',
 'Profession_Healthcare',
 'Profession_Homemaker',
 'Profession_Lawyer',
 'Profession_Marketing',
 'Var_1_Cat_1',
 'Var_1_Cat_2',
 'Var_1_Cat_3',
 'Var_1_Cat_4',
 'Var_1_Cat_5',
 'Var_1_Cat_6',
 'Var_1_Cat_7']

In [9]:
features = cols_no_na

Xtr = df_Xtrain_orig[features]
Xtx = df_Xtest[features]
Xvd = df_Xvalid[features]

ytr = ser_ytrain_orig.to_numpy().ravel()
ytx = ser_ytest.to_numpy().ravel()
yvd = ser_yvalid.to_numpy().ravel()

In [10]:
dtrain = lgb.Dataset(Xtr, ytr)
dvalid = lgb.Dataset(Xvd, yvd, reference=dtrain)
# I have used dtrain and dvalid in optuna grid search only.
# lgb required raw dataframe for prediction, no need of dtest.

# Modelling: lgb

## default

In [11]:
# time
time_start = time.time()
EVAL_METRIC = 'multi_error' # xgboost has 'auc'

model_name = 'lightgbm'
desc = 'default'

# model
model = lgb.LGBMClassifier(random_state=SEED)

# fit and save the model
model.fit(Xtr, ytr, eval_metric=EVAL_METRIC)


# predictions
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(model, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# auc
labels = [0, 1, 2, 3]
ytest_bin = label_binarize(ytest, classes=labels)
ypreds_bin = label_binarize(ypreds, classes=labels)
auc = roc_auc_score(ytest_bin,ypreds_bin,
                  average='macro',multi_class='ovo')

# model evaluation
average = 'macro'
row_eval = [model_name,desc, 
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            auc
            ]

df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
print_scores(ytest,ypreds)

Time taken: 0 min 3 secs


Unnamed: 0,Model,Description,Accuracy,Precision,Recall,F1,AUC
0,lightgbm,default,0.491945,0.48121,0.483835,0.482098,0.657252


Precision:  0.48
Recall   :  0.48
F1-score :  0.48
AUC      :  0.48
              precision    recall  f1-score   support

           0       0.44      0.44      0.44       394
           1       0.37      0.34      0.36       372
           2       0.51      0.50      0.50       394
           3       0.60      0.65      0.62       454

    accuracy                           0.49      1614
   macro avg       0.48      0.48      0.48      1614
weighted avg       0.49      0.49      0.49      1614



Unnamed: 0,A,B,C,D
A,175,82,49,88
B,86,127,115,44
C,41,93,197,63
D,93,37,29,295


## Trial 1

In [12]:
arr_class_weights = class_weight.compute_class_weight(
    'balanced',
    np.unique(ytrain.reshape(-1, )),
    ytrain.reshape(-1, ))

dic_class_weights = dict(enumerate(arr_class_weights))
arr_class_weights



array([1.0227813 , 1.08557611, 1.02359239, 0.88955892])

In [13]:
params = {"objective": "multiclass",
          "boosting_type": "gbdt",
          'num_class':4,
          'random_state': SEED,
          "metric": "multi_logloss",
          "verbose": 0}

params['learning_rate'] = 0.04
params['max_depth'] = 18
params['n_estimators'] = 3000
params['subsample'] = 0.7
params['colsample_bytree']=0.7
params['min_data_in_leaf'] = 55
params['reg_alpha'] = 1.7
params['reg_lambda'] = 1.11
params['class_weight']: dic_class_weights


model = lgb.LGBMClassifier(**params)

In [14]:
%%capture
model.fit(df_Xtrain,ser_ytrain,
          early_stopping_rounds = 100,
          eval_metric='multi_error',
          eval_set = (df_Xvalid,ser_yvalid))

In [15]:
params['n_estimators'] = model.best_iteration_
model.best_iteration_

84

In [16]:
# time
time_start = time.time()

model_name = 'lightgbm'
desc = 'example_1'

# model
model = lgb.LGBMClassifier(**params)

# fit and save the model
model.fit(Xtr, ytr)

# predictions
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(model, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# auc
labels = [0, 1, 2, 3]
ytest_bin = label_binarize(ytest, classes=labels)
ypreds_bin = label_binarize(ypreds, classes=labels)
auc = roc_auc_score(ytest_bin,ypreds_bin,
                  average='macro',multi_class='ovo')

# model evaluation
average = 'macro'
row_eval = [model_name,desc, 
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            auc
            ]

df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
print_scores(ytest,ypreds)

Time taken: 0 min 2 secs


Unnamed: 0,Model,Description,Accuracy,Precision,Recall,F1,AUC
0,lightgbm,default,0.491945,0.48121,0.483835,0.482098,0.657252
1,lightgbm,example_1,0.503098,0.487655,0.492798,0.485116,0.663384


Precision:  0.49
Recall   :  0.49
F1-score :  0.49
AUC      :  0.49
              precision    recall  f1-score   support

           0       0.41      0.46      0.43       394
           1       0.40      0.26      0.31       372
           2       0.54      0.57      0.56       394
           3       0.60      0.68      0.64       454

    accuracy                           0.50      1614
   macro avg       0.49      0.49      0.49      1614
weighted avg       0.49      0.50      0.49      1614



Unnamed: 0,A,B,C,D
A,182,69,52,91
B,110,95,121,46
C,53,45,225,71
D,98,29,17,310


## Few Trials

In [17]:
model = lgb.LGBMClassifier(learning_rate=0.1, n_estimators=250, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, nthread=4, scale_pos_weight=3,seed=27)

model.fit(Xtr,ytr)
ypreds = model.predict(Xtx)

print(confusion_matrix(ytest, ypreds))
print(accuracy_score(ytest,ypreds))

print(model.score(Xtx, ytx))
print(model.score(Xtr, ytr))

[[170  85  43  96]
 [ 98 120 119  35]
 [ 41  70 216  67]
 [ 86  31  22 315]]
0.5086741016109045
0.5086741016109045
0.6747753331267431


# Hyperparameter optimization

## HPO using RandomizedSearchCV

In [18]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

In [19]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
X = np.array(Xtr)
y = np.array(ytr)

In [20]:
model = lgb.LGBMClassifier(boosting_type='gbdt',
                           objective='multiclass',
                           num_class=4,
                           random_state=SEED,
                           n_jobs=-1,
                           verbose=1)

params = {"max_depth":[4,6,8,10,-1],
          "learning_rate":[0.01, 0.03, 0.05, 0.1, 0.3, 0.5,
                           0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.5],
          "subsample":[0.5,0.6,0.7,0.8,0.9,1],
          "colsample_bytree":[0.5,0.6,0.7,0.8,0.9,1],
          "reg_alpha":[0,0.25,0.5,1,2],
          "reg_lambda":[0,0.25,0.5,1,2],
          "num_leaves":[7,15,31,63,127],
          "min_data_in_leaf":[1,3,5,7,10,25,50,75,100],
          "n_estimators":list(range(50,1001,25))
         }
grid = RandomizedSearchCV(model,params,
                          cv=skf.split(X, y),
                          scoring='accuracy')

In [21]:
%%time
# grid.fit(X,y)
# best_model = grid.best_estimator_
# joblib.dump(best_model, '../outputs/lgb_randomsearch_best_model.pkl')

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [22]:
model_grid_random = joblib.load('../outputs/lgb_randomsearch_best_model.pkl')
model_grid_random

LGBMClassifier(colsample_bytree=0.6, learning_rate=0.03, min_data_in_leaf=1,
               n_estimators=375, num_class=4, num_leaves=15,
               objective='multiclass', random_state=100, reg_alpha=0.25,
               reg_lambda=1, subsample=1, verbose=1)

In [23]:
# time
time_start = time.time()
EVAL_METRIC = 'multi_error'

model_name = 'lightgbm'
desc = 'grid_randomsearch'

# model
model = model_grid_random

# fit and save the model
model.fit(Xtr, ytr,eval_metric=EVAL_METRIC)

# predictions
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(model, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# auc
labels = [0, 1, 2, 3]
ytest_bin = label_binarize(ytest, classes=labels)
ypreds_bin = label_binarize(ypreds, classes=labels)
auc = roc_auc_score(ytest_bin,ypreds_bin,
                  average='macro',multi_class='ovo')

# model evaluation
average = 'macro'
row_eval = [model_name,desc, 
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            auc
            ]

df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
print_scores(ytest,ypreds)

Time taken: 0 min 5 secs


Unnamed: 0,Model,Description,Accuracy,Precision,Recall,F1,AUC
0,lightgbm,default,0.491945,0.48121,0.483835,0.482098,0.657252
1,lightgbm,example_1,0.503098,0.487655,0.492798,0.485116,0.663384
2,lightgbm,grid_randomsearch,0.513011,0.50247,0.504972,0.502981,0.671342


Precision:  0.50
Recall   :  0.50
F1-score :  0.50
AUC      :  0.50
              precision    recall  f1-score   support

           0       0.43      0.45      0.44       394
           1       0.41      0.35      0.38       372
           2       0.54      0.55      0.55       394
           3       0.63      0.67      0.65       454

    accuracy                           0.51      1614
   macro avg       0.50      0.50      0.50      1614
weighted avg       0.51      0.51      0.51      1614



Unnamed: 0,A,B,C,D
A,178,88,51,77
B,91,131,110,40
C,46,68,217,63
D,95,33,24,302


## HPO using optuna
Optuna gave me worse result, even if the best parameters are within range.

In [24]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
from sklearn.metrics import accuracy_score

def objective(trial):

    params = {"objective": "multiclass",
              "boosting_type": "gbdt",
              'num_class':4,
              "metric": "multi_logloss",
              "verbose": 0}

#     params["learning_rate"] = trial.suggest_uniform('learning_rate',1e-5, 1.0)
    params["learning_rate"] = 0.03 # choose small lr and large estimators
    params["num_leaves"]= trial.suggest_int('num_leaves', 10, 500)
    params["max_depth"] = trial.suggest_int('max_depth', 5, 20)

    params["colsample_bytree"] = trial.suggest_uniform('learning_rate',0.1, 1.0)

    params["reg_alpha"] = trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)
    params["reg_lambda"] = trial.suggest_loguniform('reg_alpha', 1e-8, 10.0)

    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "multi_logloss")

    # booster object
    bst = lgb.train(params, dtrain,
                      num_boost_round=10_000,
                      early_stopping_rounds=50,
                      valid_sets=dvalid,
                      callbacks=[pruning_callback]
                     )

    # for prediction, it needs dataframe
    vd_preds = bst.predict(Xvd,
                             num_iteration=bst.best_iteration)


    vd_preds = np.argmax(vd_preds,axis=1)
    acc = accuracy_score(yvd,vd_preds)

    return 1- acc

In [None]:
# %%capture

# NOTE: there is inherent non-determinism in optuna hyperparameter selection
#  we may not get the same hyperparameters when run twice.


sampler = optuna.samplers.TPESampler(seed=SEED)
N_TRIALS = 1 # make it large

# NOTE: It took too long to train when features is cols_no_na

study = optuna.create_study(
    sampler=sampler,
    study_name='lgb_optuna',
    storage='sqlite:///lgb_optuna_cust_seg.db',
    load_if_exists=True)

study.optimize(objective, n_trials=N_TRIALS)

In [None]:
%%capture
# Resume from last study
N_TRIALS = 1 # make it large

study = optuna.create_study(
    sampler=sampler,
    study_name='lgb_optuna',
    storage='sqlite:///lgb_optuna_cust_seg.db',
    load_if_exists=True)

study.optimize(objective, n_trials=N_TRIALS)

In [27]:
print(f'Number of finished trials: {len(study.trials)}')

# best trail
best_trial = study.best_trial

# best params
params_opt = study.best_trial.params
params_opt

Number of finished trials: 631


{'learning_rate': 0.5724247551795747,
 'max_depth': 10,
 'num_leaves': 13,
 'reg_alpha': 0.03371044441548606}

In [28]:
# time
time_start = time.time()

model_name = 'lightgbm'
desc = 'grid_optuna'

# model
model = lgb.LGBMClassifier(**params_opt)

# fit and save the model
model.fit(Xtr, ytr,eval_metric='multi_error')

# predictions
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=SEED)
ypreds_cv = cross_val_predict(model, Xtx, ytx, cv=skf)
ypreds = ypreds_cv

# auc
labels = [0, 1, 2, 3]
ytest_bin = label_binarize(ytest, classes=labels)
ypreds_bin = label_binarize(ypreds, classes=labels)
auc = roc_auc_score(ytest_bin,ypreds_bin,
                  average='macro',multi_class='ovo')

# model evaluation
average = 'macro'
row_eval = [model_name,desc, 
            accuracy_score(ytx, ypreds),
            precision_score(ytx, ypreds, average=average),
            recall_score(ytx, ypreds, average=average),
            f1_score(ytx, ypreds, average=average),
            auc
            ]

df_eval.loc[len(df_eval)] = row_eval
df_eval = df_eval.drop_duplicates()
time_taken = time.time() - time_start
print('Time taken: {:.0f} min {:.0f} secs'.format(*divmod(time_taken,60)))
display(df_eval)
print_scores(ytest,ypreds)

Time taken: 0 min 2 secs


Unnamed: 0,Model,Description,Accuracy,Precision,Recall,F1,AUC
0,lightgbm,default,0.491945,0.48121,0.483835,0.482098,0.657252
1,lightgbm,example_1,0.503098,0.487655,0.492798,0.485116,0.663384
2,lightgbm,grid_randomsearch,0.513011,0.50247,0.504972,0.502981,0.671342
3,lightgbm,grid_optuna,0.465304,0.45592,0.457956,0.45676,0.639912


Precision:  0.46
Recall   :  0.46
F1-score :  0.46
AUC      :  0.46
              precision    recall  f1-score   support

           0       0.42      0.42      0.42       394
           1       0.35      0.33      0.34       372
           2       0.47      0.47      0.47       394
           3       0.58      0.61      0.59       454

    accuracy                           0.47      1614
   macro avg       0.46      0.46      0.46      1614
weighted avg       0.46      0.47      0.46      1614



Unnamed: 0,A,B,C,D
A,166,84,50,94
B,88,122,120,42
C,43,101,187,63
D,94,41,43,276


In [None]:
arr_fimp = model.feature_importances_
df_fimp = pd.DataFrame(arr_fimp,columns=['Importances'],index=features)
df_fimp = df_fimp.sort_values('Importances',ascending=False)

plt.figure(figsize=(12,12))
ax = sns.barplot(x=df_fimp.Importances, y= df_fimp.index);

for p in ax.patches:
    x = p.get_width()
    y = p.get_y()
    text = '{:.2f}'.format(p.get_width())
    ax.text(x, y,text,fontsize=15,color='indigo',va='top',ha='left')