In [53]:
# Pandas for table processing
import pandas as pd
import re
import lightgbm as lgb
import numpy as np

# Packages for auxilary data science tasks: dividing the dataset to train and test and metrics summary generation
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score, accuracy_score

In [54]:
data = pd.read_csv("Glass_Type.csv")

In [55]:
data

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,0,2732,1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0.00,0.37,'vehic wind float'
1,1,2607,1.51645,14.94,0.00,1.87,73.11,0.00,8.67,1.38,0.00,headlamps
2,2,1653,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.00,0.00,'build wind float'
3,3,3264,1.51730,12.35,2.72,1.63,72.87,0.70,9.23,0.00,0.00,'build wind non-float'
4,4,4931,1.51673,13.30,3.64,1.53,72.53,0.65,8.03,0.00,0.29,'build wind non-float'
...,...,...,...,...,...,...,...,...,...,...,...,...
192595,192595,6341,1.51613,13.88,1.78,1.79,73.10,0.00,8.67,0.76,0.00,headlamps
192596,192596,1042,1.51915,12.73,1.85,1.86,72.69,0.60,10.09,0.00,0.00,containers
192597,192597,1210,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,'vehic wind float'
192598,192598,191,1.51888,14.99,0.78,1.74,72.50,0.00,9.95,0.00,0.00,tableware


In [56]:
# Due to LightGBMError: Do not support special JSON characters in feature name. 
# Solution found in: https://stackoverflow.com/questions/60582050/lightgbmerror-do-not-support-special-json-characters-in-feature-name-the-same
data = data.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [57]:
data.Type.value_counts()

'build wind non-float'    68251
'build wind float'        62455
headlamps                 26201
'vehic wind float'        15233
containers                12038
tableware                  8422
Name: Type, dtype: int64

In [58]:
# Labels encoding from categories to Integers
data['Type'] = LabelEncoder().fit_transform(data['Type'])
# 0 -> 'build wind float'
# 1 -> 'build wind non-float'
# 2 -> 'vehic wind float'
# 3 -> containers
# 4 -> headlamps
# 5 -> tableware

In [59]:
def binarize(x):
    if x==5:
        value=0
    else:
        value=1
    return value
#data['Type'] = data['Type'].map(binarize)

In [60]:
data

Unnamed: 0,Unnamed0,Unnamed01,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,0,2732,1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0.00,0.37,2
1,1,2607,1.51645,14.94,0.00,1.87,73.11,0.00,8.67,1.38,0.00,4
2,2,1653,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.00,0.00,0
3,3,3264,1.51730,12.35,2.72,1.63,72.87,0.70,9.23,0.00,0.00,1
4,4,4931,1.51673,13.30,3.64,1.53,72.53,0.65,8.03,0.00,0.29,1
...,...,...,...,...,...,...,...,...,...,...,...,...
192595,192595,6341,1.51613,13.88,1.78,1.79,73.10,0.00,8.67,0.76,0.00,4
192596,192596,1042,1.51915,12.73,1.85,1.86,72.69,0.60,10.09,0.00,0.00,3
192597,192597,1210,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,2
192598,192598,191,1.51888,14.99,0.78,1.74,72.50,0.00,9.95,0.00,0.00,5


In [61]:
# get all data of the omitted class "tableware"
omitted_class = data.loc[data['Type'] == 5]

In [62]:
data_without_omitted_class = data.loc[data['Type'] != 5]

In [63]:
data_without_omitted_class

Unnamed: 0,Unnamed0,Unnamed01,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,0,2732,1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0.00,0.37,2
1,1,2607,1.51645,14.94,0.00,1.87,73.11,0.00,8.67,1.38,0.00,4
2,2,1653,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.00,0.00,0
3,3,3264,1.51730,12.35,2.72,1.63,72.87,0.70,9.23,0.00,0.00,1
4,4,4931,1.51673,13.30,3.64,1.53,72.53,0.65,8.03,0.00,0.29,1
...,...,...,...,...,...,...,...,...,...,...,...,...
192594,192594,4629,1.52300,13.31,3.58,0.82,71.99,0.12,10.17,0.00,0.03,0
192595,192595,6341,1.51613,13.88,1.78,1.79,73.10,0.00,8.67,0.76,0.00,4
192596,192596,1042,1.51915,12.73,1.85,1.86,72.69,0.60,10.09,0.00,0.00,3
192597,192597,1210,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,2


In [64]:
data_without_omitted_class

Unnamed: 0,Unnamed0,Unnamed01,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,0,2732,1.52211,14.19,3.78,0.91,71.36,0.23,9.14,0.00,0.37,2
1,1,2607,1.51645,14.94,0.00,1.87,73.11,0.00,8.67,1.38,0.00,4
2,2,1653,1.51793,13.21,3.48,1.41,72.64,0.59,8.43,0.00,0.00,0
3,3,3264,1.51730,12.35,2.72,1.63,72.87,0.70,9.23,0.00,0.00,1
4,4,4931,1.51673,13.30,3.64,1.53,72.53,0.65,8.03,0.00,0.29,1
...,...,...,...,...,...,...,...,...,...,...,...,...
192594,192594,4629,1.52300,13.31,3.58,0.82,71.99,0.12,10.17,0.00,0.03,0
192595,192595,6341,1.51613,13.88,1.78,1.79,73.10,0.00,8.67,0.76,0.00,4
192596,192596,1042,1.51915,12.73,1.85,1.86,72.69,0.60,10.09,0.00,0.00,3
192597,192597,1210,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,2


In [65]:
X_train, X_test, Y_train, Y_test = train_test_split(data_without_omitted_class.drop(columns=["Type"]), data_without_omitted_class.Type,
                                                    train_size=0.9, stratify=data_without_omitted_class.Type, random_state=42)

In [66]:
#Converting the dataset in proper LGB format
d_train=lgb.Dataset(X_train, label=Y_train)
#setting up the parameters
params={}
params['learning_rate']=0.03
params['boosting_type']='gbdt' #GradientBoostingDecisionTree
params['objective']='multiclass' #Multi-class target feature
params['metric']='multi_logloss' #metric for multi-class
params['max_depth']=10
params['num_class']=6 #no.of unique values in the target class not inclusive of the end value
#training the model
clf=lgb.train(params,d_train,100)  #training the model on 100 epocs
#prediction on the test dataset
y_pred=clf.predict(X_test)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1418
[LightGBM] [Info] Number of data points in the train set: 165760, number of used features: 11
[LightGBM] [Info] Start training from score -1.081464
[LightGBM] [Info] Start training from score -0.992708
[LightGBM] [Info] Start training from score -2.492415
[LightGBM] [Info] Start training from score -2.727852
[LightGBM] [Info] Start training from score -1.950100
[LightGBM] [Info] Start training from score -34.538776


In [67]:
#argmax() method 
y_pred = [np.argmax(line) for line in y_pred]

In [68]:
y_pred

[0,
 0,
 2,
 0,
 1,
 2,
 4,
 1,
 2,
 0,
 1,
 1,
 1,
 3,
 3,
 2,
 0,
 0,
 0,
 3,
 1,
 1,
 1,
 0,
 3,
 1,
 3,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 3,
 4,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 2,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 4,
 3,
 3,
 0,
 1,
 2,
 1,
 0,
 0,
 0,
 4,
 2,
 0,
 1,
 1,
 2,
 0,
 2,
 1,
 1,
 0,
 0,
 1,
 2,
 1,
 0,
 2,
 3,
 4,
 0,
 0,
 0,
 1,
 2,
 1,
 1,
 0,
 0,
 4,
 2,
 0,
 1,
 4,
 1,
 4,
 0,
 1,
 2,
 1,
 1,
 3,
 1,
 4,
 0,
 4,
 0,
 1,
 4,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 4,
 1,
 0,
 1,
 0,
 1,
 4,
 2,
 1,
 4,
 0,
 1,
 0,
 1,
 1,
 0,
 2,
 4,
 3,
 4,
 1,
 3,
 0,
 1,
 0,
 0,
 4,
 1,
 1,
 0,
 1,
 4,
 0,
 0,
 0,
 1,
 1,
 1,
 4,
 1,
 3,
 0,
 0,
 4,
 4,
 1,
 1,
 1,
 0,
 4,
 1,
 1,
 2,
 0,
 1,
 1,
 2,
 4,
 1,
 1,
 3,
 0,
 1,
 0,
 1,
 0,
 4,
 0,
 1,
 0,
 2,
 0,
 1,
 1,
 4,
 1,
 4,
 0,
 4,
 0,
 1,
 2,
 4,
 4,
 1,
 4,
 0,
 1,
 4,
 0,
 1,
 1,
 0,
 2,
 2,
 0,
 1,
 2,
 4,
 4,
 4,
 0,
 1,
 1,
 1,
 1,
 2,
 3,
 0,
 0,
 1,
 0,
 1,
 0,
 2,
 3,
 1,
 1,
 0,
 0,
 1,
 1,
 4,
 1,
 1,


In [69]:
#using precision score for error metrics
precision_score(y_pred,Y_test,average=None).mean()

1.0

In [70]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6246
           1       1.00      1.00      1.00      6825
           2       1.00      1.00      1.00      1523
           3       1.00      1.00      1.00      1204
           4       1.00      1.00      1.00      2620

    accuracy                           1.00     18418
   macro avg       1.00      1.00      1.00     18418
weighted avg       1.00      1.00      1.00     18418



In [71]:
classification_result = clf.predict(X_test)

In [72]:
# Get the max value from each array from numpy matrix
# axis=1 to find max from each row
# Calculate the confidence scores SC of the winning classes, in case when 
# the trained classifier was  tested on dataset without omitted class
winingClassCs = np.amax(classification_result, axis=1)
print(winingClassCI)

[0.99428756 0.99399184 0.9904978  ... 0.99430067 0.99332959 0.99442667]


In [73]:
accuracy_score(Y_test, y_pred)

1.0

In [74]:
# Test on omitted class
X_test_omitted = omitted_class.iloc[:,:-1] # all colmns without the last column
Y_test_omitted = omitted_class.iloc[:,-1:] # last colmn
y_pred_omitted=clf.predict(X_test_omitted)

In [75]:
#argmax() method 
y_pred_omitted = [np.argmax(line) for line in y_pred_omitted]

In [76]:
precision_score(y_pred_omitted,Y_test_omitted,average=None).mean()

  _warn_prf(average, modifier, msg_start, len(result))


0.0

In [77]:
print(classification_report(Y_test_omitted, y_pred_omitted))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       0.0
           4       0.00      0.00      0.00       0.0
           5       0.00      0.00      0.00    8422.0

    accuracy                           0.00    8422.0
   macro avg       0.00      0.00      0.00    8422.0
weighted avg       0.00      0.00      0.00    8422.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
y_pred_omitted

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 4,
 4,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 4,


In [79]:
classification_result_omitted = clf.predict(X_test_omitted)

In [80]:
# Get the max value from each array from numpy matrix
# axis=1 to find max from each row
# Calculate the confidence scores SC of the winning classes, in case when 
# the trained classifier was tested on omitted class dataset
winningClassCsOmitted = np.amax(classification_result_omitted, axis=1)
print(x)

[0.97888526 0.30913949 0.97888526 ... 0.86354831 0.30913949 0.98589391]


In [81]:
accuracy_score(Y_test_omitted, y_pred_omitted)

0.0