In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import seaborn as sns
plt.rcParams['font.family'] = 'Arial'
sns.set(style="ticks")

In [2]:
data = pd.read_csv('data.csv')
data=data.loc[:, 'Ng1':'DG']
data

Unnamed: 0,Ng1,Ng2,Ng3,Nfl,Na,Pa,Ph,Cls,Tr,Pb,Css,DG
0,30,266,1224,1,25,5,2,t,n,s,mms,1
1,17,409,12182,2,0,13,7,t,n,s,mms,2
2,17,716,7056,2,5,12,6,o,q,s,mms,2
3,4,651,105,2,80,5,4,n,n,s,mms,1
4,3,1387,3909,5,40,5,10,t,n,o,mmb,1
...,...,...,...,...,...,...,...,...,...,...,...,...
214834,10,1007,10666,2,40,5,4,t,n,s,mms,2
214835,4,1235,4976,3,25,9,7,t,q,s,mms,1
214836,20,1359,1810,2,5,7,5,t,n,s,mms,1
214837,17,303,9803,3,55,10,6,t,q,s,mms,2


In [3]:
x=data.loc[:, 'Ng1':'Css']
x

Unnamed: 0,Ng1,Ng2,Ng3,Nfl,Na,Pa,Ph,Cls,Tr,Pb,Css
0,30,266,1224,1,25,5,2,t,n,s,mms
1,17,409,12182,2,0,13,7,t,n,s,mms
2,17,716,7056,2,5,12,6,o,q,s,mms
3,4,651,105,2,80,5,4,n,n,s,mms
4,3,1387,3909,5,40,5,10,t,n,o,mmb
...,...,...,...,...,...,...,...,...,...,...,...
214834,10,1007,10666,2,40,5,4,t,n,s,mms
214835,4,1235,4976,3,25,9,7,t,q,s,mms
214836,20,1359,1810,2,5,7,5,t,n,s,mms
214837,17,303,9803,3,55,10,6,t,q,s,mms


In [4]:
x[['Tr','Cls','Pb', 'Css']]=x[['Tr','Cls','Pb', 'Css']].astype('category')
x.dtypes

Ng1       int64
Ng2       int64
Ng3       int64
Nfl       int64
Na        int64
Pa        int64
Ph        int64
Cls    category
Tr     category
Pb     category
Css    category
dtype: object

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x[['Ng1','Ng2','Ng3','Nfl','Na','Pa','Ph']] = \
scaler.fit_transform(x[['Ng1','Ng2','Ng3','Nfl','Na','Pa','Ph']])
x

Unnamed: 0,Ng1,Ng2,Ng3,Nfl,Na,Pa,Ph,Cls,Tr,Pb,Css
0,1.981761,-1.049271,-1.375597,-1.535787,-0.022956,-0.688724,-1.759421,t,n,s,mms
1,0.370355,-0.703061,1.617499,-0.172111,-0.353404,1.079663,0.800049,t,n,s,mms
2,0.370355,0.040201,0.217370,-0.172111,-0.287314,0.858615,0.288155,o,q,s,mms
3,-1.241050,-0.117167,-1.681244,-0.172111,0.704031,-0.688724,-0.735633,n,n,s,mms
4,-1.365005,1.664725,-0.642209,3.918920,0.175314,-0.688724,2.335731,t,n,o,mmb
...,...,...,...,...,...,...,...,...,...,...,...
214834,-0.497325,0.744726,1.203415,-0.172111,0.175314,-0.688724,-0.735633,t,n,s,mms
214835,-1.241050,1.296725,-0.350766,1.191566,-0.022956,0.195470,0.800049,t,q,s,mms
214836,0.742218,1.596935,-1.215536,-0.172111,-0.287314,-0.246627,-0.223739,t,n,s,mms
214837,0.370355,-0.959692,0.967693,1.191566,0.373583,0.416518,0.288155,t,q,s,mms


In [6]:
y=data.loc[:, 'DG']
y.astype('category')
y

0         1
1         2
2         2
3         1
4         1
         ..
214834    2
214835    1
214836    1
214837    2
214838    2
Name: DG, Length: 214839, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
indices = range(len(y))

x_train, x_test, y_train, y_test, indices_train,indices_test = \
train_test_split(x, y, indices, test_size=0.3,random_state=1)


In [8]:
y_train.value_counts()/len(y_train)

1    0.608350
2    0.274585
0    0.117065
Name: DG, dtype: float64

In [9]:
from lightgbm import LGBMClassifier
model_L = LGBMClassifier()
model_L.fit(x_train, y_train)

In [10]:
from sklearn import metrics
y_pred = model_L.predict(x_test)
metrics.accuracy_score(y_test,y_pred)

0.8346986904983553

In [11]:
metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)

array([[ 3551,  3893,    75],
       [ 1210, 36127,  1828],
       [   45,  3603, 14120]], dtype=int64)

In [12]:
y_pred_t = model_L.predict(x_train)
metrics.accuracy_score(y_train,y_pred_t)

0.8400726126593389

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

parameters = {'num_leaves': [50,75,100,125],
             'n_estimators': [1200],
             'learning_rate': [0.1],
             'max_depth':[6,7,8,9]}
model = LGBMClassifier()

grid_search = GridSearchCV(model, parameters, scoring='accuracy', cv=5,n_jobs=-1,verbose=1)

grid_search.fit(x_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


{'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 1200, 'num_leaves': 100}

In [14]:
parameters = {'num_leaves': [100],
             'n_estimators': [1200],
             'learning_rate': [0.1],
             'max_depth':[8],
             'subsample':[0.8,0.9,1.0],
             'colsample_bytree':[0.8,0.9,1.0]}
model = LGBMClassifier()

grid_search = GridSearchCV(model, parameters, scoring='accuracy', cv=5,n_jobs=-1,verbose=1)

grid_search.fit(x_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


{'colsample_bytree': 0.8,
 'learning_rate': 0.1,
 'max_depth': 8,
 'n_estimators': 1200,
 'num_leaves': 100,
 'subsample': 0.8}

In [15]:
from lightgbm import LGBMClassifier
opt_model = LGBMClassifier(learning_rate=0.1, n_estimators=1200,num_leaves=100,
                           max_depth = 8,colsample_bytree=0.8,subsample=0.8)

import time
start_time = time.time()

opt_model.fit(x_train, y_train)

end_time = time.time()   
run_time = end_time - start_time 
run_time

16.40663981437683

### t = 12.68
![image-4.png](attachment:image-4.png)
(The authors' note: There may be some differences in the time taken for each running. The above is the result when writing the paper.)

In [18]:
from sklearn import metrics
y_pred = opt_model.predict(x_test)
metrics.accuracy_score(y_test,y_pred)

0.8969310494631664

In [19]:
metrics.f1_score(y_test, y_pred, labels=None,average='micro', sample_weight=None)

0.8969310494631664

In [20]:
metrics.f1_score(y_test, y_pred, labels=None,average='macro', sample_weight=None)

0.8399424211977425

In [21]:
metrics.precision_score(y_test, y_pred, labels=None, average=None)

array([0.76754613, 0.89649233, 0.94101784])

In [22]:
metrics.recall_score(y_test, y_pred, labels=None, average=None)

array([0.59196702, 0.94362313, 0.92306394])

In [23]:
metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)

array([[ 4451,  2962,   106],
       [ 1286, 36957,   922],
       [   62,  1305, 16401]], dtype=int64)

In [24]:
y_pred_t = opt_model.predict(x_train)
metrics.accuracy_score(y_train,y_pred_t)

0.9752571698351586

In [25]:
metrics.confusion_matrix(y_train, y_pred_t, labels=None, sample_weight=None)

array([[14700,  2832,    73],
       [  422, 90920,   146],
       [    3,   245, 41046]], dtype=int64)