In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
plt.rcParams['font.family'] = 'Arial'
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
data = pd.read_csv('data.csv')
data=data.loc[:, 'Ng1':'DG']
data

Unnamed: 0,Ng1,Ng2,Ng3,Nfl,Na,Pa,Ph,Cls,Tr,Pb,Css,DG
0,30,266,1224,1,25,5,2,t,n,s,mms,1
1,17,409,12182,2,0,13,7,t,n,s,mms,2
2,17,716,7056,2,5,12,6,o,q,s,mms,2
3,4,651,105,2,80,5,4,n,n,s,mms,1
4,3,1387,3909,5,40,5,10,t,n,o,mmb,1
...,...,...,...,...,...,...,...,...,...,...,...,...
214834,10,1007,10666,2,40,5,4,t,n,s,mms,2
214835,4,1235,4976,3,25,9,7,t,q,s,mms,1
214836,20,1359,1810,2,5,7,5,t,n,s,mms,1
214837,17,303,9803,3,55,10,6,t,q,s,mms,2


In [3]:
x=data.loc[:, 'Ng1':'Css']
x

Unnamed: 0,Ng1,Ng2,Ng3,Nfl,Na,Pa,Ph,Cls,Tr,Pb,Css
0,30,266,1224,1,25,5,2,t,n,s,mms
1,17,409,12182,2,0,13,7,t,n,s,mms
2,17,716,7056,2,5,12,6,o,q,s,mms
3,4,651,105,2,80,5,4,n,n,s,mms
4,3,1387,3909,5,40,5,10,t,n,o,mmb
...,...,...,...,...,...,...,...,...,...,...,...
214834,10,1007,10666,2,40,5,4,t,n,s,mms
214835,4,1235,4976,3,25,9,7,t,q,s,mms
214836,20,1359,1810,2,5,7,5,t,n,s,mms
214837,17,303,9803,3,55,10,6,t,q,s,mms


In [4]:
x[['Tr','Cls','Pb', 'Css']]=x[['Tr','Cls','Pb', 'Css']].astype('category')
x.dtypes

Ng1       int64
Ng2       int64
Ng3       int64
Nfl       int64
Na        int64
Pa        int64
Ph        int64
Cls    category
Tr     category
Pb     category
Css    category
dtype: object

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x[['Ng1','Ng2','Ng3','Nfl','Na','Pa','Ph']] = \
scaler.fit_transform(x[['Ng1','Ng2','Ng3','Nfl','Na','Pa','Ph']])
x

Unnamed: 0,Ng1,Ng2,Ng3,Nfl,Na,Pa,Ph,Cls,Tr,Pb,Css
0,1.981761,-1.049271,-1.375597,-1.535787,-0.022956,-0.688724,-1.759421,t,n,s,mms
1,0.370355,-0.703061,1.617499,-0.172111,-0.353404,1.079663,0.800049,t,n,s,mms
2,0.370355,0.040201,0.217370,-0.172111,-0.287314,0.858615,0.288155,o,q,s,mms
3,-1.241050,-0.117167,-1.681244,-0.172111,0.704031,-0.688724,-0.735633,n,n,s,mms
4,-1.365005,1.664725,-0.642209,3.918920,0.175314,-0.688724,2.335731,t,n,o,mmb
...,...,...,...,...,...,...,...,...,...,...,...
214834,-0.497325,0.744726,1.203415,-0.172111,0.175314,-0.688724,-0.735633,t,n,s,mms
214835,-1.241050,1.296725,-0.350766,1.191566,-0.022956,0.195470,0.800049,t,q,s,mms
214836,0.742218,1.596935,-1.215536,-0.172111,-0.287314,-0.246627,-0.223739,t,n,s,mms
214837,0.370355,-0.959692,0.967693,1.191566,0.373583,0.416518,0.288155,t,q,s,mms


In [6]:
y=data.loc[:, 'DG']
y.astype('category')
y

0         1
1         2
2         2
3         1
4         1
         ..
214834    2
214835    1
214836    1
214837    2
214838    2
Name: DG, Length: 214839, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
indices = range(len(y))

x_train, x_test, y_train, y_test, indices_train,indices_test = \
train_test_split(x, y, indices, test_size=0.3,random_state=1)


In [8]:
y_train.value_counts()/len(y_train)

1    0.608350
2    0.274585
0    0.117065
Name: DG, dtype: float64

In [9]:
from catboost import CatBoostClassifier
category_features=['Tr','Cls','Pb', 'Css']
model = CatBoostClassifier(cat_features = category_features,verbose=False)
model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x2470746e860>

In [10]:
from sklearn import metrics
y_pred = model.predict(x_test)
metrics.accuracy_score(y_test,y_pred)

0.8493297337553528

In [11]:
metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)

array([[ 3663,  3766,    90],
       [ 1078, 36370,  1717],
       [   29,  3031, 14708]], dtype=int64)

In [12]:
y_pred_t = model.predict(x_train)
metrics.accuracy_score(y_train,y_pred_t)

0.8584784589093473

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

parameters = {'learning_rate': [0.05,0.08,0.1,0.12,0.15],
             'depth':[6,7,8]
             }  
model = CatBoostClassifier(cat_features = category_features,iterations=1200,
                           od_type='Iter',early_stopping_rounds=50,verbose=False)

grid_search = GridSearchCV(model, parameters, scoring='accuracy', cv=5,n_jobs=-1,verbose=1)

grid_search.fit(x_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 15 candidates, totalling 75 fits


{'depth': 8, 'learning_rate': 0.15}

In [14]:
from catboost import CatBoostClassifier
category_features=['Tr','Cls','Pb', 'Css']

opt_model = CatBoostClassifier(depth=8, learning_rate=0.15,cat_features = category_features,iterations=1200,
                           od_type='Iter',early_stopping_rounds=50,verbose=False)
import time
start_time = time.time()    

opt_model.fit(x_train, y_train)

end_time = time.time()
run_time = end_time - start_time 
run_time

165.91021084785461

### t = 169.51
![image-2.png](attachment:image-2.png)
(The authors' note: There may be some differences in the time taken for each running. The above is the result when writing the paper.)

In [15]:
y_pred = opt_model.predict(x_test)
metrics.accuracy_score(y_test,y_pred)

0.873704462235462

In [16]:
metrics.f1_score(y_test, y_pred, labels=None,average='micro', sample_weight=None)

0.873704462235462

In [17]:
metrics.f1_score(y_test, y_pred, labels=None,average='macro', sample_weight=None)

0.8104061230271937

In [18]:
metrics.precision_score(y_test, y_pred, labels=None, average=None)

array([0.78488941, 0.86755826, 0.91585531])

In [19]:
metrics.recall_score(y_test, y_pred, labels=None, average=None)

array([0.5333156 , 0.93912932, 0.8735367 ])

In [20]:
metrics.confusion_matrix(y_test, y_pred, labels=None, sample_weight=None)

array([[ 4010,  3410,    99],
       [ 1057, 36781,  1327],
       [   42,  2205, 15521]], dtype=int64)

In [21]:
y_pred_t = opt_model.predict(x_train)
metrics.accuracy_score(y_train,y_pred_t)

0.9076648912472488

In [22]:
metrics.confusion_matrix(y_train, y_pred_t, labels=None, sample_weight=None)

array([[11043,  6384,   178],
       [ 1414, 87990,  2084],
       [   41,  3785, 37468]], dtype=int64)