In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re


In [6]:
df = pd.read_excel("bert_sample.xlsx")
df.head()

Unnamed: 0,ITEM_NAME,CATEGORY_ID
0,CALIBRACION TRANSDUCER 75 nm,CAPITAL ASSEMBLY
1,for pusher whskey,CAPITAL ASSEMBLY
2,Stat 40B Press Head Cup to Carrier from Stati...,CAPITAL ASSEMBLY
3,TRANSD. Cable (4145097103) scrw,CAPITAL ASSEMBLY
4,"ZT200 7,5BAR,13BAR60HZ NUMERO DE SERIE: AIF09...",CAPITAL ASSEMBLY


In [7]:
df["CATEGORY_ID"].value_counts()

CUTTING TOOLS        5000
CAPITAL ASSEMBLY     2000
CHEMICALS            2000
LOGISTICS SERVICE    1000
Name: CATEGORY_ID, dtype: int64

In [8]:
def to_class(data):
    if data == "CUTTING TOOLS":
        return 0
    elif data == "CHEMICALS":
        return 1
    elif data == "CAPITAL ASSEMBLY":
        return 2
    else: 
        return 3

df['CATEGORY_ID'] = df.CATEGORY_ID.apply(to_class)
df.head()

Unnamed: 0,ITEM_NAME,CATEGORY_ID
0,CALIBRACION TRANSDUCER 75 nm,2
1,for pusher whskey,2
2,Stat 40B Press Head Cup to Carrier from Stati...,2
3,TRANSD. Cable (4145097103) scrw,2
4,"ZT200 7,5BAR,13BAR60HZ NUMERO DE SERIE: AIF09...",2


In [10]:
df['ITEM_NAME'] = df['ITEM_NAME'].apply(lambda x: x.lower())
# removing special chars
df['ITEM_NAME'] = df['ITEM_NAME'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
#
df.head()

Unnamed: 0,ITEM_NAME,CATEGORY_ID
0,calibracion transducer 75 nm,2
1,for pusher whskey,2
2,stat 40b press head cup to carrier from stati...,2
3,transd cable 4145097103 scrw,2
4,zt200 75bar13bar60hz numero de serie aif09446...,2


In [13]:
print(df[ df['CATEGORY_ID'] == 0].size)
print(df[ df['CATEGORY_ID'] == 1].size)
print(df[ df['CATEGORY_ID'] == 2].size)
print(df[ df['CATEGORY_ID'] == 3].size)

for idx,row in df.iterrows():
    row[0] = row[0].strip()
df.head()

10000
4000
4000
2000


Unnamed: 0,ITEM_NAME,CATEGORY_ID
0,calibracion transducer 75 nm,2
1,for pusher whskey,2
2,stat 40b press head cup to carrier from stati...,2
3,transd cable 4145097103 scrw,2
4,zt200 75bar13bar60hz numero de serie aif09446...,2


In [14]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['ITEM_NAME'].values)
X = tokenizer.texts_to_sequences(df['ITEM_NAME'].values)
X = pad_sequences(X)
X[:2]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        1047,  693,  462],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           

In [22]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4,activation='sigmoid'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 80, 128)           256000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 80, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_2 (Dense)              (None, 4)                 788       
Total params: 511,588
Trainable params: 511,588
Non-trainable params: 0
_________________________________________________________________
None


In [30]:
Y = pd.get_dummies(df['CATEGORY_ID']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)


(8000, 80) (8000, 4)
(2000, 80) (2000, 4)


In [31]:
print(X_train)

[[   0    0    0 ...  127  129  269]
 [   0    0    0 ...  365  188    9]
 [   0    0    0 ...  133   83 1350]
 ...
 [   0    0    0 ...    0  285   52]
 [   0    0    0 ... 1079  552  911]
 [   0    0    0 ...  132  209   69]]


In [32]:
batch_size = 128
model.fit(X_train, Y_train, epochs = 15, batch_size=batch_size, verbose = 1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x216bfb11ee0>

In [33]:
Y_pred = model.predict_classes(X_test,batch_size = batch_size)



In [34]:
df_test = pd.DataFrame({'true': Y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print("confusion matrix",confusion_matrix(df_test.true, df_test.pred))
print(classification_report(df_test.true, df_test.pred))

confusion matrix [[963  18  36   5]
 [ 77 283  11  22]
 [ 60  21 305  24]
 [ 20  20  11 124]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.90      1022
           1       0.83      0.72      0.77       393
           2       0.84      0.74      0.79       410
           3       0.71      0.71      0.71       175

    accuracy                           0.84      2000
   macro avg       0.81      0.78      0.79      2000
weighted avg       0.84      0.84      0.83      2000

