In [60]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer

In [61]:
df = pd.read_csv('train.csv')
df.sample(4)

Unnamed: 0,Text,Category
16,Arrestable and Non-Arrestable Offences in Sing...,Criminal
26,How to Affirm an Affidavit Outside of Singapor...,Legal_Procedures
0,Drafting a Deed of Separation in Singapore (In...,Divorce
32,8 Checks to Conduct on Registered Companies in...,Corporate


In [62]:
max_words = 20000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['Text'])
X_sequences = tokenizer.texts_to_sequences(df['Text'])

In [63]:
sequence_len = 15
X = []
y_corpus = []

for i in range(len(X_sequences)):
    num_sequences = len(X_sequences[i]) // sequence_len
    
    for j in range(num_sequences):
        start = sequence_len * j
        end = sequence_len * (j + 1)
        X.append(X_sequences[i][start:end])
        y_corpus.append(df['Category'][i])

In [64]:
X = pd.DataFrame(X)

Y = pd.get_dummies(pd.Series(y_corpus))
Y = Y[ sorted(Y.columns) ]

print(X.shape)
print(Y.shape)

(3403, 15)
(3403, 4)


In [70]:
print(Y.columns)

Index(['Corporate', 'Criminal', 'Divorce', 'Legal_Procedures'], dtype='object')


In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(2722, 15) (2722, 4)
(681, 15) (681, 4)


In [66]:
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [67]:
epochs = 100
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                    validation_split=0.1,
#                     callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100


Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [68]:
# Evaluate the model
model.evaluate(X_test, Y_test)



[0.6108688116073608, 0.8898678421974182]

In [69]:
from tensorflow import keras
import pickle

model.save('saved_model')
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

INFO:tensorflow:Assets written to: saved_model\assets


In [125]:
# END

In [None]:
# plt.title('Loss')
# plt.plot(history.history['loss'], label='train')
# plt.plot(history.history['val_loss'], label='test')
# plt.legend()
# plt.show();

In [None]:
# from keras.preprocessing.sequence import pad_sequences
# X = pad_sequences(X_sequences, maxlen=sequence_len)
# print(X.shape)

In [None]:
# sequence_len = 20
# X_corpus = []
# y_corpus = []

# for i in range(len(df['Text'])):
#     text = df['Text'][i].split()
#     num_sequences = len(text) // sequence_len
    
#     for j in range(num_sequences):
#         start = sequence_len * j
#         end = sequence_len * (j + 1)
#         X_corpus.append(' '.join(text[start:end]))
#         y_corpus.append(df['Category'][i])

# from keras.preprocessing.text import Tokenizer
# max_words = 10000
# tokenizer = Tokenizer(num_words=max_words)
# tokenizer.fit_on_texts(X_corpus)
# X_sequences = tokenizer.texts_to_sequences(X_corpus)