In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Embedding,Bidirectional,GRU,LSTM,Dense,Dropout,SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer

In [None]:
# Load the training set. Later cells read its 'Text' column (raw documents)
# and its 'Category' column (class labels).
df = pd.read_csv(filepath_or_buffer='train.csv')

In [None]:
import re
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Normalise every document, in this order:
#   1. replace every character that is not an ASCII letter with a space,
#   2. lower-case,
#   3. split on whitespace,
#   4. Porter-stem each token,
#   5. re-join the stems with single spaces.
# X_processed keeps one entry per row of df, in row order.
X_processed = [
    ' '.join(
        ps.stem(token)
        for token in re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    )
    for raw_text in df['Text']
]

In [None]:
# Cap on the vocabulary the tokenizer will use when encoding text.
max_words = 20_000

# Learn the word -> integer id mapping from the normalised corpus ...
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_processed)

# ... then encode each document as a list of integer ids
# (one list per entry of X_processed, in the same order).
X_sequences = tokenizer.texts_to_sequences(X_processed)

In [None]:
# Number of distinct words the tokenizer saw while fitting. This counts the
# whole word_index, independent of the num_words cap passed to the Tokenizer.
vocabulary = tokenizer.word_index
tokenizer_index_len = len(vocabulary)
print(tokenizer_index_len)

In [None]:
sequence_len = 20
X = []
y_corpus = []

# Cut every encoded document into consecutive, non-overlapping windows of
# exactly `sequence_len` ids. Each window becomes one training sample and
# inherits the category of the document it came from.
for i, token_ids in enumerate(X_sequences):
    # Only complete windows are kept: trailing ids that do not fill a whole
    # window are discarded, and documents shorter than `sequence_len`
    # contribute no samples at all.
    full_windows_end = (len(token_ids) // sequence_len) * sequence_len

    for start in range(0, full_windows_end, sequence_len):
        X.append(token_ids[start:start + sequence_len])
        y_corpus.append(df['Category'][i])

In [None]:
# One row per window, one column per position inside the window.
X = pd.DataFrame(X)

# One-hot encode the labels, then put the category columns in sorted order
# so the column order is explicit.
label_dummies = pd.get_dummies(pd.Series(y_corpus))
Y = label_dummies.loc[:, sorted(label_dummies.columns)]

for frame in (X, Y):
    print(frame.shape)

In [None]:
# Category names in the column order of Y (the order of the one-hot targets).
print([*Y.columns])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the windows for the final evaluation; the fixed seed makes
# the split reproducible.
# NOTE(review): the split is made per window, so windows cut from the same
# source row can end up on both sides of the split - confirm this is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.10,
    random_state=42,
)

for features, targets in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, targets.shape)

In [None]:
# Size the embedding table to the ids the tokenizer can actually emit.
# The tokenizer was built with num_words=max_words, so texts_to_sequences only
# produces ids below max_words. Word ids start at 1 (0 is reserved), hence the
# "+ 1" when the whole vocabulary is smaller than the cap. Sizing the table
# from the full word_index alone would allocate rows that are never used
# whenever the vocabulary exceeds max_words.
MAX_NB_WORDS = min(max_words, tokenizer_index_len + 1)
EMBEDDING_DIM = 100

# Architecture: embedding -> spatial dropout -> bidirectional GRU -> softmax
# over the one-hot category columns of Y.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
# Alternative recurrent layers that were tried instead of the bidirectional GRU:
# model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# model.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Bidirectional(GRU(100, dropout=0.2, recurrent_dropout=0.2)))
# model.add(Dropout(0.2))
model.add(Dense(Y.shape[1], activation='softmax'))

# Besides plain accuracy, track top-2 accuracy (true class among the two
# highest-scoring predictions).
model.compile(loss='categorical_crossentropy', optimizer='adam', 
              metrics=['accuracy', tf.keras.metrics.TopKCategoricalAccuracy(k=2)])

In [None]:
epochs = 100
batch_size = 32

# Stop training once val_loss has failed to improve by at least min_delta for
# `patience` consecutive epochs. restore_best_weights=True rolls the model back
# to the epoch with the best val_loss when training is stopped; without it the
# model keeps the weights of the last epoch run (`patience` epochs past the
# best one), and those are the weights that get evaluated and saved afterwards.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001,
                               restore_best_weights=True)

# 10% of the training windows are set aside by Keras as the validation set
# that drives early stopping.
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                    validation_split=0.1,
                    callbacks=[early_stopping]
)

In [None]:
# Score the trained model on the held-out windows; the cell output shows the
# loss together with the metrics configured in model.compile.
model.evaluate(x=X_test, y=Y_test)

In [None]:
from tensorflow import keras
import pickle

# Persist the trained network and the fitted tokenizer side by side: the
# tokenizer holds the word -> id mapping the network was trained with.
model.save('saved_model')

# NOTE: pickle files can execute arbitrary code when loaded - only unpickle
# 'tokenizer.pickle' if it comes from a trusted source.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
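# END of the main pipeline.
# The cells below contain only commented-out scratch code (loss plot, padding,
# and an earlier word-level chunking experiment); none of it is executed.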

In [None]:
# plt.title('Loss')
# plt.plot(history.history['loss'], label='train')
# plt.plot(history.history['val_loss'], label='test')
# plt.legend()
# plt.show();

In [None]:
# from keras.preprocessing.sequence import pad_sequences
# X = pad_sequences(X_sequences, maxlen=sequence_len)
# print(X.shape)

In [None]:
# sequence_len = 20
# X_corpus = []
# y_corpus = []

# for i in range(len(df['Text'])):
#     text = df['Text'][i].split()
#     num_sequences = len(text) // sequence_len
    
#     for j in range(num_sequences):
#         start = sequence_len * j
#         end = sequence_len * (j + 1)
#         X_corpus.append(' '.join(text[start:end]))
#         y_corpus.append(df['Category'][i])

# from keras.preprocessing.text import Tokenizer
# max_words = 10000
# tokenizer = Tokenizer(num_words=max_words)
# tokenizer.fit_on_texts(X_corpus)
# X_sequences = tokenizer.texts_to_sequences(X_corpus)