In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
import os
# List what is available under the input directory before loading anything.
input_entries = os.listdir("../input")
print(input_entries)

In [None]:
# Load the raw workbook into a DataFrame. read_excel has no text-encoding
# option, and recent pandas releases reject an `encoding` keyword with a
# TypeError, so none is passed.
data = pd.read_excel('../input/bmte12/bmte.xlsx')


In [None]:
# Replace missing titles with empty strings so every row holds text.
# fillna is applied to the column itself: DataFrame.fillna returns the whole
# frame, and assigning a multi-column frame to a single column is an error.
data['TITLE'] = data['TITLE'].fillna('')

In [None]:
# Class 'm' has far fewer rows than the others, so the classes are unbalanced.
data['CATEGORY'].value_counts()

In [None]:
# Total number of rows loaded.
data.shape[0]

**Preprocessing**

In [None]:
# Aim for balanced classes by capping every category at the same row count.
# A category with fewer rows than the cap keeps all of them, so the result is
# only approximately balanced.
num_of_categories = 45000  # maximum rows kept per category
# Fixed seed so the shuffles (and therefore the selected rows) are identical
# on every Restart & Run All.
shuffle_seed = 47
shuffled = data.sample(frac=1, random_state=shuffle_seed)
print(len(shuffled))

# Take the first `num_of_categories` shuffled rows of each category.
category_codes = ('e', 'b', 't', 'm')
e, b, t, m = [
    shuffled[shuffled['CATEGORY'] == code][:num_of_categories]
    for code in category_codes
]
concated = pd.concat([e, b, t, m], ignore_index=True)
# Shuffle the dataset so the categories are interleaved rather than stacked.
concated = concated.sample(frac=1, random_state=shuffle_seed)
# Placeholder integer label; the real values are assigned from CATEGORY later.
concated['LABEL'] = 0
print(len(e))
# Preview only: printing the full frame floods the cell output.
print(concated.head())

In [None]:
# One-hot encode the labels.
# Integer code per category; to_categorical turns these into one-hot rows:
#   [1. 0. 0. 0.] e
#   [0. 1. 0. 0.] b
#   [0. 0. 1. 0.] t
#   [0. 0. 0. 1.] m
category_to_label = {'e': 0, 'b': 1, 't': 2, 'm': 3}
# Guarded so the cell can be re-run after CATEGORY has already been dropped;
# LABEL is kept in the frame, so the one-hot matrix can still be rebuilt.
if 'CATEGORY' in concated.columns:
    concated['LABEL'] = concated['CATEGORY'].map(category_to_label).astype(int)
    # drop() returns a new frame, so the result has to be assigned back for
    # the column to actually be removed.
    concated = concated.drop(columns=['CATEGORY'])
print(concated['LABEL'][:10])
labels = to_categorical(concated['LABEL'], num_classes=len(category_to_label))
print(labels[:10])

In [None]:
# Number of titles that will be tokenized.
concated['TITLE'].size

In [None]:
n_most_common_words = 8000
max_len = 64
# Keep only the most frequent words; the vocabulary is capped by
# n_most_common_words.
# Every character in `token_filters` is treated as a separator: punctuation,
# digits, newline and tab. Letters are deliberately absent -- a stray 'o' in
# this set would break up every word that contains one. The backslash is
# written as an explicit escape sequence.
token_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~1234567890\n\t'
tokenizer = Tokenizer(num_words=n_most_common_words, filters=token_filters, lower=False)
titles = concated['TITLE'].values
tokenizer.fit_on_texts(titles)
sequences = tokenizer.texts_to_sequences(titles)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every sequence to exactly max_len entries.
X = pad_sequences(sequences, maxlen=max_len)
print(X[9])
# Preview a handful of vocabulary entries instead of dumping the whole mapping.
print(dict(list(word_index.items())[:20]))

In [None]:
# Number of tokenized sequences produced by texts_to_sequences.
len(sequences)

In [None]:
# Hold out 25% of the samples as a test set; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.25,
    random_state=47,
)

In [None]:
# Training settings: epoch cap, embedding width, mini-batch size.
epochs, emb_dim, batch_size = 30, 128, 256
# Peek at the first ten one-hot label rows.
labels[:10]

In [None]:
# Second dimension of the padded matrix X, i.e. the per-sample sequence length.
X.shape[1]

In [None]:
# Vocabulary cap given to the Tokenizer as num_words.
n_most_common_words

In [None]:
print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))

# Embedding -> LSTM -> softmax classifier over the four categories.
model = Sequential()
# Size the embedding table from the tokenizer's vocabulary cap instead of
# repeating the literal, so the two cannot drift apart.
model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model.summary()
# 20% of the training split is held out for validation; training stops early
# once val_loss has failed to improve by min_delta for 7 consecutive epochs.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', patience=7, min_delta=0.0001)],
)

In [None]:
# Score the held-out test split; with one compiled metric, evaluate returns
# the loss followed by the accuracy.
accr = model.evaluate(X_test, y_test)
test_loss, test_accuracy = accr[0], accr[1]
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(test_loss, test_accuracy))

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded by model.fit.
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

# One x position per completed epoch. Stored under its own name so the
# `epochs` training setting is not overwritten with a range, which would break
# re-running the training cell.
epoch_numbers = range(1, len(acc) + 1)

plt.plot(epoch_numbers, acc, 'ko', label='Training Accuracy')
plt.plot(epoch_numbers, val_acc, 'k', label='Validation Accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

# Start a second figure for the loss curves.
plt.figure()

plt.plot(epoch_numbers, loss, 'ko', label='Training loss')
plt.plot(epoch_numbers, val_loss, 'k', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
# Classify one sample title using the same tokenizer and padding as training.
txt = ["Anticipates member-guest"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)
# Display names for label indices 0-3. Stored under their own name so the
# one-hot `labels` matrix used for the train/test split is not replaced by a
# list of strings, which would break re-running the split cell.
# NOTE(review): confirm these names line up with categories e, b, t, m in that order.
class_names = ['Safety managment skills', 'Physical skills', 'financial transaction', 'Math skills']
# A single sample is predicted, so argmax over the whole output is the class index.
print(pred, class_names[np.argmax(pred)])