# Import Libraries

In [30]:
import numpy as np
from keras.layers import Activation, Conv1D, Dense, Embedding, Flatten, Input, MaxPooling1D, SimpleRNN
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.datasets import fetch_20newsgroups
from keras.metrics import categorical_accuracy

# Preprocessing the data
You already learned that we have to tokenize the text before we can feed it into a neural network. This tokenization process will also remove some of the features of the original text, such as all punctuation or words that are less common.

In [8]:
# Fetch the 20 Newsgroups corpus (subset='all'), shuffled.
# Source: http://qwone.com/~jason/20Newsgroups/
dataset = fetch_20newsgroups(shuffle=True, subset='all')

# Raw post texts and their integer class labels.
texts, target = dataset.data, dataset.target

In [11]:
# Display the newsgroup names; an integer label in `target` indexes into this list.
dataset.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [13]:
# Peek at the first ten labels and check that texts and labels have the same count.
print(target[:10])

print(len(texts))
print(len(target))

# Inspect the first post: whitespace-token count, raw text, label id and label name.
first_text, first_label = texts[0], target[0]
print(len(first_text.split()))
print(first_text)
print(first_label)
print(dataset.target_names[first_label])

[10  3 17  3  4 12  4 10 10 19]
18846
18846
157
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!


10
rec.sport.hockey


Remember we have to specify the size of our vocabulary. Words that are less frequent will get removed. In this case we want to retain the 20,000 most common words.

In [14]:
# Size of the vocabulary handed to the tokenizer via num_words.
vocab_size = 20000

# Fit the tokenizer on the corpus, then map every text to a list of word ids.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [15]:
# Tokenize a tiny hand-written sample to see the word ids it maps to.
sample = ['Hello King, how are you?']
print(tokenizer.texts_to_sequences(sample))

# Number of sequences, then the length and contents of the first one.
first_sequence = sequences[0]
print(len(sequences))
print(len(first_sequence))
print(first_sequence)

[[1595, 1168, 82, 20, 13]]
18846
160
[14, 19415, 455, 559, 15, 29, 2552, 1240, 5609, 33, 322, 767, 2175, 2121, 871, 1343, 32, 251, 88, 77, 84, 12087, 455, 559, 15, 7, 122, 228, 63, 3, 2552, 1240, 20, 517, 3490, 50, 1, 1393, 3, 61, 437, 3, 1507, 50, 1, 1302, 2552, 3027, 3, 1, 2701, 309, 7, 122, 243, 16334, 175, 5, 4, 243, 19416, 268, 7, 122, 194, 2, 296, 37, 337, 2, 369, 4389, 22, 4, 243, 3, 7286, 12, 1, 2552, 349, 30, 20, 1502, 137, 2701, 1382, 90, 7, 397, 5987, 74, 2025, 13, 130, 56, 8, 140, 215, 90, 93, 1457, 770, 1963, 56, 8, 97, 4, 308, 9186, 1857, 2, 1306, 6, 1, 2327, 6760, 115, 348, 5987, 21, 4, 308, 3, 1857, 6, 1, 365, 658, 3, 467, 185, 1, 2552, 20, 194, 2, 1985, 1, 66, 3, 3215, 608, 7, 26, 132, 8755, 19, 2, 131, 1, 3280, 2000, 1, 1151, 1457, 770, 283, 2552, 1222]


In [16]:
# Word -> id mapping learned by the tokenizer; report how many distinct words it holds.
word_index = tokenizer.word_index
print(f'Found {len(word_index):,} unique words.')

Found 179,209 unique words.


Our text is now converted to sequences of numbers. It makes sense to convert some of those sequences back into text to check what the tokenization did to our text. To this end we create an inverse index that maps numbers to words while the tokenizer maps words to numbers.

In [17]:
# Reverse lookup: word id -> word (the tokenizer's word_index maps word -> id).
inv_index = {idx: word for word, idx in tokenizer.word_index.items()}

# Decode the first sequence back into words, space-separated on one line.
for token_id in sequences[0]:
    print(inv_index.get(token_id), end=' ')

from ratnam andrew cmu edu subject pens fans reactions organization post office carnegie mellon pittsburgh pa lines 12 nntp posting host po4 andrew cmu edu i am sure some of pens fans are pretty confused about the lack of any kind of posts about the recent pens massacre of the devils actually i am bit puzzled too and a bit relieved however i am going to put an end to non relief with a bit of praise for the pens man they are killing those devils worse than i thought jagr just showed you why he is much better than his regular season stats he is also a lot fo fun to watch in the playoffs bowman should let jagr have a lot of fun in the next couple of games since the pens are going to beat the out of jersey anyway i was very disappointed not to see the islanders lose the final regular season game pens rule 

# Measuring text length
Let's ensure all sequences have the same length.

In [18]:
# Token count of every sequence.
seq_lengths = [len(seq) for seq in sequences]
n_seqs = len(seq_lengths)

# Mean sequence length.
avg = sum(seq_lengths) / n_seqs

# Population standard deviation of the sequence lengths.
std = np.sqrt(sum((n - avg) ** 2 for n in seq_lengths) / n_seqs)

avg, std

(292.4769712405816, 666.9329063050876)

You can see that the average text is about 300 words long. However, the standard deviation is quite large, which indicates that some texts are much, much longer. If some user decided to write an epic novel in the newsgroup, it would massively slow down training. So for speed purposes we will restrict the sequence length to 300 words. You should try out some different sequence lengths and experiment with processing time and accuracy gains.

In [19]:
# Demo: a too-short input is left-padded with 0, a too-long one keeps only its last 5 ids.
for demo in ([1, 2, 3], [1, 2, 3, 4, 5, 6]):
    print(pad_sequences([demo], maxlen=5))

[[0 0 1 2 3]]
[[2 3 4 5 6]]


In [None]:
# Target length (in tokens) passed to pad_sequences as maxlen.
max_length = 300

# Pad/truncate every sequence to max_length.
data = pad_sequences(sequences, maxlen=max_length)

# Turning labels into One-Hot encodings
Labels can quickly be encoded into one-hot vectors with Keras:

In [None]:
from keras.utils import to_categorical

# One-hot encode the integer class labels.
target_array = np.asarray(target)
labels = to_categorical(target_array)

print('Shape of data:', data.shape)
print('Shape of labels:', labels.shape)

# Compare one raw label with its one-hot row.
print(target[0])
print(labels[0])

# Split dataset into training and testing data

In [None]:
# The first 80% of the samples are used for training, the remainder for testing.
train_size = int(len(data) * .8)

xtrain, xtest = data[:train_size], data[train_size:]
ytrain, ytest = labels[:train_size], labels[train_size:]

# Raw texts sliced the same way as xtest, so index i refers to the same sample in both.
xtest_texts = texts[train_size:]

In [None]:
# Sizes of the training and test splits, one per line.
for split in (xtrain, xtest):
    print(len(split))

# Create Model (MLP)

In [None]:
# Baseline MLP fed directly with the padded token-id vectors (no embedding layer).
modelNN = Sequential([
    Input(shape=(max_length,)),
    Activation('relu'),
    Dense(128, activation='sigmoid'),
    Dense(64, activation='sigmoid'),
    Dense(20, activation='softmax'),  # 20 outputs, one per newsgroup
])
modelNN.summary()

In [None]:
# Cross-entropy loss on the one-hot labels, tracked with categorical accuracy.
modelNN.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[categorical_accuracy],
)

# Train for 10 epochs; 20% of the training data is held out for validation.
histNN = modelNN.fit(xtrain, ytrain, epochs=10, validation_split=0.2)

# Create Model (Simple RNN)

In [None]:
# Embedding -> SimpleRNN -> softmax classifier over the 20 newsgroups.
modelRNN = Sequential([
    # Embedding weights are frozen (trainable=False).
    Embedding(
        input_dim=vocab_size,
        output_dim=64,
        input_length=max_length,
        trainable=False,
    ),
    SimpleRNN(64),
    Dense(20, activation='softmax'),
])
modelRNN.summary()

In [None]:
# Background on the loss choice:
# https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance
modelRNN.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[categorical_accuracy],
)

# Train for 2 epochs; 20% of the training data is held out for validation.
histRNN = modelRNN.fit(xtrain, ytrain, epochs=2, validation_split=0.2)

# Create Model (LSTM)

In [None]:
from keras.layers import LSTM

In [None]:
# LSTM classifier. The model previously had no layers at all, so it could not be
# trained; the architecture below mirrors the SimpleRNN model (frozen 64-d embedding,
# one recurrent layer, softmax over the 20 newsgroups) with LSTM as the recurrent cell.
modelLSTM = Sequential()
modelLSTM.add(Embedding(input_dim=vocab_size,
                        output_dim=64,
                        input_length=max_length,
                        trainable=False))
modelLSTM.add(LSTM(64))
modelLSTM.add(Dense(20, activation='softmax'))
modelLSTM.summary()

In [None]:
# Background on the loss choice:
# https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance
modelLSTM.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[categorical_accuracy],
)

# Train for 2 epochs; 20% of the training data is held out for validation.
histLSTM = modelLSTM.fit(xtrain, ytrain, epochs=2, validation_split=0.2)

# Create Model (CNN)

In [None]:
# 1D-convolutional text classifier: frozen embedding, three Conv1D/MaxPooling1D
# stages, then a dense head with a softmax over the 20 newsgroups.
modelCNN = Sequential()
modelCNN.add(Embedding(input_dim=vocab_size,
                       output_dim=64,
                       input_length=max_length,
                       trainable=False))

# Each stage: 64 filters of width 2, followed by pooling with pool size 2.
modelCNN.add(Conv1D(64, 2, activation='relu'))
modelCNN.add(MaxPooling1D(2))
modelCNN.add(Conv1D(64, 2, activation='relu'))
modelCNN.add(MaxPooling1D(2))
modelCNN.add(Conv1D(64, 2, activation='relu'))
modelCNN.add(MaxPooling1D(2))

# Collapse the (steps, filters) feature map into a vector for the dense layers.
modelCNN.add(Flatten())

modelCNN.add(Dense(64, activation='relu'))
modelCNN.add(Dense(20, activation='softmax'))
modelCNN.summary()

In [None]:
# Background on the loss choice:
# https://stackoverflow.com/questions/42081257/keras-binary-crossentropy-vs-categorical-crossentropy-performance
modelCNN.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[categorical_accuracy],
)

# Train for 10 epochs; 20% of the training data is held out for validation.
histCNN = modelCNN.fit(xtrain, ytrain, epochs=10, validation_split=0.2)

Our model achieves 66% accuracy on the validation set. Systems like these can be used to assign emails in customer support centers, suggest responses, or classify other forms of text, such as invoices that need to be assigned to a department. Let's take a look at how our model classified one of the texts:

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart against the epoch.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch values (e.g. what ``model.fit`` returns).
        string: metric key to plot; ``'val_' + string`` is drawn alongside it.
    """
    series_names = [string, 'val_' + string]
    for series in series_names:
        plt.plot(history.history[series])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(series_names)
    plt.show()
  
# Training curves of the MLP: accuracy first, then loss.
for metric in ("categorical_accuracy", "loss"):
    plot_graphs(histNN, metric)

# Example Prediction

In [None]:
# Pick one held-out sample: its padded token ids and the matching raw text.
example_idx = 1000
example = xtest[example_idx]  # Get the tokens
print(xtest_texts[example_idx])

# Print tokens as text. Id 0 is the padding value inserted by pad_sequences and
# has no entry in inv_index, so skip it instead of printing "None" for every pad.
for token_id in example:
    if token_id != 0:
        print(inv_index.get(token_id), end=' ')

In [None]:
# Get prediction. predict() expects a batch axis, so turn the single sample into
# shape (1, sequence_length); -1 lets NumPy infer the length, which keeps this
# in sync with max_length (a hard-coded 100 fails for 300-token samples).
pred = modelCNN.predict(example.reshape(1, -1))

In [None]:
# Output predicted category: map the index of the highest score to its newsgroup
# name (the names live on the dataset object; no bare `target_names` exists).
dataset.target_names[np.argmax(pred)]