In [1]:
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer, one_hot
from keras_preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Import dataset
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)

In [3]:
# arrange the data
sentences = newsgroups_train.data
y = newsgroups_train.target

In [4]:
# get the max review length in the reviews.
max_review_len = max([len(s.split()) for s in sentences])

In [5]:
# tokenizing data
tokenizer = Tokenizer(num_words=max_review_len)
tokenizer.fit_on_texts(sentences)

In [6]:
# getting the vocabulary of data
sentences = tokenizer.texts_to_sequences(sentences)
padded_docs = pad_sequences(sentences, maxlen=max_review_len)

In [7]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19], dtype=int64)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(padded_docs, y, test_size=0.25, random_state=1000)
vocab_size = len(tokenizer.word_index) + 1

In [9]:
# model
model = Sequential()
model.add(layers.Embedding(vocab_size, 50, input_length=max_review_len))
model.add(layers.Flatten())
model.add(layers.Dense(300, activation='relu'))
model.add(layers.Dense(20, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [10]:
# summarize the model
print(model.summary())
history = model.fit(X_train, y_train, epochs=2, verbose=True, validation_data=(X_test, y_test), batch_size=256)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11821, 50)         6707150   
_________________________________________________________________
flatten (Flatten)            (None, 591050)            0         
_________________________________________________________________
dense (Dense)                (None, 300)               177315300 
_________________________________________________________________
dense_1 (Dense)              (None, 20)                6020      
Total params: 184,028,470
Trainable params: 184,028,470
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/2
Epoch 2/2


In [11]:
[test_loss, test_acc] = model.evaluate(X_test, y_test)
print("Evaluation result on Test Data : Loss = {}, accuracy = {}".format(test_loss, test_acc))

Evaluation result on Test Data : Loss = 2.941577196121216, accuracy = 0.09650053083896637
