In [1]:
import keras

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras.datasets import reuters

In [7]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)
word_index = reuters.get_word_index()

In [8]:
print("# of training samples:{}".format(len(x_train)))
print("# of test samples:{}".format(len(x_test)))

# of training samples:8982
# of test samples:2246


In [9]:
num_classes = max(y_train) + 1
print("# of classes {}".format(num_classes))

# of classes 46


In [10]:
print(x_train[0])
print(y_train[0])

[1, 27595, 28842, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]
3


In [11]:
word_index['money']

236

In [12]:
indexToWord = {}
for k, v in word_index.items():
    indexToWord[v] = k

In [13]:
indexToWord[8]

'mln'

In [14]:
print(" ".join(indexToWord[x] for x in x_train[0]))

the wattie nondiscriminatory mln loss for plc said at only ended said commonwealth could 1 traders now april 0 a after said from 1985 and from foreign 000 april 0 prices its account year a but in this mln home an states earlier and rise and revs vs 000 its 16 vs 000 a but 3 psbr oils several and shareholders and dividend vs 000 its all 4 vs 000 1 mln agreed largely april 0 are 2 states will billion total and against 000 pct dlrs


In [16]:
from keras.preprocessing.text import Tokenizer

In [17]:
max_words = 10000

In [18]:
tokenizer = Tokenizer(num_words=max_words)

In [20]:
x_train = tokenizer.sequences_to_matrix(x_train, mode='binary')
x_test = tokenizer.sequences_to_matrix(x_test, mode='binary')

In [23]:
y_train = keras.utils.to_categorical(y_train, num_classes)

In [24]:
y_test = keras.utils.to_categorical(y_test, num_classes)

In [25]:
print(x_train.shape)

(8982, 10000)


In [31]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation


model = Sequential()
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [32]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
print(model.metrics_names)

['loss', 'acc']


In [35]:
batch_size = 32
epochs = 2

In [36]:
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

print("Test loss {}".format(score[0]))
print("Test accuracy {}".format(score[1]))

Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
Test loss 0.829474725154289
Test accuracy 0.8054318788958148


Part 2

In [37]:
import numpy as np

In [40]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

x_train = tokenizer.sequences_to_matrix(x_train, mode='count')
x_test = tokenizer.sequences_to_matrix(x_test, mode='count')

In [41]:
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [42]:
print(x_train[0])
print(max(x_train[0]))

[0. 1. 0. ... 0. 0. 0.]
6.0


In [43]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [44]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [45]:
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

print("Test loss {}".format(score[0]))
print("Test accuracy {}".format(score[1]))

Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
Test loss 0.8626484643744233
Test accuracy 0.813446126447017


In [46]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

x_train = tokenizer.sequences_to_matrix(x_train, mode='freq')
x_test = tokenizer.sequences_to_matrix(x_test, mode='freq')

In [47]:
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [48]:
print(x_train[0])
print(max(x_train[0]))

[0.         0.01149425 0.         ... 0.         0.         0.        ]
0.06896551724137931


In [49]:
model = Sequential()
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [50]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [51]:
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

print("Test loss {}".format(score[0]))
print("Test accuracy {}".format(score[1]))

Train on 8083 samples, validate on 899 samples
Epoch 1/2
Epoch 2/2
Test loss 1.6478784640559319
Test accuracy 0.5854853072393609


In [52]:
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=None, test_split=0.2)

tokenizer = 

x_train = tokenizer.sequences_to_matrix(x_train, mode='tfidf')
x_test = tokenizer.sequences_to_matrix(x_test, mode='tfidf')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print(x_train[0])
print(max(x_train[0]))
model = Sequential()
model.add(Dense(512, input_shape=(max_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

print("Test loss {}".format(score[0]))
print("Test accuracy {}".format(score[1]))

ValueError: Fit the Tokenizer on some data before using tfidf mode.