In [59]:
from nltk.corpus import reuters
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [71]:
# Partition the corpus file ids by their 'training/' and 'test/' prefixes.
fileids = reuters.fileids()

train = list(filter(lambda name: name.startswith('training/'), fileids))
test = list(filter(lambda name: name.startswith('test/'), fileids))

def label(file_ids, category='earn'):
    """Build a binary target column marking which documents carry ``category``.

    Args:
        file_ids: Sequence of Reuters corpus file ids.
        category: Category name to test for. Defaults to ``'earn'``, the
            value that used to be hard-coded here.

    Returns:
        A float array with one row per file id and a single column, holding
        1.0 where ``category`` is among ``reuters.categories(file_id)`` and
        0.0 otherwise. Empty input yields shape ``(0, 1)``.
    """
    flags = [category in reuters.categories(file_id) for file_id in file_ids]
    # The reshape keeps the column layout even for an empty list, where
    # np.array([]) on its own would collapse to shape (0,).
    return np.array(flags, dtype=float).reshape(-1, 1)



# Feature extraction: hashed bag-of-words over the raw article text.
# The same vectorizer object is applied to both splits so that training and
# test documents land in the same 10000-column feature space.
# Alternatives tried earlier (previously left here as commented-out code):
#   - raw document length as the only feature
#   - CountVectorizer(max_features=1000)
#   - TfidfVectorizer(max_features=1000)
vectorizer = HashingVectorizer(n_features=10000)

# .toarray() turns the vectorizer output into dense arrays, i.e. one row per
# document and 10000 columns each, before they are handed to Keras.
train_features = vectorizer.fit_transform(reuters.raw(f) for f in train).toarray()
test_features = vectorizer.transform(reuters.raw(f) for f in test).toarray()

# Everything the model cells need, gathered under one name.
data = {
    'label_train': label(train),
    'label_test': label(test),
    'features_train': train_features,
    'features_test': test_features,
    # Take the width from the array shape rather than from the first row, so
    # an empty training split does not raise IndexError.
    'n_features': train_features.shape[1],
}


In [76]:
# Seed NumPy and TensorFlow before building the network so that weight
# initialisation (and, as far as the backend allows, batch shuffling) repeats
# on a fresh Restart & Run All instead of drifting from run to run.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Binary classifier: one hidden ReLU layer of 50 units feeding a single
# sigmoid unit. The input width is declared with an explicit Input rather
# than the legacy `input_dim` keyword on the first Dense layer.
model = tf.keras.Sequential([
    keras.Input(shape=(data['n_features'],)),
    layers.Dense(50, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy')


In [77]:
# Ten epochs over the training split in mini-batches of 32.
model.fit(
    x=data['features_train'],
    y=data['label_train'],
    batch_size=32,
    epochs=10,
)

Train on 7769 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ffa70404350>

In [78]:
# Model outputs for the test split, thresholded at 0.5 into hard 0/1 labels
# (anything not strictly below 0.5 counts as class 1).
prediction_probs = model.predict(data['features_test'])
prediction_class = np.where(prediction_probs.ravel() < 0.5, 0, 1).tolist()

In [69]:
# Report accuracy, F1 and the confusion matrix for the test-split predictions.
# NOTE(review): this cell's execution count (In [69]) is lower than that of the
# prediction cell (In [78]), so the output shown below comes from an earlier run.
y_true, y_pred = data['label_test'], prediction_class
for scorer in (metrics.accuracy_score, metrics.f1_score, metrics.confusion_matrix):
    print(scorer(y_true, y_pred))


0.967870155680689
0.9539629805410538
[[1917   15]
 [  82 1005]]


In [47]:
# Report accuracy, F1 and the confusion matrix for the test-split predictions.
# NOTE(review): same code as the other scoring cells; the execution count
# (In [47]) is out of sequence, so the output below belongs to an earlier run.
y_true, y_pred = data['label_test'], prediction_class
for scorer in (metrics.accuracy_score, metrics.f1_score, metrics.confusion_matrix):
    print(scorer(y_true, y_pred))


0.9867505796621397
0.9815668202764978
[[1914   18]
 [  22 1065]]


In [52]:
# Report accuracy, F1 and the confusion matrix for the test-split predictions.
# NOTE(review): same code as the other scoring cells; the execution count
# (In [52]) is out of sequence, so the output below belongs to an earlier run.
y_true, y_pred = data['label_test'], prediction_class
for scorer in (metrics.accuracy_score, metrics.f1_score, metrics.confusion_matrix):
    print(scorer(y_true, y_pred))


0.9867505796621397
0.9815327793167128
[[1916   16]
 [  24 1063]]


In [58]:
# Report accuracy, F1 and the confusion matrix for the test-split predictions.
# NOTE(review): same code as the other scoring cells; the execution count
# (In [58]) is out of sequence, so the output below belongs to an earlier run.
y_true, y_pred = data['label_test'], prediction_class
for scorer in (metrics.accuracy_score, metrics.f1_score, metrics.confusion_matrix):
    print(scorer(y_true, y_pred))


0.9801258694932097
0.9719101123595505
[[1921   11]
 [  49 1038]]


In [75]:
# Report accuracy, F1 and the confusion matrix for the test-split predictions.
# NOTE(review): same code as the other scoring cells; the execution count
# (In [75]) precedes the prediction cell (In [78]), so the output below
# belongs to an earlier run.
y_true, y_pred = data['label_test'], prediction_class
for scorer in (metrics.accuracy_score, metrics.f1_score, metrics.confusion_matrix):
    print(scorer(y_true, y_pred))


0.9741636303411726
0.9631728045325779
[[1921   11]
 [  67 1020]]


In [79]:
# Report accuracy, F1 and the confusion matrix for the test-split predictions.
y_true, y_pred = data['label_test'], prediction_class
for scorer in (metrics.accuracy_score, metrics.f1_score, metrics.confusion_matrix):
    print(scorer(y_true, y_pred))


0.9864193441536933
0.9811320754716981
[[1912   20]
 [  21 1066]]
