In [10]:
from operator import itemgetter
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.utils import shuffle
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

# restore objects and unpack them into variables
%store -r object_keep
df_bbc, list_categories, X, y, X_train, X_test, y_train, y_test = itemgetter('df_bbc',
                                                                             'list_categories',
                                                                             'X',
                                                                             'y',
                                                                             'X_train',
                                                                             'X_test',
                                                                             'y_train',
                                                                             'y_test')(object_keep)

## Deep Learning with Bag of Words

We will now use the deep-learning framework [Keras](https://keras.io/) to perform our text classification.

The process is as follows:
1. Separate the data into the training and test sets.
1. Use Keras' `Tokenizer` class to count the unique words in our vocabulary and assign each of these words an index.
1. Call `fit_on_texts()`, which automatically creates a word-index lookup of our vocabulary.
1. Limit our vocabulary to the top words by passing a `num_words` parameter to the `Tokenizer` constructor.
1. With the fitted `Tokenizer`, use the `texts_to_matrix` method to create the training data that we'll pass to our model.
1. Pass a one-hot vector to our model.
1. Transform our features and labels into a format that Keras can read.
1. Build our model, telling Keras the shape of our:
    + input data
    + output data
    + type of each layer
1. When training the model, call the `fit()` method, passing the training data and labels, the batch size and the number of epochs.

> *Note*: Generally, deep learning works best when you have tonnes of data, probably more than 10,000 samples. In our case we don't, but we still proceed with deep learning to get used to using Keras.

In [11]:
# Shuffle the rows with a fixed seed so the split taken from this frame
# is the same on every run; the frame is displayed as the cell output.
df_bbc_shuffle = shuffle(df_bbc, random_state=42)
df_bbc_shuffle

Unnamed: 0,category,article_text,article_text_clean
28,business,"b""China now top trader with Japan\n\nChina ove...",china top trader japan china overtook us becom...
34,business,b'Bush budget seeks deep cutbacks\n\nPresident...,bush budget seeks deep cutbacks president bush...
237,politics,b'MPs\' murder sentence concern\n\nMurder sent...,mps murder sentence concern murder sentences r...
30,business,b'GE sees \'excellent\' world economy\n\nUS be...,ge sees excellent world economy us behemoth ge...
336,sport,b'Rush future at Chester uncertain\n\nIan Rush...,rush future chester uncertain ian rush future ...
...,...,...,...
231,politics,b'Labour\'s Cunningham to stand down\n\nVetera...,labour cunningham stand veteran labour mp form...
199,sport,b'Collins to compete in Birmingham\n\nWorld an...,collins compete birmingham world commonwealth ...
234,sport,b'Juninho demand for O\'Neill talks\n\nJuninho...,juninho demand neill talks juninho agent confi...
398,sport,b'Wenger shock at Newcastle dip\n\nArsenal man...,wenger shock newcastle dip arsenal manager ars...


In [15]:
# create train and test sets: the first 70% of the shuffled rows are used
# for training, the remaining rows for testing (positional slices)
train_size = int(0.7 * len(df_bbc))

train_articles, train_labels = (df_bbc_shuffle['article_text_clean'].iloc[:train_size],
                                df_bbc_shuffle['category'].iloc[:train_size])

test_articles, test_labels = (df_bbc_shuffle['article_text_clean'].iloc[train_size:],
                              df_bbc_shuffle['category'].iloc[train_size:])

In [16]:
# tokenise: build a word-level vocabulary from the training articles only,
# capped by `num_words = max_words`
max_words = 1000
tokenise = text.Tokenizer(char_level=False, num_words=max_words)
tokenise.fit_on_texts(train_articles)

# Turn both article sets into matrices with the fitted tokenizer
# (no `mode` argument is given, so Keras' default encoding is used).
X_train, X_test = (tokenise.texts_to_matrix(articles)
                   for articles in (train_articles, test_articles))

In [17]:
# Fit the category -> integer id mapping on the training labels, then apply
# the same mapping to both label sets.
encoder = LabelEncoder().fit(train_labels)

y_train, y_test = map(encoder.transform, (train_labels, test_labels))

In [22]:
# One-hot encode the integer class ids for the `categorical_crossentropy` loss.
# The class count and the targets are both derived from the fitted `encoder`
# and the raw label Series, not from the current `y_train` / `y_test`.
# This keeps the cell idempotent: re-running it no longer one-hot encodes an
# already one-hot matrix (which would also collapse `num_classes` to 2).
num_classes = len(encoder.classes_)
y_train = utils.to_categorical(encoder.transform(train_labels), num_classes)
y_test = utils.to_categorical(encoder.transform(test_labels), num_classes)

In [20]:
# Training hyper-parameters: samples per gradient step, and number of full
# passes over the training data.
batch_size, epochs = 32, 2

In [23]:
# build model: a single 512-unit ReLU hidden layer with 50% dropout on top of
# the `max_words`-wide input, followed by a softmax over `num_classes`
model = Sequential([
    Dense(units=512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(units=num_classes),
    Activation('softmax'),
])

# categorical cross-entropy matches the one-hot encoded targets
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [24]:
# Train the network; 10% of the training data is set aside for validation
# via `validation_split`. The returned object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    verbose=1,
)

Epoch 1/2
Epoch 2/2


In [26]:
# Evaluate on the held-out test set. The model was compiled with
# metrics=['accuracy'], and index 1 of the result is reported as accuracy.
score = model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=batch_size,
    verbose=1,
)
print("Test accuracy: ", score[1])

Test accuracy:  0.970059871673584


Wow, not bad. 97% is pretty decent. In fact, it's quite surprising given how relatively small our dataset is for a deep-learning problem!