In [1]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

## Preprocessing

### Loading the Data

In [2]:
# Read in data
data = './data/ag_news_csv/train.csv'

train_df = pd.read_csv(data, header=None, names=['class', 'title', 'description'])
train_df.head()

Unnamed: 0,class,title,description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [3]:
# concatenate column 1 and 2 as one text
train_df['text'] = train_df.title + train_df.description
train_df.drop(['title', 'description'], axis=1, inplace=True)

In [12]:
train_df.head()

Unnamed: 0,class,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


### Tokenizing the Text

In [4]:
texts = train_df.text.values # text values as an array
texts = [ s.lower() for s in texts ] # preproc texts to all lowercase to match vocab

print(f"Length of texts: {len(texts)}")
print(f"First two text excerpts:\n{texts[:2]}")

Length of texts: 120000
First two text excerpts:
["wall st. bears claw back into the black (reuters)reuters - short-sellers, wall street's dwindling\\band of ultra-cynics, are seeing green again.", 'carlyle looks toward commercial aerospace (reuters)reuters - private investment firm carlyle group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.']


In [5]:
# Initialize and train Tokenizer on text
# The Tokenizer at a character level will detect all unique characters that exist on the training dataset (texts)
# When not at char_level, it does this on all unique words
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(texts)

# This generates a character dictionary learned from the training data
tk.word_index

{'UNK': 1,
 ' ': 2,
 'e': 3,
 'a': 4,
 't': 5,
 'i': 6,
 's': 7,
 'o': 8,
 'n': 9,
 'r': 10,
 'l': 11,
 'd': 12,
 'h': 13,
 'c': 14,
 'u': 15,
 'p': 16,
 'm': 17,
 'g': 18,
 'f': 19,
 'y': 20,
 'w': 21,
 'b': 22,
 '.': 23,
 'v': 24,
 'k': 25,
 ',': 26,
 '-': 27,
 ';': 28,
 '3': 29,
 '0': 30,
 'x': 31,
 '9': 32,
 'j': 33,
 'q': 34,
 '#': 35,
 '1': 36,
 '(': 37,
 ')': 38,
 '2': 39,
 "'": 40,
 'z': 41,
 '\\': 42,
 '&': 43,
 ':': 44,
 '/': 45,
 '5': 46,
 '4': 47,
 '6': 48,
 '"': 49,
 '7': 50,
 '$': 51,
 '8': 52,
 '=': 53,
 '?': 54,
 '!': 55,
 '_': 56,
 '*': 57}

In [6]:
# Although we generated a vocabulary already, we already have an existing character list:
alphabet="abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1
    
char_dict

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '0': 27,
 '1': 28,
 '2': 29,
 '3': 30,
 '4': 31,
 '5': 32,
 '6': 33,
 '7': 34,
 '8': 35,
 '9': 36,
 '-': 60,
 ',': 38,
 ';': 39,
 '.': 40,
 '!': 41,
 '?': 42,
 ':': 43,
 "'": 44,
 '"': 45,
 '/': 46,
 '\\': 47,
 '|': 48,
 '_': 49,
 '@': 50,
 '#': 51,
 '$': 52,
 '%': 53,
 '^': 54,
 '&': 55,
 '*': 56,
 '~': 57,
 '`': 58,
 '+': 59,
 '=': 61,
 '<': 62,
 '>': 63,
 '(': 64,
 ')': 65,
 '[': 66,
 ']': 67,
 '{': 68,
 '}': 69}

In [7]:
# Now, we just need to add the 'UNK' character to the vocabulary

tk.word_index = char_dict   # assign Tokenizer's word index to our custom index
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1   # append 'UNK' to be the next sequential value of the char dict

### Converting Characters to Index

In this step, the goal is to represent all text by using its character index according to the char_dict we set earlier

In [10]:
# This can be accomplished using tk.texts_to_sequences()
sequences = tk.texts_to_sequences(texts)

# notice how a string has been converted to a character array, where each value is the char_dict
# value corresponding to that character key
print(texts[0])
print(sequences[0])

wall st. bears claw back into the black (reuters)reuters - short-sellers, wall street's dwindling\band of ultra-cynics, are seeing green again.
[23, 1, 12, 12, 70, 19, 20, 40, 70, 2, 5, 1, 18, 19, 70, 3, 12, 1, 23, 70, 2, 1, 3, 11, 70, 9, 14, 20, 15, 70, 20, 8, 5, 70, 2, 12, 1, 3, 11, 70, 64, 18, 5, 21, 20, 5, 18, 19, 65, 18, 5, 21, 20, 5, 18, 19, 70, 60, 70, 19, 8, 15, 18, 20, 60, 19, 5, 12, 12, 5, 18, 19, 38, 70, 23, 1, 12, 12, 70, 19, 20, 18, 5, 5, 20, 44, 19, 70, 4, 23, 9, 14, 4, 12, 9, 14, 7, 47, 2, 1, 14, 4, 70, 15, 6, 70, 21, 12, 20, 18, 1, 60, 3, 25, 14, 9, 3, 19, 38, 70, 1, 18, 5, 70, 19, 5, 5, 9, 14, 7, 70, 7, 18, 5, 5, 14, 70, 1, 7, 1, 9, 14, 40]


### Padding

As you might imagine, all the texts have different lengths, and they must be normalized so that the CNN can handle the batch data. Notice that our `char_dict` starts at 1, not 0. This is because 0 will serve as our meaningless padding value.

In [12]:
data = pad_sequences(
    sequences,                                  # sequences to be padded
    maxlen=max([len(i) for i in sequences]),    # get max length of all sequences
    padding='post'                              # pad sequences on the right end
)

In [13]:
print(sequences[0][:160], "\n")
print(data[0][:160])

[23, 1, 12, 12, 70, 19, 20, 40, 70, 2, 5, 1, 18, 19, 70, 3, 12, 1, 23, 70, 2, 1, 3, 11, 70, 9, 14, 20, 15, 70, 20, 8, 5, 70, 2, 12, 1, 3, 11, 70, 64, 18, 5, 21, 20, 5, 18, 19, 65, 18, 5, 21, 20, 5, 18, 19, 70, 60, 70, 19, 8, 15, 18, 20, 60, 19, 5, 12, 12, 5, 18, 19, 38, 70, 23, 1, 12, 12, 70, 19, 20, 18, 5, 5, 20, 44, 19, 70, 4, 23, 9, 14, 4, 12, 9, 14, 7, 47, 2, 1, 14, 4, 70, 15, 6, 70, 21, 12, 20, 18, 1, 60, 3, 25, 14, 9, 3, 19, 38, 70, 1, 18, 5, 70, 19, 5, 5, 9, 14, 7, 70, 7, 18, 5, 5, 14, 70, 1, 7, 1, 9, 14, 40] 

[23  1 12 12 70 19 20 40 70  2  5  1 18 19 70  3 12  1 23 70  2  1  3 11
 70  9 14 20 15 70 20  8  5 70  2 12  1  3 11 70 64 18  5 21 20  5 18 19
 65 18  5 21 20  5 18 19 70 60 70 19  8 15 18 20 60 19  5 12 12  5 18 19
 38 70 23  1 12 12 70 19 20 18  5  5 20 44 19 70  4 23  9 14  4 12  9 14
  7 47  2  1 14  4 70 15  6 70 21 12 20 18  1 60  3 25 14  9  3 19 38 70
  1 18  5 70 19  5  5  9 14  7 70  7 18  5  5 14 70  1  7  1  9 14 40  0
  0  0  0  0  0  0  0  0  0  0  0  0  

In [14]:
# Now that our data has been normalized, we can convert the 2D list to a numpy array
data = np.array(data)
data.shape

(120000, 1011)

### Obtaining the Labels

In this step, we're going to make our labels begin as 0-index (since they start with 1, currently). Afterwards, because this is a multiclass classifier task, we need the classes to be one-hot encoded. There are 4 classes, so 4 class columns will be created; a column value will be 1 if that column matches that sample's class, and 0 otherwise.

In [22]:
class_list = [ x-1 for x in train_df['class'].values ]    # make classes start with 0 index

In [24]:
from tensorflow.keras.utils import to_categorical

classes = to_categorical(class_list)

In [32]:
class_list[75:80]   # note the classes...

[2, 2, 2, 3, 3]

In [33]:
classes[75:80]  # now note where the 1 value is...

array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [35]:
train_df

Unnamed: 0,class,text
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."
...,...,...
119995,1,Pakistan's Musharraf Says Won't Quit as Army C...
119996,2,Renteria signing a top-shelf dealRed Sox gener...
119997,2,Saban not going to Dolphins yetThe Miami Dolph...
119998,2,Today's NFL gamesPITTSBURGH at NY GIANTS Time:...


## Creating the CNN

In [2]:
# Used to skip preprocessing stuff
from utils.preprocess import preprocess_text

train_data, train_classes, test_data, test_classes, tk = preprocess_text(
    train_path='./data/ag_news_csv/train.csv',
    test_path='./data/ag_news_csv/test.csv'
)

### Model Setup

In [4]:
vocab_size = len(tk.word_index)

In [5]:
embedding_weights = []
embedding_weights.append(np.zeros(vocab_size))  # we create a zero vector to represent the padding value

# for each character, generate a sparse array of zeros where a value of 1 is given for the index of that character
for char, idx in tk.word_index.items():
    onehot = np.zeros(vocab_size)
    onehot[idx-1] = 1
    embedding_weights.append(onehot)

embedding_weights = np.array(embedding_weights)

In [8]:
embedding_weights.shape

(70, 69)

In [7]:
# Embedding layer initialization

input_size = 1014
# vocab_size = 69
embedding_size = 69
conv_layers = [
    [256, 7, 3],
    [256, 7, 3],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, -1],
    [256, 3, 3],
]

embedding_layer = Embedding(
    vocab_size+1,
    embedding_size,
    input_length=input_size,
    weights=[embedding_weights]
)

### Model Construction

In [9]:
# additional parameters defined

fully_connected_layers = [1024, 1024]
num_of_classes = 4
dropout_p = 0.5
optimizer = 'adam'
loss = 'categorical_crossentropy'

In [10]:
# Defining the model:

# Input layer, shape of (?, 1011)
inputs = Input(shape=(input_size,), name='input', dtype='int64')

# Embedding layer
x = embedding_layer(inputs)

# Convolutional Layer
for filter_num, filter_size, pooling_size in conv_layers:
    x = Conv1D(filter_num, filter_size)(x)
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x)

x = Flatten()(x) # results in a (None, 8704) shape

# Output layer
predictions = Dense(num_of_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 1014)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1014, 69)          4830      
_________________________________________________________________
conv1d (Conv1D)              (None, 1008, 256)         123904    
_________________________________________________________________
activation (Activation)      (None, 1008, 256)         0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 336, 256)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 330, 256)          459008    
_________________________________________________________________
activation_1 (Activation)    (None, 330, 256)          0     

## Training the Model

### Specifying Data

In [11]:
# Shuffle
indices = np.arange(train_data.shape[0])
np.random.shuffle(indices)

x_train = train_data[indices][:1000]
y_train = train_classes[indices][:1000]

x_test = test_data[:100]
y_test = test_classes[:100]

# Training
model.fit(x_train, y_train,
          validation_data=(x_test, y_test),
          batch_size=128,
          epochs=10,
          verbose=2)

Epoch 1/10
8/8 - 12s - loss: 1.3857 - accuracy: 0.2580 - val_loss: 1.3602 - val_accuracy: 0.4400
Epoch 2/10
8/8 - 10s - loss: 1.3699 - accuracy: 0.3240 - val_loss: 1.3963 - val_accuracy: 0.2400
Epoch 3/10
8/8 - 10s - loss: 1.3738 - accuracy: 0.3050 - val_loss: 1.3522 - val_accuracy: 0.3400
Epoch 4/10
8/8 - 10s - loss: 1.3462 - accuracy: 0.3460 - val_loss: 1.2914 - val_accuracy: 0.4400
Epoch 5/10
8/8 - 10s - loss: 1.3229 - accuracy: 0.3530 - val_loss: 1.3357 - val_accuracy: 0.3400
Epoch 6/10
8/8 - 10s - loss: 1.2693 - accuracy: 0.4310 - val_loss: 1.2960 - val_accuracy: 0.4400
Epoch 7/10
8/8 - 10s - loss: 1.1909 - accuracy: 0.4610 - val_loss: 1.4851 - val_accuracy: 0.3900
Epoch 8/10
8/8 - 10s - loss: 1.0020 - accuracy: 0.5750 - val_loss: 1.5096 - val_accuracy: 0.3600
Epoch 9/10
8/8 - 10s - loss: 0.6532 - accuracy: 0.7450 - val_loss: 1.7948 - val_accuracy: 0.3400
Epoch 10/10
8/8 - 10s - loss: 0.3669 - accuracy: 0.8660 - val_loss: 2.6382 - val_accuracy: 0.3400


<keras.callbacks.History at 0x2c509794460>

In [13]:
test_data[101].shape

(1014,)

In [14]:
np.reshape(test_data[101], (1, 1014))

array([[ 8, 15, 13, ...,  0,  0,  0]])

In [19]:
model.predict(np.reshape(test_data[105], (1, 1014)))

array([[0.18573591, 0.5521907 , 0.17353961, 0.08853371]], dtype=float32)

In [20]:
test_classes[105]

array([0., 1., 0., 0.], dtype=float32)

In [14]:
test_data[101]

array([ 8, 15, 13, ...,  0,  0,  0])

In [23]:
model.get_layer('embedding').predict(test_data[101], (1, 1014))

AttributeError: 'Embedding' object has no attribute 'predict'

In [33]:
model.get_layer('embedding').get_weights()[0].shape

(70, 69)