In [33]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt 

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM, GRU, SimpleRNN, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy

In [12]:
df = pd.read_csv("../data/bbc_text_cls.csv")

In [13]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\r\n\r\nQuart...,business
1,Dollar gains on Greenspan speech\r\n\r\nThe do...,business
2,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business
3,High fuel prices hit BA's profits\r\n\r\nBriti...,business
4,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business


In [14]:
df['targets'] = df['labels'].astype("category").cat.codes

In [15]:
df.head()

Unnamed: 0,text,labels,targets
0,Ad sales boost Time Warner profit\r\n\r\nQuart...,business,0
1,Dollar gains on Greenspan speech\r\n\r\nThe do...,business,0
2,Yukos unit buyer faces loan claim\r\n\r\nThe o...,business,0
3,High fuel prices hit BA's profits\r\n\r\nBriti...,business,0
4,Pernod takeover talk lifts Domecq\r\n\r\nShare...,business,0


In [16]:
# Number of Classes
K = df['labels'].nunique()

In [17]:
df_train, df_test = train_test_split(df,test_size=0.3)

In [24]:
# Convert Sentence to Sequences
MAX_VOCAB_SIZE = 2000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(df_train['text'])
sequence_train = tokenizer.texts_to_sequences(df_train['text'])
sequence_test = tokenizer.texts_to_sequences(df_test['text'])

In [25]:
# Get word -> Index mapping
word2idx = tokenizer.word_index
V = len(word2idx) #Vocab size
print("Found %s unique tokens." % V)

Found 28466 unique tokens.


In [26]:
# Pad Sequence to get N*T Matrix
# N: Number of Documnets
# T: Sequence Length

data_train = pad_sequences(sequence_train)
# get Sequence Length
T = data_train.shape[1]

data_test = pad_sequences(sequences = sequence_test, maxlen=T)

print('Shape of data train tensor:', data_train.shape)
print('Shape of data test tensor:', data_test.shape)

Shape of data train tensor: (1557, 3581)
Shape of data test tensor: (668, 3581)


In [30]:
# Create Model
D = 20 # Embedding Dimensionality
M1 = 32 # Hyper parameter

# Note: we actually want to the size of the embedding to (V + 1) x D,
# because the first index starts from 1 and not 0.
# Thus, if the final index of the embedding matrix is V,
# then it actually must have size V + 1.

i = Input(shape = (T,))
x = Embedding(V+1, D)(i)
x = SimpleRNN(M1)(x)
x = Dense(K,activation='softmax')(x)

model = Model(i,x)


In [29]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 3581)]            0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 3581, 20)          569340    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                1696      
_________________________________________________________________
dense (Dense)                (None, 5)                 165       
Total params: 571,201
Trainable params: 571,201
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Compile and fit
model.compile(
  loss=SparseCategoricalCrossentropy(from_logits=True),
  optimizer='adam',
  metrics=['accuracy']
)


print('Training model...')
r = model.fit(
  data_train,
  df_train['targets'],
  epochs=50,
  validation_data=(data_test, df_test['targets'])
)

In [None]:
# Plot loss per iteration
plt.plot(r.history['loss'], label='train loss')
plt.plot(r.history['val_loss'], label='val loss')
plt.legend();

In [None]:
# Plot accuracy per iteration
plt.plot(r.history['accuracy'], label='train acc')
plt.plot(r.history['val_accuracy'], label='val acc')
plt.legend();