In [23]:
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.utils.np_utils import to_categorical

In [26]:
data_dir = 'D:/Deeplearning/datasets/bbc'
labels = []
texts = []
label_count = 0
for label_type in ['business', 'entertainment', 'politics', 'sport', 'tech']:
    dir_name = os.path.join(data_dir, label_type)
    for fname in os.listdir(dir_name):
        f = open(os.path.join(dir_name, fname), encoding="utf8", errors='ignore')
        texts.append(f.read())
        f.close()
        labels.append(label_count)
    label_count = label_count + 1

maxlen = 375 # Cut off after 375 words in tokenizer
training_samples = 1725
validation_samples = 500
max_words = 10000 # Size of dictionary for our problem
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)

# Randomly get training and validation samples
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

# Get pre-trained embedding vectors
# Each vector has a size of 300
glove_dir = 'D:/Deeplearning/datasets/bbc/glove'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.300d.txt'))
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()

# Embedding dimension is the same as our embedding vector size
embedding_dim = 300
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Start creating the model
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(5, activation='softmax'))
model.summary()

# Set weights of the embedding layer from our pretrained embedding matrix
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

# Compile and start training for 20 epochs
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_val, y_val), shuffle=True)
model.save_weights('bbc_news_classfication_model.h5')




Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 375, 300)          3000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 112500)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                3600032   
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 85        
Total params: 6,600,645
Trainable params: 6,600,645
Non-trainable params: 0
_________________________________________________________________



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 1