In [29]:
import numpy as np
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from sklearn.datasets import fetch_20newsgroups
import pandas as pd

import matplotlib.pyplot as plt
plt.switch_backend('agg')
%matplotlib inline

In [30]:
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [31]:
def load_embeddings(filename):
    embeddings_index = {}
    with open(filename,encoding='utf8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Total %s word vectors in %s' % (len(embeddings_index), filename))
    
    return embeddings_index

In [32]:
def create_embedding_layer(word_index, embeddings_index):
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    
    return Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,trainable=True)

In [48]:
def create_model(embedding_layer, macronum):
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    l_cov1= Conv1D(128, 5, activation='relu')(embedded_sequences)
    l_pool1 = MaxPooling1D(5)(l_cov1)
    l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
    l_pool2 = MaxPooling1D(5)(l_cov2)
    l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
    l_pool3 = MaxPooling1D(35)(l_cov3)  # global max pooling
    l_flat = Flatten()(l_pool3)
    l_dense = Dense(128, activation='relu')(l_flat)
    preds = Dense(len(macronum), activation='softmax')(l_dense)
    
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

    print("Simplified convolutional neural network")
    model.summary()
    cp = ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)
    return (model, cp)

In [49]:
def load_twenty_newsgroup():
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

    df = pd.DataFrame([newsgroups_train.data, newsgroups_train.target.tolist()]).T
    df.columns = ['text', 'target']
    
    return df

In [50]:
def load_data():
    texts = []
    labels = []

    df = load_twenty_newsgroup()
    
    texts = df['text']
    labels = df['target']
    
    macronum=sorted(set(df['target']))
    
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Number of Unique Tokens',len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

    labels = to_categorical(np.asarray(labels))
    print('Shape of Data Tensor:', data.shape)
    print('Shape of Label Tensor:', labels.shape)

    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    x_train = data[:-nb_validation_samples]
    y_train = labels[:-nb_validation_samples]
    x_val = data[-nb_validation_samples:]
    y_val = labels[-nb_validation_samples:]
    
    return (x_train, y_train, x_val, y_val, word_index, macronum)

In [51]:
embeddings_filename = 'glove.6B.' + str(EMBEDDING_DIM) + 'd.txt'
embeddings_index = load_embeddings(embeddings_filename)

Total 400000 word vectors in glove.6B.100d.txt


In [52]:
x_train, y_train, x_val, y_val, word_index, macronum = load_data()

Number of Unique Tokens 105372
Shape of Data Tensor: (11314, 1000)
Shape of Label Tensor: (11314, 20)


In [53]:
embedding_layer = create_embedding_layer(word_index, embeddings_index)
model, cp = create_model(embedding_layer, macronum)

Simplified convolutional neural network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 1000, 100)         10537300  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 

In [54]:
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=15, batch_size=2, callbacks=[cp])

Instructions for updating:
Use tf.cast instead.
Train on 9052 samples, validate on 2262 samples
Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.43899, saving model to model_cnn.hdf5
Epoch 2/15

Epoch 00002: val_acc improved from 0.43899 to 0.50531, saving model to model_cnn.hdf5
Epoch 3/15

Epoch 00003: val_acc did not improve from 0.50531
Epoch 4/15

Epoch 00004: val_acc improved from 0.50531 to 0.52829, saving model to model_cnn.hdf5
Epoch 5/15

Epoch 00005: val_acc did not improve from 0.52829
Epoch 6/15

Epoch 00006: val_acc improved from 0.52829 to 0.56322, saving model to model_cnn.hdf5
Epoch 7/15

Epoch 00007: val_acc did not improve from 0.56322
Epoch 8/15

Epoch 00008: val_acc did not improve from 0.56322
Epoch 9/15

Epoch 00009: val_acc did not improve from 0.56322
Epoch 10/15

Epoch 00010: val_acc improved from 0.56322 to 0.56631, saving model to model_cnn.hdf5
Epoch 11/15

Epoch 00011: val_acc did not improve from 0.56631
Epoch 12/15

Epoch 00012: val_acc did not i

KeyboardInterrupt: 

In [None]:
fig2=plt.figure()
plt.plot(history.history['acc'],'r',linewidth=3.0)
plt.plot(history.history['val_acc'],'b',linewidth=3.0)
plt.legend(['Training Accuracy', 'Validation Accuracy'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Accuracy',fontsize=16)
plt.title('Accuracy Curves : CNN',fontsize=16)
fig2.savefig('accuracy_cnn.png')
plt.show()