In [1]:
from string import punctuation
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
import keras.backend.tensorflow_backend as K
import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import string
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import glob
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
from collections import Counter
from operator import itemgetter

Using TensorFlow backend.


# Hi
This is test doc.

In [2]:
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.9
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

In [3]:
remove_punctuation_table = str.maketrans('', '', '\'"!.,?:;')
stop_words = set(stopwords.words('english'))

In [4]:

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = word_tokenize(doc)
    # remove punctuation from each token
    tokens = [w.translate(remove_punctuation_table) for w in tokens]
    porter = PorterStemmer()
    tokens = [porter.stem(w.lower()) for w in tokens]

    # filter out tokens not in vocab
    #tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens

# load all docs in a directory
def process_docs(directory):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc)
        # add to list
        documents.append(tokens)
    return documents

In [5]:
# load all training reviews
positive_docs = process_docs('data/train/pos')
negative_docs = process_docs('data/train/neg')
train_docs = negative_docs + positive_docs

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
ytrain = array(([0] * len(negative_docs)) + ([1] * len(positive_docs)))

In [7]:
# load all test reviews
positive_docs = process_docs('data/test/pos')
negative_docs = process_docs('data/test/neg')
test_docs = negative_docs + positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array(([0] * len(negative_docs)) + ([1] * len(positive_docs)))

In [8]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

# define model
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_length))
model.add(Conv1D(filters=8, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Flatten())
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
#model.add(Dense(1, activation='softmax'))
print(model.summary())

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2573, 128)         7536256   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 2570, 8)           4104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 856, 8)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6848)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 54792     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 7,595,161
Trainable params: 7,595,161
Non-trainable para

In [9]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [10]:
# fit network (Training)
model.fit(Xtrain, ytrain, epochs=6, verbose=2, validation_data=(Xtest, ytest))


Instructions for updating:
Use tf.cast instead.
Train on 25000 samples, validate on 25000 samples
Epoch 1/6
 - 11s - loss: 0.3835 - acc: 0.8046 - val_loss: 0.2859 - val_acc: 0.8814
Epoch 2/6
 - 9s - loss: 0.1530 - acc: 0.9449 - val_loss: 0.3171 - val_acc: 0.8728
Epoch 3/6
 - 9s - loss: 0.0636 - acc: 0.9785 - val_loss: 0.4274 - val_acc: 0.8634
Epoch 4/6
 - 9s - loss: 0.0217 - acc: 0.9942 - val_loss: 0.5587 - val_acc: 0.8564
Epoch 5/6
 - 9s - loss: 0.0063 - acc: 0.9990 - val_loss: 0.6966 - val_acc: 0.8558
Epoch 6/6
 - 9s - loss: 0.0021 - acc: 0.9997 - val_loss: 0.8421 - val_acc: 0.8470


<keras.callbacks.History at 0x7fc800684080>

In [None]:
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))