In [1]:
from string import punctuation
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
import keras.backend.tensorflow_backend as K
import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import string
from nltk.corpus import stopwords
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import glob
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
from collections import Counter
from operator import itemgetter

Using TensorFlow backend.


In [2]:
remove_punctuation_table = str.maketrans('', '', '\'"!.,?:;')
stop_words = set(stopwords.words('english'))

def read_tokens_from_file(file_name):
    with open(file_name) as f:
        doc = f.read()
        # Tokenize
        tokens = word_tokenize(doc)

        # Remove punctuations
        tokens = [w.translate(remove_punctuation_table) for w in tokens]

        # Remove short words
        # Remove word including number characters
        # Make the word into lowercase
        tokens = [w.lower() for w in tokens if len(w) > 1 and w.isalpha()]

        # Filter out stop words
        tokens = [w for w in tokens if not w in stop_words]

        # Stemming
        #porter = PorterStemmer()
        #tokens = [porter.stem(w) for w in tokens]

        return tokens

In [3]:
vocab_set = set()
file_list = glob.glob('./data/train/neg/*.txt')
file_list = file_list + glob.glob('./data/train/pos/*.txt')

vocab_counter = Counter()

for file_name in tqdm(file_list):
    tokens = read_tokens_from_file(file_name)
    vocab_counter.update(tokens)

vocab_occurrence_list = list(vocab_counter.items())

# Remove rare words (in this case, the word which appeared less than 2 times)        
min_occurrence = 2
vocab_set = {k for k, c in vocab_occurrence_list if c >= min_occurrence}


100%|██████████| 25000/25000 [00:37<00:00, 664.23it/s]


In [4]:
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.per_process_gpu_memory_fraction = 0.9
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

In [5]:

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# turn a doc into clean tokens
def clean_doc(doc, vocab):
    # split into tokens by white space
    tokens = word_tokenize(doc)
    # remove punctuation from each token
    tokens = [w.translate(remove_punctuation_table) for w in tokens]
    #porter = PorterStemmer()
    #tokens = [porter.stem(w.lower()) for w in tokens]

    # filter out tokens not in vocab
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens

# load all docs in a directory
def process_docs(directory, vocab):
    documents = list()
    # walk through all files in the folder
    for filename in listdir(directory):
        # create the full path of the file to open
        path = directory + '/' + filename
        # load the doc
        doc = load_doc(path)
        # clean doc
        tokens = clean_doc(doc, vocab)
        # add to list
        documents.append(tokens)
    return documents

In [6]:
# load the vocabulary
vocab = vocab_set

In [7]:
# load all training reviews
positive_docs = process_docs('data/train/pos', vocab)
negative_docs = process_docs('data/train/neg', vocab)
train_docs = negative_docs + positive_docs

# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)
# pad sequences
max_length = max([len(s.split()) for s in train_docs])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
ytrain = array(([0] * len(negative_docs)) + ([1] * len(positive_docs)))

In [8]:
# load all test reviews
positive_docs = process_docs('data/test/pos', vocab)
negative_docs = process_docs('data/test/neg', vocab)
test_docs = negative_docs + positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array(([0] * len(negative_docs)) + ([1] * len(positive_docs)))

In [14]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1

# define model
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_length))
model.add(Conv1D(filters=8, kernel_size=4, activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Flatten())
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
#model.add(Dense(1, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 891, 128)          4267904   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 888, 8)            4104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 296, 8)            0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 2368)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 18952     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 4,290,969
Trainable params: 4,290,969
Non-trainable params: 0
_________________________________________________________________


In [15]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# fit network (Training)
model.fit(Xtrain, ytrain, epochs=6, verbose=2, validation_data=(Xtest, ytest))


Train on 25000 samples, validate on 25000 samples
Epoch 1/6
 - 7s - loss: 0.4016 - acc: 0.8037 - val_loss: 0.2951 - val_acc: 0.8786
Epoch 2/6
 - 6s - loss: 0.1642 - acc: 0.9408 - val_loss: 0.3323 - val_acc: 0.8639
Epoch 3/6
 - 6s - loss: 0.0595 - acc: 0.9826 - val_loss: 0.4500 - val_acc: 0.8622
Epoch 4/6
 - 6s - loss: 0.0164 - acc: 0.9964 - val_loss: 0.5635 - val_acc: 0.8580
Epoch 5/6
 - 6s - loss: 0.0041 - acc: 0.9995 - val_loss: 0.6641 - val_acc: 0.8577
Epoch 6/6
 - 6s - loss: 0.0013 - acc: 0.9998 - val_loss: 0.7376 - val_acc: 0.8578


<keras.callbacks.History at 0x7f67081149e8>

In [12]:
# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 86.260000
