In [1]:
from nltk.corpus import stopwords
import string
import re

In [2]:
def load_doc(filename):
    file = open(filename,'r')
    text = file.read()
    file.close()
    return text

In [3]:
def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile('[%s]' %re.escape(string.punctuation))
    tokens = [re_punc.sub('',w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word)>1]
    return tokens

In [4]:
filename = 'txt_sentoken/pos/cv000_29590.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

In [5]:
from os import listdir
from collections import Counter

In [6]:
def add_doc_to_vocab(filename,vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)

In [7]:
def process_docs(directory,vocab):
    for filename in listdir(directory):
        if filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        add_doc_to_vocab(path,vocab)

In [8]:
vocab = Counter()

In [9]:
process_docs('txt_sentoken/pos',vocab)
process_docs('txt_sentoken/neg',vocab)

In [10]:
print(len(vocab))

44276


In [11]:
print(vocab.most_common(50))

[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('bad', 1248), ('could', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [12]:
min_occurane = 2
tokens = [k for k,c in vocab.items() if c>= min_occurane]

In [13]:
def save_list(lines,filename):
    data = '\n'.join(lines)
    file = open(filename,'w')
    file.write(data)
    file.close()
save_list(tokens,'vocab.txt')

In [14]:
print(len(tokens))

25767


In [15]:
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

In [16]:
# Updated clean_doc

In [17]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Using TensorFlow backend.


In [18]:
def clean_doc(doc, vocab):
    tokens = doc.split()
    re_punc = re.compile('[%s]'%re.escape(string.punctuation))
    tokens = [re_punc.sub('',w) for w in tokens]
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens

In [19]:
def process_docs(directory, vocab, is_train):
    documents = list()
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        
        path = directory + '/' + filename
        doc = load_doc(path)
        tokens = clean_doc(doc,vocab)
        documents.append(tokens)
    return documents

In [27]:
def load_clean_dataset(vocab, is_train):
    neg = process_docs('txt_sentoken/neg',vocab,is_train)
    pos = process_docs('txt_sentoken/pos',vocab,is_train)
    docs = neg + pos
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels

In [28]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [29]:
def encode_docs(tokenizer, max_length, docs):
    encoded = tokenizer.texts_to_sequences(docs)
    padded = pad_sequences(encoded,maxlen = max_length, padding = 'post')
    return padded

In [41]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length))
    model.add(Conv1D(filters=32, kernel_size=8,activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(1,activation="sigmoid"))
    model.compile(loss="binary_crossentropy",optimizer ='adam',metrics=["accuracy"])
    model.summary()
    plot_model(model,to_file = "model.png", show_shapes=True)
    return model

In [31]:
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

In [32]:
import traceback
try:
    train_docs, ytrain = load_clean_dataset(vocab,True)
except:
    print(traceback.print_exc())

In [33]:
tokenizer = create_tokenizer(train_docs)

In [34]:
vocab_size = len(tokenizer.word_index)+1

In [35]:
print("Vocabulary size: %d" %vocab_size)

Vocabulary size: 25768


In [36]:
max_length = max([len(s.split()) for s in train_docs])

In [37]:
print("Maximum length: %d" % max_length)

Maximum length: 1317


In [42]:
Xtrain = encode_docs(tokenizer,max_length,train_docs)
model = define_model(vocab_size,max_length)

W0503 19:13:23.795296 140196306483008 deprecation.py:323] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 1310, 32)          25632     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 20961     
Total params: 2,623,393
Trainable params: 2,623,393
Non-trainable params: 0
_________________________________________________________________


In [43]:
model.fit(Xtrain,ytrain,epochs=10,verbose=2)

W0503 19:15:36.345375 140196306483008 deprecation.py:323] From /home/arvind/.local/lib/python3.6/site-packages/keras/optimizers.py:550: BaseResourceVariable.constraint (from tensorflow.python.ops.resource_variable_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Apply a constraint manually following the optimizer update step.
W0503 19:15:36.669386 140196306483008 module_wrapper.py:136] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



Epoch 1/10
 - 11s - loss: 0.6883 - accuracy: 0.5317
Epoch 2/10
 - 11s - loss: 0.5342 - accuracy: 0.7750
Epoch 3/10
 - 11s - loss: 0.1315 - accuracy: 0.9900
Epoch 4/10
 - 11s - loss: 0.0102 - accuracy: 1.0000
Epoch 5/10
 - 11s - loss: 0.0033 - accuracy: 1.0000
Epoch 6/10
 - 10s - loss: 0.0023 - accuracy: 1.0000
Epoch 7/10
 - 10s - loss: 0.0013 - accuracy: 1.0000
Epoch 8/10
 - 11s - loss: 9.5155e-04 - accuracy: 1.0000
Epoch 9/10
 - 11s - loss: 7.5352e-04 - accuracy: 1.0000
Epoch 10/10
 - 11s - loss: 5.9029e-04 - accuracy: 1.0000


<keras.callbacks.callbacks.History at 0x7f819d537358>

In [44]:
model.save('model.h5')

In [45]:
# Evaluate model

In [46]:
train_docs, ytrain = load_clean_dataset(vocab, True)

In [47]:
test_docs, ytest = load_clean_dataset(vocab, False)

In [48]:
tokenizer = create_tokenizer(train_docs)

In [49]:
vocab_size = len(tokenizer.word_index)+1

In [50]:
print("Vocabulary size: %d"%vocab_size)

Vocabulary size: 25768


In [51]:
max_length = max([len(s.split()) for s in train_docs])

In [52]:
print('Maximum length: %d' %max_length)

Maximum length: 1317


In [53]:
Xtrain = encode_docs(tokenizer, max_length, train_docs)
Xtest = encode_docs(tokenizer,max_length,test_docs)

In [55]:
from keras.models import load_model
model = load_model('model.h5')
_ , acc = model.evaluate(Xtrain, ytrain, verbose=0)
print('Train Accuracy: %f' %(acc*100))

Train Accuracy: 100.000000


In [56]:
_, acc = model.evaluate(Xtest,ytest,verbose=0)
print("Test Accuracy: %f"%(acc*100))

Test Accuracy: 85.500002


In [57]:
# Prediction on a new review

In [58]:
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    line = clean_doc(review, vocab)
    padded = encode_docs(tokenizer, max_length, [line])
    yhat = model.predict(padded, verbose=0)
    percent_pos = yhat[0,0]
    if round(percent_pos)==0:
        return(1-percent_pos), "NEGATIVE"
    return percent_pos, 'POSITIVE'

In [62]:
text = "Everyone will enjoy this film. I love it, recommended!"

In [63]:
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)

In [64]:
print("Review: [%s]\nSentiment: %s (%.3f%%)" %(text, sentiment, percent*100))

Review: [Everyone will enjoy this film. I love it, recommended!]
Sentiment: POSITIVE (52.890%)


In [65]:
text = "This is a bad movie. Do not watch it. It sucks"

In [66]:
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)

In [67]:
print("Review: [%s]\nSentiment: %s (%.3f%%)"%(text, sentiment, percent*100))

Review: [This is a bad movie. Do not watch it. It sucks]
Sentiment: NEGATIVE (52.249%)
