In [1]:
from nltk.corpus import stopwords
import string
import re
from os import listdir
from pickle import dump

In [2]:
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read; it is opened in text mode ('r').

    Returns:
        The full contents of the file as a ``str``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even when read() raises, which a
    # separate open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [3]:
def clean_doc(doc):
    """Turn raw document text into one space-separated string of tokens.

    The text is split on whitespace and punctuation characters are stripped
    from every token. A token is kept only if it is purely alphabetic, is not
    an English stop word, and is longer than one character.

    Args:
        doc: Raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    punctuation_pattern = re.compile('[%s]'%re.escape(string.punctuation))
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for raw_token in doc.split():
        candidate = punctuation_pattern.sub('', raw_token)
        # Drop anything that is empty or contains non-letters after stripping.
        if not candidate.isalpha():
            continue
        if candidate in english_stop_words:
            continue
        # Single-letter leftovers are discarded.
        if len(candidate) <= 1:
            continue
        kept.append(candidate)
    return ' '.join(kept)

In [4]:
# Sanity check: run one file from the 'pos' directory through load_doc() and
# clean_doc(), then print the resulting space-separated token string.
filename = 'txt_sentoken/pos/cv000_29590.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

films adapted comic books plenty success whether theyre superheroes batman superman spawn geared toward kids casper arthouse crowd ghost world theres never really comic book like hell starters created alan moore eddie campbell brought medium whole new level mid series called watchmen say moore campbell thoroughly researched subject jack ripper would like saying michael jackson starting look little odd book graphic novel pages long includes nearly consist nothing footnotes words dont dismiss film source get past whole comic book thing might find another stumbling block hells directors albert allen hughes getting hughes brothers direct seems almost ludicrous casting carrot top well anything riddle better direct film thats set ghetto features really violent street crime mad geniuses behind menace ii society ghetto question course whitechapel londons east end filthy sooty place whores called unfortunates starting get little nervous mysterious psychopath carving profession surgical precisio

In [5]:
def process_docs(directory, is_train):
    """Load and clean the documents in ``directory`` for one data split.

    Files whose names start with 'cv9' form the held-out split; every other
    file belongs to the training split.

    Args:
        directory: Directory holding one text file per document.
        is_train: Truthy to return the training split, falsy to return the
            held-out split.

    Returns:
        A list of cleaned documents (as returned by ``clean_doc``), ordered by
        file name.
    """
    documents = list()
    # listdir() returns entries in arbitrary order; sorting makes the document
    # order reproducible across runs and machines.
    for filename in sorted(listdir(directory)):
        is_held_out = filename.startswith('cv9')
        # Skip files that belong to the other split: held-out files when
        # building the training set, and training files otherwise.
        if bool(is_train) == is_held_out:
            continue
        path = directory + '/' + filename
        documents.append(clean_doc(load_doc(path)))
    return documents

In [6]:
def load_clean_dataset(is_train):
    """Assemble the cleaned documents and their labels for one split.

    Args:
        is_train: Passed through to ``process_docs`` to select the split.

    Returns:
        A ``(docs, labels)`` tuple. ``docs`` holds the documents from
        'txt_sentoken/neg' followed by those from 'txt_sentoken/pos';
        ``labels`` is a parallel list with 0 for every 'neg' document and 1
        for every 'pos' document.
    """
    negative_docs = process_docs('txt_sentoken/neg', is_train)
    positive_docs = process_docs('txt_sentoken/pos',is_train)
    labels = [0] * len(negative_docs) + [1] * len(positive_docs)
    return negative_docs + positive_docs, labels

In [7]:
def save_dataset(dataset, filename):
    """Pickle ``dataset`` to ``filename`` and print a confirmation line.

    Args:
        dataset: Any picklable object to store.
        filename: Destination path; it is opened in binary write mode and
            overwritten if it already exists.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Close the handle deterministically so the pickle is fully flushed to
    # disk before anything tries to read it back.
    with open(filename, 'wb') as file:
        dump(dataset, file)
    print('Saved: %s'%filename)

In [8]:
# Build the training split: load_clean_dataset(True) skips every file whose
# name starts with 'cv9'.
train_docs, ytrain = load_clean_dataset(True)

In [9]:
# Number of documents in the training split.
len(train_docs)

1800

In [10]:
# Build the held-out split: load_clean_dataset(False) keeps only files whose
# names start with 'cv9'.
test_docs, ytest = load_clean_dataset(False)

In [11]:
# Persist each split as a [documents, labels] pair so later cells can reload
# it with load_dataset() instead of re-cleaning the raw text.
save_dataset([train_docs,ytrain],'train.pkl')
save_dataset([test_docs,ytest], 'test.pkl')

Saved: train.pkl
Saved: test.pkl


In [12]:
# Encode data, then define and train the multichannel CNN

In [13]:
from pickle import load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.merge import concatenate


Using TensorFlow backend.


In [14]:
def load_dataset(filename):
    """Load a pickled object from ``filename`` and return it.

    Args:
        filename: Path of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: unpickling can execute arbitrary code, so only load pickle files
    # that come from a trusted source.
    # The context manager closes the handle even when load() raises.
    with open(filename,'rb') as file:
        return load(file)

In [15]:
def create_tokenizer(lines):
    """Fit a Keras ``Tokenizer`` on ``lines`` and return it.

    Args:
        lines: Texts handed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [16]:
def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The maximum of ``len(s.split())`` over ``lines``, or 0 when ``lines``
        is empty.
    """
    # default=0 keeps an empty collection from raising ValueError in max();
    # the generator avoids building a throwaway list.
    return max((len(s.split()) for s in lines), default=0)

In [17]:
def encode_text(tokenizer, lines, length):
    """Convert texts to integer sequences of a fixed length.

    Args:
        tokenizer: Object providing ``texts_to_sequences`` (the fitted
            tokenizer from ``create_tokenizer``).
        lines: Texts to encode.
        length: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` with ``padding='post'``, i.e. padding
        is appended at the end of each sequence.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [18]:
def _build_channel(length, vocab_size, kernel_size):
    """Build one Input -> Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten channel."""
    channel_input = Input(shape=(length,))
    embedding = Embedding(vocab_size, 100)(channel_input)
    conv = Conv1D(filters=32, kernel_size=kernel_size, activation="relu")(embedding)
    drop = Dropout(0.5)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    return channel_input, Flatten()(pool)


def define_model(length, vocab_size):
    """Build and compile the three-channel CNN classifier.

    Each channel has its own input and embedding and differs only in the
    Conv1D kernel size (4, 6 and 8). The flattened channel outputs are
    concatenated and fed through a 10-unit ReLU layer into a single sigmoid
    output.

    Side effects: prints the model summary and writes a diagram of the model
    to 'multichannel.png' via ``plot_model``.

    Args:
        length: Number of token ids per input sequence.
        vocab_size: Input dimension of each Embedding layer.

    Returns:
        The compiled ``Model``; it expects a list of three inputs, each of
        shape ``(length,)``, in kernel-size order 4, 6, 8.
    """
    # Channels are created one after another in kernel-size order, matching
    # the order of the model's inputs.
    channels = [_build_channel(length, vocab_size, size) for size in (4, 6, 8)]
    channel_inputs = [channel_input for channel_input, _ in channels]
    channel_outputs = [flattened for _, flattened in channels]

    merged = concatenate(channel_outputs)

    dense1 = Dense(10, activation="relu")(merged)
    outputs = Dense(1,activation="sigmoid")(dense1)
    model = Model(inputs=channel_inputs, outputs=outputs)

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics = ["accuracy"])
    model.summary()
    plot_model(model, show_shapes = True, to_file = 'multichannel.png')
    return model

In [19]:
# Reload the saved training split and fit the tokenizer on the training
# documents.
trainLines, trainLabels = load_dataset('train.pkl')
tokenizer = create_tokenizer(trainLines)

In [20]:
# Number of training labels.
len(trainLabels)

1800

In [21]:
# Token count of the longest training document; used below as the padded
# sequence length and as the model's input length.
length = max_length(trainLines)

In [22]:
# Report the sequence length computed from the training documents.
print("Max document length: %d" %length)

Max document length: 1380


In [23]:
# Size of the Embedding input: one slot per word in the tokenizer's index plus
# one extra. NOTE(review): the +1 presumably accounts for index 0, which the
# Keras Tokenizer does not assign to any word — confirm against the Keras docs.
vocab_size = len(tokenizer.word_index) + 1

In [24]:
# Report the vocabulary size that will be passed to the Embedding layers.
print("Vocabulary size: %d" %vocab_size)

Vocabulary size: 44277


In [25]:
# Integer-encode the training documents and post-pad them to `length`.
trainX = encode_text(tokenizer, trainLines,length)

In [26]:
# Build and compile the model; define_model() also prints the summary and
# writes 'multichannel.png'.
model = define_model(length, vocab_size)

W0504 13:47:53.260085 140365116127040 deprecation.py:506] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1633: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
W0504 13:47:53.320976 140365116127040 module_wrapper.py:136] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0504 13:47:53.496384 140365116127040 deprecation.py:323] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1380)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 1380)         0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 1380)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1380, 100)    4427700     input_1[0][0]                    
____________________________________________________________________________________________

In [27]:
# The model has three inputs (one per channel), so the same encoded documents
# are supplied three times.
model.fit([trainX, trainX, trainX], trainLabels, epochs=7, batch_size=16)

W0504 13:47:58.077256 140365116127040 deprecation.py:323] From /home/arvind/.local/lib/python3.6/site-packages/keras/optimizers.py:550: BaseResourceVariable.constraint (from tensorflow.python.ops.resource_variable_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Apply a constraint manually following the optimizer update step.
W0504 13:47:58.917683 140365116127040 module_wrapper.py:136] From /home/arvind/.local/lib/python3.6/site-packages/tensorflow_core/python/util/module_wrapper.py:163: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.



Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.callbacks.History at 0x7fa8ed2b20f0>

In [28]:
# Save the trained model to disk; the evaluation section reloads it with
# load_model('model.h5').
model.save('model.h5')

In [29]:
# Evaluate model

In [30]:
# Reload both saved splits so the evaluation section does not depend on the
# variables created while training.
trainLines, trainLabels = load_dataset('train.pkl')
testLines, testLabels = load_dataset('test.pkl')

In [31]:
# Re-create the tokenizer from the training documents (the same data it was
# fitted on before training), so test text is encoded with the training
# vocabulary.
tokenizer = create_tokenizer(trainLines)

In [32]:
# Sequence length is taken from the training documents only; the test
# documents are encoded to this same length below.
length = max_length(trainLines)

In [33]:
# Same vocabulary-size computation as in the training section; in the cells
# shown here it is only printed, not passed to the loaded model.
vocab_size = len(tokenizer.word_index)+1

In [34]:
# Should match the length reported in the training section.
print("Max document length: %d" %length)

Max document length: 1380


In [35]:
# Should match the vocabulary size reported in the training section.
print("Vocabulary size: %d" % vocab_size)

Vocabulary size: 44277


In [36]:
# Encode both splits with the training-fitted tokenizer and the training
# sequence length so they match the model's input shape.
trainX = encode_text(tokenizer, trainLines, length)
testX = encode_text(tokenizer, testLines, length)

In [37]:
# Both arrays should share the same second dimension (`length`).
print(trainX.shape, testX.shape)

(1800, 1380) (200, 1380)


In [38]:
from keras.models import load_model

In [39]:
# Reload the model written earlier by model.save('model.h5').
model = load_model('model.h5')

In [40]:
# evaluate() yields two values here (the model was compiled with a loss and
# the "accuracy" metric); the first is discarded and the second is reported
# as a percentage.
_ , acc = model.evaluate([trainX, trainX, trainX], trainLabels, verbose=0)
print("Train Accuracy: %.2f"%(acc*100))

Train Accuracy: 100.00


In [45]:
# Evaluate on the held-out split; this overwrites `acc` from the training
# evaluation above.
# NOTE(review): this cell's execution count (45) does not follow the previous
# cell's (40) — confirm the notebook reproduces under Restart & Run All.
_ , acc = model.evaluate([testX, testX, testX], testLabels, verbose=0)

In [46]:
# Relies on `acc` from the test-set evaluate() call in the previous cell.
print("Test Accuracy: %.2f" %(acc*100))

Test Accuracy: 84.00
