In [3]:
# make sure to run nltk.download() if you're getting errors
import nltk
nltk.download('punkt')
from keras.layers import Conv2D, Concatenate, Dense, Flatten, Reshape, Dropout, MaxPool2D
import string
from keras.engine.input_layer import Input
from keras import preprocessing
from keras.models import Sequential, Model
from keras.optimizers import Adadelta
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
import numpy as np
import collections
from nltk.tokenize import word_tokenize

pos_examples = open('data/rt-polarity.pos', encoding='ISO-8859-1').readlines()
neg_examples = open('data/rt-polarity.neg', encoding='ISO-8859-1').readlines()

vocab = collections.defaultdict(int)
translator = str.maketrans('', '', string.punctuation)

[nltk_data] Downloading package punkt to /Users/erickzhao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# 1. Tokenize all strings
token_pos = list(map(lambda ex: word_tokenize(ex.translate(translator)), pos_examples))
token_neg = list(map(lambda ex: word_tokenize(ex.translate(translator)), neg_examples))

# 2. Get vocabulary size and max sentence length
max_sentence_length = 0
for ex_p, ex_n in zip(token_pos, token_neg):
    max_sentence_length = max(max_sentence_length, len(ex_p), len(ex_n))
    for word in ex_p:
        vocab[word] += 1
    for word in ex_n:
        vocab[word] += 1

vocab_size = len(vocab)
print("Vocabulary size: " + str(vocab_size))
print("Max sentence length: " + str(max_sentence_length))

# 3. One-Hot encode and pad.
encoded_pos = [preprocessing.text.one_hot(ex, vocab_size) for ex in pos_examples]
padded_pos = preprocessing.sequence.pad_sequences(encoded_pos, maxlen=max_sentence_length, padding='post')
encoded_neg = [preprocessing.text.one_hot(ex, vocab_size) for ex in neg_examples]
padded_neg = preprocessing.sequence.pad_sequences(encoded_neg, maxlen=max_sentence_length, padding='post')

print(padded_pos.shape)
print(padded_neg.shape)

X = np.concatenate((padded_pos, padded_neg))
y = np.concatenate((np.ones(padded_pos.shape[0]), np.zeros(padded_neg.shape[0])))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

Vocabulary size: 20490
Max sentence length: 51
(5331, 51)
(5331, 51)
(8529, 51) (8529,)
(2133, 51) (2133,)


In [None]:
# 4. Create the CNN.
# Using an embedding layer, followed by a convolutional, max-pooling and softmax layer.
filter_sizes = [3,4,5]
hidden_units = [100,2]
adadelta = Adadelta()
embed_dim = 300

# Initialize sequential model

# Create embedding layer for randomized word vectors


sent_length = max_sentence_length
num_filters = 100


inputs = Input(shape=(sent_length,), dtype='int32')
embed = Embedding(input_dim = vocab_size,
                  output_dim = embed_dim,
                  input_length = sent_length)(inputs)
reshape = Reshape((sent_length, embed_dim, 1))(embed)

shapes = []
for fsize in filter_sizes:
  shapes.append(((fsize, embed_dim),(sent_length - fsize + 1, 1)))

pools = []
for shape in shapes:
  (fshape, pshape) = shape
  conv = Conv2D(filters=num_filters, kernel_size=fshape, activation='relu')(reshape)
  pool = MaxPool2D(pool_size=pshape, strides=(1,1), padding='valid')(conv)
  pools.append(pool)

concat = Concatenate(axis=1)(pools)
flat = Flatten()(concat)
dropout = Dropout(0.5)(flat)
output = Dense(units=1, activation='sigmoid')(dropout)

model = Model(inputs=inputs, outputs=output)

model.compile(loss='binary_crossentropy', optimizer=adadelta, metrics=['accuracy'])
print(model.summary())

model.fit(X_train, y_train, epochs=25, batch_size=20, validation_data=(X_test, y_test), shuffle=True)

# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 51)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 51, 300)      6147000     input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 51, 300, 1)   0           embedding_1[0][0]                
___________________________________________________________________________________________