# Imports

In [2]:
# make sure to run nltk.download() if you're getting errors
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import string
from keras.layers import Conv2D, Concatenate, Dense, Flatten, Reshape, Dropout, MaxPool2D
from keras.engine.input_layer import Input
from keras import preprocessing
from keras.models import Sequential, Model
from keras.optimizers import Adadelta
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.regularizers import l2
import numpy as np
import collections
from nltk.tokenize import word_tokenize
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Pre-processing

In [10]:

pos_examples = open('data/rt-polarity.pos', encoding='ISO-8859-1').readlines()
neg_examples = open('data/rt-polarity.neg', encoding='ISO-8859-1').readlines()

vocab = collections.defaultdict(int)
translator = str.maketrans('', '', string.punctuation)

# 1. Tokenize all strings
token_pos = list(map(lambda ex: word_tokenize(ex.translate(translator)), pos_examples))
token_neg = list(map(lambda ex: word_tokenize(ex.translate(translator)), neg_examples))

# Attempt at removing stop-words 
# token_pos = list(map(lambda ex: word_tokenize(ex), pos_examples))
# token_neg = list(map(lambda ex: word_tokenize(ex), neg_examples))
# stop_words = set(stopwords.words('english'))

# token_pos = list(map(lambda ex: [w for w in ex if not w in stop_words] , token_pos))
# token_neg = list(map(lambda ex: [w for w in ex if not w in stop_words] , token_neg))

# token_pos = list(map(lambda ex: [w for w in ex if not w in string.punctuation] , token_pos))
# token_neg = list(map(lambda ex: [w for w in ex if not w in string.punctuation] , token_neg))

# 2. Get vocabulary size and max sentence length
max_sentence_length = 0
for ex_p, ex_n in zip(token_pos, token_neg):
    max_sentence_length = max(max_sentence_length, len(ex_p), len(ex_n))
    for word in ex_p:
        vocab[word] += 1
    for word in ex_n:
        vocab[word] += 1

vocab_size = len(vocab)
print("Vocabulary size: " + str(vocab_size))
print("Max sentence length: " + str(max_sentence_length))

# 3. One-Hot encode and pad.
encoded_pos = [preprocessing.text.one_hot(ex, vocab_size) for ex in pos_examples]
padded_pos = preprocessing.sequence.pad_sequences(encoded_pos, maxlen=max_sentence_length, padding='post')
encoded_neg = [preprocessing.text.one_hot(ex, vocab_size) for ex in neg_examples]
padded_neg = preprocessing.sequence.pad_sequences(encoded_neg, maxlen=max_sentence_length, padding='post')

X = np.concatenate((padded_pos, padded_neg))
y = np.concatenate((np.ones(padded_pos.shape[0]), np.zeros(padded_neg.shape[0])))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

Vocabulary size: 20490
Max sentence length: 51
(5331, 51)
(5331, 51)
(8529, 51) (8529,)
(2133, 51) (2133,)


# Model Definition

In [0]:
# 4. Create the CNN.
def create_model(dropout_rate=0.9, embed_dim=300, learning_rate=0.2, hidden_units=200):
  adadelta = Adadelta(lr=learning_rate)
  sent_length = max_sentence_length
  filter_sizes = [3,4,5]

  inputs = Input(shape=(sent_length,), dtype='int32')
  embed = Embedding(input_dim = vocab_size,
                  output_dim = embed_dim,
                  input_length = sent_length)(inputs)
  reshape = Reshape((sent_length, embed_dim, 1))(embed)

  shapes = []
  for fsize in filter_sizes:
    shapes.append(((fsize, embed_dim),(sent_length - fsize + 1, 1)))

  pools = []
  for shape in shapes:
    (fshape, pshape) = shape
    conv = Conv2D(filters=hidden_units, kernel_size=fshape, activation='relu', kernel_regularizer=l2(0.01))(reshape)
    pool = MaxPool2D(pool_size=pshape, strides=(1,1), padding='valid')(conv)
    pools.append(pool)

  concat = Concatenate(axis=1)(pools)
  flat = Flatten()(concat)
  dropout = Dropout(dropout_rate)(flat)
  output = Dense(units=1, activation='sigmoid')(dropout)

  model = Model(inputs=inputs, outputs=output)
  print(model.summary())

  model.compile(loss='binary_crossentropy', optimizer=adadelta, metrics=['accuracy'])
  return model



# Training

In [51]:
# Reduce the learning rate when the validation accuracy is not increasing for 3 epochs
learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=5, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.0001)

# Save the best results in a file to my drive
# mcp_save = ModelCheckpoint('drive/My Drive/Colab Notebooks/model.h5', save_best_only=True, monitor='val_loss', mode='min', verbose=1)

#  Train the model
# history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), shuffle=True, callbacks=[learning_rate_reduction])

# HYPERPARAMETERS TO TUNE
dropout_rates = [0.5,0.9]
learning_rates = [0.1,1]
epochs = [25,50]
batch_size = [25]

model = KerasClassifier(build_fn=create_model)
param_grid = dict(batch_size=batch_size,
                  epochs=epochs,
                  dropout_rate=dropout_rates,
                  learning_rate=learning_rates)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=4, verbose=100)
grid_result = grid.fit(X_train, y_train)


print("Best Accuracy: %.2f using %s" % (grid_result.best_score_, grid_result.best_params_))


# Load best saved state of model and evaluate with this
# model.load_weights('drive/My Drive/Colab Notebooks/model.h5')




Fitting 4 folds for each of 8 candidates, totalling 32 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
Memmapping (shape=(8529, 51), dtype=int32) to new file /dev/shm/joblib_memmapping_folder_127_4032850145/127-139870684060304-333d6ea6440d4c19935390867b1cab99.pkl
Pickling array (shape=(8529,), dtype=float64).
Pickling array (shape=(6396,), dtype=int64).
Pickling array (shape=(2133,), dtype=int64).
Memmapping (shape=(8529, 51), dtype=int32) to old file /dev/shm/joblib_memmapping_folder_127_4032850145/127-139870684060304-333d6ea6440d4c19935390867b1cab99.pkl
Pickling array (shape=(8529,), dtype=float64).
Pickling array (shape=(6397,), dtype=int64).
Pickling array (shape=(2132,), dtype=int64).
Memmapping (shape=(8529, 51), dtype=int32) to old file /dev/shm/joblib_memmapping_folder_127_4032850145/127-139870684060304-333d6ea6440d4c19935390867b1cab99.pkl
Pickling array (shape=(8529,), dtype=float64).
Pickling array (shape=(6397,), dtype=int64).
Pickling array 



[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  7.9min
Memmapping (shape=(8529, 51), dtype=int32) to old file /dev/shm/joblib_memmapping_folder_127_4032850145/127-139870684060304-333d6ea6440d4c19935390867b1cab99.pkl
Pickling array (shape=(8529,), dtype=float64).
Pickling array (shape=(6397,), dtype=int64).
Pickling array (shape=(2132,), dtype=int64).
Memmapping (shape=(8529, 51), dtype=int32) to old file /dev/shm/joblib_memmapping_folder_127_4032850145/127-139870684060304-333d6ea6440d4c19935390867b1cab99.pkl
Pickling array (shape=(8529,), dtype=float64).
Pickling array (shape=(6397,), dtype=int64).
Pickling array (shape=(2132,), dtype=int64).
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 11.8min
Memmapping (shape=(8529, 51), dtype=int32) to old file /dev/shm/joblib_memmapping_folder_127_4032850145/127-139870684060304-333d6ea6440d4c19935390867b1cab99.pkl
Pickling array (shape=(8529,), dtype=float64).
Pickling array (shape=(6397,), dtype=int64).
Pickling array (shape=(2132

In [56]:
# Final evaluation of the model
from sklearn.metrics import classification_report
y_true, y_pred = y_test, grid.predict(X_test)
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.70      0.78      0.73      1033
         1.0       0.76      0.69      0.72      1100

    accuracy                           0.73      2133
   macro avg       0.73      0.73      0.73      2133
weighted avg       0.73      0.73      0.73      2133



# Plotting Accuracy

In [0]:
# Plot learning curve
import matplotlib.pyplot as plt
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()