In [None]:
import pandas as pd
import numpy as np
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

from keras.models import Sequential, Model
from keras.layers import Dense,Embedding,LSTM,GRU,Flatten,Dropout, Input, Activation
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.layers.convolutional import Conv1D,MaxPooling1D
from keras.layers import Concatenate, Activation
from tensorflow.keras import optimizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard

In [None]:
# Fetch the NLTK resources used later in the notebook: the stop-word lists
# (read through stopwords.words) and the 'punkt' tokenizer data
# (presumably what word_tokenize needs at runtime).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

# Importing IMDb Dataset

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset CSVs and the
# log/checkpoint paths under drive/MyDrive/ used below can be resolved.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the three pre-split IMDb CSVs (train / validation / test) from Drive.
df_train, df_val, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "drive/MyDrive/NNDL_Datasets/IMDB_train.csv",
        "drive/MyDrive/NNDL_Datasets/IMDB_val.csv",
        "drive/MyDrive/NNDL_Datasets/IMDB_test.csv",
    )
)

In [None]:
# Row counts of the (train, validation, test) splits.
tuple(len(split) for split in (df_train, df_val, df_test))

(35000, 7500, 7500)

# Text Preprocessing

In [None]:
# English stop words as a set, so the per-token membership test is fast.
stop_words = set(stopwords.words("english"))

# Translation table that deletes every character in string.punctuation.
# Built once here instead of once per review inside the loop.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def text_preprocessing(dataframe, text_col="clean_text", out_col="prepro_rev"):
  """Tokenise and normalise the review text stored in ``dataframe``.

  Each review is split with ``word_tokenize``, lower-cased, stripped of
  punctuation characters, reduced to purely alphabetic tokens, filtered
  against ``stop_words`` and re-joined with single spaces.

  Args:
    dataframe: pandas DataFrame holding one review per row.
    text_col: name of the column the raw text is read from.
    out_col: name of the column the cleaned text is written to.

  Returns:
    The same ``dataframe`` object; it is modified in place, with ``out_col``
    added (or overwritten).
  """
  reviews = []
  for line in dataframe[text_col].values.tolist():
    lowered = (w.lower() for w in word_tokenize(line))
    stripped = (w.translate(_PUNCT_TABLE) for w in lowered)
    # isalpha() also drops tokens that became empty once punctuation was removed.
    words = [w for w in stripped if w.isalpha() and w not in stop_words]
    reviews.append(' '.join(words))

  dataframe[out_col] = reviews
  return dataframe


In [None]:
# Clean the review text of every split; text_preprocessing returns the frame
# it was given, so the names are simply re-bound.
df_train, df_val, df_test = map(text_preprocessing, (df_train, df_val, df_test))

In [None]:
# lines = df["clean_text"].values.tolist()
# stop_words = set(stopwords.words("english"))
# reviews = list()
# for line in lines:
#     tokens = word_tokenize(line)
#     tokens = [w.lower() for w in tokens]
#     table = str.maketrans("","",string.punctuation)
#     stripped = [w.translate(table) for w in tokens]
#     words = [w for w in stripped if w.isalpha()]
#     words = [w for w in words if w not in stop_words]
#     reviews.append(words)

# reviews = [' '.join(review) for review in reviews]
# df['prepro_rev'] = reviews

In [None]:
# train_df = df[df['data_type'] == 'train']
# test_df = df[df['data_type'] == 'val']

# Cleaned review text of each split as a plain list of lower-cased strings.
train_text = [review.lower() for review in df_train['prepro_rev'].values]
val_text = [review.lower() for review in df_val['prepro_rev'].values]
test_text = [review.lower() for review in df_test['prepro_rev'].values]

# Target labels of each split as NumPy arrays.
train_classes, val_classes, test_classes = (
    split['labels'].values for split in (df_train, df_val, df_test)
)

### Tokenizer for Base Model

In [None]:
# Width of each word-embedding vector (output dimension of the Embedding layer).
EMBED_DIM_WORD = 768
# Largest word index kept by the word-level pipeline; words ranked beyond it
# are mapped to the 'UNK' token.
# NOTE(review): 768 / 30522 presumably mirror BERT-base's hidden size and
# vocabulary size — confirm the intent.
MAX_NUM_WORDS_WORD = 30522
# Word sequences are padded / truncated to this many tokens.
MAX_SEQUENCE_LENGTH_WORD = 400
# NOTE(review): never read in the visible notebook; the EarlyStopping callbacks
# are always passed to fit().
do_early_stopping = True
# Input dimension of the Embedding layer: indices 0..MAX_NUM_WORDS_WORD.
VOCAB_SIZE_WORD = MAX_NUM_WORDS_WORD+1

In [None]:
# Word-level tokenizer; its vocabulary is learned from the training split only.
tokenizer1 = Tokenizer(
    num_words=MAX_NUM_WORDS_WORD + 1,
    oov_token='UNK',
)
tokenizer1.fit_on_texts(train_text)

In [None]:
def custom_text_to_word_sequence(texts, modelTokenizer):
  """Map space-separated texts to lists of vocabulary indices.

  Every text is split on single spaces and empty pieces are discarded. A word
  is replaced by its entry in ``modelTokenizer.word_index``; words that are
  missing from the index, or whose index exceeds ``MAX_NUM_WORDS_WORD``, are
  replaced by the index of the ``'UNK'`` token.

  Args:
    texts: iterable of strings.
    modelTokenizer: object exposing a ``word_index`` dict (word -> int).

  Returns:
    A list with one list of integer indices per input text.
  """
  def encode(word):
    position = modelTokenizer.word_index.get(word)
    if position is None or position > MAX_NUM_WORDS_WORD:
      # Unknown word, or a word ranked beyond the vocabulary cap.
      return modelTokenizer.word_index['UNK']
    return position

  return [
      [encode(piece) for piece in text.split(' ') if piece]
      for text in texts
  ]

In [None]:
# Encode every split as word-index sequences with the word-level tokenizer.
x_train_tokens1, x_val_tokens1, x_test_tokens1 = (
    custom_text_to_word_sequence(split_text, tokenizer1)
    for split_text in (train_text, val_text, test_text)
)

# Pad at the end of each sequence up to the fixed word-model length.
x_train_pad1, x_val_pad1, x_test_pad1 = (
    pad_sequences(split_tokens, maxlen=MAX_SEQUENCE_LENGTH_WORD, padding="post")
    for split_tokens in (x_train_tokens1, x_val_tokens1, x_test_tokens1)
)

### Tokenizer for Character Level Model

In [None]:
# Fixed length, in characters, that every review is padded / truncated to for
# the character-level model.
# NOTE(review): 1014 presumably follows the character-level CNN paper
# (Zhang et al., 2015) — confirm.
MAX_SEQUENCE_LENGTH_CHAR = 1014

In [None]:
# Character-level tokenizer with no vocabulary cap, fitted on the training
# split. Its learned word_index is replaced by a fixed alphabet afterwards.
tokenizer2 = Tokenizer(
    num_words=None,
    char_level=True,
    oov_token='UNK',
)
tokenizer2.fit_on_texts(train_text)

In [None]:
# Fixed character vocabulary: each character of `alphabet` gets a 1-based
# index in order of appearance.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = dict(zip(alphabet, range(1, len(alphabet) + 1)))

# Replace the fitted vocabulary with the fixed alphabet and give the
# out-of-vocabulary token the next free index after it.
tokenizer2.word_index = {
    **char_dict,
    tokenizer2.oov_token: max(char_dict.values()) + 1,
}

In [None]:
# Encode every split as character-index sequences.
x_train_tokens2, x_val_tokens2, x_test_tokens2 = map(
    tokenizer2.texts_to_sequences, (train_text, val_text, test_text)
)

# Pad at the end of each sequence up to the fixed character-model length.
x_train_pad2, x_val_pad2, x_test_pad2 = (
    pad_sequences(char_ids, maxlen=MAX_SEQUENCE_LENGTH_CHAR, padding="post")
    for char_ids in (x_train_tokens2, x_val_tokens2, x_test_tokens2)
)

In [None]:
# import gensim
# word2vec_model = gensim.models.Word2Vec(sentences=train_text,size=EMBED_DIM,window=5,workers=4,min_count=1)
# vocab_size = MAX_NUM_WORDS+1
# embedding_weights = np.zeros((vocab_size,EMBED_DIM))
# for word,i in tokenizer.word_index.items():
#   if (i<=vocab_size):
#     try:
#       vector = word2vec_model.wv.get_vector(word)
#       embedding_weights[i] = vector
#     except KeyError:
#       pass
#   else:
#       break

# Baseline CNN

In [None]:
# Stand-alone integer input tensor.
# NOTE(review): it is not wired into `graph` or `model1` below, and nothing
# else in the visible notebook reads it.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH_WORD,), dtype='int32')
# Trainable word embedding: VOCAB_SIZE_WORD indices -> EMBED_DIM_WORD-d vectors.
embedding_layer = Embedding(VOCAB_SIZE_WORD, EMBED_DIM_WORD, input_length=MAX_SEQUENCE_LENGTH_WORD)

# Feature extractor over the embedded sequence: one branch per kernel size,
# each Conv1D -> Dropout -> MaxPooling1D -> Flatten, concatenated at the end.
input_node = Input(shape=(MAX_SEQUENCE_LENGTH_WORD, EMBED_DIM_WORD))
conv_list = []
for kernel_size in (3, 8):
    conv = Conv1D(filters=10, kernel_size=kernel_size, activation='relu')(input_node)
    drop = Dropout(0.3)(conv)
    pool = MaxPooling1D(pool_size=2)(drop)
    flatten = Flatten()(pool)
    conv_list.append(flatten)

out = Concatenate()(conv_list)
graph = Model(inputs = input_node, outputs = out)

# Full classifier: embedding -> dropout -> conv feature extractor -> dense
# head with a single sigmoid unit for binary sentiment.
model1 = Sequential()
model1.add(embedding_layer)
model1.add(Dropout(0.5, input_shape=(MAX_SEQUENCE_LENGTH_WORD, EMBED_DIM_WORD)))
model1.add(graph)
model1.add(Dense(50))
model1.add(Activation('relu'))
model1.add(Dropout(0.5))
model1.add(Dense(1, activation='sigmoid'))

optimizer = optimizers.Adam(learning_rate=0.0004)

# The metric is named 'acc', so the validation log key is 'val_acc'.
model1.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 768)          23441664  
                                                                 
 dropout_2 (Dropout)         (None, 400, 768)          0         
                                                                 
 model (Functional)          (None, 3950)              84500     
                                                                 
 dense (Dense)               (None, 50)                197550    
                                                                 
 activation (Activation)     (None, 50)                0         
                                                                 
 dropout_3 (Dropout)         (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 5

In [None]:
# Callbacks for the baseline CNN run: TensorBoard logging, early stopping on
# validation loss, and a checkpoint of the best model by validation accuracy.
tensorboard1 = TensorBoard(
    log_dir='drive/MyDrive/NNDL_Project/Dataset1/CNN_Baseline_logs/',
    histogram_freq=0,
    write_graph=True,
)
early_stopping1 = EarlyStopping(monitor='val_loss', mode='min', patience=2)
cp1 = ModelCheckpoint(
    'drive/MyDrive/NNDL_Project/Dataset1/CNN_Baseline_bestModel.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
)

print('using early stopping strategy')
history1 = model1.fit(
    x_train_pad1,
    train_classes,
    validation_data=(x_val_pad1, val_classes),
    epochs=5,
    batch_size=16,
    callbacks=[early_stopping1, cp1, tensorboard1],
)

using early stopping strategy
Epoch 1/5
Epoch 2/5
Epoch 3/5


In [None]:
# Evaluate the baseline CNN on the held-out test split.
results1 = model1.evaluate(
    x=x_test_pad1,
    y=test_classes,
    batch_size=16,
)



In [None]:
# %tensorboard --logdir 'drive/MyDrive/NNDL_Project/CNN_Baseline_logs/'

# Character Level CNN

In [None]:
# Convolution stack of the character-level CNN. Each row is
# [number of filters, kernel size, max-pool size]; a pool size of -1 means
# no pooling layer follows that convolution.
conv_layers_sm = [[256, 7, 3],
                  [256, 7, 3],
                  [256, 3, -1],
                  [256, 3, -1],
                  [256, 3, -1],
                  [256, 3, 3]]

# Widths of the fully connected layers applied to the flattened conv output.
fully_connected_layers_sm = [1024, 1024]

# NOTE(review): not referenced by the visible model code, which ends in a
# single sigmoid unit.
num_of_classes = 2
# Dropout rate applied after each fully connected layer.
dropout_p = 0.5
# NOTE: rebinds `optimizer`, which the baseline-CNN cell bound to an Adam instance.
optimizer = 'adam'
loss = 'binary_crossentropy'
# Embedding width is set equal to the character vocabulary size
# (alphabet entries plus the 'UNK' entry).
EMBED_DIM_CHAR = len(tokenizer2.word_index)
# One extra embedding row, since character indices start at 1
# (index 0 is presumably left for padding — confirm).
VOCAB_SIZE_CHAR = EMBED_DIM_CHAR + 1

In [None]:
# Model Construction
# Input
# Character-level CNN: embedding -> stacked Conv1D/ReLU (optionally pooled)
# -> flatten -> fully connected layers with dropout -> sigmoid output.

# Input: one row of MAX_SEQUENCE_LENGTH_CHAR character indices per review.
inputs = Input(shape=(MAX_SEQUENCE_LENGTH_CHAR,), name='input', dtype='int64')  # shape=(?, 1014)
# Embedding
x = Embedding(VOCAB_SIZE_CHAR,
              EMBED_DIM_CHAR,
              input_length=MAX_SEQUENCE_LENGTH_CHAR,)(inputs)
# Convolution stack; a pooling_size of -1 means "no pooling after this conv".
for filter_num, filter_size, pooling_size in conv_layers_sm:
    x = Conv1D(filter_num, filter_size)(x)
    x = Activation('relu')(x)
    if pooling_size != -1:
        x = MaxPooling1D(pool_size=pooling_size)(x)
# After the last conv/pool stage the tensor is (None, 34, 256).
x = Flatten()(x)  # (None, 8704)
# Fully connected layers, each followed by dropout.
for dense_size in fully_connected_layers_sm:
    x = Dense(dense_size, activation='relu')(x)  # dense_size == 1024
    x = Dropout(dropout_p)(x)
# Output layer: a single sigmoid unit for the binary label.
predictions = Dense(1, activation='sigmoid')(x)
# Build and compile the model.
model2 = Model(inputs=inputs, outputs=predictions)
model2.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])  # Adam, binary_crossentropy
# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
model2.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 1014)]            0         
                                                                 
 embedding_2 (Embedding)     (None, 1014, 69)          4830      
                                                                 
 conv1d_12 (Conv1D)          (None, 1008, 256)         123904    
                                                                 
 activation_12 (Activation)  (None, 1008, 256)         0         
                                                                 
 max_pooling1d_6 (MaxPooling  (None, 336, 256)         0         
 1D)                                                             
                                                                 
 conv1d_13 (Conv1D)          (None, 330, 256)          459008    
                                                           

In [None]:
# Callbacks for the character-level CNN run: TensorBoard logging, early
# stopping on validation loss, and a checkpoint of the best model.
tensorboard2 = TensorBoard(log_dir='drive/MyDrive/NNDL_Project/Dataset1/CNN_Char_sm_logs/', histogram_freq=0, write_graph=True)
early_stopping2 = EarlyStopping(monitor='val_loss', patience = 2, mode = 'min')
# model2 is compiled with metrics=['accuracy'], so the validation metric is
# logged as 'val_accuracy'. Monitoring 'val_acc' would never match a logged
# key and the checkpoint would be skipped on every epoch.
cp2 = ModelCheckpoint('drive/MyDrive/NNDL_Project/Dataset1/CNN_Char_sm_bestModel.h5', monitor='val_accuracy', save_best_only=True, mode='max')

print('using early stopping strategy')
history2 = model2.fit(x_train_pad2, train_classes, validation_data=(x_val_pad2, val_classes), epochs=5, batch_size=16, callbacks = [early_stopping2, cp2, tensorboard2])

using early stopping strategy
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluate the character-level CNN on the held-out test split.
results2 = model2.evaluate(
    x=x_test_pad2,
    y=test_classes,
    batch_size=16,
)

