In [1]:

# coding: utf-8

# ## CNN for Text Classification
# Implementation of *Convolutional Neural Networks for Sentence Classification* (Yoon Kim, 2014).
# 
# In his [paper](https://arxiv.org/abs/1408.5882), Yoon Kim proposed several techniques to achieve good text classification accuracy with minimal hyper-parameter tuning.
# 
# This notebook consist of 4 main sections:
# 
# 1. Preparing the data
# 2. Implementing Yoon Kim's CNN model
# 3. Training the model
# 4. Evaluating the model

# **Key Model Parameters**

# In[1]:


MAX_NB_WORDS = 100000 # max no. of words for tokenizer # tokenizer: add a number value to each unique word
MAX_SEQUENCE_LENGTH = 30 # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2
EMBEDDING_DIM = 100 # embedding dimensions for word vectors (word2vec/GloVe)
# download GloVe file from
# https://tlkh.design/downloads/glove.6B.100d.txt.zip
# and place it in glove/
GLOVE_DIR = "glove/glove.6B."+str(EMBEDDING_DIM)+"d.txt"


# In[2]:

'''
import numpy as np
import pandas as pd
import re, sys, os, csv, keras, pickle
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from keras.layers import *
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
'''

# ### 1. Prepare the data
# **Read from dataset**

# In[3]:


# from nltk.corpus import stopwords # stopwords: things that don't really have any meaning for 
                                  # the purposes of our classification; only there to make things grammatically correct.
                                  # for example, stopwords are words like "the", "and", etcetera.
def clean_text(text):
    output = ""
    text = str(text).replace("\n", "") # get rid of all newlines
    text = re.sub(r'[^\w\s]','',text).lower().split(" ") # turn it into a list
    for word in text: # for each word in the list of words
        if word not in stopwords.words("english"): # if word is not a stopword
            output = output + " " + word # add word to output string
    return str(output.strip())[1:-3].replace("  ", " ") # remove leading and trailing whitespace


# In[4]:


texts, labels = [], [] # empty lists for the sentences and labels

dir_path = os.path.dirname(os.path.realpath(__file__)) # get directory of current python file
data_path_neg = os.path.join(dir_path, "datasets", "stanford_movie_neg.txt") # negative datasets' file path
data_path_pos = os.path.join(dir_path, "datasets", "stanford_movie_pos.txt") # positive datasets' file path

print(data_path_neg, data_path_pos)

raise

data_neg = open(data_path_neg, "rb") 
for line in data_neg: 
    texts.append(clean_text(line)) # removing all the stopwords
    labels.append(int(0)) # label all negative texts with 0


# In[5]:


data_pos = open(data_path_pos, "rb") 
for line in data_pos: 
    texts.append(clean_text(line))
    labels.append(int(1)) # label all positive texts with 1


## now we are left with texts for the cleaned corpus, and labels for the labelling of pos and neg. neg = 0, pos = 1

# In[6]:
## not very useful for the training. 

# print("Sample positive:", texts[0], labels[0])
# print("Sample negative:", texts[9000], labels[9000])


# **Word Tokenizer**

# In[7]:
"""
class Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, **kwargs)
Text tokenization utility class.

This class allows to vectorize a text corpus, by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or into a vector where the coefficient for each token could be binary, based on word count, based on tf-idf...

# Arguments

    num_words: the maximum number of words to keep, based  
        on word frequency. Only the most common `num_words` words will  
        be kept.  
    filters: a string where each element is a character that will be  
        filtered from the texts. The default is all punctuation, plus  
        tabs and line breaks, minus the `'` character.  
    lower: boolean. Whether to convert the texts to lowercase.  
    split: str. Separator for word splitting.  
    char_level: if True, every character will be treated as a token.  
    oov_token: if given, it will be added to word_index and used to  
        replace out-of-vocabulary words during text_to_sequence calls  
By default, all punctuation is removed, turning the texts into space-separated sequences of words (words maybe include the ' character). These sequences are then split into lists of tokens. They will then be indexed or vectorized.

0 is a reserved index that won't be assigned to any word.
"""

tokenizer = Tokenizer(num_words=MAX_NB_WORDS) # num_words is the max number of words that the tokenizer will recognize
tokenizer.fit_on_texts(texts) # tokenizer will create a new entry for every word it encounters
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # why pickle? certain things cannot be saved as str or int types. 
    # pickle lets us serialize these objects to be saved and reused later.
print("[i] Saved word tokenizer to file: tokenizer.pickle") 

with open('tokenizer.pickle', 'rb') as handle:    
    tokenizer = pickle.load(handle) # load a previously generated Tokenizer


# **Generate the array of sequences from dataset**

# In[8]:


sequences = tokenizer.texts_to_sequences(texts) # convert the text corpus to sequences. the main point of tokenizer
word_index = tokenizer.word_index # word_index: unique tokens
print('[i] Found %s unique tokens.' % len(word_index))
data_int = pad_sequences(sequences, padding='pre', maxlen=(MAX_SEQUENCE_LENGTH-5)) # convert to 2D np array
data = pad_sequences(data_int, padding='post', maxlen=(MAX_SEQUENCE_LENGTH))

"""
pad_sequences Documentation:
https://keras.io/preprocessing/sequence/

Pads sequences to the same length.

This function transforms a list of num_samples sequences (lists of integers) into a 2D Numpy array of shape (num_samples, num_timesteps). num_timesteps is either the maxlen argument if provided, or the length of the longest sequence otherwise.

Sequences that are shorter than num_timesteps are padded with value at the end.

Sequences longer than num_timesteps are truncated so that they fit the desired length. The position where padding or truncation happens is determined by the arguments padding and  truncating, respectively.

Pre-padding is the default.

Arguments

sequences: List of lists, where each element is a sequence.
maxlen: Int, maximum length of all sequences.
dtype: Type of the output sequences.
padding: String, 'pre' or 'post': pad either before or after each sequence.
truncating: String, 'pre' or 'post': remove values from sequences larger than maxlen, either at the beginning or at the end of the sequences.
value: Float, padding value.

Returns

x: Numpy array with shape (len(sequences), maxlen)

Raises

ValueError: In case of invalid values for truncating or padding, or in case of invalid shape for a sequences entry.
"""


# **Create the train-validation split**

# In[9]:


labels = to_categorical(np.asarray(labels)) # Converts a class vector (integers) to binary class matrix.
print('[i] Shape of data tensor:', data.shape)
print('[i] Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0]) # np.arange(5) returns array([0, 1, 2, 3, 4])
np.random.shuffle(indices) # shuffle contents of np array
data = data[indices] 
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('[i] Number of entries in each category:')
print("[+] Training:",y_train.sum(axis=0))
print("[+] Validation:",y_val.sum(axis=0))

# separate the data that you are going to use for training from the one you will use for validating

# **What does the data look like?**

# In[10]:


print("Tokenized sequence:\n", data[0])
print("")
print("One-hot label:\n", labels[0])


# ### 2. Create the model
# Yoon Kim's model has several notable features:
# ![model-structure](notebook_imgs/yoon_kim_structure.png)
# * two sets of word embeddings for what he terms a **"multi-channel" approach**.
#   * One of the word embeddings will be frozen (**"static channel"**), 
#     and one will be modified during the training process (**"non-static channel"**). 
# * multiple convolutional kernel sizes
# 
# We will now start to create the model in `Keras`.

# **Load word embeddings into an `embeddings_index`**
# 
# Create an index of words mapped to known embeddings, by parsing the data dump of pre-trained embeddings.
# 
# We use a set from [pre-trained GloVe vectors from Stanford](https://nlp.stanford.edu/projects/glove/).

# In[11]:


embeddings_index = {}
f = open(GLOVE_DIR, encoding="utf8")
print("[i] (long) Loading GloVe from:",GLOVE_DIR,"...",end="")
for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32') # create an embedding index
f.close()
print("Done.\n[+] Proceeding with Embedding Matrix...", end="")
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
print(" Completed!")
# the 0th dimension for keras is equal to the number of entries of the batch
# 1st dimension is the word sequence
# 2nd dimension is the word vectors


# In[12]:


# second embedding matrix for non-static channel
embedding_matrix_ns = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix_ns[i] = embedding_vector


# **Create the `Embedding` layers**

# In[13]:


sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32') # input to the model

# static channel
embedding_layer_frozen = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
embedded_sequences_frozen = embedding_layer_frozen(sequence_input)

# non-static channel
embedding_layer_train = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix_ns],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
embedded_sequences_train = embedding_layer_train(sequence_input)

l_lstm_1 = Concatenate(axis=1)([embedded_sequences_frozen, embedded_sequences_train])


# **Create the CNN layer with multiple kernel (filter) sizes**

# In[14]:


l_conv_3 = Conv1D(filters=128,kernel_size=3,activation='relu',kernel_regularizer=regularizers.l2(0.001))(l_lstm_1)
l_conv_4 = Conv1D(filters=128,kernel_size=4,activation='relu',kernel_regularizer=regularizers.l2(0.001))(l_lstm_1)
l_conv_5 = Conv1D(filters=128,kernel_size=5,activation='relu',kernel_regularizer=regularizers.l2(0.001))(l_lstm_1)
# filters: number of matrices
# kernel_size: size of the matrix; for kernel_size=3, matrix is a 3 x 1 matrix. (Because it is 1D in this case.)
# activation: activation function. we are use relu in this case.
l_conv = Concatenate(axis=1)([l_conv_3, l_conv_4, l_conv_5]) # concat all the 3 outputs


# Followed by the rest of the model (boring!!)

# In[16]:


l_pool = MaxPooling1D(4)(l_conv) # selects the strongest output out of every 4 outputs
l_drop = Dropout(0.2)(l_pool) # dropout 20% of the output of l_pool
# l_flat = Flatten()(1_pool) # flatten to 1 dimension; if using l_drop you may lose too much data as the dataset is small
l_dense = Dense(32, activation='relu')(l_flat) # 32 perceptrons
preds = Dense(2, activation='softmax')(l_dense) #follows the number of classes


# **Compile the model into a static graph for training**

# In[17]:


model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer="sgd",
              metrics=['acc'])
model.summary()


# **Visualisation**

# In[18]:

'''
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
SVG(model_to_dot(model).create(prog='dot', format='svg'))
'''
# not necessary. uncomment if needed.

# In[19]:


# Keras callback functions
tensorboard = callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=16, write_grads=True , write_graph=True)
model_checkpoints = callbacks.ModelCheckpoint("checkpoint-{val_loss:.3f}.h5", monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=0)


# In[ ]:


# remove TensorFlow logs directory and old checkpoints
get_ipython().system('rm -r logs *.h5')


# In[ ]:


#model = keras.models.load_model("checkpoint-0.91.h5") # in case you ever want to load from a checkpoint


# ### 3. Train the model

# In[20]:


print("Training Progress:")
model_log = model.fit(x_train, y_train, validation_data=(x_val, y_val),
            epochs=12, batch_size=32, # around 32 for CPU, higher for GPU
            callbacks=[tensorboard, model_checkpoints]) # publish to tensorboard and save automatic checkpoints
# 1 epoch = 1 run of the program
#pd.DataFrame(model_log.history).to_csv("history.csv") # save the training progress to a csv file


# ### 4. Evaluate the model

# In[21]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")

plt.plot(model_log.history['acc'])
plt.plot(model_log.history['val_acc'])
plt.title('accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

plt.plot(model_log.history['loss'])
plt.plot(model_log.history['val_loss'])
plt.title('loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()


# In[22]:


from sklearn.metrics import classification_report, confusion_matrix
import itertools, pickle

classes = ["positive", "negative"]


# In[23]:


Y_test = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict(x_val)
y_pred_class = np.argmax(y_pred,axis=1)
cnf_matrix = confusion_matrix(Y_test, y_pred_class)
print(classification_report(Y_test, y_pred_class, target_names=classes))


# In[24]:


def plot_confusion_matrix(cm, labels,
                          normalize=True,
                          title='Confusion Matrix (Validation Set)',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        #print("Normalized confusion matrix")
    else:
        #print('Confusion matrix, without normalization')
        pass

    #print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

plt.figure(figsize=(20,10))
plot_confusion_matrix(cnf_matrix, labels=classes)



SyntaxError: invalid token (<ipython-input-1-78b5be64c37a>, line 320)