In [3]:
import numpy as np
import re
import itertools
from collections import Counter
import tensorflow as tf
import numpy as np

# Auxiliary functions

In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, applied in order:
      1. every character other than letters, digits, parentheses, comma,
         exclamation mark, question mark, apostrophe and backtick becomes a space;
      2. the clitics 's, 've, n't, 're, 'd and 'll are split off with a leading space;
      3. comma, exclamation mark, parentheses and question mark are padded with spaces;
      4. runs of whitespace are collapsed, the result is stripped and lower-cased.

    Note: as in the original implementation, parentheses and question marks are
    emitted with a literal backslash in front of them. That quirk is kept on
    purpose so the produced tokens stay identical to the reference tokenizer.

    Args:
        string: the raw sentence.

    Returns:
        The cleaned, lower-cased, space-separated sentence.
    """
    # (pattern, replacement) pairs. Everything is a raw string so that no
    # invalid escape sequence (e.g. "\(") is left for Python to warn about;
    # re.sub leaves unknown non-letter escapes in the replacement untouched,
    # which is what produces the literal "\(", "\)" and "\?" tokens.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()


def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Loads MR polarity data from files, cleans each sentence and generates labels.

    Each line of a file is treated as one example. Lines are stripped and then
    passed through ``clean_str``.

    Args:
        positive_data_file: path of the text file holding positive examples.
        negative_data_file: path of the text file holding negative examples.

    Returns:
        ``[x_text, y]`` where ``x_text`` is the list of cleaned sentences
        (positives first, then negatives) and ``y`` is an integer array with
        one row per sentence: ``[0, 1]`` for positive, ``[1, 0]`` for negative.
    """
    def _read_examples(path):
        # The context manager closes the file handle deterministically;
        # the previous bare open(...) left it to the garbage collector.
        with open(path, "r") as handle:
            return [line.strip() for line in handle]

    # Load data from files
    positive_examples = _read_examples(positive_data_file)
    negative_examples = _read_examples(negative_data_file)

    # Clean every sentence
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]

    # Generate one-hot labels. Building a single array and reshaping keeps the
    # (n, 2) shape even when one (or both) of the files is empty, where
    # concatenating an empty list with a 2-D list would fail.
    labels = [[0, 1] for _ in positive_examples] + [[1, 0] for _ in negative_examples]
    y = np.array(labels, dtype=int).reshape(-1, 2)
    return [x_text, y]


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    Args:
        data: sequence of examples; converted to a NumPy array.
        batch_size: maximum number of examples per batch.
        num_epochs: number of passes over ``data``.
        shuffle: when true, the examples are re-shuffled (via ``np.random``)
            at the start of every epoch.

    Yields:
        Consecutive slices of the (optionally shuffled) array. The last batch
        of an epoch may be shorter than ``batch_size``. Empty ``data`` yields
        nothing.
    """
    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division. The previous int((n - 1) / batch_size) + 1
    # relied on true division and evaluated to 1 for empty data on Python 3,
    # producing a spurious empty batch every epoch.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]


# Neural Network

In [10]:
class TextCNN(object):
    """
    A CNN for text classification.
    Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer.

    Written against the TensorFlow 1.x graph API (``tf.placeholder``,
    ``tf.concat(values, axis)``, keyword-only cross-entropy arguments).

    After construction the instance exposes the graph endpoints:
    ``input_x``, ``input_y``, ``dropout_keep_prob`` (placeholders),
    ``scores``, ``predictions``, ``loss`` and ``accuracy``.
    """
    def __init__(self, sequence_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters):
        """
        Build the graph.

        Args:
            sequence_length: The length of our sentences. All sentences are
                expected to be padded to this same length (the original notes
                mention 59 for their data set).
            num_classes: Number of classes in the output layer, two in our
                case (positive and negative).
            vocab_size: The size of our vocabulary. Needed to define the
                embedding matrix, which has shape [vocab_size, embedding_size].
            embedding_size: The dimensionality of our embeddings.
            filter_sizes: Iterable with the number of words each group of
                convolutional filters covers. For example, [3, 4, 5] means
                filters that slide over 3, 4 and 5 words respectively, for a
                total of 3 * num_filters filters.
            num_filters: The number of filters per filter size (see above).
        """
        # Materialise the sizes: the caller may hand in a one-shot iterator
        # (e.g. a Python 3 ``map`` object), but we need to both iterate over
        # it and take its length.
        filter_sizes = list(filter_sizes)

        # Placeholders
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # Embedding layer
        # Maps vocabulary word indices into low-dimensional vector
        # representations. It is essentially a lookup table that we learn
        # from data.
        # tf.device and tf.name_scope are two separate context managers.
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            # tf.nn.embedding_lookup creates the actual embedding operation
            self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x)
            # conv2d expects a 4-dimensional tensor (batch, height, width,
            # channel). The embedding result has no channel dimension, so we
            # add it manually, leaving a tensor of shape
            # [None, sequence_length, embedding_size, 1]. For images, for
            # example, the channel dimension would be 3.
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # Convolution + max-pooling layers
        # We use filters of different sizes. Because each convolution
        # produces tensors of different shapes we iterate through the sizes,
        # create a layer for each of them, and then merge the results into
        # one big feature vector.
        pooled_outputs = []
        for filter_size in filter_sizes:
            with tf.name_scope("conv-max-poll-{}".format(filter_size)):
                # Convolution layer
                # Shape: words covered by the filter, embedding size,
                # number of input channels, number of filters.
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                # Each filter spans the whole embedding width; only the
                # number of words it covers varies.
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                # "VALID" padding means that we slide the filter over the
                # sentence without padding the edges (a narrow convolution),
                # giving an output of shape
                # [batch, sequence_length - filter_size + 1, 1, num_filters].
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="conv"
                )
                # Activation function - apply nonlinearity
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Max pooling over the whole convolution output of this
                # filter size leaves a tensor of shape
                # [batch_size, 1, 1, num_filters].
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding="VALID",
                    name="pool")
                pooled_outputs.append(pooled)

        # Combine all pooled filters
        num_filters_total = num_filters * len(filter_sizes)
        # Merge everything along the filter axis (TF >= 1.0 argument order:
        # values first, axis second).
        self.h_pool = tf.concat(pooled_outputs, 3)
        # Flatten into one long feature vector of shape
        # [batch_size, num_filters_total].
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # Outputs
        with tf.name_scope("output"):
            W = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            # argmax returns the index holding the largest value
            self.predictions = tf.argmax(self.scores, 1, name="predictions")

        # Calculate mean cross-entropy loss
        # tf.nn.softmax_cross_entropy_with_logits computes the cross-entropy
        # loss of each example given our scores and the correct input labels.
        # Arguments are passed by keyword, as TF 1.x requires.
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            # Take the mean over the batch
            self.loss = tf.reduce_mean(losses)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
        

# Training procedure

In [None]:
# NOTE(review): FLAGS, x_train and vocabulary are not defined in any cell
# visible here; they must be created by an earlier cell for this one to
# survive "Restart Kernel -> Run All" - confirm.
with tf.Graph().as_default():
    # allow_soft_placement lets TensorFlow fall back on a device that
    # implements a given operation when the preferred device doesn't exist.
    # log_device_placement makes TensorFlow log on which devices (CPU or GPU)
    # it places operations.
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement
    )
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=2,
            vocab_size=len(vocabulary),
            embedding_size=FLAGS.embedding_dim,
            # list(...) because on Python 3 map() returns a one-shot iterator,
            # which can neither be measured with len() nor iterated twice.
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters
        )

        # Counter incremented by the optimizer on every parameter update.
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-4)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        # train_op is a newly created operation that we can run to perform a
        # gradient update on our parameters.
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
    