# Convolutional sentiment analysis model with three inputs to the model and one output for sentiment prediction.
## Data

In [146]:
# Hidden warnings
import warnings
warnings.simplefilter('ignore')

In [147]:
import string
import re
import numpy as np
from os import listdir
from nltk.corpus import stopwords
from pickle import dump


def load_doc(filename):
    """Read a whole text file and return its contents.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform default encoding.

    Returns:
        The complete file contents as one string.
    """
    # The context manager closes the handle even when read() raises, which
    # the previous explicit open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

def clean_doc(doc):
    """Convert raw review text into a list of filtered word tokens.

    The text is split on whitespace and every token has its ASCII
    punctuation removed. A token is kept only if it is purely alphabetic,
    is not in NLTK's English stop-word list and is longer than one
    character. Letter case is left untouched.

    Args:
        doc: Raw document text.

    Returns:
        The surviving tokens, in their original order.
    """
    raw_tokens = doc.split()
    punctuation = re.compile('[%s]' % re.escape(string.punctuation))
    english_stop_words = set(stopwords.words('english'))
    stripped = (punctuation.sub('', raw) for raw in raw_tokens)
    return [
        token
        for token in stripped
        if token.isalpha() and token not in english_stop_words and len(token) > 1
    ]

# Sanity check: load a single positive review, clean it and print the
# resulting tokens to confirm the cleaning pipeline behaves as expected.
filename = 'data/txt_sentoken/pos/cv000_29590.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

In [148]:
def process_docs(directory, is_train):
    """Load and clean every review in a directory that belongs to one split.

    Files whose name starts with 'cv9' make up the test split; every other
    file makes up the training split.

    Args:
        directory: Folder that contains one review per file.
        is_train: Truthy to select the training files, falsy to select the
            test files.

    Returns:
        A list with one token list (the output of clean_doc) per selected
        file, ordered by file name.
    """
    documents = list()
    # listdir() yields entries in arbitrary order; sorting makes the order of
    # the documents reproducible across runs and machines.
    for filename in sorted(listdir(directory)):
        is_test_file = filename.startswith('cv9')
        # Skip files that belong to the split that was not requested.
        if bool(is_train) == is_test_file:
            continue
        path = directory + '/' + filename
        documents.append(clean_doc(load_doc(path)))
    return documents

def load_clean_dataset(is_train):
    """Assemble the cleaned reviews and their labels for one split.

    Args:
        is_train: Forwarded to process_docs to choose the train or test split.

    Returns:
        A (docs, labels) tuple. docs holds the negative reviews followed by
        the positive ones; labels holds 0 for each negative review and 1 for
        each positive review, in the same order.
    """
    negative_docs = process_docs('data/txt_sentoken/neg', is_train)
    positive_docs = process_docs('data/txt_sentoken/pos', is_train)
    labels = [0] * len(negative_docs) + [1] * len(positive_docs)
    return negative_docs + positive_docs, labels

def save_dataset(dataset, filename):
    """Pickle a dataset to disk and report the file that was written.

    Args:
        dataset: Any picklable object (the notebook passes [docs, labels]).
        filename: Destination path; an existing file is overwritten.
    """
    # Use a context manager so the file is flushed and closed before the
    # function returns; the handle was previously left open.
    with open(filename, 'wb') as file:
        dump(dataset, file)
    print('Saved: %s' % filename)

In [149]:

# Load and clean both splits (True selects the training files, False the test files)
train_docs, ytrain = load_clean_dataset(True)
test_docs, ytest = load_clean_dataset(False)
# Persist each split as a [documents, labels] pair so later cells can reload it
save_dataset([train_docs,ytrain], "train.pkl")
save_dataset([test_docs,ytest], "test.pkl")

Saved: train.pkl
Saved: test.pkl


## Develop the multi-channel model

The network uses three distinct inputs, each processed by its own convolutional network with a different kernel size. Each convolutional channel is designed to capture patterns in sequences of words of varying length, grouped according to the kernel size specific to that layer (2, 4 and 8 words, respectively). The combination of features extracted from these different 'n-grams' allows the network to integrate multiple perspectives of the input data to predict a single output result.

In [150]:
from pickle import load

def load_dataset(filename):
    """Load a pickled object from disk.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were written by this notebook or come from another trusted source.
    # The context manager closes the handle, which was previously leaked.
    with open(filename, "rb") as file:
        return load(file)

In [151]:
# To filter words by how often they occur, a vocabulary would have to be built beforehand, as in the previous example.
def create_tokenizer(lines, num_words=3000):
    """Fit a Keras Tokenizer on the given documents.

    Args:
        lines: Documents to fit on (the notebook passes lists of tokens).
        num_words: Passed to Tokenizer as num_words. Defaults to 3000, the
            value that used to be hard-coded; pass None to apply no limit.
            Note that tokenizer.word_index still lists every word seen
            (the notebook reports a vocabulary of 44277 with the limit at
            3000), so the limit does not shrink the vocabulary size that
            is derived from word_index.

    Returns:
        The fitted Tokenizer.
    """
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    """Return the length of the longest document.

    Args:
        lines: Iterable of sized documents (for example lists of tokens).

    Returns:
        The largest len() among the documents, or 0 when there are none,
        matching what mid_length returns for empty input.
    """
    # default=0 avoids the ValueError that max() raises on an empty sequence.
    return max((len(s) for s in lines), default=0)

def mid_length(lines):
    """Return the mean document length, truncated to an integer.

    Args:
        lines: Sized collection of sized documents.

    Returns:
        int(total length / number of documents), or 0 when lines is empty.
    """
    document_count = len(lines)
    if not document_count:
        return 0
    combined_length = sum(len(document) for document in lines)
    return int(combined_length / document_count)
 
def encoded_text(tokenizer, lines, length):
    """Encode documents as integer sequences padded to a fixed length.

    Args:
        tokenizer: Fitted tokenizer providing texts_to_sequences.
        lines: Documents to encode.
        length: Passed to pad_sequences as maxlen.

    Returns:
        The result of pad_sequences, with padding added after each sequence
        (padding="post").
    """
    sequences = tokenizer.texts_to_sequences(lines)
    return pad_sequences(sequences, maxlen=length, padding="post")

In [152]:
# External models for text encoding: both GloVe and Word2Vec are used in the examples and evaluated, alongside a run without any external embedding. Even when pre-trained vectors are used, results are better if the embedding weights are not frozen during training.

from gensim.models import Word2Vec

def word2vected_lines(tokenizer,lines,min_count):
    """Train Word2Vec on the corpus and build an embedding matrix for the tokenizer.

    Args:
        tokenizer: Fitted tokenizer exposing word_index (word -> integer id).
        lines: Tokenised documents used to train the Word2Vec model.
        min_count: Passed through to Word2Vec as min_count.

    Returns:
        Array of shape (len(tokenizer.word_index) + 1, vector width). Row i
        holds the trained vector of the word with id i; rows of words that
        have no trained vector (and row 0) stay all zeros.
    """
    keyed_vectors = Word2Vec(lines, min_count=min_count).wv
    # Pair every vocabulary word with its row of the trained vector table.
    embedding_index = dict(zip(keyed_vectors.index_to_key, keyed_vectors.vectors))

    vocab_size = len(tokenizer.word_index) + 1
    # Take the width from the trained vectors instead of hard-coding 100, so
    # the matrix always matches whatever size the model was trained with.
    embedding_matrix = np.zeros((vocab_size, keyed_vectors.vectors.shape[1]))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embedding_index.get(word)  # None when the word has no vector
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def gloved_test(tokenizer, glove_path="glove.6B.100d.txt", embedding_dim=100):
    """Build an embedding matrix for the tokenizer from a GloVe text file.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace. The file is not shipped with the
    notebook and must be downloaded separately.

    Args:
        tokenizer: Fitted tokenizer exposing word_index (word -> integer id).
        glove_path: Path of the GloVe vectors file. Defaults to the file name
            that used to be hard-coded.
        embedding_dim: Number of components per vector in that file
            (default 100, the previously hard-coded width).

    Returns:
        Array of shape (len(tokenizer.word_index) + 1, embedding_dim). Row i
        holds the GloVe vector of the word with id i; rows of words missing
        from the file (and row 0) stay all zeros.

    Raises:
        ValueError: If a matching line does not contain embedding_dim numbers.
    """
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    # Stream the file and keep only the vectors of words the tokenizer knows,
    # instead of holding the entire GloVe table in a dictionary first.
    # The context manager also closes the file if parsing fails midway.
    with open(glove_path, mode="rt", encoding="utf-8") as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                continue  # tolerate blank lines
            row = word_index.get(values[0])  # None when the word is not in the vocabulary
            if row is not None:
                embedding_matrix[row] = np.asarray(values[1:], dtype="float32")
    return embedding_matrix

# Define and implement the model

In [153]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Dropout
from keras.layers import concatenate
from keras.layers import Input
from pickle import load 
from keras.models import load_model

In [179]:
# The n-gram model has 3 channels; each channel is flattened, then the three are concatenated and passed through dense layers to a single output.

def define_model(length, vocab_size, embeddin_matix=None, embedding_trainable=False):
    """Build, compile and summarise the three-channel CNN sentiment model.

    Every channel has its own Input and Embedding followed by
    Conv1D -> Dropout(0.5) -> MaxPooling1D -> Flatten. The channels use
    kernel sizes 2, 4 and 8 with 16, 32 and 64 filters respectively. Their
    flattened outputs are concatenated and fed to Dense(32) -> Dropout(0.2)
    -> Dense(16) -> Dense(1, sigmoid).

    Args:
        length: Number of token ids in each (padded) input sequence.
        vocab_size: Size of the embedding vocabulary (Embedding input_dim).
        embeddin_matix: Optional 2-D matrix of initial embedding weights, one
            row per word id. When given, its row width is used as the
            embedding size; otherwise the size is 100.
        embedding_trainable: Whether the embedding layers are trainable.
            NOTE(review): with the defaults (no matrix, not trainable) the
            embeddings are presumably left at their initial values for the
            whole training run - confirm that this is intended.

    Returns:
        The compiled Model (binary cross-entropy loss, Adam optimizer,
        accuracy metric) taking three inputs and producing one output.
    """
    if embeddin_matix is None:
        weights = None
        embedding_dim = 100
    else:
        weights = [embeddin_matix]
        # Follow the supplied matrix instead of assuming it is 100 wide.
        embedding_dim = len(embeddin_matix[0])

    # Channel 1: windows of 2 words
    inputs1, flat1 = _ngram_channel(length, vocab_size, embedding_dim, weights,
                                    embedding_trainable, filters=16, kernel_size=2)
    # Channel 2: windows of 4 words
    inputs2, flat2 = _ngram_channel(length, vocab_size, embedding_dim, weights,
                                    embedding_trainable, filters=32, kernel_size=4)
    # Channel 3: windows of 8 words
    inputs3, flat3 = _ngram_channel(length, vocab_size, embedding_dim, weights,
                                    embedding_trainable, filters=64, kernel_size=8)

    # Merge the per-channel features
    merged = concatenate([flat1, flat2, flat3])

    # Classifier head
    dense1 = Dense(32, activation="relu")(merged)
    drop5 = Dropout(0.2)(dense1)
    dense2 = Dense(16, activation="relu")(drop5)
    outputs = Dense(1, activation="sigmoid")(dense2)
    model = Model(inputs=[inputs1, inputs2, inputs3], outputs=outputs)

    # Compile
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    model.summary()
    return model


def _ngram_channel(length, vocab_size, embedding_dim, weights, trainable, filters, kernel_size):
    """Build one Input -> Embedding -> Conv1D -> Dropout -> MaxPooling1D -> Flatten channel.

    Returns:
        An (input tensor, flattened feature tensor) pair.
    """
    inputs = Input(shape=(length,))
    embedded = Embedding(vocab_size, output_dim=embedding_dim, weights=weights,
                         trainable=trainable)(inputs)
    convolved = Conv1D(filters, kernel_size, activation="relu")(embedded)
    dropped = Dropout(0.5)(convolved)
    pooled = MaxPooling1D()(dropped)
    return inputs, Flatten()(pooled)

In [192]:
# Load the training split saved earlier as a [documents, labels] pair
trainLines, trainLabels = load_dataset("train.pkl")
# Fit the tokenizer on the training documents only
tokenizer = create_tokenizer(trainLines)
# Build the embedding matrix (swap the two lines below to choose the source)
embedding_matrix = gloved_test(tokenizer) # pre-trained GloVe vectors
#embedding_matrix = word2vected_lines(tokenizer,trainLines,min_count=3) # Word2Vec trained on the training documents
# Sequence length that every document is padded to
length = max_length(trainLines) # length of the longest training document (the original choice)
#length = mid_length(trainLines) # alternative: mean document length
print("Longitud máxima de documentos: %d" % length)
# Vocabulary size: every word in word_index plus one extra row, since row 0 of the embedding matrices is never assigned to a word
vocab_size = len(tokenizer.word_index)+1
print("Tamaño del vocabulario: %d " % vocab_size)

# Encode and pad the training documents
trainX= encoded_text(tokenizer, trainLines, length)

# Define the model, initialised with the embedding matrix and with frozen embedding layers
model = define_model(length, vocab_size,embedding_matrix,embedding_trainable=False) 

# Train the model; the same encoded input is fed to each of the three channels
model.fit([trainX, trainX, trainX], np.array(trainLabels), epochs=15, batch_size=10)

# Save the trained model to disk
model.save("model_ngramas.h5")




Longitud máxima de documentos: 1380
Tamaño del vocabulario: 44277 
Model: "model_13"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_41 (InputLayer)       [(None, 1380)]               0         []                            
                                                                                                  
 input_42 (InputLayer)       [(None, 1380)]               0         []                            
                                                                                                  
 input_43 (InputLayer)       [(None, 1380)]               0         []                            
                                                                                                  
 embedding_40 (Embedding)    (None, 1380, 100)            4427700   ['input_41[0][0]']            
                        

## Evaluate the model

In [193]:
# The data and the model are reloaded from disk to show how a saved model can be
# reused from another script; inside this notebook the variables already in
# memory could be used directly.
# Load the test data
testLines, testLabels = load_dataset("test.pkl")

# Encode and pad the test documents with the tokenizer and length used for training
testX= encoded_text(tokenizer, testLines, length)

# Load the model saved by the training cell
model = load_model("model_ngramas.h5")


In [194]:
def predict_sentiment(review, tokenizer,max_length, model):
    """Classify one raw review and report how confident the model is.

    Args:
        review: Raw review text.
        tokenizer: Fitted tokenizer used to encode the cleaned review.
        max_length: Length the encoded review is padded to.
        model: Model whose predict() accepts the encoded review three times
            (once per input channel).

    Returns:
        A (score, label) tuple. label is "Positive" with the model output as
        score when that output rounds to a non-zero value; otherwise label is
        "Negative" and score is one minus the model output.
    """
    tokens = clean_doc(review)
    # Encode and pad the single review
    model_input = encoded_text(tokenizer, [tokens], max_length)
    # The same encoded review feeds all three channels
    prediction = model.predict([model_input, model_input, model_input], verbose=1)
    positive_score = prediction[0,0]
    if round(positive_score) != 0:
        return positive_score, "Positive"
    return (1- positive_score), "Negative"

In [195]:
# Score two hand-written reviews, one negative and one positive, with the trained model.
for to_evaluate in (
    "it's the worts film that I have the pleasure of been watched by me, I don't like this film",
    "it's the best movie I have ever seen",
):
    percent, sentiment = predict_sentiment(to_evaluate,tokenizer,length,model)
    print('Review: [%s]\nSentiment: %s (%.3f%%)' % (to_evaluate, sentiment, percent*100))

Review: [it's the worts film that I have the pleasure of been watched by me, I don't like this film]
Sentiment: Negative (87.530%)
Review: [it's the best movie I have ever seen]
Sentiment: Positive (74.097%)


### Without external embedding

In [None]:
# Accuracy on the training split
_ , acc = model.evaluate([trainX,trainX,trainX], np.array(trainLabels), verbose=0)
print("Accuracy en Train %.2f" % (acc*100) )
# Accuracy on the held-out test split (this line used to be mislabelled "Train")
_ , acc = model.evaluate([testX,testX,testX], np.array(testLabels), verbose=0)
print("Accuracy en Test %.2f" % (acc*100) )

Accuracy en Train 100.00
Accuracy en Train 84.00


### With Glove

In [None]:
# Accuracy on the training split
_ , acc = model.evaluate([trainX,trainX,trainX], np.array(trainLabels), verbose=0)
print("Accuracy en Train %.2f" % (acc*100) )
# Accuracy on the held-out test split (this line used to be mislabelled "Train")
_ , acc = model.evaluate([testX,testX,testX], np.array(testLabels), verbose=0)
print("Accuracy en Test %.2f" % (acc*100) )

Accuracy en Train 99.94
Accuracy en Train 82.00


### With Word2Vec

In [None]:
# Accuracy on the training split
_ , acc = model.evaluate([trainX,trainX,trainX], np.array(trainLabels), verbose=0)
print("Accuracy en Train %.2f" % (acc*100) )
# Accuracy on the held-out test split (this line used to be mislabelled "Train")
_ , acc = model.evaluate([testX,testX,testX], np.array(testLabels), verbose=0)
print("Accuracy en Test %.2f" % (acc*100) )

Accuracy en Train 100.00
Accuracy en Train 77.00
