# Sentiment analysis on a dataset of movies using a convolutional network model
## DATA

In [1]:
# Hidden warnings
import warnings
warnings.simplefilter('ignore')

In [2]:
from nltk.corpus import stopwords
import string
import re

def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub("",w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set( stopwords.words("english"))
    tokens = [word for word in tokens if not word in stop_words ]
    tokens = [word for word in tokens if len(word) >1 ]
    return tokens

filename = 'data/txt_sentoken/pos/cv000_29590.txt'
text = load_doc(filename)
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

## Define vocabulary

In [3]:
from os import listdir
from collections import Counter

def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)

def process_docs(directory, vocab):
    for filename in listdir(directory):
        if filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        add_doc_to_vocab(path,vocab)

vocab = Counter()
process_docs('data/txt_sentoken/pos', vocab)
process_docs('data/txt_sentoken/neg', vocab)

print(len(vocab))
print()
print(vocab.most_common(50))

44276

[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('could', 1248), ('bad', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [4]:
min_occurrence = 2
tokens = [k for k,c in vocab.items() if c>=min_occurrence]
print(len(tokens))


25767


In [5]:
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [6]:
save_list(tokens,"vocab.txt")

## Now CNN with Embedding layer

In [7]:
import string
import re
from os import listdir
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D

def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())




In [8]:
def clean_doc(doc,vocab): #for documents not for a vocabulary as before
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub("",w) for w in tokens]
    tokens = [word for word in tokens if word in vocab]
    tokens = " ".join(tokens)
    return tokens


In [9]:
def process_docs(directory, vocab, is_train):
    documents = list()
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):  #We do this just to divide some in training and some in testing.
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        doc = load_doc(path)
        tokens = clean_doc(doc, vocab)
        documents.append(tokens)
    return documents

In [10]:
def load_clean_dataset(vocab, is_train):
    neg = process_docs('data/txt_sentoken/neg', vocab, is_train)
    pos = process_docs('data/txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    labels = array([0 for _ in range(len(neg))] + [1 for _ in range(len(pos))])
    return docs, labels

In [11]:
def create_tokenizer(lines): #Allocation of words to integers
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [12]:
train_docs, ytrain = load_clean_dataset(vocab,True)
## Calculate maximum sequence
max_length = max([len(s.split()) for s in train_docs])
print("Maximun length: %d" % max_length)

Maximun length: 1317


In [13]:
def encode_docs(tokenizer, max_length, docs):
    # Encode to integer
    encoded = tokenizer.texts_to_sequences(docs)
    # fill in sequences
    padded = pad_sequences(encoded, maxlen = max_length, padding="post")
    return padded

In [14]:
# Ready to define neural network model
train_docs, ytrain = load_clean_dataset(vocab,True)
#create the tokenizer
tokenizer = create_tokenizer(train_docs)
#Define vocabulary size
vocab_size = len(tokenizer.word_index) +1
print("Vocabulary size: ", vocab_size)



Vocabulary size:  25768


In [15]:
def define_model(vocab_size, max_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length = max_length)) 
    model.add(Conv1D(32,8, activation ="relu"))
    model.add(MaxPooling1D())
    model.add(Flatten())
    model.add(Dense(10, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    # compilation
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()
    return model

In [16]:
model = define_model(vocab_size, max_length)




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1317, 100)         2576800   
                                                                 
 conv1d (Conv1D)             (None, 1310, 32)          25632     
                                                                 
 max_pooling1d (MaxPooling1  (None, 655, 32)           0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 20960)             0         
                                                                 
 dense (Dense)               (None, 10)                209610    
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                     

In [17]:
train_docs, ytrain = load_clean_dataset(vocab, True)
# We create the Tokenizer
tokenizer = create_tokenizer(train_docs)
# Define the vocabulary
vocab_size = len(tokenizer.word_index) +1
print('Vocabulary size: %d' % vocab_size)
# We calculate the maximum sequence
max_length = max([len(s.split()) for s in train_docs])
print('Maximum length: %d' % max_length)
# We code the data
Xtrain = encode_docs(tokenizer, max_length, train_docs)
# We define the model
model = define_model(vocab_size,max_length)
# We make the fit
model.fit(Xtrain, ytrain, epochs=10, verbose=1)

Vocabulary size: 25768
Maximum length: 1317
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1317, 100)         2576800   
                                                                 
 conv1d_1 (Conv1D)           (None, 1310, 32)          25632     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 655, 32)           0         
 g1D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 20960)             0         
                                                                 
 dense_2 (Dense)             (None, 10)                209610    
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
          

<keras.src.callbacks.History at 0x1fdfbdae3b0>

In [18]:
model.save("model.h5")

## Evaluate the model

In [19]:
train_docs, ytrain = load_clean_dataset(vocab, True)
test_docs, ytest = load_clean_dataset(vocab, False)

tokenizer = create_tokenizer(train_docs)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size: %d' % vocab_size)

max_length = max([len(s.split()) for s in train_docs])
print('Maximun length: %d' % max_length)
Xtrain = encode_docs(tokenizer, max_length, train_docs)
Xtest = encode_docs(tokenizer, max_length, test_docs)

Vocabulary size: 25768
Maximun length: 1317


In [20]:
from keras.models import load_model
#Load the trained model
model = load_model("model.h5")
#Evaluate in train
_ ,acc = model.evaluate(Xtrain, ytrain, verbose=1)
print('Accuracy en Train: %.2f' % (acc*100))
#Evaluate in Test
_, acc = model.evaluate(Xtest,ytest, verbose=1)
print('Accuracy en Test: %.2f' % (acc*100))

Accuracy en Train: 100.00
Accuracy en Test: 87.50


In [21]:
def predict_sentiment(review, vocab, tokenizer,max_length, model):
    line = clean_doc(review, vocab)
    #Code and fill in
    padded = encode_docs(tokenizer, max_length, [line])
    #Predict the review
    yhat = model.predict(padded, verbose=1)
    #Extract the percentage of the given review
    porcentaje = yhat[0,0]
    if round(porcentaje) == 0:
        return (1- porcentaje), "Negative"
    return porcentaje, "Positive"
    

In [22]:
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer,max_length ,model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))

text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer,max_length , model)
print('Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))


Review: [Best movie ever! It was great, I recommend it.]
Sentiment: Positive (51.529%)
Review: [This is a bad movie.]
Sentiment: Negative (52.649%)
