## Sentiment Analysis Model

#### Hyperparameters

In [165]:
# Last dimension of the zero-initialised tf.Variable created in the graph cell.
# NOTE(review): presumably also the width of each row in wordVectors — confirm.
numDimensions = 300
# Number of token slots per review; getSentenceMatrix stops filling after this many words.
maxSeqLength = 250
# Fixed first dimension of the input/label placeholders and of the matrix built by getSentenceMatrix.
batchSize = 24
# Number of hidden units in the LSTM cell.
lstmUnits = 64
# Width of the output layer and of the labels placeholder.
numClasses = 2
# NOTE(review): not referenced elsewhere in the visible notebook — presumably a leftover
# from the training notebook; confirm before removing.
iterations = 100000

#### Loading data structures

In [166]:
import numpy as np
# The vocabulary file holds byte strings; decode every entry into a str using UTF-8.
wordsList = [rawWord.decode('UTF-8') for rawWord in np.load('wordsList.npy').tolist()]
# Embedding matrix consumed by tf.nn.embedding_lookup in the graph cell.
# NOTE(review): presumably row i is the vector for wordsList[i] — confirm.
wordVectors = np.load('wordVectors.npy')

#### Network graph building

In [167]:
import tensorflow as tf
# Start from an empty default graph so re-running this cell does not stack duplicate ops.
tf.reset_default_graph()

# One-hot labels and integer word ids; both have a fixed batch dimension.
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_data = tf.placeholder(tf.int32, [batchSize, maxSeqLength])

# NOTE(review): this variable is overwritten on the next line and never used, but it is
# still created in the graph. It is kept on purpose: removing it would change the
# auto-generated names of the variables created afterwards, which could break
# saver.restore against the existing checkpoint — verify against the checkpoint
# before deleting.
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
# Replace each word id with its embedding row.
data = tf.nn.embedding_lookup(wordVectors,input_data)

lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
# This notebook only runs inference on a restored model, so dropout must be disabled.
# The previous output_keep_prob=0.25 randomly zeroed LSTM outputs on every sess.run,
# making predictions for the same review vary between runs. The wrapper itself is kept
# so the graph structure stays the same.
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=1.0)
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

# Output projection from the LSTM state to the class scores.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# Move the time axis first so the final time step can be picked with a single gather.
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
# Raw (un-normalised) class scores for every row of the batch.
prediction = (tf.matmul(last, weight) + bias)

# Accuracy ops; they need the labels placeholder and are not evaluated elsewhere in the visible notebook.
correctPred = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

#### Loading the trained sentiment analysis model

In [169]:
# Open an interactive session and load the newest checkpoint found under
# models/sentiment_analysis into the graph's variables.
sess = tf.InteractiveSession()
saver = tf.train.Saver()
latestCheckpoint = tf.train.latest_checkpoint('models/sentiment_analysis')
saver.restore(sess, latestCheckpoint)

INFO:tensorflow:Restoring parameters from models/sentiment_analysis\trained_rnn_lstm.ckpt-5


INFO:tensorflow:Restoring parameters from models/sentiment_analysis\trained_rnn_lstm.ckpt-5


#### Helper functions for pre-processing text inputs

In [170]:
# Removes punctuation, parentheses, question marks, etc., and leaves only alphanumeric characters
import re
# Matches every run of characters that is not an ASCII letter, digit or space.
strip_special_chars = re.compile("[^A-Za-z0-9 ]+")

def cleanSentences(string):
    """Normalise raw review text for vocabulary lookup.

    The text is lower-cased, each "<br />" tag is replaced by a single space,
    and every remaining character that is not an ASCII letter, digit or space
    is removed.

    Args:
        string: The raw review text.

    Returns:
        The cleaned, lower-case text.
    """
    lowered = string.lower()
    without_breaks = lowered.replace("<br />", " ")
    return strip_special_chars.sub("", without_breaks)

# Index written for words that are not found in wordsList.
unknownWordIndex = 399999


def _getWordIndexLookup():
    """Return a dict mapping each word in wordsList to its first position.

    The dict is built once and cached on the function object; it is rebuilt if
    wordsList is rebound to another list or its length changes.
    """
    cached = getattr(_getWordIndexLookup, "_cache", None)
    if cached is None or cached[0] is not wordsList or cached[1] != len(wordsList):
        lookup = {}
        for position, vocabWord in enumerate(wordsList):
            # setdefault keeps the first occurrence, matching list.index semantics.
            lookup.setdefault(vocabWord, position)
        cached = (wordsList, len(wordsList), lookup)
        _getWordIndexLookup._cache = cached
    return cached[2]


def getSentenceMatrix(sentence):
    """Convert one review into the integer id matrix expected by input_data.

    The sentence is cleaned with cleanSentences, split on whitespace, and at
    most maxSeqLength words are mapped to their position in wordsList. Words
    missing from the vocabulary get unknownWordIndex. Only row 0 of the
    returned matrix is filled; every other entry stays 0.

    Args:
        sentence: Raw review text.

    Returns:
        An int32 numpy array of shape [batchSize, maxSeqLength].
    """
    sentenceMatrix = np.zeros([batchSize, maxSeqLength], dtype='int32')
    # "asf asfsaf asdfsaf" ---> ['asf', 'asfsaf', 'asdfsaf'], truncated to the sequence length
    words = cleanSentences(sentence).split()[:maxSeqLength]
    # A dict lookup replaces wordsList.index(), which scanned the whole vocabulary per word.
    lookup = _getWordIndexLookup()
    for indexCounter, word in enumerate(words):
        sentenceMatrix[0, indexCounter] = lookup.get(word, unknownWordIndex)
    return sentenceMatrix

#### ====================================================================================================================

#### ====================================================================================================================

## LDA model

In [None]:
#Loading the trained model

In [171]:
import gensim
from gensim import corpora
from gensim.test.utils import datapath

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel
# datapath is imported from gensim.test.utils.
# NOTE(review): presumably this resolves "model" inside gensim's bundled test-data
# directory — confirm that is where the trained model was saved.
temp_file = datapath("model")

# Load a potentially pretrained model from disk.
ldaModel = Lda.load(temp_file)

In [172]:
#Testing the model

In [173]:
def get_reviews(review, batch):
    """Return the first `batch` whitespace-separated words of `review`.

    Args:
        review: The review text.
        batch: Number of leading words to keep.

    Returns:
        The kept words joined by single spaces.
    """
    tokens = review.split()
    leading = tokens[:batch]
    return " ".join(leading)

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
# Shared resources for clean(): English stop words, punctuation characters and a lemmatizer.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    """Prepare one review for topic modelling.

    Keeps the first 120 words of the document, lower-cases them, drops English
    stop words, strips punctuation characters and lemmatizes each remaining word.

    Args:
        doc: Raw review text.

    Returns:
        The normalised words joined by single spaces.
    """
    max_words = 120
    truncated = get_reviews(doc, max_words)
    kept_words = [token for token in truncated.lower().split() if token not in stop]
    stop_free = " ".join(kept_words)
    punc_free = ''.join(filter(lambda ch: ch not in exclude, stop_free))
    lemmas = [lemma.lemmatize(token) for token in punc_free.split()]
    return " ".join(lemmas)

In [None]:
# Show the ten highest-weighted words for each of 15 topics of the loaded LDA model.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_words in ldaModel.print_topics(num_topics=15, num_words=10):
    print(topic_template.format(topic_id, topic_words))

## Loading the reviews

In [242]:
from os import listdir
from os.path import isfile, join
# Paths of all regular files directly inside product_reviews/.
# os.listdir returns entries in arbitrary order, so the list is sorted to keep the
# review order (and anything numbered from it) reproducible between runs.
product_review_files = sorted(
    'product_reviews/' + f
    for f in listdir('product_reviews/')
    if isfile(join('product_reviews/', f))
)

In [243]:
def getTextFromFiles(file_list, encoding=None):
    """Read every file in `file_list` and return their contents.

    Args:
        file_list: Iterable of file paths to read in text mode.
        encoding: Text encoding passed to open(). The default None keeps the
            platform's default encoding; pass e.g. "utf-8" to read the files
            the same way on every machine.

    Returns:
        A list with one string per file, in the order of `file_list`.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file cannot be decoded with the chosen encoding.
    """
    reviews = []
    for path in file_list:
        with open(path, "r", encoding=encoding) as f:
            reviews.append(f.read())
    return reviews

In [244]:
# Read every file listed in product_review_files into memory, one string per file.
product_reviews = getTextFromFiles(product_review_files)

In [245]:
# Collects one [review_text, sentiment_label] entry per review; filled by the sentiment-prediction loop.
final_results = []

## Predicting Review Sentiments

In [246]:
# Start from an empty list so re-running this cell does not append duplicate
# entries on top of the results of a previous run.
final_results = []

for review in product_reviews:
    inputMatrix = getSentenceMatrix(review)
    # getSentenceMatrix puts the review in row 0 of the batch, so only the
    # first row of the model output is relevant.
    predictedSentiment = sess.run(prediction, {input_data: inputMatrix})[0]
    # predictedSentiment[0] represents output score for positive sentiment
    # predictedSentiment[1] represents output score for negative sentiment
    sentiment = "positive" if predictedSentiment[0] > predictedSentiment[1] else "negative"
    final_results.append([review, sentiment])

In [None]:
# Preview the first five [review, sentiment] entries, each followed by blank lines.
blank_gap = "\n" * 3
for entry in final_results[:5]:
    print(entry, blank_gap)

## Predicting Review Topic

In [248]:
# Cleaning the documents: one list of normalised tokens per review.
reviews_clean = [clean(doc).split() for doc in product_reviews]

# The bag-of-words ids must be the ids the LDA model was trained with. Building a
# fresh Dictionary from these reviews would assign unrelated ids, so the model would
# score the wrong words. Reuse the dictionary stored on the loaded model instead.
model_dictionary = getattr(ldaModel, "id2word", None)
if hasattr(model_dictionary, "doc2bow"):
    dictionary = model_dictionary
else:
    # NOTE(review): fallback to the previous behaviour when the model carries no usable
    # dictionary; ids produced here may not line up with the model's vocabulary.
    dictionary = corpora.Dictionary(reviews_clean)

# Converting the list of documents (corpus) into a document-term matrix using the dictionary above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in reviews_clean]

In [None]:
# Print the ten highest-weighted words for each of 15 topics of the loaded LDA model.
for topic_id, topic_words in ldaModel.print_topics(num_topics=15, num_words=10):
    topic_summary = 'Topic: {} \nWords: {}'.format(topic_id, topic_words)
    print(topic_summary)

In [None]:
print("Predicting topics..... " + ("\n" * 3))

# For every review: print its text and predicted sentiment, then the LDA topics
# assigned to it, ordered from highest to lowest score.
for doc_position, bag_of_words in enumerate(doc_term_matrix):
    review_entry = final_results[doc_position]
    print("Text: ", review_entry[0], "\nSentiment: ", review_entry[1])

    ranked_topics = sorted(ldaModel[bag_of_words], key=lambda pair: pair[1], reverse=True)
    for topic_id, topic_score in ranked_topics:
        print("\nScore: {}\t \nTopic: {}".format(topic_score, ldaModel.print_topic(topic_id, 8)))

    print("\n" * 3)

### Write the results to file

In [None]:
import json
import os

directory = "sentiment_topic_results"
# Create the output folder up front so open() below does not fail when it is missing.
os.makedirs(directory, exist_ok=True)

# Write one JSON document per review: its text, predicted sentiment and topics.
for counter, r in enumerate(final_results):
    if len(r) > 2:
        # Topics were already attached to the entry.
        topics = r[2]
    else:
        # Entries built by the sentiment loop only hold [text, sentiment], so the
        # previous r[2] raised IndexError. Derive the topics here, highest score first.
        # Scores are converted to float so json can serialise them.
        topics = [
            {"score": float(score), "topic": ldaModel.print_topic(index, 8)}
            for index, score in sorted(ldaModel[doc_term_matrix[counter]], key=lambda tup: -1 * tup[1])
        ]

    result = {}
    result['text'] = r[0]
    result['sentiment'] = r[1]
    result['topics'] = topics

    file_name = directory + "/review_" + str(counter) + ".txt"
    with open(file_name, "w") as file:
        json.dump(result, file)