In [2]:
import os
import pandas as pd
import re
import numpy as np

import collections
from operator import itemgetter
import pprint as pp
import re
from nltk.corpus import stopwords
import nltk.data
import time
import proj_base
# Review data downloaded from http://times.cs.uiuc.edu/~wang296/Data/
# List the entries of the local ./Review_Texts directory and report how many there are.
files = os.listdir('./Review_Texts')
print(len(files))

1759


In [3]:
# Load the review table through the project helper, passing numFiles=1
# (presumably this limits how many review files are read — confirm in proj_base).
data = proj_base.getStandardData(numFiles=1)
# Show (rows, columns) of the loaded table.
data.shape

(124, 13)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Column-wise mean of the numeric columns only. numeric_only=True skips text
# columns such as "Content", which otherwise make mean() raise on pandas >= 2.0.
data.mean(axis=0, numeric_only=True)

In [None]:
# Look at the text of a single review (the "Content" entry selected by key 1).
data["Content"][1]

In [None]:
#creating the vocab of all the words

# Punctuation stripped from vocabulary tokens. The characters sit in a character
# class so that "$" is a literal dollar sign rather than an end-of-string anchor
# (as an anchor it matched the empty string and removed nothing).
_LINT_PATTERN = re.compile(r'[,()!:$.]')


def lintWord(w):
    """Return `w` with the punctuation characters , ( ) ! : $ . removed.

    Parameters:
        w: the token to clean (a string).

    Returns:
        The token with every occurrence of the listed characters deleted;
        all other characters, including letter case, are left untouched.
    """
    return _LINT_PATTERN.sub('', w)


# Concatenate every review into one string, each review followed by a single space.
allWords = "".join(r + " " for r in data["Content"])

# Split the string at single spaces and keep only the unique tokens.
words = set(allWords.split(" "))

# Fetch the English stopword list once and keep it as a set, instead of calling
# stopwords.words() (which returns a list) again for every token.
englishStopwords = set(stopwords.words("english"))

vocab = list(set([lintWord(w) for w in words if w not in englishStopwords]))

# Need to remove stopwords again: some of them had punctuation attached and were
# not caught the first time. Tokens of two characters or fewer are dropped too.
vocab = [w for w in vocab if w not in englishStopwords and len(w) > 2]

#vocab


In [None]:
# Sanity checks on the vocabulary.
# Number of entries in vocab:
print(len(vocab))
# Number of distinct entries (equal to the line above when vocab has no duplicates):
print(len(set(vocab)))
# "i" is shorter than three characters, so the length filter should have removed it:
print("i" in vocab)


In [None]:

"""Algorithm: Aspect Segmentation Algorithm
Input: A collection of reviews {d_1, d_2, ..., d_|D|}, set of
aspect keywords {T_1, T_2, ..., T_k}, vocabulary V, selection
threshold p and iteration step limit I.
Output: Reviews split into sentences with aspect assignments.
Step 0: Split all reviews into sentences, X =
{x_1, x_2, ..., x_M};
Step 1: Match the aspect keywords in each sentence
of X and record the matching hits for each aspect i in
Count(i);
Step 2: Assign the sentence an aspect label by a_i =
argmax_i Count(i). If there is a tie, assign the sentence
with multiple aspects.
Step 3: Calculate chi^2 measure of each word (in V);
Step 4: Rank the words under each aspect with respect
to their chi^2 value and join the top p words for each aspect
into their corresponding aspect keyword list T_i;
Step 5: If the aspect keyword list is unchanged or iteration
exceeds I, go to Step 6, else go to Step 1;
Step 6: Output the annotated sentences with aspect
assignments."""

In [None]:
# Sentence tokenizer used to split review text into sentences: the English
# Punkt model, loaded from the NLTK data directory.
# NOTE(review): recent NLTK releases may ship Punkt in a different (non-pickle)
# format — confirm this resource path still loads with the installed version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


In [None]:
# Initial seed keywords per aspect, taken from the LARA paper.
# Keys are aspect names, values are keyword lists. aspectSegmentation appends
# newly discovered keywords to these lists in place.
# NOTE(review): "stuff" under "Check in / front desk" may be meant as "staff" —
# confirm against the paper before changing it.
seeds = {"Value" : ["value", "price", "quality","worth"],
         "Rooms" : ["room", "suite", "view", "bed"],
         "Location" : ["location", "traffic", "minute", "restaurant"],
         "Cleanliness" : ["clean", "dirty", "maintain", "smell"],
         "Check in / front desk": ["stuff", "check", "help", "reservation"],
         "Service" : ["service", "food", "breakfast", "buffet"],
         "Business service" : ["business", "center", "computer", "internet"]
        }


def _labelSentence(sentence, aspects):
    """Return the tuple of aspect names with the most keyword hits in `sentence`.

    A keyword counts as a hit only when it appears surrounded by single spaces.
    Ties yield several names (in aspect order); a sentence without any hit is
    labelled ("None",).
    """
    hitCounts = collections.defaultdict(int)
    for a in aspects:
        for word in aspects[a]:
            if " " + word + " " in sentence:
                hitCounts[a] += 1

    if not hitCounts:
        return ("None",)
    best = max(hitCounts.values())
    return tuple(a for a in hitCounts if hitCounts[a] == best)


def _chiSquaredScores(labeledSentences, aspects, vocab):
    """Return {aspect: [(word, score), ...]} for each word/aspect pair with a defined score.

    For a word w and aspect a the four counts are taken over sentences:
      c_1: contains w and is labelled a        c_2: contains w, not labelled a
      c_3: labelled a, does not contain w      c_4: neither
    "Contains" always means the space-padded word occurs in the sentence, so
    the four cases are mutually exclusive and consistent with the labelling.
    Pairs whose denominator is zero are skipped.
    """
    # Which sentences carry each aspect label; independent of the word being scored.
    membership = {a: [a in labels for _, labels in labeledSentences] for a in aspects}

    scores = collections.defaultdict(list)
    for w in vocab:
        padded = " " + w + " "
        # Containment depends only on the word, so test it once rather than per aspect.
        contains = [padded in text for text, _ in labeledSentences]

        for a in aspects:
            c_1 = c_2 = c_3 = c_4 = 0
            for hasWord, hasAspect in zip(contains, membership[a]):
                if hasWord and hasAspect:
                    c_1 += 1
                elif hasWord:
                    c_2 += 1
                elif hasAspect:
                    c_3 += 1
                else:
                    c_4 += 1

            # The constant factor C from the chi-squared definition is left out,
            # as before: a constant multiplier does not change the ranking.
            numer = float(c_1 * c_4 - c_2 * c_3) ** 2
            denom = float(c_1 + c_3) * (c_2 + c_4) * (c_1 + c_2) * (c_3 + c_4)
            if denom != 0:
                scores[a].append((w, numer / denom))
    return scores


def aspectSegmentation(reviews, aspects, vocab=None, threshold=0, iterationLimit=3,
                       sentenceSplitter=None):
    """Label review sentences with aspects, bootstrapping the aspect keyword lists.

    Each iteration labels every sentence with the aspect(s) whose keywords hit it
    most often, scores every vocabulary word against every aspect with a
    chi-squared measure, and appends the top-scoring new words to each aspect's
    keyword list. Iteration stops early once no keyword list changes.

    Parameters:
        reviews: iterable of review texts (strings).
        aspects: dict mapping aspect name -> list of keywords. MUTATED IN PLACE:
            newly selected keywords are appended to the lists.
        vocab: iterable of candidate words; None is treated as empty.
        threshold: currently unused; kept for interface compatibility.
        iterationLimit: maximum number of bootstrapping iterations.
        sentenceSplitter: optional callable text -> list of sentences. Defaults
            to the module-level `tokenizer.tokenize`.

    Returns:
        A list of (sentence, labels) tuples from the last labelling pass, where
        labels is a tuple of aspect names or ("None",). The labels are computed
        before that pass's keyword expansion. With iterationLimit <= 0 the
        result is an empty list.

    Note: keywords are matched only when surrounded by single spaces, so a word
    at the very start or end of a sentence, or next to punctuation, is not counted.
    """
    #when we have the top chi-squared rated keywords, how many do we take
    keywordsToTake = 3

    if vocab is None:
        vocab = []
    if sentenceSplitter is None:
        sentenceSplitter = tokenizer.tokenize

    # Sentence boundaries do not depend on the keyword lists: split once up front.
    sentences = [s for r in reviews for s in sentenceSplitter(r)]

    labeledSentences = []
    for _ in range(iterationLimit):
        #print our current aspects
        print("begin bootstrapping iteration, aspect keywords: ")
        pp.pprint(aspects)
        print("\n\n\n")

        labeledSentences = [(s, _labelSentence(s, aspects)) for s in sentences]
        chiSquaredForAspects = _chiSquaredScores(labeledSentences, aspects, vocab)

        keywordAdded = False
        for a in chiSquaredForAspects:
            #make sure we are not taking words we already have
            noDupes = [tup for tup in chiSquaredForAspects[a] if tup[0] not in aspects[a]]
            topScored = sorted(noDupes, key=itemgetter(1), reverse=True)[0:keywordsToTake]
            for word, _score in topScored:
                if word not in aspects[a] and word != '':
                    aspects[a].append(word)
                    keywordAdded = True

        # Unchanged keyword lists would reproduce exactly the same labels, so stop.
        if not keywordAdded:
            break

    return labeledSentences

# Run the bootstrapping segmentation over every review and time it.
# Note: the call appends newly discovered keywords to `seeds` in place.
start = time.time()
sentencesWLabels = aspectSegmentation(data["Content"], seeds, vocab)
end = time.time()

print("done, time taken:", end-start)

In [None]:
# Print up to numToShow labelled sentences, skipping those that matched no aspect.
numToShow = 20
count = 0
for s in sentencesWLabels:
    # Check the limit before printing so exactly numToShow items are shown
    # (the previous `count > numToShow` test let one extra through).
    if count >= numToShow:
        break
    if 'None' not in s[1]:
        print(s)
        print("\n")
        count += 1

In [None]:
# aspectSegmentation appended keywords to `seeds` in place; show what is different.
seeds

In [None]:
def seedInReview(review, seeds):
    """Tell whether at least one entry of `seeds` occurs in `review`.

    The check is a plain substring test (`seed in review`), so a seed also
    matches inside a longer word. Returns False when `seeds` is empty.
    """
    return any(seed in review for seed in seeds)


# For each aspect, count the reviews that mention at least one of its keywords.
reviewsWithSeeds = {
    aspectName: sum(data["Content"].apply(seedInReview, args=(keywords,)))
    for aspectName, keywords in seeds.items()
}
reviewsWithSeeds

In [None]:
def getContentLen(rev):
    """Return the length of the "Content" entry of `rev` (a row or other mapping)."""
    content = rev["Content"]
    return len(content)

# Length of each review's "Content", computed row by row.
data.apply(getContentLen, axis=1)

In [None]:
# Aspect whose keywords are used to filter review sentences (read by
# filterReviewsByAspectWords as a key into the seeds dict).
aspect = "Value"

def filterReviewSentencesByWords(rev, words):
    """Return a copy of `rev` whose "Content" keeps only sentences mentioning `words`.

    Parameters:
        rev: a review row (e.g. the Series handed over by DataFrame.apply with
            axis=1) with a "Content" text entry.
        words: set of words; a sentence is kept when at least one of its
            alphabetic tokens is in this set. Matching is case-sensitive.

    Returns:
        A copy of `rev` with "Content" replaced by the kept sentences joined by
        single spaces, or by the empty string when no sentence matched.
    """
    #tokenize review into sentences
    sentences = tokenizer.tokenize(rev["Content"])

    kept = []
    for s in sentences:
        # Reduce the sentence to alphabetic tokens before comparing with `words`.
        wordlist = re.sub("[^a-zA-Z]"," ", s).split()
        if set(wordlist).intersection(words):
            kept.append(s)

    # Work on a copy so the row passed in by apply is not mutated, and join with
    # spaces so neighbouring sentences do not run together ("...price.Good...").
    rev = rev.copy()
    rev["Content"] = " ".join(kept)

    return rev

def filterReviewsByAspectWords(data, seeds, aspectName=None):
    """Keep, in every review, only the sentences relevant to one aspect.

    Parameters:
        data: DataFrame of reviews with a "Content" column.
        seeds: dict mapping aspect name -> list of keywords.
        aspectName: key into `seeds` selecting the aspect. When omitted, the
            module-level `aspect` variable is used, as before.

    Returns:
        The result of applying filterReviewSentencesByWords to every row with
        the selected aspect's keywords.
    """
    if aspectName is None:
        aspectName = aspect

    #we have our seeds, now only keep the sentences relevant to that aspect in each review
    aspectWords = set(seeds[aspectName])

    return data.apply(filterReviewSentencesByWords, axis=1, args=(aspectWords,))
    
# Filter the first 9 reviews down to the sentences about the chosen aspect.
# NOTE(review): this rebinds `data` to the filtered 9-row result, so the cell is
# not idempotent and earlier cells see different data if re-run afterwards.
data = filterReviewsByAspectWords(data[0:9], seeds)

In [None]:
# Inspect up to the first 10 rows of `data`.
data.head(10)