In [1]:
import os
import pandas as pd
import re
import numpy as np
import copy

import collections
from operator import itemgetter
import pprint as pp
import re
from nltk.corpus import stopwords
import nltk.data
import time
import proj_base
from gensim import corpora, models
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
#Stemmer defined for stemming words
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
from nltk.tokenize import RegexpTokenizer

tokenizer2 = RegexpTokenizer(r'\w+')



In [2]:
data = proj_base.getStandardData()
data.shape

(1111, 13)

In [3]:
#creating the vocab of all the words

def lintWord(w):
    regex = re.compile(r'(,|\(|\)|!|:|$|\.)')
    w = re.sub(regex, '', w)
    return w


allWords = ""
for r in data["Content"]:
    #add word to big content string
    allWords += r + " "
    
#split the string at spaces, keep only unique
words = set(allWords.split(" "))


vocab = list(set([lintWord(w) for w in words if not w in stopwords.words("english")]))

#n eed to remove stopwords again because some of them may have had punctuation 
# at the end and didnt get caught the first time
vocab = [w for w in vocab if not w in stopwords.words("english") and len(w) > 2]

#vocab


In [4]:
seeds =  {"Value" : ["value", "price", "quality","worth"],
          "Rooms" : ["room", "suite", "view", "bed"],
          "Location" : ["location", "traffic", "minute", "restaurant"],
          "Cleanliness" : ["clean", "dirty", "maintain", "smell"],
          "Check in / front desk": ["stuff", "check", "help", "reservation"],
          "Service" : ["service", "food", "breakfast", "buffet"],
          "Business service" : ["business", "center", "computer", "internet"]
         }
seedsBayes = copy.deepcopy(seeds)
seedsLDA = copy.deepcopy(seeds)
seedsCsq = copy.deepcopy(seeds)

In [5]:
# alternate method to get keywords. LARA method based on Chi-square is basically same, but much better
def aspectSegmentationBayes(reviews, seeds, freq_threshold = .5, prob_threshold = 0.2, words_per_iter = 4, iters = 3):

    #break down reviews into sentences and break down each sentence into words using tokenizer and remove stopwords
    # returns list where each item is the list of words in that sentence
    sentence_words = []
    for review in reviews:
        review = review.decode('utf-8')
        sentences = nltk.tokenize.sent_tokenize(review)
        for sentence in sentences:
            sentence_words.append([x.lower() for x in nltk.tokenize.word_tokenize(sentence) if x not in stopwords.words('english') and len(x) > 2])    
    

    # find Probability(sentence(S) has aspect(A) GIVEN S has word(W)) = count(S that have A and have W) / count(S that have W)

    for i in range(iters):
        
        sents_with_word_asp = {}
        sents_with_word = {}
        sents_with_aspect = {}
        prob_asp_given_word = {}

        # calculates counts of (S that have W) and (S that have A and W)
        for sentence in sentence_words:
            for word in sentence:
                sents_with_word[word] = sents_with_word.get(word,0) + 1
                for aspect, aspect_words in seeds.items():
                    for aspect_word in aspect_words:
                        if aspect_word in sentence:
                            sents_with_word_asp[(word,aspect)] = sents_with_word_asp.get((word, aspect), 0) + 1
                            sents_with_aspect[aspect] = sents_with_aspect.get(aspect,0) + 1
                            break

        for (word, aspect), count in sents_with_word_asp.items():
            #susceptible to low frequencies. hence freq_threshold
            #freq_threshold ensures that count(S with  W) is atleast x% of count(S)
            if sents_with_word[word] > (freq_threshold/100.0)*len(sentence_words):
                prob_asp_given_word[(word,aspect)] = count/float(sents_with_word[word])

        prob_asp_given_word_sorted = sorted(prob_asp_given_word.items(), key=itemgetter(1),reverse=True)
        
        for aspect, word_list in seeds.items():
            count = 0
            for item in prob_asp_given_word_sorted:
                #item is of the form ((word,aspect),probability)
                if item[0][1] == aspect:
                    if item[0][0] not in word_list:
                        if count <= words_per_iter:
                            if item[1] >= prob_threshold:
                                seeds[aspect].append(item[0][0])
                                count += 1
                            else:
                                #because sorted, the others can't have higher probability
                                break
                        else:
                            # because limiit of words per aspect in this iteration has been reached
                            break

    return seeds

In [6]:
sds = aspectSegmentationBayes(data["Content"], seedsBayes)
sds

{'Business service': ['business',
  'center',
  'computer',
  'internet',
  u'access',
  u'free',
  u'monorail',
  u'charge',
  u'bus',
  u'shuttle',
  u'blocks',
  u'easy',
  u'provided',
  u'waterfront',
  u'airport',
  u'pike',
  u'shopping',
  u'away'],
 'Check in / front desk': ['stuff',
  'check',
  'help',
  'reservation',
  u'clerk',
  u'early',
  u'late',
  u'arrived',
  u'morning',
  u'luggage',
  u'called',
  u'hours',
  u'told',
  u'checked',
  u'desk',
  u'said'],
 'Cleanliness': ['clean',
  'dirty',
  'maintain',
  'smell',
  u'shared',
  u'bathrooms',
  u'comfortable',
  u'spacious',
  u'quiet',
  u'beds',
  u'large',
  u'bed',
  u'bathroom',
  u'rooms',
  u'size',
  u'king',
  u'huge',
  u'bath',
  u'small'],
 'Location': ['location',
  'traffic',
  'minute',
  'restaurant',
  u'perfect',
  u'excellent',
  u'great',
  u'convenient',
  u'shopping',
  u'value',
  u'restaurants',
  u'choice',
  u'monorail',
  u'pike',
  u'market',
  u'distance',
  u'shops',
  u'waterfront'

In [7]:


def aspectSegmentationLDA(reviews, aspects, vocab=[], threshold=0, iterationLimit=3):
    #when we have the top chi-squared rated keywords, how many do we take
    keywordsToTake = 3
    
    #bootstrap iterations
    for i in range(0, iterationLimit):
        
        #print our current aspects
        print("begin bootstrapping iteration, aspect keywords: ")
        print(aspects)
        print("\n\n\n")
        labeledSentences = []
        for r in reviews:
            
            #use the pickle tokenizer to split sentences
                 
            sentences = tokenizer.tokenize(r.decode('utf-8'))

                
            for s in sentences:
                
                theseAspects = collections.defaultdict(int)
                maxAspect = (0, "None")
                
                #for each aspect count how many times one of those aspect words appears
                for a in aspects:
                    for word in aspects[a]:
                        if " "+word+" " in s:
                            theseAspects[a] += 1
                
                #find the max occuring aspect for each sentence, take multiple if ties
                for a in theseAspects:
                    if theseAspects[a] > maxAspect[0]:
                        maxAspect = (theseAspects[a], a)
                    if theseAspects[a] == maxAspect[0] and a not in maxAspect:
                        #label it with multiple aspects
                        maxAspect = maxAspect + (a, )
                
                #add the sentence with labels
                labeledSentences.append((s, maxAspect[1:]))
            
        LDAForAspects = collections.defaultdict(list)
        
# obtain dictionary of tokenized sentences corresponding to each aspect
        for a in aspects:
            for s in labeledSentences:
                sentenceText=s[0]
                sentenceAspects=s[1]
                
                if a in sentenceAspects:
                    sentenceTextTokens=tokenizer2.tokenize(sentenceText)
                    sentenceTextTokensNS=[q for q in sentenceTextTokens if not q in stopwords.words("english") and q>2]
                    sentenceTextTokensNS_stemmed = [stemmer.stem(q) for q in sentenceTextTokensNS]
                    LDAForAspects[a].extend(sentenceTextTokensNS)

#Implement LDA to obtain topics
        SentenceWords=[]
        dictionary={}
        corpus={}
        for a in aspects:
            #print(LDAForAspects[a])
            SentenceWords.append(LDAForAspects[a])
            dictionary[a]=corpora.Dictionary([LDAForAspects[a]])
            corpus[a] = [dictionary[a].doc2bow([text]) for text in LDAForAspects[a]]
            #print(a, corpus[a])
            ldamodel = models.ldamodel.LdaModel(corpus[a], num_topics=1, id2word = dictionary[a], passes=20)
            #print( a+":")
            #print(ldamodel.print_topics(num_topics=1, num_words=5))
            X=ldamodel.get_topic_terms(0,5)
            i=0
            while i<len(X):
                temp=list(dictionary[a].token2id.keys())[list(dictionary[a].token2id.values()).index(X[i][0])]
#     Checking for duplicates
                if temp not in aspects[a]:  
                    aspects[a].append(temp)
                i=i+1
            #print aspects[a]
    return labeledSentences


In [8]:
ldaAspects = aspectSegmentationLDA(data["Content"], seedsLDA, vocab)
seedsLDA

begin bootstrapping iteration, aspect keywords: 
{'Service': ['service', 'food', 'breakfast', 'buffet'], 'Business service': ['business', 'center', 'computer', 'internet'], 'Cleanliness': ['clean', 'dirty', 'maintain', 'smell'], 'Check in / front desk': ['stuff', 'check', 'help', 'reservation'], 'Value': ['value', 'price', 'quality', 'worth'], 'Rooms': ['room', 'suite', 'view', 'bed'], 'Location': ['location', 'traffic', 'minute', 'restaurant']}




begin bootstrapping iteration, aspect keywords: 
{'Service': ['service', 'food', 'breakfast', 'buffet', u'room', u'hotel', u'great'], 'Business service': ['business', 'center', 'computer', 'internet', u'hotel', u'seattle'], 'Cleanliness': ['clean', 'dirty', 'maintain', 'smell', u'room', u'hotel', u'rooms', u'staff'], 'Check in / front desk': ['stuff', 'check', 'help', 'reservation', u'hotel', u'room', u'us'], 'Value': ['value', 'price', 'quality', 'worth', u'hotel', u'great', u'location'], 'Rooms': ['room', 'suite', 'view', 'bed', u'hotel',

{'Business service': ['business',
  'center',
  'computer',
  'internet',
  u'hotel',
  u'seattle',
  u'stay',
  u'place',
  u'downtown'],
 'Check in / front desk': ['stuff',
  'check',
  'help',
  'reservation',
  u'hotel',
  u'room',
  u'us',
  u'one',
  u'would'],
 'Cleanliness': ['clean',
  'dirty',
  'maintain',
  'smell',
  u'room',
  u'hotel',
  u'rooms',
  u'staff'],
 'Location': ['location',
  'traffic',
  'minute',
  'restaurant',
  u'hotel',
  u'great',
  u'moore',
  u'room'],
 'Rooms': ['room', 'suite', 'view', 'bed', u'hotel', u'great'],
 'Service': ['service',
  'food',
  'breakfast',
  'buffet',
  u'room',
  u'hotel',
  u'great',
  u'place'],
 'Value': ['value',
  'price',
  'quality',
  'worth',
  u'hotel',
  u'great',
  u'location',
  u'seattle']}

In [9]:

def aspectSegmentationChiSquared(reviews, aspects, vocab=[], threshold=0, iterationLimit=3):
    #when we have the top chi-squared rated keywords, how many do we take
    keywordsToTake = 3
    
    #bootstrap iterations
    for i in range(0, iterationLimit):
        
        #print our current aspects
        print("begin bootstrapping iteration, aspect keywords: ")
        pp.pprint(aspects)
        print("\n\n\n")
        labeledSentences = []
        for r in reviews:
            #use the pickle tokenizer to split sentences
            sentences = tokenizer.tokenize(r)
            
            for s in sentences:
                
                theseAspects = collections.defaultdict(int)
                maxAspect = (0, "None")
                
                #for each aspect count how many times one of those aspect words appears
                for a in aspects:
                    for word in aspects[a]:
                        if " "+word+" " in s:
                            theseAspects[a] += 1
                
                #find the max occuring aspect for each sentence, take multiple if ties
                for a in theseAspects:
                    if theseAspects[a] > maxAspect[0]:
                        maxAspect = (theseAspects[a], a)
                    if theseAspects[a] == maxAspect[0] and a not in maxAspect:
                        #label it with multiple aspects
                        maxAspect = maxAspect + (a, )
                
                #add the sentence with labels
                labeledSentences.append((s, maxAspect[1:]))
            
        
        chiSquaredForAspects = collections.defaultdict(list)
        #calculate chi squared measure for each word in vocab
        """c1 is the number of times w occurs in sentences belonging
            to aspect a_i, c2 is the number of times w occurs
            in sentences not belonging to a_i, c3 is the number of sentences
            of aspect a_i that do not contain w, c4 is the number
            of sentences that neither belong to aspect a_i, nor contain
            word w, and C is the total number of word occurrences"""
        for w in vocab:
            for a in aspects:
                c = 0
                c_1 = 0
                c_2 = 0 
                c_3 = 0
                c_4 = 0

                for s in labeledSentences:
                    sentenceText = s[0]
                    sentenceAspects = s[1]

                    if " "+w+" " in sentenceText and a in sentenceAspects:
                        c_1 += 1
                    elif " "+w+" " in sentenceText and a not in sentenceAspects:
                        c_2 += 1
                    elif a in sentenceAspects and w not in sentenceText:
                        c_3 += 1
                    else:
                        c_4 += 1

                numer = ((1.0 * c_1 * c_4 - 1.0 * c_2 * c_3)**2)
                denom = (1.0*(c_1 + c_3) * 1.0 * (c_2 + c_4) * 1.0 * (c_1 + c_2) * 1.0 * (c_3 + c_4))
                #unreasonable use of 1.0's here to be safe
                if denom != 0:
                    csq = numer / denom
                    chiSquaredForAspects[a].append((w, csq))


        #have the chi squared aspects for each word in vocab, add top kewordsToTake for each aspect
        for a in chiSquaredForAspects:
            #make sure were not taking words we already have
            noDupes = [tup for tup in chiSquaredForAspects[a] if tup[0] not in aspects[a]]
            chiSquaredForAspects[a] = sorted(noDupes, key=itemgetter(1), reverse=True)[0:keywordsToTake]
            for t in chiSquaredForAspects[a]:
                if t[0] not in aspects[a] and t[0] != '':
                    aspects[a].append(t[0])
            
            
        
            #split into sentences
        
        #loop through again
    return seeds

In [10]:
csqSeeds = aspectSegmentationChiSquared(data["Content"], seedsCsq, vocab)

begin bootstrapping iteration, aspect keywords: 
{'Business service': ['business', 'center', 'computer', 'internet'],
 'Check in / front desk': ['stuff', 'check', 'help', 'reservation'],
 'Cleanliness': ['clean', 'dirty', 'maintain', 'smell'],
 'Location': ['location', 'traffic', 'minute', 'restaurant'],
 'Rooms': ['room', 'suite', 'view', 'bed'],
 'Service': ['service', 'food', 'breakfast', 'buffet'],
 'Value': ['value', 'price', 'quality', 'worth']}




begin bootstrapping iteration, aspect keywords: 
{'Business service': ['business',
                      'center',
                      'computer',
                      'internet',
                      'access',
                      'westlake',
                      'wireless'],
 'Check in / front desk': ['stuff',
                           'check',
                           'help',
                           'reservation',
                           'clerk',
                           'told',
                           'later'],

In [None]:
seedsCsq