In [1]:
from collections import Counter, defaultdict
from pprint import pprint
import os
import re
import string
import subprocess

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from spacy.en import English
import numpy as np
import requests

# module-level spaCy English pipeline; ReviewSents calls it to parse reviews and
# the aspect helpers read parser.vocab for word probabilities
parser = English()

In [2]:
# dependency labels not likely to belong to words found in features
# (the empty label '' is deliberately part of the set)
com_dep = {
    '', 'agent', 'aux', 'auxpass', 'case', 'cc', 'det', 'expl', 'intj',
    'mark', 'meta', 'neg', 'nummod', 'preconj', 'predet', 'prt', 'punct',
    'quantmod',
}

# POS tags not likely to belong to words found in features
com_tag = {
    ',', '.', ':', '``', "''", '-LRB-', '-RRB-', 'HYPH', 'SYM', 'NFP',
    'SP', 'XX',
    'IN', 'PRP', 'PRP$', 'DT', 'TO', 'CC', 'CD', 'MD', 'WDT', 'RP', 'WRB',
    'WP', 'POS', 'EX', 'PDT', 'UH',
}

# POS tags treated as nouns
# NOTE(review): 'NNPS' is not listed here - confirm that is intended
noun_tag = {'NN', 'NNP', 'NNS'}

In [3]:
# Loosely adapted from Nic Schrading
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

# A custom stoplist: NLTK and scikit-learn English stop words plus a few
# contraction fragments
STOPLIST = set(stopwords.words('english') +
               ["n't", "'s", "'m", "ca", "'re"] +
               list(ENGLISH_STOP_WORDS))

# List of symbols we don't care about: every punctuation character plus a
# few multi-character markers and the empty string
SYMBOLS = list(string.punctuation)
SYMBOLS += ["-----", "---", "...", "“", "”", "'ve", ""]

# Full set of stops: the union of both collections. The symmetric
# difference (^) would drop any entry present in both of them.
STOPS = STOPLIST | set(SYMBOLS)

##### Scraping functions

In [4]:
def get_id(url):
    '''
    INPUT: str
    OUTPUT: str

    gets asin identifier for amazon product from a url

    The 10-character id that directly follows a /dp/, /gp/product/ or
    /product-reviews/ path marker is preferred. When no such marker is
    present, the positional heuristic is used: the second-to-last path
    segment if it is 10 characters long, otherwise the first 10
    characters of the last segment.
    '''
    # marker-based lookup is independent of how many segments or query
    # parameters follow the id
    marked = re.search(
        r'/(?:dp|gp/product|product-reviews)/([A-Za-z0-9]{10})(?=[/?#]|$)',
        url)
    if marked:
        return marked.group(1)

    # url format: https://www.amazon.com/.../.../id/...
    segments = re.findall(r'(?<=/)[^/]*', url)
    asin = segments[-2]

    if len(asin) != 10:
        # url format https://www.amazon.com/.../id
        asin = segments[-1][:10]

    return asin


def extract(asin):
    '''
    INPUT: str
    OUTPUT: list(int), list(str)

    extracts the star rating and review text from the directory of
    amazon html files stored under reviews/com/<asin>/

    Pages are read in sorted filename order so repeated runs return the
    reviews in the same order. A page without review blocks is skipped.
    '''
    ratings = []
    reviews = []

    path = 'reviews/com/{}/'.format(asin)
    pages = sorted(name for name in os.listdir(path)
                   if name.endswith('.html'))

    for page in pages:
        # the context manager closes the file handle once the page is parsed
        with open(os.path.join(path, page), 'r') as html:
            soup = BeautifulSoup(html, 'html.parser')

        tags = soup.findAll("div", {"class": "a-section review"})

        if not tags:
            # filename order is not guaranteed to be page order, so one
            # empty page must not hide the pages that come after it
            continue

        for tag in tags:
            # first character of the <i> element's text is used as the rating
            rating = int(tag.find('i').text[0])
            review = tag.findAll("span",
                                 {"class": "a-size-base review-text"})[0].text
            ratings.append(rating)
            reviews.append(review)

    return ratings, reviews

In [5]:
class Loader(object):
    '''
    class for scraping a review site on Amazon
    '''

    def __init__(self, freq_thresh=5, aspect_thresh=8):
        '''
        INPUT: int, int
        OUTPUT: None

        freq_thresh = num of reviews aspect has to appear in product
        aspect_thresh = how strict to filter a word from aspect consideration
        '''
        self.aspect_thresh = aspect_thresh
        self.freq_thresh = freq_thresh

    def scrape(self, n_reviews=100):
        '''
        INPUT: int
        OUTPUT: None

        prompts for the url of an amazon product and, unless the folder
        reviews/com/<asin> already exists, runs the crawler asking for
        n_reviews reviews. The extracted data is then stored on the
        instance as self.asin, self.ratings and self.reviews.
        '''
        url = raw_input('url of amazon product: ')
        asin = get_id(url)
        # keep the id on this instance; assigning to the notebook global
        # `doc1` only worked when the instance was bound to that name
        self.asin = asin

        folder = os.path.join(os.getcwd(), 'reviews', 'com', asin)

        if not os.path.isdir(folder):
            # Run Amazon scraper
            # Credit to Andrea Esuli
            # https://github.com/aesuli/amadown2py
            # An argument list (no shell) is used so that text taken from
            # the user-supplied url cannot be interpreted as shell syntax
            subprocess.call(['python', 'amazon_crawler.py',
                             '-d', 'com', asin,
                             '-m', str(n_reviews),
                             '-o', 'reviews'])

        ratings, reviews = extract(asin)

        self.ratings = ratings
        self.reviews = reviews

##### Code for classing review objects

In [6]:
class SentCustomProperties(object):
    '''
    wraps a spacy sentence and records:
        review_idx: index of the review where the sentence originated
        sent_idx: index of the sentence as supplied by the caller
        start_idx: the .i value of the first token of the sentence
        words: num of tokens in the sentence
    '''

    def __init__(self, review_idx, sent_idx, sent):
        '''
        INPUT: int, int, spacy sentence (spacy.tokens.span.Span)
        OUTPUT: None
        '''
        first_token = sent[0]

        self.sent = sent
        self.words = len(sent)
        # subtracted from a token's .i elsewhere to get its position
        # within the sentence
        self.start_idx = first_token.i
        self.review_idx, self.sent_idx = review_idx, sent_idx

In [7]:
class ReviewSents(object):
    '''
    Parses a list of unicode reviews with spacy and keeps every sentence
    (wrapped in SentCustomProperties) on the instance
    '''

    def __init__(self, reviews):
        '''
        INPUT: List of unicode reviews
        OUTPUT: None
        '''
        self.reviews = reviews
        parsed = self._parse_sentences()
        self.n_reviews, self.n_sent, self.sentences = parsed

    def _parse_sentences(self):
        '''
        INPUT: None
        OUTPUT: int, int, list(SentCustomProperties)

        Runs the module-level spacy parser over each review and splits it
        into sentences. Returns the number of reviews that parsed
        successfully, the number of sentences kept, and the sentences.
        A review whose parse raises AssertionError is reported and skipped.
        '''
        collected = []
        parsed_count = 0

        for review_idx, text in enumerate(self.reviews):
            try:
                doc = parser(text)
            except AssertionError:
                print('parser for review #{} failed'.format(review_idx))
                continue

            parsed_count += 1

            for span in doc.sents:
                if not span.string:
                    continue
                # the running sentence index equals the number collected so far
                collected.append(
                    SentCustomProperties(review_idx, len(collected), span))

        return parsed_count, len(collected), collected

### Hu and Liu Workflow

In [8]:
# scrape() prompts for the product url; 1000 is passed on as n_reviews
doc1 = Loader()
doc1.scrape(1000)

url of amazon product: https://www.amazon.com/Samsung-UN40J5200-40-Inch-1080p-Smart/dp/B00WR292JE/ref=sr_1_6?ie=UTF8&qid=1470191366&sr=8-6&keywords=television+samsung


In [9]:
# parse every scraped review with spaCy and collect its sentences
corpus = ReviewSents(doc1.reviews)

parser for review #304 failed


3.1 POS Tagging

In [10]:
### No script needed, just use the token.tag_ attribute in spaCy after parsing sentence

3.2 Frequent Feature Identification

In [11]:
def _iter_nps(sent):
    '''
    INPUT: spacy sentence (spacy.tokens.span.Span)
    OUTPUT: str

    Collects the distinct lemmas of all noun tokens (tags in noun_tag)
    whose vocabulary prob is below -7.5 and returns them joined by single
    spaces. The order of the lemmas in the string is not defined.

    this function requires parser to be defined as:
        from spacy.en import English
        parser = English()
    '''
    lemmas = set()

    for tok in sent:
        word_prob = parser.vocab[tok.lemma].prob
        if tok.tag_ not in noun_tag or not word_prob < -7.5:
            continue
        lemmas.add(tok.lemma_)

    return " ".join(lemmas)

def candidate_unigrams(corpus, min_pct=0.01):
    '''
    INPUT: ReviewSents, float
    OUTPUT: set

    obtains a set of candidate unigrams
    each candidate unigram must be a noun (see _iter_nps) whose total
        count over all sentences is at least min_pct * corpus.n_sent

    the nouns are counted with a default CountVectorizer, so the returned
    strings are the vectorizer's feature names
    '''
    count_X = [_iter_nps(sent.sent) for sent in corpus.sentences]

    cnt_vec = CountVectorizer()
    freq = cnt_vec.fit_transform(count_X)

    # sum the columns of the sparse matrix directly; calling toarray()
    # first would allocate a dense n_sentences x n_features array
    total_count = np.asarray(freq.sum(axis=0)).ravel()
    filter_ = total_count >= min_pct * corpus.n_sent

    features = np.array(cnt_vec.get_feature_names())
    return set(features[filter_])

In [12]:
# candidate single-word aspects, using the default min_pct of 0.01
unigrams = candidate_unigrams(corpus)

##### Associative mining with compactness pruning

In [13]:
def _get_compactness_feat(corpus):
    '''
    INPUT: ReviewSents
    OUTPUT: generator(tuples)

    For every sentence, yields each distinct alphabetically sorted pair
    of lemmas made of:
        a noun (tag in noun_tag)
        a second word within +/- 3 tokens of that noun
    The second word is skipped when its vocabulary prob is above -7.5 or
    when its dependency / tag is in com_dep / com_tag. A pair is yielded
    at most once per sentence.

    this function requires parser to be defined as:
        from spacy.en import English
        parser = English()
    '''

    for sentence in corpus.sentences:
        pairs = set()
        span = sentence.sent

        for pos, noun in enumerate(span):
            if noun.tag_ not in noun_tag:
                continue

            lo, hi = max(0, pos-3), min(pos+4, sentence.words)
            window = np.array(span[lo:hi])
            # drop the noun itself from its own window
            neighbours = window[window != noun]

            for other in neighbours:
                too_common = parser.vocab[other.lemma].prob > -7.5
                if too_common or other.dep_ in com_dep or other.tag_ in com_tag:
                    continue
                pairs.add(tuple(sorted([other.lemma_, noun.lemma_])))

        for pair in pairs:
            yield pair
                            
def candidate_bigrams(corpus, min_pct=0.005):
    '''
    INPUT: ReviewSents, float
    OUTPUT: set(tuples), set(str)

    Counts the pairs produced by _get_compactness_feat and keeps those
    counted at least max(2, min_pct * corpus.n_sent) times.
    Returns the kept pairs and the set of words that occur in them.
    '''
    pair_counts = Counter(_get_compactness_feat(corpus))
    threshold = max(2, min_pct * corpus.n_sent)

    bigrams = set(pair for pair, count in pair_counts.iteritems()
                  if count >= threshold)
    bigram_words = set(word for pair in bigrams for word in pair)

    return bigrams, bigram_words

In [14]:
# candidate two-word aspects and the set of words they are built from
bigrams, bigram_words = candidate_bigrams(corpus)

3.3 Opinion Word Extraction

In [15]:
def _check_bigram(adj, token, sent, bigrams, bigram_words):
    '''
    INPUT: spacy token (spacy.tokens.token.Token),
           spacy token (spacy.tokens.token.Token),
           SentCustomProperties,
           set(tuples),
           set(str)
    OUTPUT: generator(tuples)

    Given an adjective token, the token it modifies and their sentence,
    looks at the words within +/- 3 tokens of the modified token and
    yields (adjective lemma, bigram tuple, ordering array, token index
    within the sentence) for every candidate bigram found.

    When the adjective is itself part of the bigram, the first other
    adjective near the aspect is reported instead; the bigram is skipped
    if there is none.

    If no bigram is yielded, yields the unigram with ordering [1, 0].
    '''
    adjective_tags = ['JJ', 'JJR', 'JJS']
    found_bigram = False
    used_lemmas = []

    # positions of both tokens relative to the start of the sentence
    offset = sent.start_idx
    t_idx = token.i - offset
    a_idx = adj.i - offset

    window = sent.sent[max(0, t_idx-3):min(t_idx+4, sent.words)]

    for neighbour in window:
        lemma = neighbour.lemma_

        if neighbour == token or lemma in used_lemmas:
            continue
        if lemma not in bigram_words:
            continue

        pair = tuple(sorted([lemma, token.lemma_]))

        if adj.lemma_ in pair:
            # the adjective belongs to the bigram: search the span that
            # covers +/- 3 tokens around both the adjective and the
            # aspect for adjectives outside of the bigram
            lo = max(0, min(a_idx-3, t_idx-3))
            hi = min(sent.words, max(a_idx+4, t_idx+4))
            candidates = [word.lemma_ for word in sent.sent[lo:hi]
                          if word.tag_ in adjective_tags
                          and word.lemma_ not in pair]
        else:
            candidates = [adj.lemma_]

        if pair not in bigrams or not candidates:
            continue

        found_bigram = True
        used_lemmas.append(lemma)

        # records whether the neighbouring word is the alphabetically
        # first member of the pair
        if lemma == pair[0]:
            ordering = [1, 0]
        else:
            ordering = [0, 1]

        yield candidates[0], pair, np.array(ordering), t_idx

    if not found_bigram:
        yield adj.lemma_, token.lemma_, np.array([1, 0]), t_idx

def extract_aspects(corpus, unigrams, bigrams, bigram_words):
    '''
    INPUT: ReviewSents, set(str), set(tuple), set(str)
    OUTPUT: dictionary

    Extracts all aspects modified by an 'amod' dependency and returns
    dictionary of all aspects within candidate unigram and bigram aspects

    Each value holds the lists 'adjectives', 'aspect_idx', 'review_idx'
    and 'sentence_idx' (one item per match), the summed 'ordering' array,
    and the counts 'review_count' / 'sent_count' of distinct reviews and
    sentences.

    The returned object is a defaultdict, so looking up an unknown aspect
    creates an empty entry. That entry carries zero counts, which keeps
    later readers of 'review_count' from failing with a KeyError.
    '''
    props = defaultdict(lambda:
                    {'adjectives': [],
                     'aspect_idx': [],
                     'ordering': np.array([0, 0]),
                     'review_idx': [],
                     'sentence_idx': [],
                     'review_count': 0,
                     'sent_count': 0})

    for sent in corpus.sentences:

        for token in sent.sent:
            # only adjectival modifiers of a candidate unigram are of interest
            if token.head.lemma_ not in unigrams or token.dep_ != 'amod':
                continue

            aspects = _check_bigram(token, token.head, sent,
                                    bigrams, bigram_words)

            for adj, aspect, ordering, index in aspects:
                entry = props[aspect]
                entry['adjectives'].append(adj)
                entry['aspect_idx'].append(index)
                entry['review_idx'].append(sent.review_idx)
                entry['sentence_idx'].append(sent.sent_idx)
                entry['ordering'] += ordering

    for entry in props.values():
        entry['review_count'] = len(set(entry['review_idx']))
        entry['sent_count'] = len(set(entry['sentence_idx']))

    return props

In [16]:
def sort_aspect_frequency(aspect_dict, n=3):
    '''
    INPUT: dict, int
    OUTPUT: list([aspect, freq])

    Lists the aspects that occur in more than n reviews ('review_count'),
    most frequent first. Bigram aspects are turned into a single string
    whose word order follows the 'ordering' counts of the aspect.
    Dictionary returned from extract_aspects is the input of this function.
    '''
    ranked = []

    for aspect in aspect_dict:
        info = aspect_dict[aspect]
        n_reviews = info['review_count']

        if n_reviews <= n:
            continue

        label = aspect
        if isinstance(aspect, tuple):
            # reverse the alphabetical order when the second counter wins
            flip = info['ordering'][1] > info['ordering'][0]
            label = " ".join(sorted(aspect, reverse=flip))

        ranked.append([label, n_reviews])

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [17]:
def print_sentence_frag(corpus, aspect_dict, aspect, adjective=None):
    '''
    INPUT: ReviewSents, dict, string, str(optional)
    OUTPUT: None
    
    Script that prints out sentence fragment containing adjective
    describing specified aspect.
    
    If no adjective specified, prints all adjectives in descending
    order of occurance.
    '''
    print '-'*40
    print aspect
    print '-'*40
    print

    aspect_list = aspect.split(" ")
    
    if len(aspect_list) == 2:
        aspect = tuple(sorted(aspect_list))
    
    if not adjective:
        adjectives = Counter(aspect_dict[aspect]['adjectives']).most_common()
        
        for (word, freq) in adjectives:
            if not word.isalpha():
                continue
            
            print freq, "\t", word
            print '-'*40
            
            mask = np.array(aspect_dict[aspect]['adjectives']) == word
            aspect_idx = np.array(aspect_dict[aspect]['aspect_idx'])[mask]
            sent_idx = np.array(aspect_dict[aspect]['sentence_idx'])[mask]
            
            for i, (a_idx, s_idx) in enumerate(zip(aspect_idx, sent_idx)):
                sent = corpus.sentences[s_idx]
                print i, sent.sent[max(0, a_idx-10):min(a_idx+11, sent.words)]
                
            print
            
    else:
        print adjective
        print '-'*40

        mask = np.array(aspect_dict[aspect]['adjectives']) == adjective
        aspect_idx = np.array(aspect_dict[aspect]['aspect_idx'])[mask]
        sent_idx = np.array(aspect_dict[aspect]['sentence_idx'])[mask]

        for i, (a_idx, s_idx) in enumerate(zip(aspect_idx, sent_idx)):
            sent = corpus.sentences[s_idx]
            print i, sent.sent[max(0, a_idx-10):min(a_idx+11, sent.words)]

        print

In [18]:
# aspect -> adjectives / positions / counts for every 'amod' match
results = extract_aspects(corpus, unigrams, bigrams, bigram_words)

In [19]:
# aspects ranked by the number of reviews they occur in (default n=3)
sort_aspect_frequency(results)

[[u'tv', 233],
 [u'picture', 134],
 [u'price', 56],
 [u'quality', 54],
 [u'screen', 53],
 [u'smart tv', 47],
 [u'problem', 43],
 [u'apps', 42],
 [u'picture quality', 41],
 [u'issue', 40],
 [u'product', 40],
 [u'size', 39],
 [u'feature', 35],
 [u'review', 33],
 [u'samsung tv', 33],
 [u'sound picture', 32],
 [u'device', 32],
 [u'speaker', 31],
 [u'hub', 31],
 [u'model', 30],
 [u'sound', 29],
 [u'price tv', 25],
 [u'color', 25],
 [u'sound quality', 25],
 [u'room', 25],
 [u'smart feature', 23],
 [u'minute', 23],
 [u'tv feature', 22],
 [u'purchase', 22],
 [u'set', 21],
 [u'service', 21],
 [u'remote', 21],
 [u'television', 19],
 [u'box', 18],
 [u'connection', 18],
 [u'samsung', 17],
 [u'stand', 17],
 [u'button', 16],
 [u'nice picture', 15],
 [u'year', 13],
 [u'network', 12],
 [u'cable', 11],
 [u'smart hub', 11],
 [u'buy tv', 10],
 [u'wifi', 9],
 [u'second', 9],
 [u'old tv', 9],
 [u'movie', 8],
 [u'internet', 8],
 [u'inch', 8],
 [u'app', 7],
 [u'bedroom', 7],
 [u'star', 7],
 [u'video', 6],
 [

In [20]:
# no adjective given: list every adjective found for the aspect
print_sentence_frag(corpus, results, 'picture quality')

----------------------------------------
picture quality
----------------------------------------

10 	great
----------------------------------------
0 another great bonus, I can watch everything in great picture quality and sound without the need of having another cable
1 But the great picture quality is overshadowed by slow, sometimes non-responsive
2 Great picture quality, but the smart features are pretty lackluster.  
3 Has a great picture quality as well.
4 Awesome budget TV. Great quality picture, which is pretty standard for Samsung.  
5 Great picture and sound quality.
6 Great picture quality, easy to use.
7 great picture quality, everything I expected from this brand of TV and
8 Great TV, So many features, great picture quality.
9 Great picture quality only gripe is the amount of time its smarthub needs

7 	good
----------------------------------------
0 This one has the same good picture quality as my previous units, but I must say that
1 The Good: Nice size, good picture qu

3.4 Orientation Identification for Opinion Words

### Code for analysis of SpaCy attributes

In [21]:
# code to test how words rank within spacy's prob attribute
# prob is negated so that the ascending sort starts with the highest prob

output = set([(token.prob*-1, token.string.strip())
              for sent in corpus.sentences
              for token in sent.sent])

sorted(output)

[(3.0678977966308594, u'.'),
 (3.4549596309661865, u','),
 (3.528766632080078, u'the'),
 (3.791565179824829, u'I'),
 (3.8560216426849365, u'to'),
 (3.92978835105896, u'a'),
 (4.113108158111572, u'and'),
 (4.27587366104126, u'of'),
 (4.373791217803955, u'you'),
 (4.388050079345703, u'it'),
 (4.457748889923096, u'is'),
 (4.464504718780518, u'that'),
 (4.619071960449219, u'in'),
 (4.830559253692627, u"'s"),
 (4.859938621520996, u"n't"),
 (4.8801093101501465, u'for'),
 (5.02677583694458, u'"'),
 (5.05924654006958, u'?'),
 (5.129165172576904, u''),
 (5.156484603881836, u'have'),
 (5.172736167907715, u'on'),
 (5.1977410316467285, u'*'),
 (5.197994232177734, u')'),
 (5.225094318389893, u'be'),
 (5.243249893188477, u'with'),
 (5.246996879577637, u'do'),
 (5.252320289611816, u'was'),
 (5.271068096160889, u'are'),
 (5.332601070404053, u'not'),
 (5.3419694900512695, u'but'),
 (5.359641075134277, u'!'),
 (5.36181640625, u'this'),
 (5.438112258911133, u'['),
 (5.468655109405518, u'-'),
 (5.49164295

In [22]:
# dependency label of every token in the corpus, most frequent first
dep = [token.dep_ for sent in corpus.sentences for token in sent.sent]

Counter(dep).most_common()

[(u'punct', 10489),
 (u'nsubj', 9127),
 (u'det', 8660),
 (u'prep', 6693),
 (u'pobj', 6067),
 (u'advmod', 6025),
 (u'ROOT', 5793),
 (u'dobj', 5340),
 (u'amod', 4712),
 (u'aux', 4504),
 (u'conj', 3917),
 (u'cc', 3483),
 (u'compound', 3390),
 (u'acomp', 2010),
 (u'advcl', 1844),
 (u'', 1550),
 (u'ccomp', 1535),
 (u'xcomp', 1477),
 (u'neg', 1436),
 (u'mark', 1396),
 (u'poss', 1311),
 (u'nummod', 1058),
 (u'attr', 961),
 (u'relcl', 916),
 (u'prt', 823),
 (u'npadvmod', 573),
 (u'pcomp', 493),
 (u'auxpass', 398),
 (u'appos', 332),
 (u'nsubjpass', 320),
 (u'nmod', 304),
 (u'acl', 302),
 (u'quantmod', 176),
 (u'dep', 170),
 (u'case', 157),
 (u'expl', 152),
 (u'intj', 125),
 (u'csubj', 106),
 (u'dative', 91),
 (u'predet', 89),
 (u'oprd', 80),
 (u'parataxis', 79),
 (u'preconj', 41),
 (u'agent', 38),
 (u'meta', 24),
 (u'csubjpass', 1)]

In [23]:
# tag_ value of every token in the corpus, most frequent first
pos = [token.tag_ for sent in corpus.sentences for token in sent.sent]

Counter(pos).most_common()

[(u'NN', 13468),
 (u'DT', 9520),
 (u'IN', 8485),
 (u'JJ', 6883),
 (u'RB', 6707),
 (u'PRP', 6480),
 (u'.', 5446),
 (u'VB', 4429),
 (u'VBZ', 3537),
 (u'CC', 3511),
 (u'NNP', 3454),
 (u',', 2956),
 (u'NNS', 2800),
 (u'VBD', 2717),
 (u'VBP', 2533),
 (u'TO', 1818),
 (u'VBG', 1628),
 (u'SP', 1550),
 (u'CD', 1366),
 (u'VBN', 1354),
 (u'MD', 1268),
 (u'PRP$', 1220),
 (u'RP', 793),
 (u'WDT', 443),
 (u'JJR', 442),
 (u'WRB', 370),
 (u'-LRB-', 361),
 (u'``', 347),
 (u'-RRB-', 343),
 (u':', 341),
 (u"''", 279),
 (u'HYPH', 277),
 (u'WP', 202),
 (u'JJS', 152),
 (u'EX', 151),
 (u'POS', 150),
 (u'UH', 150),
 (u'RBR', 122),
 (u'NFP', 108),
 (u'PDT', 101),
 (u'$', 75),
 (u'FW', 60),
 (u'SYM', 53),
 (u'RBS', 38),
 (u'XX', 33),
 (u'LS', 17),
 (u'NNPS', 14),
 (u'ADD', 10),
 (u'AFX', 6)]