In [119]:
# 2016/08/03 6:17PM
# No filters for amod score in this notebook

# My Code

In [76]:
from __future__ import division
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pprint import pprint
from spacy.en import English
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import datetime
import numpy as np
import pandas as pd
import os
import re
import requests
import string
import time

parser = English()

In [98]:
# list of dependencies not likely to be words found in features
com_dep = set(['det', 'aux', 'cc', 'punct', 'mark', '', 'neg', 'nummod',
               'prt', 'auxpass', 'case', 'expl', 'preconj', 'intj',
               'predet', 'meta', 'quantmod', 'agent'])

# list of POS tags not likely to be words found in features
com_tag = set(['IN', 'PRP', 'PRP$', 'DT', 'HYPH', 'TO', ',', '.', 'CC',
               'SP', 'CD', 'MD', 'WDT', 'RP', 'WRB', '-LRB-', '-RRB-',
               ':', 'WP', 'POS', '``', "''", 'SYM', 'EX', 'PDT', 'UH',
               'NFP', 'XX'])

# list of POS tag belonging to nouns
noun_tag = set(['NN', 'NNP', 'NNS'])

In [118]:
# stopword aspects to filter out
nonaspects = set(['product', 'price', 'problem', 'device', 'review',
                 'item', 'amazon', 'everything', 'company', 'brand',
                 'buy', 'purchase', 'cost'])

# stopword adjectives to filter out
nonadj = set(['other', 'first', 'second', 'third', 'much'])

In [100]:
class SentCustomProperties(object):
    '''
    adds properties to spacy sentence that tracks:
        index of review where sentence originated
        spacy Span object
        index of sentence within review corpus
        index of first token in sentence within review
        num of words in sentences
    '''
    
    def __init__(self, review_idx, sent_idx, sent):
        '''
        INPUT: int, int, spacy sentence (spacy.tokens.span.Span)
        OUTPUT: None
        '''
        self.review_idx = review_idx
        self.sent = sent
        self.sent_idx = sent_idx
        self.start_idx = sent[0].i
        self.words = len(sent)

In [101]:
class ReviewSents(object):
    '''
    Takes a list of unicode reviews and stores the sentences
    (with additional properties) in the returned object
    '''
    
    def __init__(self, reviews):
        '''
        INPUT: list(unicode)
        OUTPUT: None
        '''
        self.reviews = reviews
        self.n_reviews, self.n_sent, self.sentences = self._parse_sentences()
        
    def _parse_sentences(self):
        '''
        INPUT: None
        OUTPUT: int, int, list(SentCustomProperties)
        
        Uses spacy to parse and split the sentences
        Return number of reviews, sentences, and list of spacy objects
        '''
        n_sent, n_reviews = 0, 0
        sentences = []
        
        for i, review in enumerate(self.reviews):
            try:
                review = parser(review)
                n_reviews += 1
            except AssertionError:
                print 'parser for review #{} failed'.format(i)
                continue
            
            for sent in review.sents:
                if sent.string:
                    sentences.append(SentCustomProperties(i, n_sent, sent))
                    n_sent += 1
                
        return n_reviews, n_sent, sentences

In [102]:
def _iter_nps(sent):
    '''
    INPUT: spacy sentence (spacy.tokens.span.Span)
    OUTPUT: set
    
    Iterates through each token of spacy sentence and collects
    lemmas of all nouns into a set
    
    this function requires parser to be defined as:
        from spacy.en import English
        parser = English()
    '''
    wordset = set()
    
    for token in sent:
        root = parser.vocab[token.lemma].prob
        # filter to only consider nouns
        if token.tag_ in noun_tag and (root < -7.5 and
                                       token.lemma_ not in nonaspects):
            wordset.add(token.lemma_)
    
    return " ".join(wordset)

def candidate_unigrams(corpus, min_pct=0.01):
    '''
    INPUT: ReviewSents, float
    OUTPUT: set
    
    obtains a set of candidate unigrams
    each candidate unigram must be a noun and must appear in at least
        a percentage of the sentences specified by min_pct
    '''
    count_X = []

    for sent in corpus.sentences:
        count_X.append(_iter_nps(sent.sent))
        
    cnt_vec = CountVectorizer()
    freq = cnt_vec.fit_transform(count_X)
    
    total_count = freq.toarray().sum(axis=0)
    filter_ = total_count >= min_pct * corpus.n_sent

    features = np.array(cnt_vec.get_feature_names())
    return set(features[filter_])

In [103]:
def _get_compactness_feat(corpus):
    '''
    INPUT: ReviewSents
    OUTPUT: generator(tuples(unicode))
    
    outputs generator of tuples consisting of:
        at least one noun
        a second word within +/- 3 words of noun
    excludes dependencies and tags not likely to be a feature word
    
    this function requires parser to be defined as:
        from spacy.en import English
        parser = English()
    '''

    for sent in corpus.sentences:
        output = set()

        for i, token in enumerate(sent.sent):
            # one word in bigram must be noun
            if token.tag_ not in noun_tag or token.lemma_ in nonaspects:
                continue
            else:
                arr = sent.sent[max(0, i-3):min(i+4, sent.words)]
                arr = np.array(arr)
                arr = arr[arr != token]

                for item in arr:
                    root = parser.vocab[item.lemma].prob
                    # filter out unlikely features
                    if root > -7.5 or (item.dep_ in com_dep or
                                       item.tag_ in com_tag or
                                       item.lemma_ in nonaspects):
                        continue
                    else:
                        tup = tuple(sorted([item.lemma_, token.lemma_]))
                        output.add(tup)

        if output:
            for element in output:
                yield element
                            
def candidate_bigrams(corpus, min_pct=0.005):
    '''
    INPUT: ReviewSents, float
    OUTPUT: set(tuples), set(str)
    
    outputs set of tuples and set of words within tuples from 
        _get_compactness_feat function appearing at least 
        min_cnt times
    '''
    bigrams = set()
    bigram_words = set()
    
    feats = Counter(_get_compactness_feat(corpus))
    
    for (key, val) in feats.iteritems():
        if val >= max(2, min_pct * corpus.n_sent):
            bigrams.add(key)
            bigram_words.update(set(key))
            
    return bigrams, bigram_words

In [104]:
def _check_bigram(adj, token, sent, bigrams, bigram_words):
    '''
    INPUT: spacy token (spacy.tokens.token.Token),
           spacy token (spacy.tokens.token.Token),
           SentCustomProperties,
           set(tuples),
           set(str)
    OUTPUT: generator(tuples)
    
    Given an adjective token, dependency word token, a sentence, and 
        token index within sentence, check if the token has any 
        associated bigram within +/- 3 words and yield all bigrams. 
    
    If no bigrams, yield unigram.
    '''
    yielded = False
    seen_words = []
    t_idx = token.i - sent.start_idx
    a_idx = adj.i - sent.start_idx
    
    arr = sent.sent[max(0, t_idx-3):min(t_idx+4, sent.words)]
    
    for item in arr:
        if item == token or (item.lemma_ in seen_words or
                             item.lemma_ in nonaspects):
            continue
        elif item.lemma_ in bigram_words:
            tup = tuple(sorted([item.lemma_, token.lemma_]))
            adjectives = [adj]
            
            if adj.lemma_ in tup:
                # try finding first adjective around aspect if
                #     bigram includes adjective
                # return nothing if no other adjectives found
                left = max(0, min(a_idx-3, t_idx-3))
                right = min(sent.words, max(a_idx+4, t_idx+4))
                bi_arr = sent.sent[left:right]

                adjectives = [word for word in bi_arr 
                              if word.tag_ in ['JJ', 'JJR', 'JJS']
                              and word.lemma_ not in tup
                              and word.lemma_ not in nonadj]
                
            if tup in bigrams and adjectives:
                yielded = True
                seen_words.append(item.lemma_)
                # for checking which word in bigram is likely to appear first
                ordering = [1, 0] if item.lemma_ == tup[0] else [0, 1]
                a_idx = adjectives[0].i - sent.start_idx
                
                yield (adjectives[0].lemma_, tup, 
                       np.array(ordering), t_idx, a_idx)
                
    if not yielded:
        yield (adj.lemma_, token.lemma_, np.array([1, 0]), t_idx, a_idx)

def extract_aspects(corpus, unigrams, bigrams, bigram_words):
    '''
    INPUT: ReviewSents, set(str), set(tuple), set(str)
    OUTPUT: dictionary
    
    Extracts all aspects modified by an 'amod' dependency and returns
    dictionary of all aspects within candidate unigram and bigram aspects
    
    relation_from: list of words that aspect has dependency relation from
    relation_dep: list of dependencies types aspect has relationship from
    
    adjectives: word modifying the aspect
    adj_idx: list of where adjective can be found within sentence
    aspect_idx: list of where aspect can be found within sentence
    ordering: if bigram, count of which word appears first
    review_idx: list of which reviews contain aspct
    sentence_idx: list of which sentences contain aspect
    
    amod_pct: percentage of time aspect has 'amod' dependency relation
    review_count: how many reviews aspects are found in
    sentence_count: how many sentences aspects are found in
    '''
    props = defaultdict(lambda:
                    {'adjectives': [],
                     'adj_idx': [],
                     'aspect_idx': [],
                     'ordering': np.array([0, 0]),
                     'relation_from': [],
                     'relation_dep': [],
                     'review_idx': [],
                     'sentence_idx': []})
    
    for sent in corpus.sentences:
        
        for token in sent.sent:
            # code to check frequency of adjective for aspect
            if token.head.lemma_ in unigrams:
                props[token.head.lemma_]['relation_from'].append(token.lemma_)
                props[token.head.lemma_]['relation_dep'].append(token.dep_)
            
            if token.head.lemma_ not in unigrams or token.lemma_ in nonadj:
                continue
            
            elif token.dep_ == 'amod':
                aspects = _check_bigram(token, token.head, sent, 
                                        bigrams, bigram_words)
                
                for adj, aspect, ordering, asp_i, adj_i in aspects:
                    props[aspect]['adjectives'].append(adj)
                    props[aspect]['adj_idx'].append(adj_i)
                    props[aspect]['aspect_idx'].append(asp_i)
                    props[aspect]['review_idx'].append(sent.review_idx)
                    props[aspect]['sentence_idx'].append(sent.sent_idx)
                    props[aspect]['ordering'] += ordering
    
    for key in props:
        props[key]['review_count'] = len(set(props[key]['review_idx']))
        props[key]['sent_count'] = len(set(props[key]['sentence_idx']))
        
        if isinstance(key, unicode):
            arr = np.array(props[key]['relation_dep']) == 'amod'
            props[key]['amod_pct'] = np.sum(arr)/len(arr)
    
    # determining bigram amod percentage requires knowing unigram amod
    for key in props:        
        if isinstance(key, tuple):
            reverse = props[key]['ordering'][1] > props[key]['ordering'][0]
            word1 = key[reverse]
            word2_key = key[not reverse]
            
            arr = np.array(props[word2_key]['relation_from']) == word1
            props[key]['amod_pct'] = np.sum(arr)/len(arr)
        
    return props

In [105]:
def sort_aspect_frequency(aspect_dict, n=3):
    '''
    INPUT: dict, int
    OUTPUT: list([aspect, freq])
    
    Outputs a list of the most common aspect occuring at least n times in descending order.
    Dictionary returned from extract_aspects is the input of this function.
    '''
    counts = []

    for key in aspect_dict:

        freq = aspect_dict[key]['review_count']
        amod = round(aspect_dict[key]['amod_pct'], 3)

        if freq < n:
            continue

        if isinstance(key, tuple):
            # check which word of bigram appears first more often
            reverse = aspect_dict[key]['ordering'][1] > aspect_dict[key]['ordering'][0]
            ordered_key = " ".join(sorted(key, reverse=reverse))
            counts.append([ordered_key, freq, amod])
        else:
            counts.append([key, freq, amod])

    return sorted(counts, key=lambda x: x[1], reverse=True)

In [106]:
def print_sentence_frag(corpus, aspect_dict, aspect, adjective=None):
    '''
    INPUT: ReviewSents, dict, string, str(optional)
    OUTPUT: None
    
    Script that prints out sentence fragment containing adjective
        describing specified aspect.
    
    Dictionary returned from extract_aspects is the second input 
        of this function.
    
    If no adjective specified, prints all adjectives in descending
        order of occurance.
    '''
    print '-'*40
    print aspect
    print '-'*40
    print

    aspect_list = aspect.split(" ")
    
    if len(aspect_list) == 2:
        aspect = tuple(sorted(aspect_list))
    
    if not adjective:
        adjectives = Counter(aspect_dict[aspect]['adjectives']).most_common()
        
        for (word, freq) in adjectives:
            # toss out adjectives that with non-alphabetic characters
            if not word.isalpha():
                continue
            
            print freq, "\t", word
            print '-'*40
            
            mask = np.array(aspect_dict[aspect]['adjectives']) == word
            aspect_idx = np.array(aspect_dict[aspect]['aspect_idx'])[mask]
            sent_idx = np.array(aspect_dict[aspect]['sentence_idx'])[mask]
            
            for i, (a_idx, s_idx) in enumerate(zip(aspect_idx, sent_idx)):
                sent = corpus.sentences[s_idx]
                print i, sent.sent[max(0, a_idx-10):min(a_idx+11, sent.words)]
                
            print
            
    else:
        print adjective
        print '-'*40

        mask = np.array(aspect_dict[aspect]['adjectives']) == adjective
        aspect_idx = np.array(aspect_dict[aspect]['aspect_idx'])[mask]
        sent_idx = np.array(aspect_dict[aspect]['sentence_idx'])[mask]

        for i, (a_idx, s_idx) in enumerate(zip(aspect_idx, sent_idx)):
            sent = corpus.sentences[s_idx]
            print i, sent.sent[max(0, a_idx-10):min(a_idx+11, sent.words)]

        print

In [107]:
def pipeline(df, asin_list):
    '''
    INPUT: df, list(str)
    OUTPUT: dict
    
    runs pipeline specified in functions above for list of asin
        on loaded data frame containing reviews
    outputs asin callable dictionary with attributes:
        aspect_dict: dictionary of aspects
        sorted_aspects: list of descending aspect frequencies
    '''
    print 'start time:', datetime.datetime.now().time().isoformat()
    print
    
    asin_dict = defaultdict(dict)
    
    for asin in asin_list:
        print 'working on ASIN {}'.format(asin)
        print '-'*40
        
        reviews = df[df.asin == asin].reviewText.tolist()
        corpus = ReviewSents(reviews)
        
        unigrams = candidate_unigrams(corpus)
        bigrams, bigram_words = candidate_bigrams(corpus)
        
        results = extract_aspects(corpus, unigrams, bigrams, bigram_words)
        freq = sort_aspect_frequency(results)
        
        asin_dict[asin]['corpus'] = corpus
        asin_dict[asin]['aspect_dict'] = results
        asin_dict[asin]['sorted_aspects'] = freq
        
        print '-'*40
        print
    
    print 'end time:', datetime.datetime.now().time().isoformat()
    return asin_dict

In [108]:
def print_sentence_frag_multiprod(multiprod_corpus,
                                  aspect, adjective=None):
    '''
    INPUT: dict, string, str(optional)
    OUTPUT: None
    
    Script that prints out sentence fragment containing adjective
        describing specified aspect.
    
    Dictionary returned from pipeline function is the first input 
        of this function.
    
    If no adjective specified, prints all adjectives in descending
        order of occurance.
    '''
    print '-'*60
    print aspect
    print '-'*60
    print

    for asin in multiprod_corpus:
        if aspect not in multiprod_corpus[asin]['aspect_dict']:
            continue
        
        print '-'*50
        print asin
        print '-'*50
        print
        
        corpus = multiprod_corpus[asin]['corpus']
        aspect_dict = multiprod_corpus[asin]['aspect_dict']
    
        aspect_list = aspect.split(" ")

        if len(aspect_list) == 2:
            aspect = tuple(sorted(aspect_list))

        if not adjective:
            adjectives = Counter(aspect_dict[aspect]['adjectives']).most_common()

            for (word, freq) in adjectives:
                # toss out adjectives that with non-alphabetic characters
                if not word.isalpha():
                    continue

                print freq, "\t", word
                print '-'*40

                mask = np.array(aspect_dict[aspect]['adjectives']) == word
                aspect_idx = np.array(aspect_dict[aspect]['aspect_idx'])[mask]
                sent_idx = np.array(aspect_dict[aspect]['sentence_idx'])[mask]

                for i, (a_idx, s_idx) in enumerate(zip(aspect_idx, sent_idx)):
                    sent = corpus.sentences[s_idx]
                    print i, sent.sent[max(0, a_idx-10):min(a_idx+11, sent.words)]

                print

        else:
            print adjective
            print '-'*40

            mask = np.array(aspect_dict[aspect]['adjectives']) == adjective
            aspect_idx = np.array(aspect_dict[aspect]['aspect_idx'])[mask]
            sent_idx = np.array(aspect_dict[aspect]['sentence_idx'])[mask]

            for i, (a_idx, s_idx) in enumerate(zip(aspect_idx, sent_idx)):
                sent = corpus.sentences[s_idx]
                print i, sent.sent[max(0, a_idx-10):min(a_idx+11, sent.words)]

            print

In [109]:
def get_item_name(asin_list):
    '''
    INPUT: list
    OUTPUT: dict
    
    Script that returns a list of product names given an ASIN list
    '''
    output = dict()
    
    for asin in asin_list:
        url = 'https://www.amazon.com/dp/{}/'.format(asin)
        user_agent = ['Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30',
                      '(KHTML, like Gecko) Ubuntu/11.04',
                      'Chromium/12.0.742.91 Chrome/12.0.742.91',
                      'Safari/534.30']
        headers = {'User-Agent': np.random.choice(user_agent)}
        html = requests.get(url, headers=headers).content
        soup = BeautifulSoup(html, 'html.parser')
        try:
            name = soup.select('#productTitle')[0].text.strip()
        except IndexError:
            name = ''
            
        output[asin] = name
        time.sleep(5 + np.random.random()*10)
    
    return output

In [110]:
def print_sorted_aspects_multiprod(multiprod_corpus, product_names):
    '''
    INPUT: dict
    OUTPUT: None
    
    Script that prints out top occuring aspects for each product
    '''

    for asin in multiprod_corpus:

        print '-'*50
        print asin, product_names[asin]
        print '-'*50
        print
        pprint(multiprod_corpus[asin]['sorted_aspects'])
        print

# Amazon Electronics Review Corpus

In [13]:
with open('reviews/Electronics_5-2.json', 'r') as f:
    data = f.readlines()

In [14]:
data = map(lambda x: x.rstrip(), data)
data_json_str = "[" + ','.join(data) + "]"

In [15]:
df = pd.read_json(data_json_str)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1689188 entries, 0 to 1689187
Data columns (total 9 columns):
asin              1689188 non-null object
helpful           1689188 non-null object
overall           1689188 non-null int64
reviewText        1689188 non-null object
reviewTime        1689188 non-null object
reviewerID        1689188 non-null object
reviewerName      1664458 non-null object
summary           1689188 non-null object
unixReviewTime    1689188 non-null int64
dtypes: int64(2), object(7)
memory usage: 128.9+ MB


In [17]:
group = df.groupby('asin').size()

In [18]:
len(group[df.groupby('asin').size() >= 1000])

52

In [19]:
asin_1000 = group[df.groupby('asin').size() >= 1000].index.tolist()
asin_1000

[u'B00004ZCJE',
 u'B00007E7JU',
 u'B0002L5R78',
 u'B000BQ7GW8',
 u'B000I68BD4',
 u'B000LRMS66',
 u'B000QUUFRW',
 u'B000S5Q9CA',
 u'B000VX6XL6',
 u'B0012S4APK',
 u'B0015DYMVO',
 u'B0019EHU8G',
 u'B001TH7GSW',
 u'B001TH7GUU',
 u'B001XURP7W',
 u'B0027VT6V4',
 u'B002MAPRYU',
 u'B002QEBMAK',
 u'B002SZEOLG',
 u'B002V88HFE',
 u'B002WE6D44',
 u'B00316263Y',
 u'B003ELYQGG',
 u'B003ES5ZUU',
 u'B0041Q38NU',
 u'B0043T7FXE',
 u'B0044YU60M',
 u'B004G6002M',
 u'B004GF8TIK',
 u'B004QK7HI8',
 u'B004W2JKWG',
 u'B004XC6GJ0',
 u'B0052SCU8U',
 u'B005CLPP84',
 u'B005CT56F8',
 u'B005DKZTMG',
 u'B005FYNSPK',
 u'B005HMKKH4',
 u'B00622AG6S',
 u'B006GWO5WK',
 u'B006W8U2MU',
 u'B0074BW614',
 u'B007I5JT4S',
 u'B007R5YDYA',
 u'B007WTAJTO',
 u'B008OHNZI0',
 u'B009A5204K',
 u'B009SYZ8OC',
 u'B00B46XUQU',
 u'B00BGGDVOO',
 u'B00DR0PDNE',
 u'B00E3W15P0']

In [79]:
product_names = get_item_name(asin_1000)

In [62]:
group[df.groupby('asin').size() >= 1000].sum()

89054

In [111]:
asin_dict = pipeline(df, asin_1000)

start time: 17:37:42.429110

working on ASIN B00004ZCJE
----------------------------------------
parser for review #18 failed
parser for review #493 failed
parser for review #1081 failed
----------------------------------------

working on ASIN B00007E7JU
----------------------------------------
parser for review #312 failed
parser for review #602 failed
parser for review #643 failed
----------------------------------------

working on ASIN B0002L5R78
----------------------------------------
parser for review #446 failed
parser for review #1291 failed
parser for review #1624 failed
parser for review #1867 failed
parser for review #2009 failed
parser for review #2581 failed
----------------------------------------

working on ASIN B000BQ7GW8
----------------------------------------
parser for review #648 failed
parser for review #939 failed
parser for review #1028 failed
parser for review #1122 failed
----------------------------------------

working on ASIN B000I68BD4
-----------------

In [112]:
print_sorted_aspects_multiprod(asin_dict, product_names)

--------------------------------------------------
B00007E7JU Canon EF 50mm f/1.8 II Camera Lens - Fixed (Discontinued by Manufacturer)
--------------------------------------------------

[[u'lens', 653, 0.202],
 [u'picture', 168, 0.337],
 [u'shot', 165, 0.326],
 [u'light', 155, 0.459],
 [u'low light', 114, 0.323],
 [u'photo', 102, 0.291],
 [u'aperture', 96, 0.354],
 [u'photographer', 86, 0.301],
 [u'image', 79, 0.319],
 [u'field depth', 76, 0.0],
 [u'bokeh', 70, 0.281],
 [u'focus', 67, 0.116],
 [u'quality', 65, 0.148],
 [u'50mm lens', 60, 0.014],
 [u'prime lens', 58, 0.016],
 [u'portrait lens', 57, 0.006],
 [u'camera', 53, 0.114],
 [u'canon lens', 51, 0.01],
 [u'focal length', 51, 0.257],
 [u'portrait', 47, 0.247],
 [u'photography', 44, 0.374],
 [u'light situation', 42, 0.27],
 [u'length', 41, 0.385],
 [u'image quality', 32, 0.089],
 [u'focus ring', 31, 0.247],
 [u'background', 31, 0.278],
 [u'quality build', 28, 0.058],
 [u'body', 28, 0.123],
 [u'lens picture', 26, 0.001],
 [u'bit', 

In [117]:
reverse = asin_dict['B00007E7JU']['aspect_dict'][(u'depth', u'field')]['ordering'][1] > asin_dict['B00007E7JU']['aspect_dict'][(u'depth', u'field')]['ordering'][0]
word1 = (u'depth', u'field')[reverse]
word2_key = (u'depth', u'field')[not reverse]

arr = np.array(asin_dict['B00007E7JU']['aspect_dict'][word2_key]['relation_from']) == word1
np.sum(arr)/len(arr)

print word2_key
asin_dict['B00007E7JU']['aspect_dict'][word2_key]['relation_from']

depth


[u'good',
 u'shallow',
 u'of',
 u'good',
 u'the',
 u'narrow',
 u'-',
 u'of',
 u'at',
 u'that',
 u'shallow',
 u'of',
 u'of',
 u'the',
 u'narrow',
 u'of',
 u'some',
 u'intense',
 u'of',
 u'its',
 u'shallow',
 u'of',
 u'of',
 u'and',
 u'shot',
 u'of',
 u'some',
 u'of',
 u'great',
 u'of',
 u'a',
 u'shallow',
 u'of',
 u'think',
 u'a',
 u'satisfy',
 u'shallow',
 u'of',
 u'of',
 u'a',
 u'narrow',
 u'of',
 u'great',
 u'len',
 u'shallow',
 u'of',
 u'the',
 u'deep',
 u'of',
 u'its',
 u'great',
 u'of',
 u'of',
 u'some',
 u'great',
 u'of',
 u'the',
 u'right',
 u'of',
 u'a',
 u'of',
 u'shallow',
 u'a',
 u'shallow',
 u'of',
 u'at',
 u'of',
 u'a',
 u'great',
 u'shallow',
 u'of',
 u'the',
 u'of',
 u'the',
 u'of',
 u'the',
 u'of',
 u'the',
 u'narrow',
 u'of',
 u'the',
 u'of',
 u'the',
 u'of',
 u'the',
 u'shallow',
 u'of',
 u'some',
 u'nice',
 u'of',
 u'(',
 u'background',
 u',',
 u'improve',
 u'the',
 u'of',
 u'of',
 u'a',
 u'huge',
 u'of',
 u'the',
 u'shallow',
 u'of',
 u'a',
 u'of',
 u'of',
 u'excell

In [24]:
aspects = []

for key in asin_dict:
    for aspect, freq in asin_dict[key]['sorted_aspects']:
        aspects.append(aspect)
        
aspects_cnt = Counter(aspects)

In [25]:
aspects_cnt.most_common()

[(u'product', 51),
 (u'price', 51),
 (u'problem', 50),
 (u'issue', 43),
 (u'year', 42),
 (u'quality', 39),
 (u'device', 38),
 (u'review', 37),
 (u'computer', 31),
 (u'month', 27),
 (u'cable', 27),
 (u'picture', 23),
 (u'speed', 22),
 (u'video', 22),
 (u'size', 21),
 (u'brand', 21),
 (u'box', 20),
 (u'usb', 20),
 (u'phone', 19),
 (u'item', 19),
 (u'unit', 18),
 (u'connection', 18),
 (u'space', 17),
 (u'money', 17),
 (u'tv', 17),
 (u'bit', 17),
 (u'music', 16),
 (u'cord', 16),
 (u'port', 16),
 (u'file', 16),
 (u'power', 16),
 (u'player', 15),
 (u'laptop', 15),
 (u'data', 15),
 (u'storage', 15),
 (u'hd', 15),
 (u'pc', 14),
 (u'gb', 14),
 (u'capacity', 13),
 (u'high quality', 13),
 (u'photo', 13),
 (u'star', 13),
 (u'screen', 13),
 (u'case', 13),
 (u'usb port', 13),
 (u'signal', 13),
 (u'sound', 12),
 (u'system', 12),
 (u'job', 12),
 (u'room', 11),
 (u'feature', 11),
 (u'memory', 11),
 (u'hdmi cable', 11),
 (u'drive', 11),
 (u'store', 11),
 (u'movie', 11),
 (u'camera', 11),
 (u'light', 11)

In [32]:
bigram_aspects = []

for key in asin_dict:
    for aspect, freq in asin_dict[key]['sorted_aspects']:
        if len(aspect.split(" ")) == 2:
            bigram_aspects.append(aspect)
        
bigram_aspects_cnt = Counter(bigram_aspects)

In [33]:
bigram_aspects_cnt.most_common()

[(u'usb port', 13),
 (u'high quality', 13),
 (u'hdmi cable', 11),
 (u'hard drive', 9),
 (u'usb cable', 9),
 (u'purchase cable', 9),
 (u'buy cable', 9),
 (u'cell phone', 8),
 (u'ray player', 8),
 (u'cable quality', 8),
 (u'picture quality', 8),
 (u'write speed', 8),
 (u'cheap cable', 8),
 (u'sd card', 7),
 (u'transfer file', 7),
 (u'blu player', 7),
 (u'sound quality', 7),
 (u'high cable', 7),
 (u'amazon cable', 7),
 (u'memory card', 7),
 (u'usb drive', 6),
 (u'transfer speed', 6),
 (u'battery life', 6),
 (u'price cable', 6),
 (u'buy card', 6),
 (u'monster cable', 6),
 (u'expensive cable', 6),
 (u'apple tv', 5),
 (u'sound picture', 5),
 (u'flash card', 5),
 (u'come cable', 5),
 (u'build quality', 5),
 (u'purchase card', 5),
 (u'sandisk product', 5),
 (u'price quality', 5),
 (u'digital camera', 5),
 (u'brand cable', 5),
 (u'high speed', 5),
 (u'amazonbasics cable', 5),
 (u'charge phone', 5),
 (u'connect cable', 5),
 (u'cable box', 5),
 (u'audio video', 5),
 (u'usb charger', 5),
 (u'video

In [26]:
adjectives = []

for key in asin_dict:
    for aspect in asin_dict[key]['aspect_dict']:
        for adj in asin_dict[key]['aspect_dict'][aspect]['adjectives']:
            if adj.isalpha():
                adjectives.append(adj)
        
adjectives_cnt = Counter(adjectives)

In [27]:
adjectives_cnt.most_common()

[(u'great', 10142),
 (u'other', 8345),
 (u'good', 7915),
 (u'new', 5429),
 (u'old', 3909),
 (u'high', 3243),
 (u'hard', 2684),
 (u'more', 2656),
 (u'many', 2360),
 (u'little', 2352),
 (u'small', 2181),
 (u'same', 2164),
 (u'best', 2108),
 (u'nice', 2054),
 (u'cheap', 1935),
 (u'better', 1754),
 (u'different', 1646),
 (u'few', 1634),
 (u'low', 1606),
 (u'several', 1578),
 (u'external', 1577),
 (u'big', 1569),
 (u'excellent', 1561),
 (u'large', 1548),
 (u'expensive', 1458),
 (u'first', 1352),
 (u'extra', 1337),
 (u'wireless', 1227),
 (u'fast', 1227),
 (u'micro', 1201),
 (u'original', 1166),
 (u'most', 1154),
 (u'digital', 1120),
 (u'full', 1072),
 (u'second', 1057),
 (u'free', 1009),
 (u'only', 978),
 (u'local', 894),
 (u'standard', 883),
 (u'regular', 873),
 (u'sound', 857),
 (u'last', 839),
 (u'right', 834),
 (u'available', 776),
 (u'much', 755),
 (u'long', 752),
 (u'decent', 747),
 (u'previous', 737),
 (u'perfect', 716),
 (u'solid', 714),
 (u'smart', 711),
 (u'enough', 651),
 (u'porta

In [40]:
print_sentence_frag_multiprod(asin_dict, 'speed', adjective=None)

------------------------------------------------------------
speed
------------------------------------------------------------

--------------------------------------------------
B004G6002M
--------------------------------------------------

12 	high
----------------------------------------
0 I know there is a higher speed one so if you need to have the data move
1 The card isn't meant for high-speed transfers and all that jazz, but it's great
2 but then again i didn't use it for high speed use
3 sure if sandisk even makes 16GB cards at a higher speed rating by the way.
4 phone, they are not know for needing the highest speed, it was the cheapest i could find and it
5 I'm going to try a higher speed (and higher priced) version to see if that
6 and I copy photos to the NaS at a higher speed.  
7 the player may not work as well with the higher speed rating, since music files don't need a higher
8 rating, since music files don't need a higher speed card to work.
9 it is HC (high capacity

##### Test Code

In [38]:
reviews = df[df.asin == 'B00007E7JU'].reviewText.tolist()

In [39]:
corpus = ReviewSents(reviews)

parser for review #312 failed
parser for review #602 failed
parser for review #643 failed


In [40]:
unigrams = candidate_unigrams(corpus)

In [41]:
bigrams, bigram_words = candidate_bigrams(corpus)

In [42]:
results = extract_aspects(corpus, unigrams, bigrams, bigram_words)

In [43]:
sort_aspect_frequency(results)

[[u'lens', 630],
 [u'picture', 169],
 [u'shot', 166],
 [u'light', 161],
 [u'low light', 114],
 [u'photo', 102],
 [u'aperture', 96],
 [u'price', 91],
 [u'photographer', 86],
 [u'review', 81],
 [u'image', 79],
 [u'field depth', 76],
 [u'prime lens', 74],
 [u'bokeh', 70],
 [u'focus', 67],
 [u'quality', 65],
 [u'50mm lens', 63],
 [u'price lens', 59],
 [u'camera', 57],
 [u'canon lens', 57],
 [u'portrait lens', 57],
 [u'focal length', 52],
 [u'portrait', 47],
 [u'photography', 44],
 [u'light situation', 42],
 [u'length', 40],
 [u'image quality', 32],
 [u'focus ring', 31],
 [u'background', 31],
 [u'quality build', 28],
 [u'body', 28],
 [u'buy lens', 27],
 [u'lens picture', 26],
 [u'bit', 25],
 [u'50mm', 24],
 [u'quality lens', 24],
 [u'glass', 24],
 [u'money', 24],
 [u'plastic', 23],
 [u'sensor', 23],
 [u'kit lens', 23],
 [u'issue', 22],
 [u'fast lens', 22],
 [u'zoom lens', 22],
 [u'cheap lens', 21],
 [u'color', 21],
 [u'situation', 21],
 [u'zoom', 20],
 [u'light lens', 20],
 [u'manual focus'

In [46]:
print_sentence_frag(corpus, results, 'aperture')

----------------------------------------
aperture
----------------------------------------

29 	wide
----------------------------------------
0 I love this because of the f/1.8 wide aperture.  
1 Just an aside about the wide aperture, it's very tough to focus on a close
2 With the wide aperture, you can get some really nice depth of field
3 you'll be forced to rely on a very wide aperture, with the consequent lack of sharpness discussed above.
4 This lens has a very wide aperture, but surprisingly considering the price even out at f/1.8
5 , but still performs well since it can reach wide apertures.
6 depending on zoom, so this provides a much wider aperture for me to play with.
7 Also focus is hunting at wide f1.8 aperture sometimes and lens makes annoying noise.
8 Its very wide Aperture lets me shoot beautiful portraits and good shots in low
9 It doesn't have nearly as wide an aperture as this, but it will be great for outdoors
10 like a 27.2-136.The lenses that have BOTH wide apertur