# My Code

In [1]:
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pprint import pprint
from spacy.en import English
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd
import os
import re
import requests
import string

# Shared spaCy English pipeline: ReviewSents calls it to parse each review and
# the candidate-extraction helpers read lemma probabilities from parser.vocab.
parser = English()

In [2]:
# Dependency labels whose words are unlikely to be part of a feature;
# _get_compactness_feat drops neighbour words carrying one of these.
com_dep = {
    'det', 'aux', 'cc', 'punct', 'mark', '', 'neg', 'nummod', 'prt',
    'auxpass', 'case', 'expl', 'preconj', 'intj', 'predet', 'meta',
    'quantmod', 'agent',
}

# Part-of-speech tags whose words are unlikely to be part of a feature;
# used for the same neighbour-word filter as com_dep.
com_tag = {
    'IN', 'PRP', 'PRP$', 'DT', 'HYPH', 'TO', ',', '.', 'CC', 'SP', 'CD',
    'MD', 'WDT', 'RP', 'WRB', '-LRB-', '-RRB-', ':', 'WP', 'POS', '``',
    "''", 'SYM', 'EX', 'PDT', 'UH', 'NFP', 'XX',
}

# Part-of-speech tags treated as nouns when looking for aspect candidates.
noun_tag = {'NN', 'NNP', 'NNS'}

In [3]:
class SentCustomProperties(object):
    '''
    Wraps one spaCy sentence together with bookkeeping about its origin.

    Attributes:
        review_idx: index of the review the sentence was taken from
        sent_idx:   index of the sentence, as supplied by the caller
        sent:       the spaCy sentence itself (spacy.tokens.span.Span)
        start_idx:  `.i` of the sentence's first token (in spaCy this is the
                    token's index within its parent document)
        words:      len(sent), the number of tokens in the sentence
    '''

    def __init__(self, review_idx, sent_idx, sent):
        '''
        INPUT: int, int, spacy sentence (spacy.tokens.span.Span)
        OUTPUT: None

        The sentence must contain at least one token, because the first
        token is read to obtain start_idx.
        '''
        first_token = sent[0]

        self.sent = sent
        self.words = len(sent)
        self.start_idx = first_token.i

        self.review_idx = review_idx
        self.sent_idx = sent_idx

In [4]:
class ReviewSents(object):
    '''
    Parses a list of unicode reviews with the module-level spaCy `parser`
    and stores every sentence wrapped in SentCustomProperties.

    Attributes:
        reviews:   the raw reviews exactly as passed in
        n_reviews: number of reviews that were parsed successfully
        n_sent:    number of sentences collected
        sentences: list(SentCustomProperties) in the order encountered
    '''

    def __init__(self, reviews):
        '''
        INPUT: list(unicode)
        OUTPUT: None
        '''
        self.reviews = reviews
        self.n_reviews, self.n_sent, self.sentences = self._parse_sentences()

    def _parse_sentences(self):
        '''
        INPUT: None
        OUTPUT: int, int, list(SentCustomProperties)

        Uses spacy to parse each review and split it into sentences.
        Returns the number of parsed reviews, the number of sentences and
        the list of wrapped sentences.

        A review on which the parser raises AssertionError is reported on
        stdout and skipped; it does not count towards n_reviews. Sentences
        keep the index of the review in the original input list.
        '''
        n_sent, n_reviews = 0, 0
        sentences = []

        for i, review in enumerate(self.reviews):
            try:
                doc = parser(review)
            except AssertionError:
                # best effort: report the failing review and carry on.
                # Single-argument call form prints the same text on
                # Python 2 and Python 3.
                print('parser for review #{} failed'.format(i))
                continue

            n_reviews += 1

            for sent in doc.sents:
                # skip sentences with no text at all
                if sent.string:
                    sentences.append(SentCustomProperties(i, n_sent, sent))
                    n_sent += 1

        return n_reviews, n_sent, sentences

In [5]:
def _iter_nps(sent):
    '''
    INPUT: spacy sentence (spacy.tokens.span.Span)
    OUTPUT: set
    
    Iterates through each token of spacy sentence and collects
    lemmas of all nouns into a set
    
    this function requires parser to be defined as:
        from spacy.en import English
        parser = English()
    '''
    wordset = set()
    
    for token in sent:
        root = parser.vocab[token.lemma].prob
        # filter to only consider nouns
        if token.tag_ in noun_tag and root < -7.5:
            wordset.add(token.lemma_)
    
    return " ".join(wordset)

def candidate_unigrams(corpus, min_pct=0.01):
    '''
    INPUT: ReviewSents, float
    OUTPUT: set

    obtains a set of candidate unigrams
    each candidate unigram must be a noun lemma returned by _iter_nps and
        must reach a total count of at least min_pct * corpus.n_sent
        (each sentence contributes a lemma at most once, because _iter_nps
        returns distinct lemmas)

    Returns an empty set when no sentence yields a usable word, instead of
    letting CountVectorizer's "empty vocabulary" ValueError escape.
    '''
    count_X = [_iter_nps(sent.sent) for sent in corpus.sentences]

    cnt_vec = CountVectorizer()
    try:
        freq = cnt_vec.fit_transform(count_X)
    except ValueError:
        # raised by CountVectorizer when the vocabulary would be empty
        # (no sentences, or no sentence contains a countable noun)
        return set()

    # column sums taken on the sparse matrix directly; the result is a
    # 1 x n_features matrix, flattened to one count per feature
    total_count = np.asarray(freq.sum(axis=0)).ravel()
    filter_ = total_count >= min_pct * corpus.n_sent

    features = np.array(cnt_vec.get_feature_names())
    return set(features[filter_])

In [6]:
def _get_compactness_feat(corpus):
    '''
    INPUT: ReviewSents
    OUTPUT: generator(tuples(unicode))
    
    outputs generator of tuples consisting of:
        at least one noun
        a second word within +/- 3 words of noun
    excludes dependencies and tags not likely to be a feature word
    
    this function requires parser to be defined as:
        from spacy.en import English
        parser = English()
    '''

    for sent in corpus.sentences:
        output = set()

        for i, token in enumerate(sent.sent):
            # one word in bigram must be noun
            if token.tag_ not in noun_tag:
                continue
            else:
                arr = sent.sent[max(0, i-3):min(i+4, sent.words)]
                arr = np.array(arr)
                arr = arr[arr != token]

                for item in arr:
                    root = parser.vocab[item.lemma].prob
                    # filter out unlikely features
                    if root > -7.5 or item.dep_ in com_dep or item.tag_ in com_tag:
                        continue
                    else:
                        tup = tuple(sorted([item.lemma_, token.lemma_]))
                        output.add(tup)

        if output:
            for element in output:
                yield element
                            
def candidate_bigrams(corpus, min_pct=0.005):
    '''
    INPUT: ReviewSents, float
    OUTPUT: set(tuples), set(str)

    Counts the tuples produced by _get_compactness_feat and keeps those
    that occur at least max(2, min_pct * corpus.n_sent) times.

    Returns the kept tuples and the set of individual words that appear
    in any kept tuple.
    '''
    feats = Counter(_get_compactness_feat(corpus))

    # threshold is the same for every tuple, so compute it once
    min_count = max(2, min_pct * corpus.n_sent)

    bigrams = set()
    bigram_words = set()

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    for key, val in feats.items():
        if val >= min_count:
            bigrams.add(key)
            bigram_words.update(key)

    return bigrams, bigram_words

In [7]:
def _check_bigram(adj, token, sent, bigrams, bigram_words):
    '''
    INPUT: spacy token (spacy.tokens.token.Token),
           spacy token (spacy.tokens.token.Token),
           SentCustomProperties,
           set(tuples),
           set(str)
    OUTPUT: generator(tuples)
    
    Given an adjective token, dependency word token, a sentence, and 
        token index within sentence, check if the token has any 
        associated bigram within +/- 3 words and yield all bigrams. 
    
    If no bigrams, yield unigram.
    '''
    yielded = False
    seen_words = []
    t_idx = token.i - sent.start_idx
    a_idx = adj.i - sent.start_idx
    
    arr = sent.sent[max(0, t_idx-3):min(t_idx+4, sent.words)]
    
    for item in arr:
        if item == token or item.lemma_ in seen_words:
            continue
        elif item.lemma_ in bigram_words:
            tup = tuple(sorted([item.lemma_, token.lemma_]))
            adjectives = [adj.lemma_]
            
            if adj.lemma_ in tup:
                # try finding first adjective around aspect if
                #     bigram includes adjective
                # return nothing if no other adjectives found
                left = max(0, min(a_idx-3, t_idx-3))
                right = min(sent.words, max(a_idx+4, t_idx+4))
                bi_arr = sent.sent[left:right]

                adjectives = [word.lemma_ for word in bi_arr 
                              if word.tag_ in ['JJ', 'JJR', 'JJS']
                              and word.lemma_ not in tup]
                
            if tup in bigrams and adjectives:
                yielded = True
                seen_words.append(item.lemma_)
                # for checking which word in bigram is likely to appear first
                ordering = [1, 0] if item.lemma_ == tup[0] else [0, 1]
                
                yield adjectives[0], tup, np.array(ordering), t_idx
                
    if not yielded:
        yield adj.lemma_, token.lemma_, np.array([1, 0]), t_idx

def extract_aspects(corpus, unigrams, bigrams, bigram_words):
    '''
    INPUT: ReviewSents, set(str), set(tuple), set(str)
    OUTPUT: dictionary

    Extracts all aspects modified by an 'amod' dependency whose head lemma
    is a candidate unigram, and returns a dictionary keyed by aspect
    (a str for unigrams, a tuple for bigrams). Each value holds:

    adjectives: words modifying the aspect
    aspect_idx: list of where aspect can be found within sentence
    ordering: if bigram, count of which word appears first
    review_idx: list of which reviews contain aspect
    sentence_idx: list of which sentences contain aspect

    review_count: how many reviews aspects are found in
    sentence count is stored under 'sent_count': how many sentences
        aspects are found in

    The returned object is a defaultdict: looking up an aspect that was
    never extracted creates an empty entry whose counts are 0.
    '''
    def _new_entry():
        # counts are part of the default entry so that entries created by
        # a later lookup of an unknown aspect are complete as well
        return {'adjectives': [],
                'aspect_idx': [],
                'ordering': np.array([0, 0]),
                'review_idx': [],
                'sentence_idx': [],
                'review_count': 0,
                'sent_count': 0}

    props = defaultdict(_new_entry)

    for sent in corpus.sentences:

        for token in sent.sent:
            # only adjectival modifiers of a candidate unigram are of interest
            if token.dep_ != 'amod' or token.head.lemma_ not in unigrams:
                continue

            aspects = _check_bigram(token, token.head, sent,
                                    bigrams, bigram_words)

            for adj, aspect, ordering, index in aspects:
                entry = props[aspect]
                entry['adjectives'].append(adj)
                entry['aspect_idx'].append(index)
                entry['review_idx'].append(sent.review_idx)
                entry['sentence_idx'].append(sent.sent_idx)
                entry['ordering'] += ordering

    for entry in props.values():
        # distinct reviews / sentences the aspect was seen in
        entry['review_count'] = len(set(entry['review_idx']))
        entry['sent_count'] = len(set(entry['sentence_idx']))

    return props

In [8]:
def sort_aspect_frequency(aspect_dict, n=3):
    '''
    INPUT: dict, int
    OUTPUT: list([aspect, freq])

    Ranks the aspects of a dictionary returned by extract_aspects by the
    number of reviews they occur in ('review_count'), most frequent first.
    Aspects found in fewer than n reviews are left out.

    Bigram aspects (tuple keys) are turned into a single string: the two
    words are joined in alphabetical order unless the second 'ordering'
    count is larger than the first, in which case they are reversed.
    '''
    def _label(aspect):
        if not isinstance(aspect, tuple):
            return aspect

        # check which word of bigram appears first more often
        counts = aspect_dict[aspect]['ordering']
        return " ".join(sorted(aspect, reverse=counts[1] > counts[0]))

    ranked = [[_label(aspect), aspect_dict[aspect]['review_count']]
              for aspect in aspect_dict
              if not aspect_dict[aspect]['review_count'] < n]

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [9]:
def print_sentence_frag(corpus, aspect_dict, aspect, adjective=None):
    '''
    INPUT: ReviewSents, dict, string, str(optional)
    OUTPUT: None
    
    Script that prints out sentence fragment containing adjective
        describing specified aspect.
    
    Dictionary returned from extract_aspects is the second input 
        of this function.
    
    If no adjective specified, prints all adjectives in descending
        order of occurance.
    '''
    print '-'*40
    print aspect
    print '-'*40
    print

    aspect_list = aspect.split(" ")
    
    if len(aspect_list) == 2:
        aspect = tuple(sorted(aspect_list))
    
    if not adjective:
        adjectives = Counter(aspect_dict[aspect]['adjectives']).most_common()
        
        for (word, freq) in adjectives:
            # toss out adjectives that with non-alphabetic characters
            if not word.isalpha():
                continue
            
            print freq, "\t", word
            print '-'*40
            
            mask = np.array(aspect_dict[aspect]['adjectives']) == word
            aspect_idx = np.array(aspect_dict[aspect]['aspect_idx'])[mask]
            sent_idx = np.array(aspect_dict[aspect]['sentence_idx'])[mask]
            
            for i, (a_idx, s_idx) in enumerate(zip(aspect_idx, sent_idx)):
                sent = corpus.sentences[s_idx]
                print i, sent.sent[max(0, a_idx-10):min(a_idx+11, sent.words)]
                
            print
            
    else:
        print adjective
        print '-'*40

        mask = np.array(aspect_dict[aspect]['adjectives']) == adjective
        aspect_idx = np.array(aspect_dict[aspect]['aspect_idx'])[mask]
        sent_idx = np.array(aspect_dict[aspect]['sentence_idx'])[mask]

        for i, (a_idx, s_idx) in enumerate(zip(aspect_idx, sent_idx)):
            sent = corpus.sentences[s_idx]
            print i, sent.sent[max(0, a_idx-10):min(a_idx+11, sent.words)]

        print

In [21]:
def pipeline(df, asin_list):
    '''
    INPUT: df, list(str)
    OUTPUT: dict

    runs pipeline specified in functions above for list of asin
        on loaded data frame containing reviews (columns `asin` and
        `reviewText` are read)
    outputs asin callable dictionary with attributes:
        aspect_dict: dictionary of aspects
        sorted_aspects: list of descending aspect frequencies

    Progress is reported on stdout, one banner per ASIN.
    '''
    asin_dict = defaultdict(dict)
    rule = '-' * 40

    for asin in asin_list:
        # single-argument call form prints the same text on Python 2 and 3
        print('working on ASIN {}'.format(asin))
        print(rule)

        reviews = df[df.asin == asin].reviewText.tolist()
        corpus = ReviewSents(reviews)

        unigrams = candidate_unigrams(corpus)
        bigrams, bigram_words = candidate_bigrams(corpus)

        results = extract_aspects(corpus, unigrams, bigrams, bigram_words)
        freq = sort_aspect_frequency(results)

        asin_dict[asin]['aspect_dict'] = results
        asin_dict[asin]['sorted_aspects'] = freq

        print(rule)
        print('')

    return asin_dict

# Amazon Electronics Review Corpus

In [11]:
# Load the raw review dump into memory, one string per line of the file.
with open('reviews/Electronics_5-2.json', 'r') as f:
    data = list(f)

In [12]:
# Strip trailing whitespace from every line, then wrap the comma-joined
# lines in brackets so the whole dump reads as one JSON array.
data = list(line.rstrip() for line in data)
data_json_str = "[{}]".format(','.join(data))

In [15]:
# Build the review DataFrame from the JSON array string assembled above.
df = pd.read_json(data_json_str)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1689188 entries, 0 to 1689187
Data columns (total 9 columns):
asin              1689188 non-null object
helpful           1689188 non-null object
overall           1689188 non-null int64
reviewText        1689188 non-null object
reviewTime        1689188 non-null object
reviewerID        1689188 non-null object
reviewerName      1664458 non-null object
summary           1689188 non-null object
unixReviewTime    1689188 non-null int64
dtypes: int64(2), object(7)
memory usage: 128.9+ MB


In [17]:
# Number of rows (reviews) per ASIN.
group = df.groupby('asin').size()

In [18]:
# How many ASINs have at least 1000 reviews; `group` already holds the
# per-ASIN counts, so the groupby is not recomputed for the mask.
len(group[group >= 1000])

52

In [19]:
# ASINs with at least 1000 reviews; the mask is built from the per-ASIN
# counts already stored in `group` instead of a second groupby.
asin_1000 = group[group >= 1000].index.tolist()
asin_1000

[u'B00004ZCJE',
 u'B00007E7JU',
 u'B0002L5R78',
 u'B000BQ7GW8',
 u'B000I68BD4',
 u'B000LRMS66',
 u'B000QUUFRW',
 u'B000S5Q9CA',
 u'B000VX6XL6',
 u'B0012S4APK',
 u'B0015DYMVO',
 u'B0019EHU8G',
 u'B001TH7GSW',
 u'B001TH7GUU',
 u'B001XURP7W',
 u'B0027VT6V4',
 u'B002MAPRYU',
 u'B002QEBMAK',
 u'B002SZEOLG',
 u'B002V88HFE',
 u'B002WE6D44',
 u'B00316263Y',
 u'B003ELYQGG',
 u'B003ES5ZUU',
 u'B0041Q38NU',
 u'B0043T7FXE',
 u'B0044YU60M',
 u'B004G6002M',
 u'B004GF8TIK',
 u'B004QK7HI8',
 u'B004W2JKWG',
 u'B004XC6GJ0',
 u'B0052SCU8U',
 u'B005CLPP84',
 u'B005CT56F8',
 u'B005DKZTMG',
 u'B005FYNSPK',
 u'B005HMKKH4',
 u'B00622AG6S',
 u'B006GWO5WK',
 u'B006W8U2MU',
 u'B0074BW614',
 u'B007I5JT4S',
 u'B007R5YDYA',
 u'B007WTAJTO',
 u'B008OHNZI0',
 u'B009A5204K',
 u'B009SYZ8OC',
 u'B00B46XUQU',
 u'B00BGGDVOO',
 u'B00DR0PDNE',
 u'B00E3W15P0']

In [22]:
# Run the aspect-extraction pipeline for every ASIN with at least 1000 reviews.
asin_dict = pipeline(df, asin_1000)

working on ASIN B00004ZCJE
----------------------------------------
parser for review #18 failed
parser for review #493 failed
parser for review #1081 failed
----------------------------------------

working on ASIN B00007E7JU
----------------------------------------
parser for review #312 failed
parser for review #602 failed
parser for review #643 failed
----------------------------------------

working on ASIN B0002L5R78
----------------------------------------
parser for review #446 failed
parser for review #1291 failed
parser for review #1624 failed
parser for review #1867 failed
parser for review #2009 failed
parser for review #2581 failed
----------------------------------------

working on ASIN B000BQ7GW8
----------------------------------------
parser for review #648 failed
parser for review #939 failed
parser for review #1028 failed
parser for review #1122 failed
----------------------------------------

working on ASIN B000I68BD4
----------------------------------------
parse

In [23]:
# Total number of reviews across the ASINs with at least 1000 reviews,
# using the counts already stored in `group`.
group[group >= 1000].sum()

89054

In [25]:
# Every distinct aspect label reported for any ASIN, in alphabetical order.
aspects = sorted(set(
    label
    for per_asin in asin_dict.values()
    for label, _count in per_asin['sorted_aspects']
))

In [28]:
aspects[0:750]

[u'2tb',
 u'2tb drive',
 u'32gb',
 u'32gb card',
 u'3d',
 u'3tb',
 u'3tb drive',
 u'5 iphone',
 u'50mm',
 u'50mm lens',
 u'64gb',
 u'64gb card',
 u'700',
 u'730',
 u'aa',
 u'aa battery',
 u'aaa',
 u'aaa aa',
 u'aaa battery',
 u'ac outlet',
 u'access',
 u'account',
 u'acoustics',
 u'activate modem',
 u'ad',
 u'adapter',
 u'adapter card',
 u'add bulk',
 u'add channel',
 u'add weight',
 u'address',
 u'ago year',
 u'air',
 u'air bubble',
 u'airplay',
 u'alkaline battery',
 u'aluminum',
 u'amazon',
 u'amazon cable',
 u'amazon cover',
 u'amazon hdmi',
 u'amazon netflix',
 u'amazon product',
 u'amazon video',
 u'amazonbasics',
 u'amazonbasics cable',
 u'amp',
 u'amplifier',
 u'amplify antenna',
 u'android',
 u'android phone',
 u'android tablet',
 u'angle',
 u'angry bird',
 u'antenna',
 u'anti glare',
 u'aperture',
 u'app',
 u'apple',
 u'apple brand',
 u'apple cable',
 u'apple charge',
 u'apple cord',
 u'apple device',
 u'apple price',
 u'apple product',
 u'apple tv',
 u'apple version',
 u'app

In [29]:
aspects[750:]

[u'modem',
 u'modem fee',
 u'mohu',
 u'mohu antenna',
 u'mohu leaf',
 u'money',
 u'money cable',
 u'monitor',
 u'monster',
 u'monster cable',
 u'month',
 u'monthly fee',
 u'motorola',
 u'motorola charger',
 u'motorola logo',
 u'motorola modem',
 u'mount',
 u'mount bracket',
 u'mount cup',
 u'mount dash',
 u'mount dashboard',
 u'mount friction',
 u'mount gp',
 u'mount screw',
 u'mount wall',
 u'mouse',
 u'mouse button',
 u'mouse keyboard',
 u'mouse pad',
 u'move mouse',
 u'move part',
 u'movement',
 u'movie',
 u'movie show',
 u'mp3',
 u'mp3 player',
 u'multi touch',
 u'music',
 u'music player',
 u'music video',
 u'n router',
 u'n wireless',
 u'name',
 u'name brand',
 u'nas',
 u'neck',
 u'netflix',
 u'netflix prime',
 u'netflix youtube',
 u'netgear',
 u'netgear router',
 u'network',
 u'nice cable',
 u'nice cover',
 u'nice leather',
 u'nikon camera',
 u'nikon lens',
 u'nimh',
 u'nimh battery',
 u'noise',
 u'noise reduction',
 u'note',
 u'number',
 u'nuvi',
 u'nuvi mount',
 u'oem',
 u'oem 

In [37]:
# Every distinct, purely alphabetic adjective recorded for any aspect of any
# ASIN, in alphabetical order.
adjectives = sorted(set(
    adj
    for per_asin in asin_dict.values()
    for entry in per_asin['aspect_dict'].values()
    for adj in entry['adjectives']
    if adj.isalpha()
))

In [40]:
adjectives[0:900]

[u'a',
 u'aa',
 u'aac',
 u'abetter',
 u'abgn',
 u'able',
 u'abnoxious',
 u'about',
 u'above',
 u'absolute',
 u'absurd',
 u'abuse',
 u'abysmal',
 u'ac',
 u'acase',
 u'accelerate',
 u'accelerated',
 u'accelerted',
 u'accelkerated',
 u'accentuate',
 u'acceptable',
 u'acceptbale',
 u'acceptble',
 u'access',
 u'accessible',
 u'accessory',
 u'accidental',
 u'acclaimed',
 u'accommodate',
 u'accompany',
 u'accompanying',
 u'accomplished',
 u'accurate',
 u'accuse',
 u'accustomed',
 u'accuweather',
 u'achy',
 u'acorn',
 u'acoustic',
 u'acquaint',
 u'acquire',
 u'activate',
 u'active',
 u'actual',
 u'actually',
 u'actuate',
 u'adaptershipping',
 u'adaptive',
 u'adaquate',
 u'add',
 u'added',
 u'addicted',
 u'addictive',
 u'additional',
 u'additonal',
 u'addon',
 u'addtional',
 u'adequate',
 u'adequte',
 u'adhesive',
 u'adhoc',
 u'adjacent',
 u'adjoining',
 u'adjust',
 u'adjustable',
 u'adjusted',
 u'adopter',
 u'adquiri',
 u'adroid',
 u'adult',
 u'advance',
 u'advanced',
 u'advantageous',
 u'adve

In [41]:
adjectives[900:1800]

[u'deceased',
 u'deceiving',
 u'decent',
 u'deceptive',
 u'decide',
 u'decided',
 u'decrease',
 u'decrepit',
 u'decroded',
 u'dect',
 u'dedicate',
 u'dedicated',
 u'deep',
 u'deeper',
 u'def',
 u'default',
 u'defect',
 u'defective',
 u'define',
 u'defined',
 u'definetelly',
 u'definite',
 u'definitive',
 u'defunct',
 u'degital',
 u'degrade',
 u'degraded',
 u'delay',
 u'delete',
 u'deleted',
 u'delicate',
 u'delicious',
 u'delievered',
 u'delighted',
 u'deliver',
 u'delivers',
 u'deluxe',
 u'demanding',
 u'dense',
 u'depend',
 u'dependable',
 u'dependably',
 u'dependent',
 u'deplete',
 u'depleted',
 u'depthwonderful',
 u'derive',
 u'descend',
 u'descent',
 u'describe',
 u'description',
 u'deserve',
 u'design',
 u'designate',
 u'designated',
 u'designedmediabridge',
 u'desirable',
 u'desire',
 u'desired',
 u'desk',
 u'desktop',
 u'despicable',
 u'destroy',
 u'detachable',
 u'detail',
 u'detailed',
 u'detatchable',
 u'detectable',
 u'deteroirating',
 u'detract',
 u'develop',
 u'developed'

In [42]:
adjectives[1800:2700]

[u'iluv',
 u'imac',
 u'image',
 u'imaginable',
 u'immaculate',
 u'immediate',
 u'immense',
 u'immersive',
 u'imminent',
 u'immovable',
 u'imp',
 u'impair',
 u'impeccable',
 u'imperfect',
 u'implement',
 u'imporatant',
 u'import',
 u'important',
 u'importantly',
 u'impose',
 u'impossible',
 u'impractical',
 u'impressed',
 u'impressive',
 u'impromptu',
 u'improper',
 u'improve',
 u'improved',
 u'improvement',
 u'in',
 u'inaccessible',
 u'inaccurate',
 u'inadvertent',
 u'inappropriate',
 u'inc',
 u'incandescent',
 u'incidental',
 u'incipio',
 u'incl',
 u'inclose',
 u'include',
 u'included',
 u'includible',
 u'inclusive',
 u'incoming',
 u'incompatable',
 u'incompatible',
 u'incomplete',
 u'inconsistant',
 u'inconsistent',
 u'inconspicuous',
 u'inconvenient',
 u'inconvenientmany',
 u'incorrect',
 u'increase',
 u'increased',
 u'incredible',
 u'incredibly',
 u'incremental',
 u'incriminate',
 u'independent',
 u'indian',
 u'indicate',
 u'indicated',
 u'indifferent',
 u'indiscernible',
 u'indisp

In [43]:
adjectives[2700:3600]

[u'oversimplified',
 u'oversize',
 u'oversized',
 u'overwhelmed',
 u'overwhelming',
 u'owc',
 u'own',
 u'oz',
 u'p',
 u'pack',
 u'package',
 u'packaged',
 u'packaging',
 u'packed',
 u'padded',
 u'page',
 u'paid',
 u'painful',
 u'painless',
 u'paint',
 u'pair',
 u'paired',
 u'panasonic',
 u'pancake',
 u'pandora',
 u'panisonic',
 u'panoramic',
 u'paper',
 u'paperback',
 u'paperwhite',
 u'parallel',
 u'parental',
 u'partial',
 u'particular',
 u'particulary',
 u'partition',
 u'pas',
 u'pass',
 u'passive',
 u'passport',
 u'passthrough',
 u'past',
 u'pathetic',
 u'paticular',
 u'patriot',
 u'pay',
 u'pc',
 u'pcie',
 u'pcvarious',
 u'pdf',
 u'peak',
 u'pelican',
 u'pending',
 u'pentagonal',
 u'perceivable',
 u'perceive',
 u'perceptible',
 u'perfct',
 u'perfect',
 u'perfectly',
 u'perform',
 u'performance',
 u'performing',
 u'periodic',
 u'peripheral',
 u'permanent',
 u'permissive',
 u'perpendicular',
 u'perpetual',
 u'persimmon',
 u'person',
 u'personal',
 u'personalized',
 u'personally',
 u'

In [44]:
adjectives[3600:]

[u'spoofed',
 u'sporadic',
 u'spotify',
 u'spotty',
 u'springy',
 u'spurious',
 u'square',
 u'squarish',
 u'squeak',
 u'squirt',
 u'squishy',
 u'ss',
 u'ssd',
 u'sssandroid',
 u'stabilize',
 u'stable',
 u'stack',
 u'staggering',
 u'stamped',
 u'stand',
 u'standalone',
 u'standard',
 u'standardized',
 u'standby',
 u'standing',
 u'standy',
 u'start',
 u'starting',
 u'startup',
 u'starve',
 u'statdard',
 u'state',
 u'stated',
 u'static',
 u'stationary',
 u'stay',
 u'steady',
 u'stealth',
 u'steam',
 u'steaming',
 u'steep',
 u'stellar',
 u'stereo',
 u'stereotypical',
 u'stickered',
 u'sticking',
 u'sticky',
 u'stiff',
 u'stifle',
 u'still',
 u'stinking',
 u'stock',
 u'stops',
 u'storage',
 u'store',
 u'stored',
 u'storybest',
 u'stout',
 u'straight',
 u'straightforward',
 u'strand',
 u'strange',
 u'stream',
 u'streamed',
 u'streamer',
 u'streaming',
 u'streamline',
 u'streamming',
 u'strenuous',
 u'stressful',
 u'stret',
 u'stretchy',
 u'strict',
 u'stringent',
 u'stringing',
 u'strip',
 u

##### Test Code

In [38]:
# Review texts of a single ASIN, used to step through the pipeline stages by hand.
reviews = df[df.asin == 'B00007E7JU'].reviewText.tolist()

In [39]:
# Parse the reviews into sentences; reviews the parser rejects are reported and skipped.
corpus = ReviewSents(reviews)

parser for review #312 failed
parser for review #602 failed
parser for review #643 failed


In [40]:
# Frequent noun lemmas that serve as single-word aspect candidates.
unigrams = candidate_unigrams(corpus)

In [41]:
# Frequent noun-centred word pairs, plus the set of words those pairs contain.
bigrams, bigram_words = candidate_bigrams(corpus)

In [42]:
# Collect, per candidate aspect, the adjectives attached to it through an 'amod' dependency.
results = extract_aspects(corpus, unigrams, bigrams, bigram_words)

In [43]:
sort_aspect_frequency(results)

[[u'lens', 630],
 [u'picture', 169],
 [u'shot', 166],
 [u'light', 161],
 [u'low light', 114],
 [u'photo', 102],
 [u'aperture', 96],
 [u'price', 91],
 [u'photographer', 86],
 [u'review', 81],
 [u'image', 79],
 [u'field depth', 76],
 [u'prime lens', 74],
 [u'bokeh', 70],
 [u'focus', 67],
 [u'quality', 65],
 [u'50mm lens', 63],
 [u'price lens', 59],
 [u'camera', 57],
 [u'canon lens', 57],
 [u'portrait lens', 57],
 [u'focal length', 52],
 [u'portrait', 47],
 [u'photography', 44],
 [u'light situation', 42],
 [u'length', 40],
 [u'image quality', 32],
 [u'focus ring', 31],
 [u'background', 31],
 [u'quality build', 28],
 [u'body', 28],
 [u'buy lens', 27],
 [u'lens picture', 26],
 [u'bit', 25],
 [u'50mm', 24],
 [u'quality lens', 24],
 [u'glass', 24],
 [u'money', 24],
 [u'plastic', 23],
 [u'sensor', 23],
 [u'kit lens', 23],
 [u'issue', 22],
 [u'fast lens', 22],
 [u'zoom lens', 22],
 [u'cheap lens', 21],
 [u'color', 21],
 [u'situation', 21],
 [u'zoom', 20],
 [u'light lens', 20],
 [u'manual focus'

In [46]:
print_sentence_frag(corpus, results, 'aperture')

----------------------------------------
aperture
----------------------------------------

29 	wide
----------------------------------------
0 I love this because of the f/1.8 wide aperture.  
1 Just an aside about the wide aperture, it's very tough to focus on a close
2 With the wide aperture, you can get some really nice depth of field
3 you'll be forced to rely on a very wide aperture, with the consequent lack of sharpness discussed above.
4 This lens has a very wide aperture, but surprisingly considering the price even out at f/1.8
5 , but still performs well since it can reach wide apertures.
6 depending on zoom, so this provides a much wider aperture for me to play with.
7 Also focus is hunting at wide f1.8 aperture sometimes and lens makes annoying noise.
8 Its very wide Aperture lets me shoot beautiful portraits and good shots in low
9 It doesn't have nearly as wide an aperture as this, but it will be great for outdoors
10 like a 27.2-136.The lenses that have BOTH wide apertur