In [1]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pprint import pprint
from spacy.en import English
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import os
import re
import requests
import string

In [2]:
parser = English()

In [3]:
def get_id(url):
    '''
    gets id identifer for amazon product
    '''
    
    # url format: https://www.amazon.com/.../.../id/...
    regex = re.compile(r'(?<=/)[^/]*')
    id_ = regex.findall(url)[-2]

    if len(id_) != 10:
        # url format https://www.amazon.com/.../id
        id_ = regex.findall(url)[-1][:10]

    return id_


def extract(id_):
    '''
    extracts the star rating and review text from directory of
    amazon html files
    '''
    ratings = []
    reviews = []

    path = 'reviews/com/{}/'.format(id_)
    pages = [file_ for file_ in os.listdir(path) if file_[-5:] == '.html']

    for page in pages:
        html = open(path + page, 'r')
        soup = BeautifulSoup(html, 'html.parser')
        tags = soup.findAll("div", {"class": "a-section review"})

        if not tags:
            break

        for tag in tags:
            rating = int(tag.find('i').text[0])
            review = tag.findAll("span",
                                 {"class": "a-size-base review-text"})[0].text
            ratings.append(rating)
            reviews.append(review)

    return ratings, reviews

In [4]:
# Loosely adopted from Nic Scharding
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

# A custom stoplist
STOPLIST = set(stopwords.words('english') +
               ["n't", "'s", "'m", "ca", "'re"] +
               list(ENGLISH_STOP_WORDS))

# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ")
SYMBOLS += ["-----", "---", "...", "“", "”", "'ve", ""]

# Full set of stops
STOPS = STOPLIST ^ set(SYMBOLS)

In [5]:
def word_pos_filter(word_pos):
    '''
    filters out stopwords and words with POS we don't care about
    '''
    toss_pos = set(['PUNCT', 'SPACE', 'NUM'])
    
    regex = re.compile('(.*)_(.*)')
    output = regex.findall(word_pos)[0]
    
    if output[0] not in STOPS and output[1] not in toss_pos:
        return True
    else:
        return False
    
def word_prob(word_pos):
    '''
    appends how frequent word appears in english dictionary
    '''
    regex = re.compile('(.*)_(.*)')
    output = regex.findall(word_pos)[0]
    
    return word_pos + '_' + str(parser.vocab[output[0]].prob)[1:5]

In [6]:
# Loosely adopted from Nic Scharding
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

# Every step in a pipeline needs to be a "transformer".
# Define a custom transformer to clean text using spaCy


def tokenizeText(sample):
    '''
    A custom function to tokenize the text using spaCy and convert to 
    lemmas. Suffixes are also added to tokens to indicate POS and
    word probability.
    '''

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize and add POS
    lemmas = []

    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() + '_' + tok.pos_
                      if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if word_pos_filter(tok)]
    
    # add the probability of word
    tokens = [word_prob(tok) for tok in tokens]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

# the vectorizer and classifer to use
# note that I changed the tokenizer in CountVectorizer to use a
# custom function using spaCy's tokenizer

In [7]:
def filter_score(word_pos_score, threshold):
    '''
    checks if word probability is greater than a specified threshold
    '''
    regex = re.compile('(.*)_(.*)')
    output = regex.findall(word_pos_score)
    return float(output[0][1]) >= threshold

def filter_score_wordphrase(wordphrase, threshold):
    '''
    checks if all words are greater than a specified threshold
    '''
    return np.array(all([filter_score(word, threshold)
                         for word in wordphrase.split()]))

In [8]:
def sentence_frag(spacy_sent, wordphrase, word_dist, full=False):
    '''
    returns sentence fragment of words around wordphrase
    '''
    if not spacy_sent:
        return None
    
    matched = False
    pops = 0
    words = [x.lemma_.strip().lower() for x in spacy_sent]
    
    search_term = parse_feature(wordphrase).split()
    phrase_len = len(parse_feature(wordphrase).split())
    
    if phrase_len == 2:
        while not matched:
            try:
                possible_match = words.index(search_term[0])
                words_wo_stops = [tok for i, tok in enumerate(words) 
                                  if tok not in STOPS or i < possible_match]
                if words_wo_stops[possible_match + 1] == search_term[1]:
                    idx = possible_match + pops
                    matched = True
                else:
                    words.pop(possible_match)
                    pops += 1
            except:
                return None
    else:
        while not matched:
            try:
                possible_match = words.index(search_term[0])
                words_wo_stops = [tok for i, tok in enumerate(words) 
                                  if tok not in STOPS or i < possible_match]
                if words_wo_stops[possible_match + 1] == search_term[1] \
                and words_wo_stops[possible_match + 2] == search_term[2]:
                    idx = possible_match + pops
                    matched = True
                else:
                    words.pop(possible_match)
                    pops += 1
            except:
                return None
    
    if full:
        return spacy_sent
    
    min_idx = max(0, idx-word_dist)
    max_idx = min(idx+word_dist+1, len(spacy_sent))
    
    return spacy_sent[min_idx:max_idx]

In [9]:
def parse_feature(wordphrase):
    '''
    parses phrase of word_pos_score words into just words
    '''
    regex = re.compile('(.*)_(.*_.*)')
    return " ".join([regex.findall(word_pos_score)[0][0]
                     for word_pos_score in wordphrase.split()])

def get_sentences(review, wordphrase):
    '''
    returns sentences containing wordphrase in amazon reviews
    '''
    review = parser(unicode(review))
    
    sentences = []
    for sent in review.sents:
        if sentence_frag(sent, wordphrase, 0, True):
            sentences.append(sent)
    
    if not sentences:
        return [None]
    return sentences

In [10]:
class get_aspects_occurring_n_times(object):

    def __init__(self, aspect_thresh=8, freq_thresh=5):
        self.aspect_thresh = aspect_thresh
        self.freq_thresh = freq_thresh

    def scrape(self):
        url = raw_input('url of amazon product: ')
        id_ = get_id(url)
        n_reviews = 100
        
        folder = os.getcwd() + '/reviews/com/' + id_
        
        if not os.path.isdir(folder):
            # Run Amazon scraper
            # Credit to Andrea Esuli
            # https://github.com/aesuli/amadown2py
            os.system('python amazon_crawler.py '
                      '-d com {} -m {} -o reviews'.format(id_, n_reviews))

        ratings, reviews = extract(id_)
        
        self.reviews = reviews

    def run(self):
        vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(2, 3))
        X = vectorizer.fit_transform(self.reviews).toarray()

        features = vectorizer.get_feature_names()
        total_count = np.apply_along_axis(np.count_nonzero, 0, X)

        most_common = np.argsort(total_count)[::-1]
        words = np.array(features)[most_common]
        freq = total_count[most_common]

        vfunc = np.vectorize(filter_score_wordphrase)
        filters = vfunc(words, self.aspect_thresh)

        freq_filter = freq[filters] >= self.freq_thresh
        most_common_idx = most_common[filters][freq_filter]
        words_filtered = words[filters][freq_filter]

        self.count_vec = X
        self.aspects = words_filtered
        self.aspect_dict = dict(zip(words_filtered, most_common_idx))

    def grab_sentences(self, wordphrase):

        reviews = np.array(self.reviews)
        filtering = self.count_vec[:, self.aspect_dict[wordphrase]] > 0

        filtered_reviews = reviews[filtering]

        vfunc2 = np.vectorize(get_sentences)
        wordphrase_sentences = vfunc2(filtered_reviews, wordphrase)

        sentences = []

        for review in wordphrase_sentences[0:].flatten():
            for sent in review:
                frag = sentence_frag(sent, wordphrase, 5)
                if frag:
                    sentences.append(sent)

        if not sentences:
            return None
            
        return sentences

## Test for get_aspects_occurring_n_times

In [11]:
doc1 = get_aspects_occurring_n_times()
doc1.scrape()
doc1.run()

url of amazon product: https://www.amazon.com/Nonstick-Jumbo-Cooker-inches-5-5-Quart/dp/B00VGZ9XVA/ref=sr_1_2?s=kitchen&ie=UTF8&qid=1469693916&sr=1-2-spons&keywords=frying+pan&psc=1


In [12]:
doc2 = get_aspects_occurring_n_times()
doc2.scrape()
doc2.run()

url of amazon product: https://www.amazon.com/GOTHAM-STEEL-inches-Non-stick-Titanium/dp/B018T909XG/ref=sr_1_2?s=kitchen&ie=UTF8&qid=1469694011&sr=1-2&keywords=frying+pan


In [13]:
common_aspects = set(doc1.aspects) & set(doc2.aspects)
common_aspects

{u'discount_NOUN_11.1 exchange_NOUN_10.5',
 u'discount_NOUN_11.1 exchange_NOUN_10.5 honest_ADJ_9.43',
 u'exchange_NOUN_10.5 honest_ADJ_9.43',
 u'non_ADJ_8.65 stick_NOUN_9.41',
 u'non_X_8.65 stick_NOUN_9.41'}

In [14]:
sid = SentimentIntensityAnalyzer()
aspect = list(common_aspects)[0]

In [15]:
aspect_sentences = doc1.grab_sentences(aspect)

print "term =", parse_feature(aspect)
print
print '-' * 40
print

for sent in aspect_sentences:
    ss = sid.polarity_scores(sent.string)
    print sent
    print
    for k in sorted(ss):
        print '{0}: {1}, '.format(k, ss[k])
    print
    print '-'*40
    print

term = non stick

----------------------------------------

It's non-stick and dishwasher safe, because if you know me, I'm all about  convenience.  

compound: 0.4404, 
neg: 0.0, 
neu: 0.818, 
pos: 0.182, 

----------------------------------------

It heats smoothly and evenly, washes up without the need to scrub, and is totally non-stick.

compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 

----------------------------------------

The pan is dishwasher safe, non stick, and it is a great size for cooking full size family meals.

compound: 0.7906, 
neg: 0.0, 
neu: 0.696, 
pos: 0.304, 

----------------------------------------

The item was sent just in time and i'm happy to cook my favorite dish using your non-stick pan.

compound: 0.7717, 
neg: 0.0, 
neu: 0.717, 
pos: 0.283, 

----------------------------------------

This skillet is non-stick, it  is made with 2 layers of scratch resistant non stick and also dishwasher safe.

compound: 0.4404, 
neg: 0.0, 
neu: 0.854, 
pos: 0.146, 

--

In [16]:
aspect_sentences = doc2.grab_sentences(aspect)

print "term =", parse_feature(aspect)
print
print '-' * 40
print

for sent in aspect_sentences:
    ss = sid.polarity_scores(sent.string)
    print sent
    print
    for k in sorted(ss):
        print '{0}: {1}, '.format(k, ss[k])
    print
    print '-'*40
    print

term = non stick

----------------------------------------

I was upset but who cares for the bottom of it is non stick right?

compound: 0.4939, 
neg: 0.101, 
neu: 0.674, 
pos: 0.225, 

----------------------------------------

After using for two months is not non stick anymore.

compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 

----------------------------------------

The handle never gets hot, even if you put it under the grill and it is 100% non-stick!!!

compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 

----------------------------------------

There are lots of five-star reviews, but also quite a lot of one-star reviews criticizing the product and its features, in particular, it's non-stick claims.

compound: -0.5023, 
neg: 0.124, 
neu: 0.876, 
pos: 0.0, 

----------------------------------------

No oil or butter needed- It weighs less than frying pans of comparable size- 100% Safe: PFOA, PTFE, and PFOS-Free- Oven safe up to 500 degrees (F)- Dishwasher safeBut the reverse side

In [17]:
aspect_sentences_1 = doc1.grab_sentences(aspect)
aspect_sentences_2 = doc2.grab_sentences(aspect)

aspect1_scores = []
aspect2_scores = []

for sent in aspect_sentences_1:
    ss = sid.polarity_scores(sent.string)
    aspect1_scores.append(ss['pos'] - ss['neg'])
    
for sent in aspect_sentences_2:
    ss = sid.polarity_scores(sent.string)
    aspect2_scores.append(ss['pos'] - ss['neg'])
    
if np.mean(aspect1_scores) > np.mean(aspect2_scores):
    print "item 1 is better for {}".format(parse_feature(aspect))
else:
    print "item 2 is better for {}".format(parse_feature(aspect))

item 1 is better for non stick


### NOTES

<span style="color:red">NEED TO FIX</span>  
Code below does not work when extracting aspects from sentences where aspect is seperated by punctuation

example:  
u'x_NOUN_9.81 t10_NOUN_15.9' (possibly because of hyphen?)  
https://www.amazon.com/Fujifilm-X-T10-Silver-Mirrorless-OIS/dp/B00X7QTVSQ/ref=sr_1_1?ie=UTF8&qid=1469675582&sr=8-1&keywords=mirrorless+camera+fuji

  
VADER returns neutral for everything, not sure how to make useful

### Code to test out Aspect Extractor

In [18]:
url = raw_input('url of amazon product: ')
id_ = get_id(url)
n_reviews = 100

folder = os.getcwd() + '/reviews/com/' + id_

if not os.path.isdir(folder):
    # Run Amazon scraper
    # Credit to Andrea Esuli
    # https://github.com/aesuli/amadown2py
    os.system('python amazon_crawler.py '
              '-d com {} -m {} -o reviews'.format(id_, n_reviews))

ratings, reviews = extract(id_)

url of amazon product: https://www.amazon.com/dp/B00H0OJPTG?psc=1


In [19]:
ratings

[5,
 4,
 5,
 3,
 5,
 4,
 5,
 4,
 5,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 5,
 4,
 4,
 5,
 5,
 4,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 1,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5]

In [20]:
pprint([x[:50] for x in reviews])

[u'I am blessed to be able to use my grill year round',
 u'These oven mitts work great. They are flexible but',
 u"There are great mitts. They don't fall off you han",
 u"It doesn't work for small hands at all.  I can't d",
 u'This product was recommended to me by a friend. I ',
 u"For most of my life, oven mitts hadn't changed. Th",
 u'You need new oven mitts? Why not triumphant chef? ',
 u'Very grippy, not too thick at all, but still keeps',
 u'Not long ago a trusted friend and top 50 amazon.co',
 u'I ordered two pairs of these oven mitts as I wante',
 u'Was skeptical of buying silicone oven mitts but we',
 u'I decided to replace the oven mitts that we bought',
 u"I love these gloves. They're like a regular hot pa",
 u'Mittens we have been using were too thin so I coul',
 u'This was a purchase from Amazon I was very leery c',
 u'What a great product. They mitts make loading our ',
 u'I was able to purchase these oven mitts for a disc',
 u'My husband has extra-large hands and none of 

In [21]:
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(2, 3))

In [22]:
X = vectorizer.fit_transform(reviews)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
features = vectorizer.get_feature_names()
features

[u'.simple_NOUN_19.5 clean_VERB_9.47',
 u'.simple_NOUN_19.5 clean_VERB_9.47 simple_ADJ_9.06',
 u'420*f_NOUN_19.5 plus_CONJ_9.65',
 u'420*f_NOUN_19.5 plus_CONJ_9.65 fact_NOUN_8.03',
 u'4th_NOUN_10.5 fumble_ADJ_12.9',
 u'4th_NOUN_10.5 fumble_ADJ_12.9 scenario_NOUN_10.4',
 u'ability_NOUN_9.29 grip_VERB_11.1',
 u'ability_NOUN_9.29 grip_VERB_11.1 easy_ADJ_8.48',
 u'able_ADJ_7.93 bury_VERB_12.4',
 u'able_ADJ_7.93 bury_VERB_12.4 location_NOUN_10.2',
 u'able_ADJ_7.93 grab_VERB_10.2',
 u'able_ADJ_7.93 grab_VERB_10.2 thing_NOUN_7.06',
 u'able_ADJ_7.93 pick_VERB_8.79',
 u'able_ADJ_7.93 pick_VERB_8.79 hot_ADJ_9.14',
 u'able_ADJ_7.93 purchase_VERB_10.2',
 u'able_ADJ_7.93 purchase_VERB_10.2 oven_ADJ_11.5',
 u'able_ADJ_7.93 remove_VERB_9.79',
 u'able_ADJ_7.93 remove_VERB_9.79 exceptionally_ADV_12.4',
 u'able_ADJ_7.93 turn_VERB_8.65',
 u'able_ADJ_7.93 turn_VERB_8.65 chicken_NOUN_10.2',
 u'able_ADJ_7.93 use_VERB_7.08',
 u'able_ADJ_7.93 use_VERB_7.08 grill_NOUN_12.0',
 u'able_ADJ_7.93 use_VERB_7.08 man_

In [24]:
total_count = np.apply_along_axis(np.count_nonzero, 0, X.toarray())
print total_count.shape
print X.shape

(8970,)
(100, 8970)


In [25]:
most_common = np.argsort(total_count)[:-41:-1]
words = np.array(features)[most_common]
pprint(list(words), width=1)

[u'oven_NOUN_11.5 mitt_NOUN_14.0',
 u'oven_ADJ_11.5 mitt_NOUN_14.0',
 u'silicone_NOUN_13.1 oven_NOUN_11.5',
 u'silicone_NOUN_13.1 oven_NOUN_11.5 mitt_NOUN_14.0',
 u'feel_VERB_7.34 heat_NOUN_10.0',
 u'silicone_NOUN_13.1 mitt_NOUN_14.0',
 u'highly_ADV_9.74 recommend_VERB_9.18',
 u'small_ADJ_8.37 hand_NOUN_8.50',
 u'easy_ADJ_8.48 clean_VERB_9.47',
 u'large_ADJ_8.84 hand_NOUN_8.50',
 u'pot_NOUN_10.4 holder_NOUN_12.1',
 u'triumphant_ADJ_14.6 chef_NOUN_12.1',
 u'burn_VERB_10.1 hand_NOUN_8.50',
 u'time_NOUN_6.37 use_VERB_7.08',
 u'mitt_NOUN_14.0 silicone_NOUN_13.1',
 u'pair_NOUN_10.1 oven_ADJ_11.5 mitt_NOUN_14.0',
 u'super_ADJ_8.93 flex_ADJ_12.1',
 u'long_ADJ_7.54 time_NOUN_6.37',
 u'pair_NOUN_10.1 oven_ADJ_11.5',
 u'basting_NOUN_16.2 brush_NOUN_10.9',
 u'pan_NOUN_11.2 oven_NOUN_11.5',
 u'use_VERB_7.08 time_NOUN_6.37',
 u'oven_VERB_11.5 mitt_NOUN_14.0',
 u'flex_ADJ_12.1 silicone_NOUN_13.1',
 u'flex_ADJ_12.1 silicone_NOUN_13.1 oven_NOUN_11.5',
 u'best_ADJ_7.49 oven_NOUN_11.5',
 u'silicone_NOUN

In [26]:
# list with more rarer words
most_common = np.argsort(total_count)[:-41:-1]
words = np.array(features)[most_common]
freq = total_count[most_common]

vfunc = np.vectorize(filter_score_wordphrase)
filters = vfunc(words, 8)

pprint(zip(list(words[filters]), freq))

[(u'oven_NOUN_11.5 mitt_NOUN_14.0', 30),
 (u'oven_ADJ_11.5 mitt_NOUN_14.0', 20),
 (u'silicone_NOUN_13.1 oven_NOUN_11.5', 13),
 (u'silicone_NOUN_13.1 oven_NOUN_11.5 mitt_NOUN_14.0', 12),
 (u'silicone_NOUN_13.1 mitt_NOUN_14.0', 11),
 (u'highly_ADV_9.74 recommend_VERB_9.18', 11),
 (u'small_ADJ_8.37 hand_NOUN_8.50', 9),
 (u'easy_ADJ_8.48 clean_VERB_9.47', 9),
 (u'large_ADJ_8.84 hand_NOUN_8.50', 9),
 (u'pot_NOUN_10.4 holder_NOUN_12.1', 9),
 (u'triumphant_ADJ_14.6 chef_NOUN_12.1', 8),
 (u'burn_VERB_10.1 hand_NOUN_8.50', 8),
 (u'mitt_NOUN_14.0 silicone_NOUN_13.1', 8),
 (u'pair_NOUN_10.1 oven_ADJ_11.5 mitt_NOUN_14.0', 7),
 (u'super_ADJ_8.93 flex_ADJ_12.1', 7),
 (u'pair_NOUN_10.1 oven_ADJ_11.5', 6),
 (u'basting_NOUN_16.2 brush_NOUN_10.9', 6),
 (u'pan_NOUN_11.2 oven_NOUN_11.5', 6),
 (u'oven_VERB_11.5 mitt_NOUN_14.0', 6),
 (u'flex_ADJ_12.1 silicone_NOUN_13.1', 6),
 (u'flex_ADJ_12.1 silicone_NOUN_13.1 oven_NOUN_11.5', 6),
 (u'silicone_NOUN_13.1 glove_NOUN_12.3', 6),
 (u'pick_VERB_8.79 hot_ADJ_9.14

In [27]:
freq_filter = freq[filters] >= 5
zip(list(words[filters][freq_filter]), freq[filters][freq_filter])

[(u'oven_NOUN_11.5 mitt_NOUN_14.0', 30),
 (u'oven_ADJ_11.5 mitt_NOUN_14.0', 20),
 (u'silicone_NOUN_13.1 oven_NOUN_11.5', 13),
 (u'silicone_NOUN_13.1 oven_NOUN_11.5 mitt_NOUN_14.0', 12),
 (u'silicone_NOUN_13.1 mitt_NOUN_14.0', 11),
 (u'highly_ADV_9.74 recommend_VERB_9.18', 9),
 (u'small_ADJ_8.37 hand_NOUN_8.50', 9),
 (u'easy_ADJ_8.48 clean_VERB_9.47', 9),
 (u'large_ADJ_8.84 hand_NOUN_8.50', 9),
 (u'pot_NOUN_10.4 holder_NOUN_12.1', 8),
 (u'triumphant_ADJ_14.6 chef_NOUN_12.1', 8),
 (u'burn_VERB_10.1 hand_NOUN_8.50', 8),
 (u'mitt_NOUN_14.0 silicone_NOUN_13.1', 7),
 (u'pair_NOUN_10.1 oven_ADJ_11.5 mitt_NOUN_14.0', 6),
 (u'super_ADJ_8.93 flex_ADJ_12.1', 6),
 (u'pair_NOUN_10.1 oven_ADJ_11.5', 6),
 (u'basting_NOUN_16.2 brush_NOUN_10.9', 6),
 (u'pan_NOUN_11.2 oven_NOUN_11.5', 6),
 (u'oven_VERB_11.5 mitt_NOUN_14.0', 6),
 (u'flex_ADJ_12.1 silicone_NOUN_13.1', 5),
 (u'flex_ADJ_12.1 silicone_NOUN_13.1 oven_NOUN_11.5', 5),
 (u'silicone_NOUN_13.1 glove_NOUN_12.3', 5),
 (u'pick_VERB_8.79 hot_ADJ_9.14'

In [28]:
index = 11

filtering = X.toarray()[:, most_common[filters][freq_filter][index]] > 0
filtered_reviews = np.array(reviews)[filtering]

vfunc2 = np.vectorize(get_sentences)
wordphrase_sentences = vfunc2(filtered_reviews, words[filters][index])

for review in wordphrase_sentences[0:].flatten():
    for sent in review:
        frag = sentence_frag(sent, words[filters][index], 10)
        if frag:
            print frag
            print

absolutely fantastic!I don't have to worry about hot spills burning my hands, the grip is fabulous, and the

they were cloth with enough insulation to keep you from burning your hands.

I always manage to burn my hands when using the traditional square pot holder so

I like not worrying about burning my hands, too!!

They are very comfortable and I haven't burned my hands with them on.

You cannot burn your hands.  

They might melt or burn your hand.

My arm above the mitts was burned, but my hands and wrists never even felt warm

