In [1]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pprint import pprint
from spacy.en import English
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import os
import re
import requests
import string

In [2]:
parser = English()

In [3]:
def get_id(url):
    '''
    gets id identifer for amazon product
    '''
    
    # url format: https://www.amazon.com/.../.../id/...
    regex = re.compile(r'(?<=/)[^/]*')
    id_ = regex.findall(url)[-2]

    if len(id_) != 10:
        # url format https://www.amazon.com/.../id
        id_ = regex.findall(url)[-1]

    return id_


def extract(id_):
    '''
    extracts the star rating and review text from directory of
    amazon html files
    '''
    ratings = []
    reviews = []

    path = 'reviews/com/{}/'.format(id_)
    pages = [file_ for file_ in os.listdir(path) if file_[-5:] == '.html']

    for page in pages:
        html = open(path + page, 'r')
        soup = BeautifulSoup(html, 'html.parser')
        tags = soup.findAll("div", {"class": "a-section review"})

        if not tags:
            break

        for tag in tags:
            rating = int(tag.find('i').text[0])
            review = tag.findAll("span",
                                 {"class": "a-size-base review-text"})[0].text
            ratings.append(rating)
            reviews.append(review)

    return ratings, reviews

In [4]:
# Loosely adopted from Nic Scharding
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

# A custom stoplist
STOPLIST = set(stopwords.words('english') +
               ["n't", "'s", "'m", "ca", "'re"] +
               list(ENGLISH_STOP_WORDS))

# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ")
SYMBOLS += ["-----", "---", "...", "“", "”", "'ve", ""]

# Full set of stops
STOPS = STOPLIST ^ set(SYMBOLS)

In [5]:
def word_pos_filter(word_pos):
    '''
    filters out stopwords and words with POS we don't care about
    '''
    toss_pos = set(['PUNCT', 'SPACE', 'NUM'])
    
    regex = re.compile('(.*)_(.*)')
    output = regex.findall(word_pos)[0]
    
    if output[0] not in STOPS and output[1] not in toss_pos:
        return True
    else:
        return False
    
def word_prob(word_pos):
    '''
    appends how frequent word appears in english dictionary
    '''
    regex = re.compile('(.*)_(.*)')
    output = regex.findall(word_pos)[0]
    
    return word_pos + '_' + str(parser.vocab[output[0]].prob)[1:5]

In [6]:
# Loosely adopted from Nic Scharding
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

# Every step in a pipeline needs to be a "transformer".
# Define a custom transformer to clean text using spaCy


def tokenizeText(sample):
    '''
    A custom function to tokenize the text using spaCy and convert to 
    lemmas. Suffixes are also added to tokens to indicate POS and
    word probability.
    '''

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize and add POS
    lemmas = []

    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() + '_' + tok.pos_
                      if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if word_pos_filter(tok)]
    
    # add the probability of word
    tokens = [word_prob(tok) for tok in tokens]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

# the vectorizer and classifer to use
# note that I changed the tokenizer in CountVectorizer to use a
# custom function using spaCy's tokenizer

In [7]:
def filter_score(word_pos_score, threshold):
    '''
    checks if word probability is greater than a specified threshold
    '''
    regex = re.compile('(.*)_(.*)')
    output = regex.findall(word_pos_score)
    return float(output[0][1]) >= threshold

def filter_score_wordphrase(wordphrase, threshold):
    '''
    checks if all words are greater than a specified threshold
    '''
    return np.array(all([filter_score(word, threshold)
                         for word in wordphrase.split()]))

In [8]:
def parse_feature(wordphrase):
    '''
    parses phrase of word_pos_score words into just words
    '''
    regex = re.compile('(.*)_(.*_.*)')
    return " ".join([regex.findall(word_pos_score)[0][0]
                     for word_pos_score in wordphrase.split()])

def get_sentences(review, wordphrase):
    '''
    returns sentences containing wordphrase in amazon reviews
    '''
    review = parser(unicode(review))
    
    sentences = []
    for sent in review.sents:
        if parse_feature(wordphrase) in sent.string.lower():
            sentences.append(sent)
    
    if not sentences:
        return [None]
    return sentences

In [9]:
def sentence_frag(spacy_sent, wordphrase, word_dist):
    '''
    returns sentence fragment of words around wordphrase
    '''
    if not spacy_sent:
        return None
    
    words = [x.lemma_.strip().lower() for x in spacy_sent]
    
    phrase_len = len(parse_feature(wordphrase).split())
    
    if phrase_len == 2:
        ngram_list = [" ".join(x) for x in zip(words, words[1:])]
    else:
        ngram_list = [" ".join(x) for x in zip(words, words[1:], words[2:])]
        
    try:
        idx = ngram_list.index(parse_feature(wordphrase))
    except:
        return None
    
    min_idx = max(0, idx-word_dist)
    max_idx = min(idx+word_dist+1, len(spacy_sent))
    
    return spacy_sent[min_idx:max_idx]

In [10]:
class get_aspects_occurring_n_times(object):

    def __init__(self, aspect_thresh=8, freq_thresh=5):
        self.aspect_thresh = aspect_thresh
        self.freq_thresh = freq_thresh

    def run(self):
        url = raw_input('url of amazon product: ')
        id_ = get_id(url)
        n_reviews = 100

        # Run Amazon scraper
        # Credit to Andrea Esuli
        # https://github.com/aesuli/amadown2py
        os.system('python amazon_crawler.py '
                  '-d com {} -m {} -o reviews'.format(id_, n_reviews))

        ratings, reviews = extract(id_)

        vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(2, 3))
        X = vectorizer.fit_transform(reviews).toarray()

        features = vectorizer.get_feature_names()
        total_count = np.apply_along_axis(np.count_nonzero, 0, X)

        most_common = np.argsort(total_count)[::-1]
        words = np.array(features)[most_common]
        freq = total_count[most_common]

        vfunc = np.vectorize(filter_score_wordphrase)
        filters = vfunc(words, self.aspect_thresh)

        freq_filter = freq[filters] >= self.freq_thresh
        most_common_idx = most_common[filters][freq_filter]
        words_filtered = words[filters][freq_filter]

        self.reviews = reviews
        self.count_vec = X
        self.aspects = words_filtered
        self.aspect_dict = dict(zip(words_filtered, most_common_idx))

    def grab_sentences(self, wordphrase):

        reviews = np.array(self.reviews)
        filtering = self.count_vec[:, self.aspect_dict[wordphrase]] > 0

        filtered_reviews = reviews[filtering]

        vfunc2 = np.vectorize(get_sentences)
        wordphrase_sentences = vfunc2(filtered_reviews, wordphrase)

        sentences = []

        for review in wordphrase_sentences[0:].flatten():
            for sent in review:
                frag = sentence_frag(sent, wordphrase, 5)
                if frag:
                    sentences.append(sent)

        if not sentences:
            return None
            
        return sentences

## Test for get_aspects_occurring_n_times

In [11]:
doc1 = get_aspects_occurring_n_times()
doc1.run()

url of amazon product: https://www.amazon.com/Fujifilm-X-T10-Silver-Mirrorless-OIS/dp/B00X7QTVSQ/ref=sr_1_1?ie=UTF8&qid=1469675582&sr=8-1&keywords=mirrorless+camera+fuji


In [16]:
doc2 = get_aspects_occurring_n_times()
doc2.run()

url of amazon product: https://www.amazon.com/Sony-Mirrorless-Digital-Camera-16-50mm/dp/B007GK50X4/ref=sr_1_1?s=photo&ie=UTF8&qid=1469687704&sr=1-1-spons&keywords=mirrorless+camera&psc=1


In [67]:
common_aspects = set(doc1.aspects) & set(doc2.aspects)
common_aspects

{u'auto_NOUN_10.3 focus_NOUN_9.53',
 u'image_NOUN_9.34 quality_NOUN_9.02',
 u'kit_NOUN_10.6 lens_NOUN_11.1',
 u'mirrorless_NOUN_15.0 camera_NOUN_9.73'}

In [68]:
sid = SentimentIntensityAnalyzer()
aspect = list(common_aspects)[0]

In [69]:
aspect_sentences = doc1.grab_sentences(aspect)

print "term =", parse_feature(aspect)
print
print '-' * 40
print

for sent in aspect_sentences:
    ss = sid.polarity_scores(sent.string)
    print sent
    print
    for k in sorted(ss):
        print '{0}: {1}, '.format(k, ss[k])
    print
    print '-'*40
    print

term = auto focus

----------------------------------------

Very sharp pictures, great color rendition (I shoot JPEG - the out-of-camera shots are usually excellent and require no post-processing)- Fuji lenses seem to be very high quality based on multiple independent reviews- Quick, solid auto focus (I shoot mostly single, haven't explored much of continuous/burst yet)- Feature rich options that should accommodate most users' needs- Highly customizable and easy interface (even coming from a different system)Cons:- Lenses are pretty expensive (but, as stated above, you get what you pay for)- Small size may be an issue for some (inadvertently hitting back buttons; heavy lenses may feel unbalanced, requiring optional grip)-

compound: 0.9221, 
neg: 0.072, 
neu: 0.761, 
pos: 0.168, 

----------------------------------------

You have to use them like you would an old camera, no auto focus and no stop down, but I can get some new life out of some old glass.

compound: -0.2617, 
neg: 0.152

In [70]:
aspect_sentences = doc2.grab_sentences(aspect)

print "term =", parse_feature(aspect)
print
print '-' * 40
print

for sent in aspect_sentences:
    ss = sid.polarity_scores(sent.string)
    print sent
    print
    for k in sorted(ss):
        print '{0}: {1}, '.format(k, ss[k])
    print
    print '-'*40
    print

term = auto focus

----------------------------------------

Just a few years ago, mirrorless cameras had the true reputation of being significantly behind dSLRs in auto focus systems.  

compound: 0.4215, 
neg: 0.0, 
neu: 0.865, 
pos: 0.135, 

----------------------------------------

The autofocus system is state of the art; the tracking and continuous auto focus makes sure you never miss that perfect moment.

compound: -0.0654, 
neg: 0.116, 
neu: 0.738, 
pos: 0.145, 

----------------------------------------

better auto focus (from what other reviews are saying I don't have the 80d so....)4.

compound: 0.4404, 
neg: 0.0, 
neu: 0.818, 
pos: 0.182, 

----------------------------------------

It's sensor, it's image quality, it's speed, it's Auto Focus and it's speed.

compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 

----------------------------------------

These mirrorless cameras have ridiculous Auto Focus points and doesn't lose focus even in fast moving objects like a dog or a c

In [71]:
aspect_sentences_1 = doc1.grab_sentences(aspect)
aspect_sentences_2 = doc2.grab_sentences(aspect)

aspect1_pos_scores = []
aspect1_neg_scores = []
aspect2_pos_scores = []
aspect2_neg_scores = []

for sent in aspect_sentences_1:
    ss = sid.polarity_scores(sent.string)
    aspect1_pos_scores.append(ss['pos'])
    aspect1_neg_scores.append(ss['neg'])
    
for sent in aspect_sentences_2:
    ss = sid.polarity_scores(sent.string)
    aspect2_pos_scores.append(ss['pos'])
    aspect2_neg_scores.append(ss['neg'])

In [72]:
print aspect1_pos_scores
print aspect2_pos_scores

if np.mean(aspect1_pos_scores) > np.mean(aspect2_pos_scores):
    print "item 1 is better for {}".format(parse_feature(aspect))
else:
    print "item 2 is better for {}".format(parse_feature(aspect))

[0.168, 0.055, 0.0, 0.0, 0.146, 0.093, 0.367]
[0.135, 0.145, 0.182, 0.0, 0.192, 0.15, 0.12, 0.0, 0.147, 0.07, 0.299, 0.0, 0.289, 0.572, 0.0]
item 2 is better for auto focus


In [73]:
print aspect1_neg_scores
print aspect2_neg_scores

if np.mean(aspect1_neg_scores) > np.mean(aspect2_neg_scores):
    print "item 1 is worse for {}".format(parse_feature(aspect))
else:
    print "item 2 is worse for {}".format(parse_feature(aspect))

[0.072, 0.152, 0.0, 0.0, 0.076, 0.0, 0.0]
[0.0, 0.116, 0.0, 0.0, 0.126, 0.262, 0.0, 0.0, 0.0, 0.045, 0.03, 0.0, 0.059, 0.0, 0.0]
item 1 is worse for auto focus


### NOTES

<span style="color:red">NEED TO FIX</span>  
Code below does not work when extracting aspects from sentences where aspect is seperated by punctuation

example:  
u'x_NOUN_9.81 t10_NOUN_15.9' (possibly because of hyphen?)  
https://www.amazon.com/Fujifilm-X-T10-Silver-Mirrorless-OIS/dp/B00X7QTVSQ/ref=sr_1_1?ie=UTF8&qid=1469675582&sr=8-1&keywords=mirrorless+camera+fuji

  
VADER returns neutral for everything, not sure how to make useful

### Code to test out Aspect Extractor

In [18]:
url = raw_input('url of amazon product: ')
id_ = get_id(url)
n_reviews = 100

# Run Amazon scraper
# Credit to Andrea Esuli
# https://github.com/aesuli/amadown2py
os.system('python amazon_crawler.py '
          '-d com {} -m {} -o reviews'.format(id_, n_reviews))

ratings, reviews = extract(id_)

url of amazon product: https://www.amazon.com/Fujifilm-X-T10-Silver-Mirrorless-OIS/dp/B00X7QTVSQ/ref=sr_1_1?ie=UTF8&qid=1469675582&sr=8-1&keywords=mirrorless+camera+fuji


In [74]:
ratings

[5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 3,
 1,
 1,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 3,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 4,
 5,
 5,
 5,
 4,
 5,
 4,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 5,
 5,
 5,
 3,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 3,
 5,
 5,
 5]

In [75]:
pprint([x[:50] for x in reviews])

[u'I bought this last month and took this on my trip ',
 u'Overall a very nice camera that takes excellent st',
 u'I got to try a pre-production model for a few days',
 u"I've had this camera for a couple of weeks now and",
 u'Rambling on time....I owned the Great X-T1 a littl',
 u'After years of carrying every brand of DSLR made I',
 u'Debated between this and the X-T1 and ended up wit',
 u'I love this camera! I decided to buy this one inst',
 u'This is the best Fuji X- Camera I have used. A nic',
 u'I got started into the world of Fujifilm (coming f',
 u'Very sharp - fun little camera. Bit of learning cu',
 u'Great camera so far',
 u'Ok',
 u"Cheap quality and Fuji doesn't honor their warrant",
 u'I bought a fuji film camera the other day and just',
 u'Difficult for an amateur',
 u"When I first started using an SLR in the '70's, I ",
 u'I love this camera. I considered buying the XT1, b',
 u'I like using this camera a lot. It has a solid fee',
 u'I bought the X-T10 as a backup camera 

In [76]:
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(2, 3))

In [77]:
X = vectorizer.fit_transform(reviews)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [78]:
features = vectorizer.get_feature_names()
features

[u'!!!i_NOUN_19.5 try_VERB_7.54',
 u'!!!i_NOUN_19.5 try_VERB_7.54 camera_NOUN_9.73',
 u'!!i_X_19.5 think_VERB_6.18',
 u'!!i_X_19.5 think_VERB_6.18 16mp_ADJ_17.9',
 u'!)but_NOUN_19.5 camera_NOUN_9.73',
 u'!)but_NOUN_19.5 camera_NOUN_9.73 like_ADP_5.61',
 u'(btw_CONJ_19.5 think_VERB_6.18',
 u'(btw_CONJ_19.5 think_VERB_6.18 like_ADP_5.61',
 u'+1sharpnes_NOUN_19.5 -1noise_ADJ_19.5',
 u'+1sharpnes_NOUN_19.5 -1noise_ADJ_19.5 reduction_NOUN_11.5',
 u'+2_ADJ_12.6 shadow_NOUN_11.1',
 u'+2_ADJ_12.6 shadow_NOUN_11.1 +1sharpnes_NOUN_19.5',
 u'-1noise_ADJ_19.5 reduction_NOUN_11.5',
 u'-1noise_ADJ_19.5 reduction_NOUN_11.5 bracketing_NOUN_15.8',
 u'.i_X_16.0 great_ADJ_7.42',
 u'.i_X_16.0 great_ADJ_7.42 x_NOUN_9.81',
 u'.love_ADP_18.8 jpegs_NOUN_15.7',
 u'.love_ADP_18.8 jpegs_NOUN_15.7 right_ADJ_6.93',
 u'1-amazon.com_NOUN_19.5 b&h_NOUN_19.5',
 u'1-amazon.com_NOUN_19.5 b&h_NOUN_19.5 photo)+_NOUN_19.5',
 u'12mm_NOUN_16.0 that.the_DET_19.5',
 u'12mm_NOUN_16.0 that.the_DET_19.5 image_NOUN_9.34',
 u'140mm

In [79]:
total_count = np.apply_along_axis(np.count_nonzero, 0, X.toarray())
print total_count.shape
print X.shape

(10613,)
(96, 10613)


In [80]:
most_common = np.argsort(total_count)[:-41:-1]
words = np.array(features)[most_common]
pprint(list(words), width=1)

[u'x_NOUN_9.81 t10_NOUN_15.9',
 u'great_ADJ_7.42 camera_NOUN_9.73',
 u'love_VERB_7.37 camera_NOUN_9.73',
 u'kit_NOUN_10.6 lens_NOUN_11.1',
 u'image_NOUN_9.34 quality_NOUN_9.02',
 u'x_NOUN_9.81 t1_NOUN_14.0',
 u'camera_NOUN_9.73 small_ADJ_8.37',
 u'mirrorless_NOUN_15.0 camera_NOUN_9.73',
 u'camera_NOUN_9.73 great_ADJ_7.42',
 u'excellent_ADJ_10.1 camera_NOUN_9.73',
 u'lcd_NOUN_14.7 screen_NOUN_9.32',
 u'use_VERB_7.08 camera_NOUN_9.73',
 u'auto_NOUN_10.3 focus_NOUN_9.53',
 u'small_ADJ_8.37 size_NOUN_9.18',
 u'large_ADJ_8.84 hand_NOUN_8.50',
 u'hold_VERB_9.07 camera_NOUN_9.73',
 u'weather_NOUN_10.4 seal_VERB_11.7',
 u'camera_NOUN_9.73 want_VERB_6.69',
 u'high_ADJ_7.96 quality_NOUN_9.02',
 u'manual_ADJ_10.9 control_NOUN_8.75',
 u'camera_NOUN_9.73 use_VERB_7.08',
 u'easy_ADJ_8.48 use_VERB_7.08',
 u'highly_ADV_9.74 recommend_VERB_9.18',
 u'really_ADV_6.37 love_VERB_7.37',
 u'camera_NOUN_9.73 good_ADJ_6.51',
 u'small_ADJ_8.37 compact_ADJ_12.5',
 u'lens_NOUN_11.1 camera_NOUN_9.73',
 u'small_ADJ

In [81]:
# list with more rarer words
most_common = np.argsort(total_count)[:-41:-1]
words = np.array(features)[most_common]
freq = total_count[most_common]

vfunc = np.vectorize(filter_score_wordphrase)
filters = vfunc(words, 8)

pprint(zip(list(words[filters]), freq))

[(u'x_NOUN_9.81 t10_NOUN_15.9', 21),
 (u'kit_NOUN_10.6 lens_NOUN_11.1', 18),
 (u'image_NOUN_9.34 quality_NOUN_9.02', 15),
 (u'x_NOUN_9.81 t1_NOUN_14.0', 11),
 (u'camera_NOUN_9.73 small_ADJ_8.37', 11),
 (u'mirrorless_NOUN_15.0 camera_NOUN_9.73', 11),
 (u'excellent_ADJ_10.1 camera_NOUN_9.73', 9),
 (u'lcd_NOUN_14.7 screen_NOUN_9.32', 7),
 (u'auto_NOUN_10.3 focus_NOUN_9.53', 7),
 (u'small_ADJ_8.37 size_NOUN_9.18', 6),
 (u'large_ADJ_8.84 hand_NOUN_8.50', 6),
 (u'hold_VERB_9.07 camera_NOUN_9.73', 6),
 (u'weather_NOUN_10.4 seal_VERB_11.7', 6),
 (u'manual_ADJ_10.9 control_NOUN_8.75', 6),
 (u'highly_ADV_9.74 recommend_VERB_9.18', 5),
 (u'small_ADJ_8.37 compact_ADJ_12.5', 5),
 (u'lens_NOUN_11.1 camera_NOUN_9.73', 5),
 (u'small_ADJ_8.37 camera_NOUN_9.73', 5),
 (u'camera_NOUN_9.73 body_NOUN_8.64', 5),
 (u'camera_NOUN_9.73 kit_NOUN_10.6', 5),
 (u'iso_NOUN_13.3 dial_NOUN_12.1', 5),
 (u'fuji_ADJ_15.8 film_NOUN_9.62', 5),
 (u'small_ADJ_8.37 light_ADJ_8.94', 5),
 (u'recommend_VERB_9.18 camera_NOUN_9.73

In [82]:
freq_filter = freq[filters] >= 5
zip(list(words[filters][freq_filter]), freq[filters][freq_filter])

[(u'x_NOUN_9.81 t10_NOUN_15.9', 21),
 (u'kit_NOUN_10.6 lens_NOUN_11.1', 11),
 (u'image_NOUN_9.34 quality_NOUN_9.02', 11),
 (u'x_NOUN_9.81 t1_NOUN_14.0', 11),
 (u'camera_NOUN_9.73 small_ADJ_8.37', 9),
 (u'mirrorless_NOUN_15.0 camera_NOUN_9.73', 7),
 (u'excellent_ADJ_10.1 camera_NOUN_9.73', 6),
 (u'lcd_NOUN_14.7 screen_NOUN_9.32', 6),
 (u'auto_NOUN_10.3 focus_NOUN_9.53', 6),
 (u'small_ADJ_8.37 size_NOUN_9.18', 6),
 (u'large_ADJ_8.84 hand_NOUN_8.50', 5),
 (u'hold_VERB_9.07 camera_NOUN_9.73', 5),
 (u'weather_NOUN_10.4 seal_VERB_11.7', 5),
 (u'manual_ADJ_10.9 control_NOUN_8.75', 5),
 (u'highly_ADV_9.74 recommend_VERB_9.18', 5)]

In [83]:
filtering = X.toarray()[:, most_common[filters][freq_filter][1]] > 0
filtered_reviews = np.array(reviews)[filtering]

vfunc2 = np.vectorize(get_sentences)
wordphrase_sentences = vfunc2(filtered_reviews, words[filters][1])

for review in wordphrase_sentences[0:].flatten():
    for sent in review:
        frag = sentence_frag(sent, words[filters][1], 5)
        if frag:
            print frag

I think it was the kit lens that came with it
And Fuji kit lenses are better than other's
a smaller sensor- Very nice kit lens compared to other systems
sharp and better than any kit lens from my past Nikon
camera with 18-55mm kit lens the day it was
-frame A7, the kit lenses are not that great
little light compared to the kit lens, so you can
has to be the best kit lens I have ever seen
Fuji does have a kit lens like this as well
supposedly one of the best kit lenses out there!
just the box for the kit lens [which I did
55 F2.8-4 OIS kit lens is not your average
box made for a different kit lens than the 14mm-55mm 2.8
