In [1]:
from pprint import pprint
import os
import re
import string
import subprocess

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from spacy.en import English
import numpy as np
import requests

In [2]:
# Build the spaCy English parser once; the tokenizer and sentence helpers
# defined in later cells all reuse this shared `parser` object
parser = English()

In [3]:
def get_id(url):
    '''
    gets the 10-character id identifier for an amazon product

    The id is first looked up right after a `/dp/`, `/gp/product/` or
    `/product-reviews/` path component, so query strings and extra
    trailing path segments do not matter. When no such component exists
    the positional heuristic is used on the url with its query string and
    fragment removed: the second-to-last path segment is taken, unless it
    is not 10 characters long, in which case the last segment is taken.

    Raises IndexError when the url has fewer than two '/'-separated
    segments and no recognised product path component.
    '''

    # url format: https://www.amazon.com/.../dp/id[/...][?query]
    explicit = re.search(
        r'/(?:dp|gp/product|product-reviews)/([A-Za-z0-9]{10})(?=[/?#]|$)',
        url)
    if explicit:
        return explicit.group(1)

    # drop "?query" and "#fragment" so they can neither be mistaken for
    # path segments nor end up glued to the id
    path = url.split('?', 1)[0].split('#', 1)[0]

    # url format: https://www.amazon.com/.../.../id/...
    segments = re.findall(r'(?<=/)[^/]*', path)
    id_ = segments[-2]

    if len(id_) != 10:
        # url format https://www.amazon.com/.../id
        id_ = segments[-1]

    return id_


def extract(id_):
    '''
    extracts the star rating and review text from directory of
    amazon html files

    Reads every `*.html` file in `reviews/com/<id_>/` and, for each
    `div` with class "a-section review", collects
      - the rating: the first character of the first `<i>` element's
        text, converted to int
      - the review: the text of the first `span` with class
        "a-size-base review-text"

    Returns the pair `(ratings, reviews)`, two lists of equal length in
    matching order.
    '''
    ratings = []
    reviews = []

    path = 'reviews/com/{}/'.format(id_)
    # os.listdir() returns names in arbitrary order; sorting makes the
    # order of the returned lists reproducible between runs
    pages = sorted(file_ for file_ in os.listdir(path)
                   if file_.endswith('.html'))

    for page in pages:
        # the context manager closes the handle as soon as the page is parsed
        with open(os.path.join(path, page), 'r') as html:
            soup = BeautifulSoup(html, 'html.parser')

        # a page without review blocks simply contributes nothing; it must
        # not stop the pages listed after it from being read
        for tag in soup.findAll("div", {"class": "a-section review"}):
            rating = int(tag.find('i').text[0])
            review = tag.findAll("span",
                                 {"class": "a-size-base review-text"})[0].text
            ratings.append(rating)
            reviews.append(review)

    return ratings, reviews

In [4]:
# Loosely adapted from Nic Schrading
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

# A custom stoplist: NLTK's English stopwords, scikit-learn's English
# stopwords and a few contraction fragments
STOPLIST = set(stopwords.words('english') +
               ["n't", "'s", "'m", "ca", "'re"] +
               list(ENGLISH_STOP_WORDS))

# List of symbols we don't care about: every single punctuation character
# plus a few multi-character symbols and the empty string
SYMBOLS = list(string.punctuation)
SYMBOLS += ["-----", "---", "...", "“", "”", "'ve", ""]

# Full set of stops: the union of both collections. (The symmetric
# difference, `^`, would silently drop any entry present in both.)
STOPS = STOPLIST | set(SYMBOLS)

In [5]:
def word_pos_filter(word_pos):
    '''
    decides whether a "word_POS" token is kept

    Returns True unless the word part is in STOPS or the POS part is one
    of PUNCT, SPACE or NUM; returns False otherwise.
    '''
    unwanted_pos = set(['PUNCT', 'SPACE', 'NUM'])

    # the first group is greedy, so for a single-line token the split
    # between word and POS happens at the last underscore
    word, pos = re.findall('(.*)_(.*)', word_pos)[0]

    return word not in STOPS and pos not in unwanted_pos
    
def word_prob(word_pos):
    '''
    turns a "word_POS" token into "word_POS_score", where the score is
    taken from the `prob` attribute of the word's spaCy vocabulary entry
    '''
    word = re.findall('(.*)_(.*)', word_pos)[0][0]

    # characters 1 to 4 of the printed value; presumably this skips the
    # minus sign of a log-probability and keeps ~3 digits -- TODO confirm
    score = str(parser.vocab[word].prob)[1:5]

    return '_'.join([word_pos, score])

In [6]:
# Loosely adapted from Nic Schrading
# https://nicschrading.com/project/Intro-to-NLP-with-spaCy/

# Every step in a pipeline needs to be a "transformer".
# Define a custom transformer to clean text using spaCy


def tokenizeText(sample):
    '''
    A custom function to tokenize the text using spaCy and convert to
    lemmas. Suffixes are also added to tokens to indicate POS and
    word probability.

    Returns a list of "lemma_POS_score" strings for the tokens that pass
    word_pos_filter.
    '''

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize and add POS
    lemmas = []

    for tok in tokens:
        # spaCy's "-PRON-" placeholder lemma is replaced by the lowercased text
        if tok.lemma_ != "-PRON-":
            lemma = tok.lemma_.lower().strip()
        else:
            lemma = tok.lower_
        # every token, pronouns included, gets the "_POS" suffix: the
        # helpers called below split on "_" and fail on a token without one
        lemmas.append(lemma + '_' + tok.pos_)

    # stoplist the tokens
    kept = [tok for tok in lemmas if word_pos_filter(tok)]

    # add the probability of word; each result ends in "_<score>", so no
    # blank or whitespace-only entries can remain to be cleaned up
    return [word_prob(tok) for tok in kept]

# the vectorizer to use
# note that I changed the tokenizer in CountVectorizer to use a
# custom function using spaCy's tokenizer; ngram_range=(2, 3) asks for
# features made of two or three consecutive tokens

vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(2, 3))

In [7]:
def filter_score(word_pos_score, threshold):
    '''
    tells whether the score suffix of a "word_POS_score" token, read as a
    float, is greater than or equal to `threshold`
    '''
    # greedy first group: the score is whatever follows the last underscore
    matches = re.findall('(.*)_(.*)', word_pos_score)
    _, score_text = matches[0]
    return float(score_text) >= threshold

def filter_score_wordphrase(wordphrase, threshold):
    '''
    checks that every whitespace-separated "word_POS_score" token of the
    phrase passes filter_score for `threshold`; the verdict is returned
    wrapped in a numpy array
    '''
    verdicts = []
    for token in wordphrase.split():
        verdicts.append(filter_score(token, threshold))

    return np.array(all(verdicts))

In [8]:
def parse_feature(wordphrase):
    '''
    strips the "_POS_score" suffix from every token of a phrase and
    returns the remaining words joined by single spaces
    '''
    pattern = re.compile('(.*)_(.*_.*)')

    bare_words = []
    for token in wordphrase.split():
        # group 1 is the word, group 2 the "POS_score" suffix
        word, _suffix = pattern.findall(token)[0]
        bare_words.append(word)

    return " ".join(bare_words)

def get_sentences(review, wordphrase):
    '''
    parses an amazon review with spaCy and returns the list of its
    sentences whose lowercased text contains the words of wordphrase
    (as produced by parse_feature)
    '''
    doc = parser(unicode(review))

    return [sent for sent in doc.sents
            if parse_feature(wordphrase) in sent.string.lower()]

In [9]:
def sentence_frag(spacy_sent, wordphrase, word_dist):
    '''
    returns sentence fragment of words around wordphrase

    spacy_sent: sequence of tokens exposing `lemma_`, sliceable by index
    wordphrase: phrase of "word_POS_score" tokens
    word_dist:  number of tokens kept on either side of the position
                where the phrase starts

    The lowercased lemmas of the sentence are searched for the first
    run of consecutive lemmas equal to the words of the phrase. Returns
    the slice of spacy_sent from word_dist tokens before that position
    to word_dist tokens after it (clipped to the sentence), or None when
    the phrase does not occur.
    '''
    words = [x.lemma_.strip().lower() for x in spacy_sent]

    # computed once; it is needed for both the length and the lookup
    phrase = parse_feature(wordphrase)
    phrase_len = len(phrase.split())

    # n-grams of exactly the phrase's length, whatever that length is
    shifted = [words[offset:] for offset in range(phrase_len)]
    ngram_list = [" ".join(x) for x in zip(*shifted)]

    try:
        idx = ngram_list.index(phrase)
    except ValueError:
        # list.index signals "not found" with ValueError; anything else
        # is a real error and is allowed to propagate
        return None

    # the window is measured from the first word of the phrase
    min_idx = max(0, idx-word_dist)
    max_idx = min(idx+word_dist+1, len(spacy_sent))

    return spacy_sent[min_idx:max_idx]

## Test for feature extraction code

In [10]:
url = raw_input('url of amazon product: ')
id_ = get_id(url)
n_reviews = 100

# Run Amazon scraper
# Credit to Andrea Esuli
# https://github.com/aesuli/amadown2py
# The arguments are handed over as a list, without a shell in between, so
# nothing typed at the url prompt can be interpreted as shell syntax
subprocess.call(['python', 'amazon_crawler.py',
                 '-d', 'com', id_, '-m', str(n_reviews), '-o', 'reviews'])

# read the downloaded pages back from reviews/com/<id_>/
ratings, reviews = extract(id_)

url of amazon product: https://www.amazon.com/Bose-QuietComfort-Cancelling-Headphones-Special/dp/B0117RFP0Y/ref=sr_1_3?s=electronics&ie=UTF8&qid=1469413096&sr=1-3&keywords=bose+qc25


In [11]:
# star ratings returned by extract(), one entry per extracted review
ratings

[5,
 4,
 4,
 2,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 4,
 3,
 1,
 4,
 5,
 5,
 5,
 4,
 4,
 5,
 3,
 1,
 1,
 1,
 3,
 5,
 1,
 1,
 5,
 1,
 1,
 5,
 1,
 3,
 1,
 1,
 4,
 4,
 3,
 1,
 5,
 2,
 5,
 5,
 2,
 5,
 5,
 1,
 5,
 4,
 1,
 1,
 3,
 1,
 4,
 2,
 5,
 4,
 3,
 5,
 5,
 3,
 5,
 1,
 5,
 5,
 3,
 5,
 5,
 4,
 2,
 4,
 5,
 5,
 1,
 5,
 5,
 5,
 5,
 5,
 2,
 5,
 4,
 5,
 4,
 5,
 4,
 5,
 4,
 4,
 5,
 4,
 5,
 5,
 5,
 3,
 3,
 1,
 5]

In [12]:
# preview: the first 50 characters of every review text
pprint([x[:50] for x in reviews])

[u"The QC25 is Bose' best noise cancelling headphone ",
 u'Bose QC25 vs. Bose QC15 vs. Beats Studio (by Dre.)',
 u'My review is geared more towards QC15 owners wonde',
 u'After using the QC15 for five years--and as a Mill',
 u'I recently became obsessed with headphones after m',
 u"Yeah....My wife likes them.(BTW: they're not actua",
 u"I'm not like most reviewers on here. I do not fly ",
 u'Good quality comes bursting at you from the time y',
 u'This is quite a great headphone. Large ear pads ar',
 u"I'll start by stating that I am replacing an older",
 u'If I could afford to buy two pairs, I would. These',
 u'Wonderful noise cancelling.  really amazing.  soun',
 u'I have a few very important things to review about',
 u'I have used Bose gear for years. The sound on thes',
 u'Expensive, but indeed blocks out a good amount of ',
 u'Headsets/earbuds drive me crazy after a while - th',
 u'As a frequent flier, I depend heavily on noise can',
 u'Excellent headphones. Very well made, excelle

In [13]:
# count matrix with one row per review and one column per n-gram feature
X = vectorizer.fit_transform(reviews)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
# feature names, in the same order as the columns of X
features = vectorizer.get_feature_names()
features

[u'+shipp_VERB_19.5 break_VERB_8.98',
 u'+shipp_VERB_19.5 break_VERB_8.98 lot_NOUN_7.11',
 u'-insert_NOUN_15.9 choose_VERB_9.31',
 u'-insert_NOUN_15.9 choose_VERB_9.31 expletive_NOUN_15.2',
 u'.any_DET_19.2 notch_VERB_11.9',
 u'.any_DET_19.2 notch_VERB_11.9 high_ADJ_7.96',
 u'.frustrat_VERB_19.5 miserable_ADJ_11.3',
 u'.frustrat_VERB_19.5 miserable_ADJ_11.3 waste_NOUN_9.84',
 u'.my_ADJ_17.4 wife_NOUN_9.20',
 u'.my_ADJ_17.4 wife_NOUN_9.20 like_VERB_5.61',
 u'.though_ADP_19.5 great_ADJ_7.42',
 u'.though_ADP_19.5 great_ADJ_7.42 sound_ADJ_8.68',
 u'08/27/2015_PROPN_19.5 troubleshot_VERB_17.6',
 u'08/27/2015_PROPN_19.5 troubleshot_VERB_17.6 replace_VERB_10.1',
 u'08/28/2015_NOUN_19.5 bose_NOUN_15.7',
 u'08/28/2015_NOUN_19.5 bose_NOUN_15.7 depot_NOUN_13.2',
 u'08/28/2015shipped_PROPN_19.5 ups_NOUN_10.6',
 u'08/28/2015shipped_PROPN_19.5 ups_NOUN_10.6 2nd_ADJ_10.1',
 u'09/01/2015.they_PROPN_19.5 turn_NOUN_8.65',
 u'09/01/2015.they_PROPN_19.5 turn_NOUN_8.65 ship_VERB_9.94',
 u'1-year_ADJ_15.1 f

In [15]:
# number of reviews in which each feature occurs at least once; a single
# vectorised reduction instead of one Python-level call per column
total_count = (X.toarray() != 0).sum(axis=0)
print(total_count.shape)
print(X.shape)

(24135,)
(100, 24135)


In [16]:
# indices of the 40 features with the highest total_count, highest first
most_common = np.argsort(total_count)[:-41:-1]
words = np.array(features)[most_common]
pprint(list(words), width=1)

[u'noise_NOUN_10.4 cancel_VERB_11.4',
 u'sound_ADJ_8.68 quality_NOUN_9.02',
 u'cancel_VERB_11.4 headphone_NOUN_12.8',
 u'noise_NOUN_10.4 cancel_VERB_11.4 headphone_NOUN_12.8',
 u'noise_NOUN_10.4 cancellation_NOUN_13.6',
 u'pair_NOUN_10.1 headphone_NOUN_12.8',
 u'listen_VERB_9.47 music_NOUN_8.73',
 u'noise_NOUN_10.4 reduction_NOUN_11.5',
 u'bose_NOUN_15.7 headphone_NOUN_12.8',
 u'best_ADJ_7.49 noise_NOUN_10.4',
 u'sound_NOUN_8.68 quality_NOUN_9.02',
 u'headphone_NOUN_12.8 great_ADJ_7.42',
 u'ear_NOUN_10.8 headphone_NOUN_12.8',
 u'aaa_NOUN_15.6 battery_NOUN_10.1',
 u'background_NOUN_9.92 noise_NOUN_10.4',
 u'ear_NOUN_10.8 cup_NOUN_10.1',
 u'bose_NOUN_15.7 noise_NOUN_10.4',
 u'feel_VERB_7.34 like_ADP_5.61',
 u'build_NOUN_8.84 quality_NOUN_9.02',
 u'quality_NOUN_9.02 noise_NOUN_10.4',
 u'bose_NOUN_15.7 qc25_NOUN_19.5',
 u'stop_VERB_8.27 work_VERB_7.08',
 u'bose_NOUN_15.7 noise_NOUN_10.4 cancel_VERB_11.4',
 u'buy_VERB_8.14 pair_NOUN_10.1',
 u'noise_NOUN_10.4 cancel_VERB_11.4 feature_NOUN_10

In [17]:
# same top-40 list, restricted to phrases made of rarer words:
# a phrase is kept only if every one of its words has a score of at least 8
most_common = np.argsort(total_count)[:-41:-1]
words = np.array(features)[most_common]

# np.vectorize applies the phrase check to each element of `words`
vfunc = np.vectorize(filter_score_wordphrase)
filters = vfunc(words, 8)

pprint(list(words[filters]), width=1)

[u'noise_NOUN_10.4 cancel_VERB_11.4',
 u'sound_ADJ_8.68 quality_NOUN_9.02',
 u'cancel_VERB_11.4 headphone_NOUN_12.8',
 u'noise_NOUN_10.4 cancel_VERB_11.4 headphone_NOUN_12.8',
 u'noise_NOUN_10.4 cancellation_NOUN_13.6',
 u'pair_NOUN_10.1 headphone_NOUN_12.8',
 u'listen_VERB_9.47 music_NOUN_8.73',
 u'noise_NOUN_10.4 reduction_NOUN_11.5',
 u'bose_NOUN_15.7 headphone_NOUN_12.8',
 u'sound_NOUN_8.68 quality_NOUN_9.02',
 u'ear_NOUN_10.8 headphone_NOUN_12.8',
 u'aaa_NOUN_15.6 battery_NOUN_10.1',
 u'background_NOUN_9.92 noise_NOUN_10.4',
 u'ear_NOUN_10.8 cup_NOUN_10.1',
 u'bose_NOUN_15.7 noise_NOUN_10.4',
 u'build_NOUN_8.84 quality_NOUN_9.02',
 u'quality_NOUN_9.02 noise_NOUN_10.4',
 u'bose_NOUN_15.7 qc25_NOUN_19.5',
 u'bose_NOUN_15.7 noise_NOUN_10.4 cancel_VERB_11.4',
 u'buy_VERB_8.14 pair_NOUN_10.1',
 u'noise_NOUN_10.4 cancel_VERB_11.4 feature_NOUN_10.2',
 u'cancel_VERB_11.4 feature_NOUN_10.2',
 u'airplane_NOUN_12.1 adapter_NOUN_11.9',
 u'audio_ADJ_10.6 quality_NOUN_9.02',
 u'spare_ADJ_10.9 b

In [18]:
# column indices (into X / features) of the phrases that passed the filter
extracted_features_idx = most_common[filters]
extracted_features_idx

array([14028, 19732,  3476, 14057, 14125, 15026, 11948, 14273,  2601,
       19971,  6632,   124,  1284,  6583,  2655,  3018, 17309,  2697,
        2656,  3221, 14053,  3459,   535,  1045, 20143,  4012,  1096,
        3527, 18802,  3041])

In [19]:
# an array allows selecting reviews with a boolean mask
reviews = np.array(reviews)

# reviews in which the first extracted phrase occurs at least once
filtered_reviews = reviews[X.toarray()[:, extracted_features_idx[0]] > 0]
filtered_reviews.shape

(49,)

In [20]:
# for every review that contains the first extracted phrase, collect the
# sentences mentioning it ...
vfunc2 = np.vectorize(get_sentences)
wordphrase_sentences = vfunc2(filtered_reviews, words[filters][0])

# ... and print, per sentence, the fragment of up to 5 tokens either side
# of where the phrase starts (sentences without a lemma-level match are skipped)
for review in wordphrase_sentences[0:].flatten():
    for sent in review:
        frag = sentence_frag(sent, words[filters][0], 5)
        if frag:
            print frag

QC25 is Bose' best noise cancelling headphone to date.
QC25: sound quality, noise cancelling, comfort, and
hype about how the QC25 noise cancelling is superior.  
day use, the QC25 noise cancelling is really about the
QC25 represents the best Bose noise cancelling headphones to date.
If you have no Bose noise cancelling headphones, this is
In the noise canceling department, I honestly
cared about not losing the noise canceling feature since I only
are awesome headphones in the noise canceling department.  
the Bose QC25 is the noise canceling feature.
bus I turn on the noise cancelling feature and it feels
The noise cancelling is awesome.
an older pair of active noise canceling headphones that I used
These far outperformed my previous noise canceling headphones (which were
The noise canceling feature is typically on
pair of headphones with great noise canceling, great sound and
Wonderful noise cancelling.  
, but it is the noise cancelling you are buying this
does an amazing job at noi