In [1]:
from bs4 import BeautifulSoup
from pprint import pprint
import pandas as pd

import datetime
import time
import requests

### Custom Functions

In [2]:
from wordsets import *
from scraper import *

In [None]:
from __future__ import division
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from spacy.en import English
from textblob import TextBlob
from wordsets import com_dep, com_tag, noun_tag, nonaspects
import numpy as np
import re

# Module-level spaCy English pipeline; the classes below call it to parse
# reviews and to look up vocabulary entries via parser.vocab.
parser = English()


class SentCustomProperties(object):
    '''
    Wraps one parsed sentence together with bookkeeping about its origin:
        review_idx: index of the review the sentence came from
        review_rate: customer rating of that review
        sent: the sentence object itself (spacy Span)
        sent_idx: index of the sentence within the review corpus
        start_idx: .i of the first token of the sentence
        words: number of tokens in the sentence (len(sent))
    '''

    def __init__(self, review_idx, rating, sent_idx, sent):
        '''
        INPUT: int, int, int, spacy sentence (spacy.tokens.span.Span)
        OUTPUT: None
        '''
        first_token = sent[0]
        first_token_idx = first_token.i
        n_tokens = len(sent)

        self.review_idx, self.review_rate = review_idx, rating
        self.sent, self.sent_idx = sent, sent_idx
        self.start_idx, self.words = first_token_idx, n_tokens


class ReviewSents(object):
    '''
    Splits every review of a product into sentences and keeps them,
    wrapped as SentCustomProperties, together with summary counts:
        ratings / reviews: copied from the product
        n_reviews: number of reviews that were parsed successfully
        n_sent: number of sentences kept
        sentences: list(SentCustomProperties)
    '''

    def __init__(self, product):
        '''
        INPUT: Loader (object exposing .ratings and .reviews)
        OUTPUT: None
        '''
        self.ratings = product.ratings
        self.reviews = product.reviews
        parsed = self._parse_sentences()
        self.n_reviews, self.n_sent, self.sentences = parsed

    def _parse_sentences(self):
        '''
        INPUT: None
        OUTPUT: int, int, list(SentCustomProperties)

        Parses each review with the module-level spacy parser and collects
        its non-empty sentences.
        Returns the number of parsed reviews, the number of sentences and
        the list of wrapped sentences.
        '''
        # runs of four or more dots are collapsed to a plain ellipsis
        dot_run = re.compile(r'\.\.\.\.+')

        collected = []
        parsed_reviews = 0

        for review_idx, raw_review in enumerate(self.reviews):
            try:
                doc = parser(dot_run.sub(u'...', raw_review))
            except AssertionError:
                # the review is skipped and not counted
                print('parser for review #{} failed'.format(review_idx))
                continue

            parsed_reviews += 1

            for span in doc.sents:
                if not span.string:
                    continue
                # the running sentence index equals the number kept so far
                collected.append(
                    SentCustomProperties(review_idx,
                                         self.ratings[review_idx],
                                         len(collected), span))

        return parsed_reviews, len(collected), collected


class Unigramer(object):
    '''
    Class of functions for extracting Unigrams

    Statistics accumulated while scanning sentences:
        dep_dict: head lemma -> list of dependency labels of its child tokens
        cnt_dict: lemma -> number of tokens seen with that lemma
        pol_dict: head lemma -> list of bool, one per amod child (True when
            TextBlob gives the modifier a non-zero polarity)
        rev_dict: candidate lemma -> set of review indices containing it
        sent_dict: candidate lemma -> set of sentence indices containing it
        word_pos_dict: candidate lemma -> token index (token.i) of its first
            qualifying occurrence in each sentence
    '''

    def __init__(self):
        self.dep_dict = defaultdict(list)
        self.cnt_dict = defaultdict(int)
        self.pol_dict = defaultdict(list)
        self.rev_dict = defaultdict(set)
        self.sent_dict = defaultdict(set)
        self.word_pos_dict = defaultdict(list)

    def _iter_nouns(self, sent):
        '''
        INPUT: SentCustomProperties
        OUTPUT: str

        Iterates through each token of the spacy sentence, updates the class
            dictionaries and returns the distinct candidate noun lemmas of
            the sentence joined by single spaces (one "document" for the
            CountVectorizer used in candidate_unigrams).
        '''
        wordset = set()

        for token in sent.sent:
            self.cnt_dict[token.lemma_] += 1
            self.dep_dict[token.head.lemma_].append(token.dep_)
            root = parser.vocab[token.lemma].prob

            # filter to only consider nouns, valid aspects, and uncommon words
            if token.tag_ in noun_tag and (root < -7.5 and
                                           token.lemma_ not in nonaspects):
                wordset.add(token.lemma_)
                self.rev_dict[token.lemma_].add(sent.review_idx)

                # record the position only once per sentence
                if sent.sent_idx not in self.sent_dict[token.lemma_]:
                    self.word_pos_dict[token.lemma_].append(token.i)
                    self.sent_dict[token.lemma_].add(sent.sent_idx)

            if token.dep_ == 'amod':
                pol = abs(TextBlob(token.string).sentiment.polarity) > 0
                self.pol_dict[token.head.lemma_].append(pol)

        return " ".join(wordset)

    def candidate_unigrams(self, corpus, min_pct=0.01, amod_pct=0.075):
        '''
        INPUT: ReviewSents, float, float
        OUTPUT: set, dict

        obtains a set of candidate unigrams

        each candidate unigram must be a noun and must appear in at least
            a percentage of the sentences specified by min_pct with the unigram
            being modified by an amod dependency at least amod_pct of the time

        the second return value is self.cnt_dict (lemma -> token count)

        a corpus without any candidate noun returns an empty set
        '''
        count_X = [self._iter_nouns(sent) for sent in corpus.sentences]

        # CountVectorizer raises on an empty vocabulary; nothing to extract
        if not any(count_X):
            return set(), self.cnt_dict

        cnt_vec = CountVectorizer()
        freq = cnt_vec.fit_transform(count_X)

        # column sums taken on the sparse matrix; avoids a dense copy
        total_count = np.asarray(freq.sum(axis=0)).ravel()
        filter_ = total_count >= min_pct * corpus.n_sent

        # filter for aspect appearing in min_pct of sentences
        features = np.array(cnt_vec.get_feature_names())
        unigrams = set(features[filter_])

        # filter for percentage of time aspect is modified by amod
        for word in list(unigrams):
            # .get() so that unseen words are not inserted into dep_dict
            deps = self.dep_dict.get(word, ())
            n_amod = sum(1 for dep in deps if dep == 'amod')
            # a word that never heads another token has an amod rate of 0;
            # computed explicitly because numpy's result for an empty
            # comparison differs between versions
            amod_rate = n_amod / float(len(deps)) if deps else 0.0

            if amod_rate < amod_pct:
                unigrams.remove(word)

        return unigrams, self.cnt_dict


class Bigramer(object):
    '''
    Class of functions for extracting Bigrams

    Statistics kept per bigram (key: two lemmas joined by one space):
        avg_dist: mean absolute token distance (set for accepted bigrams)
        distances: list of absolute token distances between the two words
        ordering: [times seen in alphabetical order, times seen reversed]
        pmi: co-occurrence score (set for accepted bigrams)
        rev_dict: set of review indices containing the bigram
        sent_dict: set of sentence indices containing the bigram
        word_pos_dict: token index (token.i) of the noun for the first
            occurrence of the bigram in each sentence
    '''
    def __init__(self):
        self.avg_dist = defaultdict(float)
        self.distances = defaultdict(list)
        self.ordering = defaultdict(lambda: [0, 0])
        self.pmi = defaultdict(float)
        self.rev_dict = defaultdict(set)
        self.sent_dict = defaultdict(set)
        self.word_pos_dict = defaultdict(list)

    def _reverse_key(self, key, new_key):
        '''
        INPUT: string, string (each: two words separated by space)
        OUTPUT: None

        Moves the entry stored under key to new_key (the same two words in
        reversed order) in every class dictionary
        '''
        self.avg_dist[new_key] = self.avg_dist.pop(key)
        self.distances[new_key] = self.distances.pop(key)
        self.ordering[new_key] = self.ordering.pop(key)
        self.pmi[new_key] = self.pmi.pop(key)
        self.rev_dict[new_key] = self.rev_dict.pop(key)
        self.sent_dict[new_key] = self.sent_dict.pop(key)
        self.word_pos_dict[new_key] = self.word_pos_dict.pop(key)

    def _get_compactness_feat(self, corpus):
        '''
        INPUT: ReviewSents
        OUTPUT: generator(unicode)

        yields, once per sentence, every bigram key (two lemmas in
        alphabetical order joined by a space) consisting of:
            at least one noun
            a second word within +/- 3 words of noun
        excludes dependencies and tags not likely to be a feature word

        also fills distances, rev_dict, ordering, sent_dict and
        word_pos_dict as a side effect
        '''

        for sent in corpus.sentences:
            output = set()

            for i, token in enumerate(sent.sent):
                # one word in bigram must be noun
                if token.tag_ in noun_tag and token.lemma_ not in nonaspects:
                    arr = sent.sent[max(0, i - 3):min(i + 4, sent.words)]
                    arr = np.array(arr)
                    arr = arr[arr != token]

                    for item in arr:
                        root = parser.vocab[item.lemma].prob
                        # filter out unlikely features
                        if root < -7.5 and (item.dep_ not in com_dep and
                                            item.tag_ not in com_tag and
                                            item.lemma_ not in nonaspects):
                            bigrm = " ".join(sorted([item.lemma_,
                                                     token.lemma_]))
                            dist = item.i - token.i
                            word_sort = item.lemma_ < token.lemma_

                            self.distances[bigrm].append(abs(dist))
                            self.rev_dict[bigrm].add(sent.review_idx)
                            # index 0: words appear in alphabetical order,
                            # index 1: words appear in reversed order
                            self.ordering[bigrm][word_sort == (dist > 0)] += 1

                            # record the position only once per sentence
                            if sent.sent_idx not in self.sent_dict[bigrm]:
                                self.word_pos_dict[bigrm].append(token.i)
                                self.sent_dict[bigrm].add(sent.sent_idx)

                            output.add(bigrm)

            if output:
                for element in output:
                    yield element

    def candidate_bigrams(self, corpus, cnt_dict, min_pct=0.005,
                          pmi_pct=0.0, max_avg_dist=2):
        '''
        INPUT: ReviewSents, cnt_dict, float, float, number
        OUTPUT: set(str), set(str)

        outputs the set of accepted bigrams (two words joined by a space,
            in their most frequently observed order) and the set of
            individual words used in those bigrams

        a bigram from _get_compactness_feat is accepted when
            it appears in at least max(2, min_pct * corpus.n_sent) sentences,
            its score (sentence count divided by the product of the two
                lemma counts) is at least pmi_pct, and
            its average word distance is below max_avg_dist

        cnt_dict output from candidate_unigrams is the input for the second
            argument of this function
        '''
        bigrams = set()
        bigram_words = set()

        feats = Counter(self._get_compactness_feat(corpus))

        # items() instead of iteritems() so the loop runs on Python 2 and 3
        for (key, val) in feats.items():
            order = sorted(key.split(" "),
                           reverse=self.ordering[key][1] >
                           self.ordering[key][0])
            new_key = " ".join(order)

            # float() keeps this a true division without the __future__ import
            denom = cnt_dict[order[0]] * cnt_dict[order[1]]
            pmi = round(val / float(denom), 4)
            avg_dist = round(np.mean(self.distances[key]), 2)

            if pmi >= pmi_pct and (avg_dist < max_avg_dist and
                                   val >= max(2, min_pct * corpus.n_sent)):
                self.avg_dist[key] = avg_dist
                self.pmi[key] = pmi

                bigrams.add(new_key)
                # add the two words; set(key) would add single characters
                bigram_words.update(order)

                if key != new_key:
                    self._reverse_key(key, new_key)

        return bigrams, bigram_words

# My Code

In [24]:
class DFloader(object):
    '''
    INPUT: df, str (optional)
    OUTPUT: None

    Loader for a canned amazon review dataset held in a pandas data frame.
    Exposes the `overall` column as .ratings and the `reviewText` column
        as .reviews (both plain lists), plus an optional product .name
    '''

    def __init__(self, df, name=None):
        rating_list = df.overall.tolist()
        review_list = df.reviewText.tolist()
        self.name, self.ratings, self.reviews = name, rating_list, review_list

def pipeline(df, asin_list, product_names=None, uni_min_pct=0.01,
             amod_pct=0.094, bi_min_pct=0.005, pmi_pct=1.0 / 2500,
             max_avg_dist=2):
    '''
    INPUT: df, list(str), dict, float, float, float, float, number
    OUTPUT: dict

    runs the unigram / bigram extraction for every asin of asin_list
        on the loaded data frame containing reviews
    outputs a dictionary keyed by asin; each value is a dict holding the
        product name (None when unknown), the parsed corpus, the accepted
        unigrams / bigrams and the statistics dictionaries of the
        Unigramer and Bigramer

    uni_min_pct, amod_pct: passed to Unigramer.candidate_unigrams
    bi_min_pct, pmi_pct, max_avg_dist: passed to Bigramer.candidate_bigrams
    the defaults reproduce the values previously hard-coded here
    '''
    print('start time: ' + datetime.datetime.now().time().isoformat())
    print('')

    asin_dict = defaultdict(dict)

    for asin in asin_list:
        print('working on ASIN {}'.format(asin))
        print('-' * 40)

        results = asin_dict[asin]
        # always present so that later lookups of 'name' cannot fail
        name = product_names.get(asin, None) if product_names else None
        results['name'] = name

        product = DFloader(df[df.asin == asin], name=name)
        corpus = ReviewSents(product)

        unigramer = Unigramer()
        unigrams, cnt_dict = unigramer.candidate_unigrams(
            corpus, min_pct=uni_min_pct, amod_pct=amod_pct)

        bigramer = Bigramer()
        # pmi_pct default is written as 1.0 / 2500: plain 1/2500 is 0 under
        # Python 2 integer division
        bigrams, bigram_words = bigramer.candidate_bigrams(
            corpus, cnt_dict, min_pct=bi_min_pct, pmi_pct=pmi_pct,
            max_avg_dist=max_avg_dist)

        results['corpus'] = corpus
        results['unigrams'] = unigrams
        results['dep_dict'] = unigramer.dep_dict
        results['cnt_dict'] = unigramer.cnt_dict
        results['pol_dict'] = unigramer.pol_dict
        results['uni_rev_dict'] = unigramer.rev_dict
        results['uni_sent_dict'] = unigramer.sent_dict
        results['uni_word_pos_dict'] = unigramer.word_pos_dict
        results['bigrams'] = bigrams
        results['bigram_words'] = bigram_words
        results['bigram_avg_dist'] = bigramer.avg_dist
        results['bigram_ordering'] = bigramer.ordering
        results['bigram_pmi'] = bigramer.pmi
        results['big_rev_dict'] = bigramer.rev_dict
        results['big_sent_dict'] = bigramer.sent_dict
        results['big_word_pos_dict'] = bigramer.word_pos_dict

        print('-' * 40)
        print('')

    print('end time: ' + datetime.datetime.now().time().isoformat())
    return asin_dict

In [4]:
def get_item_name(asin_list):
    '''
    INPUT: list(str)
    OUTPUT: dict

    Script that returns a dictionary mapping each ASIN of the list to the
        product title scraped from its amazon.com product page
        ('' when the page has no #productTitle element)
    Sleeps 5 to 15 seconds after every request
    '''
    import random  # stdlib; local import keeps the cell self-contained

    # the four fragments form ONE User-Agent string; picking a single
    # fragment at random sent an invalid header
    user_agent = ' '.join(['Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30',
                           '(KHTML, like Gecko) Ubuntu/11.04',
                           'Chromium/12.0.742.91 Chrome/12.0.742.91',
                           'Safari/534.30'])
    headers = {'User-Agent': user_agent}

    output = dict()

    for asin in asin_list:
        url = 'https://www.amazon.com/dp/{}/'.format(asin)
        # timeout so that a stalled connection cannot hang the loop forever
        html = requests.get(url, headers=headers, timeout=30).content
        soup = BeautifulSoup(html, 'html.parser')

        titles = soup.select('#productTitle')
        output[asin] = titles[0].text.strip() if titles else ''

        # random pause between requests
        time.sleep(random.uniform(5, 15))

    return output

# Amazon Electronics Review Corpus

In [5]:
cd ~/repos/amazon_review_summarizer/

/Users/Alvin/Repos/amazon_review_summarizer


In [6]:
# Read the raw review dump; every line of the file becomes one list element.
with open('reviews/Electronics_5-2.json', 'r') as f:
    data = list(f)

In [7]:
# Strip trailing whitespace from every line, then join the lines with commas
# inside square brackets so the whole dump reads as a single JSON array.
data = list(record.rstrip() for record in data)
data_json_str = "[" + ','.join(data) + "]"

In [8]:
# Parse the assembled JSON array string into a data frame.
df = pd.read_json(data_json_str)

In [9]:
# Column names, non-null counts and dtypes of the loaded reviews.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1689188 entries, 0 to 1689187
Data columns (total 9 columns):
asin              1689188 non-null object
helpful           1689188 non-null object
overall           1689188 non-null int64
reviewText        1689188 non-null object
reviewTime        1689188 non-null object
reviewerID        1689188 non-null object
reviewerName      1664458 non-null object
summary           1689188 non-null object
unixReviewTime    1689188 non-null int64
dtypes: int64(2), object(7)
memory usage: 128.9+ MB


In [10]:
# Number of review rows per ASIN.
group = df.groupby('asin').size()

In [11]:
# Number of products with at least 1000 reviews; reuses `group` instead of
# grouping the whole data frame a second time.
len(group[group >= 1000])

52

In [12]:
# ASINs with at least 1000 reviews; reuses `group` instead of grouping the
# whole data frame a second time.
asin_1000 = group[group >= 1000].index.tolist()
asin_1000

[u'B00004ZCJE',
 u'B00007E7JU',
 u'B0002L5R78',
 u'B000BQ7GW8',
 u'B000I68BD4',
 u'B000LRMS66',
 u'B000QUUFRW',
 u'B000S5Q9CA',
 u'B000VX6XL6',
 u'B0012S4APK',
 u'B0015DYMVO',
 u'B0019EHU8G',
 u'B001TH7GSW',
 u'B001TH7GUU',
 u'B001XURP7W',
 u'B0027VT6V4',
 u'B002MAPRYU',
 u'B002QEBMAK',
 u'B002SZEOLG',
 u'B002V88HFE',
 u'B002WE6D44',
 u'B00316263Y',
 u'B003ELYQGG',
 u'B003ES5ZUU',
 u'B0041Q38NU',
 u'B0043T7FXE',
 u'B0044YU60M',
 u'B004G6002M',
 u'B004GF8TIK',
 u'B004QK7HI8',
 u'B004W2JKWG',
 u'B004XC6GJ0',
 u'B0052SCU8U',
 u'B005CLPP84',
 u'B005CT56F8',
 u'B005DKZTMG',
 u'B005FYNSPK',
 u'B005HMKKH4',
 u'B00622AG6S',
 u'B006GWO5WK',
 u'B006W8U2MU',
 u'B0074BW614',
 u'B007I5JT4S',
 u'B007R5YDYA',
 u'B007WTAJTO',
 u'B008OHNZI0',
 u'B009A5204K',
 u'B009SYZ8OC',
 u'B00B46XUQU',
 u'B00BGGDVOO',
 u'B00DR0PDNE',
 u'B00E3W15P0']

In [13]:
# product_names = get_item_name(asin_1000)

In [14]:
product_names = {u'B00004ZCJE': u'Tiffen 46mm UV Protection Filter',
 u'B00007E7JU': u'Canon EF 50mm f/1.8 II Camera Lens - Fixed (Discontinued by Manufacturer)',
 u'B0002L5R78': u"DVI Gear HDMI-2M 2M/6' HDMI Cable",
 u'B000BQ7GW8': u'SanDisk 2GB Class 4 SD Flash Memory Card- SDSDB-002G-B35 (Label May Change)',
 u'B000I68BD4': u'JLab Audio JBuds Hi-Fi Noise-Reducing Ear Buds, GUARANTEED FOR LIFE - White',
 u'B000LRMS66': u'Garmin Portable Friction Mount',
 u'B000QUUFRW': u'SanDisk 4GB Extreme SDHC Class 10 Memory Card',
 u'B000S5Q9CA': u'Motorola Vehicle Power Adapter micro-USB Charger',
 u'B000VX6XL6': u'Kingston 4 GB microSDHC Class 4 Flash Memory Card SDC4/4GBET',
 u'B0012S4APK': u'Cheetah Mounts APTMM2B TV Wall Mount for 20-75-Inch TVs Bundle with 10-feet Braided HDMI Cable and a 6-Inch 3-Axis Magnetic Bubble',
 u'B0015DYMVO': u'Belkin Mini 5W 3-Outlet Swivel Travel Charger with Dual USB Ports',
 u'B0019EHU8G': u'Mediabridge ULTRA Series HDMI Cable (6 Foot) - High-Speed Supports Ethernet, 3D and Audio Return [Newest Standard]',
 u'B001TH7GSW': u'AmazonBasics Digital Optical Audio Toslink Cable - 6 Feet (1.8 Meters)',
 u'B001TH7GUU': u'AmazonBasics USB 2.0 Extension Cable - A-Male to A-Female - 9.8 Feet (3 Meters)',
 u'B001XURP7W': u'SanDisk Cruzer 4GB USB 2.0 Flash Drive- SDCZ36-004G-B35',
 u'B0027VT6V4': u'Cyber Acoustics 30 Watt Powered Speakers with Subwoofer for PC and Gaming Systems in Standard Packaging, (CA-3602a)',
 u'B002MAPRYU': u'SanDisk Sansa Clip+ 4 GB MP3 Player (Red) (Discontinued by Manufacturer)',
 u'B002QEBMAK': u'WD Elements 500 GB USB 2.0 Desktop External Hard Drive',
 u'B002SZEOLG': u'TP-LINK TL-WN722N Wireless N150 High Gain USB Adapter, 150Mbps, 4dBi External Antenna, WPS Button, Support Windows XP/Vista/7/8',
 u'B002V88HFE': u'eneloop SEC-CSPACER4PK C Size Spacers for use with AA battery cells',
 u'B002WE6D44': u'Transcend 8GB Class 10 SDHC Card (TS8GSDHC10)',
 u'B00316263Y': u'BlueRigger High Speed HDMI Cable with Ethernet 6.6 Feet (2m) - Supports 3D and Audio Return [Latest Version]',
 u'B003ELYQGG': u'Panasonic ErgoFit Best in Class In-Ear Earbud Headphones RP-HJE120-D (Orange) Dynamic Crystal Clear Sound, Ergonomic Comfort-Fit, iPhone, Android Compatible, Noise Isolating Headphones',
 u'B003ES5ZUU': u'AmazonBasics High-Speed HDMI Cable with Ethernet - Braided 6.5 feet/2.0 meters (Discontinued by Manufacturer)',
 u'B0041Q38NU': u'Kingston Datatraveler 101 Gen 2 With urDrive 8GB USB 2.0 (Red)',
 u'B0043T7FXE': u'Logitech M570 Wireless Trackball, Computer Wireless Mouse, Long Range Wireless Mouse',
 u'B0044YU60M': u'Wireless Router w/ WiFi Range Extender Mode (300 Mbps) by Medialink - Easy YouTube Setup Video (Part# MWNWAPR300N )',
 u'B004G6002M': u'SanDisk 16GB Mobile MicroSDHC Class 4 Flash Memory Card- SDSDQM-016G-B35N',
 u'B004GF8TIK': u'Mediabridge USB 2.0 - Micro-USB to USB Cable (6 Feet) - High-Speed A Male to Micro B - (Part# 30-004-06B )',
 u'B004QK7HI8': u'Mohu Leaf 30 TV Antenna, Indoor, 30 Mile Range, Original Paper-thin, Reversible, Paintable, 4K-Ready HDTV, 10 Foot Detachable Cable, Premium Materials for Performance, USA Made, MH-110598',
 u'B004W2JKWG': u'Crucial m4 64GB 2.5-Inch (9.5mm) SATA 6Gb/s Solid State Drive CT064M4SSD2',
 u'B004XC6GJ0': u'ARRIS SURFboard SB6121 DOCSIS 3.0 Cable Modem  (Black,Retail Packaging)',
 u'B0052SCU8U': u'AmazonBasics High-Speed HDMI Cable 2-Pack - 6.5 Feet (2 Meters) Supports Ethernet, 3D, 4K and Audio Return',
 u'B005CLPP84': u'Roku 2 XS 1080p Streaming Player (Old Model)',
 u'B005CT56F8': u'Seagate 320GB HDD SATA 6Gb/s 64MB Cache 3.5-Inch Internal Bare Drive (ST320DM000)',
 u'B005DKZTMG': u'Logitech Wireless Touch Keyboard K400 with Built-In Multi-Touch Touchpad, Black',
 u'B005FYNSPK': u'SanDisk Cruzer Fit 4GB USB 2.0 Low-Profile Flash Drive- SDCZ33-004G-B35',
 u'B005HMKKH4': u'WD My Passport 2TB Portable External USB 3.0 Hard Drive Storage Black (WDBY8L0020BBK-NESN)',
 u'B00622AG6S': u'PowerGen 2.4Amps / 12W Dual USB Car charger Designed for Apple and Android Devices - White',
 u'B006GWO5WK': u'Amazon Kindle 9W PowerFast Adapter for Accelerated Charging',
 u'B006W8U2MU': u'Kingston Digital DataTraveler SE9 8GB USB 2.0 DTSE9H/8GBZ',
 u'B0074BW614': u'Kindle Fire HD 7", Dolby Audio, Dual-Band Wi-Fi, 16 GB (Previous Generation - 2nd),-R',
 u'B007I5JT4S': '',
 u'B007R5YDYA': u'Amazon Kindle Paperwhite Case - Lightest and Thinnest Protective Genuine Leather Cover with Auto Wake/Sleep for Amazon Kindle Paperwhite, Saddle Tan',
 u'B007WTAJTO': u'SanDisk Ultra 64GB MicroSDXC Class 10 UHS Memory Card Speed Up To 30MB/s With Adapter - SDSDQUA-064G-U46A [Old Version]',
 u'B008OHNZI0': u'Tech Armor Ultimate 4-Way 360 Degree Privacy Screen Protector for Apple New iPhone 5, Latest Generation, 1-Pack',
 u'B009A5204K': u'LG Tone HBS-730 Wireless Stereo Headset - Black',
 u'B009SYZ8OC': u'AmazonBasics Apple Certified Lightning to USB Cable - 3 Feet (0.9 Meters) - Black',
 u'B00B46XUQU': u'PORTTA PET0301S 3x1 Port HDMI Switch/Switcher 1080P Supports 3D with IR Wireless Remote Ultra High...',
 u'B00BGGDVOO': u'Roku 3 Streaming Media Player (2014 model)',
 u'B00DR0PDNE': '',
 u'B00E3W15P0': u'[DISCONTINUED] Samsung 840 EVO 120GB 2.5-Inch SATA III Internal SSD (MZ-7TE120BW)'}

In [15]:
# Total number of reviews across the products with at least 1000 reviews;
# reuses `group` instead of grouping the whole data frame a second time.
group[group >= 1000].sum()

89054

In [25]:
# Run the extraction pipeline for every ASIN in asin_1000; pipeline prints
# a start/end time and one progress entry per ASIN.
asin_dict = pipeline(df, asin_1000, product_names)

start time: 21:17:33.204446

working on ASIN B00004ZCJE
----------------------------------------
----------------------------------------

working on ASIN B00007E7JU
----------------------------------------
----------------------------------------

working on ASIN B0002L5R78
----------------------------------------
----------------------------------------

working on ASIN B000BQ7GW8
----------------------------------------
----------------------------------------

working on ASIN B000I68BD4
----------------------------------------
----------------------------------------

working on ASIN B000LRMS66
----------------------------------------
----------------------------------------

working on ASIN B000QUUFRW
----------------------------------------
----------------------------------------

working on ASIN B000S5Q9CA
----------------------------------------
----------------------------------------

working on ASIN B000VX6XL6
----------------------------------------
-----------------------

In [26]:
# Accumulators filled by the loop below: one row per top unigram / bigram
# of every product (later written to CSV).
all_unigrams = []
all_bigrams = []

def is_amod(dep_list, dep_dict=None):
    '''
    INPUT: str, dict (optional)
    OUTPUT: float

    Calculates the fraction (rounded to 3 decimals) of the dependency labels
        recorded for a unigram that are 'amod'

    dep_list: the unigram to look up (name kept for backward compatibility;
        it is a dictionary key, not a list)
    dep_dict: mapping unigram -> list of spacy dependency strings; when
        omitted, falls back to asin_dict[asin]['dep_dict'] using the
        module-level loop variable asin, as before
    A unigram without recorded dependencies yields 0.0
    '''
    if dep_dict is None:
        dep_dict = asin_dict[asin]['dep_dict']

    # .get() so that a missing unigram is not inserted into a defaultdict
    deps = dep_dict.get(dep_list, [])
    if len(deps) == 0:
        return 0.0

    amod = np.array(deps) == 'amod'
    return round(np.mean(amod), 3)

# For every product: print a header, rank its unigrams and bigrams by the
# number of reviews mentioning them, keep the top 20 of each, append them to
# the accumulators and pretty-print both rankings.
for asin in asin_dict:
    entry = asin_dict[asin]
    name = entry['name']

    print(asin)
    print(name)
    print('-' * 40 + ' \n')

    uni_rev_dict = entry['uni_rev_dict']
    big_dist_dict = entry['bigram_avg_dist']
    big_pmi_dict = entry['bigram_pmi']
    big_rev_dict = entry['big_rev_dict']

    # unigram rows: (unigram, amod rate, mean amod polarity flag, # reviews)
    unigrams = list(entry['unigrams'])
    unigrams_amod = [is_amod(unigram) for unigram in unigrams]
    unigrams_amod_pol = [round(np.mean(entry['pol_dict'][unigram]), 2)
                         for unigram in unigrams]
    unigrams_rev_f = [len(uni_rev_dict[unigram]) for unigram in unigrams]

    unigram_rows = zip(unigrams, unigrams_amod, unigrams_amod_pol,
                       unigrams_rev_f)
    top_unigrams = sorted(unigram_rows, key=lambda row: row[3],
                          reverse=True)[:20]
    all_unigrams.extend([name, asin] + list(row) for row in top_unigrams)

    # bigram rows: (bigram, average distance, pmi score, # reviews)
    bigrams = list(entry['bigrams'])
    bigrams_dist = [big_dist_dict[bigram] for bigram in bigrams]
    bigrams_pmi = [big_pmi_dict[bigram] for bigram in bigrams]
    bigrams_rev_f = [len(big_rev_dict[bigram]) for bigram in bigrams]

    bigram_rows = zip(bigrams, bigrams_dist, bigrams_pmi, bigrams_rev_f)
    top_bigrams = sorted(bigram_rows, key=lambda row: row[3],
                         reverse=True)[:20]
    all_bigrams.extend([name, asin] + list(row) for row in top_bigrams)

    pprint(top_unigrams)
    print('')
    pprint(top_bigrams)
    print('')

B00007E7JU
Canon EF 50mm f/1.8 II Camera Lens - Fixed (Discontinued by Manufacturer)
---------------------------------------- 

[(u'lens', 0.202, 0.62, 1171),
 (u'picture', 0.337, 0.78, 367),
 (u'quality', 0.148, 0.7, 360),
 (u'camera', 0.114, 0.57, 349),
 (u'light', 0.458, 0.24, 339),
 (u'portrait', 0.247, 0.7, 291),
 (u'shot', 0.326, 0.71, 276),
 (u'focus', 0.116, 0.35, 255),
 (u'photo', 0.291, 0.75, 247),
 (u'image', 0.319, 0.72, 242),
 (u'plastic', 0.178, 0.68, 187),
 (u'aperture', 0.353, 0.5, 179),
 (u'photography', 0.374, 0.52, 172),
 (u'money', 0.146, 0.48, 164),
 (u'bokeh', 0.281, 0.81, 159),
 (u'field', 0.203, 0.6, 154),
 (u'photographer', 0.301, 0.74, 153),
 (u'depth', 0.251, 0.92, 153),
 (u'bit', 0.121, 0.79, 121),
 (u'dslr', 0.138, 0.87, 106)]

[(u'low light', 1.28, 0.0008, 208),
 (u'build quality', 1.07, 0.0011, 124),
 (u'focal length', 1.02, 0.0065, 81),
 (u'image quality', 1.19, 0.0005, 79),
 (u'auto focus', 1.33, 0.0011, 75),
 (u'manual focus', 1.08, 0.0009, 66),
 (u'fo

In [119]:
# Only run this cell if asin_dict was built with the following parameters:
# unigramer.candidate_unigrams(corpus, min_pct=0.01, amod_pct=0.0)
# bigramer.candidate_bigrams(corpus, cnt_dict, min_pct=0.005, pmi_pct=0, max_avg_dist=3)

import csv

# Dump the accumulated rows, one CSV file per n-gram type.
for out_path, out_rows in [("labeled_unigrams.csv", all_unigrams),
                           ("labeled_bigrams.csv", all_bigrams)]:
    with open(out_path, "wb") as f:
        writer = csv.writer(f)
        writer.writerows(out_rows)

### Evaluation of unigram threshold

In [3]:
# Table used for the threshold evaluation below (columns 3 and 6 are read).
# NOTE(review): presumably a hand-labeled copy of labeled_unigrams.csv - confirm.
uni_df = pd.read_csv('test.csv')

In [17]:
from sklearn.metrics import *

In [57]:
# Positional column selection: column 3 holds the score to threshold and
# column 6 the label, cast to bool. `.iloc` replaces the deprecated `.ix`,
# which is no longer available in current pandas.
X = uni_df.iloc[:, 3].values
y = uni_df.iloc[:, 6].values.astype(bool)

In [32]:
def print_score(X, y, test_values):
    for thresh in test_values:
        acc = round(accuracy_score(y, X > thresh), 3)
        pre = round(precision_score(y, X > thresh), 3)
        rec = round(recall_score(y, X > thresh), 3)
        f1 = round(f1_score(y, X > thresh), 3)
        print thresh, '\t', acc, '\t', pre, '\t', rec, '\t', f1