In [1]:
from __future__ import division
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from pprint import pprint
import numpy as np
import pandas as pd

import datetime
import time
import requests

### Custom Functions

In [2]:
from wordsets import *
from scraper import *
from parsers import *

# My Code

In [3]:
class DFloader(object):
    '''
    INPUT: df
    OUTPUT: None

    Holds the ratings and review texts of a canned amazon dataset
        (already parsed into pandas) as plain python lists.
    '''

    def __init__(self, df, name=None):
        '''
        INPUT: df, str
        OUTPUT: None

        Copies the 'overall' and 'reviewText' columns of df into lists
            and stores the optional name.
        '''
        self.name = name
        self.ratings, self.reviews = (df['overall'].tolist(),
                                      df['reviewText'].tolist())

def pipeline(df, asin_list, product_names=None):
    '''
    INPUT: df, list(str), dict
    OUTPUT: dict

    runs the aspect extraction steps (ReviewSents -> Unigramer ->
        Bigramer) for every asin in asin_list on the loaded data
        frame containing reviews
    outputs asin callable dictionary; each entry holds the keys
        'name', 'corpus', 'unigramer' and 'bigramer'
    'name' is always present and is None when product_names is not
        given or has no entry for the asin
    '''
    # Single-argument print(...) produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('start time: {}'.format(datetime.datetime.now().time().isoformat()))
    print('')

    asin_dict = defaultdict(dict)
    # Treat a missing mapping as "no names known" so that every entry
    # still receives a 'name' key.
    names = product_names or {}

    for asin in asin_list:
        print('working on ASIN {}'.format(asin))
        print('-' * 40)

        # Always record a name: consumers index asin_dict[asin]['name']
        # directly and would otherwise hit a KeyError.
        asin_dict[asin]['name'] = names.get(asin)

        product = DFloader(df[df.asin == asin])
        corpus = ReviewSents(product)

        # The candidate_* return values are not needed here; the
        # extractor objects themselves are stored below.
        unigramer = Unigramer()
        unigramer.candidate_unigrams(corpus, min_pct=0.01, amod_pct=0.094)

        bigramer = Bigramer()
        # 1/2500 is a true division only because the notebook imports
        # `division` from __future__ (it would be 0 otherwise on Python 2).
        bigramer.candidate_bigrams(corpus, unigramer, min_pct=0.005,
                                   pmi_pct=1/2500, max_avg_dist=2)

        unigramer.update_review_count(bigramer)

        asin_dict[asin]['corpus'] = corpus
        asin_dict[asin]['unigramer'] = unigramer
        asin_dict[asin]['bigramer'] = bigramer

        print('-' * 40)
        print('')

    print('end time: {}'.format(datetime.datetime.now().time().isoformat()))
    return asin_dict

In [4]:
def get_item_name(asin_list):
    '''
    INPUT: list
    OUTPUT: dict

    Script that looks up the amazon product page of every ASIN in
        asin_list and returns a dict mapping ASIN -> product name
    the name is '' when the page has no #productTitle element
    raises whatever requests raises on connection failure or timeout
    '''
    # The four pieces form ONE browser User-Agent string; sending a
    # single randomly chosen fragment (e.g. only 'Safari/534.30') is
    # not a valid User-Agent.
    user_agent = ' '.join(['Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30',
                           '(KHTML, like Gecko) Ubuntu/11.04',
                           'Chromium/12.0.742.91 Chrome/12.0.742.91',
                           'Safari/534.30'])
    headers = {'User-Agent': user_agent}
    output = dict()

    for position, asin in enumerate(asin_list):
        if position:
            # Random 5-15 second pause between consecutive requests;
            # nothing to wait for before the first or after the last.
            time.sleep(5 + np.random.random()*10)

        url = 'https://www.amazon.com/dp/{}/'.format(asin)
        # Without a timeout a stalled connection would hang the loop forever.
        html = requests.get(url, headers=headers, timeout=30).content
        soup = BeautifulSoup(html, 'html.parser')

        titles = soup.select('#productTitle')
        output[asin] = titles[0].text.strip() if titles else ''

    return output

# Amazon Electronics Review Corpus

In [5]:
# NOTE(review): machine-specific working directory; the relative paths used
# below (e.g. 'reviews/Electronics_5-2.json') are resolved against it.
cd ~/repos/amazon_review_summarizer/

/Users/Alvin/Repos/amazon_review_summarizer


In [6]:
# Pull the raw review dump into memory, one list element per line of the file.
with open('reviews/Electronics_5-2.json', 'r') as review_file:
    data = list(review_file)

In [7]:
# Drop trailing whitespace/newlines from every line, then wrap the
# comma-joined lines in brackets so the dump parses as one JSON array.
data = [line.rstrip() for line in data]
data_json_str = "[{}]".format(','.join(data))

In [8]:
# Parse the assembled JSON array string into a data frame.
df = pd.read_json(data_json_str)

In [9]:
# Sanity check: column names, non-null counts, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1689188 entries, 0 to 1689187
Data columns (total 9 columns):
asin              1689188 non-null object
helpful           1689188 non-null object
overall           1689188 non-null int64
reviewText        1689188 non-null object
reviewTime        1689188 non-null object
reviewerID        1689188 non-null object
reviewerName      1664458 non-null object
summary           1689188 non-null object
unixReviewTime    1689188 non-null int64
dtypes: int64(2), object(7)
memory usage: 128.9+ MB


In [10]:
# Number of rows in df per ASIN (Series indexed by asin).
group = df.groupby('asin').size()

In [11]:
# How many ASINs have at least 1000 rows in df. `group` already holds
# the per-ASIN sizes, so reuse it instead of re-running the groupby.
len(group[group >= 1000])

52

In [12]:
# ASINs with at least 1000 rows in df. `group` already holds the
# per-ASIN sizes, so reuse it instead of re-running the groupby.
asin_1000 = group[group >= 1000].index.tolist()
asin_1000

[u'B00004ZCJE',
 u'B00007E7JU',
 u'B0002L5R78',
 u'B000BQ7GW8',
 u'B000I68BD4',
 u'B000LRMS66',
 u'B000QUUFRW',
 u'B000S5Q9CA',
 u'B000VX6XL6',
 u'B0012S4APK',
 u'B0015DYMVO',
 u'B0019EHU8G',
 u'B001TH7GSW',
 u'B001TH7GUU',
 u'B001XURP7W',
 u'B0027VT6V4',
 u'B002MAPRYU',
 u'B002QEBMAK',
 u'B002SZEOLG',
 u'B002V88HFE',
 u'B002WE6D44',
 u'B00316263Y',
 u'B003ELYQGG',
 u'B003ES5ZUU',
 u'B0041Q38NU',
 u'B0043T7FXE',
 u'B0044YU60M',
 u'B004G6002M',
 u'B004GF8TIK',
 u'B004QK7HI8',
 u'B004W2JKWG',
 u'B004XC6GJ0',
 u'B0052SCU8U',
 u'B005CLPP84',
 u'B005CT56F8',
 u'B005DKZTMG',
 u'B005FYNSPK',
 u'B005HMKKH4',
 u'B00622AG6S',
 u'B006GWO5WK',
 u'B006W8U2MU',
 u'B0074BW614',
 u'B007I5JT4S',
 u'B007R5YDYA',
 u'B007WTAJTO',
 u'B008OHNZI0',
 u'B009A5204K',
 u'B009SYZ8OC',
 u'B00B46XUQU',
 u'B00BGGDVOO',
 u'B00DR0PDNE',
 u'B00E3W15P0']

In [13]:
# product_names = get_item_name(asin_1000)  # live scrape, sleeps 5-15 s per ASIN; left disabled, names are hard-coded in the next cell

In [14]:
# Product titles keyed by ASIN, one entry per ASIN.
# NOTE(review): presumably captured from an earlier get_item_name(asin_1000)
# run and pasted here to avoid re-scraping — confirm.
product_names = {u'B00004ZCJE': u'Tiffen 46mm UV Protection Filter',
 u'B00007E7JU': u'Canon EF 50mm f/1.8 II Camera Lens - Fixed (Discontinued by Manufacturer)',
 u'B0002L5R78': u"DVI Gear HDMI-2M 2M/6' HDMI Cable",
 u'B000BQ7GW8': u'SanDisk 2GB Class 4 SD Flash Memory Card- SDSDB-002G-B35 (Label May Change)',
 u'B000I68BD4': u'JLab Audio JBuds Hi-Fi Noise-Reducing Ear Buds, GUARANTEED FOR LIFE - White',
 u'B000LRMS66': u'Garmin Portable Friction Mount',
 u'B000QUUFRW': u'SanDisk 4GB Extreme SDHC Class 10 Memory Card',
 u'B000S5Q9CA': u'Motorola Vehicle Power Adapter micro-USB Charger',
 u'B000VX6XL6': u'Kingston 4 GB microSDHC Class 4 Flash Memory Card SDC4/4GBET',
 u'B0012S4APK': u'Cheetah Mounts APTMM2B TV Wall Mount for 20-75-Inch TVs Bundle with 10-feet Braided HDMI Cable and a 6-Inch 3-Axis Magnetic Bubble',
 u'B0015DYMVO': u'Belkin Mini 5W 3-Outlet Swivel Travel Charger with Dual USB Ports',
 u'B0019EHU8G': u'Mediabridge ULTRA Series HDMI Cable (6 Foot) - High-Speed Supports Ethernet, 3D and Audio Return [Newest Standard]',
 u'B001TH7GSW': u'AmazonBasics Digital Optical Audio Toslink Cable - 6 Feet (1.8 Meters)',
 u'B001TH7GUU': u'AmazonBasics USB 2.0 Extension Cable - A-Male to A-Female - 9.8 Feet (3 Meters)',
 u'B001XURP7W': u'SanDisk Cruzer 4GB USB 2.0 Flash Drive- SDCZ36-004G-B35',
 u'B0027VT6V4': u'Cyber Acoustics 30 Watt Powered Speakers with Subwoofer for PC and Gaming Systems in Standard Packaging, (CA-3602a)',
 u'B002MAPRYU': u'SanDisk Sansa Clip+ 4 GB MP3 Player (Red) (Discontinued by Manufacturer)',
 u'B002QEBMAK': u'WD Elements 500 GB USB 2.0 Desktop External Hard Drive',
 u'B002SZEOLG': u'TP-LINK TL-WN722N Wireless N150 High Gain USB Adapter, 150Mbps, 4dBi External Antenna, WPS Button, Support Windows XP/Vista/7/8',
 u'B002V88HFE': u'eneloop SEC-CSPACER4PK C Size Spacers for use with AA battery cells',
 u'B002WE6D44': u'Transcend 8GB Class 10 SDHC Card (TS8GSDHC10)',
 u'B00316263Y': u'BlueRigger High Speed HDMI Cable with Ethernet 6.6 Feet (2m) - Supports 3D and Audio Return [Latest Version]',
 u'B003ELYQGG': u'Panasonic ErgoFit Best in Class In-Ear Earbud Headphones RP-HJE120-D (Orange) Dynamic Crystal Clear Sound, Ergonomic Comfort-Fit, iPhone, Android Compatible, Noise Isolating Headphones',
 u'B003ES5ZUU': u'AmazonBasics High-Speed HDMI Cable with Ethernet - Braided 6.5 feet/2.0 meters (Discontinued by Manufacturer)',
 u'B0041Q38NU': u'Kingston Datatraveler 101 Gen 2 With urDrive 8GB USB 2.0 (Red)',
 u'B0043T7FXE': u'Logitech M570 Wireless Trackball, Computer Wireless Mouse, Long Range Wireless Mouse',
 u'B0044YU60M': u'Wireless Router w/ WiFi Range Extender Mode (300 Mbps) by Medialink - Easy YouTube Setup Video (Part# MWNWAPR300N )',
 u'B004G6002M': u'SanDisk 16GB Mobile MicroSDHC Class 4 Flash Memory Card- SDSDQM-016G-B35N',
 u'B004GF8TIK': u'Mediabridge USB 2.0 - Micro-USB to USB Cable (6 Feet) - High-Speed A Male to Micro B - (Part# 30-004-06B )',
 u'B004QK7HI8': u'Mohu Leaf 30 TV Antenna, Indoor, 30 Mile Range, Original Paper-thin, Reversible, Paintable, 4K-Ready HDTV, 10 Foot Detachable Cable, Premium Materials for Performance, USA Made, MH-110598',
 u'B004W2JKWG': u'Crucial m4 64GB 2.5-Inch (9.5mm) SATA 6Gb/s Solid State Drive CT064M4SSD2',
 u'B004XC6GJ0': u'ARRIS SURFboard SB6121 DOCSIS 3.0 Cable Modem  (Black,Retail Packaging)',
 u'B0052SCU8U': u'AmazonBasics High-Speed HDMI Cable 2-Pack - 6.5 Feet (2 Meters) Supports Ethernet, 3D, 4K and Audio Return',
 u'B005CLPP84': u'Roku 2 XS 1080p Streaming Player (Old Model)',
 u'B005CT56F8': u'Seagate 320GB HDD SATA 6Gb/s 64MB Cache 3.5-Inch Internal Bare Drive (ST320DM000)',
 u'B005DKZTMG': u'Logitech Wireless Touch Keyboard K400 with Built-In Multi-Touch Touchpad, Black',
 u'B005FYNSPK': u'SanDisk Cruzer Fit 4GB USB 2.0 Low-Profile Flash Drive- SDCZ33-004G-B35',
 u'B005HMKKH4': u'WD My Passport 2TB Portable External USB 3.0 Hard Drive Storage Black (WDBY8L0020BBK-NESN)',
 u'B00622AG6S': u'PowerGen 2.4Amps / 12W Dual USB Car charger Designed for Apple and Android Devices - White',
 u'B006GWO5WK': u'Amazon Kindle 9W PowerFast Adapter for Accelerated Charging',
 u'B006W8U2MU': u'Kingston Digital DataTraveler SE9 8GB USB 2.0 DTSE9H/8GBZ',
 u'B0074BW614': u'Kindle Fire HD 7", Dolby Audio, Dual-Band Wi-Fi, 16 GB (Previous Generation - 2nd),-R',
 u'B007I5JT4S': u'Apple TV MD199LL/A [NEWEST VERSION]',
 u'B007R5YDYA': u'Amazon Kindle Paperwhite Case - Lightest and Thinnest Protective Genuine Leather Cover with Auto Wake/Sleep for Amazon Kindle Paperwhite, Saddle Tan',
 u'B007WTAJTO': u'SanDisk Ultra 64GB MicroSDXC Class 10 UHS Memory Card Speed Up To 30MB/s With Adapter - SDSDQUA-064G-U46A [Old Version]',
 u'B008OHNZI0': u'Tech Armor Ultimate 4-Way 360 Degree Privacy Screen Protector for Apple New iPhone 5, Latest Generation, 1-Pack',
 u'B009A5204K': u'LG Tone HBS-730 Wireless Stereo Headset - Black',
 u'B009SYZ8OC': u'AmazonBasics Apple Certified Lightning to USB Cable - 3 Feet (0.9 Meters) - Black',
 u'B00B46XUQU': u'PORTTA PET0301S 3x1 Port HDMI Switch/Switcher 1080P Supports 3D with IR Wireless Remote Ultra High...',
 u'B00BGGDVOO': u'Roku 3 Streaming Media Player (2014 model)',
 u'B00DR0PDNE': u'Google Chromecast HDMI Streaming Media Player',
 u'B00E3W15P0': u'[DISCONTINUED] Samsung 840 EVO 120GB 2.5-Inch SATA III Internal SSD (MZ-7TE120BW)'}

In [15]:
# Total number of rows in df belonging to ASINs with at least 1000 rows.
# `group` already holds the per-ASIN sizes, so reuse it instead of
# re-running the groupby.
group[group >= 1000].sum()

89054

In [16]:
# Build corpus / unigramer / bigramer for every ASIN in asin_1000;
# pipeline() prints its start/end time and a progress line per ASIN.
asin_dict = pipeline(df, asin_1000, product_names)

start time: 23:14:22.630081

working on ASIN B00004ZCJE
----------------------------------------
----------------------------------------

working on ASIN B00007E7JU
----------------------------------------


  arr = np.array(self.dep_dict[word]) == 'amod'


----------------------------------------

working on ASIN B0002L5R78
----------------------------------------
----------------------------------------

working on ASIN B000BQ7GW8
----------------------------------------
----------------------------------------

working on ASIN B000I68BD4
----------------------------------------
----------------------------------------

working on ASIN B000LRMS66
----------------------------------------
----------------------------------------

working on ASIN B000QUUFRW
----------------------------------------
----------------------------------------

working on ASIN B000S5Q9CA
----------------------------------------
----------------------------------------

working on ASIN B000VX6XL6
----------------------------------------
----------------------------------------

working on ASIN B0012S4APK
----------------------------------------
----------------------------------------

working on ASIN B0015DYMVO
----------------------------------------
----------

In [17]:
# Collect the text of every sentence that mentions a candidate aspect:
#   review_dict[rating][(asin, product name, aspect, review index)] -> text
review_dict = defaultdict(lambda: defaultdict(str))

for asin in asin_dict:
    entry = asin_dict[asin]
    name = entry['name']

    # Single-argument print(...) emits exactly what the Python 2 print
    # statement would for these values.
    print(asin)
    print(name)
    print('{} \n'.format('-' * 40))

    corpus = entry['corpus']
    unigramer = entry['unigramer']
    bigramer = entry['bigramer']

    unigrams = list(unigramer.unigrams)
    bigrams = list(bigramer.bigrams)

    # Pair every aspect with the size of its rev_dict entry, unigrams
    # first and bigrams after them.
    aspect_counts = [(term, len(unigramer.rev_dict[term])) for term in unigrams]
    aspect_counts += [(term, len(bigramer.rev_dict[term])) for term in bigrams]

    # Twenty largest counts; the stable sort keeps the original order
    # among aspects with equal counts.
    top_aspects = sorted(aspect_counts, key=lambda pair: pair[1],
                         reverse=True)[:20]

    pprint(top_aspects)
    print('')

    # Same bookkeeping for both extractors: unigrams first, then bigrams.
    for extractor, terms in ((unigramer, unigrams), (bigramer, bigrams)):
        for term in terms:
            for idx in extractor.sent_dict[term]:
                sentence = corpus.sentences[idx]
                rating = sentence.review_rate
                review = sentence.review_idx
                text = sentence.sent.string

                review_dict[rating][(asin, name, term, review)] += text

B00007E7JU
Canon EF 50mm f/1.8 II Camera Lens - Fixed (Discontinued by Manufacturer)
---------------------------------------- 

[(u'lens', 1171),
 (u'picture', 367),
 (u'camera', 349),
 (u'portrait', 291),
 (u'shot', 276),
 (u'photo', 247),
 (u'low light', 208),
 (u'plastic', 187),
 (u'quality', 176),
 (u'photography', 172),
 (u'money', 164),
 (u'image', 163),
 (u'bokeh', 159),
 (u'field', 154),
 (u'photographer', 153),
 (u'depth', 153),
 (u'aperture', 135),
 (u'light', 125),
 (u'build quality', 124),
 (u'bit', 121)]

B0027VT6V4
Cyber Acoustics 30 Watt Powered Speakers with Subwoofer for PC and Gaming Systems in Standard Packaging, (CA-3602a)
---------------------------------------- 

[(u'speaker', 823),
 (u'sound', 653),
 (u'bass', 477),
 (u'system', 395),
 (u'quality', 326),
 (u'set', 225),
 (u'volume', 224),
 (u'volume control', 195),
 (u'music', 192),
 (u'room', 137),
 (u'money', 127),
 (u'range', 108),
 (u'control', 105),
 (u'bit', 97),
 (u'base', 88),
 (u'level', 84),
 (u'satelli

In [35]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from afinn import Afinn

sid = SentimentIntensityAnalyzer()
afinn = Afinn()

# Fixed seed so that the same entries are drawn on every run.
np.random.seed(0)
random_reviews = []

for rating in review_dict:
    keys = list(review_dict[rating].keys())

    # Draw at most 200 entries per rating: np.random.choice(...,
    # replace=False) raises ValueError when asked for more items than
    # the bucket holds.
    sample_size = min(200, len(keys))
    if sample_size == 0:
        continue
    rand = np.random.choice(len(keys), sample_size, replace=False)

    for idx in rand:
        key = keys[idx]
        _, name, aspect, _ = key
        text = review_dict[rating][key]
        pol = round(TextBlob(text).sentiment.polarity, 3)
        pol_v = sid.polarity_scores(text)['compound']
        pol_a = afinn.score(text)
        # Row layout: rating, product name, aspect, TextBlob polarity,
        # VADER compound score, AFINN score, sentence text.
        random_reviews.append([rating, name, aspect, pol, pol_v, pol_a, text])

# Order by rating, then by TextBlob polarity.
random_reviews = sorted(random_reviews, key=lambda x: (x[0], x[3]))

In [36]:
import csv

# Python 2's csv module cannot write unicode objects that contain
# non-ASCII characters (UnicodeEncodeError), so text fields are encoded
# to UTF-8 byte strings first; numbers and None pass through untouched.
# Columns: rating, product name, aspect, TextBlob polarity,
# VADER compound score, AFINN score, sentence text.
with open("random_reviews.csv", "wb") as f:
    writer = csv.writer(f)
    for row in random_reviews:
        writer.writerow([field.encode('utf-8') if isinstance(field, unicode)
                         else field for field in row])

In [39]:
# only keep sentences from reviews with a customer rating of 3 whose
# textblob polarity score is >= 0.7
#   review_dict[rating][(asin, product name, aspect, review index)] -> text

from textblob import TextBlob

review_dict = defaultdict(lambda: defaultdict(str))

for asin in asin_dict:
    entry = asin_dict[asin]
    name = entry['name']

    # Single-argument print(...) emits exactly what the Python 2 print
    # statement would for these values.
    print(asin)
    print(name)
    print('{} \n'.format('-' * 40))

    corpus = entry['corpus']
    unigramer = entry['unigramer']
    bigramer = entry['bigramer']

    unigrams = list(unigramer.unigrams)
    bigrams = list(bigramer.bigrams)

    # Pair every aspect with the size of its rev_dict entry, unigrams
    # first and bigrams after them.
    aspect_counts = [(term, len(unigramer.rev_dict[term])) for term in unigrams]
    aspect_counts += [(term, len(bigramer.rev_dict[term])) for term in bigrams]

    # Twenty largest counts; the stable sort keeps the original order
    # among aspects with equal counts.
    top_aspects = sorted(aspect_counts, key=lambda pair: pair[1],
                         reverse=True)[:20]

    pprint(top_aspects)
    print('')

    # Same bookkeeping for both extractors: unigrams first, then bigrams.
    for extractor, terms in ((unigramer, unigrams), (bigramer, bigrams)):
        for term in terms:
            for idx in extractor.sent_dict[term]:
                sentence = corpus.sentences[idx]
                rating = sentence.review_rate
                review = sentence.review_idx
                text = sentence.sent.string

                # The polarity is only computed for 3-star sentences
                # (short-circuit), exactly as in the nested form.
                if rating != 3 or TextBlob(text).sentiment.polarity < 0.7:
                    continue

                review_dict[rating][(asin, name, term, review)] += text

B00007E7JU
Canon EF 50mm f/1.8 II Camera Lens - Fixed (Discontinued by Manufacturer)
---------------------------------------- 

[(u'lens', 1171),
 (u'picture', 367),
 (u'camera', 349),
 (u'portrait', 291),
 (u'shot', 276),
 (u'photo', 247),
 (u'low light', 208),
 (u'plastic', 187),
 (u'quality', 176),
 (u'photography', 172),
 (u'money', 164),
 (u'image', 163),
 (u'bokeh', 159),
 (u'field', 154),
 (u'photographer', 153),
 (u'depth', 153),
 (u'aperture', 135),
 (u'light', 125),
 (u'build quality', 124),
 (u'bit', 121)]

B0027VT6V4
Cyber Acoustics 30 Watt Powered Speakers with Subwoofer for PC and Gaming Systems in Standard Packaging, (CA-3602a)
---------------------------------------- 

[(u'speaker', 823),
 (u'sound', 653),
 (u'bass', 477),
 (u'system', 395),
 (u'quality', 326),
 (u'set', 225),
 (u'volume', 224),
 (u'volume control', 195),
 (u'music', 192),
 (u'room', 137),
 (u'money', 127),
 (u'range', 108),
 (u'control', 105),
 (u'bit', 97),
 (u'base', 88),
 (u'level', 84),
 (u'satelli

In [40]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from afinn import Afinn

sid = SentimentIntensityAnalyzer()
afinn = Afinn()

# Fixed seed so that the same entries are drawn on every run.
np.random.seed(0)
random_reviews = []

for rating in review_dict:
    keys = list(review_dict[rating].keys())

    # Draw at most 200 entries per rating: np.random.choice(...,
    # replace=False) raises ValueError when asked for more items than
    # the bucket holds, which is likely once review_dict has been
    # filtered down.
    sample_size = min(200, len(keys))
    if sample_size == 0:
        continue
    rand = np.random.choice(len(keys), sample_size, replace=False)

    for idx in rand:
        key = keys[idx]
        _, name, aspect, _ = key
        text = review_dict[rating][key]
        pol = round(TextBlob(text).sentiment.polarity, 3)
        pol_v = sid.polarity_scores(text)['compound']
        pol_a = afinn.score(text)
        # Row layout: rating, product name, aspect, TextBlob polarity,
        # VADER compound score, AFINN score, sentence text.
        random_reviews.append([rating, name, aspect, pol, pol_v, pol_a, text])

# Order by rating, then by TextBlob polarity.
random_reviews = sorted(random_reviews, key=lambda x: (x[0], x[3]))

In [41]:
import csv

# Python 2's csv module cannot write unicode objects that contain
# non-ASCII characters (UnicodeEncodeError), so text fields are encoded
# to UTF-8 byte strings first; numbers and None pass through untouched.
# Columns: rating, product name, aspect, TextBlob polarity,
# VADER compound score, AFINN score, sentence text.
with open("random_reviews_rating3_ge0.7.csv", "wb") as f:
    writer = csv.writer(f)
    for row in random_reviews:
        writer.writerow([field.encode('utf-8') if isinstance(field, unicode)
                         else field for field in row])

In [42]:
# only keep sentences whose abs(textblob polarity score) is >= 0.8
#   review_dict[rating][(asin, product name, aspect, review index)] -> text

from textblob import TextBlob

review_dict = defaultdict(lambda: defaultdict(str))

for asin in asin_dict:
    entry = asin_dict[asin]
    name = entry['name']

    # Single-argument print(...) emits exactly what the Python 2 print
    # statement would for these values.
    print(asin)
    print(name)
    print('{} \n'.format('-' * 40))

    corpus = entry['corpus']
    unigramer = entry['unigramer']
    bigramer = entry['bigramer']

    unigrams = list(unigramer.unigrams)
    bigrams = list(bigramer.bigrams)

    # Pair every aspect with the size of its rev_dict entry, unigrams
    # first and bigrams after them.
    aspect_counts = [(term, len(unigramer.rev_dict[term])) for term in unigrams]
    aspect_counts += [(term, len(bigramer.rev_dict[term])) for term in bigrams]

    # Twenty largest counts; the stable sort keeps the original order
    # among aspects with equal counts.
    top_aspects = sorted(aspect_counts, key=lambda pair: pair[1],
                         reverse=True)[:20]

    pprint(top_aspects)
    print('')

    # Same bookkeeping for both extractors: unigrams first, then bigrams.
    for extractor, terms in ((unigramer, unigrams), (bigramer, bigrams)):
        for term in terms:
            for idx in extractor.sent_dict[term]:
                sentence = corpus.sentences[idx]
                rating = sentence.review_rate
                review = sentence.review_idx
                text = sentence.sent.string

                # Skip sentences without a strong polarity in either direction.
                if abs(TextBlob(text).sentiment.polarity) < 0.8:
                    continue

                review_dict[rating][(asin, name, term, review)] += text

B00007E7JU
Canon EF 50mm f/1.8 II Camera Lens - Fixed (Discontinued by Manufacturer)
---------------------------------------- 

[(u'lens', 1171),
 (u'picture', 367),
 (u'camera', 349),
 (u'portrait', 291),
 (u'shot', 276),
 (u'photo', 247),
 (u'low light', 208),
 (u'plastic', 187),
 (u'quality', 176),
 (u'photography', 172),
 (u'money', 164),
 (u'image', 163),
 (u'bokeh', 159),
 (u'field', 154),
 (u'photographer', 153),
 (u'depth', 153),
 (u'aperture', 135),
 (u'light', 125),
 (u'build quality', 124),
 (u'bit', 121)]

B0027VT6V4
Cyber Acoustics 30 Watt Powered Speakers with Subwoofer for PC and Gaming Systems in Standard Packaging, (CA-3602a)
---------------------------------------- 

[(u'speaker', 823),
 (u'sound', 653),
 (u'bass', 477),
 (u'system', 395),
 (u'quality', 326),
 (u'set', 225),
 (u'volume', 224),
 (u'volume control', 195),
 (u'music', 192),
 (u'room', 137),
 (u'money', 127),
 (u'range', 108),
 (u'control', 105),
 (u'bit', 97),
 (u'base', 88),
 (u'level', 84),
 (u'satelli

In [43]:
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from afinn import Afinn

sid = SentimentIntensityAnalyzer()
afinn = Afinn()

# Fixed seed so that the same entries are drawn on every run.
np.random.seed(0)
random_reviews = []

for rating in review_dict:
    keys = list(review_dict[rating].keys())

    # Draw at most 200 entries per rating: np.random.choice(...,
    # replace=False) raises ValueError when asked for more items than
    # the bucket holds, which is likely once review_dict has been
    # filtered down.
    sample_size = min(200, len(keys))
    if sample_size == 0:
        continue
    rand = np.random.choice(len(keys), sample_size, replace=False)

    for idx in rand:
        key = keys[idx]
        _, name, aspect, _ = key
        text = review_dict[rating][key]
        pol = round(TextBlob(text).sentiment.polarity, 3)
        pol_v = sid.polarity_scores(text)['compound']
        pol_a = afinn.score(text)
        # Row layout: rating, product name, aspect, TextBlob polarity,
        # VADER compound score, AFINN score, sentence text.
        random_reviews.append([rating, name, aspect, pol, pol_v, pol_a, text])

# Order by rating, then by TextBlob polarity.
random_reviews = sorted(random_reviews, key=lambda x: (x[0], x[3]))

In [44]:
import csv

# Python 2's csv module cannot write unicode objects that contain
# non-ASCII characters (UnicodeEncodeError), so text fields are encoded
# to UTF-8 byte strings first; numbers and None pass through untouched.
# Columns: rating, product name, aspect, TextBlob polarity,
# VADER compound score, AFINN score, sentence text.
with open("random_reviews_abs_ge0.8.csv", "wb") as f:
    writer = csv.writer(f)
    for row in random_reviews:
        writer.writerow([field.encode('utf-8') if isinstance(field, unicode)
                         else field for field in row])