# Amazon Web Scraping

### Section 1

We can limit ourselves to just the "Electronics" SearchIndex, since any IoT device will have at least some generic category there. When we do this, the number of items returned is fairly small in some cases.

> **NOTE:** A single search returns at most 100 items (10 pages of 10 items each).

In [16]:
# Setup
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import bottlenose
import pickle
import time

# Build the Product Advertising API client; responses are parsed into
# BeautifulSoup XML documents.
#
# Credentials are read from the environment instead of being committed with
# the notebook. The keys that used to be hard-coded here must be treated as
# leaked and rotated in the AWS console.
import os

amazon = bottlenose.Amazon(
    os.environ['AWS_ACCESS_KEY_ID'],
    os.environ['AWS_SECRET_ACCESS_KEY'],
    os.environ['AWS_ASSOCIATE_TAG'],
    Parser=lambda text: BeautifulSoup(text, 'xml'))

# Search keywords per device: {device name: {field name: keyword}}.
# Each keyword is sent as the `Brand` argument of an ItemSearch request below.
# NOTE(review): the field names ('O', 'CN', 'emailAddress') presumably refer to
# X.509 certificate subject fields observed for each device - confirm.
device_keyword_fields = {
    'SmartThings': {
        'O': 'Physical Graph',
        'emailAddress': 'smartthings',
        # NOTE(review): 'example' is not one of the fields discussed in the
        # notebook text; looks like a leftover experiment - confirm.
        'example': 'Physical'
    },
    'Amazon_Echo': {
        'O': 'Amazon',
        'CN': 'amazon'
    },
    'Netatmo_Welcome': {
        'O': 'Netatmo',
        'CN': 'netatmo',
        'emailAddress': 'netatmo'
    },
    'Samsung_SmartCam': {
        'O': 'Hanwha Techwin',
        'CN': 'samsungsmartcam'
    },
    'Dropcam': {
        'O': 'Dropcam',
        'CN': 'dropcam'
    },
    'Insteon_Camera': {
        'O': 'IPCam',
        'CN': 'IPC'
    },
    'TP-Link_Smart-Plug': {
        'O': 'TP-LINK TECHNOLOGIES',
        'CN': 'tplinkcloud'
    },
    'LiFX_Light-Bulb': {
        'O': 'LIFX',
        'CN': 'lifx'
    },
    'Triby_Speaker': {
        'O': 'invoxia',
        'CN': 'invoxia',
        'emailAddress': 'invoxia'
    },
    'Pix-Star_Photo-Frame': {
        'CN': 'pix-star'
    },
    'HP_Printer': {
        'O': 'HP',
        'CN': 'hpeprint'
    }
}

# Terms to exclude from search results. Each word is prefixed with '-' and the
# whole list is joined into the `Keywords` argument of the ItemSearch request.
ignore_words = ['tablet', 'laptop', 'case', 'bag', 'sleeve', 
                'eBook', 'car', 'desktop', 'toner', 
                'calculator', 'cord', 'telephone', 'motherboard', 'cooling',
                'stand', 'headset',
                'adapter', 'peripheral', 'server', 'disc',
                'paper', 'pad', 'headphone', 'mice', 'mouse', 'keyboard',
                'game', 'turntable', 'dictionary']


# Search for each device keyword (Electronics only).
#
# Result layout: search_results[device][field][page] -> parsed XML response.
# Paging continues while a page comes back full (10 items) and stops at
# MAX_PAGES, since a search yields at most 100 items.
#
# HTTP errors (typically 503 throttling) are retried with exponential backoff,
# but only MAX_ATTEMPTS times in a row, so a persistent failure can no longer
# spin forever; the remaining pages for that keyword are then skipped.
MAX_PAGES = 10
MAX_ATTEMPTS = 5

# The exclusion terms are the same for every request, so build them once.
exclusions = ' '.join('-{}'.format(word) for word in ignore_words)

search_results = {}
for device, fields in device_keyword_fields.items():
    search_results[device] = {}
    for field, keyword in fields.items():
        pages = search_results[device][field] = {}
        page = 1
        failures = 0
        while page <= MAX_PAGES:
            # Base delay of 1.1s between requests, doubled after each
            # consecutive failure.
            time.sleep(1.1 * 2 ** failures)
            print('Searching for "{}", page {}'.format(keyword, page))
            try:
                response = amazon.ItemSearch(
                    Keywords=exclusions,
                    Brand=keyword,
                    SearchIndex='Electronics',
                    ResponseGroup='ItemAttributes, BrowseNodes, EditorialReview',
                    ItemPage=page
                )
            except HTTPError as err:
                print('Error ({}, page {}): {}'.format(device, page, err))
                failures += 1
                if failures >= MAX_ATTEMPTS:
                    print('Giving up on "{}" after {} failed attempts'.format(keyword, failures))
                    break
                continue

            failures = 0
            pages[page] = response
            # A page that is not full is the last one.
            if len(response.find_all('Item')) != 10:
                break
            page += 1
print()

# Save to a file so we don't have to do this each time
#with open('amazon_search_results.pickle', 'wb') as f:
#    pickle.dump(search_results, f)

# Tally the number of <Item> elements returned for every device/field keyword
# and print one aligned row per keyword.
result_counts = {}
for device, fields in search_results.items():
    per_field = result_counts[device] = {}
    for field, pages in fields.items():
        per_field[field] = sum(len(result.find_all('Item')) for result in pages.values())
        print('{0:.<25}|{1:.<15}|{2}'.format(device, device_keyword_fields[device][field], per_field[field]))

Searching for "IPCam", page 1
Searching for "IPC", page 1


KeyboardInterrupt: 

In [6]:
# Setup
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import pandas as pd
import bottlenose
import pickle
import time

# Build the Product Advertising API client; responses are parsed into
# BeautifulSoup XML documents.
#
# Credentials are read from the environment instead of being committed with
# the notebook. The keys that used to be hard-coded here must be treated as
# leaked and rotated in the AWS console.
import os

amazon = bottlenose.Amazon(
    os.environ['AWS_ACCESS_KEY_ID'],
    os.environ['AWS_SECRET_ACCESS_KEY'],
    os.environ['AWS_ASSOCIATE_TAG'],
    Parser=lambda text: BeautifulSoup(text, 'xml'))

# Load the device list. `device_names` drives the searches below and
# `long_names` maps each row index to "<device name>: <short description>".
iotlist_df = pd.read_json('iotlist-devices.json')
device_names = list(iotlist_df['device_name'])
long_names = dict(enumerate(
    '{}: {}'.format(name, description)
    for name, description in zip(device_names, iotlist_df['short_desc'])))
# The exclusion list is switched off for this run. The triple-quoted block
# below is an unused string literal that only preserves the previous list;
# the live value is the empty list assigned after it, so no '-term'
# exclusions are appended to the search keywords.
"""
ignore_words = ['tablet', 'laptop', 'case', 'bag', 'sleeve', 
                'eBook', 'car', 'desktop', 'toner', 
                'calculator', 'cord', 'telephone', 'motherboard', 'cooling',
                'stand', 'headset',
                'adapter', 'peripheral', 'server', 'disc',
                'paper', 'pad', 'headphone', 'mice', 'mouse', 'keyboard',
                'game', 'turntable', 'dictionary']
"""
ignore_words = []
# Search for each device name (Electronics only).
#
# Result layout: search_results[i] -> parsed XML response for device_names[i]
# (first page of results only).
#
# HTTP errors (typically 503 throttling) are retried with exponential backoff,
# but at most MAX_ATTEMPTS times per device, so a persistent failure can no
# longer spin forever. A device whose attempts all fail keeps the empty-dict
# placeholder so that every index still has an entry.
MAX_ATTEMPTS = 5

# The exclusion terms are the same for every request, so build them once.
exclusions = ['-{}'.format(word) for word in ignore_words]

search_results = {}
for i, device in enumerate(device_names):
    # Joining a list avoids a trailing space when there are no exclusions.
    keywords = ' '.join([device] + exclusions)
    search_results[i] = {}

    for attempt in range(MAX_ATTEMPTS):
        # Base delay of 1.1s between requests, doubled after each failure.
        time.sleep(1.1 * 2 ** attempt)
        print('Searching for "{}"'.format(device))
        try:
            search_results[i] = amazon.ItemSearch(
                Keywords=keywords,
                SearchIndex='Electronics',
                ResponseGroup='ItemAttributes, BrowseNodes, EditorialReview'
            )
        except HTTPError as err:
            print('Error ({}): {}'.format(device, err))
        else:
            break
    else:
        print('Giving up on "{}" after {} failed attempts'.format(device, MAX_ATTEMPTS))
print()

Searching for "Nest Thermostat"
Error (Nest Thermostat): HTTP Error 503: Service Unavailable
Searching for "Nest Thermostat"
Searching for "August"
Searching for "HomePod"
Error (HomePod): HTTP Error 503: Service Unavailable
Searching for "HomePod"
Error (HomePod): HTTP Error 503: Service Unavailable
Searching for "HomePod"
Searching for "Amazon Echo Show"
Searching for "Amazon Echo Look"
Error (Amazon Echo Look): HTTP Error 503: Service Unavailable
Searching for "Amazon Echo Look"
Searching for "Awair Glow"
Searching for "TP-Link"
Searching for "Wemo Mini"
Searching for "Google Daydream"
Searching for "Lyric"
Searching for "Talkies"
Searching for "Google Wifi"
Searching for "Ring"
Searching for "Google Home"
Error (Google Home): HTTP Error 503: Service Unavailable
Searching for "Google Home"
Error (Google Home): HTTP Error 503: Service Unavailable
Searching for "Google Home"
Error (Google Home): HTTP Error 503: Service Unavailable
Searching for "Google Home"
Error (Google Home): HTTP 

Searching for "MOTA SmartRing"
Error (MOTA SmartRing): HTTP Error 503: Service Unavailable
Searching for "MOTA SmartRing"
Searching for "Olive"
Searching for "FitBark"
Searching for "SmartMat"
Searching for "Highfive"
Searching for "Wink"
Searching for "iBaby Monitor"
Searching for "Rico"
Searching for "Notion"
Searching for "Ring"
Error (Ring): HTTP Error 503: Service Unavailable
Searching for "Ring"
Error (Ring): HTTP Error 503: Service Unavailable
Searching for "Ring"
Searching for "Hue Tap"
Error (Hue Tap): HTTP Error 503: Service Unavailable
Searching for "Hue Tap"
Searching for "Wink Relay"
Searching for "Tory Burch Fitbit"
Searching for "Tilt"
Searching for "Logitech Harmony"
Searching for "ecobee"
Searching for "Cloud Your Car"
Searching for "CHUI"
Searching for "EO1"
Searching for "Aura"
Searching for "ZEN Thermostat"
Searching for "Reemo"
Searching for "Eve"
Searching for "Drop"
Searching for "Glyph"
Searching for "runScribe"
Searching for "The Defender"
Searching for "Butter

Searching for "Sammy Screamer"
Searching for "MetaWatch STRATA"
Searching for "WeMo Light Switch"
Error (WeMo Light Switch): HTTP Error 503: Service Unavailable
Searching for "WeMo Light Switch"
Error (WeMo Light Switch): HTTP Error 503: Service Unavailable
Searching for "WeMo Light Switch"
Searching for "Chromecast"
Searching for "Alima"
Error (Alima): HTTP Error 503: Service Unavailable
Searching for "Alima"
Searching for "Lapka BAM"
Error (Lapka BAM): HTTP Error 503: Service Unavailable
Searching for "Lapka BAM"
Error (Lapka BAM): HTTP Error 503: Service Unavailable
Searching for "Lapka BAM"
Searching for "Lapka PEM"
Searching for "Epson Moverio BT-200"
Error (Epson Moverio BT-200): HTTP Error 503: Service Unavailable
Searching for "Epson Moverio BT-200"
Error (Epson Moverio BT-200): HTTP Error 503: Service Unavailable
Searching for "Epson Moverio BT-200"
Searching for "Narrative"
Searching for "Prep Pad"
Error (Prep Pad): HTTP Error 503: Service Unavailable
Searching for "Prep Pad"

In [7]:
# Cache the responses on disk as XML text, keyed by the long device name, so
# the searches do not have to be repeated.
with open('amazon_search_results_no-ignore.pickle', 'wb') as out_file:
    serialized = {
        long_names[index]: str(response)
        for index, response in search_results.items()
    }
    pickle.dump(serialized, out_file)

In [11]:
# Inspect the raw response for a single device.
# NOTE(review): the search cell above keys search_results by integer index, so
# this name-based lookup presumably ran against a differently keyed dict
# (e.g. one keyed by device name) - confirm before re-running.
search_results['Goji Play']

<?xml version="1.0" encoding="utf-8"?>
<ItemSearchResponse xmlns="http://webservices.amazon.com/AWSECommerceService/2013-08-01"><OperationRequest><HTTPHeaders><Header Name="UserAgent" Value="Python-urllib/3.6"/></HTTPHeaders><RequestId>aac2abad-6353-4892-bca1-9b34b0aa4d35</RequestId><Arguments><Argument Name="AWSAccessKeyId" Value="AKIAJES52AQ7KXTBCE6A"/><Argument Name="AssociateTag" Value="ballardt-20"/><Argument Name="Keywords" Value="Goji Play"/><Argument Name="Operation" Value="ItemSearch"/><Argument Name="ResponseGroup" Value="ItemAttributes, BrowseNodes, EditorialReview"/><Argument Name="SearchIndex" Value="Electronics"/><Argument Name="Service" Value="AWSECommerceService"/><Argument Name="Timestamp" Value="2017-10-12T18:54:43Z"/><Argument Name="Version" Value="2013-08-01"/><Argument Name="Signature" Value="+XVShT8dUo6mPMVBJhdWqKN5wCoFWMTCkYrGrIky9ZE="/></Arguments><RequestProcessingTime>0.2758211960000000</RequestProcessingTime></OperationRequest><Items><Request><IsValid>True</I

If the searches have already been run and saved, we can load the pickled results instead of querying the API again.

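> **TODO:** Why is this giving a TypeError? Because of old PyQt4? A more likely cause: the pickle holds BeautifulSoup objects, whose `str`-subclass attribute types (e.g. `NamespacedAttribute`, whose `__new__` takes both `prefix` and `name`) cannot be rebuilt by `pickle`. Pickling `str(result)` and re-parsing after loading should avoid this.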

In [1]:
import pickle

# Reload the search results cached by an earlier run.
# pickle.load can execute arbitrary code while deserializing, so only open
# pickle files that were produced locally and are trusted.
with open('amazon_search_results.pickle', 'rb') as f:
    data = pickle.load(f)

TypeError: __new__() missing 1 required positional argument: 'name'

### Section 2

To better visualize the items returned, we can represent their hierarchical categories as a tree.

> **NOTE:** Each tree is rendered to a PNG file named `<device>_<field>.png`.

In [14]:
# Setup
from ete3 import Tree, TreeNode, TreeStyle, TextFace, NodeStyle, faces, CircleFace

# Build one category tree per device/field from the BrowseNodes XML and render
# it to '<device>_<field>.png'.
#
# Per-tree bookkeeping lives in `nodes`, keyed by BrowseNodeId:
#   name        - category name shown above the branch
#   leaf_count  - how many times the category was an item's own BrowseNode
#   total_count - leaf_count plus how many times it appeared as an ancestor
#   treenode    - the ete3 node drawn for the category


def _padded_face(text, margin_top):
    """Return a TextFace for `text` using the margins shared by all labels."""
    face = TextFace(text)
    face.margin_right = 5
    face.margin_left = 30
    face.margin_bottom = 5
    face.margin_top = margin_top
    return face


def _new_category_node(node_id, label):
    """Return a TreeNode named `node_id` with `label` drawn above its branch."""
    node = TreeNode()
    node.name = node_id
    node.add_face(_padded_face(label, 10), column=0, position='branch-top')
    return node


for device, fields in search_results.items():
    for field, pages in fields.items():
        print('Constructing tree for {} ({})...'.format(device, field))
        root = TreeNode()
        root.name = 'root'
        nodes = {}

        # Wrap all result pages in a single root element so they parse as one document
        combined_xml = '<AggResult>' + ''.join(str(p) for p in pages.values()) + '</AggResult>'

        # Direct children of <BrowseNodes> are the leaves (the actual product category)
        for leaf in BeautifulSoup(combined_xml, 'xml').select('BrowseNodes > BrowseNode'):
            leaf_id = leaf.find('BrowseNodeId').string
            if leaf_id in nodes:
                leaf_entry = nodes[leaf_id]
                leaf_entry['leaf_count'] += 1
                leaf_entry['total_count'] += 1
            else:
                leaf_entry = {
                    'name': leaf.find('Name').string,
                    'leaf_count': 1,
                    'total_count': 1
                }
                nodes[leaf_id] = leaf_entry
                leaf_entry['treenode'] = _new_category_node(leaf_id, leaf_entry['name'])

            # Walk up the nested <Ancestors> elements, linking each level to its parent
            child_node = leaf_entry['treenode']
            ancestors = leaf.find('Ancestors')
            while ancestors is not None:
                ancestor_id = ancestors.find('BrowseNodeId').string
                name_tag = ancestors.find('Name')
                ancestor_name = 'NO_NAME' if name_tag is None else name_tag.string

                if ancestor_id in nodes:
                    parent_entry = nodes[ancestor_id]
                    parent_entry['total_count'] += 1
                else:
                    parent_entry = {
                        'name': ancestor_name,
                        'leaf_count': 0,
                        'total_count': 1
                    }
                    nodes[ancestor_id] = parent_entry
                    parent_entry['treenode'] = _new_category_node(ancestor_id, parent_entry['name'])
                parent_node = parent_entry['treenode']

                # Attach the child only once per parent
                if not any(kid.name == child_node.name for kid in parent_node.children):
                    parent_node.add_child(child_node)
                child_node = parent_node
                ancestors = ancestors.find('Ancestors')

            # Whatever ended up on top of the chain hangs off the synthetic root
            if not any(top.name == child_node.name for top in root.children):
                root.add_child(child_node)

        ts = TreeStyle()
        ts.show_leaf_name = False
        ts.title.add_face(TextFace('{} (n=10)'.format(device), fsize=10), column=0)
        for entry in nodes.values():
            treenode = entry['treenode']
            leaves_face = _padded_face('{} leaves'.format(entry['leaf_count']), 5)
            total_face = _padded_face('{} total'.format(entry['total_count']), 5)
            treenode.add_face(leaves_face, column=0, position='branch-bottom')
            treenode.add_face(total_face, column=0, position='branch-bottom')
            # Node size scales with how often the category was seen; categories
            # that were a leaf at least once are red, pure ancestors are blue.
            treenode.img_style['size'] = entry['total_count'] + 3
            treenode.img_style['fgcolor'] = '#e83a3a' if entry['leaf_count'] > 0 else 'blue'

        root.render(file_name='{}_{}.png'.format(device, field), tree_style=ts)
        print('Done')

Constructing tree for Insteon_Camera (O)...
Done
Constructing tree for Insteon_Camera (CN)...
Done
Constructing tree for Dropcam (O)...
Done
Constructing tree for Dropcam (CN)...
Done
Constructing tree for LiFX_Light-Bulb (O)...
Done
Constructing tree for LiFX_Light-Bulb (CN)...
Done
Constructing tree for SmartThings (O)...
Done
Constructing tree for SmartThings (emailAddress)...
Done
Constructing tree for SmartThings (example)...
Done
Constructing tree for Amazon_Echo (O)...
Done
Constructing tree for Amazon_Echo (CN)...
Done
Constructing tree for Samsung_SmartCam (O)...
Done
Constructing tree for Samsung_SmartCam (CN)...
Done
Constructing tree for HP_Printer (O)...
Done
Constructing tree for HP_Printer (CN)...
Done
Constructing tree for Pix-Star_Photo-Frame (CN)...
Done
Constructing tree for TP-Link_Smart-Plug (O)...
Done
Constructing tree for TP-Link_Smart-Plug (CN)...
Done
Constructing tree for Netatmo_Welcome (O)...
Done
Constructing tree for Netatmo_Welcome (emailAddress)...
Done

### Section 3

We may have poor keywords if a search yields products from many different brands. One way to find the best keyword to use is to look at the results from the O field, the CN field, and the emailAddress field, and see how many unique brands each one returns. The field with the highest single-brand density is probably the best one.

> **TODO:** Try to think of a way to consolidate different ways of saying the same company (e.g. HP & Hewlett Packard)

In [25]:
from collections import Counter

# For every device/field keyword, print how many distinct brands the search
# returned and how many items each brand accounts for.
for device, fields in search_results.items():
    print()
    for field, pages in fields.items():
        # Wrap all result pages in a single root element so they parse as one document
        combined_xml = '<AggResult>' + ''.join(str(p) for p in pages.values()) + '</AggResult>'
        brand_tags = BeautifulSoup(combined_xml, 'xml').select('ItemAttributes > Brand')

        # Normalize brand names to lower-case alphanumerics so that spelling
        # variants such as "TP-Link" and "TP-LINK" are counted together.
        # `tag.string` is None for an empty <Brand/> element; treat it as ''.
        brand_counts = Counter(
            ''.join(ch for ch in (tag.string or '').lower() if ch.isalnum())
            for tag in brand_tags
        )

        print('{} unique brands for {} ({}) ({})'.format(len(brand_counts), device, field, device_keyword_fields[device][field]))
        for brand, count in brand_counts.items():
            print('  {:.<25}...{}'.format(brand, count))
        print()


12 unique brands for Samsung_SmartCam (O) (Hanwha Techwin)
  hanwhatechwinamerica........2
  samsungwisenet..............8
  tntelectron.................2
  unknown.....................1
  samungwisenet...............1
  samsunghanwhatechwin........8
  samsung.....................42
  hanwha......................2
  samsungtechwin..............1
  hanwhatechwin...............25
  samsungoptoelectronicsincdbahanwhatechwinam...1
  samsunghanwha...............4

1 unique brands for Samsung_SmartCam (CN) (samsungsmartcam)
  samsung.....................1


2 unique brands for TP-Link_Smart-Plug (O) (TP-LINK TECHNOLOGIES)
  tplinktechnologiescoltd.....2
  tplink......................96

0 unique brands for TP-Link_Smart-Plug (CN) (tplinkcloud)


28 unique brands for Amazon_Echo (O) (Amazon)
  mamamouth...................1
  missioncables...............2
  dtto........................1
  wemo........................1
  bluewind....................1
  smatree.....................1
  sideclick

1 unique brands for Triby_Speaker (CN) (invoxia)
  invoxia.....................10


54 unique brands for SmartThings (emailAddress) (smartthings)
  coolcam.....................1
  centralite..................1
  kwikset.....................3
  qubino......................2
  sensemother.................1
  smartthings.................2
  irisusainc..................2
  insteon.....................1
  monoprice...................4
  turcom......................2
  youthink....................1
  arducam.....................1
  2gig........................1
  samsung.....................2
  sensative...................1
  fibaro......................1
  leviton.....................1
  zipato......................2
  innovator...................1
  besense.....................2
  amazon......................2
  eyesighttechnologies........1
  logitech....................2
  ring........................1
  squaretrade.................1
  visonic.....................1
  securifi............

### Section 4

The Insteon Camera is a special case because its keywords are wrong but still return useful results. First, let's see if we can identify the vendor.

In [32]:
# List the distinct (un-normalized) brand names returned for each Insteon
# Camera keyword.
for field, pages in search_results['Insteon_Camera'].items():
    combined_xml = '<AggResult>' + ''.join(str(p) for p in pages.values()) + '</AggResult>'
    brand_tags = BeautifulSoup(combined_xml, 'xml').select('ItemAttributes > Brand')
    brand_names = {tag.string for tag in brand_tags}
    print('{} ({}): {}'.format(field, device_keyword_fields['Insteon_Camera'][field], brand_names))

O (IPCam): {'Sricam', 'Pyle', 'Mini Gadgets', 'NETGEAR', 'SERHOM', 'WECAM', 'ieGeek', 'BOSMA', 'Generic', 'Foscam', 'UNKNOWN', 'Fodsports', 'Sysvideo', 'PrettyMakeUp', 'SCS Enterprises', 'Honeywell', 'GT View', 'Besteye', 'SafeMart', 'Securityman', 'Arlo', 'Urban Security Group', 'Ikevision', 'UOKOO', 'VSTARCAM', 'LITB', 'Wasserstein', 'KUCAM', 'ZAVACA', 'YI', 'Aukora', 'CANAVIS', 'Sokos', 'FancyTech', 'IPCC', 'Trivision', 'Tekvision', 'Faittoo', 'LightInTheBox', 'Alytimes', 'KONLEN'}
CN (IPC): {'wsdcam', 'ZKDSUIPC', 'HD-IPC', 'Seagate', 'Ark Technology', 'Dahua', 'JOOAN', 'DE', 'Holmes', 'Techage', 'Western Digital', 'Generic', 'APC', 'Linksys', 'Sabrent', 'Zmodo', 'foneset', 'Y-Axis', 'Braun', 'HP', 'Omron', 'DYMO', 'Energizer', 'Logitech', 'Samsung', 'Panasonic', 'DahuaOEM', 'Ubiquiti Networks', 'SanDisk', 'P3', 'Savvypixel', 'Brother', 'Microsoft', 'Plugable', 'Ama Free', 'Canon', 'Texas Instruments', 'X-ElectroniS', 'ACDelco', 'Crystal Vision Technology', 'iShot Pro®', 'HD Camera'

### Section 5

In [None]:
# Rooted at Electronics > Categories
depth = 3
documents = {}

