# Amazon Web Scraping

### Section 1

We can limit ourselves to just the "Electronics" SearchIndex, since any IoT device will have at least some generic category there. When we do this, the number of items returned is fairly small in some cases.

> **NOTE:** The API returns at most 100 items per search (10 pages of 10 items each).

In [142]:
# Setup
import os
import time
from urllib.error import HTTPError

import bottlenose
from bs4 import BeautifulSoup

# Product Advertising API client. Credentials are read from the environment so
# that no secret is stored in the notebook; a missing variable raises KeyError
# here, before any request is attempted.
# NOTE(review): any key pair that was previously committed in this file should
# be treated as compromised and rotated.
amazon = bottlenose.Amazon(
    os.environ['AWS_ACCESS_KEY_ID'],
    os.environ['AWS_SECRET_ACCESS_KEY'],
    os.environ['AWS_ASSOCIATE_TAG'],
    # Parse every response as XML so results can be queried with BeautifulSoup.
    Parser=lambda text: BeautifulSoup(text, 'xml'))

# Search keyword for each device, keyed by device label.
# Both Belkin Wemo devices deliberately share the same keyword.
device_keywords = dict((
    ('Smart_Things', 'smartthings'),
    ('Amazon_Echo', 'amazon'),
    ('Netatmo_Welcome', 'netatmo'),
    ('Samsung_Smartcam', 'samsungsmartcam'),
    ('Dropcam', 'dropcam'),
    ('Belkin_Wemo_Switch', 'xbcs'),
    ('TP-Link_Smart_Plug', 'tplinkcloud'),
    ('iHome', 'evrythng'),
    ('Belkin_Wemo_Motion_Sensor', 'xbcs'),
    ('LiFX_Smart_Bulb', 'lifx'),
    ('Triby_Speaker', 'invoxia'),
    ('Pix-Star_Photo_Frame', 'pix-star'),
    ('HP_Printer', 'hpeprint'),
))


# Search for each device_keyword (only Electronics).
# search_results maps device label -> {page number -> parsed XML response}.
#
# ItemSearch serves 10 items per page and accepts ItemPage values up to 10, so
# paging is capped at MAX_PAGES rather than issuing an 11th request that the
# API can only reject.
ITEMS_PER_PAGE = 10
MAX_PAGES = 10

search_results = {}
for device, keyword in device_keywords.items():
    search_results[device] = {}
    for page in range(1, MAX_PAGES + 1):
        # Pause before every request (presumably to stay under the API's
        # request-rate limit).
        time.sleep(1)
        try:
            response = amazon.ItemSearch(
                Keywords=keyword,
                SearchIndex='Electronics',
                ResponseGroup='SearchBins, EditorialReview, ItemAttributes, BrowseNodes',
                ItemPage=page
            )
        except HTTPError as err:
            # Keep whatever pages were already fetched and move to the next device.
            print('Error ({}, page {}): {}'.format(device, page, err))
            break
        search_results[device][page] = response
        # Anything other than a full page means there are no further results.
        if len(response.find_all('Item')) != ITEMS_PER_PAGE:
            break
print()
    

# Tally the items found for each device across all of its fetched pages and
# print one dot-padded row per device: label | keyword | item count.
result_counts = {}
for device, pages in search_results.items():
    item_total = sum(len(result.find_all('Item')) for result in pages.values())
    result_counts[device] = item_total
    row = '{0:.<25}|{1:.<15}|{2}'.format(device, device_keywords[device], item_total)
    print(row)


Smart_Things.............|smartthings....|100
Samsung_Smartcam.........|samsungsmartcam|1
TP-Link_Smart_Plug.......|tplinkcloud....|0
HP_Printer...............|hpeprint.......|0
Netatmo_Welcome..........|netatmo........|8
Belkin_Wemo_Motion_Sensor|xbcs...........|5
Belkin_Wemo_Switch.......|xbcs...........|5
Amazon_Echo..............|amazon.........|29
Triby_Speaker............|invoxia........|24
Dropcam..................|dropcam........|100
LiFX_Smart_Bulb..........|lifx...........|2
Pix-Star_Photo_Frame.....|pix-star.......|88
iHome....................|evrythng.......|0


### Section 2

To better visualize the items returned, we can represent their hierarchical categories as a tree.

In [148]:
# Setup
from ete3 import Tree, TreeNode, TreeStyle, TextFace, NodeStyle, faces, CircleFace

# Convert the BrowseNodes XML of each device's first result page into a
# category tree and render it to '<device>.png'.
#
# For every device, `nodes` maps BrowseNodeId -> dict with:
#   'name'        category name shown on the node
#   'leaf_count'  how many items list this category directly
#   'total_count' how many times the category appears at all, either directly
#                 or as an ancestor of a listed category
#   'treenode'    the ete3 TreeNode drawn for this category


def _browse_node_name(element):
    """Return the text of the first Name tag under element, or 'NO_NAME'."""
    name_tag = element.find('Name')
    if name_tag is None:
        return 'NO_NAME'
    return name_tag.string


def _make_category_treenode(node_id, label):
    """Create a TreeNode named node_id with label drawn above its branch."""
    treenode = TreeNode()
    treenode.name = node_id
    category_name = TextFace(label)
    category_name.margin_right = 5
    category_name.margin_left = 30
    category_name.margin_bottom = 5
    category_name.margin_top = 10
    treenode.add_face(category_name, column=0, position='branch-top')
    return treenode


def _make_count_face(text):
    """Create the TextFace used for the count lines below a branch."""
    face = TextFace(text)
    face.margin_right = 5
    face.margin_left = 30
    face.margin_bottom = 5
    face.margin_top = 5
    return face


def _count_browse_node(nodes, node_id, name, is_leaf):
    """Record one occurrence of a browse node and return its TreeNode.

    The entry (and its TreeNode) is created the first time node_id is seen;
    the name from that first sighting is the one displayed.
    """
    entry = nodes.get(node_id)
    if entry is None:
        entry = {
            'name': name,
            'leaf_count': 0,
            'total_count': 0,
            'treenode': _make_category_treenode(node_id, name)
        }
        nodes[node_id] = entry
    entry['total_count'] += 1
    if is_leaf:
        entry['leaf_count'] += 1
    return entry['treenode']


def _attach_once(parent, child):
    """Add child under parent unless a child with the same name is present."""
    if child.name not in (existing.name for existing in parent.children):
        parent.add_child(child)


for device, pages in search_results.items():
    print('Constructing tree for {}...'.format(device))

    # Only the first result page is used. It is absent when that request
    # failed, in which case there is nothing to draw for this device.
    first_page = pages.get(1)
    if first_page is None:
        print('No results for {}, skipping'.format(device))
        continue

    root = TreeNode()
    root.name = 'root'
    nodes = {}

    # Re-parse the page inside a single wrapper element before querying it.
    all_pages = '<AggResult>' + str(first_page) + '</AggResult>'

    # Get leaves (i.e. actual product category)
    leaf_nodes = BeautifulSoup(all_pages, 'xml').select('BrowseNodes > BrowseNode')

    for leaf in leaf_nodes:
        leaf_treenode = _count_browse_node(
            nodes,
            leaf.find('BrowseNodeId').string,
            _browse_node_name(leaf),
            is_leaf=True)

        # Walk up the nested Ancestors elements, counting each ancestor and
        # linking each node beneath the one above it.
        descendent_treenode = leaf_treenode
        ancestor_node = leaf.find('Ancestors')
        while ancestor_node is not None:
            ancestor_treenode = _count_browse_node(
                nodes,
                ancestor_node.find('BrowseNodeId').string,
                _browse_node_name(ancestor_node),
                is_leaf=False)
            _attach_once(ancestor_treenode, descendent_treenode)
            descendent_treenode = ancestor_treenode
            ancestor_node = ancestor_node.find('Ancestors')

        # The topmost category reached hangs directly off the artificial root.
        _attach_once(root, descendent_treenode)

    ts = TreeStyle()
    ts.show_leaf_name = False
    # Report the number of items actually present on the page that was drawn.
    item_count = len(first_page.find_all('Item'))
    ts.title.add_face(TextFace('{} (n={})'.format(device, item_count), fsize=10), column=0)
    for n_data in nodes.values():
        treenode = n_data['treenode']
        treenode.add_face(
            _make_count_face('{} leaves'.format(n_data['leaf_count'])),
            column=0, position='branch-bottom')
        treenode.add_face(
            _make_count_face('{} total'.format(n_data['total_count'])),
            column=0, position='branch-bottom')
        # Node size grows with how often the category occurs.
        treenode.img_style['size'] = n_data['total_count'] + 3
        # Red marks categories that items list directly; blue marks
        # categories seen only as ancestors.
        treenode.img_style['fgcolor'] = '#e83a3a' if n_data['leaf_count'] > 0 else 'blue'

    root.render(file_name='{}.png'.format(device), tree_style=ts)
    print('Done')

Constructing tree for Smart_Things...
Done
Constructing tree for Samsung_Smartcam...
Done
Constructing tree for TP-Link_Smart_Plug...
Done
Constructing tree for HP_Printer...
Done
Constructing tree for Netatmo_Welcome...
Done
Constructing tree for Belkin_Wemo_Motion_Sensor...
Done
Constructing tree for Belkin_Wemo_Switch...
Done
Constructing tree for Amazon_Echo...
Done
Constructing tree for Triby_Speaker...
Done
Constructing tree for Dropcam...
Done
Constructing tree for LiFX_Smart_Bulb...
Done
Constructing tree for Pix-Star_Photo_Frame...
Done
Constructing tree for iHome...
Done
