# Amazon Web Scraping

### Section 1

We can limit ourselves to just the "Electronics" SearchIndex, since any IoT device will have at least some generic category there. When we do this, the number of items returned is fairly small in some cases.

> **NOTE:** Maximum number of items returned is 100

In [3]:
# Setup
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import bottlenose
import pickle
import time

amazon = bottlenose.Amazon(
    'AKIAJES52AQ7KXTBCE6A',
    'rxgPzGinrfRZPFe50+tMo4eYknhkWfmhgEZJ2tyb',
    'ballardt-20',
    Parser=lambda text: BeautifulSoup(text, 'xml'))

device_keywords = {
    'Smart_Things_ideal': 'SmartThings',
    'Smart_Things_O-field': 'Physical',
    'Amazon_Echo': 'Amazon',
    'Netatmo_Welcome': 'Netatmo',
    'Samsung_Smartcam': 'Hanwha',
    'Dropcam': 'Dropcam',
    'Insteon_Camera': 'IPC',
    #'Belkin_Wemo_Switch': 'xbcs',
    'TP-Link_Smart_Plug': 'TP-LINK',
    #'iHome': 'evrythng',
    #'Belkin_Wemo_Motion_Sensor': 'xbcs',
    'LiFX_Smart_Bulb': 'LIFX',
    'Triby_Speaker': 'invoxia',
    'Pix-Star_Photo_Frame': 'pix-star',
    'HP_Printer': 'HP'
}


# Search for each device_keyword (only Electronics)
search_results = {}
for device,keyword in device_keywords.items():
    search_results[device] = {}
    has_more = True
    page = 1
    while has_more:
        time.sleep(1.1)
        try:
            print('Searching for "{}", page {}'.format(keyword, page))
            search_results[device][page] = amazon.ItemSearch(
                #Keywords=keyword,
                Brand=keyword,
                SearchIndex='Electronics',
                ResponseGroup='ItemAttributes, BrowseNodes',
                ItemPage=page
            )
            if len(search_results[device][page].find_all('Item')) == 10:
                page += 1
            else:    
                has_more = False
        except HTTPError as err:
            print('Error ({}, page {}): {}'.format(device, page, err))
            has_more = False
print()

# Save to a file so we don't have to do this each time
#with open('amazon_search_results.pickle', 'wb') as f:
#    pickle.dump(search_results, f)

# Show how many items exist for each device (keyword)
result_counts = {}
for device,pages in search_results.items():
    result_counts[device] = 0
    for page,result in pages.items():
        result_counts[device] += len(result.find_all('Item'))
    print('{0:.<25}|{1:.<15}|{2}'.format(device, device_keywords[device], result_counts[device]))

Searching for "Amazon", page 1
Searching for "Amazon", page 2
Searching for "Amazon", page 3
Searching for "Amazon", page 4
Searching for "Amazon", page 5
Searching for "Amazon", page 6
Searching for "Amazon", page 7
Error (Amazon_Echo, page 7): HTTP Error 503: Service Unavailable
Searching for "HP", page 1
Searching for "HP", page 2
Searching for "HP", page 3
Searching for "HP", page 4
Searching for "HP", page 5
Searching for "HP", page 6
Searching for "HP", page 7
Searching for "HP", page 8
Searching for "HP", page 9
Searching for "HP", page 10
Searching for "HP", page 11
Searching for "Dropcam", page 1
Searching for "TP-LINK", page 1
Searching for "TP-LINK", page 2
Searching for "TP-LINK", page 3
Searching for "TP-LINK", page 4
Searching for "TP-LINK", page 5
Searching for "TP-LINK", page 6
Searching for "TP-LINK", page 7
Searching for "TP-LINK", page 8
Searching for "TP-LINK", page 9
Searching for "TP-LINK", page 10
Searching for "TP-LINK", page 11
Searching for "pix-star", page 1


If we've already done this before, we can just load it.

> **TODO:** Why is this giving a TypeError? Because of old PyQt4?

In [31]:
import pickle

with open('amazon_search_results.pickle', 'rb') as f:
    data = pickle.load(f)

TypeError: __new__() missing 1 required positional argument: 'name'

### Section 2

To better visualize the items returned, we can represent their hierarchical categories as a tree.

> **NOTE:** Trees will be saved as images

In [4]:
# Setup
from ete3 import Tree, TreeNode, TreeStyle, TextFace, NodeStyle, faces, CircleFace

# Convert BrowseNodes XML into Python dicts for ease of use
for device,pages in search_results.items():
    print('Constructing tree for {}...'.format(device))
    root = TreeNode()
    root.name = 'root'
    nodes = {}
    all_pages = str(pages[1])#''.join(list(map(lambda x: str(x), pages.values())))
    all_pages = '<AggResult>'+all_pages+'</AggResult>'
    
    # Get leaves (i.e. actual product category)
    leaf_nodes = BeautifulSoup(all_pages, 'xml').select('BrowseNodes > BrowseNode')
        
    for leaf in leaf_nodes:
        leaf_id = leaf.find('BrowseNodeId').string
        if leaf_id not in nodes:
            nodes[leaf_id] = {
                'name': leaf.find('Name').string,
                'leaf_count': 1,
                'total_count': 1
            }
            leaf_treenode = TreeNode()
            leaf_treenode.name = leaf_id
            # Add text
            category_name = TextFace(nodes[leaf_id]['name'])
            category_name.margin_right = 5
            category_name.margin_left = 30
            category_name.margin_bottom = 5
            category_name.margin_top = 10
            leaf_treenode.add_face(category_name, column=0, position='branch-top')
            nodes[leaf_id]['treenode'] = leaf_treenode
        else:
            nodes[leaf_id]['leaf_count'] += 1
            nodes[leaf_id]['total_count'] += 1
            leaf_treenode = nodes[leaf_id]['treenode']
            
        # Get ancestors
        ancestor_node = leaf.find('Ancestors')
        descendent_treenode = leaf_treenode
        while ancestor_node is not None:
            ancestor_id = ancestor_node.find('BrowseNodeId').string
            ancestor_name = ancestor_node.find('Name')
            if ancestor_name is not None:
                ancestor_name = ancestor_name.string
            else:
                ancestor_name = 'NO_NAME'
            if ancestor_id not in nodes:
                nodes[ancestor_id] = {
                    'name': ancestor_name,
                    'leaf_count': 0,
                    'total_count': 1
                }
                ancestor_treenode = TreeNode()
                ancestor_treenode.name = ancestor_id
                category_name = TextFace(nodes[ancestor_id]['name'])
                category_name.margin_right = 5
                category_name.margin_left = 30
                category_name.margin_bottom = 5
                category_name.margin_top = 10
                ancestor_treenode.add_face(category_name, column=0, position='branch-top')
                nodes[ancestor_id]['treenode'] = ancestor_treenode
            else:
                nodes[ancestor_id]['total_count'] += 1
                ancestor_treenode = nodes[ancestor_id]['treenode']
                
            # Attach children to ancestors
            child_names = [x.name for x in ancestor_treenode.children]
            if descendent_treenode.name not in child_names:
                ancestor_treenode.add_child(descendent_treenode)
            descendent_treenode = ancestor_treenode
            ancestor_node = ancestor_node.find('Ancestors')
            
        top_categories = [x.name for x in root.children]
        if descendent_treenode.name not in top_categories:
            root.add_child(descendent_treenode)
            
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.title.add_face(TextFace('{} (n=10)'.format(device), fsize=10), column=0)
    for n_id, n_data in nodes.items():
        leaf_count_text = TextFace('{} leaves'.format(n_data['leaf_count']))
        leaf_count_text.margin_right = 5
        leaf_count_text.margin_left = 30
        leaf_count_text.margin_bottom = 5
        leaf_count_text.margin_top = 5
        total_count_text = TextFace('{} total'.format(n_data['total_count']))
        total_count_text.margin_right = 5
        total_count_text.margin_left = 30
        total_count_text.margin_bottom = 5
        total_count_text.margin_top = 5
        n_data['treenode'].add_face(leaf_count_text, column=0, position='branch-bottom')
        n_data['treenode'].add_face(total_count_text, column=0, position='branch-bottom')
        n_style = NodeStyle()
        n_data['treenode'].img_style['size'] = n_data['total_count'] + 3
        if n_data['leaf_count'] > 0:
            n_color = '#e83a3a'
        else:
            n_color = 'blue'
        n_data['treenode'].img_style['fgcolor'] = n_color
        
    root.render(file_name='{}.png'.format(device), tree_style=ts)
    print('Done')

Constructing tree for Amazon_Echo...
Done
Constructing tree for HP_Printer...
Done
Constructing tree for Dropcam...
Done
Constructing tree for TP-Link_Smart_Plug...
Done
Constructing tree for Pix-Star_Photo_Frame...
Done
Constructing tree for Triby_Speaker...
Done
Constructing tree for Insteon_Camera...
Done
Constructing tree for Netatmo_Welcome...
Done
Constructing tree for Smart_Things_O-field...
Done
Constructing tree for LiFX_Smart_Bulb...
Done
Constructing tree for Samsung_Smartcam...
Done
Constructing tree for Smart_Things_ideal...
Done


### Section 3

While we don't extract the device search terms with code, we do follow an algorithm:

    if (certificate has emailAddress):
      use emailAddress domain name
    
    else if (certificate has O field):
      use first word of O field
      
    else if (certificate has CN field):
      use CN field domain name
      
    else:
      this item cannot be used with Amazon
      
It works for the most part, but there are some instances where the first word of the O field gives misleading results. More generally, to detect when we've gotten bad keywords, one thing we can do is check how many brands get returned in the search results. If we search for a proper brand, we should mostly get its items. Conversely, if we search for a generic keyword, we might get a lot of different brands, which would be a hint that the keyword is bad.

> **TODO:** If we can ever get around the 503 errors, see if different N values affect this. Also try to think of a way to consolidate different ways of saying the same company (e.g. HP & Hewlett Packard)

In [8]:
for device,pages in search_results.items():
    all_pages = ''.join(list(map(lambda x: str(x), pages.values())))#str(pages[1])
    all_pages = '<AggResult>'+all_pages+'</AggResult>'
    manufacturers = BeautifulSoup(all_pages, 'xml').select('Manufacturer')
    clean_mans = [''.join(c for c in m.string.split(' ',1)[0].lower() if c.isalnum()) for m in manufacturers]
    num_unique_brands = len(set(clean_mans))
    print('{} unique brands for {}'.format(num_unique_brands, device))

3 unique brands for Amazon_Echo
2 unique brands for HP_Printer
1 unique brands for Dropcam
4 unique brands for TP-Link_Smart_Plug
1 unique brands for Pix-Star_Photo_Frame
1 unique brands for Triby_Speaker
4 unique brands for Insteon_Camera
1 unique brands for Netatmo_Welcome
8 unique brands for Smart_Things_O-field
1 unique brands for LiFX_Smart_Bulb
4 unique brands for Samsung_Smartcam
3 unique brands for Smart_Things_ideal


In the same vein, we expect a search by brand to yield items from the brand being searched for. Even if all of the items returned belong to a single brand, if that brand is different than the one we searched for, we may have used a generic keyword.

In [12]:
for device,pages in search_results.items():
    all_pages = ''.join(list(map(lambda x: str(x), pages.values())))#str(pages[1])
    all_pages = '<AggResult>'+all_pages+'</AggResult>'
    manufacturers = BeautifulSoup(all_pages, 'xml').select('Brand')
    clean_mans = [''.join(c for c in m.string.split(' ',1)[0].lower() if c.isalnum()) for m in manufacturers]
    num_right = clean_mans.count(''.join(c for c in device_keywords[device].lower() if c.isalnum()))
    num_wrong = len(clean_mans) - num_right
    percent_right = num_right / len(clean_mans)
    print()
    print(device)
    #print(device_keywords[device])
    #print(clean_mans)
    print('{:.2f} ({} right, {} wrong, {} total)'.format(percent_right, num_right, num_wrong, len(clean_mans)))


Amazon_Echo
0.91 (60 right, 6 wrong, 66 total)

HP_Printer
1.00 (111 right, 0 wrong, 111 total)

Dropcam
0.80 (4 right, 1 wrong, 5 total)

TP-Link_Smart_Plug
0.99 (109 right, 1 wrong, 110 total)

Pix-Star_Photo_Frame
1.00 (2 right, 0 wrong, 2 total)

Triby_Speaker
1.00 (12 right, 0 wrong, 12 total)

Insteon_Camera
0.13 (10 right, 66 wrong, 76 total)

Netatmo_Welcome
1.00 (11 right, 0 wrong, 11 total)

Smart_Things_O-field
0.26 (18 right, 50 wrong, 68 total)

LiFX_Smart_Bulb
1.00 (2 right, 0 wrong, 2 total)

Samsung_Smartcam
0.52 (57 right, 53 wrong, 110 total)

Smart_Things_ideal
0.40 (8 right, 12 wrong, 20 total)


We should also be careful if the results from any of the 3 fields of interest (O, CN, and emailAddress) differ significantly from the rest

> **TODO:** This may require more effort to test, both in coding it up and determing what metric to define success by. Should we look at how many literal items cross over? Or differences in brand name? What if different numbers of items are returned? Etc.

In [None]:
device_keyword_fields = {
    'SmartThings': {
        'O': 'Physical',
        'emailAddress': 'smartthings'
    },
    'Amazon_Echo': {
        'O': 'Amazon',
        'CN': 'amazon'
    },
    'Netatmo_Welcome': {
        'O': 'Netatmo',
        'CN': 'netatmo',
        'emailAddress': 'netatmo'
    },
    'Samsung_SmartCam': {
        'O': 'Hanwha Techwin',
        'CN': 'samsungsmartcam'
    },
    'Dropcam': {
        'O': 'Dropcam',
        'CN': 'dropcam'
    },
    'Insteon_Camera': {
        'O': 'IPCam',
        'CN': 'IPC'
    },
    'TP-Link_Smart-Plug': {
        'O': 'TP-LINK',
        'CN': 'tplinkcloud'
    },
    'LiFX_Lightbulb': {
        'O': 'LIFX',
        'CN': 'lifx'
    },
    'Triby_Speaker': {
        'O': 'invoxia',
        'CN': 'invoxia',
        'emailAddress': 'invoxia'
    },
    'Pix-Star_Photo-Frame': {
        'CN': 'pix-star'
    },
    'HP_Printer': {
        'O': 'HP',
        'CN': 'hpeprint'
    }
}