In [1]:
import urllib.parse
import hmac
import hashlib
import base64
import requests
from datetime import datetime
import cred


def getResponse(operation, keyword, index, group,page):
    param={'Service':'AWSECommerceService'}
    param['AWSAccessKeyId']=cred.login['AWSAccessKeyId']
    param['AssociateTag']=cred.login['AssociateTag']
    param['Operation']=operation
    param['Keywords']=keyword
    param['SearchIndex']=index
    param['ResponseGroup']=group
    param['ItemPage']=page
    param['Version']='2013-08-01'
    param['Timestamp']=datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
    s=urllib.parse.urlencode(sorted(param.items()))
    s="""GET\nwebservices.amazon.com\n/onca/xml\n"""+s
    s=s.encode('utf-8')
    secret=cred.login['SecretKey'].encode('utf-8')
    code=base64.b64encode(hmac.new(secret, msg=s, digestmod=hashlib.sha256).digest())
    param['Signature']=code
    response=requests.get('http://webservices.amazon.com/onca/xml',param)
    print(response.url)
    return response.text

In [19]:
import xml.etree.ElementTree as ET
import csv
import os
import urllib.request

def extractData(xmlText, keyword):
    root = ET.fromstring(xmlText)
    prefix=root.tag.split("}")[0][1:]
    ns={'amazon':prefix}
    head=['ASIN','Title','Keyword','ImageURL','Brand','Color','Features','Height','Length','Width','Weight','Price','Similars']
    row={}
    filename='data-table'+datetime.utcnow().strftime("%Y-%m-%dT%H:%M")+'.csv'
    fileExists=os.path.exists(filename)
    totalpages=""
    
    imgdir='./Amazon-images/'+keyword
    if not os.path.exists(imgdir):
        os.makedirs(imgdir)
    
    with open(filename, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=head)
        if not fileExists:
            writer.writeheader()
        
        for items in root.findall('amazon:Items',ns):
            tp=items.find('amazon:TotalPages',ns)
            if tp is not None:
                totalpages=tp.text  
                print(totalpages)
            for item in items.findall('amazon:Item',ns):
                #ASIN
                asin=item.find('amazon:ASIN',ns)
                if asin is not None:
                    row['ASIN']=asin.text
                #keyword
                row['Keyword']=keyword
        
                #images
        
                imagesets=item.find('amazon:ImageSets',ns)
                if imagesets is not None:
                    imageset=imagesets.find('amazon:ImageSet',ns)
                    if imageset is not None:
                        largeimage=imageset.find('amazon:LargeImage',ns)
                        if largeimage is not None:
                            imageurl=largeimage.find('amazon:URL',ns)
                            if imageurl is not None:
                                row['ImageURL']=imageurl.text 
                                urllib.request.urlretrieve(imageurl.text, imgdir+'/'+row['ASIN']+'.jpg')
                   
            
                   
                
            
                #attributes
        
                features=""
        
                for attr in item.findall('amazon:ItemAttributes',ns):
                    #brand
                    brand=attr.find('amazon:Brand',ns)
                    if brand is not None:
                        row['Brand']=brand.text
                    #color    
                    color=attr.find('amazon:Color',ns)
                    if color is not None:
                        row['Color']=color.text
                    #dimensions    
                    dime=attr.find('amazon:ItemDimensions',ns)
                    if dime is not None:
                        height=dime.find('amazon:Height',ns)
                        if height is not None:
                            row['Height']=height.text
                        length=dime.find('amazon:Length',ns)
                        if length is not None:
                            row['Length']=length.text
                        width=dime.find('amazon:Width',ns)
                        if width is not None:
                            row['Width']=width.text
                        weight=dime.find('amazon:Weight',ns)
                        if weight is not None:
                            row['Weight']=weight.text
                    #features
                    row['Features']=', '.join([f.text for f in attr.findall('amazon:Feature',ns)])
                    #price
                    listprice=attr.find('amazon:ListPrice',ns)
                    if listprice is not None:
                        price=listprice.find('amazon:FormattedPrice',ns)
                        if price is not None:
                            row['Price']=price.text
                    #title 
                    title=attr.find('amazon:Title',ns)
                    if title is not None:
                        row['Title']=title.text
                #similars 
                for similars in item.findall('amazon:SimilarProducts',ns):
                    simList=similars.findall('amazon:SimilarProduct',ns)
                    if simList is not None:
                        row['Similars']=', '.join([s.find('amazon:ASIN',ns).text for s in simList])
                
                #print(row)
                writer.writerow(row)
    return totalpages

In [20]:
keywords=['table','chair','sofa']
for keyword in keywords:
    xmlText=getResponse('ItemSearch',keyword,'HomeGarden','Images,ItemAttributes,Similarities',1)
    numberofPages=extractData(xmlText,keyword)
    for p in range(2,3):
        xmlText=getResponse('ItemSearch',keyword,'HomeGarden','Images,ItemAttributes,Similarities',p)
        extractData(xmlText,keyword)

http://webservices.amazon.com/onca/xml?AWSAccessKeyId=AKIAJN4PYBNAGHN74C3A&Version=2013-08-01&ResponseGroup=Images%2CItemAttributes%2CSimilarities&Signature=d39ear8WpspF5BK81uBJECnzxF0LIu8JwPGk9U5WoXw%3D&Operation=ItemSearch&Timestamp=2017-06-07T16%3A24%3A12&Service=AWSECommerceService&ItemPage=1&SearchIndex=HomeGarden&Keywords=table&AssociateTag=summerproje08-20
84599
http://webservices.amazon.com/onca/xml?AWSAccessKeyId=AKIAJN4PYBNAGHN74C3A&Version=2013-08-01&ResponseGroup=Images%2CItemAttributes%2CSimilarities&Signature=OVspmxngDSKZMSt6S2x4Mm6n40%2BqnI0JcP5aWOzghGc%3D&Operation=ItemSearch&Timestamp=2017-06-07T16%3A24%3A22&Service=AWSECommerceService&ItemPage=2&SearchIndex=HomeGarden&Keywords=table&AssociateTag=summerproje08-20
84603
http://webservices.amazon.com/onca/xml?AWSAccessKeyId=AKIAJN4PYBNAGHN74C3A&Version=2013-08-01&ResponseGroup=Images%2CItemAttributes%2CSimilarities&Signature=RBIYmSQ3Ym33iXFeYby70Mel0Gk5%2BHJt5C5wyv30zz0%3D&Operation=ItemSearch&Timestamp=2017-06-07T16%3A2

In [15]:
import os
if not os.path.exists('./Amazon-images/table'):
    os.makedirs('./Amazon-images/table')

In [None]:
import 