# Web Scrape Amazon Page Notebook
This notebook scrapes television listings from Amazon and stores the scraped data in a MongoDB database.

In [2]:
# requests fetches pages, BeautifulSoup (from bs4) parses the HTML, and time is used to pause between requests
import requests
from bs4 import BeautifulSoup
import time
# Browser-like User-Agent header passed to every requests.get call in this notebook
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}

In [3]:
import pymongo

# Connect to the MongoDB server at 127.0.0.1:27017 and select the 'television_data' database
client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
db = client['television_data']

In [4]:
# Show the names of the databases on the connected server
client.list_database_names()

['example_database', 'lab_db', 'local', 'television_data']

In [5]:
# Create a collection (MongoDB's analogue of a table) to store the product information
product_info_collection = db['product_information']

In [7]:
# product_info_collection.delete_many({})  # uncomment to empty the collection before scraping again

In [8]:
# Number of documents currently in the collection (the empty filter matches every document)
product_info_collection.count_documents({})

0

# Web Scrape Part 1: Obtain Individual Item Links

## Part A: Obtain The First Page of Results

I will be web scraping Amazon's television results page, sorted by the default "Featured" option. I will first store the URL of each individual television.

The first results page has a different format than the following pages, so I will not create a function for it.

After I obtain the first few links I will add them to a new MongoDB collection.

Sometimes Amazon puts sponsored links in the results on the first page. This can be detected by the length of the results list: the page is requested again until splitting its HTML on `result_` yields exactly 25 pieces.

In [9]:
# The link to the first page of results for televisions on Amazon, sorted by the default "Featured" option.
# This URL is only used for page 1; later pages are built from a different URL template.
amazon_first_page_url_tvs = 'https://www.amazon.com/s/ref=lp_1266092011_nr_n_12?fst=as%3Aoff&rh=n%3A172282%2Cn%3A%21493964%2Cn%3A1266092011%2Cn%3A172659&bbn=1266092011&ie=UTF8&qid=1564525104&rnid=1266092011'


In [10]:
# Splitting the first-page HTML on 'result_' must give exactly this many pieces:
# the text before the first marker plus the result pieces that follow it.
EXPECTED_FIRST_PAGE_PIECES = 25
# Upper bound on retries so a layout change or a block page cannot loop forever.
MAX_FIRST_PAGE_ATTEMPTS = 50

first_page_html_split_by_result = []
length_first_page_html_split_by_result = 0

attempts_count = 1

while (length_first_page_html_split_by_result != EXPECTED_FIRST_PAGE_PIECES
       and attempts_count <= MAX_FIRST_PAGE_ATTEMPTS):

    # Short pause before every request so retries do not hammer the server
    time.sleep(.45)

    # Access the first page of search results for televisions on Amazon.
    # requests raises on a timeout or connection error; treat that as a failed attempt and retry.
    try:
        amazon_first_page_url_tvs_html = requests.get(amazon_first_page_url_tvs, headers=headers, timeout=5).text
    except requests.RequestException as error:
        print("Attempt {} failed: {}".format(attempts_count, error))
        attempts_count += 1
        continue

    amazon_first_page_url_tvs_content = BeautifulSoup(amazon_first_page_url_tvs_html, 'html.parser')

    # Re-serialise the parsed page and cut it at every 'result_' marker
    first_page_html_split_by_result = str(amazon_first_page_url_tvs_content).split('result_')
    length_first_page_html_split_by_result = len(first_page_html_split_by_result)

    print("Attempts to load first page : {}".format(attempts_count))
    attempts_count += 1

# Fail loudly instead of letting later cells run on a page with an unexpected layout
if length_first_page_html_split_by_result != EXPECTED_FIRST_PAGE_PIECES:
    raise RuntimeError(
        "First page never split into {} pieces after {} attempts (last split gave {})".format(
            EXPECTED_FIRST_PAGE_PIECES,
            MAX_FIRST_PAGE_ATTEMPTS,
            length_first_page_html_split_by_result,
        )
    )
    
    

Attempts to load first page : 1


In [11]:
# Build one {'url': ...} document per piece of the first page.
# The piece before the first 'result_' marker is skipped.
first_page_tv_links = []

for result_piece in first_page_html_split_by_result[1:]:

    result_soup = BeautifulSoup(result_piece, 'html.parser')
    title_anchor = result_soup.find('a', class_="a-link-normal a-text-normal")

    # Keep only the part of the link that precedes '/ref'
    tv_link = title_anchor['href'].split('/ref')[0]

    first_page_tv_links.append({'url': tv_link})
    

In [12]:
# Number of television links collected from the first page
len(first_page_tv_links)

24

## Part B: Add The First Page Of Results To Mongo

In [13]:
# Bulk-insert the first-page URL documents into the collection
insertion_results = product_info_collection.insert_many(first_page_tv_links)

In [14]:
# Print the first five stored documents, projecting only the 'url' field
query_1 = product_info_collection.find({}, {'_id': 0, 'url': 1}).limit(5)
for item in query_1:
    print(item)

{'url': 'https://www.amazon.com/Toshiba-32LF221U19-32-inch-720p-Smart/dp/B07FPR6FMJ'}
{'url': 'https://www.amazon.com/Samsung-UN65RU7100FXZA-Flat-UHD-Smart/dp/B07NC96MBL'}
{'url': 'https://www.amazon.com/TCL-50S425-inch-Smart-Roku/dp/B07JK98NNQ'}
{'url': 'https://www.amazon.com/TCL-40S325-Inch-1080p-Smart/dp/B07GB61TQR'}
{'url': 'https://www.amazon.com/Insignia-NS-50DF710NA19-50-inch-Ultra-Smart/dp/B07FPQ343D'}


# Web Scrape Part 2: Obtain The Subsequent Pages of Results

In [15]:
def return_page_tv_links(url, max_attempts=104, retry_delay=.45):
    """Scrape one Amazon results page and return its television links.

    Parameters
    ----------
    url : str
        Address of a results page (page 2 onwards; the first page uses a
        different layout).
    max_attempts : int, optional
        How many times to request and parse the page before giving up.
    retry_delay : float, optional
        Seconds to wait between failed attempts.

    Returns
    -------
    list of dict
        Up to 24 ``{'url': 'https://www.amazon.com/...'}`` documents. The list
        is empty when every attempt failed or the page held no links.
    """
    html_list_of_tv_urls = []
    last_error = None

    for attempt in range(1, max_attempts + 1):

        try:

            amazon_page_tv_html = requests.get(url, headers=headers, timeout=5).text
            amazon_page_tv_html_content = BeautifulSoup(amazon_page_tv_html, 'html.parser')

            # find() returns None when the results container is missing, so the
            # find_all() below raises AttributeError and the attempt is retried.
            html_tv_urls = amazon_page_tv_html_content.find('div', class_ = 's-result-list s-search-results sg-row')
            html_list_of_tv_urls = html_tv_urls.find_all('a', class_ = 'a-link-normal a-text-normal')[:24]
            break

        except Exception as error:
            # Exception (not a bare except) so Ctrl-C can still interrupt a long scrape
            last_error = error
            if attempt < max_attempts:
                time.sleep(retry_delay)

    else:
        # Loop ended without a successful parse
        print('Giving up on {} after {} attempts: {}'.format(url, max_attempts, last_error))

    # hrefs on these pages are site-relative; keep the part before '/ref' and make it absolute
    return [
        {'url': 'https://www.amazon.com' + anchor['href'].split('/ref')[0]}
        for anchor in html_list_of_tv_urls
    ]

In [16]:
# Try the helper on a single results page (page=13 in the URL) and show the first link it returns
return_page_tv_links('https://www.amazon.com/s?rh=n%3A172282%2Cn%3A%21493964%2Cn%3A1266092011%2Cn%3A172659&page=13&qid=1564698515&ref=lp_172659_pg_2')[0]

{'url': 'https://www.amazon.com/VIZIO-Class-720P-Smart-D32h-F1/dp/B07DC77DPT'}

In [17]:
def obtain_urls(start_page = False, end_page = False, delay_seconds = 2):
    """Scrape a range of results pages and store their television links in MongoDB.

    Parameters
    ----------
    start_page : int, optional
        First results page to scrape. Any falsy value (the default) means page 2.
    end_page : int, optional
        Last results page to scrape, inclusive. Any falsy value (the default)
        means page 100.
    delay_seconds : float, optional
        Pause before each page request.

    Each page's links are inserted into ``product_info_collection`` and the
    running document count is printed. Pages that yield no links are skipped.
    """
    amazon_tvs_url = 'https://www.amazon.com/s?rh=n%3A172282%2Cn%3A%21493964%2Cn%3A1266092011%2Cn%3A172659&page={}&qid=1564698515&ref=lp_172659_pg_2'

    if not start_page:
        start_page = 2

    if not end_page:
        end_page = 100

    for page_number in range(start_page, end_page+1):

        time.sleep(delay_seconds)
        one_page_of_amazon_tvs_url = amazon_tvs_url.format(str(page_number))
        page_tv_links = return_page_tv_links(one_page_of_amazon_tvs_url)

        # insert_many rejects an empty list, so one bad page must not abort the whole run
        if not page_tv_links:
            print('No tv urls found on page {}; skipping'.format(page_number))
            continue

        product_info_collection.insert_many(page_tv_links)
        print('The total number of tv urls added are: {}'.format(product_info_collection.count_documents({})))

In [18]:
# Scrape result pages 2 through 5 (start_page falls back to 2 when not given)
obtain_urls(end_page = 5)

The total number of tv urls added are: 48
The total number of tv urls added are: 72
The total number of tv urls added are: 96
The total number of tv urls added are: 120


# Web Scrape Part 3: Obtaining Product Information From The Product Page

In [19]:
# Print the first five stored documents
sample_documents = product_info_collection.find({}).limit(5)
for document in sample_documents:
    print(document)

{'_id': ObjectId('5d55e36acfb34a3047024e8a'), 'url': 'https://www.amazon.com/Toshiba-32LF221U19-32-inch-720p-Smart/dp/B07FPR6FMJ'}
{'_id': ObjectId('5d55e36acfb34a3047024e8b'), 'url': 'https://www.amazon.com/Samsung-UN65RU7100FXZA-Flat-UHD-Smart/dp/B07NC96MBL'}
{'_id': ObjectId('5d55e36acfb34a3047024e8c'), 'url': 'https://www.amazon.com/TCL-50S425-inch-Smart-Roku/dp/B07JK98NNQ'}
{'_id': ObjectId('5d55e36acfb34a3047024e8d'), 'url': 'https://www.amazon.com/TCL-40S325-Inch-1080p-Smart/dp/B07GB61TQR'}
{'_id': ObjectId('5d55e36acfb34a3047024e8e'), 'url': 'https://www.amazon.com/Insignia-NS-50DF710NA19-50-inch-Ultra-Smart/dp/B07FPQ343D'}


In [20]:
def _parse_price(price_tag):
    """Return the dollar amount in ``price_tag.text`` as a float, or the string 'None'."""
    try:
        amount_text = price_tag.text.split('$')[1].split("\n")[0]
        # Drop thousands separators so prices such as "1,299.99" convert cleanly
        return float(amount_text.replace(',', ''))
    except (AttributeError, IndexError, ValueError):
        # No price element, no '$' in its text, or a non-numeric amount.
        # The string 'None' (not the None object) is the existing sentinel for a missing price.
        return 'None'


def _extract_tech_specs(tech_specs_table):
    """Return a {header text: value text} dict from the rows of a spec table (or {} for None)."""
    tech_specs = {}

    if tech_specs_table is None:
        return tech_specs

    header_cells = tech_specs_table.find_all('th')
    value_cells = tech_specs_table.find_all('td')

    # Headers and values are paired by position; zip stops at the shorter list
    for header_cell, value_cell in zip(header_cells, value_cells):
        try:
            # Take the second line of the cell text and keep what follows its last double space
            tech_key = header_cell.text.split('\n')[1].split('  ')[-1]
            tech_value = value_cell.text.split('\n')[1].split('  ')[-1]
        except IndexError:
            # Cell text has no second line; skip this row
            continue

        tech_specs[tech_key] = tech_value

    return tech_specs


def obtain_product_info_dict(tv_url):
    """Fetch one product page and return its price and technical specifications.

    Parameters
    ----------
    tv_url : str
        Address of the product page.

    Returns
    -------
    dict
        ``'price'`` maps to a float, or to the string ``'None'`` when no price
        could be parsed. Every row read from the 'a-keyvalue prodDetTable'
        table adds one further key/value pair; the table is simply skipped
        when the page does not contain it.

    Errors raised by ``requests.get`` (timeouts, connection failures) propagate
    to the caller.
    """
    individual_tv_page = requests.get(tv_url, headers=headers, timeout=5).text
    individual_tv_page_content = BeautifulSoup(individual_tv_page, 'html.parser')

    price_number_with_spaces = individual_tv_page_content.find(class_ = 'a-size-medium a-color-price')
    product_info_dict = {'price': _parse_price(price_number_with_spaces)}

    # The page is parsed once: re-reading the same HTML cannot make a missing table appear
    tech_specs_table = individual_tv_page_content.find('table', class_ = 'a-keyvalue prodDetTable')
    product_info_dict.update(_extract_tech_specs(tech_specs_table))

    return product_info_dict

In [None]:
# Try the product scraper on a single product page
obtain_product_info_dict('https://www.amazon.com/Toshiba-32LF221U19-32-inch-720p-Smart/dp/B07FPR6FMJ')

In [None]:
# Try the product scraper on a second product page
obtain_product_info_dict('https://www.amazon.com/VIZIO-Class-LED-HDTV-Smart/dp/B07P1NFNKX')

# Update The MongoDB With Product Information

In [None]:
# Read every stored URL up front so no server cursor is held open while the
# slow page-by-page scraping below runs and modifies the same collection.
url_query = product_info_collection.find({}, {'_id': 0, 'url': 1})
stored_tv_urls = [item['url'] for item in url_query if 'url' in item]

for tv_url in stored_tv_urls:

    try:
        product_info_dictionary = obtain_product_info_dict(tv_url)
        update = {'$set': product_info_dictionary}
        # Every document sharing this URL receives the scraped fields
        product_info_collection.update_many({'url': tv_url}, update)

    except Exception as error:
        # Exception (not a bare except) so Ctrl-C can still stop the run;
        # report the URL that failed together with the reason.
        print(tv_url)
        print('    failed with {}: {}'.format(type(error).__name__, error))
    

In [None]:
# Print five documents without their '_id' field
documents_preview = product_info_collection.find({}, {'_id':0}).limit(5)
for document in documents_preview:
    print(document)

# Export The Mongo Database

If you want to share the database you can export it.

In a terminal window (not the mongo shell), run a command of the following form:

mongodump --collection myCollection --db test

For instance:

mongodump --collection product_information --db television_data