# Web Scrape Amazon Page Notebook
This notebook scrapes television listings from Amazon and stores the scraped data in a MongoDB database.

In [None]:
#Import requests to download pages and BeautifulSoup (from the bs4 library) to parse their HTML
import requests
from bs4 import BeautifulSoup
#Browser-style User-Agent header that is passed along with the requests.get calls in this notebook
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}

In [None]:
import pymongo

#Client for the MongoDB server at 127.0.0.1, port 27017
client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
#Handle to the 'television_data' database that holds the scraped data
db = client['television_data']

In [None]:
#List the names of the databases available on the connected MongoDB server
client.list_database_names()

In [None]:
#Handle to the 'product_information' collection (table); each document stored in it holds one product 'url'
product_info_collection = db['product_information']

# Webscrape Part 1: Obtain Individual Item Links

## Part A: Obtain The First Page of Results

I will be web scraping Amazon's television search results, sorted by the default "Featured" option. I will first store the URL of each individual television.

The first results page has a different format from the following pages. For this reason, I handle it with one-off code rather than a reusable function.

After I obtain the links from the first page, I will add them to a new MongoDB collection.

In [None]:
#URL of the first page of television search results on Amazon, sorted by the default "Featured" option
#Later result pages are fetched through a different URL pattern that carries a page number
amazon_first_page_url_tvs = 'https://www.amazon.com/s/ref=lp_1266092011_nr_n_12?fst=as%3Aoff&rh=n%3A172282%2Cn%3A%21493964%2Cn%3A1266092011%2Cn%3A172659&bbn=1266092011&ie=UTF8&qid=1564525104&rnid=1266092011'


In [None]:
#Access the first page of search results for televisions on Amazon
#The request is attempted once: it raises if no response arrives within the 5 second timeout
amazon_first_page_response = requests.get(amazon_first_page_url_tvs, headers=headers, timeout=5)
#Stop here on a 4xx/5xx answer instead of parsing an error page as if it were search results
amazon_first_page_response.raise_for_status()
amazon_first_page_url_tvs_html = amazon_first_page_response.text
#Parse the downloaded HTML so the product links can be extracted from it
amazon_first_page_url_tvs_content = BeautifulSoup(amazon_first_page_url_tvs_html, 'html.parser')


In [None]:
#Cut the page's HTML text into pieces at every occurrence of the marker 'result_'
#Presumably each listing's element id starts with 'result_', giving one piece per listing - confirm against the page source
first_page_html_split_by_result = str(amazon_first_page_url_tvs_content).split('result_')

In [None]:
from urllib.parse import urljoin

#One {'url': ...} document per television link found on the first results page
first_page_tv_links = []

#Element 0 is the text before the first 'result_' marker, so it is skipped
for result_fragment in first_page_html_split_by_result[1:]:

    product_anchor = BeautifulSoup(result_fragment, 'html.parser').find('a', class_="a-link-normal a-text-normal")

    #A fragment may hold no product link (find returns None); skip it instead of failing on None['href']
    if product_anchor is None or not product_anchor.get('href'):
        continue

    #Cut the link at '/ref', then make it absolute; urljoin leaves already-absolute links unchanged
    tv_link = urljoin('https://www.amazon.com', product_anchor['href'].split('/ref')[0])

    first_page_tv_links.append({'url': tv_link})
    

In [None]:
#Number of television links collected from the first results page
len(first_page_tv_links)

## Part B: Add The First Page Of Results To Mongo

In [None]:
#Store the first-page url documents in the product information collection
#NOTE(review): running this cell again inserts the same urls a second time unless a unique index exists on 'url' - confirm
insertion_results = product_info_collection.insert_many(first_page_tv_links)

In [None]:
#Fetch every document in the collection, returning only the 'url' field (the '_id' field is excluded)
query_1 = product_info_collection.find({}, {'_id': 0, 'url': 1})
#Print each stored url document to verify the insert
for item in query_1:
    print(item)

## Part C: Obtain The Subsequent Pages of Results

In [None]:
def return_page_tv_links(url):
    """Scrape one Amazon results page and return its television links.

    Args:
        url: Address of a results page whose listings sit inside the
            's-result-list s-search-results sg-row' container.

    Returns:
        A list of dictionaries shaped like ``{'url': <absolute link>}``,
        suitable for ``insert_many``. At most 24 links are returned. The list
        is empty when the page has no results container.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    from urllib.parse import urljoin

    response = requests.get(url, headers=headers, timeout=5)
    #Fail with a clear HTTP error instead of parsing an error page
    response.raise_for_status()
    #One parse is enough; re-parsing the serialized soup produces the same tree
    page_soup = BeautifulSoup(response.text, 'html.parser')

    results_container = page_soup.find('div', class_='s-result-list s-search-results sg-row')
    if results_container is None:
        #No results container on this page: report "no links" rather than failing on None
        return []

    #href=True ignores anchors that carry no link target
    #The cap of 24 is kept from the original; presumably one page of listings - confirm
    product_anchors = results_container.find_all('a', class_='a-link-normal a-text-normal', href=True)[:24]

    #Cut each link at '/ref' and make it absolute; urljoin also copes with hrefs that are already absolute
    return [
        {'url': urljoin('https://www.amazon.com', anchor['href'].split('/ref')[0])}
        for anchor in product_anchors
    ]

In [None]:
#Quick check of the function: scrape results page 13 and show the first url document it returns
return_page_tv_links('https://www.amazon.com/s?rh=n%3A172282%2Cn%3A%21493964%2Cn%3A1266092011%2Cn%3A172659&page=13&qid=1564698515&ref=lp_172659_pg_2')[0]

In [None]:
def obtain_urls(start_page = False, end_page = False):
    """Scrape a range of Amazon television result pages into MongoDB.

    The links of every page are fetched with ``return_page_tv_links`` and
    inserted into ``product_info_collection``. The total number of stored
    documents is printed after each page.

    Args:
        start_page: First page number to scrape. Any falsy value (the default
            ``False``, ``None`` or ``0``) is replaced by 2.
        end_page: Page number to stop before; this page itself is not
            scraped. Any falsy value is replaced by 100.
    """
    amazon_tvs_url = 'https://www.amazon.com/s?rh=n%3A172282%2Cn%3A%21493964%2Cn%3A1266092011%2Cn%3A172659&page={}&qid=1564698515&ref=lp_172659_pg_2'

    #Falsy arguments fall back to the defaults
    first_page = start_page or 2
    stop_page = end_page or 100

    for page_number in range(first_page, stop_page):
        page_links = return_page_tv_links(amazon_tvs_url.format(page_number))

        if page_links:
            product_info_collection.insert_many(page_links)
        else:
            #insert_many rejects an empty list, so a page without links is skipped instead of aborting the run
            print('No tv urls found on page {}; skipping.'.format(page_number))

        print('The total number of tv urls added are: {}'.format(product_info_collection.count_documents({})))

In [None]:
#Scrape the result pages into MongoDB
#NOTE: start_page = 0 is falsy, so obtain_urls replaces it with 2; this call covers pages 2 through 49 (end_page is exclusive)
obtain_urls(start_page = 0, end_page = 50)

In [None]:
#Delete every document in the collection (the empty filter matches all documents), clearing all stored urls
product_info_collection.delete_many({})

In [None]:
#Count the documents currently in the collection (the empty filter matches every document)
product_info_collection.count_documents({})