# Web scraping

Andrzej Kocielski, 02.05.2021

## Intro to Web Scraping with Beautiful Soup
Based on the tutorial at https://youtu.be/XQgXKtPSzUI

In [4]:
# Importing the libraries
import csv # for writing correctly quoted CSV files, from Python Standard Library
from urllib.request import urlopen as url_req # for opening and reading URLs from Python Standard Library

import bs4 # Beautiful Soup
from bs4 import BeautifulSoup as soup

In [5]:
# set the URL
my_url = 'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38'

# Make the connection by calling the function urlopen with my_url as a parameter,
# then read the raw page content.
# The `with` block closes the connection automatically, even if read() raises,
# so the connection cannot leak the way it could with a manual close() call.
with url_req(my_url) as url_client:
    page_html = url_client.read()

In [6]:
# Build the Beautiful Soup tree from the downloaded page using Python's built-in HTML parser
page_soup = soup(markup=page_html, features="html.parser")

In [30]:
# Quick look at the parsed document

# Uncomment to dump the whole page (very long output):
# print(page_soup.prettify())

# the first <h1> element, followed by a blank line
print(page_soup.h1, end="\n\n")

# every <h2> element (as a list), followed by a blank line
print(page_soup.find_all('h2'), end="\n\n")

<h1 class="page-title-text">Video Cards &amp; Video Devices</h1>

[<h2 class="standard-box-top-title">Shop Video Cards</h2>, <h2 class="standard-box-top-title">Video Cards &amp; Video Devices<!-- --> Featured Items</h2>, <h2 class="article-title-third">Video Cards Upgrade Your Graphics and Prevent Latency</h2>, <h2 class="article-title-third">Customized Cards Suit Your Needs</h2>, <h2 class="article-title-third">Video Card Accessories Enhance Your Experience</h2>]



In [35]:
# Collect every <div> whose CSS class is "item-cell" into a variable
# (class_ is Beautiful Soup's keyword argument for matching on the class attribute)
containers = page_soup.find_all("div", class_="item-cell")

# number of containers (items) found
len(containers)

12

In [50]:
# Inspect the first item.
# The manufacturer's name is the title attribute of the logo image,
# reached by walking div > div > a > img from the top of the container.
logo_img = containers[0].div.div.a.img
logo_img["title"]

'PowerColor'

In [90]:
# Loop through the items in containers and print brand, name and shipping for each.
# Every lookup is guarded so that one item cell with a missing element
# prints an empty field instead of aborting the whole loop with an exception.

for counter, container in enumerate(containers, start=1):

    # Brand is the title of the logo image. If any tag along the path is missing,
    # attribute access on None raises AttributeError, indexing None raises TypeError,
    # and a logo without a title attribute raises KeyError.
    try:
        brand = container.div.div.a.img["title"]
    except (AttributeError, TypeError, KeyError):
        brand = ""

    # find() returns the first match or None, unlike find_all(...)[0] which raises IndexError
    title_tag = container.find("a", {"class":"item-title"})
    product_name = title_tag.text if title_tag is not None else "" # .text removes the html tags

    shipping_tag = container.find("li", {"class":"price-ship"})
    shipping = shipping_tag.text.strip() if shipping_tag is not None else ""

    print("item no:", counter)
    print("brand: "+ brand)
    print("name: "+ product_name)
    print("shipping: "+ shipping)
    print()

item no: 1
brand: PowerColor
name: PowerColor Red Devil AMD Radeon RX 6900 XT Ultimate Gaming Graphics Card with 16GB GDDR6 Memory, Powered by AMD RDNA 2, HDMI 2.1 (AXRX 6900XTU 16GBD6-3DHE/OC)
shipping: $9.99 Shipping

item no: 2
brand: PowerColor
name: PowerColor Liquid Devil AMD Radeon RX 6900 XT Ultimate Gaming Graphics Card with 16GB GDDR6 Memory, Powered by AMD RDNA 2, HDMI 2.1 (AXRX 6900XTU 16GBD6-W2DHC/OC)
shipping: 

item no: 3
brand: EVGA
name: EVGA GeForce RTX 3060 XC BLACK GAMING, 12G-P5-3655-KR, 12GB GDDR6, Dual-Fan
shipping: $9.99 Shipping

item no: 4
brand: GIGABYTE
name: GIGABYTE GeForce RTX 3060 EAGLE 12G Graphics Card, 2 x WINDFORCE Fans, 12GB 192-bit GDDR6, GV-N3060EAGLE-12GD Video Card
shipping: $9.99 Shipping

item no: 5
brand: MSI
name: MSI Ventus GeForce RTX 3060 Video Card RTX 3060 VENTUS 2X 12G
shipping: $9.99 Shipping

item no: 6
brand: GIGABYTE
name: GIGABYTE GeForce RTX 3060 GAMING OC 12G Graphics Card, 3 x WINDFORCE Fans, 12GB 192-bit GDDR6, GV-N3060GAMING 

In [94]:
# Save the output to a CSV file.
# csv.writer quotes fields that contain commas, so the text is stored unmodified
# (no need to strip or replace commas by hand) and every column is escaped consistently.

# NOTE(review): "prodcuts" looks like a typo for "products"; the name is kept so that
# anything already reading this path keeps working -- confirm before renaming.
filename = "prodcuts.csv"

# column names, without stray spaces so they parse as clean header names
headers = ["brand", "product_name", "shipping"]

# newline="" lets the csv module control line endings itself;
# the `with` block closes the file even if an error occurs while writing.
with open(filename, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(headers)

    for container in containers:
        # Brand is the title of the logo image; fall back to an empty field
        # when any tag along the path, or the title attribute, is missing.
        try:
            brand = container.div.div.a.img["title"]
        except (AttributeError, TypeError, KeyError):
            brand = ""

        # find() returns the first match or None, unlike find_all(...)[0] which raises IndexError
        title_tag = container.find("a", {"class":"item-title"})
        product_name = title_tag.text if title_tag is not None else "" # .text removes the html tags

        shipping_tag = container.find("li", {"class":"price-ship"})
        shipping = shipping_tag.text.strip() if shipping_tag is not None else ""

        writer.writerow([brand, product_name, shipping])