# Scraping dynamic content from websites using BeautifulSoup and Selenium

> 1. This tutorial reuses code from https://medium.com/ymedialabs-innovation/web-scraping-using-beautiful-soup-and-selenium-for-dynamic-page-2f8ad15efe25 
> 2. This tutorial doesn't work on Google Colab, as a browser like Chrome/Firefox cannot be installed on Colab.

In [1]:
import requests
import bs4 as bs

![tomato](https://awesomescreenshot.s3.amazonaws.com/image/1882885/37751670-b88de8e82c7df9247c76899433f5ba6a.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJSCJQ2NM3XLFPVKA%2F20230307%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230307T124306Z&X-Amz-Expires=28800&X-Amz-SignedHeaders=host&X-Amz-Signature=e9b504b20109dde39c22b9abe3b5707935d346cd14a20e64d43408a9f5a60ebb)

In [2]:
# Search page for item 100 (Rami Levy's tomato) in the online store.
url = 'https://www.rami-levy.co.il/he/online/search?item=100'

# Always pass a timeout: without one, requests may block indefinitely if the
# server stops responding.
response = requests.get(url, timeout=10)
response.raise_for_status()  # raise an HTTPError on a 4xx/5xx status

content = response.text
content[:500]  # lots of HTML but hard to read

'<!doctype html>\n<html data-n-head-ssr class="rtl" dir="rtl" lang="he" data-n-head="%7B%22class%22:%7B%22ssr%22:%22rtl%22%7D,%22dir%22:%7B%22ssr%22:%22rtl%22%7D,%22lang%22:%7B%22ssr%22:%22he%22%7D%7D">\n<head>\n<title>חיפוש מוצרים באתר - רמי לוי אונליין</title><meta data-n-head="ssr" data-hid="og:title" property="og:title" content="חיפוש מוצרים באתר - רמי לוי אונליין"><meta data-n-head="ssr" data-hid="og:image" name="og:image" property="og:image" content="https://www.rami-levy.co.il/icons/512.png">'

In [3]:
# Parse the downloaded HTML so it can be navigated and pretty-printed.
soup = bs.BeautifulSoup(content, 'html.parser')

# Show only the first 1000 characters of the indented <body>:
# easier to read than the raw text, and most of it is JavaScript.
pretty_body = soup.body.prettify()
print(pretty_body[:1000])

<body class="clean" data-n-head="%7B%22class%22:%7B%22ssr%22:%22clean%22%7D%7D">
 <noscript data-hid="gtm-noscript" data-n-head="ssr" data-pbody="true">
  <iframe height="0" src="https://www.googletagmanager.com/ns.html?id=GTM-59SZDCM&amp;" style="display:none;visibility:hidden" title="gtm" width="0">
  </iframe>
 </noscript>
 <div data-server-rendered="true" id="__nuxt">
  <div id="__layout">
   <div class="bg-gray-100 nuxt-wrap" data-v-624709f8="">
   </div>
  </div>
 </div>
 <script>
  window.__NUXT__=(function(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,_,$,aa,ab,ac,ad,ae,af,ag,ah,ai,aj,ak,al,am,an,ao,ap,aq,ar,as){return {layout:"online",data:[{menu:{q:void 0}}],fetch:{},error:a,state:{accessibility:{options:{general:[],bigCursor:a,textAlign:a,color:a,statusShowAccessibility:b}},authuser:{user:a,guestToken:a,shopLists:[]},cart:{items:[],itemsPreference:{},price:c,priceMall:c,priceInfo:{price:c,priceClub:c,priceWallet:c,dis

In [5]:
# Look for a text node equal to the item name ("tomato" in Hebrew) in the
# HTML that was fetched with requests and parsed above.
soup.find_all(string="עגבניה")

[]

In [6]:
###
### example from https://selenium-python.readthedocs.io/getting-started.html#simple-usage
### 

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

# Launch Chrome, open python.org and run a search for "pycon".
with webdriver.Chrome() as driver:
    driver.get("http://www.python.org")
    assert "Python" in driver.title

    # Locate the element named "q" and type the query into it.
    search_box = driver.find_element(By.NAME, "q")
    search_box.clear()
    search_box.send_keys("pycon")
    search_box.send_keys(Keys.RETURN)  # press Enter to submit

    assert "No results found." not in driver.page_source
    # No explicit driver.close() is needed: the 'with' statement handles it.

In [19]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

# Configure Chrome: ignore certificate errors, use an incognito profile and
# run headless (no visible browser window).
options = webdriver.ChromeOptions()
for flag in ('--ignore-certificate-errors', '--incognito', '--headless'):
    options.add_argument(flag)

with webdriver.Chrome(options=options) as driver:
    driver.get(url)  # load the Rami Levy URL
    time.sleep(2)  # fixed pause before reading the page source
    page_source = driver.page_source

# Parse the HTML captured from the browser.
soup = bs.BeautifulSoup(page_source, 'html.parser')

In [33]:
### now try selenium for Rami Levi
# Search the browser-captured HTML for the item name as a text node.
print(soup.find_all(string="עגבניה"))

# Item name: the first <h3> element with role="heading".
header = soup.find("h3", attrs={"role": "heading"})
print(header.text)

# Item price: the <span> carrying the price CSS classes.
price_header = soup.find("span", class_="position-relative currency-wrap overflow-ellipsis")
print(price_header.text)

['עגבניה']
עגבניה
3.90 ₪


In [35]:
### now try selenium for Rami Levi item 101

# Same Chrome setup as before: ignore certificate errors, incognito, headless.
options = webdriver.ChromeOptions()
for flag in ('--ignore-certificate-errors', '--incognito', '--headless'):
    options.add_argument(flag)

url = "https://www.rami-levy.co.il/he/online/search?item=101"
with webdriver.Chrome(options=options) as driver:
    driver.get(url)  # load the Rami Levy URL
    time.sleep(2)  # fixed pause before reading the page source
    page_source = driver.page_source

soup = bs.BeautifulSoup(page_source, 'html.parser')

# Item name: the first <h3> element with role="heading".
header = soup.find("h3", attrs={"role": "heading"})
print(header.text)

# Item price: the <span> carrying the price CSS classes.
price_header = soup.find("span", class_="position-relative currency-wrap overflow-ellipsis")
print(price_header.text)

מלפפון
4.90 ₪


In [40]:
# Chrome setup: ignore certificate errors, incognito profile, headless mode.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')

# Maps each item's name to its displayed price text.
scrapped_data = {}

# Item codes to look up; one search URL is built per code. The loop variable
# is called item_id so the builtin id() is not shadowed.
items = [100, 101, 102, 103]
urls = [f"https://www.rami-levy.co.il/he/online/search?item={item_id}" for item_id in items]

# Finally, put it all together: one headless browser session serves every item.
with webdriver.Chrome(options=options) as driver:
    for url in urls:
        print(url)
        driver.get(url)  # load the Rami Levy URL
        time.sleep(2)  # fixed pause before reading the page source
        page_source = driver.page_source
        soup = bs.BeautifulSoup(page_source, 'html.parser')

        # The place where the name of the item is.
        name_header = soup.find("h3", role="heading")

        # The place where the price of the item is.
        price_header = soup.find("span", class_="position-relative currency-wrap overflow-ellipsis")

        # soup.find returns None when an element is absent (e.g. the page was
        # not fully rendered yet); report and move on instead of aborting the
        # whole run with an AttributeError.
        if name_header is None or price_header is None:
            print(f"could not find name/price on {url}, skipping")
            continue

        scrapped_data[name_header.get_text()] = price_header.get_text()

print(scrapped_data)

https://www.rami-levy.co.il/he/online/search?item=100
https://www.rami-levy.co.il/he/online/search?item=101
https://www.rami-levy.co.il/he/online/search?item=102
https://www.rami-levy.co.il/he/online/search?item=103
{'עגבניה': '3.90 ₪', 'מלפפון': '4.90 ₪', 'גזר ארוז': '2.90 ₪', 'פלפל כהה': '4.90 ₪'}
