# Solution to Exercise 08 - Web Scraping

In today's exercise we're using the Python libraries *BeautifulSoup* and *owlready2* to create an ontology from data scraped from the Web.
[BeautifulSoup](https://beautiful-soup-4.readthedocs.io/en/latest/) is a library for extracting data from HTML or XML files by accessing concrete elements in the tree structure.
The ontology (& the web scraping) is focused on extracting data about Pokémon from a wiki-like website called *[Bulbapedia](https://bulbapedia.bulbagarden.net/wiki/Main_Page)*.

## Setup

In [1]:
!pip install bs4
!pip install webdriver-manager
!pip install curl_cffi
!pip install rapidfuzz




[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:


from curl_cffi import requests
from curl_cffi.const import CurlHttpVersion  
from bs4 import BeautifulSoup
import time
from rapidfuzz import process, fuzz, utils


In [None]:
import time
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class TechScraper:
    """Selenium helper that searches the CPU mega page of a benchmark site.

    Creating an instance launches a Chrome window. Call :meth:`close` (or use
    the instance as a context manager) to shut the browser down again::

        with TechScraper() as bot:
            bot.search_on_site("Intel Core i7-13700K")
    """

    def __init__(self, domain="www.cpubenchmark.net"):
        """Build the target URL and start a maximized Chrome session.

        Args:
            domain: Host name whose ``/CPU_mega_page.html`` page is loaded.
        """
        self.url = f"https://{domain}/CPU_mega_page.html"
        options = webdriver.ChromeOptions()
        # options.add_argument("--headless") # Headless often bypasses some banners
        self.driver = webdriver.Chrome(options=options)
        self.driver.maximize_window()

    def close(self):
        """Quit the browser session, if one is still attached.

        Safe to call more than once: after the first call ``self.driver`` is
        ``None`` and later calls do nothing.
        """
        driver = getattr(self, "driver", None)
        self.driver = None
        if driver is not None:
            driver.quit()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def handle_quantcast_banner(self):
        """Specifically targets the qc-cmp2-ui privacy dialog.

        Waits up to 7 seconds for the dialog, clicks its 'AGREE' button and
        pauses briefly afterwards. If any step fails, the dialog element is
        removed from the DOM via JavaScript instead (a no-op when it is absent).
        """
        print("üîç Checking for privacy banner...")
        try:
            # 1. Wait for the banner container to appear. Only the wait itself
            #    matters here; the returned element is not needed.
            wait = WebDriverWait(self.driver, 7)
            wait.until(EC.presence_of_element_located((By.ID, "qc-cmp2-ui")))

            # 2. Find the 'AGREE' button inside the footer
            # We use the specific class 'css-47sehv' you identified
            agree_xpath = "//button[contains(@class, 'css-47sehv')]//span[text()='AGREE']"
            agree_button = wait.until(EC.element_to_be_clickable((By.XPATH, agree_xpath)))

            # 3. Try a standard click; if blocked, use a JS click.
            #    `except Exception` (not a bare `except`) so that Ctrl-C and
            #    interpreter shutdown are not swallowed by the fallback.
            try:
                agree_button.click()
            except Exception:
                self.driver.execute_script("arguments[0].click();", agree_button)

            print("üç™ Quantcast banner dismissed.")
            time.sleep(1.5) # Wait for the fade-out animation
        except Exception:
            print("üí° Banner not found or already gone. Proceeding...")
            # Fallback: Just try to delete it if it's lurking invisibly
            self.driver.execute_script("""
                var b = document.getElementById('qc-cmp2-ui');
                if(b) b.remove();
            """)

    def search_on_site(self, hardware_name):
        """Load the page, type ``hardware_name`` into the search box and print the first row.

        Progress, the first result row and any error are reported with
        ``print``; exceptions raised along the way are caught and printed, not
        propagated.

        Args:
            hardware_name: Text to enter into the input named ``search_name``.

        Returns:
            None.
        """
        try:
            print(f"üåê Loading {self.url}...")
            self.driver.get(self.url)

            # 1. Clear the cookie banner
            self.handle_quantcast_banner()

            # 2. Wait for the search input to be present in the DOM
            wait = WebDriverWait(self.driver, 20)
            search_input = wait.until(EC.presence_of_element_located((By.NAME, "search_name")))

            # 3. THE RETRY LOOP: Keep trying to write until the text is actually there
            success = False
            for attempt in range(10): # Up to 10 attempts, one second apart
                # Force focus and scroll
                self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", search_input)
                self.driver.execute_script("arguments[0].focus();", search_input)

                # Clear and Type using JS + Selenium Keys
                self.driver.execute_script("arguments[0].value = '';", search_input)
                search_input.send_keys(hardware_name)

                # Only submit once the field really holds the full search text
                current_val = search_input.get_attribute("value")
                if current_val == hardware_name:
                    search_input.send_keys(Keys.ENTER)
                    print(f"‚úÖ Successfully wrote: {hardware_name}")
                    success = True
                    break

                print(f"‚è≥ Attempt {attempt+1}: Search bar not ready yet, retrying...")
                time.sleep(1)

            if not success:
                print("‚ùå Failed to write in search bar after 10 attempts.")
                return

            # 4. Wait for results
            # NOTE(review): fixed delay; an explicit wait on a condition that
            # reflects the updated results would be more robust - confirm what
            # the page changes after a search before replacing it.
            time.sleep(3)
            first_row = self.driver.find_element(By.CSS_SELECTOR, "table#cputable tbody tr")
            print(f"‚úÖ Result Found: {first_row.text}")

        except Exception as e:
            print(f"‚ùå Automation Error: {e}")

# --- EXECUTION ---
# Run one example search, then always shut the browser down so that neither a
# successful nor a failed run leaves a Chrome/chromedriver process behind.
bot = TechScraper()
try:
    bot.search_on_site("Intel Core i7-13700K")
finally:
    bot.driver.quit()

üåê Loading https://www.cpubenchmark.net/CPU_mega_page.html...
üîç Checking for privacy banner...
üç™ Quantcast banner dismissed.
‚ùå Automation Error: Message: 
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff6f18188d5
	0x7ff6f1818930
	0x7ff6f15f165d
	0x7ff6f1649a33
	0x7ff6f1649d3c
	0x7ff6f169df67
	0x7ff6f169ac97
	0x7ff6f163ac29
	0x7ff6f163ba93
	0x7ff6f1b30620
	0x7ff6f1b2af60
	0x7ff6f1b496c6
	0x7ff6f1835dd4
	0x7ff6f183ed7c
	0x7ff6f1821ff4
	0x7ff6f18221a5
	0x7ff6f1807ed2
	0x7ffcb7a1257d
	0x7ffcb80caf08

