# Solution to Exercise 08 - Web Scraping

In today's exercise we're using the Python libraries *BeautifulSoup* and *owlready2* to create an ontology from data scraped from the Web.
[BeautifulSoup](https://beautiful-soup-4.readthedocs.io/en/latest/) is a library for extracting data from HTML or XML files by accessing concrete elements in the tree structure.
The ontology (& the web scraping) is focused on extracting data about Pokémon from a wiki-like website called *[Bulbapedia](https://bulbapedia.bulbagarden.net/wiki/Main_Page)*.

## Setup

In [None]:
!pip install bs4
!pip install webdriver-manager
!pip install curl_cffi

In [None]:

import json
import time
from pathlib import Path
from curl_cffi import requests 



In [None]:


class TechScraper:
    """Load benchmark entries for one site from a local cache, scraping on a miss.

    On construction the instance looks for ``cpu_data.json`` / ``gpu_data.json``
    inside a ``raw data`` folder under the current working directory. If that
    file is missing, unreadable, or empty, the site's ``/data/`` endpoint is
    queried and a non-empty result is written back to the cache file.

    Attributes set by ``__init__``:
        script_dir: current working directory at construction time.
        save_dir: ``script_dir / "raw data"`` (created if absent).
        domain: host name the data is fetched from.
        is_cpu: True when the substring ``"cpu"`` occurs in ``domain``.
        mega_page: URL used as warm-up request and ``Referer`` header.
        cache_file: path of the JSON cache file.
        items: list of ``{"name", "mark", "rank"}`` dicts; ``[]`` when a
            scrape fails and ``None`` is never left in place after a scrape.
        session: HTTP session; only created when a scrape is performed.
    """

    def __init__(self, domain="www.cpubenchmark.net"):
        """Resolve paths for *domain*, then load cached items or scrape them."""
        # --- UNIVERSAL PATH SETUP ---
        # NOTE: this is the current working directory, which is only the
        # notebook/script directory when the process was started from there.
        self.script_dir = Path.cwd()
        # Target the 'raw data' folder next to the working directory's files.
        self.save_dir = self.script_dir / "raw data"
        self.save_dir.mkdir(parents=True, exist_ok=True)

        self.domain = domain
        self.is_cpu = "cpu" in domain
        self.mega_page = f"https://{domain}/{'CPU' if self.is_cpu else 'GPU'}_mega_page.html"

        # Absolute path for the JSON file
        filename = "cpu_data.json" if self.is_cpu else "gpu_data.json"
        self.cache_file = self.save_dir / filename

        # Load or Scrape logic: an empty or missing cache both trigger a scrape.
        self.items = self.load_local()

        if not self.items:
            print(f"--- No local data found. Starting Scrape for {self.domain} ---")
            self.session = requests.Session()
            self.items = self.scrape()
            # Never persist an empty result, so the next run retries the scrape.
            if self.items:
                self.save_local()

    def clean_mark(self, value) -> int:
        """Convert a benchmark field to ``int``, returning 0 when it is not one.

        Thousands separators (``,``) and surrounding whitespace are removed
        first. Anything that is not an optionally negative run of decimal
        digits (``None``, ``'NA'``, ``'Insufficient data'``, ``'1-2'``,
        ``'12.5'`` ...) yields 0 instead of raising.
        """
        if value is None:
            return 0
        val_str = str(value).replace(',', '').strip()
        # Only a single *leading* minus sign is allowed. ``isdecimal`` (unlike
        # ``isdigit``) accepts exactly the characters ``int()`` can parse, so
        # the conversion below cannot raise ValueError.
        digits = val_str[1:] if val_str.startswith('-') else val_str
        if digits.isdecimal():
            return int(val_str)
        return 0

    def scrape(self) -> list:
        """Fetch the item list from ``https://{domain}/data/``.

        Returns:
            A list of ``{"name", "mark", "rank"}`` dicts, or ``[]`` when the
            server answers with a non-200 status, an empty ``data`` list, or
            any exception occurs (the error is printed, not raised).
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Requested-With": "XMLHttpRequest",
            "Referer": self.mega_page,
            "Connection": "keep-alive"
        }

        try:
            # __init__ only creates a session on a cache miss; create one on
            # demand so scrape() also works on an instance loaded from cache.
            if getattr(self, "session", None) is None:
                self.session = requests.Session()

            print(f"Warming up session for {self.domain}...")
            # Using curl_cffi's impersonate to bypass TLS fingerprinting
            self.session.get(self.mega_page, impersonate="chrome120")
            time.sleep(3)

            print(f"Requesting hardware list from {self.domain}...")
            # The millisecond timestamp makes every data URL unique.
            data_url = f"https://{self.domain}/data/?_={int(time.time()*1000)}"
            response = self.session.get(data_url, headers=headers, impersonate="chrome120")

            if response.status_code != 200:
                print(f"‚ùå Server returned status code {response.status_code}")
                return []

            raw_data = response.json().get("data", [])
            if not raw_data:
                print("‚ùå Server returned empty list.")
                return []

            print(f"‚úÖ Success! Found {len(raw_data)} items.")

            mark_key = 'cpumark' if self.is_cpu else 'g3dmark'

            return [
                {
                    "name": x.get('name'),
                    "mark": self.clean_mark(x.get(mark_key)),
                    "rank": self.clean_mark(x.get('rank'))
                }
                for x in raw_data
            ]
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and let the
            # caller continue with an empty result.
            print(f"‚ùå Scrape failed: {e}")
            return []

    def save_local(self) -> None:
        """Write ``self.items`` to ``self.cache_file`` as indented UTF-8 JSON."""
        with open(self.cache_file, 'w', encoding='utf-8') as f:
            json.dump(self.items, f, indent=4, ensure_ascii=False)
        print(f"üíæ Data saved to: {self.cache_file.absolute()}")

    def load_local(self):
        """Return the cached JSON content, or ``None`` when there is nothing usable.

        ``None`` is returned when the cache file does not exist, cannot be
        read or parsed, or contains an empty/falsy JSON value.
        """
        if self.cache_file.exists():
            try:
                with open(self.cache_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    if data:
                        print(f"üìÇ Loaded {len(data)} items from {self.cache_file.name}.")
                        return data
            except (json.JSONDecodeError, IOError):
                return None
        return None

# --- EXAMPLE EXECUTION ---
# Build one scraper per benchmark site; each constructor loads its JSON cache
# or scrapes the site when no cached data is available.
if __name__ == "__main__":
    # CPUs first, then GPUs
    cpu_scraper, gpu_scraper = [
        TechScraper(domain=site)
        for site in ("www.cpubenchmark.net", "www.videocardbenchmark.net")
    ]