# Solution to Exercise 08 - Web Scraping

In today's exercise we're using the Python libraries *BeautifulSoup* and *owlready2* to create an ontology from data scraped from the Web.
[BeautifulSoup](https://beautiful-soup-4.readthedocs.io/en/latest/) is a library for extracting data from HTML or XML files by accessing concrete elements in the tree structure.
The ontology (& the web scraping) is foused on extracting data about PokÃ©mon from a wiki-like website called *[Bulbapedia](https://bulbapedia.bulbagarden.net/wiki/Main_Page)*.

## Setup

In [1]:
!pip install bs4




[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Next we import the necessary libraries
import requests
import time
import json
import re
import csv

from bs4 import BeautifulSoup

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time
import csv
from pathlib import Path

def get_top_100_urls():
    list_url = "https://store.steampowered.com/search/?filter=mostplayed&count=100"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    res = requests.get(list_url, headers=headers)
    soup = BeautifulSoup(res.text, 'html.parser')
    
    links = []
    rows = soup.find_all('a', class_='search_result_row')
    for row in rows[:100]:
        links.append(row['href'])
    return links

def scrape_game_data(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9"
    }
    # Included age-gate bypass cookies
    cookies = {'birthtime': '283996801', 'lastagecheckage': '1-0-1991', 'wants_mature_content': '1'}
    
    try:
        page_res = requests.get(url, headers=headers, cookies=cookies, timeout=10)
        gs = BeautifulSoup(page_res.text, 'html.parser')

        # --- Name ---
        name_tag = gs.find('div', id='appHubAppName') or gs.find('div', class_='apphub_AppName')
        NAME = name_tag.get_text(strip=True) if name_tag else "N/A"
        
        # --- Price ---
        price_section = gs.find('div', class_='game_purchase_price') or gs.find('div', class_='discount_final_price')
        PRICE = price_section.get_text(strip=True) if price_section else "N/A"

        # --- PEGI ---
        pegi_div = gs.find('div', class_='game_rating_icon')
        PEGI = pegi_div.find('img').get('alt', 'N/A') if pegi_div and pegi_div.find('img') else "N/A"

        # --- UPDATED MODE LOGIC (game_area_features_list_ctn) ---
        mode_keywords = [
            "Single-player", "Multi-player", "Online Co-op", "LAN Co-op", 
            "Shared/Split Screen Co-op", "MMO", "Online PvP", "LAN PvP",
            "Cross-Platform Multiplayer", "Co-op", "PvP"
        ]
        
        found_modes = []
        # Target the specific container you identified
        feature_container = gs.find('div', class_='game_area_features_list_ctn')
        
        if feature_container:
            # Find all links within that container
            feature_links = feature_container.find_all('a')
            for link in feature_links:
                # Get the label text (e.g., "Multi-player")
                label_text = link.find(class_='label').get_text(strip=True) if link.find(class_='label') else link.get_text(strip=True)
                
                if any(kw.lower() == label_text.lower() or kw.lower() in label_text.lower() for kw in mode_keywords):
                    found_modes.append(label_text)

        # Remove duplicates and join
        MODE = ", ".join(sorted(list(set(found_modes)))) if found_modes else "Single-player"

        # --- Hardware Helper ---
        def get_spec(label):
            tag = gs.find('strong', string=re.compile(label, re.IGNORECASE))
            if tag:
                text = tag.next_sibling
                if text and isinstance(text, str):
                    return text.strip().strip(':').strip()
                return tag.parent.get_text().replace(tag.get_text(), "").strip().strip(':').strip()
            return "N/A"

        OS = get_spec("OS:")
        CPU = get_spec("Processor:")
        RAM = get_spec("Memory:")
        GPU = get_spec("Graphics:")
        STORAGE = get_spec("Storage:")

        return {
            "NAME": NAME, "PRICE": PRICE, "PEGI": PEGI, "MODE": MODE,
            "OS": OS, "CPU": CPU, "GPU": GPU, "RAM": RAM, "STORAGE": STORAGE
        }
    except Exception as e:
        return None

# --- CONFIGURATION ---
JSON_FILE = "C:\Users\tomma\Desktop\Clone Repository Actionable Knowledge\GameMatcher\D3\Scraping\cleaning\steam_parsed_CLEAN.json"

def update_genres_in_json():
    # 1. Load the existing clean data
    if not JSON_FILE.exists():
        print(f"Error: {JSON_FILE.name} not found.")
        return

    with open(JSON_FILE, 'r', encoding='utf-8') as f:
        data_list = json.load(f)

    # 2. Create a lookup dictionary for fast access: { "Game Name": {original_data} }
    # We use .get('NAME') to match your scraper's key
    lookup = {game.get('NAME'): game for game in data_list}

    # 3. Get the URLs to scrape
    links = get_top_100_urls()
    print(f"Found {len(links)} games. Starting targeted update...")

    updated_count = 0

    for i, link in enumerate(links):
        scraped_game = scrape_game_data(link)
        
        if scraped_game:
            name = scraped_game['NAME']
            
            # 4. Check if this game exists in our file
            if name in lookup:
                # ONLY update the Genre (and maybe Mode if you want)
                lookup[name]['GENRE'] = scraped_game['GENRE']
                # lookup[name]['MODE'] = scraped_game['MODE'] # Optional
                
                print(f"[{i+1}/100] UPDATED: {name[:30]}")
                updated_count += 1
            else:
                print(f"[{i+1}/100] SKIPPED: {name[:30]} (Not in JSON)")

        time.sleep(0.7)

    # 5. Save the modified list back to the file
    # (lookup.values() turns our dictionary back into a list of objects)
    with open(JSON_FILE, 'w', encoding='utf-8') as f:
        json.dump(list(lookup.values()), f, indent=4, ensure_ascii=False)

    print(f"\nFinished! Updated {updated_count} games in {JSON_FILE.name}.")

# --- RUN ---
if __name__ == "__main__":
    update_genres_in_json()

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (118297416.py, line 92)