In [5]:
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# --- Scraper configuration ---
# Root of the wiki and the index page that lists the shields.
BASE_URL = "http://darksouls.wikidot.com"
SHIELDS_URL = f"{BASE_URL}/shields"

# User-Agent strings keyed by browser name; `browser` selects which one is sent.
USER_AGENTS = dict(
    chrome="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    safari="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
)
browser = "chrome"

# HTTP headers passed to every requests.get() call below.
headers = {"User-Agent": USER_AGENTS[browser]}


def _extract_in_game_description(content):
    """Join the <p> blocks containing <em> that follow the "In-Game Description" heading."""
    pieces = []
    in_section = False
    for tag in content.find_all():
        is_heading = tag.name in ('h2', 'h3')
        if not in_section:
            # Keep scanning until the description heading itself is reached.
            in_section = is_heading and 'in-game description' in tag.text.lower()
            continue
        if is_heading:
            # The next heading closes the description section.
            break
        if tag.name == 'p' and tag.find('em'):
            pieces.append(tag.get_text(strip=True))
    return " ".join(pieces).strip()


def _extract_availability(content):
    """Collect <li>/<p> texts between the first "Availability" heading and the next heading."""
    for heading in content.find_all(['h2', 'h3']):
        if 'availability' not in heading.text.lower():
            continue
        entries = []
        sibling = heading.find_next_sibling()
        while sibling is not None and sibling.name not in ('h2', 'h3'):
            if sibling.name == 'ul':
                entries.extend(li.get_text(strip=True) for li in sibling.find_all('li'))
            elif sibling.name == 'p':
                entries.append(sibling.get_text(strip=True))
            sibling = sibling.find_next_sibling()
        # Only the first matching heading is used.
        return entries
    return []


def _extract_stats_table(content):
    """Map header labels to first-data-row values of the first 'wiki-content-table'."""
    stats = {}
    table = content.find('table', class_='wiki-content-table')
    if table is None:
        return stats
    rows = table.find_all('tr')
    if len(rows) < 2:
        return stats
    header_cells = rows[0].find_all(['th', 'td'])
    value_cells = rows[1].find_all('td')
    for header_cell, value_cell in zip(header_cells, value_cells):
        label = header_cell.get_text(strip=True)
        # Columns from "Aux..." / "Frampt..." onwards are deliberately not collected.
        if label.lower().startswith(("aux", "frampt")):
            break
        stats[label] = value_cell.get_text(strip=True)
    return stats


def scrape_shield_detail(shield_url):
    """Fetch a single shield page and extract its details.

    Args:
        shield_url: Absolute URL of the shield's wiki page.

    Returns:
        dict with the keys
        "in_game_description" (str), "availability" (list of str),
        "stats_table" (dict: column label -> value from the first data row)
        and "image_url" (src of the first <img> in the page content, or None).

        This function never raises: on any failure (network error, HTTP error
        status, unexpected page layout) the error is printed and the same keys
        are returned with empty values.
    """
    detail = {
        "in_game_description": "",
        "availability": [],
        "stats_table": {},
        "image_url": None
    }
    try:
        res = requests.get(shield_url, headers=headers, timeout=10)
        # Treat 4xx/5xx as a failure instead of parsing the error page as a shield.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'html.parser')
        content = soup.find('div', {'id': 'page-content'})
        if content is None:
            raise ValueError("no 'page-content' container found")

        image_tag = content.find('img')
        # .get() so an <img> without a src attribute yields None rather than
        # a KeyError that would discard everything else scraped from the page.
        image_url = image_tag.get('src') if image_tag is not None else None

        detail = {
            "in_game_description": _extract_in_game_description(content),
            "availability": _extract_availability(content),
            "stats_table": _extract_stats_table(content),
            "image_url": image_url
        }

    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole crawl.
        print(f"‚ö†Ô∏è Error scraping {shield_url}: {e}")

    return detail
        
def scrape_shields_by_category():
    """Scrape the shields index page and every shield page linked from it.

    Shield links are grouped by the <h3> heading that precedes them. Each
    linked page is fetched with scrape_shield_detail(), with a one-second
    pause after every request.

    Returns:
        list of dicts, one per category that has at least one shield:
        {"category": str, "shields": [ {"name", "url", "in_game_description",
        "availability", "stats_table", "image_url"}, ... ]}

    Raises:
        requests.RequestException: if the index page cannot be fetched or
            returns an HTTP error status.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(SHIELDS_URL, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    content = soup.find('div', {'id': 'page-content'})

    # Layout assumption: the second <td> of the page content is the cell
    # holding the category headings and shield links.
    td_shields = content.find_all('td')[1]

    groups = []
    current_group = None

    for tag in td_shields.find_all(['h3', 'p']):
        if tag.name == 'h3':
            current_group = {'category': tag.text.strip(), 'shields': []}
            groups.append(current_group)
            continue

        # Paragraphs before the first heading, or under an empty heading, are ignored.
        if current_group is None or not current_group['category']:
            continue

        # href=True skips anchors without a target instead of raising KeyError.
        for a in tag.find_all('a', href=True):
            shield_name = a.text.strip()
            # urljoin handles site-relative, page-relative and absolute hrefs.
            shield_url = urljoin(BASE_URL, a['href'])
            print(f"üõ° Scraping {shield_name}...")

            shield_details = scrape_shield_detail(shield_url)
            time.sleep(1)  # polite scraping delay

            current_group['shields'].append({
                "name": shield_name,
                "url": shield_url,
                "in_game_description": shield_details['in_game_description'],
                "availability": shield_details['availability'],
                "stats_table": shield_details['stats_table'],
                # Previously scraped but dropped from the stored record.
                "image_url": shield_details['image_url']
            })

    # Categories that ended up with no shields are omitted from the result.
    return [group for group in groups if group['shields']]

# Run the full crawl: one request per shield with a one-second pause after each,
# so this cell takes at least as many seconds as there are shields.
shields_full_data = scrape_shields_by_category()


üõ° Scraping Buckler...
üõ° Scraping Caduceus Round Shield...
üõ° Scraping Cracked Round Shield...
üõ° Scraping Effigy Shield...
üõ° Scraping Leather Shield...
üõ° Scraping Plank Shield...
üõ° Scraping Red and White Round Shield...
üõ° Scraping Small Leather Shield...
üõ° Scraping Target Shield...
üõ° Scraping Warrior's Round Shield...
üõ° Scraping Balder Shield...
üõ° Scraping Black Knight Shield...
üõ° Scraping Bloodshield...
üõ° Scraping Caduceus Kite Shield...
üõ° Scraping Crest Shield...
üõ° Scraping Dragon Crest Shield...
üõ° Scraping East-West Shield...
üõ° Scraping Gargoyle's Shield...
üõ° Scraping Grass Crest Shield...
üõ° Scraping Heater Shield...
üõ° Scraping Hollow Soldier Shield...
üõ° Scraping Iron Round Shield...
üõ° Scraping Knight Shield...
üõ° Scraping Large Leather Shield...
üõ° Scraping Sanctus...
üõ° Scraping Silver Knight Shield...
üõ° Scraping Spider Shield...
üõ° Scraping Sunlight Shield...
üõ° Scraping Tower Kite Shield...
üõ° Scra

In [7]:
# Save the scraped shield data to disk as indented JSON.
with open("dark_souls_shields_full.json", "w") as json_file:
    json.dump(shields_full_data, json_file, indent=2)