In [None]:
# https://www.dataquest.io/blog/python-api-tutorial/

# Response codes and their meanings
# 200: Everything went okay, and the result has been returned (if any).
# 301: The server is redirecting you to a different endpoint. This can happen when a company switches domain names, or an endpoint name is changed.
# 400: The server thinks you made a bad request. This can happen when you don’t send along the right data, among other things.
# 401: The server thinks you’re not authenticated. Many APIs require login credentials, so this happens when you don’t send the right credentials to access an API.
# 403: The resource you’re trying to access is forbidden: you don’t have the right permissions to see it.
# 404: The resource you tried to access wasn’t found on the server.
# 503: The server is not ready to handle the request.

In [79]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from tqdm import tqdm
import time
from xml.etree import ElementTree as ET
import lxml

In [None]:
# Accumulates one (game_id, game_name) tuple per distinct game across all
# calls to scrape_bgg_game_ids().
id_list = []


def scrape_bgg_game_ids(page_num=1, timeout=30, verbose=True):
    """Collect game ids and names from one BoardGameGeek sitemap page.

    Downloads ``sitemap_geekitems_boardgame_page_<page_num>``, reads every
    ``<loc>`` entry and, for each URL containing a ``/<digits>/`` segment,
    appends a ``(game_id, game_name)`` tuple to the module-level ``id_list``.
    ``game_id`` is kept as the string captured from the URL; ``game_name`` is
    the URL with the ``https://boardgamegeek.com/boardgame/<id>/`` prefix
    removed and hyphens replaced by underscores.

    Parameters
    ----------
    page_num : int or str
        Number of the sitemap page to fetch.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the notebook forever.
    verbose : bool
        When true (the default), print one line per game found.

    Returns
    -------
    list of tuple
        The shared ``id_list`` (the same object that is updated in place).

    Raises
    ------
    requests.HTTPError
        Propagated from ``response.raise_for_status()`` when the server
        answers with an error status.
    """
    search_url = 'https://boardgamegeek.com/sitemap_geekitems_boardgame_page_{0}'.format(page_num)

    # Send a GET request to fetch the XML data
    response = requests.get(search_url, timeout=timeout)
    response.raise_for_status()  # Raise an exception if the request fails

    # Parse the XML data
    soup = BeautifulSoup(response.text, "lxml")

    # Seed the lookup with everything collected so far, so a game that shows
    # up on more than one page (or on a re-run of a page) is stored only once.
    seen = set(id_list)

    for loc in soup.find_all("loc"):
        # Get the game's url
        url = loc.text.strip()

        # The id is the first all-digit path segment of the URL
        match = re.search(r'/(?P<digits>[0-9]+)/', url)
        if not match:
            # Report the URL itself: no game name exists for an unmatched entry.
            print(f"No match found for game ID in the URL: {url}")
            continue

        # Pull the id from the named group <digits> defined in the pattern
        game_id = match.group('digits')
        # Remove the part of the url up to and including game_id
        game_name = url.replace(f'https://boardgamegeek.com/boardgame/{game_id}/', '')
        # Replace hyphens with underscores
        game_name = game_name.replace('-', '_')

        # Only add the game to id_list if it hasn't been seen before
        game = (game_id, game_name)
        if game not in seen:
            id_list.append(game)
            seen.add(game)

        if verbose:
            print(f"Game Id: {game_id}, Game Name: {game_name}")

    return id_list

# Usage: request sitemap pages 1 through 15 in order, with a tqdm progress bar.
# Each call appends the games it finds to the global id_list.
first_page, last_page = 1, 15
for page_number in tqdm(range(first_page, last_page + 1)):
    game_ids = scrape_bgg_game_ids(page_number)



In [121]:
# Display the (game_id, game_name) tuples accumulated by the scraping cell.
id_list


[('1', 'die_macher'),
 ('2', 'dragonmaster'),
 ('3', 'samurai'),
 ('4', 'tal_der_konige'),
 ('5', 'acquire'),
 ('6', 'mare_mediterraneum'),
 ('7', 'cathedral'),
 ('8', 'lords_creation'),
 ('9', 'el_caballero'),
 ('10', 'elfenland'),
 ('11', 'bohnanza'),
 ('12', 'ra'),
 ('13', 'catan'),
 ('14', 'basari'),
 ('15', 'cosmic_encounter'),
 ('16', 'marracash'),
 ('17', 'button_men'),
 ('18', 'roborally'),
 ('19', 'wacky_wacky_west'),
 ('20', 'full_metal_planete'),
 ('21', 'gateway_stars'),
 ('22', 'magic_realm'),
 ('23', 'divine_right'),
 ('24', 'twilight_imperium'),
 ('25', 'battlemist'),
 ('26', 'age_renaissance'),
 ('27', 'supremacy_game_superpowers'),
 ('28', 'illuminati'),
 ('29', 'terrain_vague'),
 ('30', 'dark_tower'),
 ('31', 'dark_world'),
 ('32', 'buffalo_chess'),
 ('34', 'arkham_horror'),
 ('36', 'federation_empire'),
 ('37', 'dragon_masters'),
 ('38', 'runes'),
 ('39', 'darkover'),
 ('40', 'borderlands'),
 ('41', 'cant_stop'),
 ('42', 'tigris_euphrates'),
 ('43', 'airlines'),
 ('4

In [122]:
# Turn the scraped (game_id, game_name) tuples into a two-column table and
# cast the ids, which were captured from the URLs as strings, to integers.
df = (
    pd.DataFrame(id_list, columns=['game_id', 'game_name'])
    .astype({'game_id': int})
)


In [123]:
# Display the game_id / game_name table built in the previous cell.
df

Unnamed: 0,game_id,game_name
0,1,die_macher
1,2,dragonmaster
2,3,samurai
3,4,tal_der_konige
4,5,acquire
...,...,...
145503,392703,alpenglow
145504,392704,os_desejos_do_sultao_expansao_viajantes
145505,392705,atlantis_rising_monstrosities_here_there_be_mo...
145506,392707,dawn_heroes_annas_roundtable_expansion


In [124]:
# Save the id/name table to disk. No `index` argument is passed, so pandas'
# default applies and the row index is written as an extra leading column.
csv_path = 'all_game_ids.csv'
df.to_csv(csv_path)

In [125]:
# Check for duplicates: flag every row whose game_id occurs more than once.
# keep=False marks all occurrences of a repeated id, not only the later ones.
repeated_id_mask = df.duplicated(subset='game_id', keep=False)
duplicates = df.loc[repeated_id_mask]

In [126]:
# Show the rows that share a game_id; an empty frame means every id is unique.
print(duplicates)

Empty DataFrame
Columns: [game_id, game_name]
Index: []
