In [None]:
# https://www.dataquest.io/blog/python-api-tutorial/
# https://github.com/MarshallKrakauer/BGGAnalysis/blob/main/DataCreationFiles/Scrape_Top_1000_Games.ipynb

# Response codes and their meanings
# 200: Everything went okay, and the result has been returned (if any).
# 301: The server is redirecting you to a different endpoint. This can happen when a company switches domain names, or an endpoint name is changed.
# 400: The server thinks you made a bad request. This can happen when you don’t send along the right data, among other things.
# 401: The server thinks you’re not authenticated. Many APIs require login ccredentials, so this happens when you don’t send the right credentials to access an API.
# 403: The resource you’re trying to access is forbidden: you don’t have the right perlessons to see it.
# 404: The resource you tried to access wasn’t found on the server.
# 503: The server is not ready to handle the request.

In [178]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from tqdm import tqdm
import time

In [184]:
# Script works but BGG only limits HTML scraping of 2000 games.  Therefore, I'm going to try an XML version.
# XML version is not as straightforward either, limits to one 3500 results using xmlapi2/search
# I'm going back to HTML to scrape game ids with a 5 second pause between pages.

In [214]:
def scrape_bgg_game_ids(page_num=1):
    # Define global dictionary to store game_ids and associated information
    global id_dict
      
    # Define variables for dynamic board game searching
    page_num = str(page_num)
    
    start_url = 'https://boardgamegeek.com/browse/boardgame/page/{0}?sort=rank&sortdir=asc'.format(page_num)
    
    try:
        # Send a GET request to fetch the XML data
        response = requests.get(start_url)
        response.raise_for_status()  # Raise an exception if request fails
    except requests.exceptions.RequestException as e:
        print('Error:', e)
        return []
    
    # Parse the XML data using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    rank_table = soup.find_all("tr",attrs={"id": "row_"})
    
    for idx, elem in enumerate(rank_table):
        
        # Get the main info on a game from start_url
        links = elem.find_all("a",{'class': 'primary'})
        
        # Get the game's url
        url = links[0]['href']
        
        # Get the game_id
        game_id = re.findall(r'/(?P<digits>[0-9]+)/', url)[0]
        
        # Get the title of the game
        url = url.replace('/boardgame/', '')
        url = url.replace(game_id, '')
        url = url.replace('/', '')
        url = url.replace('-','_')
        
        # Get the release date
        release_year = elem.find_all('span')
        try:
            release_year = release_year[0].contents
            release_year = re.findall(r'[0-9]+', str(release_year))[0]
        except IndexError:
            release_year = 'n/a'        
        
        # Get the Geek Rating
        ratings = elem.find_all("td",{'class': 'collection_bggrating'})
        try:
            geek_rating = ratings[0].contents
            geek_rating = geek_rating[0].replace('\n', '').replace('\t', '').strip()
            geek_rating = float(geek_rating)
        except IndexError:
            geek_rating = 'n/a'         
        
        # Get the Avg Rating
        try:
            avg_rating = ratings[1].contents
            avg_rating = avg_rating[0].replace('\n', '').replace('\t', '').strip()
            avg_rating = float(avg_rating)
        except IndexError:
            avg_rating = 'n/a'
        
        # Get Num Voters
        try:
            num_voters = ratings[2].contents
            num_voters = num_voters[0].replace('\n', '').replace('\t', '').strip()
            num_voters = float(num_voters)
        except IndexError:
            num_voters = 'n/a'
       
        # Put all extracted values into a dictionary
        id_dict.append([game_id, url, release_year,geek_rating,avg_rating,num_voters])

In [215]:
# Scrape 1455 pages
id_dict=[] # Dictionary to store game_ids and associated information
last_page = 1457

for i in tqdm(range(1,last_page)):
    scrape_bgg_game_ids(i)
    time.sleep(5)    # Pause for 5 seconds between iterations

  0%|                                                                               | 2/1456 [00:12<2:25:18,  6.00s/it]

Error: HTTPSConnectionPool(host='boardgamegeek.com', port=443): Max retries exceeded with url: /browse/boardgame/page/3?sort=rank&sortdir=asc (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000215B4E323A0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


  1%|▊                                                                             | 15/1456 [01:58<2:39:58,  6.66s/it]

Error: HTTPSConnectionPool(host='boardgamegeek.com', port=443): Max retries exceeded with url: /browse/boardgame/page/16?sort=rank&sortdir=asc (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000215B4EC90D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


100%|████████████████████████████████████████████████████████████████████████████| 1456/1456 [2:09:55<00:00,  5.35s/it]


In [216]:
id_dict
len(id_dict)

1800

In [217]:
df = pd.DataFrame(id_dict, columns = ['game_id', 'title', 'release_year','geek_rating','avg_rating','num_voters'])

In [218]:
df

Unnamed: 0,game_id,title,release_year,geek_rating,avg_rating,num_voters
0,224517,brass_birmingham,2018,8.423,8.61,38856.0
1,161936,pandemic_legacy_season_1,2015,8.389,8.53,50720.0
2,174430,gloomhaven,2017,8.386,8.62,58562.0
3,342942,ark_nova,2021,8.302,8.53,31488.0
4,233078,twilight_imperium_fourth_edition,2017,8.242,8.62,20993.0
...,...,...,...,...,...,...
1795,156496,march_ants,2015,6.181,7.17,1444.0
1796,320718,hidden_leaders,2022,6.181,6.90,2241.0
1797,12589,razzia,2004,6.180,6.85,1962.0
1798,253664,taco_cat_goat_cheese_pizza,2018,6.180,6.67,3840.0
