In [None]:
# https://www.dataquest.io/blog/python-api-tutorial/

# Response codes and their meanings
# 200: Everything went okay, and the result has been returned (if any).
# 301: The server is redirecting you to a different endpoint. This can happen when a company switches domain names, or an endpoint name is changed.
# 400: The server thinks you made a bad request. This can happen when you don’t send along the right data, among other things.
# 401: The server thinks you’re not authenticated. Many APIs require login credentials, so this happens when you don’t send the right credentials to access an API.
# 403: The resource you’re trying to access is forbidden: you don’t have the right permissions to see it.
# 404: The resource you tried to access wasn’t found on the server.
# 503: The server is not ready to handle the request.

In [151]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import tqdm as tqdm

In [152]:
# Module-level accumulator. Despite the name this is a list, not a dict:
# the scraper appends one row (itself a list of fields) per game.
id_dict = list()

In [161]:
# Scrape game ids from web pages

def _parse_float_cell(cells, index):
    """Return the text of ``cells[index]`` converted to float, or 'n/a'."""
    try:
        return float(cells[index].get_text(strip=True))
    except (IndexError, ValueError):
        # Missing cell, empty cell, or a non-numeric placeholder such as "N/A".
        return 'n/a'


def scrape_bgg_game_ids(page_num=1):
    """Scrape one page of the BoardGameGeek ranked browse listing.

    For every ``<tr id="row_">`` on the page, one row
    ``[game_id, title, release_year, geek_rating, avg_rating, num_voters]``
    is appended to the module-level list ``id_dict``.

    Parameters
    ----------
    page_num : int or str, default 1
        Page number substituted into the browse URL.

    Returns
    -------
    list
        The rows appended by this call (the same list objects that were
        added to ``id_dict``). Empty if the HTTP request failed.

    Notes
    -----
    * ``game_id``, ``title`` and ``release_year`` are strings; the three
      rating fields are floats. Any field that cannot be extracted is the
      string ``'n/a'``.
    * Request errors are printed rather than raised, so a long scraping loop
      keeps going past a bad page.
    """
    start_url = 'https://boardgamegeek.com/browse/boardgame/page/{0}?sort=rank&sortdir=asc'.format(page_num)

    try:
        # Fetch the HTML listing page. The timeout keeps a stalled connection
        # from blocking the whole scraping loop indefinitely.
        response = requests.get(start_url, timeout=30)
        response.raise_for_status()  # Raise an exception on 4xx/5xx responses
    except requests.exceptions.RequestException as e:
        print('Error:', e)
        return []

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    page_rows = []
    for row in soup.find_all("tr", attrs={"id": "row_"}):

        # The primary link's href carries both the game id and the title slug
        link = row.find("a", {'class': 'primary'})
        href = link.get('href', '') if link is not None else ''

        # Capture id and slug in one pass. The slug is taken positionally, so
        # a title that happens to contain the id's digits is left intact.
        link_match = re.search(r'/(?P<digits>[0-9]+)/(?P<slug>[^/?#]*)', href)
        if link_match is None:
            print('Skipping row without a parsable game link on page', page_num)
            continue
        game_id = link_match.group('digits')
        title = link_match.group('slug').replace('-', '_')

        # Release year: first run of digits in the row's first <span>.
        # The optional minus sign preserves BCE years.
        first_span = row.find('span')
        year_match = None
        if first_span is not None:
            year_match = re.search(r'-?[0-9]+', first_span.get_text())
        release_year = year_match.group(0) if year_match else 'n/a'

        # Rating cells, in page order: Geek Rating, Avg Rating, Num Voters
        ratings = row.find_all("td", {'class': 'collection_bggrating'})
        geek_rating = _parse_float_cell(ratings, 0)
        avg_rating = _parse_float_cell(ratings, 1)
        num_voters = _parse_float_cell(ratings, 2)

        # Record the row in the shared accumulator and in this call's result
        row_data = [game_id, title, release_year, geek_rating, avg_rating, num_voters]
        id_dict.append(row_data)
        page_rows.append(row_data)

    return page_rows
    



In [None]:
# Scrape 1455 pages (page numbers 1 through 1455)

last_page = 1456  # exclusive upper bound passed to range()

# The imports cell binds the name `tqdm` to the *module* (`import tqdm as tqdm`),
# so the progress-bar class has to be reached as `tqdm.tqdm`; calling the
# module directly raises "TypeError: 'module' object is not callable".
for i in tqdm.tqdm(range(1, last_page)):
    scrape_bgg_game_ids(i)

In [None]:
# Preview the first few scraped rows rather than echoing the entire list,
# which would flood the notebook output after a full scrape.
id_dict[:5]

In [157]:
# Build a DataFrame from the scraped rows; column order matches the order
# in which the scraper assembles each row.
df = pd.DataFrame(
    id_dict,
    columns=[
        'game_id',
        'title',
        'release_year',
        'geek_rating',
        'avg_rating',
        'num_voters',
    ],
)

In [158]:
# Display the scraped games table (the rendered output elides the middle rows)
df

Unnamed: 0,game_id,title,release_year,geek_rating,avg_rating,num_voters
0,224517,brass_birmingham,2018,8.423,8.61,38800.0
1,161936,pandemic_legacy_season_1,2015,8.389,8.53,50695.0
2,174430,gloomhaven,2017,8.387,8.62,58514.0
3,342942,ark_nova,2021,8.302,8.53,31386.0
4,233078,twilight_imperium_fourth_edition,2017,8.242,8.62,20973.0
...,...,...,...,...,...,...
2681,268620,similo,2019,6.412,6.83,4460.0
2682,2338,starship_catan,2001,6.412,6.84,4446.0
2683,291859,riftforce,2021,6.412,7.35,2003.0
2684,217085,unearth,2017,6.411,6.85,4829.0
