# Final Project Data Preparation

## Imports

In [None]:
import csv
import time
import requests
import pandas as pd
import xml.etree.ElementTree as ET

## Scraping [BoardGameGeek](https://boardgamegeek.com/) Data

For this project, we selected BoardGameGeek's API as our main data source. We want to collect structured data about board games from the [BoardGameGeek XML API](https://boardgamegeek.com/wiki/page/BGG_XML_API2). This includes information like name, description, year of publication, average rating, and complexity.

We use the `/thing` endpoint with the `stats=1` parameter to include extra statistics like ratings and weight (complexity).

In [None]:
def get_game_data(game_id):
    """Fetch basic details for one game from the BoardGameGeek XML API2.

    Queries the ``/thing`` endpoint with ``stats=1`` so that the rating and
    weight (complexity) statistics are included in the response.

    Args:
        game_id: BoardGameGeek ID of the game to look up.

    Returns:
        A dict with the keys ``id``, ``name``, ``description``, ``year``
        (int), ``rating`` (float) and ``complexity`` (float).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError: If one of the expected elements is absent from the
            response.
    """
    url = f"https://boardgamegeek.com/xmlapi2/thing?id={game_id}&stats=1"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    # Fail on HTTP errors here instead of handing an error page to the parser.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    # Each scalar field is stored in the "value" attribute of its element;
    # the description is the only one kept as element text.
    name = root.find(".//name").attrib["value"]
    description = root.find(".//description").text
    year = root.find(".//yearpublished").attrib["value"]
    rating = root.find(".//average").attrib["value"]
    complexity = root.find(".//averageweight").attrib["value"]

    return {
        "id": game_id,
        "name": name,
        "description": description,
        "year": int(year),
        "rating": float(rating),
        "complexity": float(complexity),
    }

In [None]:
# Example usage: fetch a single game by its BoardGameGeek ID (174430 is
# Gloomhaven, as the output below shows) and print the resulting dict.
game = get_game_data(174430)
print(game)

{'id': 174430, 'name': 'Gloomhaven', 'description': 'Gloomhaven  is a game of Euro-inspired tactical combat in a persistent world of shifting motives. Players will take on the roles of wandering adventurers with their own special sets of skills and their own reasons for traveling to this dark corner of the world. Players must work together out of necessity to clear out menacing dungeons and forgotten ruins. In the process, they will enhance their abilities with experience and loot, discover new locations to explore and plunder, and expand an ever-branching story fueled by the decisions they make.&#10; This is a game with a persistent and changing world that is ideally played over many game sessions. After a scenario, players will make decisions about what to do next, which will determine how the story continues, kind of like a &ldquo;Choose Your Own Adventure&rdquo; book. Playing through a scenario is a co-operative affair where players will fight against automated monsters using an in

Note: To explore all available attributes for a specific game, open this URL (replace the ID to see other games): https://boardgamegeek.com/xmlapi2/thing?id=174430&stats=1

### Creating a CSV with Multiple Games

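We'll extend the previous function with basic error handling and additional fields (player counts, playing time, minimum age, and categories), then fetch multiple games and save them to a CSV file. This will allow us to analyze or process the dataset later.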

In [None]:
def get_game_data(game_id):
    """Fetch one game's details from the BoardGameGeek XML API2.

    Queries the ``/thing`` endpoint with ``stats=1`` so that rating and
    weight (complexity) statistics are included. Every failure is reported
    with a printed message and turned into ``None`` so that a long scraping
    loop can skip the game and carry on.

    Args:
        game_id: BoardGameGeek ID of the game to look up.

    Returns:
        A dict with the keys ``id``, ``name``, ``description``, ``year``,
        ``rating``, ``complexity``, ``minplayers``, ``maxplayers``,
        ``playingtime``, ``minage`` and ``categories`` (a comma-separated
        string), or ``None`` if the request fails, the body is not valid
        XML, or a field is missing or cannot be converted.
    """
    url = f"https://boardgamegeek.com/xmlapi2/thing?id={game_id}&stats=1"

    try:
        # Without a timeout a single stalled connection would hang the scrape.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch ID {game_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch ID {game_id}")
        return None

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError:
        print(f"Skipping game {game_id} (invalid XML)")
        return None

    def value_of(tag):
        # Scalar fields are stored in the element's "value" attribute.
        return root.find(f".//{tag}").attrib["value"]

    try:
        # Extract categories
        categories = [
            link.attrib["value"]
            for link in root.findall(".//link[@type='boardgamecategory']")
        ]

        return {
            "id": game_id,
            "name": value_of("name"),
            "description": root.find(".//description").text.strip(),
            "year": int(value_of("yearpublished")),
            "rating": float(value_of("average")),
            "complexity": float(value_of("averageweight")),
            "minplayers": int(value_of("minplayers")),
            "maxplayers": int(value_of("maxplayers")),
            "playingtime": int(value_of("playingtime")),
            "minage": int(value_of("minage")),
            "categories": ", ".join(categories),
        }

    except (AttributeError, KeyError, ValueError):
        # AttributeError: an element (or the description text) is absent.
        # KeyError: an element has no "value" attribute.
        # ValueError: a value cannot be converted to int/float.
        print(f"Skipping game {game_id} (missing data)")
        return None

In [None]:
def get_top_game_ids():
    """Return the IDs of the games on BoardGameGeek's current "hot" list.

    Despite the function name, this reads the ``/hot`` endpoint (games that
    are popular based on current BGG activity), not an all-time ranking.

    Returns:
        A list of game IDs as strings, in the order they appear in the
        response.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://boardgamegeek.com/xmlapi2/hot"
    # The item type must be sent as the "type" query parameter; a bare
    # "?boardgame" is not a valid key=value pair.
    response = requests.get(url, params={"type": "boardgame"}, timeout=30)
    response.raise_for_status()
    root = ET.fromstring(response.content)

    return [item.attrib["id"] for item in root.findall(".//item")]

In [None]:
def save_games_to_csv(filename, game_ids):
    """Fetch every game in ``game_ids`` and write the results to a CSV file.

    The file is written as UTF-8 with ``;`` as the delimiter and starts with
    a header row. Games for which ``get_game_data`` returns a falsy value are
    left out. Progress is printed for each ID.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        game_ids: Sized collection of BoardGameGeek game IDs to fetch.
    """
    columns = [
        "id", "name", "description", "year", "rating", "complexity",
        "minplayers", "maxplayers", "playingtime", "minage", "categories",
    ]

    with open(filename, "w", newline="", encoding="utf-8") as out:
        csv_writer = csv.DictWriter(out, fieldnames=columns, delimiter=";")
        csv_writer.writeheader()

        position = 0
        for game_id in game_ids:
            position += 1
            print(f"Fetching {position}/{len(game_ids)}: Game ID {game_id}")

            record = get_game_data(game_id)
            if record:
                csv_writer.writerow(record)

            # Pause between requests so the API is not hit too quickly.
            time.sleep(1)

In [None]:
def save_top_games_to_csv(filename, num_games=100):
    """Save up to ``num_games`` games from the BGG "hot" list to a CSV file.

    The IDs come from ``get_top_game_ids``. Writing is delegated to
    ``save_games_to_csv`` so that the CSV columns always match the dict
    returned by ``get_game_data`` and the progress counter reflects the
    number of IDs actually retrieved, which may be fewer than ``num_games``.

    Args:
        filename: Path of the CSV file to create (overwritten if it exists).
        num_games: Maximum number of games to save.
    """
    # TODO: reconsider how the "top" games are selected (maybe change this).
    game_ids = get_top_game_ids()[:num_games]
    save_games_to_csv(filename, game_ids)

In [None]:
# Scrape game IDs 1 through 3000 (inclusive) into boardgames.csv.
# IDs that cannot be fetched or parsed are skipped, so the file ends up with
# fewer than 3000 rows. save_games_to_csv sleeps one second per ID, so this
# cell takes at least 50 minutes to run.
game_ids = list(range(1, 3001))
save_games_to_csv(filename="boardgames.csv", game_ids=game_ids)

Fetching 1/3000: Game ID 1
Fetching 2/3000: Game ID 2
Fetching 3/3000: Game ID 3
Fetching 4/3000: Game ID 4
Fetching 5/3000: Game ID 5
Fetching 6/3000: Game ID 6
Fetching 7/3000: Game ID 7
Fetching 8/3000: Game ID 8
Fetching 9/3000: Game ID 9
Fetching 10/3000: Game ID 10
Fetching 11/3000: Game ID 11
Fetching 12/3000: Game ID 12
Fetching 13/3000: Game ID 13
Fetching 14/3000: Game ID 14
Fetching 15/3000: Game ID 15
Fetching 16/3000: Game ID 16
Fetching 17/3000: Game ID 17
Fetching 18/3000: Game ID 18
Fetching 19/3000: Game ID 19
Fetching 20/3000: Game ID 20
Fetching 21/3000: Game ID 21
Fetching 22/3000: Game ID 22
Fetching 23/3000: Game ID 23
Fetching 24/3000: Game ID 24
Fetching 25/3000: Game ID 25
Fetching 26/3000: Game ID 26
Fetching 27/3000: Game ID 27
Fetching 28/3000: Game ID 28
Fetching 29/3000: Game ID 29
Fetching 30/3000: Game ID 30
Fetching 31/3000: Game ID 31
Fetching 32/3000: Game ID 32
Fetching 33/3000: Game ID 33
Skipping game 33 (missing data)
Fetching 34/3000: Game ID 34


In [None]:
# Save top 100 games (the hot list only gives us 50 IDs, so at most 50 rows are written)
# save_top_games_to_csv("top_boardgames.csv", num_games=100)

In [None]:
# Load the scraped dataset. The file was written with ";" as the delimiter,
# so the same separator has to be passed here.
df = pd.read_csv("boardgames.csv", sep=";")
# Bare expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,id,name,description,year,rating,complexity,minplayers,maxplayers,playingtime,minage,categories
0,1,Die Macher,Die Macher is a game about seven sequential po...,1986,7.58777,4.3081,3,5,240,14,"Economic, Negotiation, Political"
1,2,Dragonmaster,Dragonmaster is a trick-taking card game based...,1981,6.66468,1.9643,3,4,30,12,"Card Game, Fantasy"
2,3,Samurai,Samurai is set in medieval Japan. Players comp...,1998,7.47596,2.4420,2,4,60,10,"Abstract Strategy, Medieval"
3,4,Tal der Könige,When you see the triangular box and the luxuri...,1992,6.61540,2.6875,2,4,60,12,Ancient
4,5,Acquire,"In Acquire, each player strategically invests ...",1964,7.35053,2.4923,2,6,90,12,"Economic, Territory Building"
...,...,...,...,...,...,...,...,...,...,...,...
1778,2992,Rythmo,2-player abstract based on a medieval game cal...,1985,5.50000,3.5000,2,2,30,0,"Abstract Strategy, Math, Number"
1779,2994,Guderian's Blitzkrieg: The Drive on Moscow,"Guderian's Blitzkrieg - The Drive on Moscow, S...",1992,7.03520,3.8667,2,2,360,12,"Wargame, World War II"
1780,2996,Dodge City,Directly from the rules:&#10;&#10;The board sh...,1983,5.57179,2.3333,2,5,60,8,American West
1781,2998,Reds! The Russian Civil War 1918-1921,Reds! The Russian Civil War 1918-1921 is a two...,2001,7.29145,3.0789,1,2,360,12,"Civil War, Wargame"
