# Final Project Data Preparation

## Imports

In [None]:
import csv
import time
import requests
import pandas as pd
import xml.etree.ElementTree as ET

## Scraping [BoardGameGeek](https://boardgamegeek.com/) Data

For this project, we selected BoardGameGeek's API as our main data source. We want to collect structured data about board games from the [BoardGameGeek XML API](https://boardgamegeek.com/wiki/page/BGG_XML_API2). This includes information like name, description, year of publication, average rating, and complexity.

We use the `/thing` endpoint with the `stats=1` parameter to include extra statistics like ratings and weight (complexity).

In [None]:
def get_game_data(game_id):
    """Fetch one board game from the BoardGameGeek XML API.

    Requests the ``/thing`` endpoint with ``stats=1`` so that the rating and
    weight statistics are part of the response.

    Args:
        game_id: BoardGameGeek ID of the game to look up.

    Returns:
        A dict with the keys ``id``, ``name``, ``description``, ``year``,
        ``rating`` and ``complexity``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
        AttributeError: If the response lacks one of the expected elements
            (``find`` returns ``None`` in that case).
    """
    url = f"https://boardgamegeek.com/xmlapi2/thing?id={game_id}&stats=1"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)
    # Surface HTTP errors directly instead of as a confusing XML parse error.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    # Each lookup takes the first matching element in document order.
    name = root.find(".//name").attrib["value"]
    description = root.find(".//description").text
    year = root.find(".//yearpublished").attrib["value"]
    rating = root.find(".//average").attrib["value"]
    complexity = root.find(".//averageweight").attrib["value"]

    return {
        "id": game_id,
        "name": name,
        "description": description,
        "year": int(year),
        "rating": float(rating),
        "complexity": float(complexity),
    }

In [None]:
# Example usage: fetch a single game by its ID and print the resulting dict
game = get_game_data(174430)
print(game)

Note: To explore all available attributes for a specific game, open this URL (replace the ID to see other games): https://boardgamegeek.com/xmlapi2/thing?id=174430&stats=1

### Creating a CSV with Multiple Games

We'll extend the previous function with more fields and basic error handling, and add helper functions that fetch multiple games and save them in a CSV file. This will allow us to analyze or process the dataset later.

In [None]:
def get_game_data(game_id):
    """Fetch one board game from the BoardGameGeek XML API, tolerating failures.

    Designed for bulk scraping: any problem with a single game is reported
    with ``print`` and signalled by returning ``None``, so one bad ID does
    not abort a long-running loop.

    Args:
        game_id: BoardGameGeek ID of the game to look up.

    Returns:
        A dict with the keys ``id``, ``name``, ``description``, ``year``,
        ``rating``, ``complexity``, ``minplayers``, ``maxplayers``,
        ``playingtime``, ``minage`` and ``categories`` (category names joined
        with ``", "``), or ``None`` if the request failed or the response
        could not be turned into a complete record.
    """
    url = f"https://boardgamegeek.com/xmlapi2/thing?id={game_id}&stats=1"

    try:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch ID {game_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch ID {game_id}")
        return None

    try:
        root = ET.fromstring(response.content)

        name = root.find(".//name").attrib["value"]
        # An empty <description> has text None, so .strip() raises
        # AttributeError and the game is skipped like any other missing field.
        description = root.find(".//description").text.strip()
        year = root.find(".//yearpublished").attrib["value"]
        rating = root.find(".//average").attrib["value"]
        complexity = root.find(".//averageweight").attrib["value"]
        minplayers = root.find(".//minplayers").attrib["value"]
        maxplayers = root.find(".//maxplayers").attrib["value"]
        playingtime = root.find(".//playingtime").attrib["value"]
        minage = root.find(".//minage").attrib["value"]

        # Extract categories
        categories = [
            link.attrib["value"]
            for link in root.findall(".//link[@type='boardgamecategory']")
        ]
        categories_str = ", ".join(categories)

        return {
            "id": game_id,
            "name": name,
            "description": description,
            "year": int(year),
            "rating": float(rating),
            "complexity": float(complexity),
            "minplayers": int(minplayers),
            "maxplayers": int(maxplayers),
            "playingtime": int(playingtime),
            "minage": int(minage),
            "categories": categories_str,
        }

    except (AttributeError, KeyError, ValueError, ET.ParseError):
        # AttributeError: an expected element is absent (find() returned None).
        # KeyError: an element lacks its "value" attribute.
        # ValueError: a numeric field does not parse as int/float.
        # ParseError: the body is not well-formed XML.
        print(f"Skipping game {game_id} (missing data)")
        return None

In [None]:
def get_top_game_ids():
    """Return the IDs of the games currently on BoardGameGeek's "hot" list.

    The hot list reflects games that are popular based on current BGG
    activity.

    Returns:
        A list of game IDs as strings (the raw ``id`` attribute of each
        ``<item>`` element), in the order they appear in the response.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    url = "https://boardgamegeek.com/xmlapi2/hot"
    # The item type must be sent as the "type" query parameter; the previous
    # "?boardgame" form was a bare key with no value.
    response = requests.get(url, params={"type": "boardgame"}, timeout=30)
    response.raise_for_status()
    root = ET.fromstring(response.content)

    return [item.attrib["id"] for item in root.findall(".//item")]

In [None]:
def save_games_to_csv(filename, game_ids):
    """Download every game in ``game_ids`` and store the results in a CSV file.

    The file is (re)created, written as UTF-8 with ``;`` as the delimiter,
    and starts with a header row. Progress is printed for each ID. Games for
    which ``get_game_data`` returns a falsy value are left out of the file.

    Args:
        filename: Path of the CSV file to write.
        game_ids: Sized collection of BoardGameGeek game IDs to fetch.
    """
    columns = (
        "id", "name", "description", "year", "rating", "complexity",
        "minplayers", "maxplayers", "playingtime", "minage", "categories",
    )

    with open(filename, "w", newline="", encoding="utf-8") as out:
        rows = csv.DictWriter(out, fieldnames=columns, delimiter=";")
        rows.writeheader()

        for position, current_id in enumerate(game_ids, start=1):
            print(f"Fetching {position}/{len(game_ids)}: Game ID {current_id}")

            record = get_game_data(current_id)
            if record:
                rows.writerow(record)

            # Pause between requests so the API is not hit too quickly.
            time.sleep(1)

In [None]:
def save_top_games_to_csv(filename, num_games=100):
    """Save up to ``num_games`` games from the "hot" list to a CSV file.

    The IDs come from ``get_top_game_ids`` and the writing is delegated to
    ``save_games_to_csv``, so the CSV has the same columns and delimiter as
    the files that function produces. Writing here with a shorter, separate
    column list made ``csv.DictWriter`` reject the rows, because
    ``get_game_data`` returns more keys than that list declared.

    Args:
        filename: Path of the CSV file to write.
        num_games: Maximum number of games to save. If the hot list holds
            fewer IDs than this, only those are saved.
    """
    # TODO: maybe change how the "top" games are selected.
    game_ids = get_top_game_ids()[:num_games]

    # Progress is reported against the number of IDs actually retrieved,
    # not against num_games.
    save_games_to_csv(filename, game_ids)

In [None]:
# Save games with IDs 3000 through 6000 (range's upper bound is exclusive)
game_ids = list(range(3000, 6001))
save_games_to_csv(filename="boardgames.csv", game_ids=game_ids)

In [None]:
# Save top 100 games (note: in practice we only get 50 games back)
# save_top_games_to_csv("top_boardgames.csv", num_games=100)

In [None]:
# Load the scraped data back; sep=";" matches the delimiter used when writing
df = pd.read_csv("boardgames.csv", sep=";")
# Bare expression as the last statement of the cell
df