In [190]:
import requests
import xmltodict
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import os

In [191]:
# Load the three persisted datasets and work out which game ids still need to
# be scraped: games that are referenced by a review but missing from the game
# table, games without a long description, and games with no edge in the graph.

# Defined before first use; the scraping cell appends new reviews to this file.
REVIEW_DATA_PATH = "../data/reviews.csv"

game_df = pd.read_csv("../data/game_info.csv")
review_df = pd.read_csv(REVIEW_DATA_PATH)
graph_df = pd.read_csv("../data/graph.csv")

# Build the lookup sets once. Testing `x not in series.unique()` inside a
# comprehension recomputes unique() and scans the array for every element.
known_game_ids = set(game_df['id'].unique())
game_ids_with_edges = set(graph_df['game_id'].unique())

games_to_add = [game_id for game_id in review_df['target_id'].unique() if game_id not in known_game_ids]
games_with_missing_data = game_df[game_df['long_description'].isna()]['id']
games_with_missing_edges = [game_id for game_id in game_df['id'].unique() if game_id not in game_ids_with_edges]

# np.unique both de-duplicates and sorts the combined id list.
ids_to_scrape = np.unique(list(games_with_missing_data) + games_with_missing_edges + list(games_to_add))
print(f"{len(ids_to_scrape)} games are missing data")

58931 games are missing data


In [192]:
# Scrape BoardGameGeek's XML API for a batch of game ids. For every board game
# with enough ratings this cell:
#   * records its attribute links (mechanics, categories, ...) in `edges`,
#   * appends its rated comments to the reviews CSV,
#   * fills in / adds its row in `game_df`.

MAX_GAMES_PER_RUN = 75          # small batch so we aren't overloading BGG
MIN_RATINGS = 500               # games with fewer user ratings are ignored
RATE_LIMIT_PAUSE_SECONDS = 5    # pause before re-requesting after a throttle response
MAX_RATE_LIMIT_RETRIES = 3      # total throttle retries allowed for the whole run
REQUEST_TIMEOUT_SECONDS = 30    # never hang forever on a stalled connection

# BGG link type -> key used in `edges`. Order matters: it becomes the column
# order of the edge table built in a later cell.
LINK_TYPE_TO_ATTRIBUTE = {
    "boardgamemechanic": "Mechanics",
    "boardgamecategory": "Categories",
    "boardgamefamily": "Families",
    "boardgamedesigner": "Designers",
    "boardgamepublisher": "Publishers",
    "boardgameartist": "Artists",
}

# Columns of game_df refreshed from the API response.
# NOTE(review): 'complexity_socre' looks like a typo of 'complexity_score', but it
# is kept as-is because it presumably matches the existing CSV header - confirm
# before renaming.
GAME_DETAIL_COLUMNS = ['image_url', 'long_description', 'year_published', 'expected_play_time',
                       'min_play_time', 'max_play_time', 'complexity_socre']


def _as_list(node):
    """Return ``node`` as a list (``None`` -> ``[]``, a lone value -> ``[value]``).

    xmltodict represents a repeated child element as a list but a single child
    as a bare dict, so the result must be normalised before iterating.
    """
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


edges = {}
reviews = {}  # never populated here; kept so the name stays defined for any other cell that reads it
retries = 0   # number of rate-limit retries used so far in this run
new_game_rows = []

for raw_id in tqdm(ids_to_scrape[:MAX_GAMES_PER_RUN]):
    # --- Fetch, re-requesting the SAME id after a pause when throttled, so a
    # --- rate-limited game is not silently dropped from the batch.
    gave_up = False
    while True:
        response = requests.get(
            f"https://www.boardgamegeek.com/xmlapi2/thing?id={raw_id}&stats=1&comments=1",
            timeout=REQUEST_TIMEOUT_SECONDS,
        )
        rate_limited = response.status_code == 429
        if not rate_limited:
            # Convert the XML response to a dictionary
            data_dict = xmltodict.parse(response.content)
            error = data_dict.get('error')
            rate_limited = isinstance(error, dict) and error.get('message') == 'Rate limit exceeded.'
        if not rate_limited:
            break
        if retries >= MAX_RATE_LIMIT_RETRIES:
            gave_up = True
            break
        print("taking a break...")
        time.sleep(RATE_LIMIT_PAUSE_SECONDS)
        retries += 1
    if gave_up:
        # Still throttled after exhausting the retry budget: stop the whole run.
        break

    # --- Validate the payload.
    items = data_dict.get('items')
    item = items.get('item') if isinstance(items, dict) else None
    if not isinstance(item, dict):
        # Error document or an <items> element without an <item>: nothing to record.
        continue
    if item.get('@type') != 'boardgame':
        continue
    rating_stats = item['statistics']['ratings']
    num_ratings = int(rating_stats['usersrated']['@value'])
    if num_ratings < MIN_RATINGS:
        continue
    game_id = int(item['@id'])

    # --- Attribute links (become graph edges later).
    attributes = {label: [] for label in LINK_TYPE_TO_ATTRIBUTE.values()}
    for link in _as_list(item.get('link')):
        label = LINK_TYPE_TO_ATTRIBUTE.get(link['@type'])
        if label is not None:
            attributes[label].append(link['@value'])
    edges[game_id] = attributes

    ## Append the new reviews to the review data
    comment_nodes = _as_list((item.get('comments') or {}).get('comment'))
    review_rows = [
        {
            "source_type": "Person",
            "source_id": comment["@username"],
            "edge_type": "hasReviewed",
            "target_type": "Game",
            "target_id": game_id,
            "rating": comment['@rating'],
            "comment": comment['@value'],
        }
        for comment in comment_nodes
        if comment['@rating'] != 'N/A'  # comments without a numeric rating are not reviews
    ]
    if review_rows:
        # Only emit the header when the file is being created by this write.
        include_header = not os.path.isfile(REVIEW_DATA_PATH)
        pd.DataFrame(review_rows).to_csv(REVIEW_DATA_PATH, index=False, header=include_header, mode='a')

    ## Fill in the missing data for the game info data
    detail_values = [
        item.get('image'),        # not every game has an image
        item.get('description'),
        int(item['yearpublished']['@value']),
        int(item['playingtime']['@value']),
        int(item['minplaytime']['@value']),
        int(item['maxplaytime']['@value']),
        float(rating_stats['averageweight']['@value']),
    ]

    existing_rows = game_df['id'] == game_id
    if existing_rows.any():
        game_df.loc[existing_rows, GAME_DETAIL_COLUMNS] = detail_values
    else:
        names = _as_list(item['name'])
        # Prefer the primary name; fall back to the first listed name.
        primary_name = next((n['@value'] for n in names if n.get('@type') == 'primary'), names[0]['@value'])
        new_game_rows.append({
            "id": game_id,
            "name": primary_name,
            # Stored as numbers (not the raw XML strings) so the columns keep a numeric dtype.
            "avg_rating": float(rating_stats['average']['@value']),
            "num_ratings": num_ratings,
            **dict(zip(GAME_DETAIL_COLUMNS, detail_values)),
        })

# ids_to_scrape is unique, so a new game can only be queued once; add all new
# rows with a single concat instead of re-copying game_df on every iteration.
if new_game_rows:
    game_df = pd.concat([game_df, pd.DataFrame(new_game_rows)], ignore_index=True)


 88%|████████▊ | 66/75 [00:29<00:04,  2.03it/s]

taking a break...


 92%|█████████▏| 69/75 [00:36<00:07,  1.22s/it]

taking a break...


 96%|█████████▌| 72/75 [00:42<00:04,  1.44s/it]

taking a break...


100%|██████████| 75/75 [00:49<00:00,  1.52it/s]


In [193]:
# Turn the scraped {game_id: {attribute: [values]}} mapping into a table with
# one row per game and one (singular-named) column per attribute kind.
_singular_column_names = {
    "Mechanics": "Mechanic",
    "Categories": "Category",
    "Families": "Family",
    "Artists": "Artist",
    "Designers": "Designer",
    "Publishers": "Company",
}
edge_df = pd.DataFrame(edges).T.rename(columns=_singular_column_names)

# Re-read the graph from disk so the merge below starts from the saved state.
graph_df = pd.read_csv("../data/graph.csv")

_still_missing = game_df['long_description'].isna().sum()
_total_games = len(game_df)
_unique_games = len(game_df['id'].unique())
print(f"Now only {_still_missing} out of {_total_games} games have missing data\nUnique games: {_unique_games}")

Now only 72 out of 2116 games have missing data
Unique games: 2116


In [194]:
# Persist the refreshed game table, overwriting the file it was loaded from.
game_df.to_csv(
    path_or_buf="../data/game_info.csv",
    index=False,
)

In [195]:
# Edge label written to the graph for each attribute column of the edge table.
edgeTypeMap = dict(
    Mechanic="hasMechanic",
    Category="hasCategory",
    Company="hasPublisher",
    Family="hasFamily",
    Artist="hasArtist",
    Designer="hasDesigner",
)

# Node type of the edge target for each attribute column; designers and
# artists are both stored as "Person" nodes.
targetTypeMap = dict(
    Mechanic='Mechanic',
    Category='Category',
    Family='Family',
    Designer='Person',
    Company='Company',
    Artist='Person',
)

In [196]:
# Flatten the edge table into one record per (game, attribute value) pair,
# walking column by column, then game by game, then value by value.
new_graph = [
    {
        "game_id": game_id,
        "target": target,
        "target_type": targetTypeMap[target_type],
        "edge_type": edgeTypeMap[target_type],
        "weight": 1,
    }
    for target_type, game_info in edge_df.to_dict().items()
    for game_id, target_list in game_info.items()
    for target in target_list
]
    

In [197]:
# Merge the freshly scraped edges into the saved graph, dropping exact
# duplicate rows, then write the result back over the graph file.
_scraped_edges_df = pd.DataFrame(new_graph)
combined_graph_df = pd.concat([graph_df, _scraped_edges_df]).drop_duplicates()
combined_graph_df.to_csv("../data/graph.csv", index=False)

_edges_added = len(combined_graph_df) - len(graph_df)
print(f"Added {_edges_added} attribute edges to the graph")

Added 299 attribute edges to the graph


In [198]:
# Reload the reviews file (the scraping cell appended to it on disk) and keep
# only the most recently written review for each (reviewer, game) pair.
review_df = pd.read_csv("../data/reviews.csv")
filtered_reviews = review_df.drop_duplicates(
    subset=['source_id', 'target_id'],
    keep='last',
)
filtered_reviews.to_csv("../data/reviews.csv", index=False)

_duplicates_removed = len(review_df) - len(filtered_reviews)
print(f"Removed {_duplicates_removed} duplicate reviews")

Removed 2346 duplicate reviews
