In [32]:
import csv
import difflib
import xml.etree.ElementTree as ET

import pandas as pd
import requests


def get_id_by_name(name, timeout=30):
    """Look up the BoardGameGeek id of the game called ``name``.

    An exact-name search is tried first; when it yields no id (no match or a
    non-200 response) the lookup falls back to ``not_exact_search``.

    Args:
        name: Title of the game to search for.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises, instead of hanging indefinitely.

    Returns:
        The ``objectid`` attribute (a string) of the last ``boardgame``
        element in the exact-search response; otherwise whatever
        ``not_exact_search(name)`` returns. ``-1`` is returned without any
        request when ``name`` is empty or ``None``.
    """
    if not name:
        # Nothing to search for (e.g. a CSV row without a name).
        return -1

    # Let requests build the query string so that characters such as '&',
    # '#' or '+' inside a title are percent-encoded instead of corrupting
    # the URL.
    response = requests.get(
        'https://boardgamegeek.com/xmlapi/search',
        params={'search': name, 'exact': 1},
        timeout=timeout,
    )
    bgg_id = -1

    if response.status_code == 200:
        parser = ET.XMLPullParser(['start'])
        parser.feed(response.text)

        # If several games share the exact name, the last one listed wins.
        for _event, elem in parser.read_events():
            if elem.tag == 'boardgame':
                bgg_id = elem.get('objectid')

    if bgg_id == -1:
        bgg_id = not_exact_search(name)

    return bgg_id

def get_weight_by_id(id, timeout=30):
    """Fetch the average weight of a game from the BoardGameGeek XML API 2.

    Args:
        id: BoardGameGeek id of the game; a string or an integer.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises, instead of hanging indefinitely.

    Returns:
        The ``value`` attribute of the last ``averageweight`` element in the
        response, as the string found in the XML (``None`` if the element has
        no such attribute). ``0.0`` is returned when the response status is
        not 200 or no ``averageweight`` element is present.
    """
    # Passing the id through `params` both URL-encodes it and accepts
    # integers, which plain string concatenation would reject.
    response = requests.get(
        'https://boardgamegeek.com/xmlapi2/thing',
        params={'id': id, 'stats': 1},
        timeout=timeout,
    )
    weight = 0.0

    if response.status_code == 200:
        parser = ET.XMLPullParser(['start'])
        parser.feed(response.text)

        for _event, elem in parser.read_events():
            if elem.tag == 'averageweight':
                weight = elem.get('value')

    return weight
        
            
def not_exact_search(name, cutoff=0.7, timeout=30):
    """Search BoardGameGeek by name without the ``exact`` flag.

    Three outcomes are possible:

    1. Nothing is found: ``-1`` is returned so the caller can drop the record.
    2. Exactly one game is found: it is taken to be the game searched for.
    3. More than one game is found: the one whose name is most similar to
       ``name`` (case-insensitive) is chosen, provided the similarity
       reaches ``cutoff``.

    Args:
        name: Title of the game to search for.
        cutoff: Minimum ``difflib`` similarity (0..1) a candidate name must
            reach when several games are returned.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises, instead of hanging indefinitely.

    Returns:
        The ``objectid`` attribute (a string) of the selected game, or ``-1``
        when no game is selected, the response status is not 200, or ``name``
        is empty.
    """
    if not name:
        return -1

    # Let requests percent-encode the title instead of concatenating it raw.
    response = requests.get(
        'https://boardgamegeek.com/xmlapi/search',
        params={'search': name},
        timeout=timeout,
    )
    if response.status_code != 200:
        return -1

    parser = ET.XMLPullParser(['start'])
    parser.feed(response.text)

    found_ids = []        # objectid of every <boardgame>, in document order
    possible_games = {}   # lower-cased game name -> objectid
    current_id = -1       # id of the <boardgame> currently being read

    # A single pass gathers everything needed for all three outcomes. The
    # whole document has been fed before events are read, so the text of
    # each element is already available on its 'start' event.
    for _event, elem in parser.read_events():
        if elem.tag == 'boardgame':
            current_id = elem.get('objectid')
            found_ids.append(current_id)
        elif elem.tag == 'name' and elem.text:
            # Empty <name> elements carry no text and cannot be compared.
            possible_games[elem.text.lower()] = current_id

    if len(found_ids) == 1:
        # Only one candidate: accept it regardless of name similarity.
        return found_ids[0]

    if len(found_ids) > 1:
        matches = difflib.get_close_matches(
            name.lower(), list(possible_games), 1, cutoff
        )
        if matches:
            return possible_games[matches[0]]

    return -1

In [3]:
# Load every row of the CSV as a dict keyed by the header's column names.
# newline='' hands line-ending handling to the csv module, which is required
# for quoted fields that contain embedded newlines to be read back intact.
with open('dataset_from_pdfs.csv', newline='') as csvfile:
    dataset = list(csv.DictReader(csvfile))

In [34]:
# Enrich each record with its BoardGameGeek id and weight, keeping only the
# records for which both could be retrieved.
updated_dataset = []

for record in dataset:
    bgg_id = get_id_by_name(record.get('name'))
    # Keep the record only if it was possible to retrieve the id.
    if bgg_id == -1:
        continue

    weight = get_weight_by_id(bgg_id)
    if weight is None:
        # No usable weight came back for this id: report it and treat the
        # weight as 0 so the record is filtered out below.
        print(bgg_id)
        weight = 0
    else:
        weight = float(weight)

    # Keep the record only if the weight is not 0.
    if weight != 0:
        # Build a new dict rather than mutating the row held in `dataset`,
        # so re-running this cell always starts from the unmodified input.
        updated_dataset.append({**record, 'weight': weight, 'id': bgg_id})

In [35]:
# Column order of the exported dataset.
new_cols = ["id", "name", "year", "publisher", "weight", "rules", "raw_rules"]

# Fix the column set first so that de-duplicating on 'name' also works when
# `updated_dataset` is empty (a frame built from no records has no 'name'
# column and drop_duplicates would raise KeyError). Only the first row of
# each game name is kept.
df = (
    pd.DataFrame(updated_dataset)
    .reindex(columns=new_cols)
    .drop_duplicates(subset=['name'], keep='first')
)
df

Unnamed: 0,id,name,year,publisher,weight,rules,raw_rules
0,367525,3000 SCOUNDRELS,2022,UNEXPECTED GAMES,2.1667,"setup before your first game, place all clear ...","SETUP\nBefore your first game, place all clear..."
1,154173,7 RONIN,2013,GREY FOX GAMES,2.2105,setup choose or determine randomly who is the ...,SETUP\nChoose or determine randomly who is the...
2,68448,7 WONDERS,2010,ASMODEE,2.3211,"setup depending on the number of players, retu...","Setup\nDepending on the number of players, ret..."
3,141517,A STUDY IN EMERALD,2013,TREEFROG GAMES,3.4538,setup shuffle the secret identity cards and de...,SETUP\nShuffle the Secret Identity cards and d...
4,253499,A WAR OF WHISPERS,2020,STARLING GAMES,2.5488,setup each player chooses a matching player bo...,SETUP\nEach player chooses a matching player b...
...,...,...,...,...,...,...,...
316,146451,WRATH OF KINGS,2014,COOL MINI OR NOT,2.9000,basics distances (in inches) may be measured a...,BASICS\nDistances (in inches) may be measured ...
318,8593,WRECKAGE,2003,FANTASY FLIGHT GAMES,1.7222,setup 1. starting with a randomly chosen first...,Setup\n1. Starting with a randomly chosen firs...
319,38868,ZANZIAR,2009,DRAGON DAWN PRODUCTIONS,3.5625,setup each player takes: 3 pairs of adventurin...,Setup\nEach player takes:\n3 pairs of Adventur...
320,2471,ZOMBIES!!!,2001,"TWILIGHT CREATIONS, INC.",1.6142,setup place the town square tile in the centre...,SETUP\nPlace the Town Square tile in the centr...


In [36]:
# Persist the final table; the DataFrame index is written as the first,
# unnamed column of the file.
df.to_csv(path_or_buf="boardgames_dataset_NEW.csv", index=True)