In [1]:
!pip install PyPDF2



In [2]:
#Given the whole pdf return the index of the summary page
def get_summary_page(pages):
    """Return the index of the first page whose text contains ``'Page 1:'``.

    The summary page of a rulebook is recognised by the substring
    ``'Page 1:'`` in its extracted text.

    Args:
        pages: sequence of page objects exposing ``extract_text()``.

    Returns:
        The 0-based index of the first matching page, or ``0`` when no page
        matches (including when ``pages`` is empty).
    """
    # Iterate over the argument, not over a global reader, so the function
    # works for whichever document the caller hands in.
    for i, page in enumerate(pages):
        # Treat a page without extractable text as empty instead of failing.
        text = page.extract_text() or ''

        #Using substring "Page 1:" to check if the page is the one with the summary
        if 'Page 1:' in text:
            return i

    return 0

#Split a "publisher (year)" field into its two parts
def _split_publisher_and_year(field):
    """Return ``(publisher, year)`` parsed from a ``publisher (year)`` string.

    The year is the content of the LAST parenthesised group, so a publisher
    name that itself contains parentheses is kept intact. When the field has
    no trailing ``(...)`` group the whole field is the publisher and the year
    is ``None``.
    """
    match = re.search(r'^(.*)\(([^()]*)\)$', field)
    if match is None:
        return field.strip().upper(), None
    return match.group(1).strip().upper(), match.group(2).strip()


#Given the summary page return the name, the year and the publisher of the game
def get_game_info(page):
    """Extract the game name, year and publisher from the summary page text.

    Supported layouts (each field must be followed by a newline):
        Game: name
        Publisher: publisher (year)   |  Publisher: publisher
        Pub: ©year publisher          |  Pub: publisher (year)  |  Pub: publisher

    Args:
        page: the extracted text of the summary page.

    Returns:
        A ``(name, year, publisher)`` tuple. ``name`` and ``publisher`` are
        upper-cased; ``year`` is a string. ``year`` and/or ``publisher`` are
        ``None`` when they cannot be retrieved.

    Raises:
        AttributeError: if the text has no ``Game:`` line.
    """
    name = re.search(r'Game:(.*)\n', page).group(1).strip().upper().rstrip(":")

    #Impossible to retrieve year and publisher unless a branch below succeeds
    publisher = None
    year = None

    publisher_line = re.search(r'Publisher:(.*)\n', page)
    pub_line = re.search(r'Pub:(.*)\n', page)

    #Publisher: ....
    if publisher_line:
        with_year = re.search(r'Publisher:(.*)\)\n', page)
        #Publisher: publisher (year)
        if with_year:
            publisher, year = _split_publisher_and_year(with_year.group(1).strip() + ')')
        #Publisher: publisher
        else:
            publisher = publisher_line.group(1).strip().upper()
    #Pub: ...
    elif pub_line:
        # Runs of blanks are collapsed first so "Pub:   ©" is still recognised.
        copyright_line = re.search(r'Pub: ©(.*)\n', re.sub(' +', ' ', page))
        with_year = re.search(r'Pub:(.*)\)\n', page)
        #Pub: ©year publisher
        if copyright_line:
            publisher_and_year = copyright_line.group(1).strip()
            # The first four characters are the year, then one separator
            # character, then the publisher.
            publisher = publisher_and_year[5:].upper()
            year = publisher_and_year[:4]
        #Pub: publisher (year)
        elif with_year:
            publisher, year = _split_publisher_and_year(with_year.group(1).strip() + ')')
        #Pub: publisher
        else:
            publisher = pub_line.group(1).strip().upper()

    return (name, year, publisher)

#From the original pdf return only those pages that contain rules
def get_rules(current_index, pages):
    """Return the flattened text of the rules pages that follow the summary page.

    The summary page is scanned line by line for entries of the form
    ``Page N: Rules`` (runs of blanks are collapsed before matching). The N of
    the last such entry is taken as the number of pages, starting right after
    the summary page, that contain rules.

    Args:
        current_index: index of the summary page inside ``pages``.
        pages: sequence of page objects exposing ``extract_text()``.

    Returns:
        The text produced by ``get_text`` for those pages; the slice is empty
        when the summary lists no ``Page N: Rules`` entry.
    """
    rules_pages = 0

    for line in pages[current_index].extract_text().split('\n'):
        # One or more digits, so that "Page 10: Rules" and above are
        # recognised as well.
        match = re.search(r'Page ([0-9]+): Rules', re.sub(' +', ' ', line))
        if match:
            rules_pages = int(match.group(1))

    first_rules_page = current_index + 1
    return get_text(pages[first_rules_page: first_rules_page + rules_pages])

#Merge the text of n pages into one lower-case, single-line string
def get_text(pages):
    """Concatenate the extracted text of ``pages`` into a single string.

    For every page, each newline is replaced by a blank, runs of blanks are
    collapsed to one blank and the result is lower-cased. The per-page
    strings are joined with no separator between them.
    """
    cleaned_pages = []
    for page in pages:
        single_line = page.extract_text().replace('\n', ' ')
        collapsed = re.sub(' +', ' ', single_line)
        cleaned_pages.append(collapsed.lower())
    return "".join(cleaned_pages)

#Pack the game information and its rules into one dictionary
def get_dict(game_info, rules):
    """Build the record for one game.

    ``game_info`` is indexed positionally: item 0 is the name, item 1 the
    year and item 2 the publisher. ``rules`` is stored under ``"rules"``.
    """
    record = dict(zip(("name", "year", "publisher"),
                      (game_info[0], game_info[1], game_info[2])))
    record["rules"] = rules
    return record

In [None]:
import os
from PyPDF2 import PdfReader
import re

# Folder that holds one rulebook PDF per game.
PATH = './EOOG_rulebooks/'

# Skip the macOS '.DS_Store' entry by filtering: list.remove() would raise
# ValueError on machines where that file does not exist. Sorting keeps the
# processing order deterministic.
files = sorted(entry for entry in os.listdir(PATH) if entry != '.DS_Store')

# One dictionary (name, year, publisher, rules) per successfully parsed game.
list_of_dict = list()

for file in files:
    file_path = os.path.join(PATH, file)
    print(file)

    reader = PdfReader(file_path)

    summary_page = get_summary_page(reader.pages)
    game_info = get_game_info(reader.pages[summary_page].extract_text())
    rules = get_rules(summary_page, reader.pages)

    # Games for which no rules text could be extracted are left out.
    if rules != '':
        list_of_dict.append(get_dict(game_info, rules))

In [None]:
import requests
import xml.etree.ElementTree as ET
import difflib
import csv


def get_id_by_name(name):
    """Look up the BoardGameGeek object id of the game called ``name``.

    An exact-name search is tried first; when it yields no id, the lookup
    falls back to ``not_exact_search``.

    Args:
        name: the game name to search for.

    Returns:
        The ``objectid`` attribute (a string) of the last ``boardgame``
        element in the exact-search response, otherwise whatever
        ``not_exact_search(name)`` returns (``-1`` when nothing is found).
    """
    # Let requests build the query string: names containing '&', '#', '+'
    # or blanks are encoded instead of corrupting the query.
    response = requests.get(
        'https://boardgamegeek.com/xmlapi/search',
        params={'search': name, 'exact': 1},
    )
    bgg_id = -1

    if response.status_code == 200:
        # Only 'start' events are needed: the id is an attribute of the
        # opening <boardgame> tag.
        parser = ET.XMLPullParser(['start'])
        parser.feed(response.text)

        for _event, elem in parser.read_events():
            if elem.tag == 'boardgame':
                bgg_id = elem.get('objectid')

    if bgg_id == -1:
        bgg_id = not_exact_search(name)

    return bgg_id

def get_weight_by_id(id):
    """Fetch the ``averageweight`` value of the game with the given id.

    Args:
        id: the game's object id; it is converted with ``str()`` so both
            strings and integers are accepted.

    Returns:
        The ``value`` attribute (a string) of the last ``averageweight``
        element in the response, or ``0.0`` when the request does not return
        status 200 or the response has no such element.
    """
    # NOTE(review): BASE_PATH is not defined in the visible cells; it must
    # exist as a module-level constant before this function is called.
    test_url = BASE_PATH + "thing?id=" + str(id) + "&stats=1"

    response = requests.get(test_url)
    weight = 0.0

    if response.status_code == 200:
        # Only 'start' events are needed: the weight is an attribute of the
        # opening <averageweight> tag.
        parser = ET.XMLPullParser(['start'])
        parser.feed(response.text)

        for _event, elem in parser.read_events():
            if elem.tag == 'averageweight':
                weight = elem.get('value')

    # Returned on every path, so a failed request yields the 0.0 default
    # rather than an implicit None.
    return weight
        
            
#search by name without the exact attribute. Three possibilities:
# 1. Find nothing and so removing the record from the dataset
# 2. Find only 1 game and so this is the one that I have been searching for
# 3. Find more than 1 game for an entry. Choose between them the most similar
def not_exact_search(name):
    """Search BoardGameGeek for ``name`` without requiring an exact match.

    Args:
        name: the game name to search for.

    Returns:
        * ``-1`` when the request fails or no game is found, or when several
          games are found but none has a name similar enough to ``name``
          (difflib cutoff 0.7);
        * the ``objectid`` of the game when exactly one is found;
        * the ``objectid`` of the closest-named game when several are found.
    """
    # Let requests build the query string so special characters in the name
    # are encoded.
    response = requests.get(
        'https://boardgamegeek.com/xmlapi/search',
        params={'search': name},
    )
    if response.status_code != 200:
        return -1

    parser = ET.XMLPullParser(['start'])
    parser.feed(response.text)

    # Single pass over the response: collect every game id and map each
    # lower-cased name to the id of the <boardgame> it appears under.
    found_ids = []
    possible_games = dict()
    current_id = 0
    for _event, elem in parser.read_events():
        if elem.tag == 'boardgame':
            current_id = elem.get('objectid')
            found_ids.append(current_id)
        elif elem.tag == 'name' and elem.text:
            # A later game with the same name overwrites an earlier one.
            possible_games[elem.text.lower()] = current_id

    # 2. exactly one game: it is the one being searched for
    if len(found_ids) == 1:
        return found_ids[0]

    # 3. several games: keep the single most similar name, if any is close enough
    if len(found_ids) > 1:
        closest = difflib.get_close_matches(name.lower(), list(possible_games), 1, 0.7)
        if closest:
            return possible_games[closest[0]]

    # 1. nothing usable found
    return -1

In [None]:
# Games that could be matched to a BoardGameGeek entry, enriched with the
# BGG id and weight. Initialised here so the appends below have a list to
# write to.
updated_dataset = []

for d in list_of_dict:
    bbg_id = get_id_by_name(d.get('name'))
    # -1 means no BoardGameGeek match: such games are left out of the dataset.
    if bbg_id != -1:
        weight = get_weight_by_id(bbg_id)
        d['weight'] = weight
        d['id'] = bbg_id
        updated_dataset.append(d)

In [None]:
import pandas as pd

# One row per enriched game dictionary.
df = pd.DataFrame(updated_dataset)

# Final column order of the dataset.
new_cols = ["id", "name", "year", "publisher", "weight", "rules"]

# Keep the first record of every game name.
df = df.drop_duplicates(subset=['name'], keep='first')

# reindex keeps exactly the columns in new_cols, in that order, so any other
# column is discarded here. No positional drop of the first column is done:
# the dictionaries start with the "name" key, and removing that column would
# make the de-duplication above fail.
df = df.reindex(columns=new_cols)
df

In [None]:
# Persist the final dataset in the current working directory. to_csv is called
# with its default options, so the DataFrame index is also written, as the
# first (unnamed) column of the file.
df.to_csv("bbg_boardgames_dataset.csv")