In [11]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import random
import time
import logging

In [28]:
# Route all log records (DEBUG and above) to a file next to the notebook.
logging.basicConfig(
    level=logging.DEBUG,
    filename='api_scraper.log',
)

In [9]:
def calculate_pagecount(total_items, items_per_page=100):
    """Return how many pages are needed to list ``total_items`` entries.

    Parameters
    ----------
    total_items : int or str
        Number of items. The value is passed through ``int()``, so numeric
        strings (e.g. an XML attribute value) are accepted.
    items_per_page : int, default 100
        Number of items shown on a single page.

    Returns
    -------
    int
        ``total_items / items_per_page`` rounded up to the next whole page.

    Raises
    ------
    ValueError
        If ``items_per_page`` is smaller than 1, or ``total_items`` cannot be
        converted to an integer.
    """
    if items_per_page < 1:
        raise ValueError('items_per_page must be a positive integer')
    # Floor-dividing the negated count and negating again rounds up without
    # going through floats.
    return -(-int(total_items) // items_per_page)

## Boardgame Classification

In [35]:
# Id of the board game explored in this notebook; also used to tag the rating rows.
ID = 306882

In [101]:
# Build the URL from ID so the request always matches the game selected above
# (the f-string previously had no placeholder and hard-coded the id).
api_adress = f"https://www.boardgamegeek.com/xmlapi2/thing?id={ID}&ratingcomments=1&page=1"

# A timeout keeps the notebook from hanging forever if the API never answers.
response = requests.get(api_adress, timeout=30)
response.status_code

200

In [102]:
# Keep the raw XML body of the API response as text.
xml = response.text

In [103]:
# Parse the response body with BeautifulSoup's XML parser.
soup = BeautifulSoup(xml, 'xml')

In [104]:
# Print the indented document to inspect which elements and attributes it contains.
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<items termsofuse="https://boardgamegeek.com/xmlapi/termsofuse">
 <item id="306882" type="boardgame">
  <thumbnail>
   https://cf.geekdo-images.com/yE1xMCPzckKh9V9CStNFEw__thumb/img/zFQhF83FC3HYETEMz3R8_R0B2mQ=/fit-in/200x150/filters:strip_icc()/pic5350626.jpg
  </thumbnail>
  <image>
   https://cf.geekdo-images.com/yE1xMCPzckKh9V9CStNFEw__original/img/cpsGDBjzoKWhE8u5SojuSYafNmg=/0x0/filters:format(jpeg)/pic5350626.jpg
  </image>
  <name sortindex="1" type="primary" value="Railroad Ink Challenge: Shining Yellow Edition"/>
  <name sortindex="1" type="alternate" value="Railroad Ink Challenge: Edition Sonnengelb"/>
  <name sortindex="1" type="alternate" value="Railroad Ink Challenge: Goudgele Versie"/>
  <description>
   Railroad Ink Challenge is a quick-playing roll-and-write game for 1 to 4 players. Grab a board and a dry-erase marker, and get ready to reach networking nirvana! Roll the dice and draw the routes to connect the exits around your boa

In [72]:
# Look for a <comments> element whose "page" attribute is 2.
# The attribute name must be the string 'page'; the bare name `page` used the
# value of a variable (or raised NameError on a fresh kernel) as the filter key.
soup.find_all('comments', attrs={'page': '2'})

[]

In [None]:
# Names of all <link type="boardgamecategory"> entries of the game.
categories = [
    node.get('value')
    for node in soup.find_all('link', {'type': 'boardgamecategory'})
]
categories

In [None]:
# Names of all <link type="boardgamemechanic"> entries of the game.
mechanics = [
    node.get('value')
    for node in soup.find_all('link', {'type': 'boardgamemechanic'})
]
mechanics

In [None]:
# Names of all <link type="boardgamefamily"> entries of the game.
family = [
    node.get('value')
    for node in soup.find_all('link', {'type': 'boardgamefamily'})
]
family

In [None]:
# Names of all <link type="boardgameexpansion"> entries of the game.
expansions = [
    node.get('value')
    for node in soup.find_all('link', {'type': 'boardgameexpansion'})
]
expansions

In [None]:
# Names of all <link type="boardgameintegration"> entries of the game.
integrations = [
    node.get('value')
    for node in soup.find_all('link', {'type': 'boardgameintegration'})
]
integrations

In [None]:
# Names of all <link type="boardgamedesigner"> entries of the game.
designers = [
    node.get('value')
    for node in soup.find_all('link', {'type': 'boardgamedesigner'})
]
designers

In [None]:
# Names of all <link type="boardgamepublisher"> entries of the game.
publishers = [
    node.get('value')
    for node in soup.find_all('link', {'type': 'boardgamepublisher'})
]
publishers

## User Ratings

In [None]:
# Request the first page of rating comments for the game identified by ID.
# Using ID here keeps the request in sync with the id written into the
# ratings rows further down (the URL previously hard-coded 306882).
api_adress = f"https://www.boardgamegeek.com/xmlapi2/thing?id={ID}&ratingcomments=1&page=1"

# A timeout keeps the notebook from hanging forever if the API never answers.
response = requests.get(api_adress, timeout=30)
response.status_code

In [None]:
# Keep the raw XML text and parse it with BeautifulSoup's XML parser.
xml = response.text
soup = BeautifulSoup(xml, 'xml')

In [None]:
#print(soup.prettify())

In [None]:
# Page number reported by the first <comments> element of the response.
page = soup.find('comments').get('page')
page

In [None]:
# Total number of rating comments, read from the "totalitems" attribute of <comments>.
total_items = soup.find('comments').get('totalitems')
total_items

In [None]:
# Number of pages needed to list all rating comments (100 per page, see calculate_pagecount).
pagecount = calculate_pagecount(total_items)
pagecount

In [None]:
# Collect rating, username and comment text of every <comment> element,
# keeping the three lists aligned by position.
ratings, users, comments = [], [], []
for comment in soup.find_all('comment'):
    ratings += [comment.get('rating')]
    users += [comment.get('username')]
    comments += [comment.get('value')]
    

In [None]:
# Repeat the game id once per collected rating so it can become a column.
boardgame_id = [ID] * len(ratings)

In [None]:
# Assemble the collected lists into one table with a row per rating comment.
# Note: this rebinds `ratings` from the list of rating values to the DataFrame.
ratings = pd.DataFrame(
    dict(id=boardgame_id, user=users, rating=ratings, comment=comments)
)

In [None]:
# Count the rows whose rating is missing.
ratings['rating'].isna().sum()

To do:
- Loop through all rating pages
- Ignore users without ratings
- User IDs?
- Include logging
- Append to the CSV file instead of to a DataFrame
- Check the request status

## Loop over rating pages

In [29]:
# Load the game list, indexed by its `id` column.
boardgames = pd.read_csv('../data/boardgames.csv', index_col='id')

# Add empty placeholder columns that the scraping loop fills in per game.
for new_column in ('categories', 'mechanics', 'family', 'expansions',
                   'integrations', 'designers', 'publishers'):
    boardgames[new_column] = None

# Base pause in seconds between two API requests.
sleep_default = 2

In [34]:
MAX_RETRIES = 5        # attempts per page before the page is given up
REQUEST_TIMEOUT = 30   # seconds before a request that never answers is aborted
RETRY_SLEEP = 10       # seconds to wait after a failed request

# Column in `boardgames` -> value of the type attribute of the matching <link> elements.
LINK_TYPES = {
    'categories': 'boardgamecategory',
    'mechanics': 'boardgamemechanic',
    'family': 'boardgamefamily',
    'expansions': 'boardgameexpansion',
    'integrations': 'boardgameintegration',
    'designers': 'boardgamedesigner',
    'publishers': 'boardgamepublisher',
}


def _fetch_page(game_id, page):
    """Request one page of rating comments for a game and return it parsed.

    A non-200 status or a network error is retried up to MAX_RETRIES times
    with RETRY_SLEEP seconds in between, so a permanent failure can no longer
    block the scrape forever. After the first non-200 answer the global
    `sleep_default` is raised to 5 seconds to slow down all later requests.

    Returns the parsed document, or None when every attempt failed.
    """
    global sleep_default

    api_adress = f"https://www.boardgamegeek.com/xmlapi2/thing?id={game_id}&ratingcomments=1&page={page}"
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(api_adress, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            logging.warning(f'request failed ({error}) for {api_adress}, attempt {attempt}/{MAX_RETRIES}')
            time.sleep(RETRY_SLEEP)
            continue

        logging.info(f'status {response.status_code} for {api_adress}')
        if response.status_code == 200:
            return BeautifulSoup(response.text, 'xml')

        logging.warning(f'STATUS {response.status_code}, increase sleep time and try again')
        time.sleep(RETRY_SLEEP)
        sleep_default = 5

    logging.error(f'giving up on {api_adress} after {MAX_RETRIES} attempts')
    return None


def _append_ratings(game_id, soup):
    """Append every <comment> of a parsed page as one row to ../data/ratings.csv."""
    page_ratings = pd.DataFrame(
        [
            {
                'id': game_id,
                'user': comment.get('username'),
                'rating': comment.get('rating'),
                'comment': comment.get('value'),
            }
            for comment in soup.find_all('comment')
        ],
        # Fix the column order so it stays stable even for a page without comments.
        columns=['id', 'user', 'rating', 'comment'],
    )
    page_ratings.to_csv('../data/ratings.csv', index=False, header=False, mode='a')


for ID in [4974, 6927, 8139, 8993, 13121]:
    logging.debug(f'boardgame id: {ID}')

    # Page 1 carries the game overview as well as the first batch of ratings.
    soup = _fetch_page(ID, 1)
    comments_tag = soup.find('comments') if soup is not None else None
    if comments_tag is None:
        # Without a <comments> element there is nothing to count or to store.
        logging.error(f'no rating comments available for boardgame {ID}, skipping')
        continue

    # Get the number of pages of ratings.
    total_items = int(comments_tag.get('totalitems'))
    logging.debug(f'total items: {total_items}')
    pagecount = calculate_pagecount(total_items)
    logging.debug(f'pagecount: {pagecount}')
    logging.debug('start scraping page: 1')

    # Extend the boardgames dataframe with the comma-separated link values.
    for column, link_type in LINK_TYPES.items():
        values = [link.get('value') for link in soup.find_all('link', attrs={'type': link_type})]
        boardgames.loc[ID, column] = ', '.join(values)

    # Update the number of voters.
    boardgames.loc[ID, 'num_voters'] = total_items

    # The game id lives in the index, so the index has to be written out too;
    # index=False produced a file without any id column.
    boardgames.to_csv('../data/boardgames_extend.csv')

    _append_ratings(ID, soup)
    # Pause after the first request as well, so every request is throttled.
    time.sleep(sleep_default + random.random())

    # If there is more than one page of comments, request each remaining page.
    for page in range(2, pagecount + 1):
        logging.debug(f'start scraping page: {page}')
        soup = _fetch_page(ID, page)
        if soup is None:
            logging.error(f'page {page} of boardgame {ID} could not be retrieved, skipping')
            continue

        _append_ratings(ID, soup)
        time.sleep(sleep_default + random.random())


In [107]:
# Index labels from the row labelled 822 to the end of the frame (label-based slice).
boardgames.loc[822:].index

Int64Index([   822,     13,  68448,  36218,   9209, 178900, 167791,  31260,
              3076, 173346,
            ...
            137184, 137355, 137427, 138223, 138518, 138578, 138730, 139581,
            139637, 139930],
           dtype='int64', name='id', length=44999)