In [17]:
import os
import time

import numpy as np
import pandas as pd
import requests

# https://github.com/bubblebooy/chi20_ds14/blob/master/curriculum/project-02/web-scraping-beautifulsoup/web_scraping_beautifulsoup.ipynb
from bs4 import BeautifulSoup

# https://github.com/bubblebooy/chi20_ds14/blob/master/curriculum/project-02/web-scraping-selenium/web_scraping_selenium.ipynb
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from tqdm.notebook import tqdm

# Path to the chromedriver executable. Written as a raw string so the Windows
# backslashes are taken literally instead of being read as (invalid) escape
# sequences such as "\P" or "\G".
chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# Helper Functions

In [18]:
def scrape_browse(page = 1, timeout = 30):
    """
    Download one BGG "browse board games" page and return it parsed.

    The browse listing is requested sorted by descending number of votes.

    Parameters
    ----------
    page : int
        Page number inserted into the browse URL.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` can block forever on a stalled connection.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the 'lxml' parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page is
        never parsed as if it were a result listing.
    requests.Timeout
        If no response arrives within `timeout` seconds.
    """
    url = f'https://boardgamegeek.com/browse/boardgame/page/{page}?sort=numvoters&sortdir=desc'
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # The body is kept in `response` rather than rebinding the `page` argument.
    return BeautifulSoup(response.text, 'lxml')

In [19]:
def clean_column_str(col):
    """
    Return a copy of a string column with every tab and newline character
    deleted and leading/trailing whitespace stripped from each value.
    """
    cleaned = col
    # Delete tabs first, then newlines, exactly one character kind per pass.
    for unwanted_char in ("\t", "\n"):
        cleaned = cleaned.str.replace(unwanted_char, "")
    return cleaned.str.strip()

In [20]:
def list_get_text(soup_list):
    """
    Collect the `.text` of every element in an iterable of html elements.
    The texts are returned as a new list, in the same order as the input.
    """
    texts = []
    for element in soup_list:
        texts.append(element.text)
    return texts

In [21]:
def browse_html_to_df(soup):
    """
    Turn the parsed html of a BGG browse page into a DataFrame, one row per game.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed browse page, e.g. the value returned by `scrape_browse`.

    Returns
    -------
    pandas.DataFrame
        Columns: title, link, date, BGG_rank, geek_rating, avg_rating,
        num_votes. All values are strings. Rows whose rank is 'N/A' and rows
        with missing values are removed, and the index is renumbered 0..n-1
        so that positional and label-based row access agree for callers.

    Raises
    ------
    ValueError
        Raised by pandas if the selectors do not all match the same number
        of elements.
    """
    title_links = soup.select(".collection_objectname a")
    # Every game row carries three rating cells, read here in the order
    # geek rating, average rating, number of votes. Select them once per row.
    rating_cells = [row.select(".collection_bggrating") for row in soup.select("tr#row_")]

    _df = pd.DataFrame()
    _df["title"] = [element.text for element in title_links]
    _df["link"] = [element.get("href") for element in title_links]
    _df["date"] = [element.text for element in soup.select(".collection_objectname span")]
    _df["BGG_rank"] = [element.text for element in soup.select(".collection_rank")]  # so many tabs
    _df["geek_rating"] = [cells[0].text for cells in rating_cells]
    _df["avg_rating"] = [cells[1].text for cells in rating_cells]
    _df["num_votes"] = [cells[2].text for cells in rating_cells]

    # regex=False: "(" and ")" must be removed as literal characters; the
    # default of the `regex` argument differs between pandas versions.
    _df["date"] = _df["date"].str.replace("(", "", regex=False).str.replace(")", "", regex=False)
    for column in ("BGG_rank", "geek_rating", "avg_rating", "num_votes"):
        _df[column] = clean_column_str(_df[column])

    # Games with no rank are dropped (per the original note these are expansions).
    _df = _df[_df.BGG_rank != 'N/A']
    _df = _df.dropna()
    # Filtering leaves gaps in the index labels; renumber so that row i is
    # also labelled i.
    return _df.reset_index(drop=True)

In [22]:
# Columns filled from the "gameplay-item-primary" elements, in page order.
_GAMEPLAY_COLUMNS = ("players", "play_time", "age")

# Column name -> position of its "outline-item-description" entry inside the
# full-credits section of a game's /credits page.
_CREDIT_POSITIONS = {
    "designer": 3,
    "artists": 4,
    "publishers": 5,
    "categories": 6,
    "mechanisms": 7,
    "family": 8,
}


def _parse_gameplay(soup_game_page):
    """Return {column: raw text} for players / play_time / age; raises IndexError or AttributeError if the elements are absent."""
    items = soup_game_page.find_all(class_ = "gameplay-item-primary")
    return {column: items[position].find('span').text
            for position, column in enumerate(_GAMEPLAY_COLUMNS)}


def _parse_credits(soup_game_page):
    """Return {column: " , "-joined names} for each credit section; raises IndexError or AttributeError if the section is absent."""
    full_credits = soup_game_page.find_all(class_ = "global-body-content-primary")[1]
    descriptions = full_credits.find_all(class_ = 'outline-item-description')
    credits = {}
    for column, position in _CREDIT_POSITIONS.items():
        entries = descriptions[position].find(class_ = "ng-scope").find_all(class_ = "ng-scope")
        credits[column] = " , ".join(list_get_text(entries))
    return credits


def game_page_info(_df):
    """
    Scrape each game's BGG credits page and add the details to the DataFrame.

    For every row, 'https://boardgamegeek.com' + link + '/credits' is opened
    in a Selenium-driven Chrome and these columns are filled in: players,
    play_time, age, designer, artists, publishers, categories, mechanisms,
    family. A field that still cannot be read once the retry budget is spent
    is stored as an empty string.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with a "link" column, e.g. the output of `browse_html_to_df`.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `_df` object with the additional columns.
    """
    new_columns = (*_GAMEPLAY_COLUMNS, *_CREDIT_POSITIONS)
    # Create the columns up front so an empty input still gets them instead of
    # failing with a KeyError in the clean-up step below.
    for column in new_columns:
        if column not in _df.columns:
            _df[column] = None

    # NOTE(review): passing the driver path positionally is the Selenium 3
    # calling convention; newer Selenium releases expect a Service object.
    # Confirm against the installed version.
    driver = webdriver.Chrome(chromedriver)
    try:
        # Iterate over (index label, link) pairs: `.at` addresses rows by
        # label, which differs from the position whenever the index has gaps.
        for label, link in tqdm(_df["link"].items(), total = len(_df), leave = False):
            url = 'https://boardgamegeek.com' + link + '/credits'
            driver.get(url)
            # Random pause between requests; also gives the page time to render.
            time.sleep(.5 + np.random.random()*5 )
            soup_game_page = BeautifulSoup(driver.page_source, 'html.parser')

            gameplay = None
            credits = None
            max_wait_time = 5.0
            while True:
                try:
                    # Keep gameplay values that were already read successfully.
                    if gameplay is None:
                        gameplay = _parse_gameplay(soup_game_page)
                    credits = _parse_credits(soup_game_page)
                    break
                except (IndexError, AttributeError):
                    # Expected elements are missing from this snapshot.
                    if max_wait_time <= 0:
                        break
                    print(f'{url} not loaded waiting : {max_wait_time} seconds')
                    driver.get(url)
                    time.sleep(.5)
                    max_wait_time -= .5
                    # Snapshot only after waiting, so the wait can take effect.
                    soup_game_page = BeautifulSoup(driver.page_source, 'html.parser')

            fields = dict.fromkeys(new_columns, "")
            fields.update(gameplay or {})
            fields.update(credits or {})
            for column, value in fields.items():
                _df.at[label, column] = value
    finally:
        # Always close the browser, even if scraping is interrupted.
        driver.quit()

    for column in _GAMEPLAY_COLUMNS:
        _df[column] = clean_column_str(_df[column])
    return _df

In [23]:
# Pickle file holding the scraped games; it is written again after every page.
pkl_name = "./bbg_df.pkl"

# Set to True to load the existing pickle and append the newly scraped pages
# to it; False starts from an empty DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files created by this notebook.
append_to_pickle = False
df = pd.read_pickle(pkl_name) if append_to_pickle else pd.DataFrame()

# Pages 0 through 20: page 21 does not work and random pages after it also fail.
# NOTE(review): `scrape_browse` defaults to page=1, so confirm that page 0
# does not return the same games as page 1.
for page in tqdm(range(0, 21)):
    soup = scrape_browse(page)
    page_games = game_page_info(browse_html_to_df(soup))
    df = pd.concat([df, page_games])
    df.to_pickle(pkl_name)
df.shape

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




(200, 16)