In [35]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import xmltodict
import json

In [20]:

# Load the BGG ranking table; the CSV file is read with the ISO-8859-1 encoding.
bgg = pd.read_csv(
    "data/BGG100.csv",
    encoding="ISO-8859-1",
)

bgg


Unnamed: 0,Rank,Rating,Name,Subtitle,Year,MinPlayers,MaxPlayers,BestPlayers,MinPlayTime,MaxPlayTime,MinAge,Weight,Type
0,1,8.511,Gloomhaven,Vanquish monsters with strategic cardplay. Ful...,2017,1,4,3,60,120,14,3.87,"Strategy, Thematic"
1,2,8.442,Pandemic Legacy: Season 1,Mutating diseases are spreading around the wor...,2015,2,4,4,60,60,13,2.83,"Strategy, Thematic"
2,3,8.418,Brass: Birmingham,"Build networks, grow industries, and navigate ...",2018,2,4,"3, 4",60,120,14,3.90,Strategy
3,4,8.273,Terraforming Mars,Compete with rival CEOs to make Mars habitable...,2016,1,5,3,120,120,12,3.24,Strategy
4,5,8.262,Twilight Imperium: Fourth Edition,"Build an intergalactic empire through trade, r...",2017,3,6,6,240,480,14,4.26,"Strategy, Thematic"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,7.529,The Isle of Cats,Rescue as many island cats as you can before L...,2019,1,4,3,60,90,8,2.31,"Family, Strategy"
96,97,7.528,Russian Railroads,"Acquire locomotives & industry, build track, a...",2013,2,4,4,90,120,12,3.41,Strategy
97,98,7.526,Tigris & Euphrates,Keep your Mesopotamian civilisation in perfect...,1997,2,4,4,90,90,12,3.51,Strategy
98,99,7.516,Decrypto,Decipher your opponents' code before they deci...,2018,3,8,"4, 6",15,45,12,1.80,Party


# BGG Information

In [21]:
# Print a structural summary of the frame: column names, non-null counts and dtypes.
bgg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Rank         100 non-null    int64  
 1   Rating       100 non-null    float64
 2   Name         100 non-null    object 
 3   Subtitle     100 non-null    object 
 4   Year         100 non-null    int64  
 5   MinPlayers   100 non-null    int64  
 6   MaxPlayers   100 non-null    int64  
 7   BestPlayers  97 non-null     object 
 8   MinPlayTime  100 non-null    int64  
 9   MaxPlayTime  100 non-null    int64  
 10  MinAge       100 non-null    int64  
 11  Weight       100 non-null    float64
 12  Type         100 non-null    object 
dtypes: float64(2), int64(7), object(4)
memory usage: 10.3+ KB


### Number of columns

In [22]:
# One label per column, so the length of the column index is the column count.
n_columns = len(bgg.columns)

print(f"The number of columns is {n_columns}")

The number of columns is 13


### Number of rows

In [23]:
# One index entry per row, so the length of the row index is the row count.
n_rows = len(bgg.index)

print(f"The number of rows is {n_rows}")

The number of rows is 100


#### Modifying the column names

In [24]:
# Normalise the column labels to lower-case snake_case.
# Surrounding whitespace is stripped *before* inner spaces are replaced: if the
# replacement ran first, a padded label such as " Min Age " would become
# "_min_age_" and the strip would no longer have anything to remove.
bgg.columns = [label.strip().lower().replace(" ", "_") for label in bgg.columns]

bgg.columns

Index(['rank', 'rating', 'name', 'subtitle', 'year', 'minplayers',
       'maxplayers', 'bestplayers', 'minplaytime', 'maxplaytime', 'minage',
       'weight', 'type'],
      dtype='object')

### Number of null values per column

In [25]:
# Flag every missing cell, then add the flags up column by column.
bgg.isna().sum()

rank           0
rating         0
name           0
subtitle       0
year           0
minplayers     0
maxplayers     0
bestplayers    3
minplaytime    0
maxplaytime    0
minage         0
weight         0
type           0
dtype: int64

### Extracting the Top 10 games

In [26]:
# Keep the names found in the first ten rows of the ranking table.
top_games = bgg["name"].iloc[:10]

print(top_games)

0                                      Gloomhaven 
1                        Pandemic Legacy: Season 1
2                                Brass: Birmingham
3                                Terraforming Mars
4                Twilight Imperium: Fourth Edition
5                     Gloomhaven: Jaws of the Lion
6                                     Gaia Project
7                             Star Wars: Rebellion
8    Through the Ages: A New Story of Civilization
9                  War of the Ring: Second Edition
Name: name, dtype: object


## Extracting additional information about the top 10 games from the API

### Gloomhaven

In [109]:

# Gloomhaven is the first game in the ranking. Its id ("174430") was found by
# searching for the name through
# https://boardgamegeek.com/xmlapi/search?search=Gloomhaven
url = "https://api.geekdo.com/xmlapi/boardgame/174430/"

# Request the game's record from the URL defined just above, so the cell runs
# on a fresh kernel without relying on any other variable.
res = requests.get(url)

# Displaying the response shows its status code; 200 means the request
# succeeded and the analysis can continue.
res



<Response [200]>

In [110]:
# Raw body of the response, as bytes.
xml_data = res.content

xml_data

# Inspecting the content shows that the API answers with XML, not HTML,
# so it has to be parsed as XML.

b'<boardgames termsofuse="https://boardgamegeek.com/xmlapi/termsofuse">\n\t\t\t\t\t<boardgame objectid="174430">\n\t\t\t<yearpublished>2017</yearpublished>\n\t\t\t<minplayers>1</minplayers>\n\t\t\t<maxplayers>4</maxplayers>\n\t\t\t<playingtime>120</playingtime>\n\t\t\t<minplaytime>60</minplaytime>\n\t\t\t<maxplaytime>120</maxplaytime>\n\t\t\t<age>14</age>\n\n\t\t\t\t\t\t\t<name primary="true" sortindex="1">Gloomhaven</name>\n\t\t\t\t\t\t\t<name  sortindex="1">Gloomhaven, aventures \xc3\xa0 Havrenuit</name>\n\t\t\t\t\t\t\t<name  sortindex="1">Gloomhaven: \xd0\x9c\xd1\x80\xd0\xb0\xd1\x87\xd0\xbd\xd0\xb0\xd1\x8f \xd0\x93\xd0\xb0\xd0\xb2\xd0\xb0\xd0\xbd\xd1\x8c</name>\n\t\t\t\t\t\t\t<name  sortindex="1">Hom\xc3\xa1lyr\xc3\xa9v</name>\n\t\t\t\t\t\t\t<name  sortindex="1">\xe3\x82\xb0\xe3\x83\xab\xe3\x83\xbc\xe3\x83\xa0\xe3\x83\x98\xe3\x82\xa4\xe3\x83\xb4\xe3\x83\xb3</name>\n\t\t\t\t\t\t\t<name  sortindex="1">\xe5\xb9\xbd\xe6\xb8\xaf\xe8\xbf\xb7\xe5\x9f\x8e</name>\n\t\t\t\t\t\t\t<name  sortin

In [107]:

# Parse the raw XML with BeautifulSoup's "xml" parser so that the document can
# be searched by tag and is displayed in a more readable form.
soup = BeautifulSoup(xml_data, "xml")

soup

<?xml version="1.0" encoding="utf-8"?>
<boardgames termsofuse="https://boardgamegeek.com/xmlapi/termsofuse">
<boardgame objectid="174430">
<yearpublished>2017</yearpublished>
<minplayers>1</minplayers>
<maxplayers>4</maxplayers>
<playingtime>120</playingtime>
<minplaytime>60</minplaytime>
<maxplaytime>120</maxplaytime>
<age>14</age>
<name primary="true" sortindex="1">Gloomhaven</name>
<name sortindex="1">Gloomhaven, aventures à Havrenuit</name>
<name sortindex="1">Gloomhaven: Мрачная Гавань</name>
<name sortindex="1">Homályrév</name>
<name sortindex="1">グルームヘイヴン</name>
<name sortindex="1">幽港迷城</name>
<name sortindex="1">글룸헤이븐</name>
<description>This page does not exist. You can edit this page to create it.</description>
<thumbnail>https://cf.geekdo-images.com/sZYp_3BTDGjh2unaZfZmuA__thumb/img/veqFeP4d_3zNhFc3GNBkV95rBEQ=/fit-in/200x150/filters:strip_icc()/pic2437871.jpg</thumbnail>
<image>https://cf.geekdo-images.com/sZYp_3BTDGjh2unaZfZmuA__original/img/7d-lj5Gd1e8PFnD97LYFah2c45M=/0x

In [113]:
# Primary name of the game, read from the element
# <name primary="true" sortindex="1">Gloomhaven</name>.
# It is attached to every vote record built later on.
game_name = soup.find("name", {"primary": "true", "sortindex": "1"}).text

# The votes about the number of players live in the poll named
# "suggested_numplayers". The poll is selected by that name rather than by
# taking whichever <poll> element comes first, so the lookup does not depend on
# the order of the elements in the response.
poll_section = soup.find("poll", {"name": "suggested_numplayers"})

poll_section

<poll name="suggested_numplayers" title="User Suggested Number of Players" totalvotes="1501">
<results numplayers="1">
<result numvotes="143" value="Best"/>
<result numvotes="643" value="Recommended"/>
<result numvotes="356" value="Not Recommended"/>
</results>
<results numplayers="2">
<result numvotes="373" value="Best"/>
<result numvotes="822" value="Recommended"/>
<result numvotes="89" value="Not Recommended"/>
</results>
<results numplayers="3">
<result numvotes="736" value="Best"/>
<result numvotes="501" value="Recommended"/>
<result numvotes="37" value="Not Recommended"/>
</results>
<results numplayers="4">
<result numvotes="457" value="Best"/>
<result numvotes="597" value="Recommended"/>
<result numvotes="159" value="Not Recommended"/>
</results>
<results numplayers="4+">
<result numvotes="6" value="Best"/>
<result numvotes="50" value="Recommended"/>
<result numvotes="836" value="Not Recommended"/>
</results>
</poll>

In [116]:

# Flatten the poll into one record per (number of players, verdict) pair.
gloomhaven_votes = []

# Each <results> element groups the votes for one player count.
for poll_result in poll_section.find_all("results"):
    # Player-count label of the group; kept as text because of labels like "4+".
    numplayers = poll_result.get("numplayers", "")

    # Each <result> inside the group carries one verdict and its vote count.
    for result in poll_result.find_all("result"):
        gloomhaven_votes.append({
            "game_name": game_name,  # name extracted from the primary <name> element
            "num_players": numplayers,
            "value": result.get("value", ""),
            # XML attribute values are strings; the count is stored as an integer
            # so that it can be summed, compared and sorted numerically.
            "num_votes": int(result.get("numvotes", 0)),
        })

# Build a DataFrame from all the collected records.
gloomhaven = pd.DataFrame(gloomhaven_votes)

gloomhaven

Unnamed: 0,game_name,num_players,value,num_votes
0,Gloomhaven,1,Best,143
1,Gloomhaven,1,Recommended,643
2,Gloomhaven,1,Not Recommended,356
3,Gloomhaven,2,Best,373
4,Gloomhaven,2,Recommended,822
5,Gloomhaven,2,Not Recommended,89
6,Gloomhaven,3,Best,736
7,Gloomhaven,3,Recommended,501
8,Gloomhaven,3,Not Recommended,37
9,Gloomhaven,4,Best,457


In [125]:
# To avoid repeating the same steps for the other nine URLs, the whole process
# is wrapped in a function that handles one URL at a time.

def _parse_votes(xml_data):
    """Build the vote records contained in one game's XML document.

    Parameters
    ----------
    xml_data : bytes or str
        XML markup of a single game record.

    Returns
    -------
    list of dict
        One dict per <result> element of the "suggested_numplayers" poll, with
        the keys "game_name", "num_players", "value" and "num_votes".

    Raises
    ------
    ValueError
        If the document has no primary <name> element or no
        "suggested_numplayers" poll.
    """
    soup = BeautifulSoup(xml_data, "xml")

    # The primary name is identified by primary="true" alone, so the lookup
    # does not depend on the value of the sortindex attribute.
    name_tag = soup.find("name", {"primary": "true"})
    if name_tag is None:
        raise ValueError("no primary <name> element found in the XML document")

    # Select the poll by name instead of relying on it being the first <poll>.
    poll_section = soup.find("poll", {"name": "suggested_numplayers"})
    if poll_section is None:
        raise ValueError("no 'suggested_numplayers' poll found in the XML document")

    game_name = name_tag.text
    records = []
    # Each <results> element groups the votes for one player count.
    for poll_result in poll_section.find_all("results"):
        # Kept as text because labels such as "4+" are not numbers.
        numplayers = poll_result.get("numplayers", "")
        for result in poll_result.find_all("result"):
            records.append({
                "game_name": game_name,
                "num_players": numplayers,
                "value": result.get("value", ""),
                # Attribute values are strings; store the count as an integer.
                "num_votes": int(result.get("numvotes", 0)),
            })
    return records


def extract_votes(url: str) -> pd.DataFrame:
    """Download one game record and return its player-count votes.

    Parameters
    ----------
    url : str
        Address of the XML record of a single game.

    Returns
    -------
    pandas.DataFrame
        One row per (number of players, verdict) pair with the columns
        "game_name", "num_players", "value" and "num_votes".

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the response lacks the primary name or the player-count poll.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    # The raw bytes are handed to the parser (as in the single-game cell) so the
    # encoding is taken from the document rather than guessed from the headers.
    records = _parse_votes(response.content)

    # Passing the column list keeps the schema stable even without any record.
    return pd.DataFrame(
        records,
        columns=["game_name", "num_players", "value", "num_votes"],
    )

# URLs of the game records that are passed to `extract_votes`, one per game.
urls = [
    "https://api.geekdo.com/xmlapi/boardgame/174430/",
    "https://api.geekdo.com/xmlapi/boardgame/161936/",
    "https://api.geekdo.com/xmlapi/boardgame/224517/",
    "https://api.geekdo.com/xmlapi/boardgame/167791/",
    "https://api.geekdo.com/xmlapi/boardgame/233078/",
    "https://api.geekdo.com/xmlapi/boardgame/291457/",
    "https://api.geekdo.com/xmlapi/boardgame/220308/",
    "https://api.geekdo.com/xmlapi/boardgame/187645/",
    "https://api.geekdo.com/xmlapi/boardgame/182028/",
    "https://api.geekdo.com/xmlapi/boardgame/115746/",
]

# Run the function on every URL and stack the per-game frames in a single
# concatenation (cheaper than growing a DataFrame inside a loop).
# `ignore_index=True` renumbers the rows consecutively across all games.
all_votes_df = pd.concat(
    [extract_votes(game_url) for game_url in urls],
    ignore_index=True,
)

all_votes_df

Unnamed: 0,game_name,num_players,value,num_votes
0,Gloomhaven,1,Best,143
1,Gloomhaven,1,Recommended,643
2,Gloomhaven,1,Not Recommended,356
3,Gloomhaven,2,Best,373
4,Gloomhaven,2,Recommended,822
...,...,...,...,...
169,War of the Ring: Second Edition,4,Recommended,125
170,War of the Ring: Second Edition,4,Not Recommended,198
171,War of the Ring: Second Edition,4+,Best,1
172,War of the Ring: Second Edition,4+,Recommended,3


In [127]:

# Export the combined vote table to a new CSV file. `index` is left at its
# default, so the row index is written as an unnamed first column.
all_votes_df.to_csv('data/top_ten_votes.csv')