In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Create Growth Rate Dataframe

In [2]:
url = "https://serenesforest.net/the-sacred-stones/characters/growth-rates/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
tables = soup.find_all("table")
rows = tables[0].find_all('tr')
print(rows[0:2])

[<tr>
<th style="width: 23%;">Name</th>
<th style="width: 11%;">HP</th>
<th style="width: 11%;">S/M</th>
<th style="width: 11%;">Skl</th>
<th style="width: 11%;">Spd</th>
<th style="width: 11%;">Lck</th>
<th style="width: 11%;">Def</th>
<th style="width: 11%;">Res</th>
</tr>, <tr>
<td>Eirika</td>
<td>70</td>
<td>40</td>
<td>60</td>
<td>60</td>
<td>60</td>
<td>30</td>
<td>30</td>
</tr>]


In [3]:
def extract_alt_text(image_info):
    if image_info['alt']:
        return image_info['alt']
    return

In [4]:
def scrape_table(table, game_title: str) -> pd.DataFrame:
    """
    Designed to scrape web tables found on serenesforest
    Serenes frequently uses multiple table header rows so this function
        takes that into account
    Note that header_found here is important as sometimes header rows use SLIGHTLY
        different strings so this is the safest route I could come up with.
        I really only want the first row of headers per table.
    Function also accounts for cells containing images and extracts their alt text to use as data.
    """
    data = []
    columns = []
    header_found = False
    
    for index, row in enumerate(table.find_all('tr')):
        row_data = []
        if not header_found:
            headers = row.find_all('th')
            if headers:
                header_found = True
                columns = [header.text for header in headers]
    
        for cell in row.find_all('td'):
            images = cell.find_all('img')
            if images:
                cell_data = [extract_alt_text(image) for image in images]
                if cell.text:
                    text = cell.text.replace(',', '').split()
                    cell_data = [f"{text[i]} {alt_text}" for i, alt_text in enumerate(cell_data)]
                row_data.append(cell_data)
            else:
                row_data.append(cell.text)
    
        if row_data:
            data.append(row_data)
        
    df = pd.DataFrame(data, columns=columns)
    df['Game'] = game_title

    return df

In [5]:
df = scrape_table(tables[0], 'the-sacred-stones')
df

Unnamed: 0,Name,HP,S/M,Skl,Spd,Lck,Def,Res,Game
0,Eirika,70,40,60,60,60,30,30,the-sacred-stones
1,Seth,90,50,45,45,25,40,30,the-sacred-stones
2,Franz,80,40,40,50,40,25,20,the-sacred-stones
3,Gilliam,90,45,35,30,30,55,20,the-sacred-stones
4,Moulder,70,40,50,40,20,25,25,the-sacred-stones
5,Vanessa,50,35,55,60,50,20,30,the-sacred-stones
6,Ross,70,50,35,30,40,25,20,the-sacred-stones
7,Garcia,80,65,40,20,40,25,15,the-sacred-stones
8,Neimi,55,45,50,60,50,15,35,the-sacred-stones
9,Colm,75,40,40,65,45,25,20,the-sacred-stones


In [6]:
df.columns

Index(['Name', 'HP', 'S/M', 'Skl', 'Spd', 'Lck', 'Def', 'Res', 'Game'], dtype='object')

In [7]:
df.to_csv('../data/the-sacred-stones/growth-rates.csv', index=False)

# Create Base Stats Dataframe

Gonna need some tinkering here. Weapon rank and affinity both use images to convey information.

There's also actually two tables on this page. One for the base game characters and the other for creature campaign. Thankfully that's easy to handle.

In [8]:
url = "https://serenesforest.net/the-sacred-stones/characters/base-stats/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
tables = soup.find_all("table")
dataframes = [scrape_table(table, 'the-sacred-stones') for table in tables]

for df in dataframes:
    display(df)

Unnamed: 0,Name,Lv,Class,HP,Str,Skl,Spd,Lck,Def,Res,Mov,Con,Weapon Rank,Affin,Game
0,Eirika,1,Lord,16,4,8,9,5,3,1,5,5,[E Sword],[Light],the-sacred-stones
1,Seth,1,Paladin,30,14,13,12,13,11,8,8,11,"[A Sword, A Lance]",[Anima],the-sacred-stones
2,Franz,1,Cavalier,20,7,5,7,2,6,1,7,9,"[E Sword, D Lance]",[Light],the-sacred-stones
3,Gilliam,4,Knight,25,9,6,3,3,9,3,4,14,[C Lance],[Thunder],the-sacred-stones
4,Vanessa,1,Pegasus Knight,17,5,7,11,4,6,5,7,5,[D Lance],[Anima],the-sacred-stones
5,Moulder,3,Priest,20,4,6,9,1,2,5,5,9,[C Staff],[Anima],the-sacred-stones
6,Ross,1,Journeyman,15,5,2,3,8,3,0,4,8,[E Axe],[Fire],the-sacred-stones
7,Garcia,4,Fighter,28,8,7,7,3,5,1,5,14,[C Axe],[Fire],the-sacred-stones
8,Neimi,1,Archer,17,4,5,6,4,3,2,5,5,[D Bow],[Fire],the-sacred-stones
9,Colm,2,Thief,18,4,4,10,8,3,1,6,6,[E Sword],[Light],the-sacred-stones


Unnamed: 0,Name,Lv,Class,HP,Str,Skl,Spd,Lck,Def,Res,Mov,Con,Weapon Rank,Affin,Game
0,Caellach,12,Hero,47,19,14,13,14,15,13,6,13,"[A Sword, A Axe]",[Thunder],the-sacred-stones
1,Orson,13,Paladin,48,18,15,14,6,14,11,8,12,"[A Sword, A Lance]",[Dark],the-sacred-stones
2,Riev,16,Bishop,49,14,21,19,9,16,18,6,7,"[S Light, A Staff]",[Dark],the-sacred-stones
3,Ismaire,9,Swordmaster,33,16,20,23,12,8,15,6,7,[A Sword],[Wind],the-sacred-stones
4,Selena,11,Mage Knight,38,13,13,16,10,11,17,7,6,"[A Anima, B Staff]",[Thunder],the-sacred-stones
5,Glen,12,Wyvern Lord,46,20,17,13,7,18,5,8,12,"[A Sword, A Lance]",[Wind],the-sacred-stones
6,Hayden,10,Ranger,37,17,14,15,17,12,12,7,10,"[A Sword, A Bow]",[Anima],the-sacred-stones
7,Valter,13,Wyvern Knight,45,19,17,17,3,13,12,8,11,[S Lance],[Ice],the-sacred-stones
8,Fado,11,General,46,20,14,12,5,18,11,5,18,"[A Sword, A Lance, A Axe]",[Fire],the-sacred-stones
9,Lyon,14,Necromancer,44,22,13,11,4,17,19,6,7,"[S Dark, A Staff]",[Ice],the-sacred-stones


In [9]:
for index, df in enumerate(dataframes):
    if index == 0:
        df['creature_campaign'] = False
    else:
        df['creature_campaign'] = True

df = pd.concat([dataframes[0], dataframes[1]], ignore_index=True)
display(df)

Unnamed: 0,Name,Lv,Class,HP,Str,Skl,Spd,Lck,Def,Res,Mov,Con,Weapon Rank,Affin,Game,creature_campaign
0,Eirika,1,Lord,16,4,8,9,5,3,1,5,5,[E Sword],[Light],the-sacred-stones,False
1,Seth,1,Paladin,30,14,13,12,13,11,8,8,11,"[A Sword, A Lance]",[Anima],the-sacred-stones,False
2,Franz,1,Cavalier,20,7,5,7,2,6,1,7,9,"[E Sword, D Lance]",[Light],the-sacred-stones,False
3,Gilliam,4,Knight,25,9,6,3,3,9,3,4,14,[C Lance],[Thunder],the-sacred-stones,False
4,Vanessa,1,Pegasus Knight,17,5,7,11,4,6,5,7,5,[D Lance],[Anima],the-sacred-stones,False
5,Moulder,3,Priest,20,4,6,9,1,2,5,5,9,[C Staff],[Anima],the-sacred-stones,False
6,Ross,1,Journeyman,15,5,2,3,8,3,0,4,8,[E Axe],[Fire],the-sacred-stones,False
7,Garcia,4,Fighter,28,8,7,7,3,5,1,5,14,[C Axe],[Fire],the-sacred-stones,False
8,Neimi,1,Archer,17,4,5,6,4,3,2,5,5,[D Bow],[Fire],the-sacred-stones,False
9,Colm,2,Thief,18,4,4,10,8,3,1,6,6,[E Sword],[Light],the-sacred-stones,False


In [10]:
df.columns

Index(['Name', 'Lv', 'Class', 'HP', 'Str', 'Skl', 'Spd', 'Lck', 'Def', 'Res',
       'Mov', 'Con', 'Weapon Rank', 'Affin', 'Game', 'creature_campaign'],
      dtype='object')

In [11]:
df.to_csv('../data/the-sacred-stones/base-stats.csv', index=False)

## Fixing Weapon Ranks and Affinity Columns

Changes have already been made to the scraping function, but this section shows my messin' around.

In [12]:
import re

seth = tables[0].find_all('tr')[2]
weapon_info = seth.find_all('td')[-2]
affinity = seth.find_all('td')[-1]
print(f"Weapon html: {weapon_info.prettify()}\n\nAffinity html: {affinity.prettify()}")

Weapon html: <td>
 <a href="https://serenesforest.net/wp-content/uploads/2014/04/TypeSword.gif">
  <img alt="Sword" class="alignnone size-full" src="https://serenesforest.net/wp-content/uploads/2014/04/TypeSword.gif"/>
 </a>
 A,
 <a href="https://serenesforest.net/wp-content/uploads/2014/04/TypeLance.gif">
  <img alt="Lance" class="alignnone size-full" src="https://serenesforest.net/wp-content/uploads/2014/04/TypeLance.gif"/>
 </a>
 A
</td>


Affinity html: <td>
 <a href="https://serenesforest.net/wp-content/uploads/2014/04/AffinAnima.gif">
  <img alt="Anima" class="alignnone size-full" src="https://serenesforest.net/wp-content/uploads/2014/04/AffinAnima.gif"/>
 </a>
</td>



In [13]:
affinity_type = affinity.find('img')['alt']
affinity_type

'Anima'

In [14]:
weapon_types = weapon_info.find_all('img')
weapon_types = [weapon['alt'] for weapon in weapon_types]
weapon_types

['Sword', 'Lance']

In [15]:
ranks = weapon_info.text.replace(',', '').split()
ranks

['A', 'A']

In [16]:
print(f"Weapons: {weapon_types}\nRanks: {ranks}")

Weapons: ['Sword', 'Lance']
Ranks: ['A', 'A']


In [17]:
weapon_ranks = [f"{ranks[i]} {weapon}" for i, weapon in enumerate(weapon_types)]
weapon_ranks

['A Sword', 'A Lance']

In [18]:
def extract_alt_text(image_info):
    
    if image_info['alt']:
        return image_info['alt']

In [19]:
for image in weapon_info.find_all('img'):
    print(extract_alt_text(image))

Sword
Lance


In [20]:
for x in weapon_info:
    print("---")
    print(x)

---
<a href="https://serenesforest.net/wp-content/uploads/2014/04/TypeSword.gif"><img alt="Sword" class="alignnone size-full" src="https://serenesforest.net/wp-content/uploads/2014/04/TypeSword.gif"/></a>
---
 A, 
---
<a href="https://serenesforest.net/wp-content/uploads/2014/04/TypeLance.gif"><img alt="Lance" class="alignnone size-full" src="https://serenesforest.net/wp-content/uploads/2014/04/TypeLance.gif"/></a>
---
 A


## Creating Full Dataset

In [21]:
dir = '../data/the-sacred-stones'
bases = pd.read_csv(f'{dir}/base-stats.csv')
growths = pd.read_csv(f'{dir}/growth-rates.csv')

In [22]:
growths.head()

Unnamed: 0,Name,HP,S/M,Skl,Spd,Lck,Def,Res,Game
0,Eirika,70,40,60,60,60,30,30,the-sacred-stones
1,Seth,90,50,45,45,25,40,30,the-sacred-stones
2,Franz,80,40,40,50,40,25,20,the-sacred-stones
3,Gilliam,90,45,35,30,30,55,20,the-sacred-stones
4,Moulder,70,40,50,40,20,25,25,the-sacred-stones


In [23]:
ssdf = (
    growths
    .drop('Game', axis='columns')
    .merge(bases, how='outer', on='Name', suffixes=['_growth', '_base'])
)

float_columns = ssdf.select_dtypes(include='float').columns
ssdf[float_columns] = ssdf[float_columns].astype('Int64')

display(ssdf.head())

Unnamed: 0,Name,HP_growth,S/M,Skl_growth,Spd_growth,Lck_growth,Def_growth,Res_growth,Lv,Class,...,Spd_base,Lck_base,Def_base,Res_base,Mov,Con,Weapon Rank,Affin,Game,creature_campaign
0,Eirika,70,40,60,60,60,30,30,1,Lord,...,9,5,3,1,5,5,['E Sword'],['Light'],the-sacred-stones,False
1,Seth,90,50,45,45,25,40,30,1,Paladin,...,12,13,11,8,8,11,"['A Sword', 'A Lance']",['Anima'],the-sacred-stones,False
2,Franz,80,40,40,50,40,25,20,1,Cavalier,...,7,2,6,1,7,9,"['E Sword', 'D Lance']",['Light'],the-sacred-stones,False
3,Gilliam,90,45,35,30,30,55,20,4,Knight,...,3,3,9,3,4,14,['C Lance'],['Thunder'],the-sacred-stones,False
4,Moulder,70,40,50,40,20,25,25,3,Priest,...,9,1,2,5,5,9,['C Staff'],['Anima'],the-sacred-stones,False


In [24]:
ssdf.to_csv(f'{dir}/full_df.csv')

# Testing on Binding Blade (FE6)

In [25]:
url = "https://serenesforest.net/binding-blade/characters/growth-rates/"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
tables = soup.find_all("table")
dataframes = [scrape_table(table, 'binding-blade') for table in tables]

for df in dataframes:
    display(df.head(10))

Unnamed: 0,Name,HP,S/M,Skl,Spd,Lck,Def,Res,Game
0,Roy,80,40,50,40,60,25,30,binding-blade
1,Marcus,60,25,20,25,20,15,20,binding-blade
2,Allen,85,45,40,45,40,25,10,binding-blade
3,Lance,80,40,45,50,35,20,15,binding-blade
4,Wolt,80,40,50,40,40,20,10,binding-blade
5,Bors,90,30,30,40,50,35,10,binding-blade
6,Merlinus,100,0,50,50,100,20,5,binding-blade
7,Ellen,45,50,30,20,70,5,60,binding-blade
8,Dieck,90,40,40,30,35,20,15,binding-blade
9,Wade,75,50,45,20,45,30,5,binding-blade


In [26]:
def table_to_df(url: str, game: str) -> pd.DataFrame:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    tables = soup.find_all("table")
    dataframes = [scrape_table(table, game) for table in tables]

    return dataframes

In [27]:
game = 'binding-blade'
table_names = ['growth-rates', 'base-stats']
dataframes = []
for table_name in table_names:
    url = f"https://serenesforest.net/{game}/characters/{table_name}/"
    dfs = table_to_df(url, game)
    for df in dfs:
        display(df.head(5))
        dataframes.append(df)

Unnamed: 0,Name,HP,S/M,Skl,Spd,Lck,Def,Res,Game
0,Roy,80,40,50,40,60,25,30,binding-blade
1,Marcus,60,25,20,25,20,15,20,binding-blade
2,Allen,85,45,40,45,40,25,10,binding-blade
3,Lance,80,40,45,50,35,20,15,binding-blade
4,Wolt,80,40,50,40,40,20,10,binding-blade


Unnamed: 0,Name,Class,Lv,HP,S/M,Skl,Spd,Lck,Def,Res,Con,Mov,Affin,Weapon ranks,Game
0,Roy,Lord,1,18,5,5,7,7,5,0,6,5,[Fire],[D Sword],binding-blade
1,Marcus,Paladin,1,32,9,14,11,10,9,8,11,8,[Ice],"[D Sword, A Lance, E Axe]",binding-blade
2,Allen,Cavalier,1,21,7,4,6,3,6,0,9,7,[Fire],"[E Sword, D Lance]",binding-blade
3,Lance,Cavalier,1,20,5,6,8,2,6,0,9,7,[AffinAnima],"[E Sword, D Lance]",binding-blade
4,Wolt,Archer,1,18,4,4,5,2,4,0,7,5,[Ice],[D Bow],binding-blade


In [28]:
growths, bases = dataframes

In [29]:
bbdf = (
    growths
    .drop('Game', axis='columns')
    .merge(bases, how='outer', on='Name', suffixes=['_growth', '_base'])
)

display(bbdf.head())

Unnamed: 0,Name,HP_growth,S/M_growth,Skl_growth,Spd_growth,Lck_growth,Def_growth,Res_growth,Class,Lv,...,Skl_base,Spd_base,Lck_base,Def_base,Res_base,Con,Mov,Affin,Weapon ranks,Game
0,Roy,80,40,50,40,60,25,30,Lord,1,...,5,7,7,5,0,6,5,[Fire],[D Sword],binding-blade
1,Marcus,60,25,20,25,20,15,20,Paladin,1,...,14,11,10,9,8,11,8,[Ice],"[D Sword, A Lance, E Axe]",binding-blade
2,Allen,85,45,40,45,40,25,10,Cavalier,1,...,4,6,3,6,0,9,7,[Fire],"[E Sword, D Lance]",binding-blade
3,Lance,80,40,45,50,35,20,15,Cavalier,1,...,6,8,2,6,0,9,7,[AffinAnima],"[E Sword, D Lance]",binding-blade
4,Wolt,80,40,50,40,40,20,10,Archer,1,...,4,5,2,4,0,7,5,[Ice],[D Bow],binding-blade


In [30]:
bbdf.to_csv('../data/binding-blade/full_data.csv')

In [36]:
bbdf.loc[bbdf["Name"].str.contains("Cath")]

Unnamed: 0,Name,HP_growth,S/M_growth,Skl_growth,Spd_growth,Lck_growth,Def_growth,Res_growth,Class,Lv,...,Skl_base,Spd_base,Lck_base,Def_base,Res_base,Con,Mov,Affin,Weapon ranks,Game
38,Cath,80.0,40.0,45.0,85.0,50.0,15.0,20.0,Thief,5,...,7,11,8,2.0,1.0,5.0,6.0,[AffinAnima],[E Sword],binding-blade
60,Cath (HM),,,,,,,,20,3,...,12,2,3,,,,,,,binding-blade


In [39]:
def extract_headers(row):
    return [header.text for header in row.find_all('th')]

def handle_rowspan(row_data, rowspan_data, row_index):
    for col_index in sorted(rowspan_data.keys()):
        rowspan_count, rowspan_text = rowspan_data[col_index]
        if rowspan_count > 0:
            while len(row_data) <= col_index:
                row_data.append('')
            row_data[col_index] = rowspan_text
            rowspan_data[col_index] = (rowspan_count - 1, rowspan_text)
    return row_data

def extract_cell_data(cell):
    images = cell.find_all('img')
    if images:
        alt_texts = [extract_alt_text(image) for image in images]
        if cell.text.strip():
            text = cell.text.replace(',', '').split()
            return [f"{text[i]} {alt_text}" for i, alt_text in enumerate(alt_texts)]
        else:
            return alt_texts
    return cell.text

def build_dataframe(data, columns, game_title):
    df = pd.DataFrame(data, columns=columns)
    df['Game'] = game_title
    return df

def scrape_table(table, game_title: str) -> pd.DataFrame:
    data, columns, header_found, rowspan_data = [], [], False, {}
    
    for row_index, row in enumerate(table.find_all('tr')):
        row_data, is_header = [], False
        
        if not header_found:
            headers = row.find_all('th')
            if headers:
                header_found = True
                is_header = True
                columns = extract_headers(row)
                
        if not is_header:
            # Handle row span from previous rows
            row_data = handle_rowspan(row_data, rowspan_data, row_index)
            
            for col_index, cell in enumerate(row.find_all(['td', 'th'], recursive=False), len(row_data)):
                if cell.has_attr("rowspan"):
                    rowspan_count = int(cell['rowspan']) - 1
                    rowspan_data[col_index + len(row_data)] = (rowspan_count, cell.text)
                cell_data = extract_cell_data(cell)
                while len(row_data) <= col_index:
                    row_data.append('')
                row_data[col_index] = cell_data
        
        if row_data:
            data.append(row_data)
            
    df = build_dataframe(data, columns, game_title)
    return df

In [41]:
game = 'binding-blade'
table_names = ['growth-rates', 'base-stats']
dataframes = []
for table_name in table_names:
    url = f"https://serenesforest.net/{game}/characters/{table_name}/"
    dfs = table_to_df(url, game)
    for df in dfs:
        display(df.head(5))
        # dataframes.append(df)

Unnamed: 0,Name,HP,S/M,Skl,Spd,Lck,Def,Res,Game
0,Roy,80,40,50,40,60,25,30,binding-blade
1,Marcus,60,25,20,25,20,15,20,binding-blade
2,Allen,85,45,40,45,40,25,10,binding-blade
3,Lance,80,40,45,50,35,20,15,binding-blade
4,Wolt,80,40,50,40,40,20,10,binding-blade


ValueError: 14 columns passed, passed data had 35 columns