### Notebook to get character statistics info from haikyuu.fandom.com web page
Outputs a CSV file with the following columns: name, name_detailed, the six stats (Power, Jumping, Stamina, Game Sense, Technique and Speed), and team.

In [1]:
# URL of the haikyuu.fandom.com user-blog page that holds the character statistics to scrape
site = "https://haikyuu.fandom.com/wiki/User_blog:RozzaPanda/Character_Statistics"

In [7]:
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# Download the statistics page. The timeout keeps the cell from hanging forever
# on an unresponsive server, and raise_for_status() stops us from silently
# parsing an HTTP error page as if it were the real content.
page = requests.get(site, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [4]:
# Inspecting the page, we see that teams are indicated within H2s
# and character info is the sibling of these H2 elements,
# so collect every <h2> in the document as the starting point.
h2s = soup.find_all("h2")

In [5]:
# helper functions to extract player stats from a html block for a specific team
def get_stats(block_text):
    """Return the player name and skill stats found in the block text.

    Example block_text:
    "Shōyō Hinata  Power: 1/5  Jumping: 5/5  Stamina: 5/5  Game Sense: 1/5  Technique: 1/5  Speed: 5/5"

    Args:
        block_text: Caption text for a single character.

    Returns:
        A 7-tuple of strings
        (name, power, jumping, stamina, game_sense, technique, speed),
        or None when the text does not match the expected layout.
        Unmatched text is printed with an "ERROR: " prefix unless it
        contains "No stats".

    NOTE
    1) There is a typo on haikyuu.fandom.com where Kanoka Amanai's jumping
       stat is out of 4 instead of 5, so the pattern accepts either
       denominator for Jumping.
    2) The name is kept as shown on the web page, which includes additional
       info in brackets such as the post-timeskip team the character joined,
       e.g. <Character Name (Future team ver.)>.
    """
    # "[45]" accepts exactly the digits 4 or 5 as the Jumping denominator.
    # (A character class written as "[4/5]" would also accept a literal "/".)
    pattern = (
        r"(.*?)  Power: (.*?)/5  Jumping: (.*?)/[45]  Stamina: (.*?)/5"
        r"  Game Sense: (.*?)/5  Technique: (.*?)/5  Speed: (.*?)/5"
    )
    results = re.search(pattern, block_text)
    if results is not None:
        return results.groups()

    # Captions that explicitly say "No stats" are expected; anything else
    # that fails to parse is reported so it can be inspected by hand.
    if "No stats" not in block_text:
        print("ERROR: ", block_text)
    return None

def get_players_from_sch(characters):
    """Collect one parsed row per character caption.

    Args:
        characters: Iterable of elements exposing ``.a`` and ``.text``
            (presumably the BeautifulSoup caption tags of one team --
            verify against the caller).

    Returns:
        A list of lists. Each inner list starts with the first child of the
        element's ``<a>`` tag, followed by every field returned by
        ``get_stats`` for the element's text. Elements without an ``<a>``
        tag, or whose text ``get_stats`` cannot parse, are left out.
    """
    rows = []
    for caption in characters:
        link = caption.a
        if link is None:
            # no link, so this caption is not treated as a character entry
            continue

        display_name = link.contents[0]
        parsed = get_stats(caption.text)
        if parsed is None:
            continue

        rows.append([display_name, *parsed])

    return rows

In [8]:
# scrape the data
# Each row is the list produced by get_players_from_sch with the team name appended.
data = []
for h2 in h2s:
    headline = h2.find("span", class_="mw-headline")
    if headline is None:
        # an <h2> without a headline span is not treated as a team heading
        continue
    team = headline.text

    # the character captions are looked up inside the element that follows the heading
    players = h2.find_next_sibling()
    if players is None:
        # find_next_sibling() returns None when the heading has no following
        # sibling tag; there is nothing to parse for it, so skip it
        # instead of failing with an AttributeError.
        continue

    captions = players.find_all("div", {"class": "lightbox-caption"})
    for player in get_players_from_sch(captions):
        player.append(team)
        data.append(player)


In [9]:
# Turn the scraped rows into a pandas DataFrame.
# Column order must match the layout of each row in `data`.
headers = [
    "name",
    "name_detailed",
    "Power",
    "Jumping",
    "Stamina",
    "Game Sense",
    "Technique",
    "Speed",
    "team",
]
df = pd.DataFrame(data=data, columns=headers)
df.head(n=5)

Unnamed: 0,name,name_detailed,Power,Jumping,Stamina,Game Sense,Technique,Speed,team
0,Shōyō Hinata,Shōyō Hinata,1,5,5,1,1,5,Karasuno High
1,Tobio Kageyama,Tobio Kageyama,4,4,5,5,5,4,Karasuno High
2,Daichi Sawamura,Daichi Sawamura,4,3,3,4,4,3,Karasuno High
3,Kōshi Sugawara,Kōshi Sugawara,2,2,2,4,4,2,Karasuno High
4,Asahi Azumane,Asahi Azumane,5,3,3,3,3,2,Karasuno High


In [11]:
# each character may have multiple rows
df.loc[df["name"] == "Shōyō Hinata"]

Unnamed: 0,name,name_detailed,Power,Jumping,Stamina,Game Sense,Technique,Speed,team
0,Shōyō Hinata,Shōyō Hinata,1,5,5,1,1,5,Karasuno High
110,Shōyō Hinata,Shōyō Hinata (MSBY ver.),3,5,5,4,5,5,MSBY Black Jackals


In [10]:
# Save the table as player_stats.csv; index=False leaves the DataFrame's row index out of the file.
df.to_csv("player_stats.csv", index=False)