# Character fetching

This notebook is about fetching information about the characters from the Naruto fandom page.

We start off by importing the necessary packages:

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import networkx as nx
import asyncio
import aiohttp
import json
from aiohttp import hdrs
from aiohttp import ClientResponseError, TooManyRedirects
from yarl import URL  # Import the URL class


The names of all characters are fetched from the URLs in the code below:

In [2]:
# Base URL for Narutopedia character category
url_base = 'https://naruto.fandom.com/wiki/Category:Characters{}'
# One query string per page of the category listing; each '?from=' value
# names the entry at which that page starts.
query_list = [
    '',
    '?from=Eiki+Fūma%0AEiki+Fūma',
    '?from=Hidari%0AHidari',
    '?from=Karai%0AKarai',
    '?from=Matsuba%0AMatsuba',
    '?from=Rikumaru%0ARikumaru',
    '?from=Taiki%0ATaiki',
    '?from=Yubina%0AYubina'
]

# Seconds to wait for the server. Without a timeout requests.get() can block
# forever on a stalled connection and freeze the notebook.
request_timeout = 30

# Text of every successfully scraped page; joined once after the loop
page_texts = []

for query in query_list:
    # Construct the URL for the current query
    url = url_base.format(query)

    # Send an HTTP GET request to the URL
    response = requests.get(url, timeout=request_timeout)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the element with class 'category-page__members'
        category_members = soup.find('div', {'class': 'category-page__members'})

        # Extract the content within the 'category-page__members' div
        if category_members:
            content = category_members.get_text()
            page_texts.append(content)
        else:
            print("Div with class 'category-page__members' not found on the page.")
    else:
        print("Failed to retrieve the page. Status code:", response.status_code)

# Concatenated text of all pages, in query order (used by the cleaning step)
all_content = "".join(page_texts)

# Save the concatenated content to a single text file
with open('narutopedia_combined_content.txt', 'w', encoding='utf-8') as file:
    file.write(all_content)
    print("Saved combined content to narutopedia_combined_content.txt")


Saved combined content to narutopedia_combined_content.txt


The content from above is stored into a txt file. In order to use the data further, we clean it a little:

In [3]:
# Flatten the scraped text onto one line: newlines become spaces, every
# "<any character><tab>" pair is removed, and any tab still left becomes a space.
single_line = all_content.replace('\n', ' ')
cleaned_string = re.sub(r'.\t', '', single_line).replace('\t', ' ')

# Entries are separated by runs of two or more whitespace characters;
# keep only the non-empty pieces, trimmed of surrounding whitespace.
name_list = [
    piece.strip()
    for piece in re.split(r'\s{2,}', cleaned_string)
    if piece.strip()
]

# Replace spaces with underscores so each name can be inserted into a wiki URL
names_list = [entry.replace(' ', '_') for entry in name_list]

Now, we define a function that fetches the text of each character page. In addition to the paragraph text, we collect every hyperlink that appears inside a paragraph (for instance, a link to another character):

In [4]:
url_characters = 'https://naruto.fandom.com/wiki/{}'

characters_texts = {}
characters_links = {}

async def fetch_character_data(name, session):
    """
    Asynchronously fetch the paragraph text and in-paragraph wiki links of a character page.

    On a 200 response the results are stored in the module-level dictionaries:
    - characters_texts[name]: list with the stripped text of each <p> element.
    - characters_links[name]: list of wiki page names linked from inside <p> elements.
    Any other status is reported with a printed message and nothing is stored.

    Parameters:
    - name (str): The name of the character.
    - session (aiohttp.ClientSession): An aiohttp client session.

    """
    url_character = url_characters.format(name)

    async with session.get(url_character) as response:
        if response.status != 200:
            # Report the miss instead of silently leaving the character out
            print(f"Failed to fetch data for {name}. Status code: {response.status}")
            return

        html_content = await response.text()

    soup = BeautifulSoup(html_content, 'html.parser')
    category_members = soup.find_all('p')

    # Keep only links to other wiki pages. Footnote anchors ('#cite_note-...')
    # and external URLs are not page names, and appending them to the wiki base
    # URL later would request pages that do not exist. Slicing off the prefix
    # (rather than str.replace) also leaves any later '/wiki/' in the href intact.
    wiki_prefix = '/wiki/'
    all_links = [
        tag['href'][len(wiki_prefix):]
        for tag in soup.select('p a[href]')
        if tag['href'].startswith(wiki_prefix)
    ]

    characters_links[name] = all_links

    # Use a list to store text content for each character
    characters_texts[name] = [data.get_text().strip() for data in category_members if data.get_text()]

async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_character_data(name, session) for name in names_list]
        await asyncio.gather(*tasks)

await main()

On each character page there is an infobox that contains a lot of different information about the attributes of the different characters. Those are fetched here:

In [6]:
# URL template for a single wiki page. Note: this rebinds url_base, which the
# category-listing cell binds to a different template.
url_base = 'https://naruto.fandom.com/wiki/{}'
# Maps character name -> dict of infobox fields (filled by fetch_infobox_data)
characters_infobox = {}

async def fetch_infobox_data(name, session):
    """
    Asynchronously fetch infobox data for a character from a given URL.

    The page at url_base.format(name) is requested; on a 200 response the first
    <table> with class 'infobox' is parsed row by row. Rows are ignored until a
    header cell containing the text 'Personal' is seen; every row after that may
    contribute one key/value pair. The collected pairs are stored in
    characters_infobox[name]. A missing infobox or a non-200 status is reported
    with a printed message and nothing is stored.

    Parameters:
    - name (str): The name of the character.
    - session (aiohttp.ClientSession): An aiohttp client session.

    """
    url = url_base.format(name)

    async with session.get(url) as response:
        if response.status == 200:
            html_content = await response.text()
            soup = BeautifulSoup(html_content, 'html.parser')

            infobox_table = soup.find('table', class_='infobox')
            if infobox_table:
                # Parallel lists: keys[i] is the label for values[i]
                keys = []
                values = []
                # Becomes True once the 'Personal' header row has been passed
                capture_info = False

                for row in infobox_table.find_all('tr'):
                    # First header cell and first data cell of the row (None if absent)
                    th = row.find('th')
                    td = row.find('td')

                    # The 'Personal' header only switches capturing on; it is not stored itself
                    if th and 'Personal' in th.get_text():
                        capture_info = True
                        continue

                    if capture_info:
                        key = th.get_text(strip=True) if th else None
                        value = td.get_text(strip=True) if td else None

                        if td:
                            # Bullet lists that are direct children of the cell are stored
                            # as a list of their item texts instead of one flat string
                            ul_values = []
                            for ul in td.find_all('ul', recursive=False):
                                li_values = [li.get_text(strip=True) for li in ul.find_all('li')]
                                ul_values.extend(li_values)

                            if ul_values:
                                # Rows without a header cell share this fallback label
                                key = key if key else "Additional Information"
                                keys.append(key)
                                values.append(ul_values)
                                continue

                        # For a header cell with class 'mainheader', the value is the
                        # text of the next <tr> in document order
                        if th and 'mainheader' in th.get('class', []):
                            next_tr = row.find_next('tr')
                            if next_tr:
                                value = next_tr.get_text(strip=True)

                        # Only rows that yield both a label and a value are kept
                        if key and value:
                            keys.append(key)
                            values.append(value)

                # If a label occurs more than once, dict() keeps only its last value
                result_dict = dict(zip(keys, values))
                characters_infobox[name] = result_dict
            else:
                print(f"No infobox found for {name}")
        else:
            print(f"Failed to fetch data for {name}. Status code: {response.status}")

async def main_2():
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_infobox_data(name, session) for name in names_list]
        await asyncio.gather(*tasks)

await main_2()


No infobox found for Zetsu


Now, all the data that has been fetched so far is stored into json files for future analysis:

In [8]:
# Save dictionary to a JSON file
with open('../data/characters_list.json', 'w') as json_file:
    json.dump(names_list, json_file)

with open('../data/characters_texts.json', 'w') as json_file:
    json.dump(characters_texts, json_file)

with open('../data/characters_links.json', 'w') as json_file:
    json.dump(characters_links, json_file)

with open('../data/characters_infobox.json', 'w') as json_file:
    json.dump(characters_infobox, json_file)


Now, we focus on the hyperlinks of the pages. If a character page contains a hyperlink to another character, it is important that we fetch the header name of the linked page, since the hyperlink target is not necessarily the exact name of the linked character.

-------------------------------

In [9]:
# Base URL that link targets are appended to. Note: this rebinds url_characters,
# which the character-fetching cell binds to a '{}' template.
url_characters = 'https://naruto.fandom.com/wiki/'

# Header names already resolved, keyed by link target. Many characters link to
# the same pages, so this avoids downloading the same page over and over.
_resolved_header_names = {}

async def fetch_link_name(key, values, session, counter, progress_callback):
    """
    Asynchronously fetch header names for character links from a given URL.

    Each link in `values` is resolved to the text of the linked page's
    <h1 class="page-header__title">. Links that answer with a non-200 status or
    raise an exception are reported and skipped. When finished,
    characters_links[key] is REPLACED by the list of resolved header names, so
    running this twice for the same key would treat header names as links.

    Parameters:
    - key (str): The key representing the character.
    - values (list): The list of character links.
    - session (aiohttp.ClientSession): An aiohttp client session.
    - counter (int): Offset added to the per-link index used for progress reporting
      (the first link of this key gets index counter + 1).
    - progress_callback (function): Called as progress_callback(key, index) whenever
      a successfully resolved link has an index divisible by 100.

    Returns:
    - int: The count of fetched header names.

    """
    header_names = []  # Store header names for each key
    for idx, value in enumerate(values, start=counter + 1):
        url_character = url_characters + value
        try:
            if value in _resolved_header_names:
                # Already resolved for another character: reuse the result
                name = _resolved_header_names[value]
            else:
                async with session.get(url_character) as response:
                    if response.status != 200:
                        # Report the miss instead of silently dropping the link
                        print(f"Failed to fetch data for {url_character}. Status code: {response.status}")
                        continue
                    html_content = await response.text()

                soup = BeautifulSoup(html_content, 'html.parser')

                # Extract the header name
                header_name = soup.find('h1', {'class': 'page-header__title'})
                name = header_name.text.strip() if header_name else f"No header found for {value}"
                _resolved_header_names[value] = name

            header_names.append(name)

            # Check and notify the progress callback every 100 fetches
            if idx % 100 == 0:
                progress_callback(key, idx)

        except aiohttp.ClientResponseError as e:
            print(f"ClientResponseError exception for {url_character}: {e}")
        except Exception as e:
            # Best effort: one broken link must not abort the remaining links
            print(f"Exception for {url_character}: {e}")

    # Update the original dictionary with the fetched header names
    characters_links[key] = header_names
    return len(header_names)


async def process_batch(keys, session, counter, progress_callback):
    """
    Resolve the link names of every character in `keys` concurrently.

    Walks characters_links in its own order, starts one fetch_link_name
    coroutine for each entry whose key is in `keys`, and returns the list of
    their results in that order.
    """
    pending = []
    for character, links in characters_links.items():
        if character in keys:
            pending.append(fetch_link_name(character, links, session, counter, progress_callback))
    return await asyncio.gather(*pending)

def print_progress(key, count):
    """Print a one-line progress message with the reported count for `key`."""
    message = f"Fetched {count} header names for {key}"
    print(message)

async def main_3():
    async with aiohttp.ClientSession() as session:
        # Divide the keys into batches (adjust the batch size as needed)
        batch_size = 100
        keys_batches = [list(characters_links.keys())[i:i + batch_size] for i in range(0, len(characters_links), batch_size)]

        total_counter = 0
        for keys_batch in keys_batches:
            batch_results = await process_batch(keys_batch, session, total_counter, print_progress)
            total_counter += sum(batch_results)
            print(f"Total fetched header names: {total_counter}")

await main_3()


TooManyRedirects exception for https://naruto.fandom.com/wiki/Kawaki_%26_Himawari_Academy_Arc: 0, message='', url=URL('https://naruto.fandom.com/wiki/Kawaki_&_Himawari_Academy_Arc')
TooManyRedirects exception for https://naruto.fandom.com/wiki/Kawaki_%26_Himawari_Academy_Arc: 0, message='', url=URL('https://naruto.fandom.com/wiki/Kawaki_&_Himawari_Academy_Arc')
TooManyRedirects exception for https://naruto.fandom.com/wiki/Kawaki_%26_Himawari_Academy_Arc: 0, message='', url=URL('https://naruto.fandom.com/wiki/Kawaki_&_Himawari_Academy_Arc')
Fetched 100 header names for Ao
Fetched 100 header names for Akatsuchi
Fetched 100 header names for Asuma_Sarutobi
Fetched 100 header names for Akamaru
Fetched 100 header names for A_(Third_Raikage)
Fetched 100 header names for Anko_Mitarashi
Fetched 100 header names for A_(Fourth_Raikage)
TooManyRedirects exception for https://naruto.fandom.com/wiki/Kawaki_%26_Himawari_Academy_Arc: 0, message='', url=URL('https://naruto.fandom.com/wiki/Kawaki_&_Hima

The corrected link names are stored into a json file:

In [12]:
# characters_links now holds the resolved header names; write it to its own file
corrected_links_path = '../data/characters_links_corrected.json'
with open(corrected_links_path, 'w') as json_file:
    json.dump(characters_links, json_file)

# Delete?

In [13]:
import requests
from bs4 import BeautifulSoup

url = "https://naruto.fandom.com/wiki/Enter:_Naruto_Uzumaki!"

# Make an HTTP request to get the HTML content. The timeout keeps the cell
# from hanging forever if the server never answers.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the first table with class 'wikitable'; find() returns None if there is none
    table = soup.find('table', class_='wikitable')

    if table is None:
        # Without this guard, table.find_all would raise AttributeError
        print("No table with class 'wikitable' found on the page.")
    else:
        # Find all rows in the table
        rows = table.find_all('tr')

        # Iterate through rows and extract the text from the 'Role' column
        for row in rows:
            role_column = row.find('td')  # Assuming the 'Role' is in a <td> tag
            if role_column:
                role_text = role_column.get_text(strip=True)
                print(f"Role: {role_text}")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Role: Naruto Uzumaki
Role: Sasuke Uchiha
Role: Sakura Haruno
Role: Third Hokage: Sarutobi
Role: Iruka
Role: Shikamaru Nara
Role: Ino Yamanaka
Role: Hinata Hyūga
Role: Mizuki
Role: Bekkō
Role: Iwana
Role: Yajirobee
Role: Ibara
Role: Tsubaki
Role: Tobio's father
Role: Tobio's mother
Role: Iruka (boyhood)


In [20]:
import requests
from bs4 import BeautifulSoup

url = "https://naruto.fandom.com/wiki/Vengeful_Strike!_The_Bracken_Dance!"

# Make an HTTP request to get the HTML content. The timeout keeps the cell
# from hanging forever if the server never answers.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the first table with class 'wikitable'; find() returns None if there is none
    table = soup.find('table', class_='wikitable')

    if table is None:
        # Without this guard, table.find_all would raise AttributeError
        print("No table with class 'wikitable' found on the page.")
    else:
        # Find all rows in the table
        rows = table.find_all('tr')

        # Find the header names (empty when the table has no rows, instead of
        # failing with IndexError on rows[0])
        header_row = rows[0] if rows else None
        headers = (
            [header.get_text(strip=True) for header in header_row.find_all('th')]
            if header_row is not None
            else []
        )

        # Iterate through rows and extract the role and associated header name
        for row in rows[1:]:  # Start from the second row to skip the header row
            role_column = row.find('td')  # Assuming the 'Role' is in a <td> tag
            if role_column:
                role_link = role_column.find('a')  # Assuming the role is a hyperlink
                if role_link:
                    # The link text is the role as credited; the 'title'
                    # attribute is the name of the linked page
                    role_text = role_link.get_text(strip=True)
                    title_attribute = role_link.get('title', '')
                    print(f"Role: {role_text}, Title: {title_attribute}")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Role: Naruto Uzumaki, Title: Naruto Uzumaki
Role: Rock Lee, Title: Rock Lee
Role: Gaara, Title: Gaara
Role: Kimimaro, Title: Kimimaro
Role: Orochimaru, Title: Orochimaru
Role: Kabuto Yakushi, Title: Kabuto Yakushi
Role: Kazekage, Title: Rasa
Role: Sakon, Title: Sakon
Role: Kidōmaru, Title: Kidōmaru
Role: Tayuya, Title: Tayuya
Role: Jirōbō, Title: Jirōbō
