In [40]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import networkx as nx
import asyncio
import aiohttp


In [41]:
# Base URL for Narutopedia character category
url_base = 'https://naruto.fandom.com/wiki/Category:Characters{}'
# Query strings appended to the base URL, one request per entry.
# NOTE(review): the '?from=' values look like hand-copied pagination cursors —
# confirm they still cover the whole category if the wiki has grown.
query_list = [
    '', 
    '?from=Eiki+Fūma%0AEiki+Fūma', 
    '?from=Hidari%0AHidari', 
    '?from=Karai%0AKarai', 
    '?from=Matsuba%0AMatsuba', 
    '?from=Rikumaru%0ARikumaru', 
    '?from=Taiki%0ATaiki', 
    '?from=Yubina%0AYubina'
]

# Upper bound (seconds) for each HTTP request; without it a stalled
# connection would block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Text of each page's member listing, in query_list order
page_texts = []

# One session so the TCP connection is reused across the category pages
with requests.Session() as http:
    for query in query_list:
        # Construct the URL for the current query
        url = url_base.format(query)

        try:
            response = http.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Transport errors are reported like HTTP errors: log and move on
            print("Failed to retrieve the page. Error:", exc)
            continue

        if response.status_code != 200:
            print("Failed to retrieve the page. Status code:", response.status_code)
            continue

        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the element with class 'category-page__members'
        category_members = soup.find('div', {'class': 'category-page__members'})

        if category_members:
            page_texts.append(category_members.get_text())
        else:
            print("Div with class 'category-page__members' not found on the page.")

# Concatenated text of all pages (consumed by the name-cleaning cell)
all_content = "".join(page_texts)

# Save the concatenated content to a single text file
with open('narutopedia_combined_content.txt', 'w', encoding='utf-8') as file:
    file.write(all_content)
    print("Saved combined content to narutopedia_combined_content.txt")


Saved combined content to narutopedia_combined_content.txt


In [42]:
# Flatten the scraped text onto one line, drop every "<any char><tab>" pair,
# then turn whatever tabs remain into spaces.
flattened_content = all_content.replace('\n', ' ')
cleaned_string = re.sub(r'.\t', '', flattened_content).replace('\t', ' ')

# Runs of two or more whitespace characters separate the entries;
# blank fragments are discarded and the rest are trimmed.
name_list = [
    fragment.strip()
    for fragment in re.split(r'\s{2,}', cleaned_string)
    if fragment.strip()
]

# Wiki page titles use underscores where the display name has spaces.
names_list = [entry.replace(' ', '_') for entry in name_list]



In [40]:
url_characters = 'https://naruto.fandom.com/wiki/{}'

# Module-level results, filled in by fetch_character_data:
#   characters_texts: page name -> text of all <p> elements on the page
#   characters_links: page name -> hrefs of links found inside <p> elements
characters_texts = {}
characters_links = {}


async def fetch_character_data(name, session):
    """Download one character page and record its paragraph text and links.

    Args:
        name: Page title, substituted into ``url_characters``.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``text()``
            (an ``aiohttp.ClientSession`` in this notebook).

    Side effects:
        On a 200 response, stores the link list in ``characters_links[name]``
        and, if the page has at least one ``<p>``, the newline-joined text of
        every non-empty paragraph in ``characters_texts[name]``.
        On any other status, prints a failure message and stores nothing.
    """
    url_character = url_characters.format(name)
    async with session.get(url_character) as response:
        if response.status != 200:
            print(f"Failed to fetch data for {name}. Status code: {response.status}")
            return
        html_content = await response.text()

    soup = BeautifulSoup(html_content, 'html.parser')

    # hrefs of anchors nested in paragraphs, with the '/wiki/' prefix removed
    characters_links[name] = [
        tag['href'].replace('/wiki/', '') for tag in soup.select('p a[href]')
    ]

    paragraphs = soup.find_all('p')
    if paragraphs:
        # Keep every paragraph: assigning inside a loop would leave only the
        # last <p> of the page in the result.
        stripped = (paragraph.get_text().strip() for paragraph in paragraphs)
        characters_texts[name] = '\n'.join(text for text in stripped if text)


async def main():
    """Run fetch_character_data for every entry of names_list concurrently.

    All requests share a single aiohttp.ClientSession, which is closed once
    every fetch has completed.
    """
    async with aiohttp.ClientSession() as session:
        pending = (fetch_character_data(name, session) for name in names_list)
        await asyncio.gather(*pending)

# Kick off the character-page scrape. This is a top-level `await`, so the cell
# relies on the notebook front end accepting `await` outside a function.
await main()


In [43]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup

url_base = 'https://naruto.fandom.com/wiki/{}'
# page name -> {infobox label: text or list of list-item texts}
characters_infobox = {}


def _collect_personal_rows(infobox_table):
    """Build a dict from the infobox rows that come after the 'Personal' header.

    Rows are ignored until a <th> containing 'Personal' is seen; that row only
    switches capturing on. Later keys overwrite earlier ones with the same
    label, because the pairs are folded into a plain dict.
    """
    pairs = []
    capturing = False

    for row in infobox_table.find_all('tr'):
        th = row.find('th')
        td = row.find('td')

        # Any header mentioning 'Personal' is a marker row, never data.
        if th and 'Personal' in th.get_text():
            capturing = True
            continue

        if not capturing:
            continue

        key = th.get_text(strip=True) if th else None
        value = td.get_text(strip=True) if td else None

        if td:
            # <ul> elements that are direct children of the cell become a
            # flat list of their <li> texts.
            items = [
                li.get_text(strip=True)
                for ul in td.find_all('ul', recursive=False)
                for li in ul.find_all('li')
            ]
            if items:
                # NOTE(review): every list row without a <th> shares this
                # fallback label, so only the last such row survives — confirm
                # this is intended.
                pairs.append((key or "Additional Information", items))
                continue

        # A 'mainheader' cell takes its value from the text of the next row.
        if th and 'mainheader' in th.get('class', []):
            following_row = row.find_next('tr')
            if following_row:
                value = following_row.get_text(strip=True)

        if key and value:
            pairs.append((key, value))

    return dict(pairs)


async def fetch_infobox_data(name, session):
    """Fetch one character page and store its parsed infobox.

    Args:
        name: Page title, substituted into ``url_base``.
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with ``status`` and an awaitable ``text()``.

    Side effects:
        Stores the parsed rows in ``characters_infobox[name]`` when the page
        has a ``<table class="infobox">``; otherwise prints why nothing was
        stored (missing infobox or non-200 status).
    """
    async with session.get(url_base.format(name)) as response:
        if response.status != 200:
            print(f"Failed to fetch data for {name}. Status code: {response.status}")
            return

        soup = BeautifulSoup(await response.text(), 'html.parser')
        infobox_table = soup.find('table', class_='infobox')
        if not infobox_table:
            print(f"No infobox found for {name}")
            return

        characters_infobox[name] = _collect_personal_rows(infobox_table)


async def main_2():
    """Run fetch_infobox_data for every entry of names_list concurrently.

    All requests share a single aiohttp.ClientSession, which is closed once
    every fetch has completed.
    """
    async with aiohttp.ClientSession() as session:
        pending = (fetch_infobox_data(name, session) for name in names_list)
        await asyncio.gather(*pending)

# Kick off the infobox scrape. This is a top-level `await`, so the cell relies
# on the notebook front end accepting `await` outside a function.
await main_2()

No infobox found for Zetsu


In [45]:
import json
from pathlib import Path

# Make sure the output directory exists so a long scrape is not lost to a
# FileNotFoundError at the very last step.
output_dir = Path('./data')
output_dir.mkdir(parents=True, exist_ok=True)

# Save each result dictionary to its own JSON file
for filename, payload in (
    ('characters_texts.json', characters_texts),
    ('characters_links.json', characters_links),
    ('characters_infobox.json', characters_infobox),
):
    with open(output_dir / filename, 'w', encoding='utf-8') as json_file:
        json.dump(payload, json_file)

