In [12]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import networkx as nx
import asyncio
import aiohttp


In [13]:
# Base URL for Narutopedia character category
url_base = 'https://naruto.fandom.com/wiki/Category:Characters{}'
query_list = [
    '', 
    '?from=Eiki+Fūma%0AEiki+Fūma', 
    '?from=Hidari%0AHidari', 
    '?from=Karai%0AKarai', 
    '?from=Matsuba%0AMatsuba', 
    '?from=Rikumaru%0ARikumaru', 
    '?from=Taiki%0ATaiki', 
    '?from=Yubina%0AYubina'
]

# Create an empty string to store the concatenated content
all_content = ""

for query in query_list:
    # Construct the URL for the current query
    url = url_base.format(query)
    
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the element with class 'category-page__members'
        category_members = soup.find('div', {'class': 'category-page__members'})

        # Extract the content within the 'category-page__members' div
        if category_members:
            content = category_members.get_text()
            # Append the content to the all_content string
            all_content += content
        else:
            print("Div with class 'category-page__members' not found on the page.")
    else:
        print("Failed to retrieve the page. Status code:", response.status_code)

# Save the concatenated content to a single text file
with open('narutopedia_combined_content.txt', 'w', encoding='utf-8') as file:
    file.write(all_content)
    print("Saved combined content to narutopedia_combined_content.txt")


Saved combined content to narutopedia_combined_content.txt


In [14]:
# cReate names_list

cleaned_string = re.sub(r'.\t', '', all_content.replace('\n', ' ')).replace('\t', ' ')

# Split the input string by two or more whitespace characters using regular expression
name_list = re.split(r'\s{2,}', cleaned_string)

# Filter out any empty strings
name_list = [name.strip() for name in name_list if name.strip()]

names_list = []
for name in name_list:
    name=name.replace(' ', '_')
    names_list.append(name)



In [36]:
names_list_short = names_list[:500]

In [32]:
url_characters = 'https://naruto.fandom.com/wiki/{}'

characters_texts = {}
characters_links = {}

for name in names_list_short:
        
        url_character = url_characters.format(name)
        response = requests.get(url_character)
        
        if response.status_code == 200:
                # Parse the HTML content of the page using BeautifulSoup
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find the element with class 'category-page__members'
                category_members = soup.find_all('p')
                all_links = [tag['href'].replace('/wiki/', '') for tag in soup.select('p a[href]')]

                characters_links[name] = all_links

                for data in category_members:
                        characters_texts[name] = data.get_text()
                        
                        

In [40]:
url_characters = 'https://naruto.fandom.com/wiki/{}'

characters_texts = {}
characters_links = {}


async def fetch_character_data(name, session):
    url_character = url_characters.format(name)
    async with session.get(url_character) as response:
        if response.status == 200:
            html_content = await response.text()
            soup = BeautifulSoup(html_content, 'html.parser')
            
            category_members = soup.find_all('p')
            all_links = [tag['href'].replace('/wiki/', '') for tag in soup.select('p a[href]')]

            characters_links[name] = all_links

            for data in category_members:
                characters_texts[name] = data.get_text()


async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_character_data(name, session) for name in names_list]
        await asyncio.gather(*tasks)

await main()


In [41]:
for link in all_links:
    link = link.replace('/wiki/', '')
    print(link)

Medical-nin
Konohagakure
Sarutobi_clan
Hokage
Hiruzen_Sarutobi
Academy
Madara_Uchiha
Hashirama_Senju
#cite_note-2
Hiruzen_Sarutobi
Kushina_Uzumaki
Minato_Namikaze
Jinch%C5%ABriki
Anbu
Taji
Mikoto_Uchiha
Sasuke_Uchiha
Sasuke_Sarutobi
Naruto
Obito_Uchiha
#cite_note-3
Minato
Hokage
#cite_note-chpt500p12-4
J%C5%8Dnin
Jinch%C5%ABriki
Medical-nin
#cite_note-d4-1


In [44]:
len(characters_texts), len(characters_links)

(1448, 1448)

In [16]:
url_example = 'https://naruto.fandom.com/wiki/Ada'
response = requests.get(url_example)
if response.status_code == 200:
        # Parse the HTML content of the page using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the element with class 'category-page__members'
        table = soup.find('table', attrs={'class':'infobox box colored bordered innerbordered fill-td type-character list-noicon float-right-clear'})
        headings = [th.get_text() for th in table.find('tr').find_all('th', attrs={'class': not 'mainheader'})]
        
        datasets = []
        for row in table.find_all("tr")[1:]:
            dataset = zip(headings, (td.get_text() for td in row.find_all("td")))
            datasets.append(tuple(dataset))
        
        print(datasets)

[(), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), (), ()]
