In [1]:
import asyncio
import json
import os
import re

import aiohttp
import networkx as nx
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [7]:
def fetch_names(url, timeout=30):
    """Download a page and return the text of its category member listing.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Bounds how long the call waits on the
        server so a stalled connection raises instead of blocking the
        notebook forever. Defaults to 30 seconds.

    Returns
    -------
    str
        Text of the first ``div`` with class ``category-page__members``.
        An empty string is returned (and a message printed) when the
        response status is not 200 or when no such ``div`` exists.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than a plain 200 is reported and treated as "no content".
    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')

    # `find` returns None when the page has no member listing.
    category_members = soup.find('div', {'class': 'category-page__members'})
    if category_members is None:
        print("Div with class 'category-page__members' not found on the page.")
        return ""

    return category_members.get_text()


def clean_names(string):
    """Turn a block of scraped text into a list of underscore-joined names.

    Steps, in order:
      1. newlines become single spaces;
      2. every tab is removed together with the character right before it;
      3. any tab still left becomes a space;
      4. the text is split wherever two or more whitespace characters meet;
      5. each piece is stripped, empty pieces are dropped, and inner spaces
         are replaced by underscores.

    Returns a list of strings (empty when the input has no names).
    """
    flattened = string.replace('\n', ' ')
    without_tab_pairs = re.sub(r'.\t', '', flattened)
    spaced = without_tab_pairs.replace('\t', ' ')

    # Strip lazily so each piece is trimmed only once before filtering.
    pieces = (piece.strip() for piece in re.split(r'\s{2,}', spaced))

    return [piece.replace(' ', '_') for piece in pieces if piece]


In [8]:
# URL of the Narutopedia "Villages" category page
url_villages_list = 'https://naruto.fandom.com/wiki/Category:Villages'

# Scrape the raw member listing, then normalise it into a list of
# underscore-joined names
villages_string = fetch_names(url_villages_list)
villages_list = clean_names(villages_string)


In [10]:
# URL template for a single wiki page; `{}` is filled with the page name
url_base = 'https://naruto.fandom.com/wiki/{}'

# Module-level result stores populated by fetch_character_data, keyed by name:
# text taken from the page's <p> elements, and the link targets found in them
villages_texts = {}
villages_links = {}


async def fetch_character_data(name, session):
    """Fetch one wiki page and record its paragraph text and links.

    Results are stored in the module-level dictionaries ``villages_texts``
    and ``villages_links`` under the key ``name``; nothing is returned.

    Parameters
    ----------
    name : str
        Page name substituted into ``url_base``.
    session : aiohttp.ClientSession
        Open session used to issue the GET request.

    Notes
    -----
    A non-200 response is reported with a printed message and leaves both
    dictionaries untouched for ``name``.
    """
    url_village = url_base.format(name)

    async with session.get(url_village) as response:
        if response.status != 200:
            # Report the failure instead of silently skipping the page.
            print("Failed to retrieve the page for", name,
                  "- status code:", response.status)
            return

        html_content = await response.text()
        soup = BeautifulSoup(html_content, 'html.parser')

        paragraphs = soup.find_all('p')

        # Targets of every link inside a paragraph, without the '/wiki/' prefix.
        villages_links[name] = [
            tag['href'].replace('/wiki/', '') for tag in soup.select('p a[href]')
        ]

        # Keep the text of ALL paragraphs. Assigning inside a loop would
        # overwrite the entry each time and retain only the last paragraph.
        villages_texts[name] = "\n".join(
            paragraph.get_text() for paragraph in paragraphs
        )


async def main():
    """Fetch every page named in ``villages_list`` concurrently.

    A single ``aiohttp.ClientSession`` is shared by all requests, and the
    coroutine finishes once every ``fetch_character_data`` call has completed.
    """
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            *(fetch_character_data(village, session) for village in villages_list)
        )

# Top-level `await` relies on the notebook kernel already running an event
# loop; in a plain Python script this would be `asyncio.run(main())`.
await main()


In [13]:
# Persist the scraped results as JSON files under ./data.
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError after all the scraping work has already been done.
os.makedirs('./data', exist_ok=True)

# List of village page names
with open('./data/villages_list.json', 'w', encoding='utf-8') as json_file:
    json.dump(villages_list, json_file)

# Mapping of village name -> paragraph text
with open('./data/villages_texts.json', 'w', encoding='utf-8') as json_file:
    json.dump(villages_texts, json_file)

# Mapping of village name -> list of link targets
with open('./data/villages_links.json', 'w', encoding='utf-8') as json_file:
    json.dump(villages_links, json_file)
