# Fetching village data

This notebook explains the process of fetching information about the villages in the Naruto series.

First off, we import the packages:

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import networkx as nx
import asyncio
import aiohttp
import json

We define two functions in the following code snippet: one for fetching the names of the villages from the Naruto fandom page, and another for cleaning the fetched data:

In [7]:
def fetch_names(url, timeout=10):
    """
    Fetch the text of the category member listing found at a given URL.

    Parameters:
    - url (str): The URL to fetch content from.
    - timeout (float): Seconds to wait for the server before requests
      raises a timeout error. Defaults to 10.

    Returns:
    - all_content (str): The text inside the 'category-page__members' div,
      or an empty string if the request did not return status 200 or the
      div is not present on the page.

    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # server, which would hang the notebook cell.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')
    category_members = soup.find('div', {'class': 'category-page__members'})

    if not category_members:
        print("Div with class 'category-page__members' not found on the page.")
        return ""

    return category_members.get_text()


def clean_names(string):
    """
    Turn the raw text of a category listing into a list of names.

    Parameters:
    - string (str): The raw text containing the names, e.g. the value
      returned by fetch_names.

    Returns:
    - clean_list (list): The names with surrounding whitespace removed and
      inner spaces replaced by underscores.

    """
    # Flatten line breaks first so the patterns below work on a single line.
    flattened = string.replace('\n', ' ')

    # Drop every "<any character><tab>" pair, then turn leftover tabs into
    # spaces. NOTE(review): presumably this removes single-letter section
    # headers from the listing -- confirm against the page markup.
    without_tabs = re.sub(r'.\t', '', flattened).replace('\t', ' ')

    # Names are separated by runs of two or more whitespace characters.
    pieces = (piece.strip() for piece in re.split(r'\s{2,}', without_tabs))

    # Skip empty pieces and use underscores instead of spaces.
    return [piece.replace(' ', '_') for piece in pieces if piece]

The functions are called for fetching and cleaning the village information:

In [8]:
# Category page on the Naruto fandom wiki that lists the villages.
url_villages_list = 'https://naruto.fandom.com/wiki/Category:Villages'

# Download the raw listing text, then split it into underscore-separated names.
villages_string = fetch_names(url_villages_list)
villages_list = clean_names(villages_string)

Now, we want to fetch the description of each village and the hyperlinks on each village's page, in order to see whether other villages are referenced in a village's description:

In [10]:
# Prefix of individual wiki pages; a village name is appended to build its URL.
url_base = 'https://naruto.fandom.com/wiki/'

# Filled in by fetch_character_data, keyed by village name.
villages_texts = {}
villages_links = {}


async def fetch_character_data(name, session):
    """
    Fetch the paragraph text and paragraph links of one village page.

    Despite its name, this coroutine fetches village data: it requests
    url_base + name and stores the results in the module-level dictionaries
    villages_texts and villages_links under the key name.

    Parameters:
    - name (str): The name of the village.
    - session: aiohttp.ClientSession object for making asynchronous requests.

    Returns:
    - None. On a 200 response, villages_links[name] holds the href of every
      link found inside a <p> element (with '/wiki/' removed) and
      villages_texts[name] holds the text of all <p> elements joined by
      newlines (an empty string if the page has none). On any other status
      a message is printed and nothing is stored.

    """
    url_village = f'{url_base}{name}'

    async with session.get(url_village) as response:
        if response.status != 200:
            # Report the failure instead of silently leaving the village out.
            print(f"Failed to retrieve {url_village}. Status code:", response.status)
            return

        html_content = await response.text()

    soup = BeautifulSoup(html_content, 'html.parser')

    villages_links[name] = [
        tag['href'].replace('/wiki/', '') for tag in soup.select('p a[href]')
    ]

    # Join every paragraph so the stored description covers the whole page
    # rather than a single paragraph.
    villages_texts[name] = '\n'.join(
        paragraph.get_text() for paragraph in soup.find_all('p')
    )


async def main():
    """
    Fetch the data of every village in villages_list concurrently.

    One aiohttp.ClientSession is opened and shared by all requests; the
    coroutine returns once every fetch_character_data call has finished.

    """
    async with aiohttp.ClientSession() as http_session:
        pending = (
            fetch_character_data(village, http_session)
            for village in villages_list
        )
        await asyncio.gather(*pending)

# Top-level await is supported in notebook cells (IPython autoawait);
# a plain script would need asyncio.run(main()) instead.
await main()

Finally, the names of the villages, the descriptions of the villages, and the links found on the village pages are saved as JSON files:

In [13]:
# Write each collected object to its own JSON file under ./data.
json_outputs = (
    ('./data/villages_list.json', villages_list),
    ('./data/villages_texts.json', villages_texts),
    ('./data/villages_links.json', villages_links),
)

for output_path, payload in json_outputs:
    with open(output_path, 'w') as json_file:
        json.dump(payload, json_file)