# Arc fetching

This notebook fetches the arcs of the series. An arc is a group of episodes that together form one storyline of the series.

We start by importing the necessary packages:

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import networkx as nx
import asyncio
import aiohttp
import json

Now, with help from BeautifulSoup, a function is defined to fetch the arc names from the Naruto fandom page:

In [4]:
def scrape_arc_names(url):
    """
    Scrape arc names from a given URL.

    The page is fetched with ``requests`` and the text of the first
    ``<div class="category-page__members">`` element is split into
    individual names. If the request does not return status 200, or the
    div is missing, a message is printed and an empty list is returned.

    Parameters:
    - url (str): The URL of the page containing arc names.

    Returns:
    - names_list (list): A list of cleaned and formatted arc names
      (spaces replaced by underscores). Empty when nothing could be scraped.

    """
    response = requests.get(url)

    # Initialised before the status check so the clean-up below also works
    # when the request fails or the members div is absent.
    all_content = ""

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        category_members = soup.find('div', {'class': 'category-page__members'})

        if category_members:
            all_content = category_members.get_text()
        else:
            print("Div with class 'category-page__members' not found on the page.")
    else:
        print("Failed to retrieve the page. Status code:", response.status_code)

    # Flatten newlines to spaces, drop every "<any char><tab>" pair, then turn
    # any remaining tabs into spaces.
    cleaned_string = re.sub(r'.\t', '', all_content.replace('\n', ' ')).replace('\t', ' ')

    # Names are separated by runs of two or more whitespace characters.
    name_list = re.split(r'\s{2,}', cleaned_string)

    # Discard empty fragments, then make the names URL-friendly.
    name_list = [name.strip() for name in name_list if name.strip()]
    names_list = [name.replace(' ', '_') for name in name_list]

    return names_list


The function is called on the URL of the category page that lists the arcs, and the returned arc names are stored in a list:

In [5]:
# Category page that lists the arcs; its members are scraped into a list of names
arc_url = 'https://naruto.fandom.com/wiki/Category:Arcs'
arc_names_list = scrape_arc_names(url=arc_url)

['Plot_of_Naruto', 'Academy_Entrance_Arc', 'Akatsuki_Suppression_Mission', 'Ao_Arc', 'Bikōchū_Search_Mission', "Birth_of_the_Ten-Tails'_Jinchūriki", "Boruto's_Return_Arc", 'Buried_Gold_Excavation_Mission', 'Byakuya_Gang_Arc', 'Childhood', 'Chūnin_Exams_(Arc)', 'Chōchō_Arc', 'Chūnin_Re-Examination_Arc', "Code's_Assault_Arc", 'Cursed_Warrior_Extermination_Mission', 'Kurosuki_Family_Removal_Mission', 'Kaguya_Ōtsutsuki_Strikes', 'Kaima_Capture_Mission', 'Kakashi_Gaiden', "Kakashi's_Anbu_Arc:_The_Shinobi_That_Lives_in_the_Darkness", 'Kara_Actuation_Arc', 'Kawaki_&_Himawari_Academy_Arc', 'Kawaki_Arc', 'Kazekage_Rescue_Mission', 'Konoha_Crush_(Arc)', 'Konoha_Hiden:_The_Perfect_Day_for_a_Wedding_(Arc)', 'Konoha_Plans_Recapture_Mission', "Konohamaru's_Love_Arc", 'Fated_Battle_Between_Brothers', 'Five_Kage_Summit_(Arc)', 'Fourth_Shinobi_World_War:_Climax', 'Fourth_Shinobi_World_War:_Confrontation', 'Fourth_Shinobi_World_War:_Countdown', 'Gantetsu_Escort_Mission', 'Genin_Mission_Arc', 'Gosunkugi_

Now, a function to fetch each episode name within each arc is defined:

In [27]:
def scrape_arc_episodes(url):
    """
    Scrape arc episodes data from a given URL.

    The function looks for a ``<span class="mw-headline">`` whose text is
    'Episodes' and reads the next table with class
    'box table colored bordered innerbordered style-basic'. The first table
    row is treated as the header and skipped. Rows that lack a ``<th>`` cell
    or have no ``<td>`` cells are skipped rather than raising.

    Parameters:
    - url (str): The URL of the page containing arc episodes data.

    Returns:
    - episode_data (list of dict): A list of dictionaries containing episode data
      ('Episode Number' and 'Episode Title'). 'Episode Title' is None when the
      first data cell contains no link. Returns None (after printing a message)
      when the request fails, the 'Episodes' headline is missing, or the table
      is missing.

    """
    response = requests.get(url)

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    title_span = soup.find('span', class_='mw-headline', string='Episodes')
    if not title_span:
        print("Title with text 'Episode' not found on the page.")
        return None

    table = title_span.find_next('table', class_='box table colored bordered innerbordered style-basic')
    if not table:
        print("Table not found after the title 'Episode'.")
        return None

    episode_data = []

    for row in table.find_all('tr')[1:]:  # Skip the header row
        number_cell = row.find('th')
        columns = row.find_all('td')

        # Rows without a number cell or without data cells cannot yield an
        # episode record; skip them instead of failing on None / index 0.
        if number_cell is None or not columns:
            continue

        episode_number = number_cell.get_text(strip=True)

        # The title is the text of the first link in the first data cell.
        episode_title_tag = columns[0].find('a')
        episode_title = episode_title_tag.get_text(strip=True).replace(' ', '_') if episode_title_tag else None

        episode_data.append({
            'Episode Number': episode_number,
            'Episode Title': episode_title,
        })

    return episode_data


Now the function is called in order to retrieve the episode names within each arc:

In [28]:
# Base address of the Naruto Fandom wiki; each arc name is appended to it to form the page URL
url_base = 'https://naruto.fandom.com/wiki/'

# Maps arc name -> list of episode records returned by scrape_arc_episodes
arc_episodes_dict = {}

for arc in arc_names_list:
    url = f'{url_base}{arc}'
    arc_episodes = scrape_arc_episodes(url)

    # Both None and an empty list are treated as a failed fetch
    if not arc_episodes:
        print(f'Failed to retrieve episode data for the arc {arc}')
        continue

    arc_episodes_dict[arc] = arc_episodes


Title with text 'Episode' not found on the page.
Failed to retrieve episode data for the arc Plot_of_Naruto
Title with text 'Episode' not found on the page.
Failed to retrieve episode data for the arc Boruto's_Return_Arc
Title with text 'Episode' not found on the page.
Failed to retrieve episode data for the arc Omnipotence_Arc
Title with text 'Episode' not found on the page.
Failed to retrieve episode data for the arc Sasuke_Retsuden:_The_Uchiha_Descendants_and_the_Heavenly_Stardust_(manga)


Finally, the data is stored as a JSON file so that it can be used for analysis:

In [30]:
# Serialise the arc -> episodes mapping and write it to the JSON data file
with open('./data/arc_episodes_data.json', 'w') as json_file:
    json_file.write(json.dumps(arc_episodes_dict))