In [1]:
import asyncio
import json
import os
import re

import aiohttp
import networkx as nx
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
def scrape_arc_names(url, timeout=30):
    """Scrape the member names listed on a category page.

    Downloads ``url``, reads the text of the ``div`` with class
    ``category-page__members``, splits that text into entries and returns
    each entry with its spaces replaced by underscores.

    Args:
        url: Address of the category page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A list of name strings. The list is empty when the request raises,
        the response status is not 200, or the members div is missing; a
        message is printed in each of those cases.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print("Failed to retrieve the page. Error:", error)
        return []

    # Bail out early: previously this path fell through to the cleaning code
    # with no text to clean and raised UnboundLocalError.
    if response.status_code != 200:
        print("Failed to retrieve the page. Status code:", response.status_code)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    category_members = soup.find('div', {'class': 'category-page__members'})
    if category_members is None:
        print("Div with class 'category-page__members' not found on the page.")
        return []

    all_content = category_members.get_text()

    # Newlines become spaces, then every tab is removed together with the one
    # character in front of it, and any tab still left becomes a space.
    # NOTE(review): presumably this drops the single-letter section headings
    # of the category listing -- confirm against the page markup.
    cleaned_string = re.sub(r'.\t', '', all_content.replace('\n', ' ')).replace('\t', ' ')

    # Entries are separated by runs of two or more whitespace characters.
    name_list = re.split(r'\s{2,}', cleaned_string)

    # Drop blank pieces and turn inner spaces into underscores.
    return [name.strip().replace(' ', '_') for name in name_list if name.strip()]


In [5]:
# Category page whose member list is handed to scrape_arc_names.
arc_url = 'https://naruto.fandom.com/wiki/Category:Arcs'
# arc_names_list is read again by the later cell that loops over it to fetch
# the episodes of each arc.
arc_names_list = scrape_arc_names(arc_url)

# Print the complete list of scraped names.
print(arc_names_list)

['Plot_of_Naruto', 'Academy_Entrance_Arc', 'Akatsuki_Suppression_Mission', 'Ao_Arc', 'Bikōchū_Search_Mission', "Birth_of_the_Ten-Tails'_Jinchūriki", "Boruto's_Return_Arc", 'Buried_Gold_Excavation_Mission', 'Byakuya_Gang_Arc', 'Childhood', 'Chūnin_Exams_(Arc)', 'Chōchō_Arc', 'Chūnin_Re-Examination_Arc', "Code's_Assault_Arc", 'Cursed_Warrior_Extermination_Mission', 'Kurosuki_Family_Removal_Mission', 'Kaguya_Ōtsutsuki_Strikes', 'Kaima_Capture_Mission', 'Kakashi_Gaiden', "Kakashi's_Anbu_Arc:_The_Shinobi_That_Lives_in_the_Darkness", 'Kara_Actuation_Arc', 'Kawaki_&_Himawari_Academy_Arc', 'Kawaki_Arc', 'Kazekage_Rescue_Mission', 'Konoha_Crush_(Arc)', 'Konoha_Hiden:_The_Perfect_Day_for_a_Wedding_(Arc)', 'Konoha_Plans_Recapture_Mission', "Konohamaru's_Love_Arc", 'Fated_Battle_Between_Brothers', 'Five_Kage_Summit_(Arc)', 'Fourth_Shinobi_World_War:_Climax', 'Fourth_Shinobi_World_War:_Confrontation', 'Fourth_Shinobi_World_War:_Countdown', 'Gantetsu_Escort_Mission', 'Genin_Mission_Arc', 'Gosunkugi_

In [27]:
def scrape_arc_episodes(url, timeout=30):
    """Scrape the episode table that follows the 'Episodes' heading of a page.

    Args:
        url: Address of the page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A list with one dict per usable table row, holding the keys
        ``'Episode Number'`` (text of the row's ``th``) and
        ``'Episode Title'`` (text of the first link in the row's first
        ``td`` with spaces replaced by underscores, or ``None`` when that
        cell has no link). Returns ``None``, after printing a message, when
        the request raises, the status is not 200, the heading is missing,
        or the table is missing.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to retrieve data. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the section heading, then the first matching table after it.
    title_span = soup.find('span', class_='mw-headline', string='Episodes')
    if title_span is None:
        print("Title with text 'Episode' not found on the page.")
        return None

    table = title_span.find_next('table', class_='box table colored bordered innerbordered style-basic')
    if table is None:
        print("Table not found after the title Episode.")
        return None

    episode_data = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        number_cell = row.find('th')
        columns = row.find_all('td')

        # A row without a number cell or without any data cell cannot yield
        # an episode entry; skip it instead of raising on None / index 0.
        if number_cell is None or not columns:
            continue

        # The title is the text of the first link in the first data cell.
        title_tag = columns[0].find('a')
        if title_tag is not None:
            episode_title = title_tag.get_text(strip=True).replace(' ', '_')
        else:
            episode_title = None

        episode_data.append({
            'Episode Number': number_cell.get_text(strip=True),
            'Episode Title': episode_title,
        })

    return episode_data


In [28]:
# Every arc page lives directly under this wiki prefix.
url_base = 'https://naruto.fandom.com/wiki/'

# Maps each arc name to the episode list scraped from its page.
arc_episodes_dict = {}
for arc in arc_names_list:
    url = f'{url_base}{arc}'
    arc_episodes = scrape_arc_episodes(url)

    # A None or empty result means nothing usable was scraped for this arc.
    if not arc_episodes:
        print(f'Failed to retrieve episode data for the arc {arc}')
        continue

    arc_episodes_dict[arc] = arc_episodes



Title with text 'Episode' not found on the page.
Failed to retrieve episode data for the arc Plot_of_Naruto
Title with text 'Episode' not found on the page.
Failed to retrieve episode data for the arc Boruto's_Return_Arc
Title with text 'Episode' not found on the page.
Failed to retrieve episode data for the arc Omnipotence_Arc
Title with text 'Episode' not found on the page.
Failed to retrieve episode data for the arc Sasuke_Retsuden:_The_Uchiha_Descendants_and_the_Heavenly_Stardust_(manga)


In [30]:
# Save arc episodes data to json file.
# Create the output directory first so the write also succeeds on a fresh
# checkout where ./data does not exist yet.
os.makedirs('./data', exist_ok=True)
with open('./data/arc_episodes_data.json', 'w') as json_file:
    json.dump(arc_episodes_dict, json_file)