In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import networkx as nx
import asyncio
import aiohttp
import json
from urllib.parse import quote


In [2]:
import requests
from bs4 import BeautifulSoup

def scrape_episode_data(url, title_text):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find the span tag with the specified text and class
        title_span = soup.find('span', class_='mw-headline', string=title_text)

        if title_span:
            # Navigate to the parent h3 tag
            title_tag = title_span.find_parent('h3')

            # Find the corresponding table by searching for the closest table tag after the title
            table = title_tag.find_next('table', class_='box table coloured bordered innerbordered style-basic fill-horiz')

            if table:
                episode_data = []

                # Loop through rows in the table
                for row in table.find_all('tr')[1:]:  # Skip the header row
                    # Extract data from each column in the row
                    columns = row.find_all('td')

                    # Assuming the structure is consistent, extract data from each column
                    episode_number = row.find('th').get_text(strip=True)

                    # Check if anchor tag is present before trying to get its text
                    episode_title_tag = columns[0].find('a')
                    episode_title = episode_title_tag.get_text(strip=True).replace(' ', '_') if episode_title_tag else None

                    japanese_airdate = columns[1].get_text(strip=True) if len(columns) > 1 else None

                    # Append data to the list of dictionaries
                    episode_data.append({
                        'Episode Number': episode_number,
                        'Episode Title': episode_title,
                        'Japanese Airdate': japanese_airdate,
                    })

                return episode_data

            else:
                print(f"Table not found after the title with text '{title_text}'.")
                return None

        else:
            print(f"Title with text '{title_text}' not found on the page.")
            return None

    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None


In [3]:
# Example usage:
url = "https://naruto.fandom.com/wiki/List_of_Animated_Media"
season_titles = ['Naruto: Original', 'Naruto: Shippūden', 'Boruto: Naruto Next Generations']

episodes_seasons_data = {}

for season in season_titles:
    data = scrape_episode_data(url, season)
    if data is not None:
        episodes_seasons_data[season] = data


In [4]:
for season, data in episodes_seasons_data.items():
    print(f"\nSeason: {season}")
    if data:
        for episode in data:
            print(episode)
    else:
        print("No data found.")


Season: Naruto: Original
{'Episode Number': '1', 'Episode Title': 'Enter:_Naruto_Uzumaki!', 'Japanese Airdate': '3 October 2002'}
{'Episode Number': '2', 'Episode Title': 'My_Name_is_Konohamaru!', 'Japanese Airdate': '10 October 2002'}
{'Episode Number': '3', 'Episode Title': 'Sasuke_and_Sakura:_Friends_or_Foes?', 'Japanese Airdate': '17 October 2002'}
{'Episode Number': '4', 'Episode Title': 'Pass_or_Fail:_Survival_Test', 'Japanese Airdate': '24 October 2002'}
{'Episode Number': '5', 'Episode Title': "You_Failed!_Kakashi's_Final_Decision", 'Japanese Airdate': '31 October 2002'}
{'Episode Number': '6', 'Episode Title': 'A_Dangerous_Mission!_Journey_to_the_Land_of_Waves!', 'Japanese Airdate': '7 November 2002'}
{'Episode Number': '7', 'Episode Title': 'The_Assassin_of_the_Mist!', 'Japanese Airdate': '14 November 2002'}
{'Episode Number': '8', 'Episode Title': 'The_Oath_of_Pain', 'Japanese Airdate': '21 November 2002'}
{'Episode Number': '9', 'Episode Title': 'Kakashi:_Sharingan_Warrior

In [26]:
with open('./data/episodes_seasons.json', 'w') as json_file:
    json.dump(episodes_seasons_data, json_file)

In [19]:
def get_synopsis_text(episode_title):
    formatted_title = quote(episode_title.replace(" ", "_"))
    url = f"https://naruto.fandom.com/wiki/{formatted_title}"

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the element with the "Synopsis" heading
    synopsis_heading = soup.find('span', {'id': 'Synopsis'})

    if synopsis_heading:
        # Find the parent <h2> element
        h2_element = synopsis_heading.find_parent('h2')

        # Find all <p> elements under the <h2> element
        p_elements = h2_element.find_all_next('p')

        # Extract the text of each <p> element
        synopsis_text = '\n'.join(p_element.get_text() for p_element in p_elements)

        return synopsis_text
    else:
        print(f"No 'Synopsis' heading found for episode: {episode_title}")
        return None
    
episodes_seasons_text = episodes_seasons_data
for season, data in episodes_seasons_text.items():
    print(f"\nSeason: {season}")
    
    if data:
        for episode_info in data:
            episode_title = episode_info.get('Episode Title', '')
            print(f"Episode Title: {episode_title}")
            synopsis_text = get_synopsis_text(episode_title)

            episode_info['episode synopsis text'] = synopsis_text
    else:
        print("No data found.")



Season: Naruto: Original
Episode Title: Enter:_Naruto_Uzumaki!
Episode Title: My_Name_is_Konohamaru!
Episode Title: Sasuke_and_Sakura:_Friends_or_Foes?
Episode Title: Pass_or_Fail:_Survival_Test
Episode Title: You_Failed!_Kakashi's_Final_Decision
Episode Title: A_Dangerous_Mission!_Journey_to_the_Land_of_Waves!
Episode Title: The_Assassin_of_the_Mist!
Episode Title: The_Oath_of_Pain
Episode Title: Kakashi:_Sharingan_Warrior!
Episode Title: The_Forest_of_Chakra
Episode Title: The_Land_Where_a_Hero_Once_Lived
Episode Title: Battle_on_the_Bridge!_Zabuza_Returns!
Episode Title: Haku's_Secret_Jutsu:_Demonic_Mirroring_Ice_Crystals
Episode Title: The_Number_One_Hyperactive,_Knucklehead_Ninja_Joins_the_Fight!


KeyboardInterrupt: 

In [15]:
with open('./data/episodes_seasons_with_text.json', 'w') as json_file:
    json.dump(episodes_seasons_text, json_file)

In [27]:
def get_episode_characters(episode_title):
    formatted_title = quote(episode_title.replace(" ", "_"))
    url = f"https://naruto.fandom.com/wiki/{formatted_title}"

    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        credits_heading = soup.find('span', {'id': 'Credits'})

        if credits_heading:
            # Find the table below the 'Credits' heading
            credits_table = credits_heading.find_next('table', class_='wikitable')

            if credits_table:
                # Extract names under the 'Role' column
                character_names = []
                for row in credits_table.find_all('tr')[1:]:  # Skip the header row
                    # Check if the 'td' tag exists in the row
                    td_tag = row.find('td')
                    if td_tag:
                        # Extract data from the first column (Role)
                        role = td_tag.get_text(strip=True)
                        character_names.append(role)

                return character_names

            else:
                print("Table not found below the 'Credits' heading.")
                return None

        else:
            print("Credits heading not found on the page.")
            return None

    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None


In [28]:
for season, data in episodes_seasons_data.items():
    print(f"\nSeason: {season}")
    
    if data:
        for episode_info in data:
            episode_title = episode_info.get('Episode Title', '')
            print(f"Episode Title: {episode_title}")
            episode_characters = get_episode_characters(episode_title)

            episode_info['Episode Characters'] = episode_characters
    else:
        print("No data found.")


Season: Naruto: Original
Episode Title: Enter:_Naruto_Uzumaki!
Episode Title: My_Name_is_Konohamaru!
Episode Title: Sasuke_and_Sakura:_Friends_or_Foes?
Episode Title: Pass_or_Fail:_Survival_Test
Episode Title: You_Failed!_Kakashi's_Final_Decision
Episode Title: A_Dangerous_Mission!_Journey_to_the_Land_of_Waves!
Episode Title: The_Assassin_of_the_Mist!
Episode Title: The_Oath_of_Pain
Episode Title: Kakashi:_Sharingan_Warrior!
Episode Title: The_Forest_of_Chakra
Episode Title: The_Land_Where_a_Hero_Once_Lived
Episode Title: Battle_on_the_Bridge!_Zabuza_Returns!
Episode Title: Haku's_Secret_Jutsu:_Demonic_Mirroring_Ice_Crystals
Episode Title: The_Number_One_Hyperactive,_Knucklehead_Ninja_Joins_the_Fight!
Episode Title: Zero_Visibility:_The_Sharingan_Shatters
Episode Title: The_Broken_Seal
Episode Title: White_Past:_Hidden_Ambition
Episode Title: The_Weapons_Known_as_Shinobi
Episode Title: The_Demon_in_the_Snow
Episode Title: A_New_Chapter_Begins:_The_Chūnin_Exam!
Episode Title: Identify_

In [2]:
def fetch_webpage_character_name(character):
    base_url = "https://naruto.fandom.com/wiki/"
    character_url = base_url + character.replace(" ", "_")

    try:
        # Fetch the webpage content
        response = requests.get(character_url)
        response.raise_for_status()

        # Parse HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract character name from the header
        header = soup.find('h1', class_='page-header__title')
        if header:
            character_name = header.find('span', class_='mw-page-title-main').text.strip()
            return character_name
        else:
            print(f"Header not found for {character}")
            return character

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {character}: {e}")
        return character
    
    # Iterate through seasons
#for season_name, episodes in episodes_seasons_data.items():
    # Iterate through episodes and characters for each season
    for episode in episodes:
        for i, character in enumerate(episode['Episode Characters']):
            # Fetch and replace character name
            new_name = fetch_webpage_character_name(character)
            episode['Episode Characters'][i] = new_name

    # Display updated episode data for each season
    print(f"Updated episode data for season '{season_name}':")
    print(episodes_seasons_data[season_name])
    print("\n")


In [6]:
# Iterate through seasons
for season_name, episodes in episodes_seasons_data.items():
    # Iterate through episodes and characters for each season
    for episode in episodes:
        for i, character in enumerate(episode['Episode Characters']):
            # Fetch and replace character name
            new_name = fetch_webpage_character_name(character)
            episode['Episode Characters'][i] = new_name

    # Display updated episode data for each season
    print(f"Updated episode data for season '{season_name}':")
    print(episodes_seasons_characters[season_name])
    print("\n")


Error fetching data for Third Hokage: Sarutobi: 404 Client Error: Not Found for url: https://naruto.fandom.com/wiki/Third_Hokage:_Sarutobi
Error fetching data for Tobio's father: 404 Client Error: Not Found for url: https://naruto.fandom.com/wiki/Tobio's_father
Error fetching data for Tobio's mother: 404 Client Error: Not Found for url: https://naruto.fandom.com/wiki/Tobio's_mother
Error fetching data for Iruka (boyhood): 404 Client Error: Not Found for url: https://naruto.fandom.com/wiki/Iruka_(boyhood)
Error fetching data for Third Hokage: Sarutobi: 404 Client Error: Not Found for url: https://naruto.fandom.com/wiki/Third_Hokage:_Sarutobi
Error fetching data for Woman: 404 Client Error: Not Found for url: https://naruto.fandom.com/wiki/Woman
Error fetching data for Gal Naruto: 404 Client Error: Not Found for url: https://naruto.fandom.com/wiki/Gal_Naruto
Error fetching data for Third Hokage: Sarutobi: 404 Client Error: Not Found for url: https://naruto.fandom.com/wiki/Third_Hokage:_S

KeyboardInterrupt: 

In [None]:
import json
json_file_path = '../data/episodes_seasons_characters.json'

# Open the JSON file and load its contents into a dictionary
with open(json_file_path, 'r') as json_file:
    episodes_seasons_data = json.load(json_file)

In [9]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup

async def fetch_webpage_character_name(session, character):
    base_url = "https://naruto.fandom.com/wiki/"
    character_url = base_url + character.replace(" ", "_")

    try:
        async with session.get(character_url) as response:
            response.raise_for_status()

            # Parse HTML content
            html_content = await response.text()
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract character name from the header
            header = soup.find('h1', class_='page-header__title')
            if header:
                character_name = header.find('span', class_='mw-page-title-main').text.strip()
                return character_name
            else:
                print(f"Header not found for {character}")
                return character

    except aiohttp.ClientError as e:
        print(f"Error fetching data for {character}: {e}")
        return character

async def process_season(session, season_name, episodes):
    # Iterate through episodes and characters for each season
    for episode in episodes:
        characters = episode.get('Episode Characters')
        if characters is not None:
            for i, character in enumerate(characters):
                # Fetch and replace character name asynchronously
                new_name = await fetch_webpage_character_name(session, character)
                episode['Episode Characters'][i] = new_name

    # Display updated episode data for each season
    print(f"Updated episode data for season '{season_name}':")
    print(episodes)
    print("\n")

async def main():
    async with aiohttp.ClientSession() as session:
        # Iterate through seasons
        for season_name, episodes in episodes_seasons_data.items():
            # Process each season asynchronously
            await process_season(session, season_name, episodes)

# Run the asyncio event loop
await main()


Error fetching data for Third Hokage: Sarutobi: 404, message='Not Found', url=URL('https://naruto.fandom.com/wiki/Third_Hokage:_Sarutobi')
Error fetching data for Tobio's father: 404, message='Not Found', url=URL('https://naruto.fandom.com/wiki/Tobio's_father')
Error fetching data for Tobio's mother: 404, message='Not Found', url=URL('https://naruto.fandom.com/wiki/Tobio's_mother')
Error fetching data for Iruka (boyhood): 404, message='Not Found', url=URL('https://naruto.fandom.com/wiki/Iruka_(boyhood)')
Error fetching data for Third Hokage: Sarutobi: 404, message='Not Found', url=URL('https://naruto.fandom.com/wiki/Third_Hokage:_Sarutobi')
Error fetching data for Woman: 404, message='Not Found', url=URL('https://naruto.fandom.com/wiki/Woman')
Error fetching data for Gal Naruto: 404, message='Not Found', url=URL('https://naruto.fandom.com/wiki/Gal_Naruto')
Error fetching data for Third Hokage: Sarutobi: 404, message='Not Found', url=URL('https://naruto.fandom.com/wiki/Third_Hokage:_Sar

CancelledError: 

In [11]:
import aiohttp
from bs4 import BeautifulSoup
from aiohttp import ClientResponseError
from asyncio import Semaphore

async def fetch_characters_for_episode(episode_data, session, semaphore):
    episode_characters = []  # Store characters for each episode
    episode_title = episode_data['Episode Title']
    formatted_title = episode_title.replace(" ", "_")
    url_episode = f'https://naruto.fandom.com/wiki/{formatted_title}'

    try:
        async with semaphore, session.get(url_episode) as response:
            if response.status == 200:
                html_content = await response.text()
                soup = BeautifulSoup(html_content, 'html.parser')

                # Find the 'Credits' heading
                credits_heading = soup.find('span', {'id': 'Credits'})

                # Extract characters from the table under the 'Credits' heading
                characters = extract_characters_from_table(credits_heading)

                episode_characters.extend(characters)

    except ClientResponseError as e:
        print(f"ClientResponseError exception for {url_episode}: {e}")

    return episode_characters

def extract_characters_from_table(credits_heading):
    characters = []
    if credits_heading:
        # Find the table below the 'Credits' heading
        credits_table = credits_heading.find_next('table', class_='wikitable')

        if credits_table:
            # Extract names from the hyperlink in the 'Role' column
            for row in credits_table.find_all('tr')[1:]:  # Skip the header row
                # Check if the 'td' tag exists in the row
                td_tag = row.find('td')
                if td_tag:
                    # Extract data from the first column (Role)
                    character_link = td_tag.find('a')
                    if character_link:
                        character_name = character_link.get_text(strip=True)
                        title_name = character_name.get('title', '')
                        characters.append(title_name)

    return characters

async def process_episodes_data(episodes_data, session, semaphore):
    for episode in episodes_data:
        print(episode)
        episode_characters = await fetch_characters_for_episode(episode, session, semaphore)
        episode['Characters'] = episode_characters
        print(episode)

async def main():
    async with aiohttp.ClientSession() as session:
        semaphore = Semaphore(5)  # Adjust the semaphore limit as needed
        await process_episodes_data(episodes_seasons_data['Naruto: Original'], session, semaphore)

await main()


Processing Season: Naruto: Original
{'Episode Number': '1', 'Episode Title': 'Enter:_Naruto_Uzumaki!', 'Japanese Airdate': '3 October 2002', 'Characters': ['Naruto Uzumaki', 'Sasuke Uchiha', 'Sakura Haruno', 'Hiruzen Sarutobi', 'Iruka Umino', 'Shikamaru Nara', 'Ino Yamanaka', 'Hinata Hyūga', 'Mizuki', 'Bekkō', 'Iwana Akame', 'Yajirobee', 'Ibara', 'Tsubaki (parent)', 'Iruka Umino']}


  for key, value in attrs:


{'Episode Number': '1', 'Episode Title': 'Enter:_Naruto_Uzumaki!', 'Japanese Airdate': '3 October 2002', 'Characters': ['Naruto Uzumaki', 'Sasuke Uchiha', 'Sakura Haruno', 'Hiruzen Sarutobi', 'Iruka Umino', 'Shikamaru Nara', 'Ino Yamanaka', 'Hinata Hyūga', 'Mizuki', 'Bekkō', 'Iwana Akame', 'Yajirobee', 'Ibara', 'Tsubaki (parent)', 'Iruka Umino']}
{'Episode Number': '2', 'Episode Title': 'My_Name_is_Konohamaru!', 'Japanese Airdate': '10 October 2002', 'Characters': ['Naruto Uzumaki', 'Konohamaru Sarutobi', 'Hiruzen Sarutobi', 'Iruka Umino', 'Ebisu', 'Genzō', 'Sexy Technique']}
{'Episode Number': '2', 'Episode Title': 'My_Name_is_Konohamaru!', 'Japanese Airdate': '10 October 2002', 'Characters': ['Naruto Uzumaki', 'Konohamaru Sarutobi', 'Hiruzen Sarutobi', 'Iruka Umino', 'Ebisu', 'Genzō', 'Sexy Technique']}
{'Episode Number': '3', 'Episode Title': 'Sasuke_and_Sakura:_Friends_or_Foes?', 'Japanese Airdate': '17 October 2002', 'Characters': ['Naruto Uzumaki', 'Sasuke Uchiha', 'Sakura Haruno