In [6]:
import asyncio
import json
import os
import re
from urllib.parse import quote

import aiohttp
import networkx as nx
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [1]:
import requests
from bs4 import BeautifulSoup

def _parse_episode_row(row):
    """Turn one table row into an episode dict, or None when the row holds no episode."""
    number_cell = row.find('th')
    columns = row.find_all('td')

    # Rows without a header cell or without any data cell cannot describe an episode;
    # indexing into them would raise, so they are skipped instead.
    if number_cell is None or not columns:
        return None

    # The title is taken from the link inside the first data cell, when there is one.
    title_link = columns[0].find('a')
    if title_link is not None:
        episode_title = title_link.get_text(strip=True).replace(' ', '_')
    else:
        episode_title = None

    japanese_airdate = columns[1].get_text(strip=True) if len(columns) > 1 else None

    return {
        'Episode Number': number_cell.get_text(strip=True),
        'Episode Title': episode_title,
        'Japanese Airdate': japanese_airdate,
    }


def scrape_episode_data(url, title_text, timeout=30):
    """Scrape the episode table that follows a given heading on a page.

    The page is searched for a ``<span class="mw-headline">`` whose text equals
    ``title_text``. The first table with the expected CSS classes after that
    heading is read row by row (the first row is treated as the header).

    Args:
        url: Address of the page to download.
        title_text: Exact text of the heading that precedes the wanted table.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``'Episode Number'``, ``'Episode Title'``
        (link text with spaces replaced by underscores, or ``None`` when the
        first cell has no link) and ``'Japanese Airdate'`` (``None`` when the
        row has only one data cell). Rows lacking a ``<th>`` or any ``<td>``
        are skipped. ``None`` is returned, after printing a message, when the
        download fails, the heading is missing, or no table follows it.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to retrieve data. Error: {error}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the span tag with the specified text and class
    title_span = soup.find('span', class_='mw-headline', string=title_text)
    if title_span is None:
        print(f"Title with text '{title_text}' not found on the page.")
        return None

    # Prefer the enclosing h3 as the search origin; when the span is not inside an h3,
    # search forward from the span itself rather than failing on a missing parent.
    title_tag = title_span.find_parent('h3')
    if title_tag is None:
        title_tag = title_span

    table = title_tag.find_next(
        'table',
        class_='box table coloured bordered innerbordered style-basic fill-horiz',
    )
    if table is None:
        print(f"Table not found after the title with text '{title_text}'.")
        return None

    episode_data = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        episode = _parse_episode_row(row)
        if episode is not None:
            episode_data.append(episode)

    return episode_data


In [4]:
# Scrape the episode table of every series listed on the animated-media page.
url = "https://naruto.fandom.com/wiki/List_of_Animated_Media"
season_titles = ['Naruto: Original', 'Naruto: Shippūden', 'Boruto: Naruto Next Generations']

# Maps each season heading to its list of episode dicts; seasons whose scrape
# returned None are left out.
episodes_seasons_data = {}

for season in season_titles:
    data = scrape_episode_data(url, season)
    if data is None:
        continue
    episodes_seasons_data[season] = data


In [5]:
# Dump every scraped episode, grouped by season, to eyeball the result
for season, data in episodes_seasons_data.items():
    print(f"\nSeason: {season}")

    # An empty episode list gets a placeholder line instead of silence
    if not data:
        print("No data found.")
        continue

    for episode in data:
        print(episode)


Season: Naruto: Original
{'Episode Number': '1', 'Episode Title': 'Enter:_Naruto_Uzumaki!', 'Japanese Airdate': '3 October 2002'}
{'Episode Number': '2', 'Episode Title': 'My_Name_is_Konohamaru!', 'Japanese Airdate': '10 October 2002'}
{'Episode Number': '3', 'Episode Title': 'Sasuke_and_Sakura:_Friends_or_Foes?', 'Japanese Airdate': '17 October 2002'}
{'Episode Number': '4', 'Episode Title': 'Pass_or_Fail:_Survival_Test', 'Japanese Airdate': '24 October 2002'}
{'Episode Number': '5', 'Episode Title': "You_Failed!_Kakashi's_Final_Decision", 'Japanese Airdate': '31 October 2002'}
{'Episode Number': '6', 'Episode Title': 'A_Dangerous_Mission!_Journey_to_the_Land_of_Waves!', 'Japanese Airdate': '7 November 2002'}
{'Episode Number': '7', 'Episode Title': 'The_Assassin_of_the_Mist!', 'Japanese Airdate': '14 November 2002'}
{'Episode Number': '8', 'Episode Title': 'The_Oath_of_Pain', 'Japanese Airdate': '21 November 2002'}
{'Episode Number': '9', 'Episode Title': 'Kakashi:_Sharingan_Warrior

In [12]:
def get_synopsis_text(episode_title, timeout=30):
    """Download an episode's wiki page and return the text of its "Synopsis" section.

    Args:
        episode_title: Page title of the episode; spaces are converted to
            underscores and the result is percent-encoded for the URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the ``<p>`` elements that follow the "Synopsis" heading, up
        to (not including) the next ``<h2>``, joined with newlines. ``None`` is
        returned, after printing a message, when no title is given, the page
        cannot be retrieved, or the page has no element with id ``Synopsis``.
    """
    # The episode scraper stores None for rows without a linked title; there is
    # no page to look up in that case.
    if not episode_title:
        print("No episode title given; skipping synopsis lookup.")
        return None

    formatted_title = quote(episode_title.replace(" ", "_"))
    url = f"https://naruto.fandom.com/wiki/{formatted_title}"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to retrieve page for episode: {episode_title} ({error})")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page for episode: {episode_title} (status code {response.status_code})")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the element with the "Synopsis" heading
    synopsis_heading = soup.find('span', {'id': 'Synopsis'})
    if synopsis_heading is None:
        print(f"No 'Synopsis' heading found for episode: {episode_title}")
        return None

    # Start from the enclosing <h2> when there is one, otherwise from the span itself
    section_start = synopsis_heading.find_parent('h2')
    if section_start is None:
        section_start = synopsis_heading

    # Walk forward in document order and stop at the next <h2>, so paragraphs that
    # belong to later sections are not mixed into the synopsis.
    paragraphs = []
    for element in section_start.find_all_next(['p', 'h2']):
        if element.name == 'h2':
            break
        paragraphs.append(element.get_text())

    return '\n'.join(paragraphs)
    

# Attach the synopsis text to every scraped episode (one page request per episode).
for season, data in episodes_seasons_data.items():
    print(f"\nSeason: {season}")

    if not data:
        print("No data found.")
        continue

    for episode_info in data:
        episode_title = episode_info.get('Episode Title', '')
        print(f"Episode Title: {episode_title}")

        # The scraper stores None when a row has no linked title; there is no
        # page to fetch for such rows, so the synopsis is recorded as None.
        if episode_title:
            synopsis_text = get_synopsis_text(episode_title)
        else:
            synopsis_text = None

        episode_info['episode synopsis text'] = synopsis_text



Season: Naruto: Original
Episode Title: Enter:_Naruto_Uzumaki!
Episode Title: My_Name_is_Konohamaru!
Episode Title: Sasuke_and_Sakura:_Friends_or_Foes?
Episode Title: Pass_or_Fail:_Survival_Test
Episode Title: You_Failed!_Kakashi's_Final_Decision
Episode Title: A_Dangerous_Mission!_Journey_to_the_Land_of_Waves!
Episode Title: The_Assassin_of_the_Mist!
Episode Title: The_Oath_of_Pain
Episode Title: Kakashi:_Sharingan_Warrior!
Episode Title: The_Forest_of_Chakra
Episode Title: The_Land_Where_a_Hero_Once_Lived
Episode Title: Battle_on_the_Bridge!_Zabuza_Returns!
Episode Title: Haku's_Secret_Jutsu:_Demonic_Mirroring_Ice_Crystals
Episode Title: The_Number_One_Hyperactive,_Knucklehead_Ninja_Joins_the_Fight!
Episode Title: Zero_Visibility:_The_Sharingan_Shatters
Episode Title: The_Broken_Seal
Episode Title: White_Past:_Hidden_Ambition
Episode Title: The_Weapons_Known_as_Shinobi
Episode Title: The_Demon_in_the_Snow
Episode Title: A_New_Chapter_Begins:_The_Chūnin_Exam!
Episode Title: Identify_

In [15]:
# Persist the scraped episodes (with synopses) as JSON.
output_path = './data/episodes_seasons_descriptions.json'

# Create the target directory first so the write does not fail on a fresh checkout
# after all the scraping has already been done.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# json.dump escapes non-ASCII characters by default; the encoding is still stated
# explicitly so the file does not depend on the platform default.
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(episodes_seasons_data, json_file)