In [1]:
import asyncio
import json
import re
from pathlib import Path

import aiohttp
import networkx as nx
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
import requests
from bs4 import BeautifulSoup

def scrape_episode_data(url, title_text, timeout=30):
    """Scrape the episode table that follows a given section headline.

    The page at ``url`` is downloaded, the ``<span class="mw-headline">``
    whose text equals ``title_text`` is located, and the first matching
    episode table after that headline is parsed row by row.

    Args:
        url: Address of the page to download.
        title_text: Exact text of the section headline to look for.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A list of dicts with the keys ``'Episode Number'`` (text of the row's
        ``<th>`` cell) and ``'Episode Title'`` (text of the first link in the
        row's first ``<td>`` cell with spaces replaced by underscores, or
        ``None`` when that cell has no link). Returns ``None`` after printing
        a message when the HTTP status is not 200, the headline is missing,
        or no table is found after it.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts,
        ...) are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the span tag with the specified text and class
    title_span = soup.find('span', class_='mw-headline', string=title_text)
    if title_span is None:
        print(f"Title with text '{title_text}' not found on the page.")
        return None

    # Prefer the enclosing <h3> as the search anchor, but fall back to the
    # span itself when the headline is wrapped in a different heading level;
    # find_parent() returns None in that case and must not be dereferenced.
    title_tag = title_span.find_parent('h3') or title_span

    # Find the corresponding table by searching for the closest table tag after the title
    table = title_tag.find_next(
        'table',
        class_='box table coloured bordered innerbordered style-basic fill-horiz',
    )
    if table is None:
        print(f"Table not found after the title with text '{title_text}'.")
        return None

    episode_data = []

    for row in table.find_all('tr')[1:]:  # Skip the header row
        number_cell = row.find('th')
        columns = row.find_all('td')

        # Rows without a number cell or without any data cell cannot be
        # parsed as an episode; skip them instead of crashing the whole scrape.
        if number_cell is None or not columns:
            continue

        # The title is taken from the first link of the first data cell.
        episode_title_tag = columns[0].find('a')
        if episode_title_tag is not None:
            episode_title = episode_title_tag.get_text(strip=True).replace(' ', '_')
        else:
            episode_title = None

        episode_data.append({
            'Episode Number': number_cell.get_text(strip=True),
            'Episode Title': episode_title,
        })

    return episode_data


In [5]:
# Scrape the episode list of every series from the wiki overview page.
url = "https://naruto.fandom.com/wiki/List_of_Animated_Media"
season_titles = [
    'Naruto: Original',
    'Naruto: Shippūden',
    'Boruto: Naruto Next Generations',
]

# Maps each headline to its list of episode dicts; headlines for which the
# scraper returned None are left out.
episodes_seasons_data = {}

for season in season_titles:
    data = scrape_episode_data(url, season)
    if data is None:
        continue
    episodes_seasons_data[season] = data


In [6]:
# Debug dump: one header line per season followed by its episode dicts
for season, data in episodes_seasons_data.items():
    print(f"\nSeason: {season}")

    # An empty episode list gets a placeholder line instead of silence.
    if not data:
        print("No data found.")
        continue

    for episode in data:
        print(episode)


Season: Naruto: Original
{'Episode Number': '1', 'Episode Title': 'Enter:_Naruto_Uzumaki!'}
{'Episode Number': '2', 'Episode Title': 'My_Name_is_Konohamaru!'}
{'Episode Number': '3', 'Episode Title': 'Sasuke_and_Sakura:_Friends_or_Foes?'}
{'Episode Number': '4', 'Episode Title': 'Pass_or_Fail:_Survival_Test'}
{'Episode Number': '5', 'Episode Title': "You_Failed!_Kakashi's_Final_Decision"}
{'Episode Number': '6', 'Episode Title': 'A_Dangerous_Mission!_Journey_to_the_Land_of_Waves!'}
{'Episode Number': '7', 'Episode Title': 'The_Assassin_of_the_Mist!'}
{'Episode Number': '8', 'Episode Title': 'The_Oath_of_Pain'}
{'Episode Number': '9', 'Episode Title': 'Kakashi:_Sharingan_Warrior!'}
{'Episode Number': '10', 'Episode Title': 'The_Forest_of_Chakra'}
{'Episode Number': '11', 'Episode Title': 'The_Land_Where_a_Hero_Once_Lived'}
{'Episode Number': '12', 'Episode Title': 'Battle_on_the_Bridge!_Zabuza_Returns!'}
{'Episode Number': '13', 'Episode Title': "Haku's_Secret_Jutsu:_Demonic_Mirroring_

In [4]:
# Persist the scraped episode lists as JSON. The target folder is created
# first so the cell also works when ./data does not exist yet (otherwise
# opening the file for writing raises FileNotFoundError).
output_path = Path('./data/episodes_seasons_descriptions.json')
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open('w', encoding='utf-8') as json_file:
    json.dump(episodes_seasons_data, json_file)