## Importing dependencies

In [85]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
pd.set_option('display.max_rows',None)

In [56]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from selenium.webdriver.edge.service import Service as EdgeService

In [82]:
pd.reset_option('display.max_rows')

## Getting the elements of the page

In [2]:
url = 'https://riordan.fandom.com/wiki/Percy_Jackson_and_the_Olympians'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')

In [5]:
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Percy Jackson and the Olympians | Riordan Wiki | Fandom</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"65a01eced6b4dc120e27beab8d082c44","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Percy_Jackson_and_the_Olympians","wgTitle":"Percy Jackson and the Olympians","wgCurRevisionId":630359,"wgRevisionId":630359,"wgArticleId":2329,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Real world articles","Percy Jackson and the Olympians"],"wgPageContentLa

## Initialising a webdriver

In [61]:
driver = webdriver.Edge(service=EdgeService(EdgeChromiumDriverManager().install()))

## Getting a list of all the books

In [6]:
div_content = soup.find('div', class_="wds-is-not-scrollable wds-dropdown-level-nested__content")

In [7]:
books = []

if div_content:
    # Find all <a> tags within the div
    a_tags = div_content.find_all('a')

    # Extract book name and URL, then append to the 'books' list
    for a in a_tags:
        book_name = a.find('span').text.strip()
        book_url = a.get('href')
        books.append({'book_name': book_name, 'url': book_url})

In [8]:
books

[{'book_name': 'The Lightning Thief',
  'url': 'https://riordan.fandom.com/wiki/The_Lightning_Thief'},
 {'book_name': 'The Sea of Monsters',
  'url': 'https://riordan.fandom.com/wiki/The_Sea_of_Monsters'},
 {'book_name': "The Titan's Curse",
  'url': 'https://riordan.fandom.com/wiki/The_Titan%27s_Curse'},
 {'book_name': 'The Battle of the Labyrinth',
  'url': 'https://riordan.fandom.com/wiki/The_Battle_of_the_Labyrinth'},
 {'book_name': 'The Last Olympian',
  'url': 'https://riordan.fandom.com/wiki/The_Last_Olympian'}]

## Getting a list of all the characters in the series

In [43]:
url = 'https://riordan.fandom.com/wiki/Category:Characters'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')

In [44]:
print(soup)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Category:Characters | Riordan Wiki | Fandom</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"284b244ecacac12a35fafde7bb98acec","wgCSPNonce":false,"wgCanonicalNamespace":"Category","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":14,"wgPageName":"Category:Characters","wgTitle":"Characters","wgCurRevisionId":612335,"wgRevisionId":612335,"wgArticleId":18657,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["The Heroes of Olympus characters","The Kane Chronicles characters","Percy Jackson and the Olympians characters"],"wgPageContentLanguage":

In [59]:
soup.find('a', class_="category-page__pagination-next wds-button wds-is-secondary")

<a class="category-page__pagination-next wds-button wds-is-secondary" href="https://riordan.fandom.com/wiki/Category:Characters?from=Brer+Fox">
<span>Next</span>
<svg class="wds-icon wds-icon-tiny"><use xlink:href="#wds-icons-menu-control-tiny"></use></svg> </a>

In [None]:
# creating a function that finds the url of the "Next" page and returns it
def get_next_page_url(soup):
    next_button = soup.find('a', class_="category-page__pagination-next wds-button wds-is-secondary")
    if next_button:
        return next_button.get('href')
    return None

In [62]:
url = 'https://riordan.fandom.com/wiki/Category:Characters'
driver.get(url)
time.sleep(10)

character_list = []

while True:
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # getting all the character names and creating a list of them
    character_elems = soup.find_all(class_='category-page__member-link')
    for elem in character_elems:
        character_list.append({'character': elem.text})

    # getting the url of the "Next" page
    next_page_url = get_next_page_url(soup)

    # Break the loop if the "Next" button is not found(On last page)
    if not next_page_url:
        break

    # Navigate to the next page using the obtained URL
    driver.get(next_page_url)
    time.sleep(5) 

In [64]:
# Creating a dataframe for the list of character
df = pd.DataFrame(character_list)

In [65]:
df.to_csv('characters.csv', index=False)

## Data cleaning

In [91]:
df = df[~df['character'].str.contains(r'^User:', na=False)]
df = df[~df['character'].str.contains(r'^Category:', na=False)]
df = df[~df['character'].str.contains(r'^Template:', na=False)]
df = df[~df['character'].str.endswith('/Disney+', na=False)]
df = df[~df['character'].str.endswith('/Film', na=False)]
df['character_firstname'] = df['character'].apply(lambda x: x.split(' ', 1)[0])

In [92]:
df = df.reset_index(drop=True)
df

Unnamed: 0,character,character_firstname
0,Aaron,Aaron
1,Abdel Fadlan,Abdel
2,Abelard,Abelard
3,Abuelo Santiago,Abuelo
4,Achelous,Achelous
5,Acheron (God),Acheron
6,Achilles,Achilles
7,Acrisius,Acrisius
8,Actaeon,Actaeon
9,Ada,Ada


In [93]:
df.to_csv('characters.csv', index=False)