In [68]:
import re
from collections import namedtuple
from collections.abc import Iterable, Iterator
from os import path
from typing import Type

from pandas import DataFrame
from selenium import webdriver
from selenium.webdriver.remote.webelement import WebElement

In [69]:
# Firefox profile with the bundled uBlock Origin add-on installed.
profile = webdriver.FirefoxProfile()
profile.add_extension(path.join("webdriver", "ublock_origin-1.49.2.xpi"))

# Start Firefox through the geckodriver binary shipped next to the notebook.
# NOTE(review): "geckodriver.exe" is a Windows binary name, and both keyword arguments
# belong to the older Selenium API — confirm the pinned Selenium version before upgrading.
driver = webdriver.Firefox(
    firefox_profile=profile,
    executable_path=path.join("webdriver", "geckodriver.exe"),
)
driver.maximize_window()

In [70]:
# Category pages listing the characters of each book; only the book slug differs between URLs.
urls = [f"https://witcher.fandom.com/wiki/Category:{book_slug}_characters"
        for book_slug in ("Baptism_of_Fire",
                          "Blood_of_Elves",
                          "Something_Ends,_Something_Begins",
                          "Sword_of_Destiny",
                          "The_Lady_of_the_Lake",
                          "The_Last_Wish",
                          "The_Tower_of_the_Swallow",
                          "Time_of_Contempt")]
print("\n".join(urls))

https://witcher.fandom.com/wiki/Category:Baptism_of_Fire_characters
https://witcher.fandom.com/wiki/Category:Blood_of_Elves_characters
https://witcher.fandom.com/wiki/Category:Something_Ends,_Something_Begins_characters
https://witcher.fandom.com/wiki/Category:Sword_of_Destiny_characters
https://witcher.fandom.com/wiki/Category:The_Lady_of_the_Lake_characters
https://witcher.fandom.com/wiki/Category:The_Last_Wish_characters
https://witcher.fandom.com/wiki/Category:The_Tower_of_the_Swallow_characters
https://witcher.fandom.com/wiki/Category:Time_of_Contempt_characters


In [71]:
# Regex fragment for a roman numeral: optional thousands (up to four "M"), then hundreds,
# tens and units groups. Every part is optional, so the fragment also matches the empty
# string, and it carries no anchors of its own — callers add "$" (see create_char_dict).
roman_numeral = "M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"


def create_char_dict(book_urls: list[str]) -> Iterator[dict[str, str]]:
    """Lazily scrape the character listing of every book category page.

    This is a generator: nothing is fetched until the result is iterated. For each URL the
    module-level ``driver`` loads the page, the book title is read from the page header and
    one record is produced per ``category-page__member-link`` anchor found on that page.

    NOTE(review): only the members present on the loaded page are collected; if the wiki
    splits large categories over several pages, the later pages are not visited — confirm.

    :param book_urls: URLs of the category pages to visit, in order.
    :return: an iterator of dicts with the keys ``Book``, ``URL``, ``FullName`` and ``FirstName``.
    """
    # Raw-string patterns (the originals relied on invalid "\s" escapes in normal strings),
    # compiled once per call instead of once per character.
    last_word = re.compile(r"\s[^\s]+$")
    parenthetical = re.compile(r"\s\([^()]*\)")
    # Everything after the first space, unless what follows is only a roman numeral
    # (E.g.: "Geralt of Rivia" → "Geralt"; "Radovic I" → "Radovic I")
    after_first_name = re.compile(rf"\s(?!{roman_numeral}$).*")

    for book_url in book_urls:
        driver.get(book_url)
        member_links: list[WebElement] = driver.find_elements_by_class_name("category-page__member-link")
        # Retrieve the page header, then remove the text after the last space to get the book title
        book_title: str = last_word.sub("", driver.find_element_by_class_name("page-header__title").text)
        # Read every attribute before yielding, so no element is touched after the consumer
        # has had a chance to navigate the driver elsewhere.
        # Character names: a text enclosed by parentheses is removed.
        char_attrs: list[tuple[str, str]] = [
            (link.get_attribute("href"), parenthetical.sub("", link.get_attribute("title")))
            for link in member_links
        ]
        for char_url, char_full_name in char_attrs:
            char_first_name = after_first_name.sub("", char_full_name)
            yield dict(Book=book_title, URL=char_url, FullName=char_full_name, FirstName=char_first_name)


# create_char_dict is a generator: materialise it once so that char_apps stays a reusable
# list of records instead of an iterator that is already exhausted after the frame is built.
char_apps = list(create_char_dict(urls))
# One row per record; the dict keys (Book, URL, FullName, FirstName) become the columns.
char_apps_df: DataFrame = DataFrame(char_apps)

In [72]:
# Show the appearances table: one row per character link found on each book's category page.
char_apps_df

Unnamed: 0,Book,URL,FullName,FirstName
0,Baptism of Fire,https://witcher.fandom.com/wiki/Adalia,Adalia,Adalia
1,Baptism of Fire,https://witcher.fandom.com/wiki/Adela,Adela,Adela
2,Baptism of Fire,https://witcher.fandom.com/wiki/Aen_Saevherne,Aen Saevherne,Aen
3,Baptism of Fire,https://witcher.fandom.com/wiki/Aevenien,Aevenien,Aevenien
4,Baptism of Fire,https://witcher.fandom.com/wiki/Agla%C3%AFs,Aglaïs,Aglaïs
...,...,...,...,...
1161,Time of Contempt,https://witcher.fandom.com/wiki/Yanna_of_Murivel,Yanna of Murivel,Yanna
1162,Time of Contempt,https://witcher.fandom.com/wiki/Yarpen_Zigrin,Yarpen Zigrin,Yarpen
1163,Time of Contempt,https://witcher.fandom.com/wiki/Yennefer_of_Ve...,Yennefer of Vengerberg,Yennefer
1164,Time of Contempt,https://witcher.fandom.com/wiki/Yiolenta_Suarez,Yiolenta Suarez,Yiolenta


In [77]:
# Infobox labels whose values point at other characters; add_relationships stores them as
# a mapping keyed by the linked page's URL.
relationship_attrs: set[str] = {"Parent(s)", "Child(ren)", "Relative(s)", "Partner(s)"}
# Every infobox label kept by the scraper: the relationship labels plus the plain-text ones.
wanted_attrs: set[str] = relationship_attrs | {"Status", "Race", "Gender", "Nationality",
                                               "Profession", "Abilities"}
# The two attributes read from an anchor tag: its target and its title.
AttributePair: Type["AttributePair"] = namedtuple("AttributePair", ("href", "title"))


def add_relationships(key: str, attrs_dict: dict[str, str | dict[str, str]], item: WebElement):
    """Record the characters linked from one infobox item under ``attrs_dict[key]``.

    Each ``pi-data-value`` element inside ``item`` is scanned for anchor tags. Anchors that
    have both an ``href`` and a ``title`` are stored in a mapping
    ``{href: "<title> <annotation>"}``, where the annotation is the text of the ``<small>``
    tag at the same position; a link without an annotation is stored as just ``"<title>"``.

    The pairing of links and annotations is positional.
    NOTE(review): this assumes the ``<small>`` tags appear in the same order as the titled
    links and that none of them is skipped in the middle — confirm against the page markup.

    :param key: infobox label the relationships belong to (e.g. ``"Parent(s)"``).
    :param attrs_dict: dictionary being filled for the current character; mutated in place.
    :param item: the infobox item element to read.
    """
    data_source: list[WebElement] = item.find_elements_by_class_name("pi-data-value")
    for data_value in data_source:
        anchor_tags: list[AttributePair] = [AttributePair(tag.get_attribute("href"), tag.get_attribute("title"))
                                            for tag in data_value.find_elements_by_css_selector("a")]
        if not anchor_tags:
            # Nothing linked in this value; later values may still carry links, so keep going
            # (the original returned here and abandoned them).
            continue
        # Drop anchors lacking a title or a target *before* pairing, so that they cannot use up
        # an annotation slot and shift every pairing after them.
        links: list[AttributePair] = [pair for pair in anchor_tags if pair.href and pair.title]
        annotations: list[str] = [tag.text for tag in data_value.find_elements_by_tag_name("small")]
        # Pad with blanks so links without an annotation are kept instead of being cut off by zip.
        annotations.extend([""] * (len(links) - len(annotations)))
        # Merge into any mapping created by an earlier value rather than overwriting it.
        relationships: dict[str, str] = attrs_dict.setdefault(key, {})
        for link, annotation in zip(links, annotations):
            relationships[link.href] = f"{link.title} {annotation}" if annotation else link.title


def table_contents_to_dict(char_url: str) -> dict[str, str | list[str]]:
    attrs_dict: dict[str, str] = {}
    driver.get(char_url)
    driver.implicitly_wait(0.25)
    items: list[WebElement] = driver.find_elements_by_css_selector("div.pi-item.pi-data")
    for item in items:
        key: str = item.find_element_by_class_name("pi-data-label").text
        if not key:
            continue
        if key in relationship_attrs:
            add_relationships(key, attrs_dict, item)
        elif key in wanted_attrs:
            attrs_dict[key] = item.find_element_by_class_name("pi-data-value").text
    return attrs_dict


def webpages_to_dicts(*args: DataFrame) -> list[dict[str, str | dict[str, str]]]:
    for url, full_name, first_name in zip(args[0], args[1], args[2]):
        prepend: dict[str, str] = dict(URL=url, FullName=full_name, FirstName=first_name)
        append: dict[str, str | dict[str, str]] = table_contents_to_dict(url)
        attrs_dict: dict[str, str | dict[str, str]] = {**append, **prepend}
        yield attrs_dict


# A character is listed once per book it appears in; keep a single row per character page.
chars_df = char_apps_df.drop_duplicates(subset="URL")
# Visit every remaining page and rebuild chars_df from the scraped records
# (infobox attributes plus the URL / FullName / FirstName columns passed in here).
chars_df: DataFrame = DataFrame.from_records(
    webpages_to_dicts(*(chars_df[column] for column in ("URL", "FullName", "FirstName")))
)

In [78]:
# Ordered clean-up rules for the scraped "Race" text. Each rule is applied to the output of
# the previous one, so the order matters (specific "Elf (...)" forms before the generic
# parenthesis removal). Patterns are raw strings: the originals relied on invalid escapes.
_RACE_RULES: list[tuple[re.Pattern, str]] = [
    # Footnote markers such as "[1]", "[12]" or "[N 2]" (the original only handled one digit).
    (re.compile(r"\[N?\s?\d+\]"), ""),
    (re.compile(r"(?i)Elf \(Aen Seidhe.*\)"), "Aen Seidhe Elf"),
    (re.compile(r"(?i)Elf \(Aen Elle.*\)"), "Aen Elle Elf"),
    (re.compile(r"(?i)Elf \(Wood Elf.*\)"), "Wood Elf"),
    (re.compile(r"(?i)Human \(Witcher.*\)"), "Witcher Human"),
    (re.compile(r"(?i)Quarter-elf.*"), "Quarter-elf"),
    # Keep only the first alternative of "X or Y". The word boundary stops the rule from
    # firing on words that merely start with "or" (e.g. " originally ...").
    (re.compile(r"(?i)\sor\b.*"), ""),
    # Any remaining parenthesised remark.
    (re.compile(r"\s\([^()]*\)"), ""),
    # A missing value stringifies to "nan"; only a value that is exactly that becomes
    # "Unknown" (the original replaced the substring anywhere in the text).
    (re.compile(r"\A[Nn]a[Nn]\Z"), "Unknown"),
]


def _clean_race(raw_value: object) -> str:
    """Return ``raw_value`` as text with every rule of ``_RACE_RULES`` applied in order."""
    text = str(raw_value)
    for pattern, replacement in _RACE_RULES:
        text = pattern.sub(replacement, text)
    return text


# Single pass over the column instead of one pass per rule; cleaned values are not matched
# by any rule again, so re-running the cell leaves them unchanged.
chars_df["Race"] = chars_df["Race"].apply(_clean_race)

In [80]:
# Export both tables as ";"-separated UTF-8 files under the "csv" directory, writing the
# frame index as an "Id" column.
# NOTE(review): the "csv" directory is not created here — presumably it already exists; confirm.
char_apps_df.to_csv(
    path.join("csv", "character_appearances.csv"),
    sep=";",
    encoding="utf-8",
    mode="w+",
    index_label="Id",
)
chars_df.to_csv(
    path.join("csv", "characters.csv"),
    sep=";",
    encoding="utf-8",
    mode="w+",
    index_label="Id",
)