In [56]:
# Import libraries
import pandas as pd
import numpy as np
import re
import time
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

## Scraping Characters

In [57]:
# Launch a Chrome session; `service` and `options` stay in scope for later cells
options = webdriver.ChromeOptions()
service = Service()
driver = webdriver.Chrome(options=options, service=service)

# Open the character listing
url = 'https://www.listchallenges.com/all-harry-potter-characters'
driver.get(url)

# Collect the href of every link found directly inside the pager element
page_num_buttons = driver.find_elements(
    By.XPATH, '//*[@id="MainContent_MainContent_pager"]/a'
)
pages = [button.get_attribute('href') for button in page_num_buttons]


In [58]:
# Visit every pager URL and collect the text of each 'item-name' element.
# The browser is closed in `finally`, so a failed page load or element lookup
# does not leave a Chrome process running.
characters = []
try:
    for page_url in pages:
        driver.get(page_url)

        cha_elem = driver.find_elements(By.CLASS_NAME, 'item-name')
        characters.extend(cha.text for cha in cha_elem)
finally:
    driver.quit()

In [59]:
# Put the scraped name strings into a one-column DataFrame
characters_df = pd.DataFrame(data=characters, columns=['names'])

In [60]:
# Eliminate the extra information some entries carry after a '-', then trim
# any whitespace left around the name so identical names compare equal when
# duplicates are dropped later.
# NOTE(review): cutting at the first '-' also truncates hyphenated names —
# confirm against the scraped data whether a narrower separator is needed.
characters_df['names'] = characters_df['names'].str.split('-').str[0].str.strip()

# Extract first names (the text before the first space)
characters_df['first_name'] = characters_df['names'].str.split(' ').str[0]

In [61]:
# Extract last names: the second space-separated token, or '' when the name
# has only one token. An explicit length check replaces the bare `except:`,
# which would also have hidden unrelated errors.
# NOTE(review): for names with three or more words this picks the middle
# word, not the final one — confirm that is intended.
last_names = []
for name in characters_df.names:
    tokens = name.split(' ')
    last_names.append(tokens[1] if len(tokens) > 1 else '')

# Create new column for last names
characters_df['last_name'] = last_names

In [62]:
# Preview the first five rows of the parsed name table
characters_df.head(n=5)

Unnamed: 0,names,first_name,last_name
0,Harry Potter,Harry,Potter
1,Ron Weasley,Ron,Weasley
2,Hermione Granger,Hermione,Granger
3,Rubeus Hagrid,Rubeus,Hagrid
4,Albus Dumbledore,Albus,Dumbledore


In [63]:
## Scraping from another source with more names
# Start a new Chrome session with the existing service and options objects
driver = webdriver.Chrome(options=options, service=service)

# Load the full character listing
url = 'http://magical-menagerie.com/wizardry/full-character-listing/'
driver.get(url)

# Locate the 'postcontent' container, then every <p> element inside it
elements = driver.find_element(by=By.CLASS_NAME, value='postcontent')
sub_elem = elements.find_elements(by=By.TAG_NAME, value='p')

In [64]:
# Pull the names out of the <strong> tags of each paragraph.
# The browser is closed in `finally` so an error while reading elements does
# not leave a Chrome process running. The former `time.sleep(3)` before
# `quit()` was dropped: every value is already read by then.
try:
    # Extract the tags which hold names
    tags = [element.find_elements(By.TAG_NAME, 'strong') for element in sub_elem]

    # Eliminate paragraphs that contained no <strong> tag
    tags_upd = [found for found in tags if found]

    # Extract names, skipping the first and last non-empty groups
    # NOTE(review): presumably those two groups are not character entries —
    # confirm against the page.
    names = []
    for group in tags_upd[1:-1]:
        for strong in group:
            text = strong.text  # read once instead of querying the element twice
            if text != '':
                names.append({'names': text})
finally:
    driver.quit()

In [65]:
# DataFrame for names. The column is pinned explicitly so an empty scrape
# still produces a 'names' column instead of a column-less frame that makes
# the next cell fail with a KeyError.
character_names = pd.DataFrame(names, columns=['names'])

In [66]:
# Delete parentheses and whatever is inside them.
# The pattern is a raw string: '\(' and '\)' in a plain literal are invalid
# escape sequences and trigger a warning on current Python versions.
character_names['names'] = character_names['names'].str.replace(r'\(.*?\)', '', regex=True)

# Make new column for last names: the text before the first ', '
character_names['last_name'] = character_names['names'].apply(lambda entry: entry.split(', ', 1)[0])

In [67]:
# Extract first names: everything after the first ', ', or '' when the entry
# has no ', '. `str.partition` returns an empty tail in that case, so no
# bare `except:` is needed to cover the missing-separator case.
first_names = [name.partition(', ')[2] for name in character_names['names']]

# Create new column for first names
character_names['first_name'] = first_names

In [68]:
# Clean the first_name and last_name columns: remove every '-', '–' and '?'
# and trim surrounding spaces.
# The character class is passed with regex=True explicitly, so the result
# does not depend on the pandas default for `regex` ('?' is a regex
# metacharacter when used as a bare pattern).
for column in ('first_name', 'last_name'):
    character_names[column] = (
        character_names[column]
        .str.replace(r'[-–?]', '', regex=True)
        .str.strip(' ')
    )

In [69]:
# Update names column as "<first> <last>".
# The result is stripped because an empty first or last name would otherwise
# leave a leading/trailing space (e.g. ' Agnes'), which then fails to match
# the same name from the other source when duplicates are dropped.
character_names['names'] = (
    character_names.first_name + ' ' + character_names.last_name
).str.strip()

In [70]:
# Preview the first five rows of the second source's name table
character_names.head(n=5)

Unnamed: 0,names,last_name,first_name
0,Euan Abercrombie,Abercrombie,Euan
1,Steward Ackerley,Ackerley,Steward
2,Falcon Aesalon,Aesalon,Falcon
3,Agnes,Agnes,
4,Cornelius Agrippa,Agrippa,Cornelius


In [88]:
# Stack both name tables row-wise into one frame with a fresh 0..n-1 index
combined_df = pd.concat(
    objs=[character_names, characters_df],
    ignore_index=True,
    axis=0,
)

In [89]:
# Keep only the first row for each distinct value in the 'names' column
combined_df = combined_df.drop_duplicates(subset=["names"], keep="first")

In [90]:
# Write the combined table to disk without the index column
combined_df.to_csv(path_or_buf='characters.csv', index=False)