In [1]:
# The purpose of this notebook is to webscrape wikipedia for data on Batman villains to use in our API.
# We'll start by importing pandas and assigning our url variable.
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_Batman_family_enemies'

In [2]:
# Let's use the read_html method to see what we're working with.
tables = pd.read_html(url)
print(tables)
# Looks like this is the data from every table on the website. Let's narrow our focus to just the first table.


In [5]:
# We'll make our dataframe from the first table.
# print(tables[0])
villains_df = tables[0]
villains_df


Unnamed: 0,Villain,Creator(s),First appearance,Fictional biography
0,Bane[5],Chuck Dixon[6][7]Doug Moench[7]Graham Nolan[6][7],Batman: Vengeance of Bane #1(January 1993)[8][9],The international masked criminal known as Ban...
1,Black Mask[11],Doug Moench[12]Tom Mandrake[12],Batman #386(August 1985),Roman Sionis[13] is a corrupt businessman and ...
2,Catwoman[15][16],Bob Kane[17]Bill Finger[17],Batman #1(Spring 1940),Selina Kyle is an accomplished jewel thief. Al...
3,Clayface[18],Bill FingerBob Kane[19],Detective Comics #40(June 1940),Actor Basil Karlo[20][21][22] went mad when he...
4,Deadshot[25],Bob Kane[26]David Vern Reed[26]Lew Schwartz[26],Batman #59(June 1950),Floyd Lawton is an excellent sniper assassin w...
5,Firefly,France HerronDick Sprang,Detective Comics #184(June 1952),Garfield Lynns is an orphan who became a pyrom...
6,Harley Quinn[16][27][28],Paul DiniBruce Timm,"Batman: The Animated Series episode ""Joker's F...",Dr. Harleen Quinzel was the Joker's psychiatri...
7,Hugo Strange[29],Bob Kane[30]Bill Finger[30],Detective Comics #36(February 1940),Hugo Strange is an insane psychologist who use...
8,Hush[32],Jeph Loeb[33]Jim Lee[33],Batman #609(January 2003),Dr. Thomas Elliot is a brilliant surgeon who t...
9,Joker[Note 1][15][16][34],Bob Kane[35]Bill Finger[35]Jerry Robinson[35],Batman #1(spring 1940),The Joker (real name unknown) is a homicidal m...


In [6]:
# Since we like this data, let's convert it to a dictionary to use in our API.
villains_records = villains_df.to_dict('records')
villains_records


[{'Villain': 'Bane[5]',
  'Creator(s)': 'Chuck Dixon[6][7]Doug Moench[7]Graham Nolan[6][7]',
  'First appearance': 'Batman: Vengeance of Bane #1(January 1993)[8][9]',
  'Fictional biography': "The international masked criminal known as Bane has immense strength derived from a super-steroid called Venom. Bane's raw power, coupled with his genius level intellect, makes him a considerable threat to Batman, having once succeeded in breaking Batman's back.[10]"},
 {'Villain': 'Black Mask[11]',
  'Creator(s)': 'Doug Moench[12]Tom Mandrake[12]',
  'First appearance': 'Batman #386(August 1985)',
  'Fictional biography': 'Roman Sionis[13] is a corrupt businessman and crime lord who has a fixation with masks. He wears a black skull-like mask that gives him limited mind control abilities.[14]'},
 {'Villain': 'Catwoman[15][16]',
  'Creator(s)': 'Bob Kane[17]Bill Finger[17]',
  'First appearance': 'Batman #1(Spring 1940)',
  'Fictional biography': 'Selina Kyle is an accomplished jewel thief. Althou

In [7]:
# Ultimately, we'll want a dictionary with the villain name as the keys with a value of 
# another dictionary of the remaining data.
# Ex: {villain1: {name: villain1, bio: villian1_bio, etc...}}
for i in range(len(villains_records)):
    print(villains_records[i]['Villain'])
# It looks like we'll need to tidy up this data a bit, as it still includes the references (ex: [5]) from wikipedia.


Bane[5]
Black Mask[11]
Catwoman[15][16]
Clayface[18]
Deadshot[25]
Firefly
Harley Quinn[16][27][28]
Hugo Strange[29]
Hush[32]
Joker[Note 1][15][16][34]
Killer Croc[32]
Killer Moth[29]
Mad Hatter[32]
Man-Bat[43]
Mr. Freeze(originally known as Mr. Zero)[45][46]
Penguin[16][50]
Poison Ivy[15][54][16]
Ra's al Ghul[56][57]
Riddler[59][60]
Scarecrow[61][62]
Solomon Grundy[32]
Two-Face[63][64]
Ventriloquist /Scarface[67]
Victor Zsasz


In [8]:
# Removing all of the reference numbers from the data.
for i in range(len(villains_records)):
    for char in villains_records[i]['Villain']:
        if char in '0123456789[]':
            villains_records[i]['Villain'] = villains_records[i]['Villain'].replace(char,'')
            
for i in range(len(villains_records)):
    for char in villains_records[i]['Creator(s)']:
        if char in '0123456789[]':
            villains_records[i]['Creator(s)'] = villains_records[i]['Creator(s)'].replace(char,'')
            
for i in range(len(villains_records)):
    for char in villains_records[i]['First appearance']:
        if char in '0123456789[]':
            villains_records[i]['First appearance'] = villains_records[i]['First appearance'].replace(char,'')
            
for i in range(len(villains_records)):
    for char in villains_records[i]['Fictional biography']:
        if char in '0123456789[]':
            villains_records[i]['Fictional biography'] = villains_records[i]['Fictional biography'].replace(char,'')
            
print(villains_records)
    

[{'Villain': 'Bane', 'Creator(s)': 'Chuck DixonDoug MoenchGraham Nolan', 'First appearance': 'Batman: Vengeance of Bane #(January )', 'Fictional biography': "The international masked criminal known as Bane has immense strength derived from a super-steroid called Venom. Bane's raw power, coupled with his genius level intellect, makes him a considerable threat to Batman, having once succeeded in breaking Batman's back."}, {'Villain': 'Black Mask', 'Creator(s)': 'Doug MoenchTom Mandrake', 'First appearance': 'Batman #(August )', 'Fictional biography': 'Roman Sionis is a corrupt businessman and crime lord who has a fixation with masks. He wears a black skull-like mask that gives him limited mind control abilities.'}, {'Villain': 'Catwoman', 'Creator(s)': 'Bob KaneBill Finger', 'First appearance': 'Batman #(Spring )', 'Fictional biography': 'Selina Kyle is an accomplished jewel thief. Although traditionally considered a villain, she is often portrayed as an antihero and is occasionally roma

In [9]:
# Now we'll make a dictionary and use a loop to input all of our data in the correct format.
villains = {}
for i in range(len(villains_records)):
    villains[villains_records[i]['Villain']] = {'name': villains_records[i]['Villain'], 'first_appearance': villains_records[i]['First appearance'], 'bio': villains_records[i]['Fictional biography']}
    
print(villains)

# Perfect! I've copied this and pasted it into my server code.


{'Bane': {'name': 'Bane', 'first_appearance': 'Batman: Vengeance of Bane #(January )', 'bio': "The international masked criminal known as Bane has immense strength derived from a super-steroid called Venom. Bane's raw power, coupled with his genius level intellect, makes him a considerable threat to Batman, having once succeeded in breaking Batman's back."}, 'Black Mask': {'name': 'Black Mask', 'first_appearance': 'Batman #(August )', 'bio': 'Roman Sionis is a corrupt businessman and crime lord who has a fixation with masks. He wears a black skull-like mask that gives him limited mind control abilities.'}, 'Catwoman': {'name': 'Catwoman', 'first_appearance': 'Batman #(Spring )', 'bio': 'Selina Kyle is an accomplished jewel thief. Although traditionally considered a villain, she is often portrayed as an antihero and is occasionally romantically involved with Batman.'}, 'Clayface': {'name': 'Clayface', 'first_appearance': 'Detective Comics #(June )', 'bio': 'Actor Basil Karlo went mad wh