In [1]:
# Import libraries
import pandas as pd
import numpy as np
import requests
import pickle
import re
from bs4 import BeautifulSoup
from sys import stdout

# Part 1: Marvel dataset

## 1.1 Parse the Marvel characters

**We go on the page where all the `Earth-616` Marvel characters are. We start with the first page and we make a list of all the URLs for the different characters.**

**Collect the first URLs:**

In [18]:
URL = 'https://marvel.fandom.com/wiki/Category:Earth-616_Characters'
r = requests.get(URL)
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

# Collect url of each character in the web page
publications_wrappers = soup.find_all('li', class_='category-page__member')
list_url = []

for p in publications_wrappers:
    for a in p.find_all('a', href=True):
        list_url.append(a['href'])
        
# Remove duplicate
my_set = set(list_url)
good_list_url = list(my_set)

**We then use these URLs to collect the needed characterstics for each Marvel character.**

**Parse the first page:**

In [19]:
# Initialization
new_columns = ['URL', 'Real Name', 'Current Alias', 'Relatives', 'Affiliation']

personnage_pd = pd.DataFrame(columns=new_columns)
idx = 0
dict_geant={}

for pers in good_list_url:
    # Get URL and use html parser
    URL_char = 'https://marvel.fandom.com' + pers
    URL_char = URL_char.replace("'","")
    r_char = requests.get(URL_char)
    page_body_char = r_char.text
    soup_char = BeautifulSoup(page_body_char, 'html.parser')
    
    # Initialize variables
    url, name, current_alias, relatives, affiliation = '','','','',''
    personnage =[]
    
    # Parsing
    url = pers
    side_tab = soup_char.find_all('div', class_='conjoined-infoboxes')
    for p in side_tab:
        for pp in p.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
            for ppp in pp.find_all('h3', class_='pi-data-label pi-secondary-font'):
                if(ppp.text[1:]=="Real Name"):
                    name = pp.find('div', class_='pi-data-value pi-font').text
                if(ppp.text[1:]=="Current Alias"):
                    current_alias = pp.find('div', class_='pi-data-value pi-font').text
                if(ppp.text[1:]=="Relatives"):
                    div = pp.find('div', class_='pi-data-value pi-font')
                    if div:
                        for a in div.find_all('a', href=True):
                            relatives = relatives + ', ' + a['href']
                if(ppp.text[1:]=="Affiliation"):
                    div = pp.find('div', class_='pi-data-value pi-font')
                    if div:
                        for a in div.find_all('a', href=True):
                            affiliation = affiliation + ', ' + a['href']

    characteristics_pd = pd.DataFrame([[url, name[1:], current_alias, relatives, affiliation]], columns = new_columns)
    personnage_pd = personnage_pd.append(characteristics_pd,ignore_index=True)
    
nextpage = soup.find('link', {"rel" : "next"})

**After parsing the first page, we can get the links for the next pages, parsing them one by one. After that, we use the collected links to access each character's webpage exactly as we did for the first page. We then follow by storing the characterstics we need for our analysis.**

**Parse the remaining pages:**

In [20]:
i = 0
tot_page = 27839/200

while(len(nextpage['href'])):
    i += 1
    printed= i/tot_page*100
    stdout.write("\r%f %%" % printed)
    stdout.flush()
    
    urlnext_page = nextpage['href']
    r = requests.get(urlnext_page)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')

    # Collect URL of each character in the web page
    publications_wrappers = soup.find_all('li', class_='category-page__member')
    list_url = []
    for p in publications_wrappers:
        for a in p.find_all('a', href=True):
            list_url.append(a['href'])
            
    # Remove duplicate
    my_set = set(list_url)
    good_list_url = list(my_set)
    
    idx = 0
    dict_geant = {}
    
    for pers in good_list_url:
        # Get URL and use html parser
        URL_char = 'https://marvel.fandom.com' + pers
        URL_char = URL_char.replace("'","")
        r_char = requests.get(URL_char)
        page_body_char = r_char.text
        soup_char = BeautifulSoup(page_body_char, 'html.parser')
        
        # Initialize variables
        url, name, current_alias, relatives, affiliation = '','','','',''
        personnage =[]
        
        # Parsing
        side_tab = soup_char.find_all('div', class_='conjoined-infoboxes')
        url = pers
        for p in side_tab:
            for pp in p.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
                for ppp in pp.find_all('h3', class_='pi-data-label pi-secondary-font'):
                    if(ppp.text[1:]=="Real Name"):
                        name = pp.find('div', class_='pi-data-value pi-font').text
                    if(ppp.text[1:]=="Current Alias"):
                        current_alias = pp.find('div', class_='pi-data-value pi-font').text
                    if(ppp.text[1:]=="Relatives"):
                        div = pp.find('div', class_='pi-data-value pi-font')
                        if div:
                            for a in div.find_all('a', href=True):
                                relatives = relatives + ', ' + a['href']
                    if(ppp.text[1:]=="Affiliation"):
                        div = pp.find('div', class_='pi-data-value pi-font')
                        if div:
                            for a in div.find_all('a', href=True):
                                affiliation = affiliation + ', ' + a['href']

        characteristics_pd = pd.DataFrame([[url, name[1:], current_alias, relatives, affiliation]], columns = new_columns)
        personnage_pd = personnage_pd.append(characteristics_pd,ignore_index=True)

    nextpage = soup.find('link', {"rel" : "next"})

100.578325 %

TypeError: 'NoneType' object is not subscriptable

**Since the parsing takes a relatively high amount of time, it's better to save it to pickle.**

**Save file to pickle:**

In [21]:
pickle.dump(personnage_pd, open('data/characters_marvel.txt','wb'))

**Open the saved file:**

In [23]:
with open('data/characters_marvel.txt', 'rb') as f:
    characters_marvel = pickle.load(f)

characters_marvel

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Aaron_Fox_(Earth-616),Aaron Fox,,", /wiki/Beth_Fox_(Earth-616)",
1,/wiki/Acrobat_(1940s)_(Earth-616),nknown,Acrobat,,
2,/wiki/Abigail_Mercury_(Clone)_(Earth-616),Abigail Mercury,,", /wiki/Abigail_Mercury_(Earth-616)",
3,/wiki/Ace_Maxwell_(Earth-616),Ace Maxwell,,,
4,/wiki/Abigail_Boylen_(Earth-616),"Abigail ""Abby"" Boylen",Cloud 9,", #cite_note-Avengers_The_Initiative_Vol_1_1-2",", /wiki/Champions_(Earth-616), /wiki/Undergrou..."
...,...,...,...,...,...
28034,/wiki/Zxaxz_(Earth-616),Zxaxz,,,
28035,/wiki/Zuwena_(Earth-616),Zuwena,,,", /wiki/Elephant%27s_Trunk_(Earth-616)"
28036,/wiki/Zurvan_(Earth-616),Zurvan,,", /wiki/Ahura_Mazda_(Earth-616), /wiki/Ahriman...",
28037,/wiki/Zygo_(Earth-616),Zygo,General Zygo,,


## 1.2 Parse the Marvel comics

**Similarly to the Marvel characters, we want to first collect the comics' URLs to get to a specific comic's page. We start by the first page.**

**Collect the first URLs:**

In [None]:
# Get URL and use html parser
URL = 'https://marvel.fandom.com/wiki/Category:Comics'
r = requests.get(URL)
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

# Collect URL of each character in the web page
publications_wrappers = soup.find_all('li', class_='category-page__member')

# Initialization
i = 0
list_url = []

for p in publications_wrappers:
    # Don't take into account the first links which are "Categories"
    if(i>=26):
        for a in p.find_all('a', href=True):
            list_url.append(a['href'])
    i +=1
    
list_url[:20]

# Remove duplicate
my_set = set(list_url)
good_list_url = list(my_set)

**After getting the URLs, we parse all the information we need from each page. Let's note that a comic can have more than one story that are separate. We will name them `subcomic`. They will be treated as different comics since the appearing characters and the writers are often different.**

**Parse the first page:**

In [None]:
comics_columns = ['URL', 'Good characters', 'Bad characters', 
                  'Neutral characters', 'Editor-in-chief', 'Editor-in-chief URL', 
                  'Writer', 'Writer URL', 'Publication date', 'Subcomic']

comics_pd = pd.DataFrame(columns=comics_columns)


# Parse the first page
for comics in good_list_url:
    # Get URL and use html parser
    URL_char = 'https://marvel.fandom.com' + comics
    URL_char = URL_char.replace("'","")
    r_char = requests.get(URL_char)
    page_body_char = r_char.text
    soup_char = BeautifulSoup(page_body_char, 'html.parser')
    
    # Initialization
    good, bad, neutral = '','',''
    editor, writer, publication = '','',''
    editorURL, writerURL, subcomic = '','',''
    
    URL2 = URL_char.replace('https://marvel.fandom.com','')

    # Boolean for the first case
    first = 1

    # Parse characters appearances
    appearances = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')

    for p in appearances:
        for pp in p.find_all('p'):
            # If a comic is split in sub-comics
            span = pp.find_previous('span', class_='mw-headline')
            
            if span and ("Appearing" in span.text):
                if (not first and subcomic!=span.text \
                and (good or bad or neutral)):
                    # Take only the title of the subcomic
                    subcomic = subcomic[14:-1]
                    appearances_pd = pd.DataFrame([[URL2, good, bad, neutral, 
                                        editor, editorURL,
                                        writer, writerURL,
                                        publication, subcomic]], columns = comics_columns)
                    comics_pd = comics_pd.append(appearances_pd)

                    good, bad, neutral = '','',''
                    editor, writer, publication = '','',''
                    editorURL, writerURL = '',''
                    subcomic = span.text

                else:
                    first = 0
                    subcomic = span.text

            if "Featured Characters:" in pp.text:
                ul = pp.find_next('ul')
                for a in ul.find_all('a', href=True):
                    good = good + ', ' + a['href']

            if "Supporting Characters:" in pp.text:
                ul = pp.find_next('ul')
                for a in ul.find_all('a', href=True):
                    good = good + ', ' + a['href']

            if "Antagonists:" in pp.text:
                ul = pp.find_next('ul')
                for a in ul.find_all('a', href=True):
                    bad = bad + ', ' + a['href']

            if "Other Characters:" in pp.text:
                ul = pp.find_next('ul')
                for a in ul.find_all('a', href=True):
                    neutral = neutral + ', ' + a['href']

    # Each subcomic is written in the format '"Appearing in *subcomic*"'
    # We keep only the subcomic name
    subcomic = subcomic[14:-1]
    appearances_pd = pd.DataFrame([[URL2, good, bad, neutral, 
                                    editor, editorURL,
                                    writer, writerURL,
                                    publication, subcomic]], columns = comics_columns)

    comics_pd = comics_pd.append(appearances_pd)

    subcomic = ''


    # Parse comic characteristics
    side_tab = soup_char.find_all('div', class_='infobox')

    for p in side_tab:
        # Publication date
        tab = p.find_next('table', style='width:100%; text-align: center;')
        if(tab):
            tr = tab.find_next('tr', style='font-size:12px;')
            td = tab.find_next('td', style='font-size:12px;')
        if (tr):
            publication = tr.find_next('td').text
        if (td):
            publication = td.text
        if (not tr and not td):
            publication = ''

        # Editors
        for pp in p.find_all('div', style='width:100px;float:left;text-align:left;'):
            if ('Editor-in-Chief' in pp.text):
                ppp = pp.find_next('div', style='width:190px;float:left;text-align:right;')
                for a in ppp.find_all('a'):
                    editor = editor + ', ' + a.text
                for a in ppp.find_all('a', href=True):
                    editorURL = editorURL + ', ' + a['href']
                    
        if(p.find_next('div', style='width:88%; text-align:left;')):
            sub = p.find_next('div', style='width:88%; text-align:left;')

        # Writers
        for pp in p.find_all('div', style='width:100px;float:left;text-align:left;'):
            if (pp.find_previous('div', style='width:88%; text-align:left;')):
                sub = pp.find_previous('div', style='width:88%; text-align:left;')
            if sub:
                adiv = sub.find_next('div', style='width:100px;float:left;text-align:left;')

            if (adiv and 'Writer' in adiv.text and 'Writer' in pp.text):
                if (sub):
                    subcomic = sub.text[1:-1]

                ppp = pp.find_next('div', style='width:190px;float:left;text-align:right;')
                for a in ppp.find_all('a'):
                    writer = writer + ', ' + a.text
                for a in ppp.find_all('a', href=True):
                    writerURL = writerURL + ', ' + a['href']

                comics_pd.loc[comics_pd.Subcomic == subcomic,'Writer'] = writer
                comics_pd.loc[comics_pd.Subcomic == subcomic,'Writer URL'] = writerURL

                if (sub):
                    sub = sub.find_next('div', style='width:88%; text-align:left;')
                    writer, writerURL = '',''

            elif (sub): 
                sub = sub.find_next('div', style='width:88%; text-align:left;')
                writer, writerURL = '',''    
    
    comics_pd.loc[comics_pd.URL == URL2,'Editor-in-chief'] = editor
    comics_pd.loc[comics_pd.URL == URL2,'Editor-in-chief URL'] = editorURL
    comics_pd.loc[comics_pd.URL == URL2,'Publication date'] = publication

**Again, we use the next link to parse the remaining pages. The schema is exactly the same as the previous one.**

**Parse the remaining pages:**

In [None]:
# Parse all the other pages
nextpage = soup.find('link', {"rel" : "next"})

i = 0
tot_page = 49759/200

while(len(nextpage['href'])):
    # Print a loading bar
    i += 1
    printed= i/tot_page*100
    stdout.write("\r%f %%" % printed)
    stdout.flush()
    urlnext_page = nextpage['href']
    r = requests.get(urlnext_page)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')

    # Collect url of each character in the web page
    publications_wrappers = soup.find_all('li', class_='category-page__member')
    list_url = []
    for p in publications_wrappers:
        for a in p.find_all('a', href=True):
            list_url.append(a['href'])
            
    # Remove duplicates
    my_set = set(list_url)
    good_list_url = list(my_set)
    
    for comics in good_list_url:
        # Get URL and use html parser
        URL_char = 'https://marvel.fandom.com' + comics
        URL_char = URL_char.replace("'","")
        r_char = requests.get(URL_char)
        page_body_char = r_char.text
        soup_char = BeautifulSoup(page_body_char, 'html.parser')

        # Initialization
        good, bad, neutral = '','',''
        editor, writer, publication = '','',''
        editorURL, writerURL, subcomic = '','',''

        URL2 = URL_char.replace('https://marvel.fandom.com','')

        # Boolean for the first case
        first = 1

        # Parse characters appearances
        appearances = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')

        for p in appearances:
            for pp in p.find_all('p'):
                # If a comic is split in sub-comics
                span = pp.find_previous('span', class_='mw-headline')
                
                if span and ("Appearing" in span.text):
                    if (not first and subcomic!=span.text \
                    and (good or bad or neutral)):
                        # Take only the title of the subcomic
                        subcomic = subcomic[14:-1]
                        appearances_pd = pd.DataFrame([[URL2, good, bad, neutral, 
                                            editor, editorURL,
                                            writer, writerURL,
                                            publication, subcomic]], columns = comics_columns)
                        comics_pd = comics_pd.append(appearances_pd)

                        good, bad, neutral = '','',''
                        editor, writer, publication = '','',''
                        editorURL, writerURL = '',''
                        subcomic = span.text

                    else:
                        first = 0
                        subcomic = span.text

                if "Featured Characters:" in pp.text:
                    ul = pp.find_next('ul')
                    for a in ul.find_all('a', href=True):
                        good = good + ', ' + a['href']

                if "Supporting Characters:" in pp.text:
                    ul = pp.find_next('ul')
                    for a in ul.find_all('a', href=True):
                        good = good + ', ' + a['href']

                if "Antagonists:" in pp.text:
                    ul = pp.find_next('ul')
                    for a in ul.find_all('a', href=True):
                        bad = bad + ', ' + a['href']

                if "Other Characters:" in pp.text:
                    ul = pp.find_next('ul')
                    for a in ul.find_all('a', href=True):
                        neutral = neutral + ', ' + a['href']

        # Each subcomic is written in the format '"Appearing in *subcomic*"'
        # We keep only the subcomic name
        subcomic = subcomic[14:-1]
        appearances_pd = pd.DataFrame([[URL2, good, bad, neutral, 
                                        editor, editorURL,
                                        writer, writerURL,
                                        publication, subcomic]], columns = comics_columns)

        comics_pd = comics_pd.append(appearances_pd)

        subcomic = ''


        # Parse comic characteristics
        side_tab = soup_char.find_all('div', class_='infobox')

        for p in side_tab:
            # Publication date
            if(p.find_next('div', style='width:88%; text-align:left;')):
                tab = p.find_next('table', style='width:100%; text-align: center;')
            if(tab):
                tr = tab.find_next('tr', style='font-size:12px;')
                td = tab.find_next('td', style='font-size:12px;')
            if (tr):
                publication = tr.find_next('td').text
            if (td):
                publication = td.text
            if (not tr and not td):
                publication = ''

            # Editors
            for pp in p.find_all('div', style='width:100px;float:left;text-align:left;'):
                if ('Editor-in-Chief' in pp.text):
                    ppp = pp.find_next('div', style='width:190px;float:left;text-align:right;')
                    for a in ppp.find_all('a'):
                        editor = editor + ', ' + a.text
                    for a in ppp.find_all('a', href=True):
                        editorURL = editorURL + ', ' + a['href']

            sub = p.find_next('div', style='width:88%; text-align:left;')

            # Writers
            for pp in p.find_all('div', style='width:100px;float:left;text-align:left;'):
                if (pp.find_previous('div', style='width:88%; text-align:left;')):
                    sub = pp.find_previous('div', style='width:88%; text-align:left;')
                if sub:
                    adiv = sub.find_next('div', style='width:100px;float:left;text-align:left;')

                if (adiv and 'Writer' in adiv.text and 'Writer' in pp.text):
                    if (sub):
                        subcomic = sub.text[1:-1]

                    ppp = pp.find_next('div', style='width:190px;float:left;text-align:right;')
                    for a in ppp.find_all('a'):
                        writer = writer + ', ' + a.text
                    for a in ppp.find_all('a', href=True):
                        writerURL = writerURL + ', ' + a['href']

                    comics_pd.loc[comics_pd.Subcomic == subcomic,'Writer'] = writer
                    comics_pd.loc[comics_pd.Subcomic == subcomic,'Writer URL'] = writerURL

                    if (sub):
                        sub = sub.find_next('div', style='width:88%; text-align:left;')
                        writer, writerURL = '',''

                elif (sub): 
                    sub = sub.find_next('div', style='width:88%; text-align:left;')
                    writer, writerURL = '',''    

        comics_pd.loc[comics_pd.URL == URL2,'Editor-in-chief'] = editor
        comics_pd.loc[comics_pd.URL == URL2,'Editor-in-chief URL'] = editorURL
        comics_pd.loc[comics_pd.URL == URL2,'Publication date'] = publication
    nextpage = soup.find('link', {"rel" : "next"})

**Save file to pickle:**

In [None]:
pickle.dump(comics_pd, open('data/comics_marvel.txt','wb'))

**Open the saved file:**

In [3]:
with open('data/comics_marvel.txt', 'rb') as f:
    comics_marvel = pickle.load(f)

comics_marvel

Unnamed: 0,URL,Good characters,Bad characters,Neutral characters,Editor-in-chief,Editor-in-chief URL,Writer,Writer URL,Publication date,Subcomic
0,/wiki/Marvel_Mystery_Comics_Vol_1_NN,,,,,,", Joe Caramagna",", /wiki/Joe_Caramagna","January, 1943",st stor
0,/wiki/Comedy_Comics_Vol_1_12,,,,", Stan Lee",", /wiki/Stan_Lee",,,"December, 1942",Morphy
0,/wiki/Marvel_Mystery_Comics_Vol_1_7,", /wiki/Human_Torch_(Android)_(Earth-616), /wi...",", /wiki/Roglo_(Earth-616), #cite_note-Only_App...",", /wiki/New_York_City_Police_Department_(Earth...",", Joe Simon",", /wiki/Joe_Simon",", Stan Lee, Larry Lieber",", /wiki/Stan_Lee, /wiki/Larry_Lieber","May, 1940",The Human Torch
0,/wiki/Marvel_Mystery_Comics_Vol_1_7,", /wiki/Thomas_Halloway_(Earth-616), /wiki/Bet...",", /wiki/Emma_Martin_(Earth-616)",", /wiki/Henry_Martin_(Earth-616)",", Joe Simon",", /wiki/Joe_Simon",", Paul Gustavson, Ray Gill",", /wiki/Paul_Gustavson, /wiki/Ray_Gill","May, 1940",The Angel: Master of Men
0,/wiki/Marvel_Mystery_Comics_Vol_1_7,", /wiki/Namor_McKenzie_(Earth-616), /wiki/Thak...",,", /wiki/Homo_mermanus, /wiki/New_York_City_Pol...",", Joe Simon",", /wiki/Joe_Simon",", William Blake Everett",", /wiki/William_Blake_Everett","May, 1940","Prince Namor, the Sub-Mariner"
...,...,...,...,...,...,...,...,...,...,...
0,/wiki/Spider-Man:_The_Complete_Clone_Saga_Epic...,", /wiki/Peter_Parker_(Earth-616), /wiki/Ben_Re...",", /wiki/Kaine_Parker_(Earth-616), /wiki/Samuel...",", /wiki/Guardian_(Spider-Clone)_(Earth-616), /...",", Joe Quesada",", /wiki/Joe_Quesada",", J.M. DeMatteis",", /wiki/J.M._DeMatteis",1979,Resurrection!
0,/wiki/Spider-Man:_The_Complete_Clone_Saga_Epic...,", /wiki/Peter_Parker_(Earth-616), /wiki/Ben_Re...",", /wiki/Miles_Warren_(Jackal_Clone_2)_(Earth-616)",", /wiki/Kaine_Parker_(Earth-616), /wiki/Charle...",", Joe Quesada",", /wiki/Joe_Quesada",", Howard Mackie",", /wiki/Howard_Mackie",1979,Truths & Deceptions
0,/wiki/Hellraiser_Vol_1_17,,,,,,", Clive Barker",", /wiki/Clive_Barker",1992,Resurrection
0,/wiki/Ultimate_Spider-Man_Infinite_Comic_Vol_2_10,", /wiki/Peter_Parker_(Earth-12041), /wiki/Pete...",", /wiki/Shazana_(Earth-12041)",", /wiki/William_Howard_Taft_(Earth-12041), /wi...",", Axel Alonso",", /wiki/Axel_Alonso",", John Barber",", /wiki/John_Barber",2016,Ham-ilton (Part 2)


# Part 2: DC dataset

**Let's note that the DC dataset is extremely similar to the Marvel dataset. Indeed, the characters have the same characteristics and we can thus use the same attributes. The difference is that the characters are in two different pages which are classified into good and bad characters (compared to the Marvel dataset where we took all the characters in `Earth-616`, not knowing if they were good or bad). Moreover, the comics are also very similarly written and we can apply a similar code as the one we did for Marvel. We just have to be careful when we use the different `find` functions. Indeed, even if the characteristics are the same, the html code might be different and we had to adapt the code we had.**

## 2.1 Parse the DC characters:

### 2.1.1 Good DC characters

**Collect the first URLs:**

In [5]:
URL = 'https://dc.fandom.com/wiki/Category:Good_Characters'
r = requests.get(URL)
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

# Collect URL of each character in the web page
publications_wrappers = soup.find_all('li', class_='category-page__member')

list_url = []
for p in publications_wrappers:
    for a in p.find_all('a', href=True):
        list_url.append(a['href'])
        
# Remove duplicate
my_set = set(list_url)
good_list_url = list(my_set)

**Parse the first pages:**

In [6]:
new_columns = ['URL', 'Real Name', 'Current Alias', 'Relatives', 'Affiliation']

personnage_pd = pd.DataFrame(columns=new_columns)
idx = 0
dict_geant = {}
for pers in good_list_url:
    # Get URL and use html parser
    URL_char = 'https://dc.fandom.com/' + pers
    URL_char = URL_char.replace("'","")
    r_char = requests.get(URL_char)
    page_body_char = r_char.text
    soup_char = BeautifulSoup(page_body_char, 'html.parser')
    
    # Initialize variables
    url, name, current_alias, relatives, affiliation = '','','','',''
    personnage =[]
    
    # Parsing
    url = pers
    side_tab = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')
    for p in side_tab:
        for pp in p.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
            for ppp in pp.find_all('h3', class_='pi-data-label pi-secondary-font'):
                if(ppp.text[0:]=="Real Name"):
                    name = pp.find('div', class_='pi-data-value pi-font').text
                if(ppp.text[0:]=="Current Alias"):
                    current_alias = pp.find('div', class_='pi-data-value pi-font').text
                if(ppp.text[0:]=="Relatives"):
                    div = pp.find('div', class_='pi-data-value pi-font')
                    if div:
                        for a in div.find_all('a', href=True):
                            relatives = relatives + ', ' + a['href']
                if(ppp.text[0:]=="Affiliation"):
                    div = pp.find('div', class_='pi-data-value pi-font')
                    if div:
                        for a in div.find_all('a', href=True):
                            affiliation = affiliation + ', ' + a['href']

    characteristics_pd = pd.DataFrame([[url, name, current_alias, relatives, affiliation]], columns = new_columns)

    personnage_pd = personnage_pd.append(characteristics_pd,ignore_index=True)
    
nextpage = soup.find('link', {"rel" : "next"})

**Parse the remaining pages:**

In [8]:
i = 0
tot_page = 11644/200

while(len(nextpage['href'])):
    # Print a loading bar
    i += 1
    printed= i/tot_page*100
    stdout.write("\r%f %%" % printed)
    stdout.flush()
    
    urlnext_page = nextpage['href']
    r = requests.get(urlnext_page)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')

    # Collect URL of each character in the web page
    publications_wrappers = soup.find_all('li', class_='category-page__member')
    list_url = []
    for p in publications_wrappers:
        for a in p.find_all('a', href=True):
            list_url.append(a['href'])
            
    # Remove duplicate
    my_set =set(list_url)
    good_list_url = list(my_set)
    idx =0
    dict_geant={}
    for pers in good_list_url:
        # Get URL and use html parser
        URL_char = 'https://dc.fandom.com/' + pers
        URL_char = URL_char.replace("'","")
        r_char = requests.get(URL_char)
        page_body_char = r_char.text
        soup_char = BeautifulSoup(page_body_char, 'html.parser')
        
        # Initialize variables
        url, name, current_alias, relatives, affiliation = '','','','',''

        personnage =[]

        # Parsing
        url = pers
        side_tab = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')
        for p in side_tab:
            for pp in p.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
                for ppp in pp.find_all('h3', class_='pi-data-label pi-secondary-font'):
                    if(ppp.text[0:]=="Real Name"):
                        name = pp.find('div', class_='pi-data-value pi-font').text
                    if(ppp.text[0:]=="Current Alias"):
                        current_alias = pp.find('div', class_='pi-data-value pi-font').text
                    if(ppp.text[0:]=="Relatives"):
                        div = pp.find('div', class_='pi-data-value pi-font')
                        if div:
                            for a in div.find_all('a', href=True):
                                relatives = relatives + ', ' + a['href']
                    if(ppp.text[0:]=="Affiliation"):
                        div = pp.find('div', class_='pi-data-value pi-font')
                        if div:
                            for a in div.find_all('a', href=True):
                                affiliation = affiliation + ', ' + a['href']

        characteristics_pd = pd.DataFrame([[url, name, current_alias, relatives, affiliation]], columns = new_columns)

        personnage_pd = personnage_pd.append(characteristics_pd,ignore_index=True)
    
    nextpage = soup.find('link', {"rel" : "next"})

101.339746 %

TypeError: 'NoneType' object is not subscriptable

### 2.1.2 Bad DC characters

**Collect the first URLs:**

In [9]:
URL = 'https://dc.fandom.com/wiki/Category:Bad_Characters'
r = requests.get(URL)
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

# Collect URL of each character in the web page
publications_wrappers = soup.find_all('li', class_='category-page__member')

list_url = []
for p in publications_wrappers:
    for a in p.find_all('a', href=True):
        list_url.append(a['href'])
        
# Remove duplicate
my_set = set(list_url)
good_list_url = list(my_set)

**Parse the first pages:**

In [10]:
new_columns = ['URL', 'Real Name', 'Current Alias', 'Relatives', 'Affiliation']

personnage_pd = pd.DataFrame(columns=new_columns)
idx = 0
dict_geant = {}
for pers in good_list_url:
    # Get URL and use html parser
    URL_char = 'https://dc.fandom.com/' + pers
    URL_char = URL_char.replace("'","")
    r_char = requests.get(URL_char)
    page_body_char = r_char.text
    soup_char = BeautifulSoup(page_body_char, 'html.parser')
    
    # Initialize variables
    url, name, current_alias, relatives, affiliation = '','','','',''
    personnage =[]
    
    # Parsing
    url = pers
    side_tab = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')
    for p in side_tab:
        for pp in p.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
            for ppp in pp.find_all('h3', class_='pi-data-label pi-secondary-font'):
                if(ppp.text[0:]=="Real Name"):
                    name = pp.find('div', class_='pi-data-value pi-font').text
                if(ppp.text[0:]=="Current Alias"):
                    current_alias = pp.find('div', class_='pi-data-value pi-font').text
                if(ppp.text[0:]=="Relatives"):
                    div = pp.find('div', class_='pi-data-value pi-font')
                    if div:
                        for a in div.find_all('a', href=True):
                            relatives = relatives + ', ' + a['href']
                if(ppp.text[0:]=="Affiliation"):
                    div = pp.find('div', class_='pi-data-value pi-font')
                    if div:
                        for a in div.find_all('a', href=True):
                            affiliation = affiliation + ', ' + a['href']

    characteristics_pd = pd.DataFrame([[url, name, current_alias, relatives, affiliation]], columns = new_columns)

    personnage_pd = personnage_pd.append(characteristics_pd,ignore_index=True)
    
nextpage = soup.find('link', {"rel" : "next"})

**Parse the remaining pages:**

In [11]:
i = 0
tot_page = 10272/200

while(len(nextpage['href'])):
    # Print a loading bar
    i += 1
    printed= i/tot_page*100
    stdout.write("\r%f %%" % printed)
    stdout.flush()
    
    urlnext_page = nextpage['href']
    r = requests.get(urlnext_page)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')

    # Collect URL of each character in the web page
    publications_wrappers = soup.find_all('li', class_='category-page__member')
    list_url = []
    for p in publications_wrappers:
        for a in p.find_all('a', href=True):
            list_url.append(a['href'])
            
    # Remove duplicate
    my_set =set(list_url)
    good_list_url = list(my_set)
    idx =0
    dict_geant={}
    for pers in good_list_url:
        # Get URL and use html parser
        URL_char = 'https://dc.fandom.com/' + pers
        URL_char = URL_char.replace("'","")
        r_char = requests.get(URL_char)
        page_body_char = r_char.text
        soup_char = BeautifulSoup(page_body_char, 'html.parser')
        
        # Initialize variables
        url, name, current_alias, relatives, affiliation = '','','','',''

        personnage =[]

        # Parsing
        url = pers
        side_tab = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')
        for p in side_tab:
            for pp in p.find_all('div', class_='pi-item pi-data pi-item-spacing pi-border-color'):
                for ppp in pp.find_all('h3', class_='pi-data-label pi-secondary-font'):
                    if(ppp.text[0:]=="Real Name"):
                        name = pp.find('div', class_='pi-data-value pi-font').text
                    if(ppp.text[0:]=="Current Alias"):
                        current_alias = pp.find('div', class_='pi-data-value pi-font').text
                    if(ppp.text[0:]=="Relatives"):
                        div = pp.find('div', class_='pi-data-value pi-font')
                        if div:
                            for a in div.find_all('a', href=True):
                                relatives = relatives + ', ' + a['href']
                    if(ppp.text[0:]=="Affiliation"):
                        div = pp.find('div', class_='pi-data-value pi-font')
                        if div:
                            for a in div.find_all('a', href=True):
                                affiliation = affiliation + ', ' + a['href']

        characteristics_pd = pd.DataFrame([[url, name, current_alias, relatives, affiliation]], columns = new_columns)

        personnage_pd = personnage_pd.append(characteristics_pd,ignore_index=True)
    
    nextpage = soup.find('link', {"rel" : "next"})

101.246106 %

TypeError: 'NoneType' object is not subscriptable

In [13]:
pickle.dump(personnage_pd, open('data/bad_dc.txt','wb'))

### 2.1.3 Join dataframes

**Concatenate the dataframes for good and bad characters:**

In [7]:
with open('data/bad_dc.txt', 'rb') as f:
    bad_dc = pickle.load(f)

bad_dc

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Albert_Rothstein_(Arrow:_Earth-2),Albert Rothstein,Atom Smasher,,
1,/wiki/Ahk-Ton_(New_Earth),Ahk-Ton,Metamorpho,,
2,/wiki/Adam_Bomb_(New_Earth),Unknown,Adam Bomb,,
3,/wiki/Adolf_Hitler_(JSA:_The_Golden_Age),Adolf Hitler,,,", /wiki/Nazi_Party"
4,/wiki/Agarushnawokliag_(Prime_Earth),Agarushnawokliag,,,
...,...,...,...,...,...
10472,/wiki/Zeta_(Earth-One),Unknown,Zeta,,", /wiki/Pantheon"
10473,/wiki/Zotan_(Earth-S),Zotan,Zotan,,
10474,/wiki/Zond_(Earth-One),Zond,Zond the Sorcerer,,", /wiki/Morgaine_le_Fey"
10475,/wiki/Zora_Vi-Lar_(Earth-One),Zora Vi-Lar,Black Flame,,


In [6]:
with open('data/good_dc.txt', 'rb') as f:
    good_dc = pickle.load(f)

good_dc

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Aaron_Hayley_(New_Earth),Aaron Hayley,Swamp Thing,,
1,/wiki/Abigail_Fine_(Smallville),Abigail Fine,Abigail Fine,", /wiki/Elise_Fine_(Smallville)",
2,/wiki/Adam_Strange_(JSA:_The_Golden_Age),Adam Strange,Adam Strange,,
3,/wiki/Alan_Barnes_(New_Earth),Alan Barnes,Brainstorm,,
4,/wiki/Alan_Scott_(Earth_2),Alan Scott,Green Lantern,", /wiki/Sam_Zhao_(Earth_2)",", /wiki/Wonders_of_the_World, /wiki/The_Green"
...,...,...,...,...,...
11841,/wiki/Zor_In-Ze_(DCAU),Zor In-Ze,Zor In-Ze,", /wiki/Kala_Im-Re_(DCAU), /wiki/Kara_In-Ze_(D...",
11842,/wiki/Zoe_Lawton_(DC_Legends),Zoe Lawton,,", /wiki/Floyd_Lawton_(DC_Legends)",
11843,/wiki/Zilya_Popoff_(Earth-Prime),Zilya Popoff,Zilya Popoff,,", /wiki/United_Planets_Young_Heroes_(Earth-Prime)"
11844,/wiki/Zor-El_(DC_Legends),Zor-El,,", /wiki/Jor-El_(DC_Legends), /wiki/Kara_Zor-El...",", /wiki/House_of_El"


In [9]:
pers_dc = [good_dc, bad_dc]
characters_dc = pd.concat(pers_dc)

**Save to pickle:**

In [10]:
pickle.dump(characters_dc, open('data/characters_dc.txt','wb'))

**Open the saved file:**

In [11]:
character_dc = pd.read_pickle('data/characters_dc.txt')

character_dc

Unnamed: 0,URL,Real Name,Current Alias,Relatives,Affiliation
0,/wiki/Aaron_Hayley_(New_Earth),Aaron Hayley,Swamp Thing,,
1,/wiki/Abigail_Fine_(Smallville),Abigail Fine,Abigail Fine,", /wiki/Elise_Fine_(Smallville)",
2,/wiki/Adam_Strange_(JSA:_The_Golden_Age),Adam Strange,Adam Strange,,
3,/wiki/Alan_Barnes_(New_Earth),Alan Barnes,Brainstorm,,
4,/wiki/Alan_Scott_(Earth_2),Alan Scott,Green Lantern,", /wiki/Sam_Zhao_(Earth_2)",", /wiki/Wonders_of_the_World, /wiki/The_Green"
...,...,...,...,...,...
10472,/wiki/Zeta_(Earth-One),Unknown,Zeta,,", /wiki/Pantheon"
10473,/wiki/Zotan_(Earth-S),Zotan,Zotan,,
10474,/wiki/Zond_(Earth-One),Zond,Zond the Sorcerer,,", /wiki/Morgaine_le_Fey"
10475,/wiki/Zora_Vi-Lar_(Earth-One),Zora Vi-Lar,Black Flame,,


# 2.2 Parse the DC comics:

**Collect the first URLs:**

In [None]:
# Get URL and use html parser
URL = 'https://dc.fandom.com/wiki/Category:Comics'
r = requests.get(URL)
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

# Collect URL of each character in the web page
publications_wrappers = soup.find_all('li', class_='category-page__member')

# Initialization
i = 0
list_url = []

for p in publications_wrappers:
    # Don't take into account the first links which are "Categories"
    if(i>=24):
        for a in p.find_all('a', href=True):
            list_url.append(a['href'])
    i +=1
    
# Remove duplicate
my_set = set(list_url)
good_list_url = list(my_set)

**Parse the first pages:**

In [None]:
comics_columns = ['URL', 'Good characters', 'Bad characters', 
                  'Neutral characters', 'Editor-in-chief', 'Editor-in-chief URL', 
                  'Writer', 'Writer URL', 'Publication date', 'Subcomic']

comics_pd = pd.DataFrame(columns=comics_columns)

# Parse the first page
for comics in good_list_url:
    # Get URL and use html parser
    URL_char = 'https://dc.fandom.com' + comics
    URL_char = URL_char.replace("'","")
    r_char = requests.get(URL_char)
    page_body_char = r_char.text
    soup_char = BeautifulSoup(page_body_char, 'html.parser')
    
    # Initialization
    good, bad, neutral = '','',''
    editor, writer, publication = '','',''
    editorURL, writerURL, subcomic = '','',''
    URL2 = URL_char.replace('https://dc.fandom.com','')
    
    # Boolean for the first case
    first = 1
    
    # Parse characters appearances
    appearances = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')
    for p in appearances:
        for pp in p.find_all('p'):
            # If a comic is split in sub-comics
            span = pp.find_previous('span', class_='mw-headline')
            if span and ("Appearing" in span.text):
                if (not first and subcomic!=span.text \
                and (good or bad or neutral)):
                    # Take only the title of the subcomic
                    subcomic = subcomic[14:-1]
                    appearances_pd = pd.DataFrame([[URL2, good, bad, neutral, 
                                        editor, editorURL,
                                        writer, writerURL,
                                        publication, subcomic]], columns = comics_columns)
                    comics_pd = comics_pd.append(appearances_pd)
                    good, bad, neutral = '','',''
                    editor, writer, publication = '','',''
                    editorURL, writerURL = '',''
                    subcomic = span.text
                else:
                    first = 0
                    subcomic = span.text
                    
            if "Featured Characters:" in pp.text:
                ul = pp.find_next('ul')
                for a in ul.find_all('a', href=True):
                    good = good + ', ' + a['href']
            if "Supporting Characters:" in pp.text:
                ul = pp.find_next('ul')
                for a in ul.find_all('a', href=True):
                    good = good + ', ' + a['href']
            if "Antagonists:" in pp.text:
                ul = pp.find_next('ul')
                for a in ul.find_all('a', href=True):
                    bad = bad + ', ' + a['href']
            if "Other Characters:" in pp.text:
                ul = pp.find_next('ul')
                for a in ul.find_all('a', href=True):
                    neutral = neutral + ', ' + a['href']
                    
    # Each subcomic is written in the format '"Appearing in *subcomic*"'
    # We keep only the subcomic name
    subcomic = subcomic[14:-1]
    appearances_pd = pd.DataFrame([[URL2, good, bad, neutral, 
                                    editor, editorURL,
                                    writer, writerURL,
                                    publication, subcomic]], columns = comics_columns)
    comics_pd = comics_pd.append(appearances_pd)
    subcomic = ''
    
    
    # Parse comic characteristics
    side_tab = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')
    for p in side_tab:
        # Publication date
        if(p.find_next('h2', {'data-source':'StoryTitle1'})):
            tab = p.find_next('h2', class_='pi-item pi-item-spacing pi-title')
        if(tab):
            tab = tab.find_next('h2', class_='pi-item pi-item-spacing pi-title')
        if(tab):
            tab = tab.find_next('h2', class_='pi-item pi-item-spacing pi-title')
        if(tab):
            publication = tab.text
            
        # Editors
        for pp in p.find_all('h3', class_='pi-data-label pi-secondary-font'):
            if ('Executive Editor' in pp.text):
                ppp = pp.find_next('div', class_='pi-data-value pi-font')
                for a in ppp.find_all('a'):
                    editor = editor + ', ' + a.text
                for a in ppp.find_all('a', href=True):
                    editorURL = editorURL + ', ' + a['href']

        sub = tab.find_next('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')

        # Writers
        for pp in p.find_all('h3', class_='pi-data-label pi-secondary-font'):
            if(pp.find_previous('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')):
                sub = pp.find_previous('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')
            if sub:
                adiv = sub.find_next('h3', class_='pi-data-label pi-secondary-font')
            if (adiv and 'Writer' in adiv.text and 'Writer' in pp.text):
                if (sub):
                    subcomic = sub.text[1:-1]
                ppp = pp.find_next('div', class_='pi-data-value pi-font')
                for a in ppp.find_all('a'):
                    writer = writer + ', ' + a.text
                for a in ppp.find_all('a', href=True):
                    writerURL = writerURL + ', ' + a['href']
                comics_pd.loc[comics_pd.Subcomic == subcomic,'Writer'] = writer
                comics_pd.loc[comics_pd.Subcomic == subcomic,'Writer URL'] = writerURL
                if (sub):
                    sub = sub.find_next('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')
                    writer, writerURL = '',''
            elif (sub): 
                sub = sub.find_next('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')
                writer, writerURL = '',''    
                
    comics_pd.loc[comics_pd.URL == URL2,'Editor-in-chief'] = editor
    comics_pd.loc[comics_pd.URL == URL2,'Editor-in-chief URL'] = editorURL
    comics_pd.loc[comics_pd.URL == URL2,'Publication date'] = publication

**Parse the remaining pages:**

In [None]:
# Parse all the other pages
nextpage = soup.find('link', {"rel" : "next"})

i = 0
tot_page = 49759/200

while(len(nextpage['href'])):
    # Print a loading bar
    i += 1
    printed= i/tot_page*100
    stdout.write("\r%f %%" % printed)
    stdout.flush()
    urlnext_page = nextpage['href']
    r = requests.get(urlnext_page)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')
    
    # Collect url of each character in the web page
    publications_wrappers = soup.find_all('li', class_='category-page__member')
    list_url = []
    for p in publications_wrappers:
        for a in p.find_all('a', href=True):
            list_url.append(a['href'])
          
    # Remove duplicates
    my_set = set(list_url)
    good_list_url = list(my_set)
    good_list_url = [ x for x in good_list_url if "/wiki/Category:" not in x ]
  
    # Parse the first page
    for comics in good_list_url:
        # Get URL and use html parser
        URL_char = 'https://dc.fandom.com' + comics
        URL_char = URL_char.replace("'","")
        r_char = requests.get(URL_char)
        page_body_char = r_char.text
        soup_char = BeautifulSoup(page_body_char, 'html.parser')
        
        # Initialization
        good, bad, neutral = '','',''
        editor, writer, publication = '','',''
        editorURL, writerURL, subcomic = '','',''
        URL2 = URL_char.replace('https://dc.fandom.com','')

        # Boolean for the first case
        first = 1

        # Parse characters appearances
        appearances = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')
        for p in appearances:
            for pp in p.find_all('p'):
                # If a comic is split in sub-comics
                span = pp.find_previous('span', class_='mw-headline')
                if span and ("Appearing" in span.text):
                    if (not first and subcomic!=span.text \
                    and (good or bad or neutral)):
                        # Take only the title of the subcomic
                        subcomic = subcomic[14:-1]
                        appearances_pd = pd.DataFrame([[URL2, good, bad, neutral, 
                                              editor, editorURL,
                                              writer, writerURL,
                                              publication, subcomic]], columns = comics_columns)
                        comics_pd = comics_pd.append(appearances_pd)

                        good, bad, neutral = '','',''
                        editor, writer, publication = '','',''
                        editorURL, writerURL = '',''
                        subcomic = span.text
                    else:
                        first = 0
                        subcomic = span.text
                if "Featured Characters:" in pp.text:
                    ul = pp.find_next('ul')
                    for a in ul.find_all('a', href=True):
                        good = good + ', ' + a['href']
                if "Supporting Characters:" in pp.text:
                    ul = pp.find_next('ul')
                    for a in ul.find_all('a', href=True):
                        good = good + ', ' + a['href']
                if "Antagonists:" in pp.text:
                    ul = pp.find_next('ul')
                    for a in ul.find_all('a', href=True):
                        bad = bad + ', ' + a['href']
                if "Other Characters:" in pp.text:
                    ul = pp.find_next('ul')
                    for a in ul.find_all('a', href=True):
                        neutral = neutral + ', ' + a['href']

        # Each subcomic is written in the format '"Appearing in *subcomic*"'
        # We keep only the subcomic name
        subcomic = subcomic[14:-1]

        appearances_pd = pd.DataFrame([[URL2, good, bad, neutral, 
                                          editor, editorURL,
                                          writer, writerURL,
                                          publication, subcomic]], columns = comics_columns)

        comics_pd = comics_pd.append(appearances_pd)

        subcomic = ''


        # Parse comic characteristics
        side_tab = soup_char.find_all('div', class_='mw-content-ltr mw-content-text')
        for p in side_tab:
            # Publication date
            if(p.find_next('h2', {'data-source':'StoryTitle1'})):
                tab = p.find_next('h2', class_='pi-item pi-item-spacing pi-title')
                if(tab):
                    tab = tab.find_next('h2', class_='pi-item pi-item-spacing pi-title')
                if(tab):
                    tab = tab.find_next('h2', class_='pi-item pi-item-spacing pi-title')
                if(tab):
                    publication = tab.text
            else:
                if(tab):
                    tab = p.find_next('h2', class_='pi-item pi-item-spacing pi-title')
                if(tab):
                    tab = tab.find_next('h2', class_='pi-item pi-item-spacing pi-title')
                if(tab):
                    publication = tab.text

            # Editors
            for pp in p.find_all('h3', class_='pi-data-label pi-secondary-font'):
                if ('Executive Editor' in pp.text):
                    ppp = pp.find_next('div', class_='pi-data-value pi-font')
                    for a in ppp.find_all('a'):
                        editor = editor + ', ' + a.text
                    for a in ppp.find_all('a', href=True):
                        editorURL = editorURL + ', ' + a['href']
            if(tab):           
                if(tab.find_next('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')):
                      sub = tab.find_next('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')

            # Writers
            for pp in p.find_all('h3', class_='pi-data-label pi-secondary-font'):
                if(pp.find_previous('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')):
                    sub = pp.find_previous('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')
                if sub:
                    adiv = sub.find_next('h3', class_='pi-data-label pi-secondary-font')
                if (adiv and 'Writer' in adiv.text and 'Writer' in pp.text):
                    if (sub):
                        subcomic = sub.text[1:-1]
                        ppp = pp.find_next('div', class_='pi-data-value pi-font')
                    for a in ppp.find_all('a'):
                        writer = writer + ', ' + a.text
                    for a in ppp.find_all('a', href=True):
                        writerURL = writerURL + ', ' + a['href']
                    comics_pd.loc[comics_pd.Subcomic == subcomic,'Writer'] = writer
                    comics_pd.loc[comics_pd.Subcomic == subcomic,'Writer URL'] = writerURL
                    if (sub):
                        sub = sub.find_next('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')
                        writer, writerURL = '',''
                elif (sub): 
                    sub = sub.find_next('h2', class_='pi-item pi-header pi-secondary-font pi-item-spacing pi-secondary-background')
                    writer, writerURL = '',''    
                
        comics_pd.loc[comics_pd.URL == URL2,'Editor-in-chief'] = editor
        comics_pd.loc[comics_pd.URL == URL2,'Editor-in-chief URL'] = editorURL
        comics_pd.loc[comics_pd.URL == URL2,'Publication date'] = publication
    nextpage = soup.find('link', {"rel" : "next"})


**Save to pickle:**

In [None]:
pickle.dump(comics_pd, open('data/comics_dc.txt','wb'))

**Open the saved file:**

In [11]:
with open('data/comics_dc.txt', 'rb') as f:
    comics_pd = pickle.load(f)

comics_pd

Unnamed: 0,URL,Good characters,Bad characters,Neutral characters,Editor-in-chief,Editor-in-chief URL,Writer,Writer URL,Publication date,Subcomic
0,/wiki/100_Bullets_Vol_1_64,", /wiki/Jack_Daw_(100_Bullets), /wiki/Philip_G...",,,", Karen Berger",", /wiki/Karen_Berger",,,"November, 2005",The Dive
0,/wiki/100_Bullets_Vol_1_25,", /wiki/Augustus_Medici_(100_Bullets), /wiki/B...",,,", Karen Berger",", /wiki/Karen_Berger",,,"August, 2001",Red Prince Blues (Part III of III)
0,/wiki/2020_Visions_Vol_1_5,,,,", Karen Berger",", /wiki/Karen_Berger",", Ron Marz",", /wiki/Ron_Marz","September, 1997",
0,/wiki/100%25_True%3F_Vol_1_2,,,,", Jenette Kahn",", /wiki/Jenette_Kahn",", Ron Marz",", /wiki/Ron_Marz","December, 1997",
0,/wiki/100_Bullets_Vol_1_11,", /wiki/Philip_Graves_(100_Bullets)",,,", Karen Berger",", /wiki/Karen_Berger",,,"June, 2000","Heartbreak, Sunny Side Up"
...,...,...,...,...,...,...,...,...,...,...
0,/wiki/Zatanna_Vol_2_1,,,,", Dan DiDio",", /wiki/Dan_DiDio",,,"July, 2010",
0,/wiki/Zero_Girl_Vol_1_4,,,,", Jim Lee",", /wiki/Jim_Lee",,,"May, 2001",
0,/wiki/Young_Romance_Vol_1_196,,,,,,,,"December, 1973",he 1st Stor
0,/wiki/Young_Romance_Vol_1_126,,,,,,,,"November, 1963",he 1st Stor
