#### Extract Shakespeare's plays text

In [1]:
import os
import os.path
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import re
import urllib.request
import pickle
import time

#### Front page

In [2]:
main_page = 'http://shakespeare.mit.edu/'

In [3]:
def parse_main_page(url):
    page = urllib.request.urlopen(url)
    bs = BeautifulSoup(page, 'html.parser')
    
    table = bs.find(lambda tag: tag.name=='table' and tag.has_attr('align')) 
    rows = table.findAll(lambda tag: tag.name=='tr')

    cells1 = rows[0].findAll(lambda tag: tag.name=='td')
    cells2 = rows[1].findAll(lambda tag: tag.name=='td')
    
    plays = []

    for i, cell2 in enumerate(cells2):
        genre = cells1[i].find(lambda tag: tag.name=='h2').text.strip()
        # do not load poetry, only plays
        if genre == 'Poetry':
            continue
        links = cell2.findAll(lambda tag: tag.name=='a')
        for link in links:
            play = {}
            play['genre'] = genre.replace('\n','')
            play['name'] = link.text.replace('\n','')
            play['url'] = link['href'].replace('\n','')[:-10]
            plays.append(play)
    return plays

In [4]:
for p in parse_main_page(main_page):
    print(p['genre'], p['name'], p['url'])

Comedy All's Well That Ends Well allswell/
Comedy As You Like It asyoulikeit/
Comedy The Comedy of Errors comedy_errors/
Comedy Cymbeline cymbeline/
Comedy Love's Labours Lost lll/
Comedy Measure for Measure measure/
Comedy TheMerry Wives of Windsor merry_wives/
Comedy The Merchant of Venice merchant/
Comedy A Midsummer Night's Dream midsummer/
Comedy Much Ado About Nothing much_ado/
Comedy Pericles, Prince of Tyre pericles/
Comedy Taming of the Shrew taming_shrew/
Comedy The Tempest tempest/
Comedy Troilus and Cressida troilus_cressida/
Comedy Twelfth Night twelfth_night/
Comedy Two Gentlemen of Verona two_gentlemen/
Comedy Winter's Tale winters_tale/
History Henry IV, part 1 1henryiv/
History Henry IV, part 2 2henryiv/
History Henry V henryv/
History Henry VI, part 1 1henryvi/
History Henry VI, part 2 2henryvi/
History Henry VI, part 3 3henryvi/
History Henry VIII henryviii/
History King John john/
History Richard II richardii/
History Richard III richardiii/
Tragedy Antony and Cleop

In [5]:
def parse_play_page(url):
    page = urllib.request.urlopen(url)
    bs = BeautifulSoup(page, 'html.parser')
    
    regex = re.compile('^[A-z]+\.(\d+)\.(\d+)\.html$', re.IGNORECASE)
    part_links = bs.findAll(lambda tag: tag.name=='a')
    
    parts = []

    for part_link in part_links:
        m = regex.match(part_link['href'])
        if(m is None):
            continue
        part = {}
        part['url'] = m.group(0)
        part['act'] = m.group(1)
        part['scene'] = m.group(2)
        part['name'] = part_link.text.replace('\n','')
        parts.append(part)
    return parts

####  The Life of King Henry the Fifth

In [6]:
for part in parse_play_page('http://shakespeare.mit.edu/henryv/'):
    print(part['act'], part['scene'], part['name'], part['url'])

1 0 PROLOGUE henryv.1.0.html
1 1 London. An ante-chamber in the KING'S palace. henryv.1.1.html
1 2 The same. The Presence chamber. henryv.1.2.html
2 0 PROLOGUE henryv.2.0.html
2 1 London. A street. henryv.2.1.html
2 2 Southampton. A council-chamber. henryv.2.2.html
2 3 London. Before a tavern. henryv.2.3.html
2 4 France. The KING'S palace. henryv.2.4.html
3 0 PROLOGUE henryv.3.0.html
3 1 France. Before Harfleur. henryv.3.1.html
3 2 The same. henryv.3.2.html
3 3 The same. Before the gates. henryv.3.3.html
3 4 The FRENCH KING's palace. henryv.3.4.html
3 5 The same. henryv.3.5.html
3 6 The English camp in Picardy. henryv.3.6.html
3 7 The French camp, near Agincourt: henryv.3.7.html
4 0 PROLOGUE henryv.4.0.html
4 1 The English camp at Agincourt. henryv.4.1.html
4 2 The French camp. henryv.4.2.html
4 3 The English camp. henryv.4.3.html
4 4 The field of battle. henryv.4.4.html
4 5 Another part of the field. henryv.4.5.html
4 6 Another part of the field. henryv.4.6.html
4 7 Another part of th

In [7]:
def parse_act_page_by_sentence(url, debug = False):
    page = urllib.request.urlopen(url)
    bs = BeautifulSoup(page, 'html.parser')
    
    regex = re.compile('^speech(\d+)$', re.IGNORECASE)

    speech_links = bs.findAll(lambda tag: tag.name=='a' and tag.has_attr('name'))

    sentences = []
    for speech_link in speech_links:

        m = regex.match(speech_link['name'])
        if(m is None):
            continue

        speech_number = m.group(1)
        speaker = speech_link.text.replace('\n','').strip()
        
        blockquote = speech_link.find_next_sibling('blockquote')
        sentence_links = blockquote.findAll(
            lambda tag: tag.name=='a' and tag.has_attr('name') and tag['name'].isdigit()
        )
        
        for sentence_link in sentence_links:
            if(debug):
                print(sentence_link)
            sentence = {}
            sentence['speech_number'] = speech_number
            sentence['speaker'] = speaker
            sentence['text'] = sentence_link.text.replace('\n','').strip()
            sentence['number'] = sentence_link['name']
            sentences.append(sentence)
    return sentences

In [8]:
def parse_act_page_by_speech(url, debug = False):
    page = urllib.request.urlopen(url)
    bs = BeautifulSoup(page, 'html.parser')
    
    regex = re.compile('^speech(\d+)$', re.IGNORECASE)

    speech_links = bs.findAll(lambda tag: tag.name=='a' and tag.has_attr('name'))

    speeches = []
    for speech_link in speech_links:

        m = regex.match(speech_link['name'])
        if(m is None):
            continue

        speech_number = m.group(1)
        speaker = speech_link.text.strip()
        blockquote = speech_link.find_next_sibling('blockquote')
        
        speech = {}
        speech['speech_number'] = speech_number
        speech['speaker'] = speaker
        speech['text'] = blockquote.text.strip()
        
        speeches.append(speech)
    return speeches

####  The Life of King Henry the Fifth Act 1 Scene 1

In [9]:
for sentence in parse_act_page_by_sentence('http://shakespeare.mit.edu/henryv/henryv.1.1.html'):
    print(sentence['speech_number'], sentence['speaker'], sentence['number'], sentence['text'])

1 CANTERBURY 1 My lord, I'll tell you; that self bill is urged,
1 CANTERBURY 2 Which in the eleventh year of the last king's reign
1 CANTERBURY 3 Was like, and had indeed against us pass'd,
1 CANTERBURY 4 But that the scambling and unquiet time
1 CANTERBURY 5 Did push it out of farther question.
2 ELY 6 But how, my lord, shall we resist it now?
3 CANTERBURY 7 It must be thought on. If it pass against us,
3 CANTERBURY 8 We lose the better half of our possession:
3 CANTERBURY 9 For all the temporal lands which men devout
3 CANTERBURY 10 By testament have given to the church
3 CANTERBURY 11 Would they strip from us; being valued thus:
3 CANTERBURY 12 As much as would maintain, to the king's honour,
3 CANTERBURY 13 Full fifteen earls and fifteen hundred knights,
3 CANTERBURY 14 Six thousand and two hundred good esquires;
3 CANTERBURY 15 And, to relief of lazars and weak age,
3 CANTERBURY 16 Of indigent faint souls past corporal toil.
3 CANTERBURY 17 A hundred almshouses right well supplied

In [10]:
for speech in parse_act_page_by_speech('http://shakespeare.mit.edu/henryv/henryv.1.1.html'):
    print(speech['speech_number'], speech['speaker'], speech['text'])

1 CANTERBURY My lord, I'll tell you; that self bill is urged,
Which in the eleventh year of the last king's reign
Was like, and had indeed against us pass'd,
But that the scambling and unquiet time
Did push it out of farther question.
2 ELY But how, my lord, shall we resist it now?
3 CANTERBURY It must be thought on. If it pass against us,
We lose the better half of our possession:
For all the temporal lands which men devout
By testament have given to the church
Would they strip from us; being valued thus:
As much as would maintain, to the king's honour,
Full fifteen earls and fifteen hundred knights,
Six thousand and two hundred good esquires;
And, to relief of lazars and weak age,
Of indigent faint souls past corporal toil.
A hundred almshouses right well supplied;
And to the coffers of the king beside,
A thousand pounds by the year: thus runs the bill.
4 ELY This would drink deep.
5 CANTERBURY 'Twould drink the cup and all.
6 ELY But what prevention?
7 CANTERBURY The king is full of

In [11]:
file_name = 'shakespeare_plays.pickle'
if os.path.isfile(file_name):
    with open(file_name, 'rb') as handle:
        features = pickle.load(handle)
else:
    features = []
    for play in parse_main_page(main_page):
        for part in parse_play_page(main_page + play['url']):
            act_url = main_page + play['url'] + part['url']
            for speech in parse_act_page_by_speech(act_url):
                feature = {}
                feature['genre'] = play['genre']
                feature['play_name'] = play['name']
                feature['act'] = part['act']
                feature['scene'] = part['scene']
                feature['scene_name'] = part['name']
                feature['speech_number'] = speech['speech_number']
                feature['speaker'] = speech['speaker']
                feature['speech_text'] = speech['text']
                features.append(feature)
    with open(file_name, 'wb') as handle:
        pickle.dump(features, handle, protocol=pickle.HIGHEST_PROTOCOL)    

We have complete list of sentences (from all the plays)

In [12]:
features_df = pd.DataFrame(features)

In [13]:
print(features_df.shape)

(27091, 8)


In [14]:
features_df.sample(n = 10)

Unnamed: 0,act,genre,play_name,scene,scene_name,speaker,speech_number,speech_text
25574,4,Tragedy,Romeo and Juliet,1,Friar Laurence's cell.,PARIS,21,"Thy face is mine, and thou hast slander'd it."
5991,4,Comedy,TheMerry Wives of Windsor,2,A room in FORD'S house.,MISTRESS FORD,71,"Nay, good, sweet husband! Good gentlemen, let ..."
25086,1,Tragedy,Romeo and Juliet,4,A street.,MERCUTIO,17,"Why, may one ask?"
8259,5,Comedy,Much Ado About Nothing,4,A room in LEONATO'S house.,DON PEDRO,16,Good morrow to this fair assembly.
11018,3,Comedy,Troilus and Cressida,2,The same. Pandarus' orchard.,CRESSIDA,34,"Hard to seem won: but I was won, my lord,\nWit..."
14540,4,History,Henry V,1,The English camp at Agincourt.,KING HENRY V,28,I thank you: God be with you!
14016,5,Comedy,Winter's Tale,1,A room in LEONTES' palace.,PAULINA,12,"Had she such power,\nShe had just cause."
1014,1,Comedy,As You Like It,2,Lawn before the Duke's palace.,ROSALIND,38,As wit and fortune will.
20415,2,Tragedy,Hamlet,2,A room in the castle.,LORD POLONIUS,53,"What is the matter, my lord?"
21339,1,Tragedy,Julius Caesar,3,The same. A street.,CASSIUS,17,"You are dull, Casca, and those sparks of life\..."
