#### Extract Shakespeare's poetry text

In [1]:
import os
import os.path
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import re
import urllib.request
import pickle
import time

#### The Sonnets

In [2]:
def parse_sonnets_page(url):
    """Collect the sonnet links listed on an index page.

    Downloads ``url``, parses it with BeautifulSoup's ``html.parser`` and
    selects every ``<a>`` element whose ``href`` attribute starts with
    ``'sonnet'``.

    Parameters
    ----------
    url : str
        Address of the sonnet index page.

    Returns
    -------
    list of dict
        One dict per matching link, in document order, with the keys
        ``'name'`` (link text with surrounding whitespace stripped) and
        ``'url'`` (the ``href`` value exactly as written in the page).
    """
    # The context manager closes the HTTP response once the parser has
    # consumed it, instead of leaving the socket open until garbage collection.
    with urllib.request.urlopen(url) as page:
        bs = BeautifulSoup(page, 'html.parser')

    # ``find_all`` is the current spelling of the legacy ``findAll`` alias.
    sonnet_links = bs.find_all(
        lambda tag: tag.name == 'a'
        and tag.has_attr('href')
        and tag['href'].startswith('sonnet')
    )

    return [
        {'name': sonnet_link.text.strip(), 'url': sonnet_link['href']}
        for sonnet_link in sonnet_links
    ]

In [3]:
def parse_sonet_page(url):
    """Return the text of the first ``<blockquote>`` on a sonnet page.

    Parameters
    ----------
    url : str
        Address of a single sonnet page.

    Returns
    -------
    str
        The ``.text`` of the first ``<blockquote>`` element, unmodified
        (no stripping is applied here).

    Raises
    ------
    AttributeError
        If the page has no ``<blockquote>``: ``find`` then yields ``None``
        and the ``.text`` access fails.
    """
    # Close the HTTP response as soon as the parser has read it rather than
    # leaking one open connection per sonnet.
    with urllib.request.urlopen(url) as page:
        bs = BeautifulSoup(page, 'html.parser')

    blockquote = bs.find('blockquote')
    return blockquote.text

In [4]:
# Read the sonnet index, then download every sonnet and store its text on the
# matching entry under the 'text' key.
poetries = parse_sonnets_page(
    'http://shakespeare.mit.edu/Poetry/sonnets.html'
)
for poetry in poetries:
    # The index stores hrefs relative to the Poetry/ directory.
    sonnet_url = 'http://shakespeare.mit.edu/Poetry/' + poetry['url']
    poetry.update(text=parse_sonet_page(sonnet_url))

In [5]:
# Scrape "A Lover's Complaint": the poem body is taken from the <p> and
# <blockquote> siblings that follow the page's <h1> title.
# The ``with`` block closes the HTTP response once the parser has read it.
with urllib.request.urlopen('http://shakespeare.mit.edu/Poetry/LoversComplaint.html') as page:
    bs = BeautifulSoup(page, 'html.parser')

title_tag = bs.find(lambda tag: tag.name=='h1')
text_tags = title_tag.find_next_siblings(lambda tag: tag.name=='p' or tag.name=='blockquote')

# Each piece is stripped, so plain concatenation would glue the last word of
# one piece onto the first word of the next; join with a newline instead.
# Pieces that are empty after stripping are dropped so they add no blank lines.
text_pieces = [text_tag.text.strip() for text_tag in text_tags]

poetry = {}
poetry['name'] = "A Lover's Complaint"
poetry['text'] = '\n'.join(piece for piece in text_pieces if piece)

# Append only after the text is complete, so a failed download does not leave
# an empty record behind in ``poetries``.
poetries.append(poetry)

In [6]:
# Scrape "The Rape of Lucrece": the poem body is taken from the <p> and
# <blockquote> siblings that follow the page's <h1> title.
# The ``with`` block closes the HTTP response once the parser has read it.
with urllib.request.urlopen('http://shakespeare.mit.edu/Poetry/RapeOfLucrece.html') as page:
    bs = BeautifulSoup(page, 'html.parser')

title_tag = bs.find(lambda tag: tag.name=='h1')
text_tags = title_tag.find_next_siblings(lambda tag: tag.name=='p' or tag.name=='blockquote')

# Each piece is stripped, so plain concatenation would glue the last word of
# one piece onto the first word of the next; join with a newline instead.
# Pieces that are empty after stripping are dropped so they add no blank lines.
text_pieces = [text_tag.text.strip() for text_tag in text_tags]

poetry = {}
poetry['name'] = 'The Rape of Lucrece'
poetry['text'] = '\n'.join(piece for piece in text_pieces if piece)

# Append only after the text is complete, so a failed download does not leave
# an empty record behind in ``poetries``.
poetries.append(poetry)

In [7]:
# Scrape "Venus and Adonis": the poem body is taken from the page's
# <blockquote> elements.
# The ``with`` block closes the HTTP response once the parser has read it.
with urllib.request.urlopen('http://shakespeare.mit.edu/Poetry/VenusAndAdonis.html') as page:
    bs = BeautifulSoup(page, 'html.parser')

text_tags = bs.find_all(lambda tag: tag.name=='blockquote')

# skip first two blockquotes
# NOTE(review): presumably these hold front matter rather than verse —
# confirm against the live page.
# Each piece is stripped, so plain concatenation would glue the last word of
# one piece onto the first word of the next; join with a newline instead.
# Pieces that are empty after stripping are dropped so they add no blank lines.
text_pieces = [text_tag.text.strip() for text_tag in text_tags[2:]]

poetry = {}
poetry['name'] = 'Venus and Adonis'
poetry['text'] = '\n'.join(piece for piece in text_pieces if piece)

# Append only after the text is complete, so a failed download does not leave
# an empty record behind in ``poetries``.
poetries.append(poetry)

In [8]:
# Scrape "A Funeral Elegy" from the plain text of the whole page.
# The ``with`` block closes the HTTP response once the parser has read it.
with urllib.request.urlopen('http://shakespeare.mit.edu/Poetry/elegy.html') as page:
    bs = BeautifulSoup(page, 'html.parser')

poetry = {}
poetry['name'] = 'A Funeral Elegy'
# The first 915 characters of the page text are discarded.
# NOTE(review): presumably that is the heading/preface before the verse —
# the offset is tied to the current page layout, so verify it if the page changes.
# Every run of digits is replaced by a single space.
# The pattern is a raw string: "\d" in a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
poetry['text'] = re.sub(r"\d+", " ", bs.text[915:])

poetries.append(poetry)

First 5 texts

In [9]:
# Preview the first five collected entries.
poetries[:5]

[{'name': 'I. FROM fairest creatures we desire increase,',
  'text': "FROM fairest creatures we desire increase,\nThat thereby beauty's rose might never die,\nBut as the riper should by time decease,\nHis tender heir might bear his memory:\nBut thou, contracted to thine own bright eyes,\nFeed'st thy light'st flame with self-substantial fuel,\nMaking a famine where abundance lies,\nThyself thy foe, to thy sweet self too cruel.\nThou that art now the world's fresh ornament\nAnd only herald to the gaudy spring,\nWithin thine own bud buriest thy content\nAnd, tender churl, makest waste in niggarding.\n  Pity the world, or else this glutton be,\n  To eat the world's due, by the grave and thee.\n",
  'url': 'sonnet.I.html'},
 {'name': 'II. When forty winters shall beseige thy brow,',
  'text': "When forty winters shall beseige thy brow,\nAnd dig deep trenches in thy beauty's field,\nThy youth's proud livery, so gazed on now,\nWill be a tatter'd weed, of small worth held:\nThen being ask'd wh

Pickle for further use

In [10]:
# Serialise the full list of poem dicts to disk with the newest pickle
# protocol this interpreter supports.
file_name = 'shakespeare_poetries.pickle'
with open(file_name, mode='wb') as handle:
    pickle.dump(
        poetries,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )