In [51]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

import re
from unicodedata import normalize
from ast import literal_eval

import requests as rq
from bs4 import BeautifulSoup as bs
from selenium import webdriver


import time
import pickle

from functions import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
pd.set_option('max_colwidth', 150)

##### Manually create dictionary with url codes for each genre.

In [3]:
genre_codes = {
    'augustan': 149,
    'beat': 150,
    'black_arts_movement': 304,
    'black_mountain': 151,
    'confessional': 152,
    'fugitive': 153,
    'georgian': 154,
    'harlem_renaissance': 155,
    'imagist': 156,
    'language_poetry': 157,
    'middle_english': 158,
    'modern': 159,
    'new_york_school': 160,
    'new_york_school_2nd_generation': 161,
    'objectivist': 162,
    'renaissance': 163,
    'romantic': 164,
    'victorian': 165
}

##### Run function in a loop to create dictionary of poet urls.

In [193]:
poet_urls = {genre:poet_urls_by_genre(genre_code, 3) for genre,genre_code in genre_codes.items()}
poet_urls['augustan']

['https://www.poetryfoundation.org/poets/mary-barber',
 'https://www.poetryfoundation.org/poets/susanna-blamire',
 'https://www.poetryfoundation.org/poets/henry-carey',
 'https://www.poetryfoundation.org/poets/thomas-chatterton',
 'https://www.poetryfoundation.org/poets/william-collins',
 'https://www.poetryfoundation.org/poets/william-cowper',
 'https://www.poetryfoundation.org/poets/daniel-defoe',
 'https://www.poetryfoundation.org/poets/anne-finch',
 'https://www.poetryfoundation.org/poets/john-gay',
 'https://www.poetryfoundation.org/poets/oliver-goldsmith',
 'https://www.poetryfoundation.org/poets/thomas-gray',
 'https://www.poetryfoundation.org/poets/matthew-green',
 'https://www.poetryfoundation.org/poets/warren-hastings',
 'https://www.poetryfoundation.org/poets/samuel-johnson',
 'https://www.poetryfoundation.org/poets/mary-jones',
 'https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu',
 'https://www.poetryfoundation.org/poets/alexander-pope',
 'https://www.poetryf

##### The loop only partially worked, so let's re-run sections in which some urls are missing.

In [196]:
poet_urls['black_arts_movement'] = poet_urls_by_genre(genre_codes['black_arts_movement'])

In [198]:
poet_urls['modern'] = poet_urls_by_genre(genre_codes['modern'])

In [200]:
poet_urls['renaissance'] = poet_urls_by_genre(genre_codes['renaissance'])

In [203]:
poet_urls['romantic'] = poet_urls_by_genre(genre_codes['romantic'])

In [206]:
poet_urls['victorian'] = poet_urls_by_genre(genre_codes['victorian'])

In [207]:
# confirm all urls have been grabbed
url_lens = {k:len(v) for k,v in poet_urls.items()}
url_lens

{'augustan': 23,
 'beat': 13,
 'black_arts_movement': 23,
 'black_mountain': 10,
 'confessional': 7,
 'fugitive': 7,
 'georgian': 22,
 'harlem_renaissance': 17,
 'imagist': 6,
 'language_poetry': 18,
 'middle_english': 3,
 'modern': 54,
 'new_york_school': 9,
 'new_york_school_2nd_generation': 16,
 'objectivist': 5,
 'renaissance': 41,
 'romantic': 51,
 'victorian': 55}

##### Pickle it! uncomment to save/load

In [4]:
# with open('poet_urls_dict.pickle', 'wb') as w:
#     pickle.dump(poet_urls, w, protocol=pickle.HIGHEST_PROTOCOL)

# with open('poet_urls_dict.pickle', 'rb') as r:
#     poet_urls_dict = pickle.load(r)

##### Check for duplicate values

In [5]:
poet_df = pd.DataFrame([(genre,v) for genre in poet_urls_dict.keys() for v in poet_urls_dict[genre]])
pd.concat(g for _, g in poet_df.groupby(1) if len(g) > 1)

Unnamed: 0,0,1
126,imagist,https://www.poetryfoundation.org/poets/ezra-pound
186,modern,https://www.poetryfoundation.org/poets/ezra-pound
122,imagist,https://www.poetryfoundation.org/poets/richard-aldington
150,modern,https://www.poetryfoundation.org/poets/richard-aldington


##### We'll give those poets to the imagist genre, since it has so few already

In [6]:
dups = [value for value in poet_df[poet_df.duplicated(1)][1]]
dups

['https://www.poetryfoundation.org/poets/richard-aldington',
 'https://www.poetryfoundation.org/poets/ezra-pound']

In [7]:
len(poet_urls_dict['modern'])

54

In [8]:
# re-listify the modernist urls without pound and aldington
poet_urls_dict['modern'] = [url for url in poet_urls_dict['modern'] if url not in dups]
len(poet_urls_dict['modern'])

52

##### Instantiate an empty dataframe, then loop over each genre in our poet urls dictionary, create a dataframe for each genre and add that to the original dataframe, saving it after each concatenation

In [15]:
%%time

# instantiate an empty dataframe
df = pd.DataFrame()

# loop over each genre, create dataframe with desired information,
# concat to original dataframe, then save it before looping again
for genre in list(poet_urls_dict.keys()):
    genre_df = pf_scraper(poet_urls_dict, genre, 0.5)
    df = pd.concat([df, genre_df])
    df.to_csv('data/poetry_foundation_raw.csv')

KeyboardInterrupt: 

##### Uncomment to save/load dataframe

In [None]:
# TO SAVE
# df.to_csv('data/poetry_foundation_raw.csv')

# TO LOAD
# df = pd.read_csv('data/poetry_foundation_raw.csv', index_col=0)

In [118]:
# rename the columns
df.columns = ['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_lines', 'poem_string']
df.head()

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
0,https://www.poetryfoundation.org/poets/mary-barber,augustan,https://www.poetryfoundation.org/poems/50523/advice-to-her-son-on-marriage,Mary Barber,Advice to Her Son on Marriage,,"['When you gain her Affection, take care to preserve it;\r', 'Lest others persuade her, you do not deserve it.\r', 'Still study to heighten the Jo...","When you gain her Affection, take care to preserve it;\r\nLest others persuade her, you do not deserve it.\r\nStill study to heighten the Joys of ..."
1,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50534/auld-robin-forbes,Susanna Blamire,Auld Robin Forbes,,"['And auld Robin Forbes hes gien tem a dance,\r', 'I pat on my speckets to see them aw prance;\r', 'I thout o’ the days when I was but fifteen,\r'...","And auld Robin Forbes hes gien tem a dance,\r\nI pat on my speckets to see them aw prance;\r\nI thout o’ the days when I was but fifteen,\r\nAnd s..."
2,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50533/o-donald-ye-are-just-the-man,Susanna Blamire,O Donald! Ye Are Just the Man,,"['O Donald! ye are just the man\r', ' Who, when he’s got a wife,\r', 'Begins to fratch— nae notice ta’en—\r', ' They’re strangers a’ their life....","O Donald! ye are just the man\r\n Who, when he’s got a wife,\r\nBegins to fratch— nae notice ta’en—\r\n They’re strangers a’ their life.\r\n\nTh..."
3,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50532/the-siller-croun,Susanna Blamire,The Siller Croun,,"['And ye shall walk in silk attire,\r', ' And siller hae to spare,\r', 'Gin ye’ll consent to be his bride,\r', ' Nor think o’ Donald mair.\r'...","And ye shall walk in silk attire,\r\n And siller hae to spare,\r\nGin ye’ll consent to be his bride,\r\n Nor think o’ Donald mair.\r\nO wha w..."
4,https://www.poetryfoundation.org/poets/henry-carey,augustan,https://www.poetryfoundation.org/poems/43884/the-ballad-of-sally-in-our-alley,Henry Carey,The Ballad of Sally in our Alley,,"['Of all the Girls that are so smart\r', ' There’s none like pretty SALLY,\r', 'She is the Darling of my Heart,\r', ' And she lives in our...","Of all the Girls that are so smart\r\n There’s none like pretty SALLY,\r\nShe is the Darling of my Heart,\r\n And she lives in our Alley.\..."


##### Explore to see how the data looks

In [119]:
df.shape

(5295, 8)

In [120]:
df.genre.unique()

array(['augustan', 'beat', 'black_arts_movement', 'black_mountain',
       'confessional', 'fugitive', 'georgian', 'harlem_renaissance',
       'imagist', 'language_poetry', 'middle_english', 'modern',
       'new_york_school', 'new_york_school_2nd_generation', 'objectivist',
       'renaissance', 'romantic', 'victorian'], dtype=object)

In [121]:
df.genre.value_counts()

modern                            1324
victorian                          674
renaissance                        430
romantic                           407
imagist                            370
new_york_school                    265
black_mountain                     257
new_york_school_2nd_generation     193
language_poetry                    192
confessional                       176
georgian                           167
black_arts_movement                165
objectivist                        159
harlem_renaissance                 148
beat                               147
augustan                           121
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

##### Check for duplicate values across multiple columns and drop those rows

In [122]:
df.duplicated(subset=['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_string'], keep='last').sum()

98

In [123]:
df.drop_duplicates(subset=['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_string'],
                   keep='last',
                   inplace=True)
df.reset_index(drop=True, inplace=True)

In [124]:
df.shape

(5197, 8)

In [125]:
df.genre.value_counts()

modern                            1284
victorian                          643
renaissance                        427
romantic                           398
imagist                            370
new_york_school                    265
black_mountain                     257
new_york_school_2nd_generation     192
language_poetry                    192
confessional                       176
black_arts_movement                165
georgian                           160
objectivist                        159
harlem_renaissance                 148
beat                               147
augustan                           114
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

##### Looks like the poem_lines column converted to a list inside of a string while saving to CSV. We'll wait to convert it until we can fill some missing values for that column, a process I found to be more easily done as a list inside of a string.

In [126]:
df.loc[0,'poem_lines']

"['When you gain her Affection, take care to preserve it;\\r', 'Lest others persuade her, you do not deserve it.\\r', 'Still study to heighten the Joys of her Life;\\r', 'Not treat her the worse, for her being your Wife.\\r', 'If in Judgment she errs, set her right, without Pride:\\r', '’Tis the Province of insolent Fools, to deride.\\r', 'A Husband’s first Praise, is a ', 'Then change not these Titles, for ', 'Let your Person be neat, unaffectedly clean,\\r', 'Tho’ alone with your wife the whole Day you remain.\\r', 'Chuse Books, for her study, to fashion her Mind,\\r', 'To emulate those who excell’d of her Kind.\\r', 'Be Religion the principal Care of your Life,\\r', 'As you hope to be blest in your Children and Wife:\\r', 'So you, in your Marriage, shall gain its true End;\\r', 'And find, in your Wife, a ', '', '']"

##### Check for missing values

In [127]:
df.isna().sum()

poet_url          0
genre             0
poem_url          0
poet             13
title           215
year           1649
poem_lines      410
poem_string     412
dtype: int64

In [47]:
df[df.poet.isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
858,https://www.poetryfoundation.org/poets/w-d-snodgrass,confessional,https://www.poetryfoundation.org/poetrymagazine/poems/48292/road-56d22969928f0,,,2006.0,"['ILEANA MALANCIOIU', '', 'Road', '', 'I walk on a dark road so that I won’t see', '', 'The way my young oxen limp so much;', '', 'The horseshoes ...",ILEANA MALANCIOIU\n\nRoad\n\nI walk on a dark road so that I won’t see\n\nThe way my young oxen limp so much;\n\nThe horseshoes gouging into their...
1409,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14311/after-how-many-years-tr-by-amy-lowell-and-florence-ayscough,,After How Many Years Tr By Amy Lowell And Florence Ayscough,1919.0,,
1410,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14312/calligraphy-tr-by-amy-lowell-and-florence-ayscough,,Calligraphy Tr By Amy Lowell And Florence Ayscough,1919.0,,
1411,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14322/the-emperors-return-from-a-journey-to-the-south-tr-by-amy-lowell-and-florence-ayscough,,The Emperors Return From A Journey To The South Tr By Amy Lowell And Florence Ayscough,1919.0,,
1412,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14310/an-evening-meeting-tr-by-amy-lowell-and-florence-ayscough,,An Evening Meeting Tr By Amy Lowell And Florence Ayscough,1919.0,,
1413,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14314/from-the-straw-hut-among-the-seven-peaks-tr-by-amy-lowell-and-florence-ayscough,,From The Straw Hut Among The Seven Peaks Tr By Amy Lowell And Florence Ayscough,1919.0,,
1414,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14321/the-inn-at-the-western-lake-tr-by-amy-lowell-and-florence-ayscough,,The Inn At The Western Lake Tr By Amy Lowell And Florence Ayscough,1919.0,,
1415,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14296/on-seeing-the-portrait-of-a-beautiful-concubine-tr-by-amy-lowell-and-florence-ayscough,,On Seeing The Portrait Of A Beautiful Concubine Tr By Amy Lowell And Florence Ayscough,1919.0,,
1416,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14316/on-the-classic-of-the-hills-and-sea-tr-by-amy-lowell-and-florence-ayscough,,On The Classic Of The Hills And Sea Tr By Amy Lowell And Florence Ayscough,1919.0,,
1417,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14313/one-goes-a-journey-tr-by-amy-lowell-and-florence-ayscough,,One Goes A Journey Tr By Amy Lowell And Florence Ayscough,1919.0,,


##### The Amy Lowell and Ben Jonson entries appear unuseable, so we'll drop those rows, but since we're here, let's go ahead and fill in the missing info for the Snodgrass poem (which is actually a translation of another poet, but a confessional translator will probably produce a confessional work).

In [129]:
# manually load in information to the poet and title column
df.loc[858,'poet'] = 'ILEANA MALANCIOIU'.title()
df.loc[858,'title'] = 'Road'
df[df.index == 858]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
858,https://www.poetryfoundation.org/poets/w-d-snodgrass,confessional,https://www.poetryfoundation.org/poetrymagazine/poems/48292/road-56d22969928f0,Ileana Malancioiu,Road,2006.0,"['ILEANA MALANCIOIU', '', 'Road', '', 'I walk on a dark road so that I won’t see', '', 'The way my young oxen limp so much;', '', 'The horseshoes ...",ILEANA MALANCIOIU\n\nRoad\n\nI walk on a dark road so that I won’t see\n\nThe way my young oxen limp so much;\n\nThe horseshoes gouging into their...


In [131]:
# drop the rows with missing values in the poet column
df.dropna(subset=['poet'], inplace=True)

In [134]:
df.isna().sum()

poet_url          0
genre             0
poem_url          0
poet              0
title           214
year           1649
poem_lines      398
poem_string     400
dtype: int64

In [104]:
df.to_csv('data/poetry_foundation_cleaner_temp.csv')

##### After reworking the scraping function a bit, we can try to fill in some missing poem_lines and poem_string values.

In [135]:
# create a list of index numbers with NaN values in the poem_lines column
lookups = list(df[df.poem_lines.isna()].index)
lookups

[158,
 168,
 169,
 171,
 175,
 183,
 184,
 200,
 203,
 210,
 229,
 254,
 283,
 324,
 325,
 336,
 351,
 354,
 361,
 458,
 466,
 482,
 484,
 487,
 490,
 503,
 511,
 512,
 513,
 531,
 532,
 542,
 558,
 568,
 576,
 578,
 624,
 626,
 648,
 660,
 661,
 663,
 664,
 694,
 701,
 702,
 703,
 704,
 705,
 707,
 708,
 711,
 714,
 715,
 716,
 717,
 719,
 727,
 736,
 749,
 751,
 753,
 769,
 770,
 817,
 834,
 853,
 872,
 881,
 885,
 886,
 892,
 897,
 900,
 917,
 921,
 940,
 942,
 943,
 944,
 945,
 946,
 947,
 1004,
 1025,
 1123,
 1163,
 1169,
 1171,
 1184,
 1186,
 1192,
 1234,
 1297,
 1299,
 1319,
 1326,
 1345,
 1348,
 1363,
 1367,
 1371,
 1379,
 1383,
 1392,
 1395,
 1404,
 1440,
 1446,
 1452,
 1456,
 1467,
 1468,
 1477,
 1482,
 1489,
 1495,
 1496,
 1498,
 1500,
 1502,
 1503,
 1505,
 1515,
 1516,
 1517,
 1518,
 1519,
 1551,
 1552,
 1553,
 1554,
 1555,
 1556,
 1560,
 1565,
 1566,
 1587,
 1591,
 1594,
 1602,
 1604,
 1617,
 1618,
 1623,
 1631,
 1711,
 1731,
 1732,
 1743,
 1748,
 1770,
 1786,
 1815,
 1816

In [None]:
%%time

# iterate over the list, attempting to re-scrape the lines and string
# NOTE: I was getting a 'ValueError: Must have equal len keys and value when setting with an iterable', but converting
# the list to a string first seemed to make that go away. We have to convert this entire column anyway next.
for i in lookups:
    info = poem_scraper(df.loc[i, 'poem_url'])
    try:
        df.loc[i,'poem_lines'] = str(info[3])
        df.loc[i,'poem_string'] = info[4]
        print(f'Success -- {i}')
    except:
        print(f'Failure -- {i}')
        continue

Success -- 158
Success -- 168
Success -- 169
Success -- 171
Success -- 175
Success -- 183
Success -- 184
Success -- 200
Success -- 203
Success -- 210
Success -- 229
Success -- 254
Success -- 283
Success -- 324
Success -- 325
Success -- 336
Success -- 351
Success -- 354
Success -- 361
Success -- 458
Success -- 466
Success -- 482
Success -- 484
Success -- 487
Success -- 490
Success -- 503
Success -- 511
Success -- 512
Success -- 513
Success -- 531
Success -- 532
Success -- 542
Success -- 558
Success -- 568
Success -- 576
Success -- 578
Success -- 624
Success -- 626
Success -- 648
Success -- 660
Success -- 661
Success -- 663
Success -- 664
Success -- 694
Success -- 701
Success -- 702
Success -- 703
Success -- 704
Success -- 705
Success -- 707
Success -- 708
Success -- 711
Success -- 714
Success -- 715
Success -- 716
Success -- 717
Success -- 719
Success -- 727
Success -- 736
Success -- 749
Success -- 751
Success -- 753
Success -- 769
Success -- 770
Success -- 817
Success -- 834
Success --

In [112]:
info = poem_scraper(df.loc[158,'poem_url'])
info

['Lawrence Ferlinghetti',
 'Beatitudes Visuales Mexicanas',
 2015,
 ['Autobus on Paseo de la Reforma with destination signs:',
  'A boy and three burros run across a stubble field, away from the white mountain. He holds a stick. There is no other way.',
  'Deep yellow flowers in the dusk by the road, beds of them stretching away into darkness. A moon the same color comes up.',
  'As the bus turns + turns down the winding hill, moon swings wildly from side to side. It has had too many pathetic phallusies written about it to stand still for one more.',
  'In Xalapa I am a head taller than anyone else in town — A foot of flesh and two languages separate us.',
  'At a stand in the park at the center of Xalapa I eat white corn on the cob with a stick in the end, sprinkled with salt, butter, grated cheese + hot sauce. The dark stone Indian who hands it to me has been standing there three thousand years.',
  'I’m taking this trip from Mexico City to the Gulf of Mexico and back without any bag

In [117]:
df.loc[158, 'poem_lines'] = str(info[3])
df.loc[158, 'poem_string'] = info[4]
df.loc[158]

poet_url                                                                                                https://www.poetryfoundation.org/poets/lawrence-ferlinghetti
genre                                                                                                                                                           beat
poem_url                                                                   https://www.poetryfoundation.org/poetrymagazine/poems/58150/beatitudes-visuales-mexicanas
poet                                                                                                                                           Lawrence Ferlinghetti
title                                                                                                                                  Beatitudes Visuales Mexicanas
year                                                                                                                                                            2015
poem_lines

In [113]:
info[3]

['Autobus on Paseo de la Reforma with destination signs:',
 'A boy and three burros run across a stubble field, away from the white mountain. He holds a stick. There is no other way.',
 'Deep yellow flowers in the dusk by the road, beds of them stretching away into darkness. A moon the same color comes up.',
 'As the bus turns + turns down the winding hill, moon swings wildly from side to side. It has had too many pathetic phallusies written about it to stand still for one more.',
 'In Xalapa I am a head taller than anyone else in town — A foot of flesh and two languages separate us.',
 'At a stand in the park at the center of Xalapa I eat white corn on the cob with a stick in the end, sprinkled with salt, butter, grated cheese + hot sauce. The dark stone Indian who hands it to me has been standing there three thousand years.',
 'I’m taking this trip from Mexico City to the Gulf of Mexico and back without any bag or person — only what I can carry in my pockets. The need for baggage is 

In [111]:
df[df.poem_lines.isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
158,https://www.poetryfoundation.org/poets/lawrence-ferlinghetti,beat,https://www.poetryfoundation.org/poetrymagazine/poems/58150/beatitudes-visuales-mexicanas,Lawrence Ferlinghetti,Beatitudes Visuales Mexicanas,2015.0,,
168,https://www.poetryfoundation.org/poets/allen-ginsberg,beat,https://www.poetryfoundation.org/poems/47660/a-supermarket-in-california,Allen Ginsberg,A Supermarket in California,1984.0,,
169,https://www.poetryfoundation.org/poets/allen-ginsberg,beat,https://www.poetryfoundation.org/poetrymagazine/poems/36505/written-in-my-dream-by-w-c-williams,Allen Ginsberg,Written In My Dream By W C Williams,1986.0,,
171,https://www.poetryfoundation.org/poets/jack-hirschman,beat,https://www.poetryfoundation.org/poetrymagazine/poems/29383/from-one-finger-from-the-seasons,Jack Hirschman,From One Finger From The Seasons,1963.0,,
175,https://www.poetryfoundation.org/poets/bob-kaufman,beat,https://www.poetryfoundation.org/poems/55713/a-terror-is-more-certain-,Bob Kaufman,A Terror is More Certain . . .,1996.0,,
...,...,...,...,...,...,...,...,...
4309,https://www.poetryfoundation.org/poets/heinrich-heine,romantic,https://www.poetryfoundation.org/poetrymagazine/poems/25286/leaning-against-the-mast-tr-by-vernon-watkins,Heinrich Heine,Leaning Against The Mast Tr By Vernon Watkins,1949.0,,
4312,https://www.poetryfoundation.org/poets/friedrich-holderlin,romantic,https://www.poetryfoundation.org/poetrymagazine/poems/52408/in-lovely-blue,Friedrich Hölderlin,In Lovely Blue,2009.0,,
4317,https://www.poetryfoundation.org/poets/thomas-hood,romantic,https://www.poetryfoundation.org/poems/52339/silence-56d230b89fd5e,Thomas Hood,Silence,1950.0,,
4685,https://www.poetryfoundation.org/poets/phoebe-cary,victorian,https://www.poetryfoundation.org/poems/43918/shakesperian-readings,Phoebe Cary,Shakesperian Readings,,,


In [53]:
df['poem_lines'] = df.poem_lines.apply(lambda x: literal_eval(x))
df.loc[0,'poem_lines']

ValueError: malformed node or string: nan

In [89]:
page = rq.get('https://www.poetryfoundation.org/poetrymagazine/poems/58150/beatitudes-visuales-mexicanas')
soup = bs(page.content, 'html.parser')

In [92]:
lines_raw = soup.find_all('div', {'style': 'text-align: justify;'})
lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
lines

['Autobus on Paseo de la Reforma with destination signs: ',
 '<br/>',
 '<br/>',
 '<br/>',
 'A boy and three burros run across a stubble field, away from the white mountain. He holds a stick. There is no other way.',
 'Deep yellow flowers in the dusk by the road, beds of them stretching away into darkness. A moon the same color comes up.',
 'As the bus turns + turns down the winding hill, moon swings wildly from side to side. It has had too many pathetic phallusies written about it to stand still for one more.',
 'In Xalapa I am a head taller than anyone else in town — A foot of flesh and two languages separate us.',
 'At a stand in the park at the center of Xalapa I eat white corn on the cob with a stick in the end, sprinkled with salt, butter, grated cheese + hot sauce. The dark stone Indian who hands it to me has been standing there three thousand years.',
 'I’m taking this trip from Mexico City to the Gulf of Mexico and back without any bag or person — only what I can carry in my po

In [87]:
lines_raw = soup.find('div', {'data-view': 'PoemView'}).contents[1]
lines = [normalize('NFKD', str(line)) for line in lines_raw if line]
lines = [line.replace('<br/>', '') for line in lines]
lines = [line.strip() for line in lines if line]
lines

['What thoughts I have of you tonight, Walt Whitman, for I walked down the sidestreets under the trees with a headache self-conscious looking at the full moon.',
 'In my hungry fatigue, and shopping for images, I went into the neon fruit supermarket, dreaming of your enumerations!',
 'What peaches and what penumbras! Whole families shopping at night! Aisles full of husbands! Wives in the avocados, babies in the tomatoes!—and you, Garcia Lorca, what were you doing down by the watermelons?',
 'I saw you, Walt Whitman, childless, lonely old grubber, poking among the meats in the refrigerator and eyeing the grocery boys.',
 'I heard you asking questions of each: Who killed the pork chops? What price bananas? Are you my Angel?',
 'I wandered in and out of the brilliant stacks of cans following you, and followed in my imagination by the store detective.',
 'We strode down the open corridors together in our solitary fancy tasting artichokes, possessing every frozen delicacy, and never passing

In [70]:
lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
lines = [line.replace('<br/>', '') for line in lines]
lines

[]

In [63]:
line_pattern = '<.*>'
# re.search(line_pattern, line, re.I).group(1)


In [66]:
# line_pattern = '>(.*?)<'
lines_test = [re.sub(line_pattern, '', line) if '<' in line else line for line in lines]
lines_test

['There is a silence where hath been no sound, ',
 '   There is a silence where no sound may be, ',
 '   In the cold grave—under the deep deep sea, ',
 'Or in the wide desert where no life is found, ',
 'Which hath been mute, and still must sleep profound; ',
 '   No voice is hush’d—no life treads silently, ',
 '   But clouds and cloudy shadows wander free, ',
 'That never spoke, over the idle ground: ',
 'But in green ruins, in the desolate walls ',
 '   Of antique palaces, where Man hath been, ',
 'Though the dun fox, or wild hyena, calls, ',
 '   And owls, that flit continually between, ',
 'Shriek to the echo, and the low winds moan, ',
 'There the true Silence is, self-conscious and alone. ',
 '',
 '',
 '']

In [103]:
poem_scraper('https://www.poetryfoundation.org/poems/52339/silence-56d230b89fd5e')[3]

['There is a silence where hath been no sound,',
 'There is a silence where no sound may be,',
 'In the cold grave—under the deep deep sea,',
 'Or in the wide desert where no life is found,',
 'Which hath been mute, and still must sleep profound;',
 'No voice is hush’d—no life treads silently,',
 'But clouds and cloudy shadows wander free,',
 'That never spoke, over the idle ground:',
 'But in green ruins, in the desolate walls',
 'Of antique palaces, where Man hath been,',
 'Though the dun fox, or wild hyena, calls,',
 'And owls, that flit continually between,',
 'Shriek to the echo, and the low winds moan,',
 'There the true Silence is, self-conscious and alone.']

In [102]:
df[df.poem_lines.isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
158,https://www.poetryfoundation.org/poets/lawrence-ferlinghetti,beat,https://www.poetryfoundation.org/poetrymagazine/poems/58150/beatitudes-visuales-mexicanas,Lawrence Ferlinghetti,Beatitudes Visuales Mexicanas,2015.0,,
168,https://www.poetryfoundation.org/poets/allen-ginsberg,beat,https://www.poetryfoundation.org/poems/47660/a-supermarket-in-california,Allen Ginsberg,A Supermarket in California,1984.0,,
169,https://www.poetryfoundation.org/poets/allen-ginsberg,beat,https://www.poetryfoundation.org/poetrymagazine/poems/36505/written-in-my-dream-by-w-c-williams,Allen Ginsberg,Written In My Dream By W C Williams,1986.0,,
171,https://www.poetryfoundation.org/poets/jack-hirschman,beat,https://www.poetryfoundation.org/poetrymagazine/poems/29383/from-one-finger-from-the-seasons,Jack Hirschman,From One Finger From The Seasons,1963.0,,
175,https://www.poetryfoundation.org/poets/bob-kaufman,beat,https://www.poetryfoundation.org/poems/55713/a-terror-is-more-certain-,Bob Kaufman,A Terror is More Certain . . .,1996.0,,
...,...,...,...,...,...,...,...,...
4309,https://www.poetryfoundation.org/poets/heinrich-heine,romantic,https://www.poetryfoundation.org/poetrymagazine/poems/25286/leaning-against-the-mast-tr-by-vernon-watkins,Heinrich Heine,Leaning Against The Mast Tr By Vernon Watkins,1949.0,,
4312,https://www.poetryfoundation.org/poets/friedrich-holderlin,romantic,https://www.poetryfoundation.org/poetrymagazine/poems/52408/in-lovely-blue,Friedrich Hölderlin,In Lovely Blue,2009.0,,
4317,https://www.poetryfoundation.org/poets/thomas-hood,romantic,https://www.poetryfoundation.org/poems/52339/silence-56d230b89fd5e,Thomas Hood,Silence,1950.0,,
4685,https://www.poetryfoundation.org/poets/phoebe-cary,victorian,https://www.poetryfoundation.org/poems/43918/shakesperian-readings,Phoebe Cary,Shakesperian Readings,,,


In [184]:
df[df.poem_lines.isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
65,https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu,augustan,https://www.poetryfoundation.org/poems/44765/town-eclogues-thursday-the-bassette-table,Lady Mary Wortley Montagu,Town Eclogues: Thursday; the Bassette-Table,,,
306,https://www.poetryfoundation.org/poets/gwendolyn-brooks,black_arts_movement,https://www.poetryfoundation.org/poems/58377/riot-56d23cb395a01,Gwendolyn Brooks,,,,
408,https://www.poetryfoundation.org/poets/jay-wright,black_arts_movement,https://www.poetryfoundation.org/poems/42736/benjamin-banneker-helps-to-build-a-city,Jay Wright,Benjamin Banneker Helps to Build a City,2000.0,,
500,https://www.poetryfoundation.org/poets/robert-creeley,black_mountain,https://www.poetryfoundation.org/poetrymagazine/poems/55314/a-prayer-56d236c6bb760,Robert Creeley,A Prayer,1982.0,,
799,https://www.poetryfoundation.org/poets/frederick-seidel,confessional,https://www.poetryfoundation.org/poetrymagazine/poems/55728/snow-56d23797074a2,Frederick Seidel,Snow,2012.0,,
949,https://www.poetryfoundation.org/poets/hilaire-belloc,georgian,https://www.poetryfoundation.org/poems/46684/ballade-of-modest-confession,Hilaire Belloc,Ballade of Modest Confession,1970.0,,
1087,https://www.poetryfoundation.org/poets/siegfried-sassoon,georgian,https://www.poetryfoundation.org/poems/57215/blighters,Siegfried Sassoon,'Blighters',1917.0,,
1165,https://www.poetryfoundation.org/poets/langston-hughes,harlem_renaissance,https://www.poetryfoundation.org/poetrymagazine/poems/55313/god-56d236c65624c,Langston Hughes,God,1994.0,,
1211,https://www.poetryfoundation.org/poets/claude-mckay,harlem_renaissance,https://www.poetryfoundation.org/poems/56983/the-lynching,Claude McKay,The Lynching,1922.0,,
1430,https://www.poetryfoundation.org/poets/ezra-pound,imagist,https://www.poetryfoundation.org/poems/44915/hugh-selwyn-mauberley-part-i,Ezra Pound,Hugh Selwyn Mauberley [Part I],,,


In [226]:
df[df.poem_string.isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
65,https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu,augustan,https://www.poetryfoundation.org/poems/44765/town-eclogues-thursday-the-bassette-table,Lady Mary Wortley Montagu,Town Eclogues: Thursday; the Bassette-Table,,,
126,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poetrymagazine/poems/31338/wood,,,,[],
140,https://www.poetryfoundation.org/poets/william-everson,beat,https://www.poetryfoundation.org/poetrymagazine/poems/21676/dust-and-the-glory,,,,[],
141,https://www.poetryfoundation.org/poets/william-everson,beat,https://www.poetryfoundation.org/poetrymagazine/poems/21675/we-in-the-fields,,,,[],
158,https://www.poetryfoundation.org/poets/lawrence-ferlinghetti,beat,https://www.poetryfoundation.org/poetrymagazine/poems/58150/beatitudes-visuales-mexicanas,Lawrence Ferlinghetti,Beatitudes Visuales Mexicanas,2015.0,[],
...,...,...,...,...,...,...,...,...
4438,https://www.poetryfoundation.org/poets/percy-bysshe-shelley,romantic,https://www.poetryfoundation.org/poems/56665/laon-and-cythna-or-the-revolution-of-the-golden-city,Percy Bysshe Shelley,Laon and Cythna; or The Revolution of the Golden City,2002.0,,
4777,https://www.poetryfoundation.org/poets/gerard-manley-hopkins,victorian,https://www.poetryfoundation.org/poems/44403/the-wreck-of-the-deutschland,Gerard Manley Hopkins,The Wreck of the Deutschland,1950.0,,
4812,https://www.poetryfoundation.org/poets/rudyard-kipling,victorian,https://www.poetryfoundation.org/poems/57409/epitaphs-of-the-war,Rudyard Kipling,Epitaphs of the War,1919.0,,
4831,https://www.poetryfoundation.org/poets/emma-lazarus,victorian,https://www.poetryfoundation.org/poems/46791/by-the-waters-of-babylon,Emma Lazarus,By the Waters of Babylon,2002.0,[],


In [189]:
df.head()

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
0,https://www.poetryfoundation.org/poets/mary-barber,augustan,https://www.poetryfoundation.org/poems/50523/advice-to-her-son-on-marriage,Mary Barber,Advice to Her Son on Marriage,,"['When you gain her Affection, take care to preserve it;\r', 'Lest others persuade her, you do not deserve it.\r', 'Still study to heighten the Jo...","When you gain her Affection, take care to preserve it;\nLest others persuade her, you do not deserve it.\nStill study to heighten the Joys of her ..."
1,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50534/auld-robin-forbes,Susanna Blamire,Auld Robin Forbes,,"['And auld Robin Forbes hes gien tem a dance,\r', 'I pat on my speckets to see them aw prance;\r', 'I thout o’ the days when I was but fifteen,\r'...","And auld Robin Forbes hes gien tem a dance,\nI pat on my speckets to see them aw prance;\nI thout o’ the days when I was but fifteen,\nAnd skipp’d..."
2,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50533/o-donald-ye-are-just-the-man,Susanna Blamire,O Donald! Ye Are Just the Man,,"['O Donald! ye are just the man\r', ' Who, when he’s got a wife,\r', 'Begins to fratch— nae notice ta’en—\r', ' They’re strangers a’ their life....","O Donald! ye are just the man\n Who, when he’s got a wife,\nBegins to fratch— nae notice ta’en—\n They’re strangers a’ their life.\n\nThe fan ma..."
3,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50532/the-siller-croun,Susanna Blamire,The Siller Croun,,"['And ye shall walk in silk attire,\r', ' And siller hae to spare,\r', 'Gin ye’ll consent to be his bride,\r', ' Nor think o’ Donald mair.\r'...","And ye shall walk in silk attire,\n And siller hae to spare,\nGin ye’ll consent to be his bride,\n Nor think o’ Donald mair.\nO wha wad buy a..."
4,https://www.poetryfoundation.org/poets/henry-carey,augustan,https://www.poetryfoundation.org/poems/43884/the-ballad-of-sally-in-our-alley,Henry Carey,The Ballad of Sally in our Alley,,"['Of all the Girls that are so smart\r', ' There’s none like pretty SALLY,\r', 'She is the Darling of my Heart,\r', ' And she lives in our...","Of all the Girls that are so smart\n There’s none like pretty SALLY,\nShe is the Darling of my Heart,\n And she lives in our Alley.\nThere..."


In [262]:
import urllib

In [274]:
def poem_scraper(poem_url):
    '''Scraper for PoetryFoundation.org--scrapes poet name, poem title, poem year, list of poem's lines,
       and the poem as a string.
       Input the url for a poem's page on PoetryFoundation.org.
       Output is a list.'''
    
    # load a page and soupify it
    page = rq.get(poem_url)
    soup = bs(page.content, 'html.parser')
    
    # series of try/except statements to scrape info or return NaN value if desired info cannot be scraped
    try:
        poet = soup.find('a', href=re.compile('.*/poets/.*')).contents[0]
    except:
        poet = np.nan
        
    try:
        title = soup.find('h1').contents[-1].strip()
    except:
        try:
            title_pattern = '[a-z\-]*$'
            title = re.search(title_pattern, poem_url, re.I).group().replace('-', ' ').title()
        except:
            title = np.nan
        
    try:
        lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
        lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
        lines = [line.replace('<br/>', '') for line in lines]
        line_pattern = '>(.*?)<'
        lines = [re.search(line_pattern, line, re.I).group(1) if '<' in line else line for line in lines]
        if lines == []:
            try:
                img_link = soup.find('img', src=re.compile('.*/jstor/.*'))['src']
                img_data = rq.get(img_link).content
                with open('poem_imgs/temp.png', 'wb') as handle:
                    handle.write(img_data)
                text = pytesseract.image_to_string('poem_imgs/temp.png')
                scan_pattern = fr'{title.upper()}\s*((.*\s.*)*)'
                lines = re.search(scan_pattern, text, re.I).group(1).splitlines()
            except:
                lines = np.nan
    except:
        lines = np.nan
        
    try:
        poem_string = '\n'.join(lines)
    except:
        poem_string = np.nan
        
    try:
        year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
        year_pattern = r'[12]\d{3}'
        year = int(re.search(year_pattern, year_blurb, re.I).group())
    except:
        try:
            year_blurb = soup.find_all('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})[-1].contents[2]
            year_pattern = r'[12]\d{3}'
            year = int(re.search(year_pattern, year_blurb, re.I).group())
        except:
            year = np.nan
    
    info = [poet, title, year, lines, poem_string]
    
    return info

In [275]:
poem_scraper('https://www.poetryfoundation.org/poetrymagazine/poems/31338/wood')

['Richard Brautigan',
 'Wood',
 1969,
 ['We age in darkness like wood',
  'and watch our phantoms change',
  'eir clothes',
  'of shingles and boards',
  'for a purpose that can only be',
  'described as wood.'],
 'We age in darkness like wood\nand watch our phantoms change\neir clothes\nof shingles and boards\nfor a purpose that can only be\ndescribed as wood.']

In [266]:
img_link

'https://static.poetryfoundation.org/jstor/i20599092/pages/36.png'

In [269]:
page = rq.get('https://www.poetryfoundation.org/poetrymagazine/poems/31338/wood')
soup = bs(page.content, 'html.parser')
img_link = soup.find('img', src=re.compile('.*/jstor/.*'))['src']

img_data = rq.get(img_link).content
with open('poem_imgs/temp.png', 'wb') as handle:
    handle.write(img_data)
# with open('poem_imgs/temp.png', 'wb') as handle:
#     response = rq.get(img_link, stream=True)
# text = pytesseract.image_to_string('poem_imgs/temp.png')
# scan_pattern = fr'{title.upper()}\s*((.*\s.*)*)'
# lines = re.search(scan_pattern, text, re.I).group(1).splitlines()

In [272]:
img_data = rq.get(img_link).content
with open('poem_imgs/temp.png', 'wb') as handle:
    handle.write(img_data)
text = pytesseract.image_to_string('poem_imgs/temp.png')
scan_pattern = fr'{title.upper()}\s*((.*\s.*)*)'
lines = re.search(scan_pattern, text, re.I).group(1).splitlines()

In [273]:
lines

['We age in darkness like wood',
 'and watch our phantoms change',
 'eir clothes',
 'of shingles and boards',
 'for a purpose that can only be',
 'described as wood.']

In [271]:
text

'POETRY\n\nRICHARD BRAUTIGAN\n\n \n\nWOOD\n\nWe age in darkness like wood\nand watch our phantoms change\neir clothes\nof shingles and boards\nfor a purpose that can only be\ndescribed as wood.'

In [253]:
soup.find('img', src=re.compile('.*/jstor/.*'))['src']

'https://static.poetryfoundation.org/jstor/i20599092/pages/36.png'

In [236]:
import pytesseract

text = pytesseract.image_to_string('poem_imgs/36.png')
text

'POETRY\n\nRICHARD BRAUTIGAN\n\n \n\nWOOD\n\nWe age in darkness like wood\nand watch our phantoms change\neir clothes\nof shingles and boards\nfor a purpose that can only be\ndescribed as wood.'

In [237]:
title_test = 'Wood'.upper()
title_test

'WOOD'

In [241]:
scan_pattern = fr'{title_test}\s*((.*\s.*)*)'
re.search(scan_pattern, text, re.I).group(1).splitlines()

['We age in darkness like wood',
 'and watch our phantoms change',
 'eir clothes',
 'of shingles and boards',
 'for a purpose that can only be',
 'described as wood.']

In [215]:
print(text)

POETRY

RICHARD BRAUTIGAN

 

WOOD

We age in darkness like wood
and watch our phantoms change
eir clothes
of shingles and boards
for a purpose that can only be
described as wood.


In [196]:
page = rq.get('https://www.poetryfoundation.org/poetrymagazine/poems/21676/dust-and-the-glory')
soup = bs(page.content, 'html.parser')
poet = soup.find('a', href=re.compile('.*/poets/.*')).contents[0]
title = soup.find('span', attrs={'class':'c-hdgSans c-hdgSans_7'}).contents[-1].strip()
print(poet)
print(title)

William Everson
We in the Fields


In [207]:
url = 'https://www.poetryfoundation.org/poetrymagazine/poems/21676/dust-and-the-glory'
title_pattern = '[a-z\-]*$'
title = re.search(title_pattern, url, re.I).group().replace('-', ' ').title()
year_blurb = soup.find_all('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})[-1].contents[2]
year_pattern = r'[12]\d{3}'
year = int(re.search(year_pattern, year_blurb, re.I).group())
year

1937

In [159]:
nan_lines = {ind:row.poem_url for ind,row in df[df.poem_lines.isna()].iterrows()}
nan_lines

{65: 'https://www.poetryfoundation.org/poems/44765/town-eclogues-thursday-the-bassette-table',
 306: 'https://www.poetryfoundation.org/poems/58377/riot-56d23cb395a01',
 408: 'https://www.poetryfoundation.org/poems/42736/benjamin-banneker-helps-to-build-a-city',
 500: 'https://www.poetryfoundation.org/poetrymagazine/poems/55314/a-prayer-56d236c6bb760',
 799: 'https://www.poetryfoundation.org/poetrymagazine/poems/55728/snow-56d23797074a2',
 949: 'https://www.poetryfoundation.org/poems/46684/ballade-of-modest-confession',
 1087: 'https://www.poetryfoundation.org/poems/57215/blighters',
 1165: 'https://www.poetryfoundation.org/poetrymagazine/poems/55313/god-56d236c65624c',
 1211: 'https://www.poetryfoundation.org/poems/56983/the-lynching',
 1430: 'https://www.poetryfoundation.org/poems/44915/hugh-selwyn-mauberley-part-i',
 1431: 'https://www.poetryfoundation.org/poems/57353/hugh-selwyn-mauberley-part-ii',
 1646: 'https://www.poetryfoundation.org/poetrymagazine/poems/52593/advent-56d231303d

In [None]:
for i,url in nan_lines.items():
    df[i]['poem_lines'] = 

In [155]:
df.iloc[308]

poet_url                                                                                                     https://www.poetryfoundation.org/poets/gwendolyn-brooks
genre                                                                                                                                            black_arts_movement
poem_url                                                                                                 https://www.poetryfoundation.org/poems/43311/sadie-and-maud
poet                                                                                                                                                Gwendolyn Brooks
title                                                                                                                                                 Sadie and Maud
year                                                                                                                                                             NaN
poem_lines

In [178]:
def poem_scraper(poem_url):
    '''Scraper for PoetryFoundation.org--scrapes poet name, poem title, poem year, list of poem's lines,
       and the poem as a string.
       Input the url for a poem's page on PoetryFoundation.org.
       Output is a list.'''
    
    # load a page and soupify it
    page = rq.get(poem_url)
    soup = bs(page.content, 'html.parser')
    
    # series of try/except statements to scrape info or return NaN value if desired info cannot be scraped
    try:
        poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
    except:
        poet = np.nan
        
    try:
        title = soup.find('h1').contents[-1].strip()
    except:
        title = np.nan
        
    try:
        lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
        lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
        lines = [line.replace('<br/>', '') for line in lines]
    except:
        lines = np.nan
        
    try:
        poem_string = '\n'.join(lines)
    except:
        poem_string = np.nan
        
    try:
        year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
        year_pattern = r'[12]\d{3}'
        year = int(re.search(year_pattern, year_blurb, re.I).group())
    except:
        year = np.nan
    
    info = [poet, title, year, lines, poem_string]
    
    return info

In [181]:
infor = poem_scraper('https://www.poetryfoundation.org/poetrymagazine/poems/55209/the-cenotaph')
infor

['Fanny Howe',
 'The Cenotaph',
 2011,
 ['I want to leave this place',
  'unremembered.',
  'The gas stove is leaking',
  'and the door of the refrigerator',
  'stained with rust.',
  'The mugs are ugly',
  'and there are only two forks.',
  'The walls are black',
  'and soft, the bed a balloon',
  'of night-clothing.',
  'The stairwell sloped',
  'to a dragger’s pace.',
  '',
  'There are big windows',
  'with blind-slats dusty',
  'and gray. Street life ',
  'goes all night and at dawn',
  'freedmen shout and ',
  'laugh outside the kitchen.',
  '',
  'Where does life begin and end?',
  'In the lamb or the cotton?',
  'My pillow is my friend.',
  ''],
 'I want to leave this place\nunremembered.\nThe gas stove is leaking\nand the door of the refrigerator\nstained with rust.\nThe mugs are ugly\nand there are only two forks.\nThe walls are black\nand soft, the bed a balloon\nof night-clothing.\nThe stairwell sloped\nto a dragger’s pace.\n\nThere are big windows\nwith blind-slats dusty\n

In [182]:
page = rq.get('https://www.poetryfoundation.org/poetrymagazine/poems/55209/the-cenotaph')
soup = bs(page.content, 'html.parser')
lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
lines = [line.replace('<br/>', '') for line in lines]
year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})
year_blurb

<span class="c-txt c-txt_note c-txt_note_mini">
                        Source:
                        <em>Poetry</em>
                                                                                                                                                                    (December 2011)
                                            </span>

In [175]:
print('\n'.join(lines).replace('<br/>', '\n'))

I want to leave this place
unremembered.
The gas stove is leaking
and the door of the refrigerator
stained with rust.
The mugs are ugly
and there are only two forks.
The walls are black
and soft, the bed a balloon
of night-clothing.
The stairwell sloped
to a dragger’s pace.

There are big windows
with blind-slats dusty
and gray. Street life 
goes all night and at dawn
freedmen shout and 
laugh outside the kitchen.

Where does life begin and end?
In the lamb or the cotton?
My pillow is my friend.



In [166]:
poem_string = "\n".join(''.join(lines).splitlines()).replace('<br/>', '\n')
print(poem_string)

I want to leave this placeunremembered.The gas stove is leakingand the door of the refrigeratorstained with rust.The mugs are uglyand there are only two forks.The walls are blackand soft, the bed a balloonof night-clothing.The stairwell slopedto a dragger’s pace.
There are big windowswith blind-slats dustyand gray. Street life goes all night and at dawnfreedmen shout and laugh outside the kitchen.
Where does life begin and end?In the lamb or the cotton?My pillow is my friend.



In [121]:
poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
poet

'Fanny Howe'

In [122]:
title = soup.find('h1').contents[-1].strip()
title

'The Cenotaph'

In [124]:
lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
lines_raw

[<div style="text-indent: -1em; padding-left: 1em;">I want to leave this place<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">unremembered.<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">The gas stove is leaking<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">and the door of the refrigerator<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">stained with rust.<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">The mugs are ugly<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">and there are only two forks.<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">The walls are black<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">and soft, the bed a balloon<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">of night-clothing.<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">The stairwell sloped<br/></div>,
 <div style="text-indent: -1em; padding-le

In [131]:
lines_raw[-2]

<div style="text-indent: -1em; padding-left: 1em;"></div>

In [129]:
lines_raw[-2].contents[0]

IndexError: list index out of range

In [132]:
lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
lines

['I want to leave this place',
 'unremembered.',
 'The gas stove is leaking',
 'and the door of the refrigerator',
 'stained with rust.',
 'The mugs are ugly',
 'and there are only two forks.',
 'The walls are black',
 'and soft, the bed a balloon',
 'of night-clothing.',
 'The stairwell sloped',
 'to a dragger’s pace.',
 '<br/>',
 'There are big windows',
 'with blind-slats dusty',
 'and gray. Street life ',
 'goes all night and at dawn',
 'freedmen shout and ',
 'laugh outside the kitchen.',
 '<br/>',
 'Where does life begin and end?',
 'In the lamb or the cotton?',
 'My pillow is my friend.',
 '<br/>']

In [None]:

    
    # series of try/except statements to scrape info or return NaN value if desired info cannot be scraped
    try:
        poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
    except:
        poet = np.nan
        
    try:
        title = soup.find('h1').contents[-1].strip()
    except:
        title = np.nan
        
    try:
        lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
        lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw]
    except:
        lines = np.nan
        
    try:
        poem_string = "\n".join(''.join(lines).splitlines()).replace('<br/>', '\n') 
    except:
        poem_string = np.nan
        
    try:
        year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
        year_pattern = r'[12]\d{3}'
        year = int(re.search(year_pattern, year_blurb, re.I).group())
    except:
        year = np.nan
    
    info = [poet, title, year, lines, poem_string]
    
    return info

In [116]:
df.groupby('genre').sum()

Unnamed: 0_level_0,year
genre,Unnamed: 1_level_1
augustan,30930.0
beat,183660.0
black_arts_movement,271883.0
black_mountain,194322.0
confessional,150867.0
fugitive,62157.0
georgian,56322.0
harlem_renaissance,152419.0
imagist,122877.0
language_poetry,342601.0


In [87]:
df[df.genre == 'modern'].title.value_counts()

The Waste Land                         14
The Love Song of J. Alfred Prufrock    10
Gerontion                               6
Rhapsody on a Windy Night               4
Portrait of a Lady                      4
                                       ..
No Second Troy                          1
Under Ben Bulben                        1
The People, Yes                         1
Leave-Taking                            1
Fixed Ideas                             1
Name: title, Length: 459, dtype: int64

In [88]:
df[df.title == 'The Waste Land']

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
245,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
246,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
247,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
256,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
257,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
261,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
262,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
265,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
266,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
267,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."


In [82]:
df.genre.value_counts()

modern                            1324
victorian                          674
renaissance                        430
romantic                           407
imagist                            370
beat                               294
new_york_school                    265
black_mountain                     257
new_york_school_2nd_generation     193
language_poetry                    192
confessional                       176
georgian                           167
black_arts_movement                165
objectivist                        159
harlem_renaissance                 148
augustan                           121
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

In [86]:
df[df.genre == 'middle_english']

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
0,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43926/the-canterbury-tales-general-prologue,Geoffrey Chaucer,The Canterbury Tales: General Prologue,,"[Whan that Aprille with his shour, The droghte of March hath perc, And bath, Of which vertú engendr, Whan Zephirus eek with his swet, Inspir, The...",Whan that Aprille with his shourThe droghte of March hath percAnd bathOf which vertú engendrWhan Zephirus eek with his swetInspirThe tendrHath in...
1,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43936/the-parlement-of-fowls,Geoffrey Chaucer,The Parlement of Fowls,,"[Now welcome, somer, with thy sonne softe,, \r That hast this wintres wedres overshake,, \r And driven away the longe nyghtes blake!, <br/>, \r Sa...","Now welcome, somer, with thy sonne softe,\n That hast this wintres wedres overshake,\n And driven away the longe nyghtes blake!\n\n Saynt Valentyn..."
2,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/45694/to-rosemounde-a-balade,Geoffrey Chaucer,To Rosemounde: A Balade,1891.0,"[<br/>, \r Madame, ye ben of al beaute shryne, \r As fer as cercled is the mapamounde,, \r For as the cristal glorious ye shyne,, \r And lyke ruby...","\n\n Madame, ye ben of al beaute shryne\n As fer as cercled is the mapamounde,\n For as the cristal glorious ye shyne,\n And lyke ruby ben your ch..."
3,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43937/troilus-and-criseyde-book-i,Geoffrey Chaucer,Troilus and Criseyde: Book I,,"[And so bifel, whan comen was the tym, Of Aperil, whan clothed is the med, With new, And swot, In sondry wises shew, The folk of Troie hir observa...","And so bifel, whan comen was the tymOf Aperil, whan clothed is the medWith newAnd swotIn sondry wises shewThe folk of Troie hir observaunces oldPa..."
4,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43938/troilus-and-criseyde-book-ii,Geoffrey Chaucer,Troilus and Criseyde: Book II,,"[With this he took his leve, and hom he wente; , And lord, so he was glad and wel bygon! , Criseyde aroos, no lenger she ne stente, , But streght ...","With this he took his leve, and hom he wente; And lord, so he was glad and wel bygon! Criseyde aroos, no lenger she ne stente, But streght in-to h..."
5,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43939/troilus-and-criseyde-book-v,Geoffrey Chaucer,Troilus and Criseyde: Book V,,"[The morwen com, and gostly for to speke, , This Diomede is come un-to Criseyde; , And shortly, lest that ye my tale breke, , So wel he for hym-se...","The morwen com, and gostly for to speke, This Diomede is come un-to Criseyde; And shortly, lest that ye my tale breke, So wel he for hym-selven sp..."
6,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43940/truth-56d222d5bf80c,Geoffrey Chaucer,Truth,,"[Fle fro the pres, and dwelle with sothefastnesse,, \r Suffise thin owen thing, thei it be smal;, \r For hord hath hate, and clymbyng tykelnesse,,...","Fle fro the pres, and dwelle with sothefastnesse,\n Suffise thin owen thing, thei it be smal;\n For hord hath hate, and clymbyng tykelnesse,\n Pre..."
7,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/50383/if-no-love-is-o-god-what-fele-i-so,Petrarch,"“If no love is, O God, what fele I so?”",,"[If no love is, O God, what fele I so?, \r And if love is, what thing and which is he?, \r If love be good, from whennes cometh my woo?, \r If it ...","If no love is, O God, what fele I so?\n And if love is, what thing and which is he?\n If love be good, from whennes cometh my woo?\n If it be wikk..."
8,https://www.poetryfoundation.org/poets/william-langland,middle_english,https://www.poetryfoundation.org/poems/47350/piers-plowman-the-prologue,William Langland,Piers Plowman: The Prologue,,"[<br/>, \r In a somer sesun, whon softe was the sonne,, \r I schop me into a shroud, as I a scheep were;, \r In habite as an hermite unholy of wer...","\n\n In a somer sesun, whon softe was the sonne,\n I schop me into a shroud, as I a scheep were;\n In habite as an hermite unholy of werkes\n Went..."
9,https://www.poetryfoundation.org/poets/john-lydgate,middle_english,https://www.poetryfoundation.org/poems/44660/the-testament-of-john-lydgate,John Lydgate,The Testament of John Lydgate,,"[Beholde, o man! lyft up thyn eye and see , What mortall peyne I suffre for thi trespace. , With pietous voys I crye and sey to the: , ...","Beholde, o man! lyft up thyn eye and see What mortall peyne I suffre for thi trespace. With pietous voys I crye and sey to the: Behold..."


# SCRAP HEAP

In [526]:
ultra_dict = {genre: [] for genre in poet_urls_dict.keys()}
ultra_dict

{'augustan': [],
 'beat': [],
 'black_arts_movement': [],
 'black_mountain': [],
 'confessional': [],
 'fugitive': [],
 'georgian': [],
 'harlem_renaissance': [],
 'imagist': [],
 'language_poetry': [],
 'middle_english': [],
 'modern': [],
 'new_york_school': [],
 'new_york_school_2nd_generation': [],
 'objectivist': [],
 'renaissance': [],
 'romantic': [],
 'victorian': []}

In [46]:
s = rq.Session()
s.get(genre_urls[0])

<Response [200]>

### SCRAPER ATTEMPT READ IMAGE

In [None]:
def poem_scraper(poem_url):
    '''Scraper for PoetryFoundation.org--scrapes poet name, poem title, poem year, list of poem's lines,
       and the poem as a string.
       Input the url for a poem's page on PoetryFoundation.org.
       Output is a list.'''
    
    # load a page and soupify it
    page = rq.get(poem_url)
    soup = bs(page.content, 'html.parser')
    
    # series of try/except statements to scrape info or return NaN value if desired info cannot be scraped
    try:
        poet = soup.find('a', href=re.compile('.*/poets/.*')).contents[0]
    except:
        poet = np.nan
        
    try:
        title = soup.find('h1').contents[-1].strip()
    except:
        try:
            title_pattern = '[a-z\-]*$'
            title = re.search(title_pattern, poem_url, re.I).group().replace('-', ' ').title()
        except:
            title = np.nan
        
    try:
        lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
        lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
        lines = [line.replace('<br/>', '') for line in lines]
        line_pattern = '>(.*?)<'
        lines = [re.search(line_pattern, line, re.I).group(1) if '<' in line else line for line in lines]
        if lines == []:
            try:
                img_link = soup.find('img', src=re.compile('.*/jstor/.*'))['src']
                resource = urlopen(img_link)
                output = open('poem_imgs/temp.png','wb')
                output.write(resource.read())
                text = pytesseract.image_to_string('poem_imgs/temp.png')
                scan_pattern = fr'{title.upper()}\s*((.*\s.*)*)'
                lines = re.search(scan_pattern, text, re.I).group(1).splitlines()
            except:
                lines = np.nan
    except:
        lines = np.nan
        
    try:
        poem_string = '\n'.join(lines)
    except:
        poem_string = np.nan
        
    try:
        year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
        year_pattern = r'[12]\d{3}'
        year = int(re.search(year_pattern, year_blurb, re.I).group())
    except:
        try:
            year_blurb = soup.find_all('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})[-1].contents[2]
            year_pattern = r'[12]\d{3}'
            year = int(re.search(year_pattern, year_blurb, re.I).group())
        except:
            year = np.nan
    
    info = [poet, title, year, lines, poem_string]
    
    return info