# Webscraping and Data Cleaning
### Using Selenium and BeautifulSoup to scrape PoetryFoundation.org
- **The first step is to import necessary libraries.**

In [1]:
# custom functions for this project
from functions import *
from functions_webscraping import *

# dataframe libraries
import pandas as pd
import numpy as np

# graphing libraries
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

# string manipulation libraries
import re
from unicodedata import normalize
from ast import literal_eval

# webscraping libraries
import requests as rq
from bs4 import BeautifulSoup as bs
from selenium import webdriver

# miscellany
import time
import gzip
import pickle

# reload functions/libraries when edited
%load_ext autoreload
%autoreload 2

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

# increase column width of dataframe
pd.set_option('max_colwidth', 150)

- **Manually create dictionary with URL codes for each genre.**

In [2]:
# lookup table: genre name -> numeric code PoetryFoundation.org uses for that genre in its URLs
genre_codes = dict(
    augustan=149,
    beat=150,
    black_arts_movement=304,
    black_mountain=151,
    confessional=152,
    fugitive=153,
    georgian=154,
    harlem_renaissance=155,
    imagist=156,
    language_poetry=157,
    middle_english=158,
    modern=159,
    new_york_school=160,
    new_york_school_2nd_generation=161,
    objectivist=162,
    renaissance=163,
    romantic=164,
    victorian=165,
)

- **Run function in a loop to create dictionary of poet urls.**

In [None]:
# build the genre -> poet URLs mapping by calling the custom helper once per genre code
poet_urls = dict(
    (genre, poet_urls_by_genre(genre_code, 3))
    for genre, genre_code in genre_codes.items()
)

# spot-check the result for one genre
poet_urls['augustan']

- **The loop only partially worked, so I'll re-run sections in which some URLs are missing.**

In [196]:
# re-run on genre: scrape the Black Arts Movement poet URLs again and overwrite the earlier entry
# NOTE(review): the second positional argument (3 in the initial loop) is omitted here,
# so the helper's default is used -- confirm that is intended
poet_urls['black_arts_movement'] = poet_urls_by_genre(genre_codes['black_arts_movement'])

In [198]:
# re-run on genre: scrape the Modern poet URLs again and overwrite the earlier entry
# NOTE(review): the second positional argument (3 in the initial loop) is omitted here,
# so the helper's default is used -- confirm that is intended
poet_urls['modern'] = poet_urls_by_genre(genre_codes['modern'])

In [200]:
# re-run on genre: scrape the Renaissance poet URLs again and overwrite the earlier entry
# NOTE(review): the second positional argument (3 in the initial loop) is omitted here,
# so the helper's default is used -- confirm that is intended
poet_urls['renaissance'] = poet_urls_by_genre(genre_codes['renaissance'])

In [203]:
# re-run on genre: scrape the Romantic poet URLs again and overwrite the earlier entry
# NOTE(review): the second positional argument (3 in the initial loop) is omitted here,
# so the helper's default is used -- confirm that is intended
poet_urls['romantic'] = poet_urls_by_genre(genre_codes['romantic'])

In [206]:
# re-run on genre: scrape the Victorian poet URLs again and overwrite the earlier entry
# NOTE(review): the second positional argument (3 in the initial loop) is omitted here,
# so the helper's default is used -- confirm that is intended
poet_urls['victorian'] = poet_urls_by_genre(genre_codes['victorian'])

In [207]:
# tally how many poet URLs were collected per genre, to confirm every genre was grabbed
url_lens = {genre: len(urls) for genre, urls in poet_urls.items()}
url_lens

{'augustan': 23,
 'beat': 13,
 'black_arts_movement': 23,
 'black_mountain': 10,
 'confessional': 7,
 'fugitive': 7,
 'georgian': 22,
 'harlem_renaissance': 17,
 'imagist': 6,
 'language_poetry': 18,
 'middle_english': 3,
 'modern': 54,
 'new_york_school': 9,
 'new_york_school_2nd_generation': 16,
 'objectivist': 5,
 'renaissance': 41,
 'romantic': 51,
 'victorian': 55}

### Save/Load poet URLs dictionary

In [4]:
# # uncomment to save/load
# NOTE(review): the scraping cells above build `poet_urls`, but every later cell reads
# `poet_urls_dict`, which is only created by the commented-out load below. On a fresh
# kernel the save and load must both be run (or `poet_urls_dict = poet_urls` assigned)
# before continuing.
# with open('poet_urls_dict.pickle', 'wb') as w:
#     pickle.dump(poet_urls, w, protocol=pickle.HIGHEST_PROTOCOL)

# with open('poet_urls_dict.pickle', 'rb') as r:
#     poet_urls_dict = pickle.load(r)

- **Check for duplicate values**

In [5]:
# create dataframe from poet_urls_dict: one (genre, url) row per poet;
# the columns are unnamed, so 0 is the genre and 1 is the poet URL
poet_df = pd.DataFrame([(genre, url) for genre, urls in poet_urls_dict.items() for url in urls])

# check if any URLs appear more than once.
# duplicated(keep=False) flags every occurrence; the stable sort keeps rows that share
# a URL next to each other in their original order. Unlike pd.concat over an empty
# generator, this returns an empty frame instead of raising when nothing is duplicated.
poet_df[poet_df.duplicated(subset=1, keep=False)].sort_values(1, kind='mergesort')

Unnamed: 0,0,1
126,imagist,https://www.poetryfoundation.org/poets/ezra-pound
186,modern,https://www.poetryfoundation.org/poets/ezra-pound
122,imagist,https://www.poetryfoundation.org/poets/richard-aldington
150,modern,https://www.poetryfoundation.org/poets/richard-aldington


- **I'll give those poets to the imagist genre, since it has so few already.**

In [6]:
# collect the URLs flagged as repeats (every occurrence after the first) into a plain list
dups = poet_df.loc[poet_df.duplicated(subset=1), 1].tolist()
dups

['https://www.poetryfoundation.org/poets/richard-aldington',
 'https://www.poetryfoundation.org/poets/ezra-pound']

In [7]:
# number of modern poets before
# (baseline count, taken before the duplicate URLs are removed from this genre)
len(poet_urls_dict['modern'])

54

In [8]:
# rebuild the modern list, keeping only the URLs that are not in the duplicates list
# (this takes Pound and Aldington out of the modern genre)
poet_urls_dict['modern'] = list(filter(lambda url: url not in dups, poet_urls_dict['modern']))

# number of modern poets after the removal
len(poet_urls_dict['modern'])

52

## Build a dataframe
- **Scrape poems and other info.**

In [15]:
%%time

# start from an empty frame that each genre's scraped rows get appended to
df = pd.DataFrame()

# scrape one genre at a time; the accumulated frame is written to disk again after
# every genre, so an interrupted run still leaves the finished genres in the CSV
# NOTE(review): 0.5 is passed straight through to pf_scraper; presumably a pause in seconds -- confirm
for genre in list(poet_urls_dict):
    df = pd.concat([df, pf_scraper(poet_urls_dict, genre, 0.5)])
    df.to_csv('data/poetry_foundation_raw.csv')

KeyboardInterrupt: 

### Save/load dataframe

In [2]:
# NOTE(review): the execution counts restart after this point, so the cells below appear
# to have been run on a frame loaded from this CSV rather than on the in-memory result of
# the (interrupted) scraping loop -- on a fresh kernel the load line must be run first.
# # uncomment to save
# df.to_csv('data/poetry_foundation_raw.csv')

# # uncomment to load
# df = pd.read_csv('data/poetry_foundation_raw.csv', index_col=0)

In [3]:
# rename the columns
# The names are assigned by position, so the frame must already hold exactly these
# eight columns in this order (pandas raises a length-mismatch ValueError otherwise).
df.columns = ['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_lines', 'poem_string']
# preview the first five rows
df.head()

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
0,https://www.poetryfoundation.org/poets/mary-barber,augustan,https://www.poetryfoundation.org/poems/50523/advice-to-her-son-on-marriage,Mary Barber,Advice to Her Son on Marriage,,"['When you gain her Affection, take care to preserve it;\r', 'Lest others persuade her, you do not deserve it.\r', 'Still study to heighten the Jo...","When you gain her Affection, take care to preserve it;\r\nLest others persuade her, you do not deserve it.\r\nStill study to heighten the Joys of ..."
1,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50534/auld-robin-forbes,Susanna Blamire,Auld Robin Forbes,,"['And auld Robin Forbes hes gien tem a dance,\r', 'I pat on my speckets to see them aw prance;\r', 'I thout o’ the days when I was but fifteen,\r'...","And auld Robin Forbes hes gien tem a dance,\r\nI pat on my speckets to see them aw prance;\r\nI thout o’ the days when I was but fifteen,\r\nAnd s..."
2,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50533/o-donald-ye-are-just-the-man,Susanna Blamire,O Donald! Ye Are Just the Man,,"['O Donald! ye are just the man\r', ' Who, when he’s got a wife,\r', 'Begins to fratch— nae notice ta’en—\r', ' They’re strangers a’ their life....","O Donald! ye are just the man\r\n Who, when he’s got a wife,\r\nBegins to fratch— nae notice ta’en—\r\n They’re strangers a’ their life.\r\n\nTh..."
3,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50532/the-siller-croun,Susanna Blamire,The Siller Croun,,"['And ye shall walk in silk attire,\r', ' And siller hae to spare,\r', 'Gin ye’ll consent to be his bride,\r', ' Nor think o’ Donald mair.\r'...","And ye shall walk in silk attire,\r\n And siller hae to spare,\r\nGin ye’ll consent to be his bride,\r\n Nor think o’ Donald mair.\r\nO wha w..."
4,https://www.poetryfoundation.org/poets/henry-carey,augustan,https://www.poetryfoundation.org/poems/43884/the-ballad-of-sally-in-our-alley,Henry Carey,The Ballad of Sally in our Alley,,"['Of all the Girls that are so smart\r', ' There’s none like pretty SALLY,\r', 'She is the Darling of my Heart,\r', ' And she lives in our...","Of all the Girls that are so smart\r\n There’s none like pretty SALLY,\r\nShe is the Darling of my Heart,\r\n And she lives in our Alley.\..."


- **Explore how the data looks.**

In [4]:
# dimensions of the frame as (rows, columns)
df.shape

(5295, 8)

In [5]:
# list the distinct genre labels present in the data
df['genre'].unique()

array(['augustan', 'beat', 'black_arts_movement', 'black_mountain',
       'confessional', 'fugitive', 'georgian', 'harlem_renaissance',
       'imagist', 'language_poetry', 'middle_english', 'modern',
       'new_york_school', 'new_york_school_2nd_generation', 'objectivist',
       'renaissance', 'romantic', 'victorian'], dtype=object)

In [6]:
# number of rows (poems) per genre, largest first
df['genre'].value_counts()

modern                            1324
victorian                          674
renaissance                        430
romantic                           407
imagist                            370
new_york_school                    265
black_mountain                     257
new_york_school_2nd_generation     193
language_poetry                    192
confessional                       176
georgian                           167
black_arts_movement                165
objectivist                        159
harlem_renaissance                 148
beat                               147
augustan                           121
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

- **Check for duplicate values across multiple columns and drop those rows.**

In [7]:
# count the rows that repeat a later row on every column except poem_lines;
# with keep='last' the final occurrence is not counted as a duplicate
# NOTE(review): poem_lines is left out of the comparison, presumably because
# poem_string already carries the same text -- confirm
df.duplicated(subset=['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_string'], keep='last').sum()

98

In [8]:
# remove repeated rows (compared on every column except poem_lines), keeping the
# last occurrence of each, then renumber the index from 0
df = (
    df.drop_duplicates(
        subset=['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_string'],
        keep='last',
    )
    .reset_index(drop=True)
)

In [9]:
# check changes: dimensions of the frame as (rows, columns) after removing duplicates
df.shape

(5197, 8)

In [10]:
# number of rows (poems) per genre after the duplicate rows were removed
df['genre'].value_counts()

modern                            1284
victorian                          643
renaissance                        427
romantic                           398
imagist                            370
new_york_school                    265
black_mountain                     257
new_york_school_2nd_generation     192
language_poetry                    192
confessional                       176
black_arts_movement                165
georgian                           160
objectivist                        159
harlem_renaissance                 148
beat                               147
augustan                           114
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

- **Looks like the poem_lines column converted to a list inside of a string while saving to CSV.**
- **I'll wait to convert it until I can fill some missing values for that column, a process I found to be more easily done as a list inside of a string.**

In [11]:
# inspect the poem_lines value stored for index label 0
df.loc[0,'poem_lines']

"['When you gain her Affection, take care to preserve it;\\r', 'Lest others persuade her, you do not deserve it.\\r', 'Still study to heighten the Joys of her Life;\\r', 'Not treat her the worse, for her being your Wife.\\r', 'If in Judgment she errs, set her right, without Pride:\\r', '’Tis the Province of insolent Fools, to deride.\\r', 'A Husband’s first Praise, is a ', 'Then change not these Titles, for ', 'Let your Person be neat, unaffectedly clean,\\r', 'Tho’ alone with your wife the whole Day you remain.\\r', 'Chuse Books, for her study, to fashion her Mind,\\r', 'To emulate those who excell’d of her Kind.\\r', 'Be Religion the principal Care of your Life,\\r', 'As you hope to be blest in your Children and Wife:\\r', 'So you, in your Marriage, shall gain its true End;\\r', 'And find, in your Wife, a ', '', '']"

- **Check for missing values.**

In [12]:
# count the missing (NaN) values in each column
df.isna().sum()

poet_url          0
genre             0
poem_url          0
poet             13
title           215
year           1649
poem_lines      410
poem_string     412
dtype: int64

In [13]:
# show the rows whose poet value is missing
df.loc[df['poet'].isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
858,https://www.poetryfoundation.org/poets/w-d-snodgrass,confessional,https://www.poetryfoundation.org/poetrymagazine/poems/48292/road-56d22969928f0,,,2006.0,"['ILEANA MALANCIOIU', '', 'Road', '', 'I walk on a dark road so that I won’t see', '', 'The way my young oxen limp so much;', '', 'The horseshoes ...",ILEANA MALANCIOIU\n\nRoad\n\nI walk on a dark road so that I won’t see\n\nThe way my young oxen limp so much;\n\nThe horseshoes gouging into their...
1409,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14311/after-how-many-years-tr-by-amy-lowell-and-florence-ayscough,,After How Many Years Tr By Amy Lowell And Florence Ayscough,1919.0,,
1410,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14312/calligraphy-tr-by-amy-lowell-and-florence-ayscough,,Calligraphy Tr By Amy Lowell And Florence Ayscough,1919.0,,
1411,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14322/the-emperors-return-from-a-journey-to-the-south-tr-by-amy-lowell-and-florence-ayscough,,The Emperors Return From A Journey To The South Tr By Amy Lowell And Florence Ayscough,1919.0,,
1412,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14310/an-evening-meeting-tr-by-amy-lowell-and-florence-ayscough,,An Evening Meeting Tr By Amy Lowell And Florence Ayscough,1919.0,,
1413,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14314/from-the-straw-hut-among-the-seven-peaks-tr-by-amy-lowell-and-florence-ayscough,,From The Straw Hut Among The Seven Peaks Tr By Amy Lowell And Florence Ayscough,1919.0,,
1414,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14321/the-inn-at-the-western-lake-tr-by-amy-lowell-and-florence-ayscough,,The Inn At The Western Lake Tr By Amy Lowell And Florence Ayscough,1919.0,,
1415,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14296/on-seeing-the-portrait-of-a-beautiful-concubine-tr-by-amy-lowell-and-florence-ayscough,,On Seeing The Portrait Of A Beautiful Concubine Tr By Amy Lowell And Florence Ayscough,1919.0,,
1416,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14316/on-the-classic-of-the-hills-and-sea-tr-by-amy-lowell-and-florence-ayscough,,On The Classic Of The Hills And Sea Tr By Amy Lowell And Florence Ayscough,1919.0,,
1417,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/14313/one-goes-a-journey-tr-by-amy-lowell-and-florence-ayscough,,One Goes A Journey Tr By Amy Lowell And Florence Ayscough,1919.0,,


- **The Amy Lowell and Ben Jonson entries appear unusable, so I'll drop those rows.**
- **I'll go ahead and fill in the missing info for the Snodgrass poem (which is actually a translation of another poet, but a Confessional translator will probably produce a Confessional work).**

In [14]:
# hand-fill poet and title for the row with index label 858
# (the label is hard-coded, so it is only valid for the index as reset after de-duplication)
df.loc[858, 'poet'] = 'ILEANA MALANCIOIU'.title()
df.loc[858, 'title'] = 'Road'

# display that single row to confirm the edit
df.loc[[858]]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
858,https://www.poetryfoundation.org/poets/w-d-snodgrass,confessional,https://www.poetryfoundation.org/poetrymagazine/poems/48292/road-56d22969928f0,Ileana Malancioiu,Road,2006.0,"['ILEANA MALANCIOIU', '', 'Road', '', 'I walk on a dark road so that I won’t see', '', 'The way my young oxen limp so much;', '', 'The horseshoes ...",ILEANA MALANCIOIU\n\nRoad\n\nI walk on a dark road so that I won’t see\n\nThe way my young oxen limp so much;\n\nThe horseshoes gouging into their...


In [15]:
# keep only the rows that have a poet value
df = df.dropna(subset=['poet'])

In [16]:
# re-count the missing (NaN) values in each column
df.isna().sum()

poet_url          0
genre             0
poem_url          0
poet              0
title           214
year           1649
poem_lines      398
poem_string     400
dtype: int64

## Rescraping
- **After reworking the scraping function a bit, I can try to fill in some missing poem_lines and poem_string values.**

### Round 1

In [17]:
# index labels of the rows whose poem_lines value is missing
lookups = df.index[df['poem_lines'].isna()].tolist()
lookups

[158,
 168,
 169,
 171,
 175,
 183,
 184,
 200,
 203,
 210,
 229,
 254,
 283,
 324,
 325,
 336,
 351,
 354,
 361,
 458,
 466,
 482,
 484,
 487,
 490,
 503,
 511,
 512,
 513,
 531,
 532,
 542,
 558,
 568,
 576,
 578,
 624,
 626,
 648,
 660,
 661,
 663,
 664,
 694,
 701,
 702,
 703,
 704,
 705,
 707,
 708,
 711,
 714,
 715,
 716,
 717,
 719,
 727,
 736,
 749,
 751,
 753,
 769,
 770,
 817,
 834,
 853,
 872,
 881,
 885,
 886,
 892,
 897,
 900,
 917,
 921,
 940,
 942,
 943,
 944,
 945,
 946,
 947,
 1004,
 1025,
 1123,
 1163,
 1169,
 1171,
 1184,
 1186,
 1192,
 1234,
 1297,
 1299,
 1319,
 1326,
 1345,
 1348,
 1363,
 1367,
 1371,
 1379,
 1383,
 1392,
 1395,
 1404,
 1440,
 1446,
 1452,
 1456,
 1467,
 1468,
 1477,
 1482,
 1489,
 1495,
 1496,
 1498,
 1500,
 1502,
 1503,
 1505,
 1515,
 1516,
 1517,
 1518,
 1519,
 1551,
 1552,
 1553,
 1554,
 1555,
 1556,
 1560,
 1565,
 1566,
 1587,
 1591,
 1594,
 1602,
 1604,
 1617,
 1618,
 1623,
 1631,
 1711,
 1731,
 1732,
 1743,
 1748,
 1770,
 1786,
 1815,
 1816

In [18]:
%%time

# iterate over the list, attempting to re-scrape the lines and string
# NOTE: I was getting a 'ValueError: Must have equal len keys and value when setting with an iterable', but converting
# the list to a string first seemed to make that go away. I have to convert this entire column anyway next.
for i in lookups:
    try:
        # scrape inside the try so that one bad page is reported as a failure
        # instead of aborting the whole loop
        info = poem_scraper(df.loc[i, 'poem_url'])
        # pull both values out before writing so a malformed result cannot leave the row half-updated
        lines_as_string, poem_text = str(info[3]), info[4]
        df.loc[i,'poem_lines'] = lines_as_string
        df.loc[i,'poem_string'] = poem_text
        print(f'Success -- {i}')
    # catch Exception rather than using a bare except, so KeyboardInterrupt can still stop the run
    except Exception:
        print(f'Failure -- {i}')

Success -- 158
Success -- 168
Success -- 169
Success -- 171
Success -- 175
Success -- 183
Success -- 184
Success -- 200
Success -- 203
Success -- 210
Success -- 229
Success -- 254
Success -- 283
Success -- 324
Success -- 325
Success -- 336
Success -- 351
Success -- 354
Success -- 361
Success -- 458
Success -- 466
Success -- 482
Success -- 484
Success -- 487
Success -- 490
Success -- 503
Success -- 511
Success -- 512
Success -- 513
Success -- 531
Success -- 532
Success -- 542
Success -- 558
Success -- 568
Success -- 576
Success -- 578
Success -- 624
Success -- 626
Success -- 648
Success -- 660
Success -- 661
Success -- 663
Success -- 664
Success -- 694
Success -- 701
Success -- 702
Success -- 703
Success -- 704
Success -- 705
Success -- 707
Success -- 708
Success -- 711
Success -- 714
Success -- 715
Success -- 716
Success -- 717
Success -- 719
Success -- 727
Success -- 736
Success -- 749
Success -- 751
Success -- 753
Success -- 769
Success -- 770
Success -- 817
Success -- 834
Success --

- **Looks like the loop was somewhat successful, though it did turn NaN values into the string 'nan'.**
- **I'll look first for other NaNs I may want to get rid of.**

In [20]:
# run every poem_lines value through destringify and store the result back in the column
# NOTE(review): `destringify` is not defined in this notebook, so it presumably comes from
# the project's star-imported helper modules and turns the stringified list back into a
# real list -- confirm
df['poem_lines'] = df['poem_lines'].apply(destringify)

In [21]:
# inspect the poem_lines value stored for index label 0 after the destringify step
df.loc[0,'poem_lines']

['When you gain her Affection, take care to preserve it;\r',
 'Lest others persuade her, you do not deserve it.\r',
 'Still study to heighten the Joys of her Life;\r',
 'Not treat her the worse, for her being your Wife.\r',
 'If in Judgment she errs, set her right, without Pride:\r',
 '’Tis the Province of insolent Fools, to deride.\r',
 'A Husband’s first Praise, is a ',
 'Then change not these Titles, for ',
 'Let your Person be neat, unaffectedly clean,\r',
 'Tho’ alone with your wife the whole Day you remain.\r',
 'Chuse Books, for her study, to fashion her Mind,\r',
 'To emulate those who excell’d of her Kind.\r',
 'Be Religion the principal Care of your Life,\r',
 'As you hope to be blest in your Children and Wife:\r',
 'So you, in your Marriage, shall gain its true End;\r',
 'And find, in your Wife, a ',
 '',
 '']

In [23]:
# swap the placeholder string 'nan' in poem_lines back to a real missing value
df.loc[df['poem_lines'] == 'nan', 'poem_lines'] = np.nan

# spot-check one of the affected rows
df.loc[169, 'poem_lines']

nan

In [24]:
# re-count the missing (NaN) values in each column
df.isna().sum()

poet_url          0
genre             0
poem_url          0
poet              0
title           214
year           1649
poem_lines      344
poem_string     346
dtype: int64

### Round 2

In [34]:
# index labels of the rows whose poem_lines value is still missing
lookups2 = df.index[df['poem_lines'].isna()].tolist()
lookups2

[169,
 171,
 183,
 184,
 200,
 203,
 210,
 229,
 254,
 283,
 324,
 325,
 458,
 466,
 482,
 484,
 487,
 490,
 503,
 511,
 512,
 513,
 531,
 532,
 558,
 568,
 576,
 578,
 624,
 626,
 648,
 660,
 661,
 663,
 664,
 694,
 701,
 702,
 703,
 704,
 705,
 707,
 708,
 711,
 714,
 715,
 716,
 717,
 719,
 727,
 736,
 749,
 751,
 753,
 769,
 770,
 834,
 853,
 872,
 881,
 885,
 886,
 892,
 897,
 900,
 917,
 921,
 940,
 942,
 943,
 944,
 945,
 946,
 947,
 1004,
 1025,
 1163,
 1169,
 1171,
 1184,
 1186,
 1234,
 1297,
 1299,
 1319,
 1363,
 1367,
 1371,
 1379,
 1383,
 1392,
 1395,
 1404,
 1440,
 1446,
 1452,
 1456,
 1467,
 1468,
 1477,
 1482,
 1489,
 1495,
 1496,
 1498,
 1500,
 1502,
 1503,
 1505,
 1551,
 1552,
 1553,
 1554,
 1555,
 1556,
 1560,
 1565,
 1566,
 1587,
 1591,
 1594,
 1602,
 1604,
 1617,
 1618,
 1623,
 1711,
 1834,
 1836,
 1837,
 1839,
 1844,
 1865,
 1867,
 1870,
 1875,
 1876,
 1877,
 1906,
 1914,
 1915,
 1940,
 1965,
 1975,
 1976,
 1977,
 1978,
 1979,
 1993,
 1994,
 1997,
 1999,
 2000,
 20

In [41]:
%%time

# iterate over the list, attempting to re-scrape the lines and string
# NOTE: I was getting a 'ValueError: Must have equal len keys and value when setting with an iterable', but converting
# the list to a string first seemed to make that go away. I have to convert this entire column anyway next.
for i in lookups2:
    try:
        info = image_rescraper_poet(df.loc[i, 'poem_url'], df.loc[i, 'poet'])
        # pull both values out before writing so a malformed result cannot leave the row half-updated
        lines_as_string, poem_text = str(info[0]), info[1]
        df.loc[i,'poem_lines'] = lines_as_string
        df.loc[i,'poem_string'] = poem_text
        print(f'Success -- {i}')
    # catch Exception rather than using a bare except, so KeyboardInterrupt can still stop the run
    except Exception:
        print(f'Failure -- {i}')

Success -- 169
Success -- 171
Failure -- 183
Failure -- 184
Failure -- 200
Failure -- 203
Success -- 210
Failure -- 229
Failure -- 254
Failure -- 283
Failure -- 324
Failure -- 325
Success -- 458
Success -- 466
Success -- 482
Success -- 484
Success -- 487
Success -- 490
Success -- 503
Success -- 511
Success -- 512
Failure -- 513
Success -- 531
Success -- 532
Success -- 558
Success -- 568
Failure -- 576
Failure -- 578
Success -- 624
Success -- 626
Failure -- 648
Success -- 660
Success -- 661
Success -- 663
Success -- 664
Success -- 694
Success -- 701
Success -- 702
Success -- 703
Failure -- 704
Success -- 705
Failure -- 707
Success -- 708
Success -- 711
Success -- 714
Failure -- 715
Success -- 716
Failure -- 717
Success -- 719
Success -- 727
Success -- 736
Success -- 749
Failure -- 751
Failure -- 753
Failure -- 769
Failure -- 770
Success -- 834
Success -- 853
Success -- 872
Failure -- 881
Success -- 885
Success -- 886
Failure -- 892
Success -- 897
Failure -- 900
Failure -- 917
Success --

### Round 3

In [42]:
# index labels of the rows whose poem_lines value is still missing
lookups3 = df.index[df['poem_lines'].isna()].tolist()
lookups3

[183,
 184,
 200,
 203,
 229,
 254,
 283,
 324,
 325,
 513,
 576,
 578,
 648,
 704,
 707,
 715,
 717,
 751,
 753,
 769,
 770,
 881,
 892,
 900,
 917,
 940,
 943,
 945,
 946,
 947,
 1025,
 1163,
 1169,
 1184,
 1234,
 1297,
 1299,
 1319,
 1363,
 1367,
 1371,
 1383,
 1392,
 1404,
 1440,
 1446,
 1456,
 1467,
 1468,
 1477,
 1482,
 1489,
 1495,
 1496,
 1498,
 1500,
 1502,
 1503,
 1505,
 1552,
 1554,
 1587,
 1594,
 1604,
 1617,
 1618,
 1623,
 1711,
 1834,
 1836,
 1837,
 1839,
 1865,
 1870,
 1915,
 1975,
 1976,
 1977,
 1978,
 1979,
 1993,
 1997,
 2003,
 2008,
 2011,
 2013,
 2019,
 2021,
 2023,
 2026,
 2032,
 2037,
 2042,
 2044,
 2050,
 2055,
 2091,
 2092,
 2093,
 2117,
 2122,
 2123,
 2156,
 2163,
 2165,
 2171,
 2193,
 2206,
 2240,
 2249,
 2293,
 2307,
 2310,
 2336,
 2349,
 2412,
 2417,
 2421,
 2424,
 2425,
 2434,
 2444,
 2451,
 2452,
 2457,
 2458,
 2461,
 2464,
 2488,
 2492,
 2528,
 2546,
 2572,
 2647,
 2648,
 2649,
 2728,
 2730,
 2744,
 2746,
 2776,
 2787,
 2803,
 2829,
 2851,
 2869,
 2877,
 

In [46]:
%%time

# iterate over the list, attempting to re-scrape the lines and string
# NOTE: I was getting a 'ValueError: Must have equal len keys and value when setting with an iterable', but converting
# the list to a string first seemed to make that go away. I have to convert this entire column anyway next.
for i in lookups3:
    try:
        info = image_rescraper_POETRY(df.loc[i, 'poem_url'])
        # pull both values out before writing so a malformed result cannot leave the row half-updated
        lines_as_string, poem_text = str(info[0]), info[1]
        df.loc[i,'poem_lines'] = lines_as_string
        df.loc[i,'poem_string'] = poem_text
        print(f'Success -- {i}')
    # catch Exception rather than using a bare except, so KeyboardInterrupt can still stop the run
    except Exception:
        print(f'Failure -- {i}')

Failure -- 183
Failure -- 184
Success -- 200
Success -- 203
Success -- 229
Success -- 254
Failure -- 283
Success -- 324
Success -- 325
Success -- 513
Success -- 576
Success -- 578
Success -- 648
Success -- 704
Success -- 707
Success -- 715
Success -- 717
Success -- 751
Success -- 753
Success -- 769
Success -- 770
Success -- 881
Failure -- 892
Success -- 900
Success -- 917
Success -- 940
Success -- 943
Failure -- 945
Success -- 946
Success -- 947
Success -- 1025
Success -- 1163
Success -- 1169
Success -- 1184
Success -- 1234
Failure -- 1297
Success -- 1299
Success -- 1319
Failure -- 1363
Success -- 1367
Failure -- 1371
Failure -- 1383
Failure -- 1392
Success -- 1404
Failure -- 1440
Failure -- 1446
Success -- 1456
Success -- 1467
Success -- 1468
Success -- 1477
Failure -- 1482
Failure -- 1489
Failure -- 1495
Failure -- 1496
Success -- 1498
Failure -- 1500
Success -- 1502
Success -- 1503
Success -- 1505
Failure -- 1552
Success -- 1554
Success -- 1587
Success -- 1594
Success -- 1604
Failur

In [47]:
# inspect the poem_lines value stored for index label 200 (one of the rows this round re-scraped)
df.loc[200,'poem_lines']

"['© SHE IS AS LOVELY-OFTEN', 'And tallness stood upon the sky like a sparkling mane', 'O she is as lovely-often as every day; the day', 'following the day . . the day of our lives, the brief day.', 'Within this moving room, this shadowy often-', 'ness of days where the little hurry of our lives is said. .', 'O as lovely-often as the moving wing of a bird.', 'But ah, alas, sooner or later each of us must', 'stand before that Roman Court, and be judged free of', 'even such lies as I told about the imperishable beauty of', 'her hair. But that time is not now, and even such lies as', 'I said about the enduring wonder of her grace, are lies', 'that contain within them the only truth by which a', 'man may live in this world.', 'she is as lovely-often as every day; the day', 'following the little day . . the day of our lives, ah, alas,', 'the brief day.', 'FIRST CAME THE LION-RIDER', 'First came the Lion-Rider, across the green', 'fields of the morning, holding golden in his golden', 'hands 

### Round 4

In [48]:
# index labels of the rows whose poem_lines value is still missing
lookups4 = df.index[df['poem_lines'].isna()].tolist()
lookups4

[183,
 184,
 283,
 892,
 945,
 1297,
 1363,
 1371,
 1383,
 1392,
 1440,
 1446,
 1482,
 1489,
 1495,
 1496,
 1500,
 1552,
 1617,
 1836,
 1839,
 1865,
 1870,
 1975,
 1976,
 1977,
 1978,
 1979,
 2003,
 2013,
 2050,
 2093,
 2122,
 2123,
 2412,
 2424,
 2434,
 2451,
 2452,
 2457,
 2458,
 2546,
 2572,
 2728,
 2776,
 2933,
 3004,
 3327,
 3335,
 3336,
 3452,
 4309]

In [60]:
%%time

# iterate over the list, attempting to re-scrape the lines and string
# NOTE: I reworked the image_rescraper_poet function from earlier, so I'm running that again
for i in lookups4:
    try:
        info = image_rescraper_poet(df.loc[i, 'poem_url'], df.loc[i, 'poet'])
        # pull both values out before writing so a malformed result cannot leave the row half-updated
        lines_as_string, poem_text = str(info[0]), info[1]
        df.loc[i,'poem_lines'] = lines_as_string
        df.loc[i,'poem_string'] = poem_text
        print(f'Success -- {i}')
    # catch Exception rather than using a bare except, so KeyboardInterrupt can still stop the run
    except Exception:
        print(f'Failure -- {i}')

Success -- 183
Success -- 184
Failure -- 283
Failure -- 892
Failure -- 945
Success -- 1297
Failure -- 1363
Failure -- 1371
Failure -- 1383
Failure -- 1392
Failure -- 1440
Failure -- 1446
Failure -- 1482
Failure -- 1489
Failure -- 1495
Failure -- 1496
Failure -- 1500
Failure -- 1552
Failure -- 1617
Failure -- 1836
Failure -- 1839
Failure -- 1865
Failure -- 1870
Failure -- 1975
Failure -- 1976
Failure -- 1977
Failure -- 1978
Failure -- 1979
Success -- 2003
Success -- 2013
Failure -- 2050
Success -- 2093
Failure -- 2122
Failure -- 2123
Failure -- 2412
Failure -- 2424
Success -- 2434
Failure -- 2451
Failure -- 2452
Failure -- 2457
Failure -- 2458
Failure -- 2546
Failure -- 2572
Failure -- 2728
Failure -- 2776
Failure -- 2933
Failure -- 3004
Success -- 3327
Success -- 3335
Success -- 3336
Failure -- 3452
Failure -- 4309
CPU times: user 5.96 s, sys: 798 ms, total: 6.75 s
Wall time: 1min 13s


### Round 5

In [61]:
# index labels of the rows whose poem_lines value is still missing
lookups5 = df.index[df['poem_lines'].isna()].tolist()
lookups5

[283,
 892,
 945,
 1363,
 1371,
 1383,
 1392,
 1440,
 1446,
 1482,
 1489,
 1495,
 1496,
 1500,
 1552,
 1617,
 1836,
 1839,
 1865,
 1870,
 1975,
 1976,
 1977,
 1978,
 1979,
 2050,
 2122,
 2123,
 2412,
 2424,
 2451,
 2452,
 2457,
 2458,
 2546,
 2572,
 2728,
 2776,
 2933,
 3004,
 3452,
 4309]

In [69]:
%%time

# iterate over the list, attempting to re-scrape the lines and string
# NOTE: this round calls image_rescraper_title, which is given the poem's title
# rather than the poet's name
for i in lookups5:
    try:
        info = image_rescraper_title(df.loc[i, 'poem_url'], df.loc[i, 'title'])
        # pull both values out before writing so a malformed result cannot leave the row half-updated
        lines_as_string, poem_text = str(info[0]), info[1]
        df.loc[i,'poem_lines'] = lines_as_string
        df.loc[i,'poem_string'] = poem_text
        print(f'Success -- {i}')
    # catch Exception rather than using a bare except, so KeyboardInterrupt can still stop the run
    except Exception:
        print(f'Failure -- {i}')

Success -- 283
Success -- 892
Success -- 945
Success -- 1363
Success -- 1371
Success -- 1383
Success -- 1392
Failure -- 1440
Success -- 1446
Failure -- 1482
Success -- 1489
Failure -- 1495
Success -- 1496
Failure -- 1500
Success -- 1552
Failure -- 1617
Success -- 1836
Failure -- 1839
Success -- 1865
Success -- 1870
Failure -- 1975
Failure -- 1976
Success -- 1977
Failure -- 1978
Failure -- 1979
Failure -- 2050
Success -- 2122
Failure -- 2123
Failure -- 2412
Success -- 2424
Failure -- 2451
Success -- 2452
Success -- 2457
Failure -- 2458
Success -- 2546
Success -- 2572
Success -- 2728
Failure -- 2776
Success -- 2933
Success -- 3004
Success -- 3452
Success -- 4309
CPU times: user 4.89 s, sys: 663 ms, total: 5.56 s
Wall time: 58.8 s


### A little excessive, but not bad!

In [73]:
# re-count the missing (NaN) values in each column
df.isna().sum()

poet_url          0
genre             0
poem_url          0
poet              0
title           214
year           1649
poem_lines        7
poem_string       9
dtype: int64

- **I'll drop the remaining rows with missing poem_lines values.**

In [75]:
# keep only the rows that have a poem_lines value
df = df.dropna(subset=['poem_lines'])

In [76]:
# re-count the missing (NaN) values in each column
df.isna().sum()

poet_url          0
genre             0
poem_url          0
poet              0
title           214
year           1649
poem_lines        0
poem_string       2
dtype: int64

- **The pages for the rows with missing poem_string values appear to be blank so I'll drop those.**

In [77]:
# show the rows whose poem_string value is missing
df.loc[df['poem_string'].isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
2941,https://www.poetryfoundation.org/poets/dylan-thomas,modern,https://www.poetryfoundation.org/poems/26804/poem-on-his-birthday-facs-drafts,Dylan Thomas,Poem on His Birthday [Facs. drafts],,[],
3230,https://www.poetryfoundation.org/poets/barbara-guest,new_york_school,https://www.poetryfoundation.org/poems/49367/imagined-room,Barbara Guest,Imagined Room,,[],


In [78]:
# keep only the rows that have a poem_string value (the pages for the others do appear blank)
df = df.dropna(subset=['poem_string'])

- **I'll try to fill in the title column using Regex.**

In [79]:
# index labels of the rows whose title value is missing
lookups_title = df.index[df['title'].isna()].tolist()
lookups_title

[166,
 251,
 275,
 285,
 306,
 459,
 460,
 462,
 463,
 469,
 470,
 471,
 472,
 514,
 517,
 521,
 522,
 523,
 552,
 556,
 557,
 559,
 561,
 563,
 567,
 619,
 631,
 639,
 641,
 642,
 696,
 710,
 779,
 780,
 830,
 831,
 906,
 908,
 922,
 924,
 986,
 999,
 1012,
 1046,
 1112,
 1136,
 1143,
 1164,
 1174,
 1261,
 1262,
 1296,
 1349,
 1455,
 1539,
 1540,
 1586,
 1588,
 1596,
 1599,
 1609,
 1757,
 1842,
 1848,
 1849,
 1903,
 1907,
 1908,
 1930,
 1935,
 1946,
 1947,
 1955,
 2028,
 2034,
 2118,
 2159,
 2160,
 2167,
 2177,
 2182,
 2188,
 2198,
 2210,
 2211,
 2212,
 2219,
 2223,
 2291,
 2363,
 2415,
 2426,
 2428,
 2460,
 2466,
 2493,
 2494,
 2522,
 2757,
 2758,
 2760,
 2767,
 2778,
 2781,
 2796,
 2806,
 2816,
 2820,
 2830,
 2845,
 2847,
 2858,
 2862,
 2864,
 2871,
 2953,
 2955,
 2969,
 2996,
 2997,
 3002,
 3008,
 3167,
 3271,
 3309,
 3346,
 3360,
 3369,
 3380,
 3381,
 3390,
 3430,
 3431,
 3433,
 3449,
 3456,
 3533,
 3592,
 3593,
 3641,
 3644,
 3677,
 3696,
 3704,
 3705,
 3707,
 3708,
 3709,
 3714,

In [80]:
%%time

# create regex pattern to capture the ending of the url
title_pattern = '.+/([a-z\-]*).*$'

# iterate over the list, attempting to fill in the title with re-stylized url ending
for i in lookups_title:
    title = re.search(title_pattern, df.loc[i,'poem_url'], re.I).group(1).replace('-', ' ').title()
    try:
        df.loc[i,'title'] = title
        print(f'Success -- {i}')
    except:
        print(f'Failure -- {i}')
        continue

Success -- 166
Success -- 251
Success -- 275
Success -- 285
Success -- 306
Success -- 459
Success -- 460
Success -- 462
Success -- 463
Success -- 469
Success -- 470
Success -- 471
Success -- 472
Success -- 514
Success -- 517
Success -- 521
Success -- 522
Success -- 523
Success -- 552
Success -- 556
Success -- 557
Success -- 559
Success -- 561
Success -- 563
Success -- 567
Success -- 619
Success -- 631
Success -- 639
Success -- 641
Success -- 642
Success -- 696
Success -- 710
Success -- 779
Success -- 780
Success -- 830
Success -- 831
Success -- 906
Success -- 908
Success -- 922
Success -- 924
Success -- 986
Success -- 999
Success -- 1012
Success -- 1046
Success -- 1112
Success -- 1136
Success -- 1143
Success -- 1164
Success -- 1174
Success -- 1261
Success -- 1262
Success -- 1296
Success -- 1349
Success -- 1455
Success -- 1539
Success -- 1540
Success -- 1586
Success -- 1588
Success -- 1596
Success -- 1599
Success -- 1609
Success -- 1757
Success -- 1842
Success -- 1848
Success -- 1849
Su

In [81]:
# confirm that no titles are missing any more
df.isna().sum(axis=0)

poet_url          0
genre             0
poem_url          0
poet              0
title             0
year           1647
poem_lines        0
poem_string       0
dtype: int64

- **I'll drop the year column, since the scrape left 1,647 of the 5,176 rows without a year.**

In [83]:
# drop the year column, which is missing for 1,647 rows;
# errors='ignore' keeps this cell re-runnable once the column is already gone
df.drop(columns='year', inplace=True, errors='ignore')
df.isna().sum()

poet_url       0
genre          0
poem_url       0
poet           0
title          0
poem_lines     0
poem_string    0
dtype: int64

In [84]:
# confirm the remaining number of rows and columns
df.shape

(5176, 7)

### Save a copy

In [87]:
# write the cleaned frame to disk
# NOTE(review): the later "SAVE IT!" cell writes df_trim to this same path,
# so this copy is overwritten on a full run -- confirm that is intended
df.to_csv('data/poetry_foundation_raw_rescrape.csv')


- **I'll look at a breakdown of genres and see if there are any I should get rid of.**
- **My initial thought is to limit the time period, so that the poems share broadly the same language (avoiding, say, the gap between Shakespearean English and modern English).**

In [88]:
# number of poems per genre, largest first
df['genre'].value_counts()

modern                            1279
victorian                          643
renaissance                        426
romantic                           398
imagist                            356
new_york_school                    264
black_mountain                     257
new_york_school_2nd_generation     192
language_poetry                    192
confessional                       176
black_arts_movement                165
georgian                           160
objectivist                        159
harlem_renaissance                 148
beat                               147
augustan                           114
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

In [89]:
# print the last column of the first Middle English row as a sample
print(df.loc[df['genre'] == 'middle_english'].iloc[0, -1])

Whan that Aprille with his shour
The droghte of March hath perc
And bath
Of which vertú engendr
Whan Zephirus eek with his swet
Inspir
The tendr
Hath in the Ram his half
And smal
That slepen al the nyght with open y
So priketh hem Natúre in hir corag
Thanne longen folk to goon on pilgrimag
And palmeres for to seken straung
To fern
And specially, from every shir
Of Eng
The hooly blisful martir for to sek
That hem hath holpen whan that they were seek

Bifil that in that seson on a day, 
In Southwerk at the Tabard as I lay, 
Redy to wenden on my pilgrymag
To Caunterbury with ful devout corag
At nyght were come into that hostelry
Wel nyne and twenty in a compaigny
Of sondry folk, by áventure y-fall
In felaweshipe, and pilgrimes were they all
That toward Caunterbury wolden ryd
The chambr
And wel we weren es
And shortly, whan the sonn
So hadde I spoken with hem everychon, 
That I was of hir felaweshipe anon, 
And mad
To take oure wey, ther as I yow devys

But nath
Er that I ferther in thi

- **Indeed, Middle English is definitely out.**

In [90]:
# remove every Middle English poem, then confirm the new shape
df = df.loc[df['genre'] != 'middle_english']
df.shape

(5166, 7)

In [91]:
# print the last column of the first Renaissance row as a sample
print(df.loc[df['genre'] == 'renaissance'].iloc[0, -1])

Long have I long’d to see my love againe,
   Still have I wisht, but never could obtaine it;
   Rather than all the world (if I might gaine it)
Would I desire my love’s sweet precious gaine.
Yet in my soule I see him everie day,
   See him, and see his still sterne countenaunce,
   But (ah) what is of long continuance,
Where majestie and beautie beares the sway?
Sometimes, when I imagine that I see him,
   (As love is full of foolish fantasies)
   Weening to kisse his lips, as my love’s fees,
I feele but aire: nothing but aire to bee him.
   Thus with Ixion, kisse I clouds in vaine:
   Thus with Ixion, feele I endles paine.





In [92]:
# print the last column of the second Augustan row as a sample
print(df.loc[df['genre'] == 'augustan'].iloc[1, -1])

And auld Robin Forbes hes gien tem a dance,
I pat on my speckets to see them aw prance;
I thout o’ the days when I was but fifteen,
And skipp’d wi’ the best upon Forbes’s green.
Of aw things that is I think thout is meast queer,
It brings that that’s by-past and sets it down here;
I see Willy as plain as I dui this bit leace,
When he tuik his cwoat lappet and deeghted his feace.

The lasses aw wonder’d what Willy cud see
In yen that was dark and hard featur’d leyke me;
And they wonder’d ay mair when they talk’d o’ my wit,
And slily telt Willy that cudn’t be it:
But Willy he laugh’d, and he meade me his weyfe,
And whea was mair happy thro’ aw his lang leyfe?
It’s e’en my great comfort, now Willy is geane,
The he offen said— nae place was leyke his awn heame!

I mind when I carried my wark to yon steyle
Where Willy was deykin, the time to beguile,
He wad fling me a daisy to put i’ my breast,
And I hammer’d my noddle to mek out a jest.
But merry or grave, Willy often w

- **According to Poetry Foundation's website, Renaissance and Augustan poems are from the years 1500 - 1780, and the differences in the English are fairly clear.**
- **For now, I'll drop these.**

In [93]:
# drop the Renaissance and Augustan poems in a single filter
# .copy() gives df_trim its own data, so the in-place edits made to it in later
# cells are not chained assignments on a slice of df
df_trim = df[~df['genre'].isin(['renaissance', 'augustan'])].copy()
df_trim.shape

(4626, 7)

In [94]:
# print the last column of the second Victorian row as a sample
print(df.loc[df['genre'] == 'victorian'].iloc[1, -1])

I
The evening comes, the fields are still. 
The tinkle of the thirsty rill, 
Unheard all day, ascends again; 
Deserted is the half-mown plain, 
Silent the swaths! the ringing wain, 
The mower's cry, the dog's alarms, 
All housed within the sleeping farms! 
The business of the day is done, 
The last-left haymaker is gone. 
And from the thyme upon the height, 
And from the elder-blossom white 
And pale dog-roses in the hedge, 
And from the mint-plant in the sedge, 
In puffs of balm the night-air blows 
The perfume which the day forgoes. 
And on the pure horizon far, 
See, pulsing with the first-born star, 
The liquid sky above the hill! 
The evening comes, the fields are still. 

       Loitering and leaping, 
       With saunter, with bounds— 
       Flickering and circling 
       In files and in rounds— 
       Gaily their pine-staff green 
       Tossing in air, 
       Loose o'er their shoulders white 
       Showering their hair— 
       See! the wild Maenads 
       Break from the

In [95]:
# print the last column of the second Romantic row as a sample
print(df.loc[df['genre'] == 'romantic'].iloc[1, -1])

Now in thy dazzling half-oped eye, 
Thy curled nose and lip awry, 
Uphoisted arms and noddling head, 
And little chin with crystal spread, 
Poor helpless thing! what do I see, 
That I should sing of thee? 

From thy poor tongue no accents come, 
Which can but rub thy toothless gum: 
Small understanding boasts thy face, 
Thy shapeless limbs nor step nor grace: 
A few short words thy feats may tell, 
And yet I love thee well. 

When wakes the sudden bitter shriek, 
And redder swells thy little cheek 
When rattled keys thy woes beguile, 
And through thine eyelids gleams the smile, 
Still for thy weakly self is spent 
Thy little silly plaint. 

But when thy friends are in distress. 
Thou’lt laugh and chuckle n’ertheless, 
Nor with kind sympathy be smitten, 
Though all are sad but thee and kitten; 
Yet puny varlet that thou art, 
Thou twitchest at the heart. 

Thy smooth round cheek so soft and warm; 
Thy pinky hand and dimpled arm; 
Thy silken locks that scantly peep, 
With gold tipped end

- **Romantic and Victorian poems are from 1781-1900, but their language seems fairly similar to modern English.**
- **Plus, these are some very formative genres for poetry in English. For now, I'll keep these.**

- **All other genres are from after 1900.**

In [96]:
# let's reindex
# (drop=True discards the old labels; the hard-coded row labels used by the
# rescrape cells below refer to this new index)
df_trim.reset_index(drop=True, inplace=True)

## Rescraping (again)
- **Look more closely at how the scraping went.**
- **Eventually, I'll want to create some new features, like number of lines and average line length.**
    - **Since I can't divide by zero, this is a good opportunity to look for any unsuccessful scrapes--those where 0 or too few lines were scraped.**
    - **NOTE: I'm checking if length of poem_lines is less than or equal to 1 because that yielded the desired results, whereas seeing if length equaled 0 did not.**

In [97]:
# show the rows whose poem_lines value has a length of 0 or 1
df_trim[df_trim['poem_lines'].map(len) <= 1]

Unnamed: 0,poet_url,genre,poem_url,poet,title,poem_lines,poem_string
222,https://www.poetryfoundation.org/poets/henry-dumas,black_arts_movement,https://www.poetryfoundation.org/poems/53477/kef-21,Henry Dumas,Kef 21,"[First there was the earth in my mouth. It was there like a running stream, the July fever sweating the delirium of August, and the green buckling...","First there was the earth in my mouth. It was there like a running stream, the July fever sweating the delirium of August, and the green buckling ..."
428,https://www.poetryfoundation.org/poets/robert-duncan,black_mountain,https://www.poetryfoundation.org/poems/46316/a-poem-beginning-with-a-line-by-pindar,Robert Duncan,A Poem Beginning with a Line by Pindar,[I],I
703,https://www.poetryfoundation.org/poets/anne-sexton,confessional,https://www.poetryfoundation.org/poems/152252/o-ye-tongues,Anne Sexton,O Ye Tongues,[First Psalm],First Psalm
952,https://www.poetryfoundation.org/poets/wilfred-owen,georgian,https://www.poetryfoundation.org/poems/57369/the-send-off,Wilfred Owen,The Send-Off,[ ],
953,https://www.poetryfoundation.org/poets/wilfred-owen,georgian,https://www.poetryfoundation.org/poems/57347/smile-smile-smile,Wilfred Owen,"Smile, Smile, Smile","[Head to limp head, the sunk-eyed wounded scanned]","Head to limp head, the sunk-eyed wounded scanned"
1231,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poems/53772/spring-day-56d233626c49b,Amy Lowell,Spring Day,[<em> Bath</em>],<em> Bath</em>
1234,https://www.poetryfoundation.org/poets/amy-lowell,imagist,https://www.poetryfoundation.org/poems/53773/towns-in-colour,Amy Lowell,Towns in Colour,"[Red slippers in a shop-window, and outside in the street, flaws of grey, windy sleet!]","Red slippers in a shop-window, and outside in the street, flaws of grey, windy sleet!"
1389,https://www.poetryfoundation.org/poets/william-carlos-williams,imagist,https://www.poetryfoundation.org/poems/54567/kora-in-hell-improvisations-xi,William Carlos Williams,Kora in Hell: Improvisations XI,[XI],XI
1603,https://www.poetryfoundation.org/poets/lyn-hejinian,language_poetry,https://www.poetryfoundation.org/poems/47892/my-life-a-name-trimmed-with-colored-ribbons,Lyn Hejinian,My Life: A name trimmed with colored ribbons,[A name trimmed],A name trimmed
1615,https://www.poetryfoundation.org/poets/fanny-howe,language_poetry,https://www.poetryfoundation.org/poems/46762/everythings-a-fake,Fanny Howe,Everything’s a Fake,"[Coyote scruff in canyons off Mulholland Drive. Fragrance of sage and rosemary, now it’s spring. At night the mockingbirds ring their warnings of ...","Coyote scruff in canyons off Mulholland Drive. Fragrance of sage and rosemary, now it’s spring. At night the mockingbirds ring their warnings of c..."


- **After building out some specific rescraping functions, I can replace the poem_lines and poem_string values.**

In [100]:
# rescrape poems based on index from above
# each row label is paired with the rescraper function chosen for that poem
rescrape_plan = {
    428: PoemView_rescraper,
    703: PoemView_rescraper,
    952: poempara_rescraper,
    953: modified_regular_rescraper,
    1231: justify_rescraper,
    1234: justify_rescraper,
    1389: PoemView_rescraper,
    1603: PoemView_rescraper,
    2514: PoemView_rescraper,
    2517: PoemView_rescraper,
    3335: ranged_rescraper,
    3418: center_rescraper,
    3421: justify_rescraper,
    4217: poempara_rescraper,
    4611: poempara_rescraper,
}

for idx, rescraper in rescrape_plan.items():
    # call the rescraper once per poem and reuse its result for both columns,
    # so the lines and the string always come from the same scrape
    scraped = rescraper(df_trim.loc[idx, 'poem_url'])
    # the lines are stored as their string form (destringify is re-run in a later cell)
    df_trim.loc[idx, 'poem_lines'] = str(scraped[0])
    df_trim.loc[idx, 'poem_string'] = scraped[1]

In [104]:
# found some more...
for idx in (1388, 1390, 1391, 1392):
    # call the rescraper once per poem and reuse its result for both columns
    scraped = PoemView_rescraper(df_trim.loc[idx, 'poem_url'])
    df_trim.loc[idx, 'poem_lines'] = str(scraped[0])
    df_trim.loc[idx, 'poem_string'] = scraped[1]

In [106]:
# another one...
# call the rescraper once and reuse its result for both columns
scraped = image_rescraper(df_trim.loc[3399, 'poem_url'])
df_trim.loc[3399, 'poem_lines'] = str(scraped[0])
df_trim.loc[3399, 'poem_string'] = scraped[1]

- **Some scraped poems contain raw HTML markup instead of poem text, so I'll try to re-scrape those.**

In [108]:
# list the rows whose poem_string still contains an opening div tag
df_trim.loc[df_trim['poem_string'].str.contains('<div')]

Unnamed: 0,poet_url,genre,poem_url,poet,title,poem_lines,poem_string
237,https://www.poetryfoundation.org/poets/nikki-giovanni,black_arts_movement,https://www.poetryfoundation.org/poems/90181/no-complaints,Nikki Giovanni,No Complaints,"[, <div class=""c-epigraph"">\n<p>\n<div style=""font-style:italic;""><p><span style=""font-style:normal"">(For Gwendolyn Brooks, 1917—2001)</span></p><...","\n<div class=""c-epigraph"">\n<p>\n<div style=""font-style:italic;""><p><span style=""font-style:normal"">(For Gwendolyn Brooks, 1917—2001)</span></p></..."
1687,https://www.poetryfoundation.org/poets/ron-silliman,language_poetry,https://www.poetryfoundation.org/poems/55563/you-part-i,Ron Silliman,"You, part I","[, <div class=""c-epigraph"">\n<p>\n<div style=""font-style:italic;""><p><span style=""font-style:normal"">for Pat Silliman</span></p></div>\n</p>\n</di...","\n<div class=""c-epigraph"">\n<p>\n<div style=""font-style:italic;""><p><span style=""font-style:normal"">for Pat Silliman</span></p></div>\n</p>\n</div>\n"
1688,https://www.poetryfoundation.org/poets/ron-silliman,language_poetry,https://www.poetryfoundation.org/poems/55564/you-part-xii,Ron Silliman,"You, part XII","[, <div class=""c-epigraph"">\n<p>\n<div style=""font-style:italic;""><p><span style=""font-style:normal"">for Pat Silliman</span></p></div>\n</p>\n</di...","\n<div class=""c-epigraph"">\n<p>\n<div style=""font-style:italic;""><p><span style=""font-style:normal"">for Pat Silliman</span></p></div>\n</p>\n</div>\n"
4260,https://www.poetryfoundation.org/poets/emma-lazarus,victorian,https://www.poetryfoundation.org/poems/46791/by-the-waters-of-babylon,Emma Lazarus,By the Waters of Babylon,"[, <div class=""c-epigraph"">\n<p>\n<div style=""font-style:italic;""><div align=""center"">Little Poems in Prose</div></div>\n</p>\n</div>, ]","\n<div class=""c-epigraph"">\n<p>\n<div style=""font-style:italic;""><div align=""center"">Little Poems in Prose</div></div>\n</p>\n</div>\n"


In [159]:
# rescrape poems based on index from above
# each row label is paired with the rescraper function chosen for that poem
rescrape_plan = {
    237: PoemView_rescraper_2,
    1687: PoemView_rescraper,
    1688: PoemView_rescraper,
    4260: PoemView_rescraper,
}

for idx, rescraper in rescrape_plan.items():
    # call the rescraper once per poem and reuse its result for both columns,
    # so the lines and the string always come from the same scrape
    scraped = rescraper(df_trim.loc[idx, 'poem_url'])
    df_trim.loc[idx, 'poem_lines'] = str(scraped[0])
    df_trim.loc[idx, 'poem_string'] = scraped[1]

In [160]:
# apply destringify to every poem_lines value again
# (the rescrape cells above stored their results via str(...); presumably
# destringify turns those back into lists -- confirm in functions.py)
df_trim['poem_lines'] = df_trim['poem_lines'].map(destringify)

- **Re-check for any rows whose poem_lines value is empty rather than NaN.**

In [165]:
# show the rows whose poem_lines value has length 0
df_trim[df_trim['poem_lines'].map(len) == 0]

Unnamed: 0,poet_url,genre,poem_url,poet,title,poem_lines,poem_string
783,https://www.poetryfoundation.org/poets/randall-jarrell,fugitive,https://www.poetryfoundation.org/poetrymagazine/poems/25237/goodbye-wendover-goodbye-mountain-home,Randall Jarrell,Goodbye Wendover Goodbye Mountain Home,[],
1326,https://www.poetryfoundation.org/poets/ezra-pound,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/13071/dogmatic-statement-concerning-the-game-of-chess-theme-for-a-series-of-pictures,Ezra Pound,Dogmatic Statement Concerning The Game Of Chess Theme For A Series Of Pictures,[],
1433,https://www.poetryfoundation.org/poets/william-carlos-williams,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/20226/a-foot-note,William Carlos Williams,A Foot Note,[],
1438,https://www.poetryfoundation.org/poets/william-carlos-williams,imagist,https://www.poetryfoundation.org/poetrymagazine/poems/24855/paterson-book-ii,William Carlos Williams,Paterson Book Ii,[],
1736,https://www.poetryfoundation.org/poets/w-h-auden,modern,https://www.poetryfoundation.org/poetrymagazine/poems/22702/poem-he-watched-with-all-his,W. H. Auden,Poem He Watched With All His,[],
1738,https://www.poetryfoundation.org/poets/w-h-auden,modern,https://www.poetryfoundation.org/poetrymagazine/poems/21500/poem-o-who-can-ever-praise-enough-the-price,W. H. Auden,Poem O Who Can Ever Praise Enough The Price,[],
1775,https://www.poetryfoundation.org/poets/louise-bogan,modern,https://www.poetryfoundation.org/poetrymagazine/poems/21807/untitled-tender-and-insolent,Louise Bogan,Untitled Tender And Insolent,[],
1826,https://www.poetryfoundation.org/poets/hart-crane,modern,https://www.poetryfoundation.org/poetrymagazine/poems/17345/at-melvilles-tomb,Hart Crane,At Melvilles Tomb,[],
2056,https://www.poetryfoundation.org/poets/a-m-klein,modern,https://www.poetryfoundation.org/poetrymagazine/poems/23448/come-two-like-shadows,A. M. Klein,Come Two Like Shadows,[],
2582,https://www.poetryfoundation.org/poets/wallace-stevens,modern,https://www.poetryfoundation.org/poetrymagazine/poems/19837/good-man-bad-woman,Wallace Stevens,Good Man Bad Woman,[],


In [169]:
# gather the index labels of the rows whose poem_lines value has length 0
lookups6 = df_trim.index[(df_trim['poem_lines'].map(len) == 0).values].tolist()
lookups6

[783,
 1326,
 1433,
 1438,
 1736,
 1738,
 1775,
 1826,
 2056,
 2582,
 2685,
 2790,
 2817,
 3191]

In [174]:
%%time

# iterate over the list, attempting to re-scrape the lines and string
# NOTE: I reworked the image_rescraper_poet function from earlier, so I'm running that again
for i in lookups6:
    try:
        info = image_rescraper_title(df_trim.loc[i, 'poem_url'], df_trim.loc[i, 'title'])
        df_trim.loc[i,'poem_lines'] = str(info[0])
        df_trim.loc[i,'poem_string'] = info[1]
        print(f'Success -- {i}')
    except:
        print(f'Failure -- {i}')
        continue

Success -- 783
Success -- 1326
Success -- 1433
Success -- 1438
Success -- 1736
Success -- 1738
Success -- 1775
Success -- 1826
Success -- 2056
Success -- 2582
Success -- 2685
Success -- 2790
Success -- 2817
Failure -- 3191
CPU times: user 1.58 s, sys: 214 ms, total: 1.79 s
Wall time: 51.6 s


In [177]:
# row 3191 was the one failure in the loop above:
# set its title by hand, then rescrape using that title
last_idx = 3191
df_trim.loc[last_idx, 'title'] = 'Radio'
info = image_rescraper_title(
    df_trim.loc[last_idx, 'poem_url'],
    df_trim.loc[last_idx, 'title'],
)
df_trim.loc[last_idx, 'poem_lines'] = str(info[0])
df_trim.loc[last_idx, 'poem_string'] = info[1]

In [181]:
# apply destringify to every poem_lines value once more
# (the rescrape cells above stored their results via str(...))
df_trim['poem_lines'] = df_trim['poem_lines'].map(destringify)

## SAVE IT!

In [182]:
# write the trimmed, rescraped frame to disk
# NOTE(review): this is the same path the earlier "Save a copy" cell wrote df to,
# so that file is replaced here -- confirm that is intended
df_trim.to_csv('data/poetry_foundation_raw_rescrape.csv')