In [None]:
import csv
import json

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Dataset

To obtain the dataset, we scraped the Poetry Foundation website (https://www.poetryfoundation.org/). First we obtain the number of poems for a given period, and then we iterate over all the result pages to obtain the title, the author and the text of each poem.

In [43]:
def extract_total_results(url, timeout=30):
    """Return the ``TotalResults`` value reported by a poem-listing endpoint.

    Parameters
    ----------
    url : str
        URL of a listing endpoint whose JSON body contains a ``TotalResults`` key.
    timeout : float, optional
        Seconds to wait for the server before aborting the request. Without a
        timeout ``requests`` waits indefinitely, which would freeze the notebook.

    Returns
    -------
    The value stored under ``TotalResults`` in the decoded JSON payload.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    KeyError
        If the payload has no ``TotalResults`` key.
    """
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque JSON decoding error
    # when the server returns an error page.
    response.raise_for_status()
    return response.json()['TotalResults']

In [45]:
def _fetch_poem_text(link, timeout=30):
    """Download one poem page and return its text, or None if it has no ``o-poem`` element."""
    poem_response = requests.get(link, timeout=timeout)
    poem_soup = BeautifulSoup(poem_response.text, 'html.parser')
    poem = poem_soup.find(class_='o-poem')
    if poem is None:  # When the poem is an image
        return None
    # To preserve the line structure of the poem, use '\n' as the separator instead.
    return poem.get_text(separator=' ', strip=True)


def extract_poems(url, period, timeout=30):
    """Scrape one listing page and return its poems as a DataFrame.

    Parameters
    ----------
    url : str
        URL of a listing endpoint whose JSON body contains an ``Entries`` list.
    period : str
        Label written to the ``period`` column of every returned row.
    timeout : float, optional
        Seconds to wait for each HTTP request (listing page and every poem page).

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns ``id``, ``title``, ``author``,
        ``snippet``, ``link``, ``categories`` (list of category titles),
        ``period`` and ``poem``. ``poem`` is None when the poem page has no
        ``o-poem`` element.

    Raises
    ------
    requests.HTTPError
        If the listing request returns a 4xx/5xx status code.
    requests.Timeout
        If any request does not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Extract attributes from each entry
    entries = []
    for entry in data['Entries']:
        entry_data = {
            'id': entry['id'],
            'title': entry['title'],
            'author': entry['author'],
            'snippet': entry['snippet'],
            'link': entry['link'],
            'categories': [category['title'] for category in entry['categories']],
            'period': period
        }

        # Fetch the poem content from the link and add it to the entry data
        entry_data['poem'] = _fetch_poem_text(entry_data['link'], timeout=timeout)

        entries.append(entry_data)

    # Build the DataFrame once from the collected records
    return pd.DataFrame(entries)

In [52]:
# Initialize an empty list to store individual DataFrames
dfs = []

# Maps each period label (stored in the `period` column) to the id that is
# passed as the `school-period` query parameter of the listing endpoint.
period_ids = {
    'Middle English': 158,
    'Augustan': 149,
    'Renaissance': 163,
    'Romantic': 164,
    'Victorian': 165,
    'Fugitive': 153,
    'Georgian': 154,
    'Harlem Renaissance': 155,
    'Imagist': 156,
    'Modern': 159,
    'Objectivist': 162,
    'Beat': 150,
    'Black Arts Movement': 304,
    'Black Mountain': 151,
    'Confessional': 152,
    'Language Poetry': 157,
    'New York School': 160,
    'New York School (2nd Generation)': 161
}

POEMS_PER_PAGE = 20  # 20 poems per page
LISTING_URL = "https://www.poetryfoundation.org/ajax/poems?page={}&sort_by=title&school-period={}"

# Loop over all the periods to find how many entries they have
for period_name, period_id in period_ids.items():
    total_results = extract_total_results(LISTING_URL.format(1, period_id))
    print("Extracting poems of ", period_name)

    # Ceiling division: a plain `// POEMS_PER_PAGE + 1` requests one extra,
    # empty page whenever the total is an exact multiple of the page size.
    page_count = (total_results + POEMS_PER_PAGE - 1) // POEMS_PER_PAGE

    # Pages are 1-based; extract a DataFrame from each page and append it to the list
    for page in range(1, page_count + 1):
        dfs.append(extract_poems(LISTING_URL.format(page, period_id), period_name))

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(dfs, ignore_index=True)

# Display the final DataFrame
print(len(final_df))
final_df.head()


Middle English
Augustan
Augustan
Augustan
Augustan
Augustan
Augustan
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Renaissance
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Romantic
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Victorian
Fugitive
Fugitive
Fugitive
Fugitive
Fugitive
Georgian
Georgian
Georgian
Georgian
Georgian
Georgian
Georgian
Georgian
Harlem Renaissance
Harlem Renai

Unnamed: 0,id,title,author,snippet,link,categories,period,poem
0,43926,The Canterbury Tales: General Prologue,By Geoffrey Chaucer,Whan that Aprille with his shour<strong>e</str...,https://www.poetryfoundation.org/poems/43926/t...,"[The Body, The Mind, Love, Activities, Eating ...",Middle English,Here bygynneth the Book of the tales of Caunte...
1,44295,"Confessio Amantis, Book III: The Tale of Apoll...",By John Gower,"Appolinus his levE tok,",https://www.poetryfoundation.org/poems/44295/c...,[],Middle English,"Appolinus his leve tok, To God and al the lond..."
2,159137,Ego Dormio: [All perishes and passes],By Richard Rolle,[Alle perisches and passes pat we with eghe see],https://www.poetryfoundation.org/poems/159137/...,"[Love, Classic Love, Religion, Christianity, G...",Middle English,[Alle perisches and passes pat we with eghe se...
3,44442,The King&#39;s Quire,"By James I, of Scotland","Bewailing in my chamber thus allone,",https://www.poetryfoundation.org/poems/44442/t...,[],Middle English,"Bewailing in my chamber thus allone, Despeired..."
4,43936,The Parlement of Fowls,By Geoffrey Chaucer,"Now welcome, somer, with thy sonne soft<b>e</b>,",https://www.poetryfoundation.org/poems/43936/t...,"[Love, Romantic Love, Relationships, Nature, S...",Middle English,"(excerpt) Now welcome, somer, with thy sonne s..."


In [58]:
# Save the scraped poems; csv.QUOTE_ALL (the named form of the literal 1 used
# before) puts quotes around every field of the output file.
final_df.to_csv("poems_data.csv", index=False, quoting=csv.QUOTE_ALL)