In [208]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

## Fiction Winners

In [209]:

# Send a GET request to the URL
url = 'https://en.wikipedia.org/wiki/Pulitzer_Prize_for_Fiction'
response = requests.get(url)
display(response.status_code)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')


200

In [210]:
# Parse the HTML content using BeautifulSoup
#soup = BeautifulSoup(response.content, 'html.parser')
#print(soup.prettify())

### Table 1918-1978

In [211]:
#soup.select('#mw-content-text > div.mw-parser-output > table:nth-child(24)')

In [212]:
winners = soup.select("table")[1]
#winners

#### Year and Book Title 1918 to 1979

#mw-content-text > div.mw-parser-output > table:nth-child(24) > tbody > tr:nth-child(1) > td:nth-child(1) > b > a

In [213]:
winners.select('tr td b a')[1]['title']

'His Family'

In [214]:
titles = []
for name in winners.select('tr td b a'):
    book = name.get('title')
    titles.append(name['title'])
    
titles[0:10]

['1917 in literature',
 'His Family',
 '1918 in literature',
 'The Magnificent Ambersons',
 '1919 in literature',
 '1920 in literature',
 'The Age of Innocence',
 '1921 in literature',
 'Alice Adams (novel)',
 '1922 in literature']

In [215]:
len(titles)

115

In [216]:
# Adding NaN to empty winner books

# Loop through the titles list
i = 0
while i < len(titles)-1:
    # Check if two consecutive titles both contain the phrase 'in literature'
    if 'in literature' in titles[i] and 'in literature' in titles[i+1]:
        # If they do, insert a 'NaN' in between them
        titles.insert(i+1, 'NaN')
        i += 2
    else:
        # If not, move on to the next title
        i += 1

display(titles[0:15], len(titles))

['1917 in literature',
 'His Family',
 '1918 in literature',
 'The Magnificent Ambersons',
 '1919 in literature',
 'NaN',
 '1920 in literature',
 'The Age of Innocence',
 '1921 in literature',
 'Alice Adams (novel)',
 '1922 in literature',
 'One of Ours',
 '1923 in literature',
 'The Able McLaughlins',
 '1924 in literature']

124

In [217]:
# Checking that there's at least one different row in between 'in literature' rows
has_different_row = False
for i in range(len(titles)-2):
    if 'in literature' in titles[i] and 'in literature' in titles[i+2]:
        has_different_row = True
        break

print(has_different_row)

True


In [218]:
# Generating first dataframe

years = []
books = []

# Loop through the titles list and separate the years and book titles
for i in range(len(titles)):
    # Every other element contains a year
    if i % 2 == 0:
        years.append(titles[i])
    # Every other element contains a book title
    else:
        books.append(titles[i])

# Combine the two lists into a DataFrame
pulitzer_fiction = pd.DataFrame({
    'year': years,
    'book': books
})

# Print the resulting DataFrame
pulitzer_fiction


Unnamed: 0,year,book
0,1917 in literature,His Family
1,1918 in literature,The Magnificent Ambersons
2,1919 in literature,
3,1920 in literature,The Age of Innocence
4,1921 in literature,Alice Adams (novel)
...,...,...
57,1974 in literature,The Killer Angels
58,1975 in literature,Humboldt's Gift
59,1976 in literature,
60,1977 in literature,Elbow Room (short story collection)


In [222]:
# Clean up values
pulitzer_fiction['year'] = pulitzer_fiction['year'].str.replace(' in literature', '')
pulitzer_fiction['book'] = pulitzer_fiction['book'].str.replace('\(.*\)', '', regex=True).str.strip()

#pulitzer_fiction

In [224]:
# Correcting awards year

# Convert 'year' to numeric and add 1
pulitzer_fiction['year'] = pd.to_numeric(pulitzer_fiction['year'])
pulitzer_fiction['year'] += 1

# display dataframe
pulitzer_fiction

Unnamed: 0,year,book
0,1919,His Family
1,1920,The Magnificent Ambersons
2,1921,
3,1922,The Age of Innocence
4,1923,Alice Adams
...,...,...
57,1976,The Killer Angels
58,1977,Humboldt's Gift
59,1978,
60,1979,Elbow Room


In [225]:
# Saving dataframe
pulitzer_fiction.to_csv('pulitzer_fiction.csv')

#### Author, Publisher and Genre

In [226]:
display(winners.select('tr td a')[0]['title'],
        winners.select('tr td a')[1]['title'],
        winners.select('tr td a')[2]['title'],
        winners.select('tr td a')[3]['title'],
        winners.select('tr td a')[4]['title'],
        winners.select('tr td a')[5]['title'])

'1917 in literature'

'Ernest Poole'

'His Family'

'Macmillan Publishers'

'Novel'

'Illinois'

- Author

#mw-content-text > div.mw-parser-output > table:nth-child(24) > tbody > tr:nth-child(1) > td:nth-child(3) > a

In [187]:
authors = []
for name in winners.select('tr td a'):
    book = name.get('href')
    authors.append(name['href'])
    
authors

['/wiki/1917_in_literature',
 '/wiki/Ernest_Poole',
 '/wiki/His_Family',
 '/wiki/Macmillan_Publishers',
 '/wiki/Novel',
 '/wiki/Illinois',
 '/wiki/1918_in_literature',
 '/wiki/Booth_Tarkington',
 '/wiki/The_Magnificent_Ambersons',
 '/wiki/Doubleday_(publisher)',
 '/wiki/Indiana',
 '/wiki/1919_in_literature',
 '#cite_note-5',
 '/wiki/1920_in_literature',
 '/wiki/Edith_Wharton',
 '/wiki/The_Age_of_Innocence',
 '/wiki/D._Appleton_%26_Company',
 '/wiki/New_York_(state)',
 '/wiki/1921_in_literature',
 '/wiki/Booth_Tarkington',
 '/wiki/Alice_Adams_(novel)',
 '/wiki/Doubleday_(publisher)',
 '/wiki/Indiana',
 '/wiki/1922_in_literature',
 '/wiki/Willa_Cather',
 '/wiki/One_of_Ours',
 '/wiki/Alfred_A._Knopf',
 '/wiki/Virginia',
 '/wiki/1923_in_literature',
 '/wiki/Margaret_Wilson_(novelist)',
 '/wiki/The_Able_McLaughlins',
 '/wiki/Harper_(publisher)',
 '/wiki/Debut_novel',
 '/wiki/Iowa',
 '/wiki/1924_in_literature',
 '/wiki/Edna_Ferber',
 '/wiki/So_Big_(novel)',
 '/wiki/Grosset_%26_Dunlap',
 '/wi

In [188]:
len(authors)

299

In [189]:
authors = []
for name in winners.select('tr td a'):
    book = name.get('href')
    # extract only the last part of the link (after the last '/')
    author = book.split('/')[-1]
    # replace underscores with spaces
    author = author.replace('_', ' ')
    authors.append(author)

In [190]:
authors

['1917 in literature',
 'Ernest Poole',
 'His Family',
 'Macmillan Publishers',
 'Novel',
 'Illinois',
 '1918 in literature',
 'Booth Tarkington',
 'The Magnificent Ambersons',
 'Doubleday (publisher)',
 'Indiana',
 '1919 in literature',
 '#cite note-5',
 '1920 in literature',
 'Edith Wharton',
 'The Age of Innocence',
 'D. Appleton %26 Company',
 'New York (state)',
 '1921 in literature',
 'Booth Tarkington',
 'Alice Adams (novel)',
 'Doubleday (publisher)',
 'Indiana',
 '1922 in literature',
 'Willa Cather',
 'One of Ours',
 'Alfred A. Knopf',
 'Virginia',
 '1923 in literature',
 'Margaret Wilson (novelist)',
 'The Able McLaughlins',
 'Harper (publisher)',
 'Debut novel',
 'Iowa',
 '1924 in literature',
 'Edna Ferber',
 'So Big (novel)',
 'Grosset %26 Dunlap',
 'Michigan',
 '1925 in literature',
 'Sinclair Lewis',
 'Arrowsmith (novel)',
 '#cite note-7',
 'Harcourt (publisher)',
 'Minnesota',
 '1926 in literature',
 'Louis Bromfield',
 'Early Autumn',
 'Ohio',
 '1927 in literature',
 

In [191]:
# initialize empty lists for each category
year = []
winner = []
book = []
publisher = []
genre = []
origin = []

# loop through the authors list in steps of 6
for i in range(0, len(authors), 6):
    # check if there are enough elements left in the list
    if i+5 < len(authors):
        # append the corresponding values to each category list
        year.append(authors[i].split('/')[-1])
        winner.append(authors[i+1])
        book.append(authors[i+2])
        publisher.append(authors[i+3])
        genre.append(authors[i+4])
        origin.append(authors[i+5])

In [192]:
year

['1917 in literature',
 '1918 in literature',
 '#cite note-5',
 '1921 in literature',
 'Willa Cather',
 'The Able McLaughlins',
 'So Big (novel)',
 '#cite note-7',
 'Ohio',
 '1928 in literature',
 'Oliver La Farge',
 'Years of Grace',
 'John Day Company',
 'Doubleday (publisher)',
 'Georgia (U.S. state)',
 '1935 in literature',
 'Margaret Mitchell',
 'The Late George Apley',
 'The Yearling',
 'The Grapes of Wrath',
 'Ellen Glasgow',
 'Dragon%27s Teeth (novel)',
 'Harper (publisher)',
 'War novel',
 'Robert Penn Warren',
 'James A. Michener',
 'James Gould Cozzens',
 'The Way West',
 'The Town (Richter novel)',
 'Doubleday (publisher)',
 'Novella',
 'A Fable',
 'Penguin Books',
 'A Death in the Family',
 'The Travels of Jaimie McPheeters',
 'Doubleday (publisher)',
 'Southern Gothic',
 'Little, Brown and Company',
 'Mississippi',
 'Alfred A. Knopf',
 'Short story collection',
 'New York (state)',
 '1968 in literature',
 'Jean Stafford',
 '1971 in literature',
 'Eudora Welty',
 '1974 in 

In [193]:
# Genre

In [194]:
# Year



In [195]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.phlibraryfriends.org/page/pulitzer-prize-for-fiction'
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')
soup.prettify()

'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <meta content="Public" http-equiv="cache-control"/>\n  <meta content="Sun, 01 Jan 2040 00:00:00 UTC" http-equiv="expires"/>\n </head>\n <body onload="localStorage.clear();sessionStorage.clear()">\n </body>\n</html>\n'

In [196]:
soup.select('custom-page > table > tbody > tr:nth-child(2) > td:nth-child(1)')

[]

In [197]:
#custom-page > table > tbody > tr:nth-child(2) > td:nth-child(1)