In [85]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

## Fiction Winners

In [86]:
# Send a GET request to the URL
url = 'https://en.wikipedia.org/wiki/Pulitzer_Prize_for_Fiction'
response = requests.get(url)
display(response.status_code)
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')


200

In [87]:
# Parse the HTML content using BeautifulSoup
#soup = BeautifulSoup(response.content, 'html.parser')
#print(soup.prettify())

### Table 1918-1978

In [88]:
#soup.select('#mw-content-text > div.mw-parser-output > table:nth-child(24)')

In [89]:
winners = soup.select("table")[1]
#winners

#### Year and Book Title 1918 to 1979

#mw-content-text > div.mw-parser-output > table:nth-child(24) > tbody > tr:nth-child(1) > td:nth-child(1) > b > a

In [90]:
winners.select('tr td b a')[1]['title']

'His Family'

In [91]:
titles = []
for name in winners.select('tr td b a'):
    book = name.get('title')
    titles.append(name['title'])
    
titles[0:10]

['1917 in literature',
 'His Family',
 '1918 in literature',
 'The Magnificent Ambersons',
 '1919 in literature',
 '1920 in literature',
 'The Age of Innocence',
 '1921 in literature',
 'Alice Adams (novel)',
 '1922 in literature']

In [92]:
len(titles)

115

In [93]:
# Adding NaN to empty winner books

# Loop through the titles list
i = 0
while i < len(titles)-1:
    # Check if two consecutive titles both contain the phrase 'in literature'
    if 'in literature' in titles[i] and 'in literature' in titles[i+1]:
        # If they do, insert a 'NaN' in between them
        titles.insert(i+1, 'NaN')
        i += 2
    else:
        # If not, move on to the next title
        i += 1

display(titles[0:15], len(titles))

['1917 in literature',
 'His Family',
 '1918 in literature',
 'The Magnificent Ambersons',
 '1919 in literature',
 'NaN',
 '1920 in literature',
 'The Age of Innocence',
 '1921 in literature',
 'Alice Adams (novel)',
 '1922 in literature',
 'One of Ours',
 '1923 in literature',
 'The Able McLaughlins',
 '1924 in literature']

124

In [94]:
# Checking that there's at least one different row in between 'in literature' rows
has_different_row = False
for i in range(len(titles)-2):
    if 'in literature' in titles[i] and 'in literature' in titles[i+2]:
        has_different_row = True
        break

print(has_different_row)

True


In [95]:
# Generating first dataframe

years = []
books = []

# Loop through the titles list and separate the years and book titles
for i in range(len(titles)):
    # Every other element contains a year
    if i % 2 == 0:
        years.append(titles[i])
    # Every other element contains a book title
    else:
        books.append(titles[i])

# Combine the two lists into a DataFrame
pulitzer_fiction = pd.DataFrame({
    'year': years,
    'book': books
})

# Print the resulting DataFrame
pulitzer_fiction


Unnamed: 0,year,book
0,1917 in literature,His Family
1,1918 in literature,The Magnificent Ambersons
2,1919 in literature,
3,1920 in literature,The Age of Innocence
4,1921 in literature,Alice Adams (novel)
5,1922 in literature,One of Ours
6,1923 in literature,The Able McLaughlins
7,1924 in literature,So Big (novel)
8,1925 in literature,Arrowsmith (novel)
9,1926 in literature,Early Autumn


In [96]:
# Clean up values
pulitzer_fiction['year'] = pulitzer_fiction['year'].str.replace(' in literature', '')
pulitzer_fiction['book'] = pulitzer_fiction['book'].str.replace('\(.*\)', '', regex=True).str.strip()

#pulitzer_fiction

In [97]:
# Correcting awards year

# Convert 'year' to numeric and add 1
pulitzer_fiction['year'] = pd.to_numeric(pulitzer_fiction['year'])
pulitzer_fiction['year'] += 1

# display dataframe
pulitzer_fiction

Unnamed: 0,year,book
0,1918,His Family
1,1919,The Magnificent Ambersons
2,1920,
3,1921,The Age of Innocence
4,1922,Alice Adams
5,1923,One of Ours
6,1924,The Able McLaughlins
7,1925,So Big
8,1926,Arrowsmith
9,1927,Early Autumn


In [98]:
# Saving dataframe
pulitzer_fiction.to_csv('pulitzer_fiction.csv')

#### Author, Publisher and Genre

In [99]:
display(winners.select('tr td a')[0]['title'],
        winners.select('tr td a')[1]['title'],
        winners.select('tr td a')[2]['title'],
        winners.select('tr td a')[3]['title'],
        winners.select('tr td a')[4]['title'],
        winners.select('tr td a')[5]['title'])

'1917 in literature'

'Ernest Poole'

'His Family'

'Macmillan Publishers'

'Novel'

'Illinois'

In [100]:
data = []

for winner in winners.select('tr td'):
    for a_tag in winner.select('a'):
        if 'title' in a_tag.attrs:
            data.append(a_tag['title'])


In [101]:
type(data)

list

In [102]:
# All data
#data

In [103]:
# Extracting author
author = []
for i in range(len(data)):
    if 'in literature' in data[i]:
        if i+1 < len(data) and 'in literature' not in data[i+1]:
            author.append(data[i+1])
        else:
            author.append('NaN')

In [104]:
author

['Ernest Poole',
 'Booth Tarkington',
 'NaN',
 'Edith Wharton',
 'Booth Tarkington',
 'Willa Cather',
 'Margaret Wilson (novelist)',
 'Edna Ferber',
 'Sinclair Lewis',
 'Louis Bromfield',
 'Thornton Wilder',
 'Julia Peterkin',
 'Oliver La Farge',
 'Margaret Ayer Barnes',
 'Pearl S. Buck',
 'T. S. Stribling',
 'Caroline Pafford Miller',
 'Josephine Johnson',
 'H. L. Davis',
 'Margaret Mitchell',
 'John P. Marquand',
 'Marjorie Kinnan Rawlings',
 'John Steinbeck',
 'NaN',
 'Ellen Glasgow',
 'Upton Sinclair',
 'Martin Flavin',
 'John Hersey',
 'NaN',
 'Robert Penn Warren',
 'James A. Michener',
 'James Gould Cozzens',
 'A. B. Guthrie Jr.',
 'Conrad Richter',
 'Herman Wouk',
 'Ernest Hemingway',
 'NaN',
 'William Faulkner',
 'MacKinlay Kantor',
 'NaN',
 'James Agee',
 'Robert Lewis Taylor',
 'Allen Drury',
 'Harper Lee',
 "Edwin O'Connor",
 'William Faulkner',
 'NaN',
 'Shirley Ann Grau',
 'Katherine Anne Porter',
 'Bernard Malamud',
 'William Styron',
 'N. Scott Momaday',
 'Jean Stafford'

In [105]:
# Add 'author' column to the dataframe and append values from 'author' list
pulitzer_fiction['author'] = author
pulitzer_fiction.head(10)

Unnamed: 0,year,book,author
0,1918,His Family,Ernest Poole
1,1919,The Magnificent Ambersons,Booth Tarkington
2,1920,,
3,1921,The Age of Innocence,Edith Wharton
4,1922,Alice Adams,Booth Tarkington
5,1923,One of Ours,Willa Cather
6,1924,The Able McLaughlins,Margaret Wilson (novelist)
7,1925,So Big,Edna Ferber
8,1926,Arrowsmith,Sinclair Lewis
9,1927,Early Autumn,Louis Bromfield


In [106]:
# Adding 'genre' column. All values are 'Novel' except rows with NaN
pulitzer_fiction.loc[pulitzer_fiction['author'] != 'NaN', 'genre'] = 'Novel'
pulitzer_fiction.loc[pulitzer_fiction['author'] == 'NaN', 'genre'] = 'NaN'

In [107]:
pulitzer_fiction

Unnamed: 0,year,book,author,genre
0,1918,His Family,Ernest Poole,Novel
1,1919,The Magnificent Ambersons,Booth Tarkington,Novel
2,1920,,,
3,1921,The Age of Innocence,Edith Wharton,Novel
4,1922,Alice Adams,Booth Tarkington,Novel
5,1923,One of Ours,Willa Cather,Novel
6,1924,The Able McLaughlins,Margaret Wilson (novelist),Novel
7,1925,So Big,Edna Ferber,Novel
8,1926,Arrowsmith,Sinclair Lewis,Novel
9,1927,Early Autumn,Louis Bromfield,Novel


## Table 1980s to 2022

In [108]:
#previous: soup.select('#mw-content-text > div.mw-parser-output > table:nth-child(24)')

In [109]:
# previous winners = soup.select("table")[1]
# winners
# winners.select('tr td b a')[1]['title']

In [110]:
soup.select('mw-content-text > div.mw-parser-output > table:nth-child(27) > tbody')

[]

In [111]:
winners2 = soup.select("table")[2]
#winners2

In [112]:
winners2.select('tr td b a')[0]['title']

'1979 in literature'

In [113]:
winners2.select('tr td b a')[1]['title']

"The Executioner's Song"

In [114]:
# Extracting year and book
titles2 = []
for name in winners2.select('tr td b a'):
    book = name.get('title')
    titles2.append(name['title'])
    
display(titles2[0:10], len(titles2))

['1979 in literature',
 "The Executioner's Song",
 '1980 in literature',
 'A Confederacy of Dunces',
 '1981 in literature',
 'Rabbit Is Rich',
 '1982 in literature',
 'The Color Purple',
 '1983 in literature',
 'Ironweed (novel)']

85

In [115]:
# Adding NaN to empty winner books

# Loop through the titles list
i = 0
while i < len(titles2)-1:
    # Check if two consecutive titles both contain the phrase 'in literature'
    if 'in literature' in titles2[i] and 'in literature' in titles2[i+1]:
        # If they do, insert a 'NaN' in between them
        titles2.insert(i+1, 'NaN')
        i += 2
    else:
        # If not, move on to the next title
        i += 1

display(titles2[0:15], len(titles2))

['1979 in literature',
 "The Executioner's Song",
 '1980 in literature',
 'A Confederacy of Dunces',
 '1981 in literature',
 'Rabbit Is Rich',
 '1982 in literature',
 'The Color Purple',
 '1983 in literature',
 'Ironweed (novel)',
 '1984 in literature',
 'Foreign Affairs (novel)',
 '1985 in literature',
 'Lonesome Dove',
 '1986 in literature']

86

In [116]:
# Checking that there's at least one different row in between 'in literature' rows
has_different_row = False
for i in range(len(titles2)-2):
    if 'in literature' in titles2[i] and 'in literature' in titles2[i+2]:
        has_different_row = True
        break

print(has_different_row)

True


In [117]:
# Generating first dataframe

years = []
books = []

# Loop through the titles list and separate the years and book titles
for i in range(len(titles2)):
    # Every other element contains a year
    if i % 2 == 0:
        years.append(titles2[i])
    # Every other element contains a book title
    else:
        books.append(titles2[i])

# Combine the two lists into a DataFrame
pulitzer_fiction2 = pd.DataFrame({
    'year': years,
    'book': books
})

# Print the resulting DataFrame
pulitzer_fiction2.head()


Unnamed: 0,year,book
0,1979 in literature,The Executioner's Song
1,1980 in literature,A Confederacy of Dunces
2,1981 in literature,Rabbit Is Rich
3,1982 in literature,The Color Purple
4,1983 in literature,Ironweed (novel)


In [118]:
# Clean up values
pulitzer_fiction2['year'] = pulitzer_fiction2['year'].str.replace(' in literature', '')
pulitzer_fiction2['book'] = pulitzer_fiction2['book'].str.replace('\(.*\)', '', regex=True).str.strip()

#pulitzer_fiction2

In [119]:
# Correcting awards year

# Convert 'year' to numeric and add 1
pulitzer_fiction2['year'] = pd.to_numeric(pulitzer_fiction2['year'])
pulitzer_fiction2['year'] += 1

# display dataframe
pulitzer_fiction2.head()

Unnamed: 0,year,book
0,1980,The Executioner's Song
1,1981,A Confederacy of Dunces
2,1982,Rabbit Is Rich
3,1983,The Color Purple
4,1984,Ironweed


In [120]:
# Saving dataframe
pulitzer_fiction2.to_csv('pulitzer_fiction2.csv')

In [121]:
data2 = []

for winner in winners2.select('tr td'):
    for a_tag in winner.select('a'):
        if 'title' in a_tag.attrs:
            data2.append(a_tag['title'])


In [122]:
data2[0:20]

['1979 in literature',
 'Norman Mailer',
 "The Executioner's Song",
 'Little, Brown and Company',
 'True crime',
 'New Jersey',
 'William Wharton (author)',
 'Birdy (novel)',
 'Philip Roth',
 'The Ghost Writer',
 '1980 in literature',
 'John Kennedy Toole',
 'A Confederacy of Dunces',
 'Louisiana State University Press',
 'Picaresque novel',
 'Louisiana',
 'Frederick Buechner',
 'Godric (novel)',
 'William Keepers Maxwell Jr.',
 'So Long, See You Tomorrow (novel)']

In [123]:
# Extracting author
author2 = []
for i in range(len(data2)):
    if 'in literature' in data2[i]:
        if i+1 < len(data2) and 'in literature' not in data2[i+1]:
            author2.append(data2[i+1])
        else:
            author2.append('NaN')

In [124]:
author2[0:10]

['Norman Mailer',
 'John Kennedy Toole',
 'John Updike',
 'Alice Walker',
 'William Kennedy (author)',
 'Alison Lurie',
 'Larry McMurtry',
 'Peter Taylor (writer)',
 'Toni Morrison',
 'Anne Tyler']

In [125]:
# Add 'author' column to the dataframe and append values from 'author' list
pulitzer_fiction2['author'] = author2
pulitzer_fiction2.tail(12)

Unnamed: 0,year,book,author
31,2011,A Visit from the Goon Squad,Jennifer Egan
32,2012,,Karen Russell
33,2013,The Orphan Master's Son,Adam Johnson (writer)
34,2014,The Goldfinch,Donna Tartt
35,2015,All the Light We Cannot See,Anthony Doerr
36,2016,The Sympathizer,Viet Thanh Nguyen
37,2017,The Underground Railroad,Colson Whitehead
38,2018,Less,Andrew Sean Greer
39,2019,The Overstory,Richard Powers
40,2020,The Nickel Boys,Colson Whitehead


In [126]:
# Correcting 2012 ('Author' value it's finalist, but there was no award)
pulitzer_fiction2.at[32, 'author'] = 'NaN'

In [127]:
pulitzer_fiction2.tail(12)

Unnamed: 0,year,book,author
31,2011,A Visit from the Goon Squad,Jennifer Egan
32,2012,,
33,2013,The Orphan Master's Son,Adam Johnson (writer)
34,2014,The Goldfinch,Donna Tartt
35,2015,All the Light We Cannot See,Anthony Doerr
36,2016,The Sympathizer,Viet Thanh Nguyen
37,2017,The Underground Railroad,Colson Whitehead
38,2018,Less,Andrew Sean Greer
39,2019,The Overstory,Richard Powers
40,2020,The Nickel Boys,Colson Whitehead


In [128]:
# Extract genre
#genre = []
#for i in range(len(data2)):
#    if 'in literature' in data2[i]:
#        if i+4 < len(data2):
#            genre.append(data2[i+4])
#        else:
#            genre.append('NaN')


In [129]:
# Adding 'genre' column. All values are 'Novel' except rows with NaN
pulitzer_fiction2.loc[pulitzer_fiction2['author'] != 'NaN', 'genre'] = 'Novel'
pulitzer_fiction2.loc[pulitzer_fiction2['author'] == 'NaN', 'genre'] = 'NaN'

In [130]:
pulitzer_fiction2

Unnamed: 0,year,book,author,genre
0,1980,The Executioner's Song,Norman Mailer,Novel
1,1981,A Confederacy of Dunces,John Kennedy Toole,Novel
2,1982,Rabbit Is Rich,John Updike,Novel
3,1983,The Color Purple,Alice Walker,Novel
4,1984,Ironweed,William Kennedy (author),Novel
5,1985,Foreign Affairs,Alison Lurie,Novel
6,1986,Lonesome Dove,Larry McMurtry,Novel
7,1987,A Summons to Memphis,Peter Taylor (writer),Novel
8,1988,Beloved,Toni Morrison,Novel
9,1989,Breathing Lessons,Anne Tyler,Novel


## Concatenate both dataframes

In [131]:
all_pulitzer_fiction_winners = pd.concat([pulitzer_fiction, pulitzer_fiction2], axis=0, ignore_index=True)

In [132]:
# Drop 'genre'
all_pulitzer_fiction_winners = all_pulitzer_fiction_winners.drop(['genre'],axis = 1)

In [133]:
all_pulitzer_fiction_winners

Unnamed: 0,year,book,author
0,1918,His Family,Ernest Poole
1,1919,The Magnificent Ambersons,Booth Tarkington
2,1920,,
3,1921,The Age of Innocence,Edith Wharton
4,1922,Alice Adams,Booth Tarkington
5,1923,One of Ours,Willa Cather
6,1924,The Able McLaughlins,Margaret Wilson (novelist)
7,1925,So Big,Edna Ferber
8,1926,Arrowsmith,Sinclair Lewis
9,1927,Early Autumn,Louis Bromfield


In [134]:
# Delete '(any text)' from author's name in 'authors' column
import re

all_pulitzer_fiction_winners['author'] = all_pulitzer_fiction_winners['author'].apply(lambda x: re.sub(r'\([^()]*\)', '', x).strip())

In [135]:
# NaN was string..
import numpy as np
all_pulitzer_fiction_winners = all_pulitzer_fiction_winners.replace("NaN", np.nan)

In [136]:
all_pulitzer_fiction_winners.isna()

Unnamed: 0,year,book,author
0,False,False,False
1,False,False,False
2,False,True,True
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False


In [137]:
# Replace NaN values for 'None awarded'
all_pulitzer_fiction_winners['book'] = all_pulitzer_fiction_winners['book'].fillna('None awarded')
all_pulitzer_fiction_winners['author'] = all_pulitzer_fiction_winners['author'].fillna('None awarded')


In [138]:
all_pulitzer_fiction_winners

Unnamed: 0,year,book,author
0,1918,His Family,Ernest Poole
1,1919,The Magnificent Ambersons,Booth Tarkington
2,1920,None awarded,None awarded
3,1921,The Age of Innocence,Edith Wharton
4,1922,Alice Adams,Booth Tarkington
5,1923,One of Ours,Willa Cather
6,1924,The Able McLaughlins,Margaret Wilson
7,1925,So Big,Edna Ferber
8,1926,Arrowsmith,Sinclair Lewis
9,1927,Early Autumn,Louis Bromfield


In [139]:
all_pulitzer_fiction_winners.tail(10)

Unnamed: 0,year,book,author
95,2013,The Orphan Master's Son,Adam Johnson
96,2014,The Goldfinch,Donna Tartt
97,2015,All the Light We Cannot See,Anthony Doerr
98,2016,The Sympathizer,Viet Thanh Nguyen
99,2017,The Underground Railroad,Colson Whitehead
100,2018,Less,Andrew Sean Greer
101,2019,The Overstory,Richard Powers
102,2020,The Nickel Boys,Colson Whitehead
103,2021,The Night Watchman,Louise Erdrich
104,2022,The Netanyahus: An Account of a Minor and Ulti...,Joshua Cohen


## URL

In [140]:
# URL example
# https://www.pulitzer.org/winners/joshua-cohen

In [141]:
# create a new list to store the URLs
url_list = []

# iterate through the authors and get their Pulitzer page URLs
for author in all_pulitzer_fiction_winners['author']:
    if author == 'None awarded':
        url_list.append('None awarded')
    else:
        author_url = 'https://www.pulitzer.org/winners/' + '-'.join(author.lower().split())
        author_url = author_url.strip()
        url_list.append(author_url)

In [142]:
url_list[:10]

['https://www.pulitzer.org/winners/ernest-poole',
 'https://www.pulitzer.org/winners/booth-tarkington',
 'None awarded',
 'https://www.pulitzer.org/winners/edith-wharton',
 'https://www.pulitzer.org/winners/booth-tarkington',
 'https://www.pulitzer.org/winners/willa-cather',
 'https://www.pulitzer.org/winners/margaret-wilson',
 'https://www.pulitzer.org/winners/edna-ferber',
 'https://www.pulitzer.org/winners/sinclair-lewis',
 'https://www.pulitzer.org/winners/louis-bromfield']

In [143]:
# Add the URLs as a new column in the main dataframe
all_pulitzer_fiction_winners['URL'] = url_list

In [146]:
pd.set_option('display.max_rows', None)
all_pulitzer_fiction_winners.tail(60)

Unnamed: 0,year,book,author,URL
45,1963,The Reivers,William Faulkner,https://www.pulitzer.org/winners/william-faulkner
46,1964,None awarded,None awarded,None awarded
47,1965,The Keepers of the House,Shirley Ann Grau,https://www.pulitzer.org/winners/shirley-ann-grau
48,1966,The Collected Stories of Katherine Anne Porter,Katherine Anne Porter,https://www.pulitzer.org/winners/katherine-ann...
49,1967,The Fixer,Bernard Malamud,https://www.pulitzer.org/winners/bernard-malamud
50,1968,The Confessions of Nat Turner,William Styron,https://www.pulitzer.org/winners/william-styron
51,1969,House Made of Dawn,N. Scott Momaday,https://www.pulitzer.org/winners/n.-scott-momaday
52,1970,The Collected Stories of Jean Stafford,Jean Stafford,https://www.pulitzer.org/winners/jean-stafford
53,1971,None awarded,None awarded,None awarded
54,1972,Angle of Repose,Wallace Stegner,https://www.pulitzer.org/winners/wallace-stegner


## Save to csv

In [145]:
all_pulitzer_fiction_winners.to_csv('pulitzer_fiction_winners_WEB_SCRAPING.csv')