In [1]:
import requests
from bs4 import BeautifulSoup

# Collect the id and title of the first ten entries on Project Gutenberg's
# search listing sorted by download count.
popular_books = []

response = requests.get(
    'https://www.gutenberg.org/ebooks/search/?sort_order=downloads',
    timeout=30,  # requests never times out by default; a stalled server would hang the kernel
)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    books = soup.find_all('li', class_='booklink')[:10]
    for book in books:
        link = book.find('a', class_='link')
        title = book.find('span', class_='title')
        # find() returns None when an element is absent; skip such entries
        # instead of letting one malformed item abort the whole scrape.
        if link is None or title is None or not link.get('href'):
            continue
        # The book id is the last path segment of the link's href.
        book_id = link['href'].split('/')[-1]
        book_title = title.text.strip()
        popular_books.append({'id' : book_id, 'title' : book_title})
else:
    # Non-200 response: report the status code and leave popular_books empty.
    print(response.status_code)

for book in popular_books:
    print(book)

{'id': '2701', 'title': 'Moby Dick; Or, The Whale'}
{'id': '84', 'title': 'Frankenstein; Or, The Modern Prometheus'}
{'id': '1342', 'title': 'Pride and Prejudice'}
{'id': '1513', 'title': 'Romeo and Juliet'}
{'id': '145', 'title': 'Middlemarch'}
{'id': '2641', 'title': 'A Room with a View'}
{'id': '37106', 'title': 'Little Women; Or, Meg, Jo, Beth, and Amy'}
{'id': '100', 'title': 'The Complete Works of William Shakespeare'}
{'id': '67979', 'title': 'The Blue Castle: a novel'}
{'id': '16389', 'title': 'The Enchanted April'}


In [2]:
import random

# Pick one of the scraped books at random and download its HTML edition.
# Named book_id rather than id so the builtin id() is not shadowed.
book_id = random.choice(popular_books)['id']
url = f"https://www.gutenberg.org/files/{book_id}/{book_id}-h/{book_id}-h.htm"

r = requests.get(url, timeout=30)  # bound the wait; requests has no default timeout
# Stop here on a 4xx/5xx response; otherwise the error page itself would be
# tokenised and counted as if it were the book.
r.raise_for_status()
r.encoding = 'utf-8'  # decode the body as UTF-8 regardless of what requests guessed
html = r.text

html[:1000]

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\r\n"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\r\n<head>\r\n<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />\r\n<meta http-equiv="Content-Style-Type" content="text/css" />\r\n<title>The Project Gutenberg eBook of Moby Dick; Or the Whale, by Herman Melville</title>\r\n\r\n<style type="text/css" xml:space="preserve">\r\n\r\n    body {margin-left:15%; margin-right:15%; text-align:justify }\r\n    p { text-indent: 1em; margin-top: .25em; margin-bottom: .25em; }\r\n    H1,H2,H3,H4,H5,H6 { text-align: center; margin-left: 15%; margin-right: 15%; }\r\n    hr  { width: 50%; text-align: center;}\r\n    blockquote {font-size: 100%; margin-left: 0%; margin-right: 0%;}\r\n    .mynote    {background-color: #DDE; color: #000; padding: .5em; margin-left: 10%; margin-right: 10%; font-family: sans-serif; font-size: 95%;}\r\n    .toc       { margin

In [3]:
# Parse the downloaded page, then flatten the document tree to plain text.
soup = BeautifulSoup(markup=html, features='html.parser')
text = soup.get_text()

# Preview the first 1000 characters of the extracted text.
text[:1000]

'\n\n\n\n\nThe Project Gutenberg eBook of Moby Dick; Or the Whale, by Herman Melville\n\n\n\nThe Project Gutenberg eBook of Moby-Dick; or The Whale, by Herman Melville\n\r\nThis eBook is for the use of anyone anywhere in the United States and\r\nmost other parts of the world at no cost and with almost no restrictions\r\nwhatsoever. You may copy it, give it away or re-use it under the terms\r\nof the Project Gutenberg License included with this eBook or online\r\nat www.gutenberg.org. If you\r\nare not located in the United States, you will have to check the laws of the\r\ncountry where you are located before using this eBook.\r\n\nTitle: Moby-Dick; or The Whale\nAuthor: Herman Melville\nRelease Date: June, 2001 [eBook #2701]\r\n[Most recently updated: August 18, 2021]\nLanguage: English\nCharacter set encoding: UTF-8\nProduced by: Daniel Lazarus, Jonesey, and David Widger\n*** START OF THE PROJECT GUTENBERG EBOOK MOBY-DICK; OR THE WHALE ***\n\r\n      MOBY-DICK;or, THE WHALE.\r\n    \n

In [4]:
import nltk

# Split the text into runs of word characters (\w+); punctuation and
# whitespace are dropped because they never match the pattern.
tokenizer = nltk.tokenize.RegexpTokenizer(pattern='\\w+')
tokens = tokenizer.tokenize(text)

# Preview the first ten tokens.
tokens[:10]

['The',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'Moby',
 'Dick',
 'Or',
 'the',
 'Whale']

In [5]:
# Lower-case every token so that counting is case-insensitive.
words = list(map(str.lower, tokens))

# Preview the first ten normalised words.
words[:10]

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'moby',
 'dick',
 'or',
 'the',
 'whale']

In [6]:
# Load NLTK's English stopword list. The corpus is a separate download, so on
# a fresh environment the first access raises LookupError; fetch it once and
# retry so the notebook survives Restart & Run All.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [7]:
# Drop stopwords from the word list, preserving order and duplicates.
# Membership is tested against a set: the same result as testing against the
# list, but without rescanning the whole stopword list for every word.
stopword_set = set(stopwords)
words_ = [word for word in words if word not in stopword_set]

words_[:10]

['project',
 'gutenberg',
 'ebook',
 'moby',
 'dick',
 'whale',
 'herman',
 'melville',
 'project',
 'gutenberg']

In [8]:
from collections import Counter

# Tally how often each remaining word occurs, then print the 20 most frequent
# as (word, occurrences) pairs.
count = Counter()
count.update(words_)
print(count.most_common(20))

[('whale', 1244), ('one', 925), ('like', 647), ('upon', 568), ('man', 527), ('ship', 519), ('ahab', 517), ('ye', 473), ('sea', 455), ('old', 452), ('would', 432), ('though', 384), ('head', 348), ('yet', 345), ('boat', 337), ('long', 334), ('time', 334), ('captain', 329), ('still', 312), ('chapter', 308)]
