# Beautiful Soup

| Home                                           | Desc                                               | Docs                                                   |
|------------------------------------------------|----------------------------------------------------|--------------------------------------------------------|
| https://www.crummy.com/software/BeautifulSoup/ | Beautiful Soup 可以从 HTML 或 XML 文件中提取数据 | https://www.crummy.com/software/BeautifulSoup/bs4/doc/ |

它是写“爬虫”的利器，通常与 requests 或 selenium 配合。

## Code Example

In [7]:
from bs4 import BeautifulSoup

# Read the HTML file. The encoding is passed explicitly so the result does not
# depend on the platform's default locale encoding.
# NOTE(review): assumes example.html is saved as UTF-8 - confirm.
with open("example.html", "r", encoding="utf-8") as file:
    html_content = file.read()

# type(html_content)
# print(html_content)

# Create a BeautifulSoup object using Python's built-in HTML parser
soup = BeautifulSoup(html_content, 'html.parser')

# type(soup)

# Extract and print the title (soup.title is the first <title> tag)
title = soup.title.text
print(f"Title: {title}")

# Extract and print the text inside the first <p> tag
paragraph = soup.p.text
print(f"Paragraph: {paragraph}")

# Extract and print the text inside each <li> tag in the first <ul>
list_items = soup.ul.find_all('li')
print("List Items:")
for item in list_items:
    print(item.text)


Title: Example Page
Paragraph: This is a sample paragraph.


In [78]:
import re
import requests
from bs4 import BeautifulSoup

URL = "https://www.jjwxc.net/onebook.php?novelid=31816&chapterid=2"

# Download the chapter page and parse the raw response bytes.
html = requests.get(URL)
html_content = html.content

soup = BeautifulSoup(html_content, 'html.parser')

# The first <h2> on the page is used as the chapter title.
str_chapter_title = soup.h2.text

# The chapter text container is located by its exact inline style attribute.
novel_body = soup.find('div', style='font-size: 16px;line-height: 1.8;padding: 0 19px 25px;font-family: \'Microsoft YaHei\', PingFangSC-Regular, HelveticaNeue-Light, \'Helvetica Neue Light\', sans-serif !important')

# recursive=False: keep only the strings that are direct children of the div.
# (The original referenced the undefined name `novelbody` here.)
novel_content = novel_body.find_all(string=True, recursive=False)
mid_novel_content = '\n'.join(novel_content).replace('\r\n', '').replace(' ', '')
# Replace every run of newlines with exactly two, i.e. one blank line between paragraphs.
str_novel_content = re.sub('\n+', '\n\n', mid_novel_content)

# print(f'\u3000\u3000{str_chapter_title}')
# print(str_novel_content)

# The output file is named after the chapter title.
file_path = f"{str_chapter_title}.txt"

with open(file_path, 'w', encoding='utf-8') as file:
    # Title is prefixed with two ideographic spaces (U+3000).
    file.write(f'\u3000\u3000{str_chapter_title}')
    file.write(str_novel_content)

print(f'{str_chapter_title}已经成功写入 {file_path}')


第1章已经成功写入 第1章.txt


In [79]:
import re
import requests
from bs4 import BeautifulSoup

def get_html_content(url, timeout=10):
    """Fetch the raw HTML of a page.

    Args:
        url (str): Address of the page to download.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        bytes: The undecoded response body (``response.content``).
    """
    response = requests.get(url, timeout=timeout)
    return response.content


def extract_novel_content(soup):
    """Extract and format the chapter text from a parsed chapter page.

    Args:
        soup: BeautifulSoup object of the chapter page.

    Returns:
        The formatted chapter text, or None when the text container div
        (matched by its exact inline style attribute) is not found.
    """
    novel_body = soup.find('div', style='font-size: 16px;line-height: 1.8;padding: 0 19px 25px;font-family: \'Microsoft YaHei\', PingFangSC-Regular, HelveticaNeue-Light, \'Helvetica Neue Light\', sans-serif !important')
    if novel_body is None:
        return None

    # recursive=False: keep only the strings that are direct children of the div.
    raw_text = '\n'.join(novel_body.find_all(string=True, recursive=False))
    compact_text = raw_text.replace('\r\n', '').replace(' ', '')
    # Replace every run of newlines with exactly two (one blank line between paragraphs).
    return re.sub('\n+', '\n\n', compact_text)


def write_to_file(file_path, chapter_title, novel_content):
    """Write the chapter title followed by the chapter text to a UTF-8 text file.

    Args:
        file_path (str): Destination path; an existing file is overwritten.
        chapter_title (str): Title, written with a two-ideographic-space prefix.
        novel_content (str): Chapter text, written directly after the title.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f'\u3000\u3000{chapter_title}')
        file.write(novel_content)


def main():
    """Download one chapter and save it to ``<chapter title>.txt``."""
    URL = "https://www.jjwxc.net/onebook.php?novelid=31816&chapterid=2"
    html_content = get_html_content(URL)

    soup = BeautifulSoup(html_content, 'html.parser')

    # The first <h2> is used as the chapter title; it may be absent if the
    # page layout differs from what is expected.
    heading = soup.h2
    novel_content = extract_novel_content(soup)

    if heading is None or not novel_content:
        print("Failed to extract novel content. Check the HTML structure.")
        return

    chapter_title = heading.text
    file_path = f"{chapter_title}.txt"
    write_to_file(file_path, chapter_title, novel_content)
    print(f'{chapter_title} has been successfully written to {file_path}')

if __name__ == "__main__":
    main()


第1章 has been successfully written to 第1章.txt


In [84]:
import requests
from bs4 import BeautifulSoup

URL = "https://www.jjwxc.net/onebook.php?novelid=31816"

# Download the novel's index page and parse the raw response bytes.
response = requests.get(URL)
html_content = response.content
soup = BeautifulSoup(html_content, 'html.parser')

# The chapter listing is the table with class "cytable" and id "oneboolt".
target_table = soup.find('table', attrs={'class': 'cytable', 'id': 'oneboolt'})

# Every chapter is one table row marked itemprop="chapter".
target_rows = target_table.find_all('tr', attrs={'itemprop': 'chapter'})

# Look up the link anchor of each row, then keep the href of every row that has one.
row_anchors = [row.find('a', attrs={'itemprop': 'url'}) for row in target_rows]
href_list = [anchor.get('href') for anchor in row_anchors if anchor]

['http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=1',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=2',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=3',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=4',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=5',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=6',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=7',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=8',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=9',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=10',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=11',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=12',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=13',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=14',
 'http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=15',
 'http://www.jjwxc.net/onebook.php?novelid=31816&

In [85]:
import requests
from bs4 import BeautifulSoup

# URL of the target webpage
NOVEL_URL = "https://www.jjwxc.net/onebook.php?novelid=31816"

def get_html_content(url, timeout=10):
    """
    Function to retrieve HTML content from a given URL.
    
    Args:
    - url (str): The URL of the webpage to fetch.
    - timeout (float): Seconds to wait for the server before giving up.
      Without a timeout, requests may block indefinitely.
    
    Returns:
    - bytes: The raw, undecoded body of the response (``response.content``).
    """
    response = requests.get(url, timeout=timeout)
    return response.content

def extract_chapter_links(html_content):
    """
    Function to extract chapter links from the HTML content using BeautifulSoup.
    
    Args:
    - html_content (str or bytes): The HTML content of the webpage.
    
    Returns:
    - list: The href of each chapter row's link, in document order. Empty when
      the chapter table is not present in the page.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the target table containing chapter information
    target_table = soup.find('table', {'class': 'cytable', 'id': 'oneboolt'})
    if target_table is None:
        # Page layout differs from what is expected: report "no chapters"
        # instead of failing with AttributeError on None.
        return []
    
    href_list = []

    # Walk all rows containing chapter information
    for row in target_table.find_all('tr', {'itemprop': 'chapter'}):
        # Find the anchor element within the row
        a_element = row.find('a', {'itemprop': 'url'})
        if a_element is None:
            continue
        # Skip anchors without an href so the result never contains None
        href_value = a_element.get('href')
        if href_value:
            href_list.append(href_value)
    
    return href_list

def main():
    """
    Fetch the novel's index page and print every chapter link found on it.
    """
    # Download the page, then pull the chapter URLs out of it
    page = get_html_content(NOVEL_URL)
    links = extract_chapter_links(page)

    # Header line first, then one link per line
    print("Extracted Chapter Links:")
    for url in links:
        print(url)

# Execute the main function if the script is run directly
if __name__ == "__main__":
    main()


Extracted Chapter Links:
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=1
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=2
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=3
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=4
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=5
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=6
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=7
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=8
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=9
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=10
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=11
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=12
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=13
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=14
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=15
http://www.jjwxc.net/onebook.php?novelid=31816&chapterid=16
http://www.jjwxc.net/one

In [88]:
import requests
from bs4 import BeautifulSoup

URL = "https://www.jjwxc.net/onebook.php?novelid=31816"

# Download the novel's index page and parse the raw response bytes.
response = requests.get(URL)
html_content = response.content
soup = BeautifulSoup(html_content, 'html.parser')

# The heading is the <h1 itemprop="name"> element ...
h1_tag = soup.find('h1', attrs={'itemprop': 'name'})

# ... and the text to print sits in its <span itemprop="articleSection"> child.
span_tag = h1_tag.find('span', attrs={'itemprop': 'articleSection'})

# Text of the span with surrounding whitespace removed.
result = span_tag.get_text().strip()

print(result)


碧甃沉（完结）


In [None]:
import requests


def extract_novel_author(self, soup):
    """Return the novel's author name from a parsed page.

    The author is read from the ``<span itemprop="author">`` inside the first
    ``<h2>`` of the page.

    Args:
        soup: BeautifulSoup object of the novel page.

    Returns:
        The author text with surrounding whitespace stripped, or None when the
        ``<h2>`` or the author span cannot be found (an error is reported via
        ``messagebox.showerror`` in that case).
    """
    # Look up the first <h2>; tolerate a missing soup the same way as a missing tag.
    h2_tag = soup.find('h2') if soup is not None else None

    # The original searched the undefined name `h1_tag` here, so the lookup
    # always failed; the author span must be searched inside the <h2>.
    span_tag = h2_tag.find('span', itemprop='author') if h2_tag is not None else None

    if span_tag is not None:
        return span_tag.text.strip()

    # NOTE(review): `messagebox` is not imported in the visible code;
    # presumably tkinter.messagebox from the enclosing application - confirm.
    messagebox.showerror('错误', '找不到对应的小说作者。')
    return None
    
