In [1]:
import requests
from bs4 import BeautifulSoup
import nltk

nltk.download('punkt')

def get_subcategory_link(url: str, subcategory: str):
    # Send a GET request to the genre page
    response = requests.get(base_url)
    
    if response.status_code == 200:
        # Parse the HTML content of the genre page
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find the subcategory link that matches the query
        subcategory_link_tag = soup.find('a', href=True, string=lambda text: text and subcategory.lower() in text.lower())
        
        if subcategory_link_tag:
            subcategory_link = 'https://www.gutenberg.org' + subcategory_link_tag['href']
            return subcategory_link
        else:
            subcategory_links = soup.find_all('a', href=True)
            for link in subcategory_links:
                href = link['href']
                text = link.get_text(strip=True)
                if href and text:
                    print(f"Available Options: {text}, Href: https://www.gutenberg.org{href}")
                    
            raise ValueError(f"No subcategory link found for query: {subcategory}")
    else:
        subcategory_links = soup.find_all('a', href=True)
        for link in subcategory_links:
            href = link['href']
            text = link.get_text(strip=True)
            if href and text:
                print(f"Available Options: {text}, Href: https://www.gutenberg.org{href}")
                
        raise ValueError(f"No subcategory link found for query: {subcategory}")


def get_book_links(subcategory_url: str):
    # Send a GET request to the subcategory page
    response = requests.get(subcategory_url)
    
    if response.status_code == 200:
        # Parse the HTML content of the subcategory page
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Find all <li> tags with the class 'booklink'
        book_links = soup.find_all('li', class_='booklink')
        
        # List to store links to .txt files
        txt_file_links = {}
    
        # Iterate over each book link and extract the title, author, and .txt file link
        for book in book_links:
            title_tag = book.find('span', class_='title')
            subtitle_tag = book.find('span', class_='subtitle')
            
            # Check if both title and subtitle tags exist
            if title_tag and subtitle_tag:
                title = title_tag.text
                author = subtitle_tag.text
                print(f'Title: {title}, Author: {author}')
                
                # Find the link to the book's page
                book_page_link = 'https://www.gutenberg.org' + book.find('a')['href']
                
                # Send a GET request to the book's page
                book_response = requests.get(book_page_link)
                if book_response.status_code == 200:
                    book_soup = BeautifulSoup(book_response.content, 'html.parser')
                    # Find the link to the .txt file
                    txt_link_tag = book_soup.find('a', href=True, string='Plain Text UTF-8')
                    if txt_link_tag:
                        txt_file_link = txt_link_tag['href']
                        txt_file_links[title] = txt_file_link
                        print(f'Text file link: {txt_file_link}')
                    else:
                        print('No .txt link found for this book.')
                else:
                    print(f"Failed to retrieve the book page. Status code: {book_response.status_code}")
            else:
                print('Title or author not found for one of the books.')

        return txt_file_links
    else:
        subcategory_links = soup.find_all('a', href=True)
        for link in subcategory_links:
            href = link['href']
            text = link.get_text(strip=True)
            if href and text:
                print(f"Available Options: {text}, Href: https://www.gutenberg.org{href}")
                
        raise ValueError(f"No subcategory link found for query: {subcategory} - Status code: {response.status_code}")

def get_sentences(txt_links: dict, intro_pct: float = 0.02):
    all_sentences = []

    for link in txt_links.values():
        # Construct the full URL
        full_url = 'https://www.gutenberg.org' + link
        
        # Fetch the .txt file content
        response = requests.get(full_url)
        if response.status_code == 200:
            content = response.text

            # Tokenize the text into words
            words = nltk.word_tokenize(content)
            
            # Remove the first 1000 words
            remaining_words = words[int(intro_pct * len(words)):]
            
            # Join the remaining words back into a string
            remaining_text = ' '.join(remaining_words)
            
            # Split the remaining text into sentences
            sentences = nltk.sent_tokenize(remaining_text)
            
            # Add the sentences to the list of all sentences
            all_sentences.extend(sentences)
        else:
            print(f"Failed to retrieve the .txt file from {full_url}. Status code: {response.status_code}")

    return all_sentences

def get_text_data(url: str, subcategory: str):
    # Get the link to the subcategory page
    subcategory_link = get_subcategory_link(url, subcategory)
    
    if subcategory_link:
        # Get the list of book .txt file links from the subcategory page
        txt_links = get_book_links(subcategory_link)
        
        # Get all sentences from the list of .txt file links
        all_sentences = get_sentences(txt_links)
        
        # Print the number of sentences and the first few sentences as a sample
        print(f"Total number of sentences: {len(all_sentences)}")
        return txt_links, all_sentences
    else:
        print("Failed to find subcategory link.")

url = 'https://www.gutenberg.org/ebooks/bookshelf/'
subcategory = 'World War II'
txt_links, all_sentences = get_text_data(url=url, subcategory=subcategory)
print(all_sentences[::10000])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Amram\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NameError: name 'base_url' is not defined