## libraries

In [32]:
import requests
from bs4 import BeautifulSoup

In [33]:
class WebScraper:
    def __init__(self, url):
        self.__url = url

    @property
    def url(self):
        return self.__url
    
    @url.setter
    def url(self, url):
        self.__url = url
    
    def download_html(self, file_name='webpage.html'):
        try:
            print(f"Downloading page content from: {self.url}")
            response = requests.get(self.url)
            with open(file_name, 'w') as file:
                print(f"Saving page content to: {file_name}")
                file.write(response.text)
            print("Download completed successfully")
        except Exception as e:
            print("Unable to download page content")
            print(e)

    def extract_content(self, file_name='webpage.html'):

        paragraphs = []
        title = None

        try:
            print(f"Extracting content from: {file_name}")
            with open(file_name, 'r') as page:
                page_soup = BeautifulSoup(page, 'html.parser')
                
                print(f"Page parsed successfully")
                
                title = page_soup.title.string.strip()
                print(f"Page title extracted: {title}")
                
                paragraphs = page_soup.find_all('p')

                #clean up paragraphs removing spaces and new lines
                paragraphs = [p.text.strip() for p in paragraphs if p.text.strip() != '']

                print(f"Paragraphs extracted, total paragraphs: {len(paragraphs)}")
                
        except Exception as e:
            print("Unable to extract content")
            print(e)
        
        return title, paragraphs
    

    def save_text(self, file_name='extracted_content.txt'):
        
        try:
            title, paragraphs = self.extract_content()
            print(f"Saving extracted content to: {file_name}")

            with open(file_name, 'w') as file:
                
                file.write(f"{title}\n\n")

                for i, paragraph in enumerate(paragraphs):
                    file.write(f"{paragraph}\n\n")
            
            print("Content saved successfully")
        except Exception as e:
            
            print("Unable to save extracted content")
            print(e)

## Testing block

In [34]:
web_scrapper = WebScraper('http://books.toscrape.com')
web_scrapper.download_html()

title, paragraphs = web_scrapper.extract_content()

print(f"Title: {title}")

for i, paragraph in enumerate(paragraphs):
    print(f"Paragraph {i+1}: {paragraph}")
    print()

web_scrapper.save_text()

Downloading page content from: http://books.toscrape.com
Saving page content to: webpage.html
Download completed successfully
Extracting content from: webpage.html
Page parsed successfully
Page title extracted: All products | Books to Scrape - Sandbox
Paragraphs extracted, total paragraphs: 40
Title: All products | Books to Scrape - Sandbox
Paragraph 1: Â£51.77

Paragraph 2: In stock

Paragraph 3: Â£53.74

Paragraph 4: In stock

Paragraph 5: Â£50.10

Paragraph 6: In stock

Paragraph 7: Â£47.82

Paragraph 8: In stock

Paragraph 9: Â£54.23

Paragraph 10: In stock

Paragraph 11: Â£22.65

Paragraph 12: In stock

Paragraph 13: Â£33.34

Paragraph 14: In stock

Paragraph 15: Â£17.93

Paragraph 16: In stock

Paragraph 17: Â£22.60

Paragraph 18: In stock

Paragraph 19: Â£52.15

Paragraph 20: In stock

Paragraph 21: Â£13.99

Paragraph 22: In stock

Paragraph 23: Â£20.66

Paragraph 24: In stock

Paragraph 25: Â£17.46

Paragraph 26: In stock

Paragraph 27: Â£52.29

Paragraph 28: In stock

Paragrap