## libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
from textblob import TextBlob

ModuleNotFoundError: No module named 'requests'

In [55]:
class WebScraper:
    def __init__(self, url):
        self.__url = url

    @property
    def url(self):
        return self.__url
    
    @url.setter
    def url(self, url):
        self.__url = url
    
    def download_html(self, file_name='webpage.html'):
        try:
            print(f"Downloading page content from: {self.url}")
            response = requests.get(self.url)
            with open(file_name, 'w') as file:
                print(f"Saving page content to: {file_name}")
                file.write(response.text)
            print("Download completed successfully")
        except Exception as e:
            print("Unable to download page content")
            print(e)

    def extract_content(self, file_name='webpage.html'):

        paragraphs = []
        title = None

        try:
            print(f"Extracting content from: {file_name}")
            with open(file_name, 'r') as page:
                page_soup = BeautifulSoup(page, 'html.parser')
                
                print(f"Page parsed successfully")
                
                title = page_soup.title.string.strip()
                print(f"Page title extracted: {title}")
                
                paragraphs = page_soup.find_all('p')

                #clean up paragraphs removing spaces and new lines
                paragraphs = [p.text.strip() for p in paragraphs if p.text.strip() != '']

                print(f"Paragraphs extracted, total paragraphs: {len(paragraphs)}")
                
        except Exception as e:
            print("Unable to extract content")
            print(e)
        
        return title, paragraphs
    

    def save_text(self, file_name='extracted_content.txt'):
        
        try:
            title, paragraphs = self.extract_content()
            print(f"Saving extracted content to: {file_name}")

            with open(file_name, 'w') as file:
                
                file.write(f"{title}\n\n")

                for i, paragraph in enumerate(paragraphs):
                    file.write(f"{paragraph}\n\n")
            
            print("Content saved successfully")
        except Exception as e:
            
            print("Unable to save extracted content")
            print(e)

class TextProcessor:
    
    @staticmethod
    def clean_text(paragraphs):
        cleaned_paragraphs = []
        
        for paragraph in paragraphs:
            #remove html tags
            paragraph = BeautifulSoup(paragraph, 'html.parser').get_text()
            
            #remove special characters using regex
            paragraph = re.sub(r'[^a-zA-Z0-9\s]', '', paragraph)

            # remove extra spaces
            paragraph = re.sub(r'\s+', ' ', paragraph)
           
            #convert to lowercase
            paragraph = paragraph.lower()
            
            cleaned_paragraphs.append(paragraph)
        
        return cleaned_paragraphs
    


class SentimentAnalyzer:
        
        @staticmethod
        def analyze_sentiment(file_name='extracted_content.txt'):
            
            try:
                with open(file_name, 'r') as file:
                    paragraphs = file.readlines()
                    
                    for i, paragraph in enumerate(paragraphs):
                        blob = TextBlob(paragraph)
                        print(f"Paragraph {i+1}:")
                        print(paragraph)
                        print(f"Sentiment Polarity: {blob.sentiment.polarity}")
                        print(f"Sentiment Subjectivity: {blob.sentiment.subjectivity}")
                        print()
            except Exception as e:
                print("Unable to analyze sentiment")
                print(e)


        @staticmethod
        def count_sentiments(file_name='extracted_content.txt'):
            counts = {
                'positive': 0,
                'negative': 0,
                'neutral': 0
            }

            try:
                with open(file_name, 'r') as file:
                    paragraphs = file.readlines()
                    
                    for i, paragraph in enumerate(paragraphs):
                        blob = TextBlob(paragraph)
                        if blob.sentiment.polarity > 0:
                            counts['positive'] += 1
                        elif blob.sentiment.polarity < 0:
                            counts['negative'] += 1
                        else:
                            counts['neutral'] += 1
                    
                    print("Sentiment Counts:")
                    print(f"Positive: {counts['positive']}")
                    print(f"Negative: {counts['negative']}")
                    print(f"Neutral: {counts['neutral']}")
                    
            except Exception as e:
                print("Unable to count sentiments")
                print(e)

            return counts



## Testing block

In [56]:
webpage_url = 'https://www.africa.engineering.cmu.edu/'

web_scrapper = WebScraper(webpage_url)
web_scrapper.download_html()

title, paragraphs = web_scrapper.extract_content()

web_scrapper.save_text()

paragraphs = TextProcessor.clean_text(paragraphs)

# SentimentAnalyzer.analyze_sentiment()

SentimentAnalyzer.count_sentiments()

Downloading page content from: https://www.africa.engineering.cmu.edu/
Saving page content to: webpage.html
Download completed successfully
Extracting content from: webpage.html
Page parsed successfully
Page title extracted: CMU-Africa
Paragraphs extracted, total paragraphs: 53
Extracting content from: webpage.html
Page parsed successfully
Page title extracted: CMU-Africa
Paragraphs extracted, total paragraphs: 53
Saving extracted content to: extracted_content.txt
Content saved successfully
Positive paragraphs: 10
Negative paragraphs: 3
Neutral paragraphs: 95
