## Libraries

In [39]:
import requests
from bs4 import BeautifulSoup
import re

In [40]:
class WebScraper:
    """Download a web page, extract its title and paragraphs, and save them as text.

    The page URL is kept on the instance and exposed through the ``url``
    property. Every method reports progress and failures with ``print`` and
    never re-raises, so a failed step does not interrupt the calling cell.
    """

    def __init__(self, url):
        """Remember the URL of the page to scrape."""
        self.__url = url

    @property
    def url(self):
        """URL of the page this scraper downloads."""
        return self.__url

    @url.setter
    def url(self, url):
        self.__url = url

    def download_html(self, file_name='webpage.html', timeout=30):
        """Fetch ``self.url`` and write the response text to ``file_name``.

        Args:
            file_name: Path of the HTML file to create or overwrite.
            timeout: Seconds to wait for the server before giving up.

        Network errors, HTTP error statuses and file errors are printed,
        not raised.
        """
        try:
            print(f"Downloading page content from: {self.url}")
            # Without a timeout a stalled server would block the cell forever.
            response = requests.get(self.url, timeout=timeout)
            # A 4xx/5xx answer is a failure, not page content worth saving.
            response.raise_for_status()
            # Explicit UTF-8 so the write does not depend on the platform's
            # default encoding (which cannot represent every character).
            with open(file_name, 'w', encoding='utf-8') as file:
                print(f"Saving page content to: {file_name}")
                file.write(response.text)
            print("Download completed successfully")
        except Exception as e:
            print("Unable to download page content")
            print(e)

    def extract_content(self, file_name='webpage.html'):
        """Parse a saved HTML file and return its title and non-empty paragraphs.

        Args:
            file_name: Path of the HTML file to read.

        Returns:
            A ``(title, paragraphs)`` tuple. ``title`` is the stripped text of
            the ``<title>`` element, or ``None`` when the page has no title or
            the file could not be processed. ``paragraphs`` is a list with the
            stripped text of every ``<p>`` element that is not blank; it is
            empty when the file could not be processed.
        """
        paragraphs = []
        title = None

        try:
            print(f"Extracting content from: {file_name}")
            # errors='replace' keeps a file saved in another encoding readable
            # instead of aborting the whole extraction on one bad byte.
            with open(file_name, 'r', encoding='utf-8', errors='replace') as page:
                page_soup = BeautifulSoup(page, 'html.parser')

            print("Page parsed successfully")

            # A page without <title> must not prevent paragraph extraction.
            title_tag = page_soup.title
            if title_tag is not None:
                title = title_tag.get_text().strip()
            print(f"Page title extracted: {title}")

            # Strip each paragraph once, then drop the ones that end up empty.
            stripped_texts = (p.get_text().strip() for p in page_soup.find_all('p'))
            paragraphs = [text for text in stripped_texts if text]

            print(f"Paragraphs extracted, total paragraphs: {len(paragraphs)}")

        except Exception as e:
            print("Unable to extract content")
            print(e)

        return title, paragraphs

    def save_text(self, file_name='extracted_content.txt', html_file='webpage.html'):
        """Extract the content of ``html_file`` and write it to ``file_name``.

        The title (when present) comes first, then every paragraph, each
        followed by a blank line. Nothing is written when extraction yields
        neither a title nor paragraphs.

        Args:
            file_name: Path of the text file to create or overwrite.
            html_file: Path of the saved HTML file to extract from.
        """
        try:
            title, paragraphs = self.extract_content(html_file)

            if title is None and not paragraphs:
                # Avoid producing a file whose only content is the text "None".
                print("No content to save")
                return

            print(f"Saving extracted content to: {file_name}")

            with open(file_name, 'w', encoding='utf-8') as file:
                if title is not None:
                    file.write(f"{title}\n\n")

                for paragraph in paragraphs:
                    file.write(f"{paragraph}\n\n")

            print("Content saved successfully")
        except Exception as e:
            print("Unable to save extracted content")
            print(e)

class TextProcessor:
    """Stateless helpers for normalising scraped text."""

    @staticmethod
    def clean_text(paragraphs):
        """Return a cleaned copy of each paragraph, in the same order.

        For every paragraph: HTML tags are removed, every character that is
        not an ASCII letter, digit or whitespace is deleted, runs of
        whitespace collapse to one space, surrounding whitespace is trimmed
        and the result is lower-cased. A paragraph made only of symbols
        becomes an empty string, so the output has as many items as the input.

        Args:
            paragraphs: Iterable of strings, possibly containing HTML markup.

        Returns:
            A new list of cleaned strings.
        """
        cleaned_paragraphs = []

        for paragraph in paragraphs:
            # Remove HTML tags.
            text = BeautifulSoup(paragraph, 'html.parser').get_text()

            # Remove special characters (anything but ASCII alphanumerics/whitespace).
            text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

            # Collapse whitespace runs, then trim the single space that is left
            # at either end when the text started or ended with a removed symbol.
            text = re.sub(r'\s+', ' ', text).strip()

            # Convert to lowercase.
            cleaned_paragraphs.append(text.lower())

        return cleaned_paragraphs

## Testing block

In [41]:
# End-to-end check of WebScraper and TextProcessor against a live page.
webpage_url = 'https://www.africa.engineering.cmu.edu/'

# Download the page; with no argument it is saved to the default 'webpage.html'.
web_scrapper = WebScraper(webpage_url)
web_scrapper.download_html()

# Parse the saved file into a title and a list of non-empty paragraph strings.
title, paragraphs = web_scrapper.extract_content()

print(f"Title: {title}")

# Show the extracted paragraphs, numbered from 1.
for i, paragraph in enumerate(paragraphs):
    print(f"Paragraph {i+1}: {paragraph}")
    print()

# save_text() calls extract_content() again internally, so the extraction
# log lines are printed a second time here.
web_scrapper.save_text()


# NOTE: this rebinds `paragraphs` to the cleaned text; the raw paragraphs
# are no longer reachable under that name after this line.
paragraphs = TextProcessor.clean_text(paragraphs)

# Show the cleaned paragraphs, numbered from 1.
for i, paragraph in enumerate(paragraphs):
    print(f"Cleaned Paragraph {i+1}: {paragraph}")
    print()

Downloading page content from: https://www.africa.engineering.cmu.edu/
Saving page content to: webpage.html
Download completed successfully
Extracting content from: webpage.html
Page parsed successfully
Page title extracted: CMU-Africa
Paragraphs extracted, total paragraphs: 54
Title: CMU-Africa
Paragraph 1: Carnegie Mellon University Africa

Paragraph 2: Educating the next generation of African tech leaders and innovators

Paragraph 3: College of Engineering: #7

Paragraph 4: U.S. News & World Report, 2025

Paragraph 5: Electrical Engineering: #7

Paragraph 6: Electrical, Electronic, CommunicationU.S. News & World Report, 2025

Paragraph 7: Computer Engineering: #5

Paragraph 8: U.S. News & World Report, 2025

Paragraph 9: Learn more

Paragraph 10: Explore all rankings

Paragraph 11: Master of Science in Electrical and Computer Engineering(MS ECE)→

Paragraph 12: Master of Science in Information Technology(MSIT)→

Paragraph 13: Master of Science in Engineering Artificial Intelligence(