## libraries

In [21]:
import requests
from bs4 import BeautifulSoup
import re
from textblob import TextBlob
import os
from collections import Counter
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')  

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\STUDENT-3\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [24]:
class WebScraper:
    def __init__(self, url):
        self.__url = url

    @property
    def url(self):
        return self.__url
    
    @url.setter
    def url(self, url):
        self.__url = url
    
    def download_html(self, file_name='webpage.html'):
        try:
            print(f"Downloading page content from: {self.url}")
            response = requests.get(self.url)
            with open(file_name, 'w', encoding="utf-8") as file:
                print(f"Saving page content to: {file_name}")
                file.write(response.text)
            print("Download completed successfully")
        except Exception as e:
            print("Unable to download page content")
            print(e)


    def scrape_multiple_pages(self, urls, item_name='article'):
        for i, url in enumerate(urls):
            page_html = f"{item_name}{i+1}.html"
            file_name = f"{item_name}{i+1}.txt"
            
            self.url = url #update url

            self.download_html(page_html)
            self.save_text(file_name, page_html)

            # delete the html file
            os.remove(page_html)


    def extract_content(self, file_name='webpage.html'):

        paragraphs = []
        title = None

        try:
            print(f"Extracting content from: {file_name}")
            with open(file_name, 'r', encoding="utf-8") as page:
                page_soup = BeautifulSoup(page, 'html.parser')
                
                print(f"Page parsed successfully")
                
                title = page_soup.title.string.strip()
                print(f"Page title extracted: {title}")
                
                paragraphs = page_soup.find_all('p')

                #clean up paragraphs removing spaces and new lines
                paragraphs = [p.text.strip() for p in paragraphs if p.text.strip() != '']

                print(f"Paragraphs extracted, total paragraphs: {len(paragraphs)}")
                
        except Exception as e:
            print("Unable to extract content")
            print(e)
        
        return title, paragraphs
    

    def save_text(self, file_name='extracted_content.txt', from_file='webpage.html'):
        
        try:
            title, paragraphs = self.extract_content(from_file)
            print(f"Saving extracted content to: {file_name}")

            with open(file_name, 'w', encoding="utf-8") as file:
                title = title.strip()
                file.write(f"{title}\n\n")

                for paragraph in paragraphs:
                    paragraph = paragraph.strip()
                    file.write(f"{paragraph}\n\n")
            
            print("Content saved successfully")
        except Exception as e:
            print("Unable to save extracted content")
            print(e)

class TextProcessor:
    
    @staticmethod
    def clean_text(paragraphs):
        cleaned_paragraphs = []
        
        for paragraph in paragraphs:
            #remove html tags
            paragraph = BeautifulSoup(paragraph, 'html.parser').get_text()
            
            #remove special characters using regex
            paragraph = re.sub(r'[^a-zA-Z0-9\s]', '', paragraph)

            # remove extra spaces
            paragraph = re.sub(r'\s+', ' ', paragraph)
           
            #convert to lowercase
            paragraph = paragraph.lower()
            
            cleaned_paragraphs.append(paragraph)
        
        return cleaned_paragraphs
    
    
    @staticmethod
    def aggregate_texts(file_names, output_file='aggregated_content.txt'):  
        aggregated_content = ''
        for file_name in file_names:
            with open(file_name, 'r', encoding="utf-8") as file:
                content = file.read()
                aggregated_content += content + '\n\n'
        
        with open(output_file, 'w', encoding="utf-8") as file:
            file.write(aggregated_content)
        return aggregated_content

    
    @staticmethod
    def find_frequent_words(aggregated_content):
        text_blob =TextBlob(aggregated_content) 
        stop_words = set(stopwords.words('english'))
        filtered_words = [word for word in text_blob.words if word.lower() not in stop_words]  
        word_counts = Counter(filtered_words)
        top_10_most_words = word_counts.most_common(10)
        return top_10_most_words
    
    


class SentimentAnalyzer:
        
        @staticmethod
        def analyze_sentiment(file_name='extracted_content.txt'):
            
            try:
                with open(file_name, 'r', encoding="utf-8") as file:
                    paragraphs = file.readlines()
                    
                    for i, paragraph in enumerate(paragraphs):
                        blob = TextBlob(paragraph)
                        print(f"Paragraph {i+1}:")
                        print(paragraph)
                        print(f"Sentiment Polarity: {blob.sentiment.polarity}")
                        print(f"Sentiment Subjectivity: {blob.sentiment.subjectivity}")
                        print()
            except Exception as e:
                print("Unable to analyze sentiment")
                print(e)


        @staticmethod
        def count_sentiments(file_name='extracted_content.txt'):
            counts = {
                'positive': 0,
                'negative': 0,
                'neutral': 0
            }

            try:
                with open(file_name, 'r', encoding="utf-8") as file:
                    paragraphs = file.readlines()
                    
                    for i, paragraph in enumerate(paragraphs):
                        blob = TextBlob(paragraph)
                        if blob.sentiment.polarity > 0:
                            counts['positive'] += 1
                        elif blob.sentiment.polarity < 0:
                            counts['negative'] += 1
                        else:
                            counts['neutral'] += 1
                    
                    print("Sentiment Counts:")
                    print(f"Positive: {counts['positive']}")
                    print(f"Negative: {counts['negative']}")
                    print(f"Neutral: {counts['neutral']}")
                    
            except Exception as e:
                print("Unable to count sentiments")
                print(e)

            return counts

        @staticmethod
        def summarize_sentiment(paragraphs,file_name = "summary.txt"):
            with open(file_name, 'w', encoding="utf-8") as file:
                file.write(f"Total number of paragraphs analyzed: {len(paragraphs)}\n\n")
                counts = SentimentAnalyzer.count_sentiments()

                # polarity counts
                file.write(f"Positive: {counts['positive']}\n\n")
                file.write(f"Negative: {counts['negative']}\n\n")
                file.write(f"Neutral: {counts['neutral']}\n\n")

                # average
                #  sum / total 
                sum_polarity = 0
                sum_subjectivity = 0
                for paragraph in paragraphs:
                    blob = TextBlob(paragraph)
                    sum_polarity += blob.sentiment.polarity
                    sum_subjectivity += blob.sentiment.subjectivity

                # avg 
                avg_polarity = sum_polarity/len(paragraphs)
                avg_subjectivity = sum_subjectivity/len(paragraphs)
                file.write(f"The average polarity: {avg_polarity:.3f}\n\n")
                file.write(f"The average subjectivity: {avg_subjectivity:.3f}")

    
class Visualizer:
    def plot_sentiment(counts):
        import seaborn as sns
        import matplotlib.pyplot as plt

        labels = counts.keys()
        values = counts.values()
        
        sns.barplot(x=list(labels), y=list(values))
        plt.title('Sentiment Analysis')
        plt.xlabel('Sentiment')
        plt.ylabel('Count')
        plt.show()
    
        



## Testing block

In [25]:
webpage_url = 'https://www.africa.engineering.cmu.edu/'

urls = [
    'https://www.africa.engineering.cmu.edu/',
    'https://www.africa.engineering.cmu.edu/about',
    'https://www.africa.engineering.cmu.edu/programs',
    'https://www.africa.engineering.cmu.edu/research',
    'https://www.africa.engineering.cmu.edu/news',
    'https://www.africa.engineering.cmu.edu/events',
]

# web_scrapper = WebScraper(webpage_url)
# web_scrapper.download_html()

# title, paragraphs = web_scrapper.extract_content()

# web_scrapper.save_text()

# paragraphs = TextProcessor.clean_text(paragraphs)

# # SentimentAnalyzer.analyze_sentiment()

# counts = SentimentAnalyzer.count_sentiments()

# SentimentAnalyzer.summarize_sentiment(paragraphs)

# Visualizer.plot_sentiment(counts)

# web_scrapper.scrape_multiple_pages(urls, item_name='page')


file_names = ['page1.html', 'page2.html', 'page3.html', 'page4.html', 'page5.html', 'page6.html']

file_names = [f.replace('.html', '.txt') for f in file_names]

aggregated_content = TextProcessor.aggregate_texts(file_names, output_file='aggregated_content.txt')


# print("aggregate")
# SentimentAnalyzer.analyze_sentiment(file_name = 'aggregated_content.txt')

print("the top 10 most frequent words in the aggregated content are:")
TextProcessor.find_frequent_words(aggregated_content)



the top 10 most frequent words in the aggregated content are:


[('Africa', 65),
 ('Carnegie', 53),
 ('Mellon', 53),
 ('CMU-Africa', 50),
 ('University', 50),
 ('PM', 36),
 ('2024', 27),
 ('students', 24),
 ('’', 23),
 ('October', 21)]