In [9]:
import requests
from bs4 import BeautifulSoup
from docx import Document
from fpdf import FPDF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Ensure required NLTK resources are available.
# Recent NLTK releases load the sentence/word tokenizer models from
# 'punkt_tab' while older ones use 'punkt', so fetch both to stay
# compatible with either; 'stopwords' backs the stop-word filter.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Function to scrape website data
def scrape_website(url, timeout=10):
    """Download *url* and return the text content of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block forever.

    Returns:
        The page text extracted by BeautifulSoup with a newline between
        text nodes, or ``None`` if the request fails or the server responds
        with a status code other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout, ...)
        # are reported the same way as a bad status: by returning None.
        print(f"Request to {url} failed: {exc}")
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text(separator='\n')

# Function to save scraped data in a well-formatted DOCX file
def save_to_docx(scraped_data, filename='Microsoft.docx',
                 heading_keyword='Inclusive Design'):
    """Write the scraped text to a DOCX file with simple heading structure.

    The document starts with a level-1 title. Every line containing
    *heading_keyword* becomes a level-2 heading; every other non-blank line
    becomes a paragraph. Blank lines are skipped.

    Args:
        scraped_data: Newline-separated text to store.
        filename: Path of the DOCX file to create.
        heading_keyword: Substring that marks a line as a section heading.
            Pass an empty string or ``None`` to emit paragraphs only.
    """
    document = Document()
    document.add_heading('Scraped Web Content', level=1)

    for line in scraped_data.split("\n"):
        # The truthiness check matters: '' is a substring of every line, so
        # an empty keyword would otherwise turn all lines into headings.
        if heading_keyword and heading_keyword in line:
            document.add_heading(line, level=2)
        elif line.strip():
            document.add_paragraph(line)

    document.save(filename)
    print(f"Data saved to {filename}")

# Function to save scraped data in PDF format
def save_to_pdf(scraped_data, filename='Microdoft.pdf'):
    """Write the scraped text to a PDF file, one text block per input line.

    Args:
        scraped_data: Newline-separated text to store.
        filename: Path of the PDF file to create.
            NOTE(review): the default looks like a typo for 'Microsoft.pdf';
            it is left unchanged so existing callers keep writing to the
            same path.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for line in scraped_data.split("\n"):
        # The built-in (core) fonts only cover Latin-1. Scraped web text
        # routinely contains other characters (curly quotes, dashes, emoji)
        # that would otherwise abort the export with an encoding error, so
        # replace anything unrepresentable with '?'.
        printable = line.encode('latin-1', 'replace').decode('latin-1')
        # Start every block at the left margin; some fpdf2 releases leave
        # the cursor at the right edge after multi_cell, which makes the
        # next full-width cell fail for lack of horizontal space.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, printable)

    pdf.output(filename)
    print(f"Data saved to {filename}")

# Function to preprocess text for summarization
def preprocess_text(text):
    """Split *text* into sentences and build a cleaned copy of each one.

    Cleaning lower-cases the sentence, tokenizes it, and keeps only
    alphanumeric tokens that are not English stop words; the surviving
    tokens are re-joined with single spaces.

    Returns:
        A ``(sentences, cleaned)`` pair of equal-length lists: the sentences
        as tokenized from *text*, and their cleaned counterparts in the same
        order.
    """
    raw_sentences = sent_tokenize(text)
    english_stops = set(stopwords.words("english"))

    def _clean(sentence):
        tokens = word_tokenize(sentence.lower())
        kept = [tok for tok in tokens
                if tok.isalnum() and tok not in english_stops]
        return " ".join(kept)

    return raw_sentences, [_clean(sentence) for sentence in raw_sentences]

# Function to perform extractive summarization using TF-IDF
def extractive_summary(text, num_sentences=5):
    """Build an extractive summary from the highest-scoring sentences.

    Each sentence is scored by the sum of the TF-IDF weights of its cleaned
    tokens. The top *num_sentences* sentences are joined with spaces,
    highest score first; ties keep their original document order.

    Args:
        text: Text to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The summary string. Empty when *text* yields no sentences or
        *num_sentences* is not positive. If no sentence contains a scorable
        term, the leading sentences are returned unranked.
    """
    # Local import keeps this block self-contained; numpy is already
    # required by scikit-learn, so this adds no new dependency.
    import numpy as np

    original_sentences, processed_sentences = preprocess_text(text)
    if num_sentences <= 0 or not original_sentences:
        return ""

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(processed_sentences)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when
        # every sentence consists solely of stop words or punctuation.
        return " ".join(original_sentences[:num_sentences])

    # sum(axis=1) yields an N x 1 matrix; flatten it to plain per-sentence
    # scores so the ranking compares numbers rather than 1 x 1 matrices.
    scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    # sorted() is stable even with reverse=True, so equal scores keep
    # document order.
    ranked = sorted(
        range(len(original_sentences)),
        key=lambda idx: scores[idx],
        reverse=True,
    )
    return " ".join(original_sentences[idx] for idx in ranked[:num_sentences])

# Main function to scrape, save, and summarize data
def main():
    """Scrape the target page, save it in the chosen format, and optionally
    print a summary.

    Prompts on stdin for the output format (1 = DOCX, 2 = PDF) and for
    whether to generate a summary. An invalid format choice is reported and
    the save step is skipped; the summary prompt still follows.
    """
    url = "https://inclusive.microsoft.design/"
    scraped_data = scrape_website(url)

    if not scraped_data:
        print("Failed to scrape the website.")
        return

    print("Website scraped successfully.")

    # Strip the answers so stray whitespace around a valid choice
    # (e.g. " 1" or "y ") is not rejected.
    savers = {'1': save_to_docx, '2': save_to_pdf}
    format_choice = input("Save as (1) Formatted DOCX or (2) PDF: ").strip()
    saver = savers.get(format_choice)
    if saver is None:
        print("Invalid choice! Please choose either 1 or 2.")
    else:
        saver(scraped_data)

    summary_choice = input("Would you like to generate a summary? (y/n): ").strip()
    if summary_choice.lower() == 'y':
        summary = extractive_summary(scraped_data, num_sentences=5)
        print("\nSummary:\n", summary)
    else:
        print("Summary generation skipped.")

# Run the workflow only when this module is the entry point, so importing
# it does not trigger the scrape or the interactive prompts in main().
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Website scraped successfully.


Save as (1) Formatted DOCX or (2) PDF:  1


Data saved to Microsoft.docx


Would you like to generate a summary? (y/n):  y



Summary:
 Download Inclusive Design for Mental Health cards (PDF)
Case Studies: Creating for Cognition
Browse our curated selection of stories about how employees and customers reduced cognitive exclusion in their products. Download inclusive activity cards (PDF)
Inclusive Design in action
At Microsoft, our technology is intended to deliver increased access, reduced friction, and more emotional context to the greatest number of people. Inclusive Design is for you
Whether you're a program manager, engineer, data scientist, designer, or anyone else who helps create products and services, Inclusive Design is a practice you can follow. Microsoft Inclusive Design
Skip to main content
Microsoft Inclusive Design
Inclusive Design is a methodology, born out of digital environments, that enables and draws on the full range of human diversity. Download Case Studies: Creating for Cognition (PDF)
Inclusive Design for Cognition Screeners
These screeners are questions for recruiting co-creators that