In [1]:
import requests
from bs4 import BeautifulSoup
from docx import Document
from fpdf import FPDF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import unicodedata

# Ensure required NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')

# Function to scrape data from a webpage
def scrape_website(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator='\n')
    else:
        print(f"Failed to scrape the website. Status code: {response.status_code}")
        return None

# Function to clean scraped text
def clean_text(text):
    # Remove unnecessary information using regular expressions
    cleaned_text = re.sub(r'(subscribe|find your story|download media|follow us|contact us|privacy notice|cookie policy|social media|legal|corporate news|cookies|site uses cookies|download all|media cart|press release|related links)', '', text, flags=re.IGNORECASE)
    
    # Remove extra whitespace and fix inconsistent capitalization
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    # Fix sentence capitalization
    cleaned_text = '. '.join(sentence.capitalize() for sentence in cleaned_text.split('. '))
    
    return cleaned_text

# Function to save cleaned text to DOCX with URL
def save_to_docx(cleaned_text, url, output_path='Scraped_Content.docx'):
    doc = Document()
    doc.add_heading('Scraped Content', level=1)
    doc.add_paragraph(f"URL: {url}", style='Italic')  # Add URL at the top

    paragraphs = cleaned_text.split("\n\n")
    for para in paragraphs:
        doc.add_paragraph(para)
    
    doc.save(output_path)
    print(f"Cleaned content saved to {output_path}")

# Function to save cleaned text to PDF with URL
def save_to_pdf(cleaned_text, url, output_path='Scraped_Content.pdf'):
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Add URL at the top of the PDF
    pdf.multi_cell(0, 10, f"URL: {url}\n")

    # Add cleaned text
    pdf.multi_cell(0, 10, cleaned_text)
    
    pdf.output(output_path)
    print(f"Cleaned content saved to {output_path}")

# Function to preprocess text for summarization
def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    processed_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words_filtered = [word for word in words if word.isalnum() and word not in stop_words]
        processed_sentences.append(" ".join(words_filtered))
    
    return sentences, processed_sentences

# Function to perform extractive summarization using TF-IDF
def extractive_summary(text, num_sentences=5):
    original_sentences, processed_sentences = preprocess_text(text)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_sentences)

    sentence_scores = tfidf_matrix.sum(axis=1)
    ranked_sentences = [(score, sent) for score, sent in zip(sentence_scores, original_sentences)]
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])

    summary = " ".join([sent for _, sent in ranked_sentences[:num_sentences]])
    return summary

# Main function to scrape, clean, save, and summarize data
def main():
    # URL to scrape
    url = "https://news.adidas.com/training/adidas-brings-universal-design-principles-to-kit-for-paris-2024--to-optimise-fit-and-performance-for/s/675a0e1a-ddd7-428f-b15d-1713d59bb352"

    # Step 2: Scrape the website content
    scraped_data = scrape_website(url)
    
    if scraped_data:
        # Step 3: Clean the scraped text
        cleaned_text = clean_text(scraped_data)
        
        # Step 4: Save the cleaned content to a new DOCX file with URL
        save_to_docx(cleaned_text, url, output_path='Scraped_Content.docx')
        
        # Step 5: Optionally, save the cleaned content to a PDF file with URL
        save_to_pdf(cleaned_text, url, output_path='Scraped_Content.pdf')

        # Step 6: Ask the user if they want to generate a summary
        summary_choice = input("Would you like to generate a summary? (y/n): ")
        if summary_choice.lower() == 'y':
            summary = extractive_summary(cleaned_text, num_sentences=5)
            print("\nSummary:\n", summary)
        else:
            print("Summary generation skipped.")
    else:
        print("Failed to scrape the website. Exiting the process.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyError: "no style with name 'Italic'"

In [3]:
import requests
from bs4 import BeautifulSoup
from docx import Document
from fpdf import FPDF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import unicodedata

# Ensure required NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')

# Function to scrape data from a webpage
def scrape_website(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator='\n')
    else:
        print(f"Failed to scrape the website. Status code: {response.status_code}")
        return None

# Function to clean scraped text
def clean_text(text):
    # Remove unnecessary information using regular expressions
    cleaned_text = re.sub(r'(subscribe|find your story|download media|follow us|contact us|privacy notice|cookie policy|social media|legal|corporate news|cookies|site uses cookies|download all|media cart|press release|related links)', '', text, flags=re.IGNORECASE)
    
    # Remove extra whitespace and fix inconsistent capitalization
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    # Fix sentence capitalization
    cleaned_text = '. '.join(sentence.capitalize() for sentence in cleaned_text.split('. '))
    
    return cleaned_text

# Function to save cleaned text to DOCX with URL
def save_to_docx(cleaned_text, url, output_path='Scraped_Content.docx'):
    doc = Document()
    doc.add_heading('Scraped Content', level=1)

    # Add the URL in italic style
    para = doc.add_paragraph()
    run = para.add_run(f"URL: {url}")
    run.italic = True

    paragraphs = cleaned_text.split("\n\n")
    for para in paragraphs:
        doc.add_paragraph(para)
    
    doc.save(output_path)
    print(f"Cleaned content saved to {output_path}")

# Function to save cleaned text to PDF with URL
def save_to_pdf(cleaned_text, url, output_path='Scraped_Content.pdf'):
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Add URL at the top of the PDF
    pdf.multi_cell(0, 10, f"URL: {url}\n")

    # Add cleaned text
    pdf.multi_cell(0, 10, cleaned_text)
    
    pdf.output(output_path)
    print(f"Cleaned content saved to {output_path}")

# Function to preprocess text for summarization
def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    processed_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words_filtered = [word for word in words if word.isalnum() and word not in stop_words]
        processed_sentences.append(" ".join(words_filtered))
    
    return sentences, processed_sentences

# Function to perform extractive summarization using TF-IDF
def extractive_summary(text, num_sentences=5):
    original_sentences, processed_sentences = preprocess_text(text)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_sentences)

    sentence_scores = tfidf_matrix.sum(axis=1)
    ranked_sentences = [(score, sent) for score, sent in zip(sentence_scores, original_sentences)]
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])

    summary = " ".join([sent for _, sent in ranked_sentences[:num_sentences]])
    return summary

# Main function to scrape, clean, save, and summarize data
def main():
    # URL to scrape
    url = "https://news.adidas.com/training/adidas-brings-universal-design-principles-to-kit-for-paris-2024--to-optimise-fit-and-performance-for/s/675a0e1a-ddd7-428f-b15d-1713d59bb352"

    # Step 2: Scrape the website content
    scraped_data = scrape_website(url)
    
    if scraped_data:
        # Step 3: Clean the scraped text
        cleaned_text = clean_text(scraped_data)
        
        # Step 4: Save the cleaned content to a new DOCX file with URL
        save_to_docx(cleaned_text, url, output_path='Scraped_Content.docx')
        
        # Step 5: Optionally, save the cleaned content to a PDF file with URL
        save_to_pdf(cleaned_text, url, output_path='Scraped_Content.pdf')

        # Step 6: Ask the user if they want to generate a summary
        summary_choice = input("Would you like to generate a summary? (y/n): ")
        if summary_choice.lower() == 'y':
            summary = extractive_summary(cleaned_text, num_sentences=5)
            print("\nSummary:\n", summary)
        else:
            print("Summary generation skipped.")
    else:
        print("Failed to scrape the website. Exiting the process.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaned content saved to Scraped_Content.docx


UnicodeEncodeError: 'latin-1' codec can't encode character '\u2019' in position 1171: ordinal not in range(256)

In [5]:
import requests
from bs4 import BeautifulSoup
from docx import Document
from fpdf import FPDF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import unicodedata

# Ensure required NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')

# Function to scrape data from a webpage
def scrape_website(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator='\n')
    else:
        print(f"Failed to scrape the website. Status code: {response.status_code}")
        return None

# Function to clean scraped text
def clean_text(text):
    # Remove unnecessary information using regular expressions
    cleaned_text = re.sub(r'(subscribe|find your story|download media|follow us|contact us|privacy notice|cookie policy|social media|legal|corporate news|cookies|site uses cookies|download all|media cart|press release|related links)', '', text, flags=re.IGNORECASE)
    
    # Remove extra whitespace and fix inconsistent capitalization
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    # Fix sentence capitalization
    cleaned_text = '. '.join(sentence.capitalize() for sentence in cleaned_text.split('. '))
    
    return cleaned_text

# Function to clean text for PDF by removing problematic Unicode characters
def clean_text_for_pdf(text):
    cleaned_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    return cleaned_text

# Function to save cleaned text to DOCX with URL
def save_to_docx(cleaned_text, url, output_path='Scraped_Content.docx'):
    doc = Document()
    doc.add_heading('Scraped Content', level=1)

    # Add the URL in italic style
    para = doc.add_paragraph()
    run = para.add_run(f"URL: {url}")
    run.italic = True

    paragraphs = cleaned_text.split("\n\n")
    for para in paragraphs:
        doc.add_paragraph(para)
    
    doc.save(output_path)
    print(f"Cleaned content saved to {output_path}")

# Function to save cleaned text to PDF with URL
def save_to_pdf(cleaned_text, url, output_path='Scraped_Content.pdf'):
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # Add URL at the top of the PDF
    pdf.multi_cell(0, 10, f"URL: {url}\n")

    # Clean the text for PDF to handle encoding issues
    cleaned_data_for_pdf = clean_text_for_pdf(cleaned_text)

    # Add cleaned text
    pdf.multi_cell(0, 10, cleaned_data_for_pdf)
    
    pdf.output(output_path)
    print(f"Cleaned content saved to {output_path}")

# Function to preprocess text for summarization
def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    processed_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words_filtered = [word for word in words if word.isalnum() and word not in stop_words]
        processed_sentences.append(" ".join(words_filtered))
    
    return sentences, processed_sentences

# Function to perform extractive summarization using TF-IDF
def extractive_summary(text, num_sentences=5):
    original_sentences, processed_sentences = preprocess_text(text)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_sentences)

    sentence_scores = tfidf_matrix.sum(axis=1)
    ranked_sentences = [(score, sent) for score, sent in zip(sentence_scores, original_sentences)]
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])

    summary = " ".join([sent for _, sent in ranked_sentences[:num_sentences]])
    return summary

# Main function to scrape, clean, save, and summarize data
def main():
    # URL to scrape
    url = "https://news.adidas.com/training/adidas-brings-universal-design-principles-to-kit-for-paris-2024--to-optimise-fit-and-performance-for/s/675a0e1a-ddd7-428f-b15d-1713d59bb352"

    # Step 2: Scrape the website content
    scraped_data = scrape_website(url)
    
    if scraped_data:
        # Step 3: Clean the scraped text
        cleaned_text = clean_text(scraped_data)
        
        # Step 4: Save the cleaned content to a new DOCX file with URL
        save_to_docx(cleaned_text, url, output_path='Scraped_Content.docx')
        
        # Step 5: Optionally, save the cleaned content to a PDF file with URL
        save_to_pdf(cleaned_text, url, output_path='Scraped_Content.pdf')

        # Step 6: Ask the user if they want to generate a summary
        summary_choice = input("Would you like to generate a summary? (y/n): ")
        if summary_choice.lower() == 'y':
            summary = extractive_summary(cleaned_text, num_sentences=5)
            print("\nSummary:\n", summary)
        else:
            print("Summary generation skipped.")
    else:
        print("Failed to scrape the website. Exiting the process.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaned content saved to Scraped_Content.docx
Cleaned content saved to Scraped_Content.pdf


Would you like to generate a summary? (y/n):  y



Summary:
 African republic chad chile china colombia comoros congo cook island costa rica croatia cuba cyprus czech republic denmark djibouti dominica dominican republic ecuador egypt el salvador equatorial guinea eritrea estonia ethiopia fiji finland france french polynesia gabon gambia georgia germany ghana gibraltar greece greenland grenada guadeloupe guam guatemala guinea guinea-bissau guyana haiti honduras hong kong hungary iceland india indonesia iran iraq ireland israel italy ivory coast jamaica japan jordan kazakhstan kenya korea, republic kosovo kuwait kyrgyzstan laos latvia lebanon lesotho liberia libya liechtenstein lithuania lithunia luxembourg macau macedonia madagascar madeira islands malawi malaysia maldives mali malta marshall islands martinique mauritania mauritius mexico micronesia moldova monaco mongolia montenegro montserrat morocco mozambique myanmar (burma) namibia nepal netherlands netherlands antilles new caledonia new zealand nicaragua niger nigeria north kore

In [11]:
import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from fpdf import FPDF
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
import unicodedata

# Ensure required NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')

# Function to scrape data from a webpage
def scrape_website(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.get_text(separator='\n')
    else:
        print(f"Failed to scrape the website. Status code: {response.status_code}")
        return None

# Function to clean scraped text without affecting punctuation and case
def clean_text(text):
    # Remove unnecessary information using regular expressions
    cleaned_text = re.sub(r'(subscribe|find your story|download media|follow us|contact us|privacy notice|cookie policy|social media|legal|corporate news|cookies|site uses cookies|download all|media cart|press release|related links)', '', text, flags=re.IGNORECASE)

    # Remove extra whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    # Preserve the original case and punctuation by avoiding any further modification
    return cleaned_text

# Function to save cleaned text to DOCX with URL and formatting
def save_to_docx(cleaned_text, url, output_path='Addidas.docx'):
    doc = Document()
    
    # Add a formatted title (Heading 1)
    doc.add_heading('Scraped Content from Adidas Website', level=1)

    # Add the URL in italic style
    para = doc.add_paragraph()
    run = para.add_run(f"URL: {url}")
    run.italic = True
    run.font.size = Pt(10)
    para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

    # Add a subtitle
    doc.add_heading('Content Summary', level=2)

    # Add the cleaned text with custom formatting
    paragraphs = cleaned_text.split("\n\n")
    for para in paragraphs:
        p = doc.add_paragraph()
        run = p.add_run(para)
        run.font.size = Pt(11)
        run.font.name = 'Arial'
        p.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY

    # Set font for the entire document
    set_document_font(doc, font_name="Arial", font_size=11)

    doc.save(output_path)
    print(f"Formatted content saved to {output_path}")

# Helper function to set the font for the entire document
def set_document_font(document, font_name="Arial", font_size=11):
    """Applies font and size to all paragraphs in a document."""
    for paragraph in document.paragraphs:
        for run in paragraph.runs:
            run.font.name = font_name
            run.font.size = Pt(font_size)

# Function to clean text for PDF by removing problematic Unicode characters
def clean_text_for_pdf(text):
    cleaned_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    return cleaned_text

# Function to preprocess text for summarization
def preprocess_text(text):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    processed_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words_filtered = [word for word in words if word.isalnum() and word not in stop_words]
        processed_sentences.append(" ".join(words_filtered))
    
    return sentences, processed_sentences

# Function to perform extractive summarization using TF-IDF
def extractive_summary(text, num_sentences=5):
    original_sentences, processed_sentences = preprocess_text(text)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_sentences)

    sentence_scores = tfidf_matrix.sum(axis=1)
    ranked_sentences = [(score, sent) for score, sent in zip(sentence_scores, original_sentences)]
    ranked_sentences.sort(reverse=True, key=lambda x: x[0])

    summary = " ".join([sent for _, sent in ranked_sentences[:num_sentences]])
    return summary

# Main function to scrape, clean, save, and summarize data
def main():
    # URL to scrape
    url = "https://news.adidas.com/training/adidas-brings-universal-design-principles-to-kit-for-paris-2024--to-optimise-fit-and-performance-for/s/675a0e1a-ddd7-428f-b15d-1713d59bb352"

    # Step 2: Scrape the website content
    scraped_data = scrape_website(url)
    
    if scraped_data:
        # Step 3: Clean the scraped text
        cleaned_text = clean_text(scraped_data)
        
        # Step 4: Save the cleaned content to a new formatted DOCX file with URL
        save_to_docx(cleaned_text, url, output_path='Formatted_Scraped_Content.docx')

        # Step 5: Ask the user if they want to generate a summary
        summary_choice = input("Would you like to generate a summary? (y/n): ")
        if summary_choice.lower() == 'y':
            summary = extractive_summary(cleaned_text, num_sentences=5)
            print("\nSummary:\n", summary)
        else:
            print("Summary generation skipped.")
    else:
        print("Failed to scrape the website. Exiting the process.")

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Formatted content saved to Formatted_Scraped_Content.docx


Would you like to generate a summary? (y/n):  y



Summary:
 African Republic Chad Chile China Colombia Comoros Congo Cook Island Costa Rica Croatia Cuba Cyprus Czech Republic Denmark Djibouti Dominica Dominican Republic Ecuador Egypt El Salvador Equatorial Guinea Eritrea Estonia Ethiopia Fiji Finland France French Polynesia Gabon Gambia Georgia Germany Ghana Gibraltar Greece Greenland Grenada Guadeloupe Guam Guatemala Guinea Guinea-Bissau Guyana Haiti Honduras Hong Kong Hungary Iceland India Indonesia Iran Iraq Ireland Israel Italy Ivory Coast Jamaica Japan Jordan Kazakhstan Kenya Korea, Republic Kosovo Kuwait Kyrgyzstan Laos Latvia Lebanon Lesotho Liberia Libya Liechtenstein Lithuania Lithunia Luxembourg Macau Macedonia Madagascar Madeira Islands Malawi Malaysia Maldives Mali Malta Marshall Islands Martinique Mauritania Mauritius Mexico Micronesia Moldova Monaco Mongolia Montenegro Montserrat Morocco Mozambique Myanmar (Burma) Namibia Nepal Netherlands Netherlands Antilles New Caledonia New Zealand Nicaragua Niger Nigeria North Kore