In [None]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

# Function to initialize the ChromeDriver and open the URL
def initialize_driver(url, timeout=10):
    """Start a headless Chrome session and navigate it to ``url``.

    Args:
        url: Address of the page to open.
        timeout: Seconds to wait for the ``<body>`` element to be present.

    Returns:
        The ``webdriver.Chrome`` instance positioned on ``url``, or ``None``
        when navigation fails or the body does not appear in time. On
        success the caller owns the driver and must call ``quit()`` on it.
    """
    # Imported locally so this function does not depend on an edit to the
    # import cell. TimeoutException is a subclass of WebDriverException, so a
    # single handler covers timeouts and other navigation errors alike.
    from selenium.common.exceptions import WebDriverException

    chrome_options = Options()
    # Headless mode: no browser window is opened while scraping
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    # No driver path is passed here; Selenium has to locate ChromeDriver on
    # its own (for example from PATH).
    driver = webdriver.Chrome(options=chrome_options)

    try:
        driver.get(url)
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )
    except WebDriverException:
        # Always shut the browser down on failure so no Chrome process is
        # left running (previously only timeouts were handled).
        print(f"Failed to load {url}")
        driver.quit()
        return None
    return driver

# Function to extract and summarize the website structure
def extract_website_structure(url):
    """Load ``url`` in a browser and summarize the structure of the page.

    Args:
        url: Address of the page to analyse.

    Returns:
        On success, a dict with the keys ``'title'`` (str), ``'headings'``
        (list of ``{tag: text}`` dicts, grouped by heading level h1..h6),
        ``'links'`` (list of ``{'text', 'url'}`` dicts for anchors that have
        an ``href``), ``'paragraphs'`` (list of str) and ``'divs'`` (list of
        ``{'class', 'id', 'text'}`` dicts, with ``'N/A'`` for a missing
        class or id). When the page could not be loaded, an error message
        string is returned instead of a dict.
    """
    driver = initialize_driver(url)
    if not driver:
        return "Error: Could not initialize driver or load the page."

    # Grab the rendered HTML and release the browser even if reading the
    # page source fails, so no Chrome process is leaked.
    try:
        page_source = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # ``.string`` is None for an empty <title> or one with nested tags;
    # fall back to the placeholder in that case as well.
    title_text = soup.title.string if soup.title else None

    summary = {
        'title': title_text if title_text else 'No title found',
        'headings': [],
        'links': [],
        'paragraphs': [],
        'divs': []
    }

    # Headings are collected level by level (all h1, then all h2, ...)
    for heading_tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        summary['headings'].extend(
            {heading_tag: heading.get_text(strip=True)}
            for heading in soup.find_all(heading_tag)
        )

    # Only anchors that actually carry an href are recorded
    summary['links'] = [
        {'text': link.get_text(strip=True), 'url': link['href']}
        for link in soup.find_all('a', href=True)
    ]

    # Full paragraph text, no length limit
    summary['paragraphs'] = [
        paragraph.get_text(strip=True) for paragraph in soup.find_all('p')
    ]

    # Every div with its class and id attributes (as returned by
    # BeautifulSoup) and its full text, no length limit
    summary['divs'] = [
        {
            'class': div.get('class', 'N/A'),
            'id': div.get('id', 'N/A'),
            'text': div.get_text(strip=True),
        }
        for div in soup.find_all('div')
    ]

    return summary

# Function to print the summary in grouped sections
def print_summary(summary, div_preview_chars=100):
    """Print a website structure summary to stdout in grouped sections.

    Args:
        summary: Dict with the keys ``'title'``, ``'headings'`` (list of
            ``{tag: text}`` dicts), ``'links'`` (list of dicts with
            ``'text'`` and ``'url'``), ``'paragraphs'`` (list of str) and
            ``'divs'`` (list of dicts with ``'class'``, ``'id'``, ``'text'``).
        div_preview_chars: Maximum number of characters of each div's text
            to show; longer text is cut and marked with ``...``.
    """
    print("\nWebsite Structure Summary:\n")

    print(f"Title: {summary['title']}\n")

    # Each heading entry is a one-item dict mapping the tag name to its text
    print("Headings:")
    for heading in summary['headings']:
        for tag, text in heading.items():
            print(f"  {tag.upper()}: {text}")

    print("\nLinks:")
    for link in summary['links']:
        print(f"  Text: {link['text']}, URL: {link['url']}")

    # Paragraphs are numbered from 1 and printed in full
    print("\nParagraphs:")
    for number, paragraph in enumerate(summary['paragraphs'], start=1):
        print(f"  Paragraph {number}: {paragraph}")

    print("\nDivs with classes and IDs:")
    for div in summary['divs']:
        text = div['text']
        # Only mark the text as truncated when something was actually cut
        if len(text) > div_preview_chars:
            preview = text[:div_preview_chars] + "..."
        else:
            preview = text
        print(f"  Class: {div['class']}, ID: {div['id']}, Text: {preview}")

# Example usage
if __name__ == "__main__":
    # Ask the user which page to analyse
    url = input("Please enter the URL: ")  # Dynamic URL input

    summary = extract_website_structure(url)

    # On failure extract_website_structure returns an error *string*, which
    # is truthy; only a dict is a usable summary.
    if isinstance(summary, dict):
        print_summary(summary)

        # Dump every section of the summary to a text file, one item per line
        with open("cleaned_text.txt", "w") as file:
            for key, values in summary.items():
                file.write(f"{key}:\n")
                # The title is a single value, not a list; wrap it so it is
                # written as one entry instead of one character per line.
                if not isinstance(values, list):
                    values = [values]
                for value in values:
                    file.write(f"  - {value}\n")
                file.write("-" * 40 + "\n")
    else:
        # Show the error message instead of crashing in print_summary
        print(summary)



In [None]:
import re
from bs4 import BeautifulSoup

# Step 1: Remove HTML tags using BeautifulSoup
def remove_html_tags(html_content):
    """Return the text of ``html_content`` with all HTML markup removed.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and
    the document text is obtained via ``get_text()``.
    """
    return BeautifulSoup(html_content, 'html.parser').get_text()

# Step 2: Remove paragraph markers like 'Paragraph X:'
def remove_redundant_phrases(text):
    """Strip every ``Paragraph <number>:`` marker from ``text``.

    Only the marker itself is removed; surrounding whitespace is kept.
    """
    paragraph_marker = re.compile(r'Paragraph \d+:')
    return paragraph_marker.sub('', text)

# Step 3: Remove duplicate lines or phrases (this removes repeated sentences)
def remove_duplicate_lines(text):
    """Drop repeated lines from ``text`` and join the rest with spaces.

    The first occurrence of each line is kept, in its original position;
    the surviving lines are returned as one space-separated string.
    """
    already_seen = set()
    kept_lines = []
    for line in text.splitlines():
        if line in already_seen:
            continue
        already_seen.add(line)
        kept_lines.append(line)
    return " ".join(kept_lines)

# Step 4: Clean up extra spaces, newlines, and normalize text
def normalize_text(text):
    """Collapse whitespace runs in ``text`` and return it lower-cased.

    Every run of whitespace (spaces, tabs, newlines) becomes one space,
    leading and trailing whitespace is trimmed, and the result is
    converted to lowercase.
    """
    return re.sub(r'\s+', ' ', text).strip().lower()

# Full pipeline to clean the scraped raw HTML content
def clean_scraped_data(raw_html_content):
    """Run the full cleaning pipeline over ``raw_html_content``.

    The stages are applied in order: strip HTML tags, remove paragraph
    markers, drop duplicate lines, then normalize whitespace and case.
    Returns the cleaned string.
    """
    pipeline = (
        remove_html_tags,
        remove_redundant_phrases,
        remove_duplicate_lines,
        normalize_text,
    )
    result = raw_html_content
    for stage in pipeline:
        result = stage(result)
    return result

# Function to write cleaned data to a file
def write_cleaned_data_to_file(cleaned_data, filename="cleaned_text.txt"):
    """Write ``cleaned_data`` to ``filename``, replacing any existing content.

    The file is opened in text mode ("w"); nothing is returned.
    """
    with open(filename, "w") as out_handle:
        out_handle.write(cleaned_data)

# Function to open and read the cleaned data from the file
def read_cleaned_data_from_file(filename="cleaned_text.txt"):
    """Return the entire content of ``filename`` as a single string.

    The file is opened in text mode for reading.
    """
    with open(filename, "r") as in_handle:
        content = in_handle.read()
    return content

# Example usage: Write to and Read from 'cleaned_text.txt'
if __name__ == "__main__":
    # Load whatever an earlier step stored in the default file
    # ('cleaned_text.txt'); this cell itself neither cleans nor writes.
    cleaned_data_from_file = read_cleaned_data_from_file()

    # Echo the loaded text under a heading for a quick visual check
    # (debugging aid, remove in production)
    print("Cleaned Data Read from File:", cleaned_data_from_file, sep="\n")

    # Further cleaning steps, e.g. `clean_scraped_data`, can be applied to
    # `cleaned_data_from_file` from here on if required.


In [None]:
import re
from bs4 import BeautifulSoup

# Step 1: Remove HTML tags using BeautifulSoup
def remove_html_tags(html_content):
    """Parse ``html_content`` as HTML and return only its text.

    Parsing uses BeautifulSoup with the ``html.parser`` backend; the text
    is whatever ``get_text()`` yields for the parsed document.
    """
    parsed_document = BeautifulSoup(html_content, 'html.parser')
    plain_text = parsed_document.get_text()
    return plain_text

# Step 2: Remove paragraph markers like 'Paragraph X:'
def remove_redundant_phrases(text):
    """Return ``text`` without its ``Paragraph <number>:`` markers.

    Whitespace around a removed marker is left untouched.
    """
    without_markers, _replacements = re.subn(r'Paragraph \d+:', '', text)
    return without_markers

# Step 3: Remove duplicate lines or phrases (this removes repeated sentences)
def remove_duplicate_lines(text):
    """Remove repeated lines and return the remainder joined by spaces.

    Lines keep the order of their first appearance; later repeats of an
    already seen line are discarded.
    """
    first_occurrences = {}
    for line in text.splitlines():
        # setdefault leaves an existing key (and its position) untouched
        first_occurrences.setdefault(line, None)
    return " ".join(first_occurrences)

# Step 4: Clean up extra spaces, newlines, and normalize text
def normalize_text(text):
    """Return ``text`` with whitespace collapsed, trimmed and lower-cased.

    Each run of whitespace is replaced by a single space before the ends
    are stripped and the string is converted to lowercase.
    """
    whitespace_run = re.compile(r'\s+')
    single_spaced = whitespace_run.sub(' ', text)
    return single_spaced.strip().lower()

# Full pipeline to clean the scraped raw HTML content
def clean_scraped_data(raw_html_content):
    """Clean scraped content and return the resulting string.

    Applies, innermost first: HTML tag removal, paragraph-marker removal,
    duplicate-line removal, and finally whitespace/case normalization.
    """
    return normalize_text(
        remove_duplicate_lines(
            remove_redundant_phrases(
                remove_html_tags(raw_html_content)
            )
        )
    )

# Function to open and read the raw HTML content from the file
def read_raw_content_from_file(filename="cleaned_text.txt"):
    """Read ``filename`` in text mode and return its full content as a string."""
    with open(filename, "r") as source:
        raw_content = source.read()
    return raw_content

# Example usage within your framework
if __name__ == "__main__":
    # Step 1: Load the text file generated in the previous step
    raw_html_content = read_raw_content_from_file("cleaned_text.txt")

    # Step 2: Run the loaded content through the cleaning pipeline
    cleaned_data = clean_scraped_data(raw_html_content)

    # Step 3: Print the result under a heading for verification (optional);
    # the cleaned text is only displayed here, not written back to disk
    print("Cleaned Data from File:", cleaned_data, sep="\n")


In [None]:
import spacy
import re
from collections import defaultdict

# Load the spaCy "en_core_web_sm" pipeline once at module level; the resulting
# `nlp` callable is used below to turn the text into a processed document.
nlp = spacy.load("en_core_web_sm")

# Function to open and read the cleaned data from the file
def read_cleaned_data_from_file(filename="cleaned_text.txt"):
    """Open ``filename`` for reading in text mode and return all of its text."""
    with open(filename, "r") as text_file:
        file_text = text_file.read()
    return file_text

# Step 1: Load the contents of 'cleaned_text.txt' into memory
cleaned_text = read_cleaned_data_from_file(filename="cleaned_text.txt")

# Step 2: Run the loaded spaCy pipeline over the text; `doc` is what the
# extraction function below reads entities and raw text from
doc = nlp(cleaned_text)

# Regex patterns for phone numbers, emails, and links
# Phone: optional parentheses around a 3-digit group, then 3 and 4 digits,
# each optionally separated by '-', '.' or whitespace
phone_regex = r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
# Email: the final label is letters only. Inside a character class '|' is a
# literal pipe, not alternation, so it must not appear there.
email_regex = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
# Link: 'http://' or 'https://' followed by everything up to the next whitespace
link_regex = r'https?://[^\s]+'

# Function to extract relevant labeled information
def extract_relevant_information(doc):
    """Collect named entities and contact details from a processed document.

    Args:
        doc: Processed document exposing ``ents`` (items with ``label_`` and
            ``text`` attributes) and ``text``; here the result of ``nlp(...)``.

    Returns:
        A ``defaultdict(list)`` mapping category labels to lists of unique
        values in first-seen order. ``'Phone Numbers'``, ``'Emails'`` and
        ``'Links'`` are always present; ``'Names'``, ``'Organizations'`` and
        ``'Addresses'`` appear only when a matching entity was found.
    """
    # Entity label -> output category. In spaCy's label scheme GPE denotes
    # countries, cities and states, so 'Addresses' holds place names.
    label_to_category = {
        "PERSON": 'Names',
        "ORG": 'Organizations',
        "GPE": 'Addresses',
    }

    extracted_info = defaultdict(list)

    # Extract entities using spaCy NER; other labels are ignored
    for ent in doc.ents:
        category = label_to_category.get(ent.label_)
        if category is not None:
            extracted_info[category].append(ent.text)

    # Regex-based extraction runs over the document's raw text
    extracted_info['Phone Numbers'] = re.findall(phone_regex, doc.text)
    extracted_info['Emails'] = re.findall(email_regex, doc.text)
    extracted_info['Links'] = re.findall(link_regex, doc.text)

    # Remove duplicates while keeping first-seen order; a set would make the
    # output order arbitrary and unstable between runs
    for key in extracted_info:
        extracted_info[key] = list(dict.fromkeys(extracted_info[key]))

    return extracted_info

# Run the entity and regex extraction over the processed document; the result
# maps each category label to a list of de-duplicated values
extracted_info = extract_relevant_information(doc)

# Function to print the extracted information
def print_extracted_info(extracted_info):
    """Print each category and its values, followed by one separator line.

    Every key of ``extracted_info`` is printed as ``<key>:`` with its values
    listed beneath as ``  - <value>``; a single line of 40 dashes closes the
    whole listing.
    """
    output_lines = []
    for label, items in extracted_info.items():
        output_lines.append(f"{label}:")
        output_lines.extend(f"  - {item}" for item in items)
    # The separator is emitted once, after all categories
    output_lines.append("-" * 40)
    print("\n".join(output_lines))

# Step 3: Print every category with its extracted values to stdout
print_extracted_info(extracted_info)
