In [2]:
from bs4 import BeautifulSoup
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure you have the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def extract_main_content(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove header, footer, nav, script, style, and noscript elements
    for element in soup(['header', 'footer', 'nav', 'script', 'style', 'noscript']):
        element.decompose()

    # Find the main content area
    main_content = ""
    article = soup.find('article')
    if article:
        main_content = article.get_text(separator='\n', strip=True)
    else:
        main = soup.find('main')
        if main:
            main_content = main.get_text(separator='\n', strip=True)
        else:
            large_divs = soup.find_all('div')
            for div in large_divs:
                if len(div.get_text(strip=True)) > 200:  # Threshold length to identify large content blocks
                    main_content += div.get_text(separator='\n', strip=True) + '\n'
            if not main_content:  # Fallback if no large divs found
                body = soup.find('body')
                if body:
                    main_content = body.get_text(separator='\n', strip=True)

    return main_content

def save_to_text_file(content, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(content)

def process_html_files_in_directory(directory_path, output_file_path):
    combined_content = ""
    
    # Iterate through each file in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.html'):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                html_content = file.read()
            
            # Extract the main content
            main_content = extract_main_content(html_content)
            combined_content += main_content + "\n\n"

    # Save the combined content to a single text file
    save_to_text_file(combined_content, output_file_path)

    # Process the combined text file to remove stop words
    with open(output_file_path, 'r', encoding='utf-8') as file:
        combined_text = file.read()

    processed_text = process_text(combined_text)

    # Save the processed text back to the file
    save_to_text_file(processed_text, output_file_path)

    print(f'All content extracted, combined, and processed into {output_file_path}')

def process_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters using regex
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    words = nltk.word_tokenize(text)
    
    company_stopwords = [
        "download", "www", "html", "http", "login", "menu", "chat", "article", "disclaimer", "facebook", "cart", "loading",
        "click", "com", "htm", "https", "logout", "navbar", "message", "blog", "copyright", "twitter", "checkout",
        "processing", "submit", "org", "php", "ftp", "register", "footer", "reply", "post", "terms", "instagram", "order",
        "waiting", "login", "net", "asp", "mailto", "signup", "sidebar", "comment", "news", "privacy", "linkedin",
        "invoice", "error", "logout", "gov", "jsp", "tel", "signin", "header", "post", "story", "policy", "pinterest",
        "billing", "success", "register", "edu", "css", "news", "signout", "banner", "thread", "update", "conditions",
        "youtube", "shipping", "failure", "sign up", "co", "js", "irc", "user", "ad", "forum", "headline", "agreement",
        "vimeo", "payment", "retry", "sign in", "io", "json","file", "account", "advertisement", "discussion", "media",
        "license", "flickr", "subscribe", "refresh", "sign out", "uk", "xml", "profile", "promo", "like",
        "video", "cookie", "reddit", "membership", "reload", "contact", "de", "pdf", "href", "admin", "button",
        "share", "image", "settings", "tumblr", "account", "redirect", "about", "jp", "doc", "src", "dashboard",
        "click", "follow", "photo", "preferences", "snapchat", "profile", "navigate", "home", "fr", "docx", "ref",
        "settings", "read more", "subscribe", "gallery", "options", "tiktok", "wishlist", "submit", "menu", "au",
        "xls", "utm_source", "search", "more info", "unsubscribe", "slideshow", "tools", "whatsapp", "product", "validate",
        "search", "ca", "xlsx", "utm_medium", "results", "next", "notification", "podcast", "utilities", "telegram",
        "service", "authenticate", "next", "us", "ppt", "utm_campaign", "help", "previous", "alert", "episode",
        "resources", "messenger", "pricing", "authorize", "previous", "in", "pptx", "utm_term", "support", "back",
        "update", "stream", "links", "skype", "offer", "encrypt", "back", "cn", "txt", "utm_content", "about", "top",
        "upload", "broadcast", "map", "discord", "discount", "help", "br", "zip", "param", "home", "bottom", "attachment",
        "channel", "navigation", "signal", "coupon", "support", "es", "rar", "sid", "news", "skip", "link", "playlist",
        "sitemap", "medium", "gift", "terms", "tar", "id", "blog", "submit","url", "archive", "blogspot", "buy",
        "conditions", "gz", "key", "post", "reset", "address", "library", "wordpress", "sell", "privacy", "exe", "token",
        "article", "cancel", "contact", "resource", "github", "rent", "policy", "dmg", "hash", "category", "edit",
        "phone", "bitbucket", "lease", "disclaimer", "iso", "index", "tag", "delete", "email", "stackoverflow", "booking",
        "sitemap", "bin", "page", "archive", "save", "support", "quora", "reservation", "faq", "img", "sort", "year",
        "print", "help", "meetup", "feedback", "filter", "month", "close", "faq", "eventbrite", "news", "author",
        "collapse", "tutorial", "blog", "faq", "dropdown", "documentation", "post", "terms", "form", "manual",
        "article", "privacy", "field", "report", "read more", "policy", "checkbox", "feedback", "follow us",
        "conditions", "radio", "survey", "share", "disclaimer", "select", "like", "sitemap", "option",
        "comment", "feedback", "input", "subscribe", "gallery", "text area", "unsubscribe", "media", "captcha",
        "learn more", "video", "view details", "image", "all rights reserved", "photo", "© (copyright)", "download",
        "trademark", "upload", "update", "file", "settings", "attachment", "profile", "resource", "account", "link",
        "admin", "share", "dashboard", "my account", "your account", "preferences", "notifications", "messages", "inbox",
        "outbox", "send", "receive", "cart", "checkout", "order", "payment", "invoice", "billing", "shipping", "address",
        "terms of service", "conditions of use", "user agreement", "cookies", "advertisement", "sponsor", "partnership",
        "careers", "jobs", "vacancies", "apply now", "application", "newsletter", "updates", "events", "calendar",
        "press", "release", "media", "gallery", "videos", "photos", "terms & conditions", "privacy & policy", "contact us",
        "back to top", "accessibility", "languages", "select language", "international", "mobile", "desktop"
    ]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    # Update the stopwords list with company-specific stopwords
    stop_words.update(company_stopwords)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(lemmatized_words)

# Directory containing HTML files
directory_path = r"C:\Users\amitb\Desktop\Coding\Industry 4.0 project\Full data\Latest_tcs_scraped_data\htmls"

# Output file path
output_file_path = r'C:\\Users\\amitb\\Desktop\\Coding\\Industry 4.0 project\\v2_tcs_html_combined.txt'

# Process HTML files and combine content
process_html_files_in_directory(directory_path, output_file_path)

print(f'All content extracted, combined, and processed into {output_file_path}')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\amitb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amitb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amitb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


All content extracted, combined, and processed into C:\\Users\\amitb\\Desktop\\Coding\\Industry 4.0 project\\v2_tcs_html_combined.txt
All content extracted, combined, and processed into C:\\Users\\amitb\\Desktop\\Coding\\Industry 4.0 project\\v2_tcs_html_combined.txt
