In [None]:
!pip install nltk
!pip install requests
!pip install beautifulsoup4
!pip install urllib3
!pip install requests-toolbelt
!pip install pdfminer.six

Collecting requests-toolbelt
  Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl.metadata (14 kB)
Downloading requests_toolbelt-1.0.0-py2.py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests-toolbelt
Successfully installed requests-toolbelt-1.0.0
Collecting pdfminer.six
  Downloading pdfminer.six-20240706-py3-none-any.whl.metadata (4.1 kB)
Downloading pdfminer.six-20240706-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20240706


In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import hashlib
from urllib3.exceptions import InsecureRequestWarning
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pdfminer.high_level import extract_text

# Ensure necessary NLTK data packages are downloaded.
# 'punkt_tab' is the tokenizer resource that nltk.word_tokenize loads on recent
# NLTK releases (3.9+); 'punkt' is kept for older releases. Because nltk is
# installed unpinned above, both are requested.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:

###---------------------------------------------------------------scraper-------------------------------------------------

# Suppress urllib3's InsecureRequestWarning (use with caution). scrape_website()
# sends its requests with verify=False, which would otherwise emit this warning
# for each unverified HTTPS request. This only hides the warning; the requests
# themselves remain unverified.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

def create_folder(folder_path):
    """Make sure ``folder_path`` exists, creating missing parent folders as well.

    A directory that is already present is left untouched.
    """
    os.makedirs(name=folder_path, exist_ok=True)

def get_unique_filename(url):
    """Return a deterministic ``.html`` filename for ``url``.

    The name is the hexadecimal MD5 digest of the encoded URL followed by
    ``.html``, so the same URL always maps to the same file name. MD5 serves
    purely as a naming hash here, not as a security measure.

    Args:
        url: The page URL as a string.

    Returns:
        A 32-character hex digest plus the ``.html`` suffix.
    """
    return hashlib.md5(url.encode()).hexdigest() + ".html"

def get_document_filename(url):
    """Return the final path component of ``url``.

    Only the path part of the URL is considered, so a query string or a
    fragment never ends up in the returned name.
    """
    url_path = urlparse(url).path
    _, last_component = os.path.split(url_path)
    return last_component

def save_content(content, folder_path, filename):
    """Write ``content`` to ``folder_path/filename`` in binary mode.

    The file is opened with mode ``'wb'``, so ``content`` has to be bytes-like;
    an existing file of the same name is replaced.
    """
    destination = os.path.join(folder_path, filename)
    with open(destination, mode='wb') as out:
        out.write(content)

def scrape_website(url, folder_path, visited=None, depth=0, max_depth=3, common_tabs=None):
    """Recursively scrape same-host pages starting at ``url``.

    Each fetched page has the elements matching ``common_tabs`` removed and is
    saved under ``<folder_path>/htmls``. Same-host links whose URL path ends in
    a known document extension are downloaded to ``<folder_path>/documents``;
    every other same-host link is followed recursively.

    Args:
        url: Page to fetch.
        folder_path: Root output folder.
        visited: Set of URLs already handled, shared across the recursion.
            Leave as ``None`` to start a fresh crawl.
        depth: Current recursion depth (0 for the start page).
        max_depth: Pages deeper than this are not fetched.
        common_tabs: Optional iterable of CSS selectors removed before saving.

    Returns:
        None. Failed requests are printed and skipped.
    """
    # A fresh set per top-level call: a mutable default argument would be shared
    # between calls and silently skip pages seen by an earlier crawl.
    if visited is None:
        visited = set()

    if depth > max_depth or url in visited:
        return

    visited.add(url)

    try:
        # NOTE: verify=False disables TLS certificate verification.
        response = requests.get(url, verify=False, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return

    soup = BeautifulSoup(response.content, 'html.parser')

    if common_tabs:
        for tab in common_tabs:
            for elem in soup.select(tab):
                elem.decompose()

    # Save HTML content excluding header, footer, and common tabs
    html_folder = os.path.join(folder_path, 'htmls')
    create_folder(html_folder)
    filtered_content = soup.prettify('utf-8')
    save_content(filtered_content, html_folder, get_unique_filename(url))

    # Save documents and follow the remaining same-host links
    doc_folder = os.path.join(folder_path, 'documents')
    create_folder(doc_folder)
    doc_extensions = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx')
    page_host = urlparse(url).netloc

    for link in soup.find_all('a', href=True):
        # Drop the fragment so "page#a" and "page#b" count as one page.
        full_url = urljoin(url, link['href']).split('#', 1)[0]
        parsed_link = urlparse(full_url)
        if parsed_link.netloc != page_host:
            continue

        # Test the path (not the whole URL): the saved filename is taken from
        # the path, and a query string must not hide or fake an extension.
        if parsed_link.path.lower().endswith(doc_extensions):
            if full_url in visited:
                continue  # already downloaded via another page
            visited.add(full_url)
            try:
                doc_response = requests.get(full_url, verify=False, timeout=30)
                doc_response.raise_for_status()
            except requests.exceptions.RequestException as e:
                # One broken document must not abort the rest of this page.
                print(f"Error fetching {full_url}: {e}")
                continue
            save_content(doc_response.content, doc_folder, get_document_filename(full_url))
        else:
            scrape_website(full_url, folder_path, visited, depth + 1, max_depth, common_tabs)

def safe_filename(text):
    """Return ``text`` reduced to letters, digits, spaces, '_', '-' and '.'."""
    # Every other character (e.g. ones not allowed in Windows filenames) is dropped.
    allowed_punctuation = (' ', '_', '-', '.')
    kept = []
    for ch in text:
        if ch.isalnum() or ch in allowed_punctuation:
            kept.append(ch)
    return ''.join(kept)

def _careers_get(session, url, label):
    """GET ``url`` through ``session``; return the response only for HTTP 200.

    Request errors and non-200 statuses are printed (``label`` names the target
    in the status message) and signalled to the caller as ``None``.
    """
    try:
        # NOTE(review): redirects are not followed (allow_redirects=False, as in
        # the original code), so a 301/302 answer counts as a failure -- confirm
        # this is intended.
        response = session.get(url, timeout=30, allow_redirects=False)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching page: {e}")
        return None
    if response.status_code != 200:
        print(f"Failed to retrieve {label}: {response.status_code}")
        return None
    return response


def _careers_unique_name(stem, suffix, url, used_names):
    """Return ``stem + suffix``, adding a URL hash when that name is empty or taken."""
    name = stem + suffix
    if not stem or name in used_names:
        digest = hashlib.md5(url.encode()).hexdigest()[:12]
        name = f"{stem}_{digest}{suffix}" if stem else f"{digest}{suffix}"
    used_names.add(name)
    return name


def scrape_careers_section(company_url):
    """Download the pages and documents linked from a site's "Careers" page.

    The home page at ``company_url`` is searched for an ``<a>`` whose text is
    exactly ``Careers``. Every http(s) link on that careers page is saved as an
    ``.html`` file, and every ``<a class="document">`` target is saved as a
    binary file, inside ``<name>_scraped_data/<name>_careers_scraped_data``
    where ``<name>`` is derived from the URL's host.

    Args:
        company_url: Absolute URL of the company home page.

    Returns:
        None. Failures are printed; the output folder is created regardless.
    """
    # Folder names come from the host name; main() reads the careers files back
    # from <name>_scraped_data/<name>_careers_scraped_data.
    parsed_url = urlparse(company_url)
    company_name = parsed_url.netloc.replace('www.', '').replace('.com', '').replace('.org', '')

    careers_dir = os.path.join(f"{company_name}_scraped_data", f"{company_name}_careers_scraped_data")
    os.makedirs(careers_dir, exist_ok=True)

    # Retry mechanism setup
    session = requests.Session()
    retry_strategy = Retry(
        total=3,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    response = _careers_get(session, company_url, "page")
    if response is None:
        return

    # Find link to Careers section (a single lookup: the first matching anchor
    # in the document, wherever it sits).
    soup = BeautifulSoup(response.content, 'html.parser')
    careers_anchor = soup.find('a', string='Careers')
    careers_link = careers_anchor.get('href') if careers_anchor else None
    if not careers_link:
        print("Careers link not found.")
        return

    # Convert relative URL to absolute URL
    careers_url = urljoin(company_url, careers_link)
    careers_response = _careers_get(session, careers_url, "Careers page")
    if careers_response is None:
        return
    careers_soup = BeautifulSoup(careers_response.content, 'html.parser')

    http_prefixes = ('http://', 'https://')

    # Collect document links first so they are saved once, as binary files,
    # instead of also being written out as ".html" text by the page loop.
    doc_urls = []
    for doc in careers_soup.find_all('a', {'class': 'document'}, href=True):
        doc_url = urljoin(careers_url, doc['href'])
        if doc_url.startswith(http_prefixes) and doc_url not in doc_urls:
            doc_urls.append(doc_url)

    used_names = set()
    seen_urls = set(doc_urls)

    # Save every other linked page as an HTML file.
    for link in careers_soup.find_all('a', href=True):
        link_url = urljoin(careers_url, link['href'])
        if not link_url.startswith(http_prefixes) or link_url in seen_urls:
            continue
        seen_urls.add(link_url)

        # Fetch before opening the file so a failed request leaves no empty file.
        page = _careers_get(session, link_url, link_url)
        if page is None:
            continue

        stem = safe_filename(link_url.split('/')[-1].replace('.html', ''))
        filename = _careers_unique_name(stem, '.html', link_url, used_names)
        with open(os.path.join(careers_dir, filename), 'w', encoding='utf-8') as f:
            f.write(page.text)

    # Download and save documents.
    for doc_url in doc_urls:
        doc_response = _careers_get(session, doc_url, doc_url)
        if doc_response is None:
            continue

        stem, ext = os.path.splitext(safe_filename(doc_url.split('/')[-1]))
        doc_filename = _careers_unique_name(stem, ext, doc_url, used_names)
        with open(os.path.join(careers_dir, doc_filename), 'wb') as f:
            f.write(doc_response.content)

#---------------------------------------------------------document_combiner-----------------------------------------------------------

def extract_text_from_pdfs(folder_path):
    """Concatenate the extracted text of every PDF directly inside ``folder_path``.

    Files are processed in sorted name order so the result does not depend on
    the arbitrary order returned by ``os.listdir``. The ``.pdf`` extension is
    matched case-insensitively. A file that cannot be parsed is reported and
    skipped.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The texts of all readable PDFs joined without a separator; ``""`` when
        the folder holds no readable PDF.
    """
    pdf_names = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith('.pdf')
    )

    chunks = []
    for filename in pdf_names:
        file_path = os.path.join(folder_path, filename)
        try:
            chunks.append(extract_text(file_path))
        except Exception as e:
            # Best effort: one unreadable PDF must not stop the whole batch.
            print(f"Error reading {filename}: {e}")

    return "".join(chunks)

def process_text(text):
    """Normalise raw text into a space-separated string of lemmatised keywords.

    Steps: lowercase -> replace every non-letter with a space -> tokenise ->
    drop NLTK English stop words and the web-boilerplate words listed below ->
    lemmatise each remaining word with WordNet.

    NOTE: this cell defines ``process_text`` a second time further down (in the
    html_combiner section). Python keeps the later definition, so keep the two
    bodies in sync or delete one of them.

    Args:
        text: Raw text (e.g. extracted from PDFs or HTML).

    Returns:
        The surviving lemmatised words joined by single spaces.
    """
    # Convert text to lowercase
    text = text.lower()

    # Replace (rather than delete) every non-letter with a space. Deleting
    # fused neighbouring words -- "www.example.com" became "wwwexamplecom" --
    # so stop words such as "www" or "com" below could never match.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize the text
    words = nltk.word_tokenize(text)

    # Entries containing spaces or symbols (e.g. "sign up", "utm_source") cannot
    # equal a single letters-only token, so they never match; kept unchanged.
    company_stopwords = [
        "download", "www", "html", "http", "login", "menu", "chat", "article", "disclaimer", "facebook", "cart", "loading",
        "click", "com", "htm", "https", "logout", "navbar", "message", "blog", "copyright", "twitter", "checkout",
        "processing", "submit", "org", "php", "ftp", "register", "footer", "reply", "post", "terms", "instagram", "order",
        "waiting", "login", "net", "asp", "mailto", "signup", "sidebar", "comment", "news", "privacy", "linkedin",
        "invoice", "error", "logout", "gov", "jsp", "tel", "signin", "header", "post", "story", "policy", "pinterest",
        "billing", "success", "register", "edu", "css", "news", "signout", "banner", "thread", "update", "conditions",
        "youtube", "shipping", "failure", "sign up", "co", "js", "irc", "user", "ad", "forum", "headline", "agreement",
        "vimeo", "payment", "retry", "sign in", "io", "json","file", "account", "advertisement", "discussion", "media",
        "license", "flickr", "subscribe", "refresh", "sign out", "uk", "xml", "profile", "promo", "like",
        "video", "cookie", "reddit", "membership", "reload", "contact", "de", "pdf", "href", "admin", "button",
        "share", "image", "settings", "tumblr", "account", "redirect", "about", "jp", "doc", "src", "dashboard",
        "click", "follow", "photo", "preferences", "snapchat", "profile", "navigate", "home", "fr", "docx", "ref",
        "settings", "read more", "subscribe", "gallery", "options", "tiktok", "wishlist", "submit", "menu", "au",
        "xls", "utm_source", "search", "more info", "unsubscribe", "slideshow", "tools", "whatsapp", "product", "validate",
        "search", "ca", "xlsx", "utm_medium", "results", "next", "notification", "podcast", "utilities", "telegram",
        "service", "authenticate", "next", "us", "ppt", "utm_campaign", "help", "previous", "alert", "episode",
        "resources", "messenger", "pricing", "authorize", "previous", "in", "pptx", "utm_term", "support", "back",
        "update", "stream", "links", "skype", "offer", "encrypt", "back", "cn", "txt", "utm_content", "about", "top",
        "upload", "broadcast", "map", "discord", "discount", "help", "br", "zip", "param", "home", "bottom", "attachment",
        "channel", "navigation", "signal", "coupon", "support", "es", "rar", "sid", "news", "skip", "link", "playlist",
        "sitemap", "medium", "gift", "terms", "tar", "id", "blog", "submit","url", "archive", "blogspot", "buy",
        "conditions", "gz", "key", "post", "reset", "address", "library", "wordpress", "sell", "privacy", "exe", "token",
        "article", "cancel", "contact", "resource", "github", "rent", "policy", "dmg", "hash", "category", "edit",
        "phone", "bitbucket", "lease", "disclaimer", "iso", "index", "tag", "delete", "email", "stackoverflow", "booking",
        "sitemap", "bin", "page", "archive", "save", "support", "quora", "reservation", "faq", "img", "sort", "year",
        "print", "help", "meetup", "feedback", "filter", "month", "close", "faq", "eventbrite", "news", "author",
        "collapse", "tutorial", "blog", "faq", "dropdown", "documentation", "post", "terms", "form", "manual",
        "article", "privacy", "field", "report", "read more", "policy", "checkbox", "feedback", "follow us",
        "conditions", "radio", "survey", "share", "disclaimer", "select", "like", "sitemap", "option",
        "comment", "feedback", "input", "subscribe", "gallery", "text area", "unsubscribe", "media", "captcha",
        "learn more", "video", "view details", "image", "all rights reserved", "photo", "© (copyright)", "download",
        "trademark", "upload", "update", "file", "settings", "attachment", "profile", "resource", "account", "link",
        "admin", "share", "dashboard", "my account", "your account", "preferences", "notifications", "messages", "inbox",
        "outbox", "send", "receive", "cart", "checkout", "order", "payment", "invoice", "billing", "shipping", "address",
        "terms of service", "conditions of use", "user agreement", "cookies", "advertisement", "sponsor", "partnership",
        "careers", "jobs", "vacancies", "apply now", "application", "newsletter", "updates", "events", "calendar",
        "press", "release", "media", "gallery", "videos", "photos", "terms & conditions", "privacy & policy", "contact us",
        "back to top", "accessibility", "languages", "select language", "international", "mobile", "desktop"
    ]

    # English stop words plus the company/web-specific ones above
    stop_words = set(stopwords.words('english'))
    stop_words.update(company_stopwords)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(lemmatized_words)

def process_pdfs(documents_folder, output_file):
    """Run the PDF pipeline: extract text, clean it, write it to ``output_file``.

    The text of the PDFs in ``documents_folder`` is passed through
    ``process_text`` and stored as UTF-8; a confirmation line is printed.
    """
    cleaned_text = process_text(extract_text_from_pdfs(documents_folder))

    with open(output_file, mode='w', encoding='utf-8') as out:
        out.write(cleaned_text)

    print(f"Processed text saved to {output_file}")

##---------------------------------------------------------------html_combiner----------------------------------------------------------

def extract_main_content(html_content):
    """Return the main readable text of an HTML document.

    Header, footer, nav, script, style and noscript elements are removed first.
    The text is then taken from the first source that exists, in this order:
    the first ``<article>``; the first ``<main>``; every outermost ``<div>``
    holding more than 200 characters of text; the ``<body>``.

    Args:
        html_content: HTML markup as a string.

    Returns:
        The extracted text with one line per text node; ``""`` if nothing
        suitable is found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove header, footer, nav, script, style, and noscript elements
    for element in soup(['header', 'footer', 'nav', 'script', 'style', 'noscript']):
        element.decompose()

    # Prefer the semantic containers.
    for tag_name in ('article', 'main'):
        container = soup.find(tag_name)
        if container:
            return container.get_text(separator='\n', strip=True)

    # Fall back to large content blocks. Only outermost divs are used: a div's
    # text already includes the text of the divs nested inside it, so emitting
    # nested large divs as well would repeat the same content once per
    # nesting level.
    main_content = ""
    for div in soup.find_all('div'):
        if div.find_parent('div') is not None:
            continue
        if len(div.get_text(strip=True)) > 200:  # Threshold length to identify large content blocks
            main_content += div.get_text(separator='\n', strip=True) + '\n'

    if not main_content:  # Fallback if no large divs found
        body = soup.find('body')
        if body:
            main_content = body.get_text(separator='\n', strip=True)

    return main_content

def save_to_text_file(content, file_path):
    """Write the string ``content`` to ``file_path`` as UTF-8, replacing any existing file."""
    with open(file_path, mode='w', encoding='utf-8') as handle:
        handle.write(content)

def process_html_files_in_directory(directory_path, output_file_path):
    """Combine the main text of every ``.html`` file in a folder into one cleaned file.

    Each ``.html`` file directly inside ``directory_path`` is read as UTF-8 and
    reduced to its main content with ``extract_main_content``. The pieces are
    concatenated (in sorted filename order, so the result is reproducible),
    cleaned with ``process_text`` and written to ``output_file_path``.

    Args:
        directory_path: Folder containing the HTML files (not recursive).
        output_file_path: Destination text file; overwritten if present.

    Returns:
        None.
    """
    sections = []

    # Iterate through each file in the directory
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.html'):
            continue
        file_path = os.path.join(directory_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Extract the main content
        sections.append(extract_main_content(html_content) + "\n\n")

    combined_content = "".join(sections)

    # Clean the combined text in memory and write the file once; the raw text
    # is no longer written to disk and read back first.
    processed_text = process_text(combined_content)
    save_to_text_file(processed_text, output_file_path)
    print(f'All content extracted and combined into {output_file_path}')

def process_text(text):
    """Normalise raw text into a space-separated string of lemmatised keywords.

    Steps: lowercase -> replace every non-letter with a space -> tokenise ->
    drop NLTK English stop words and the web-boilerplate words listed below ->
    lemmatise each remaining word with WordNet.

    NOTE: another ``process_text`` is defined earlier in this cell (in the
    document_combiner section). This later definition is the one in effect
    once the cell has run; keep the two in sync or delete the earlier one.

    Args:
        text: Raw text (e.g. extracted from PDFs or HTML).

    Returns:
        The surviving lemmatised words joined by single spaces.
    """
    # Convert text to lowercase
    text = text.lower()

    # Replace (rather than delete) every non-letter with a space. Deleting
    # fused neighbouring words -- "www.example.com" became "wwwexamplecom" --
    # so stop words such as "www" or "com" below could never match.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize the text
    words = nltk.word_tokenize(text)

    # Entries containing spaces or symbols (e.g. "sign up", "utm_source") cannot
    # equal a single letters-only token, so they never match; kept unchanged.
    company_stopwords = [
        "download", "www", "html", "http", "login", "menu", "chat", "article", "disclaimer", "facebook", "cart", "loading",
        "click", "com", "htm", "https", "logout", "navbar", "message", "blog", "copyright", "twitter", "checkout",
        "processing", "submit", "org", "php", "ftp", "register", "footer", "reply", "post", "terms", "instagram", "order",
        "waiting", "login", "net", "asp", "mailto", "signup", "sidebar", "comment", "news", "privacy", "linkedin",
        "invoice", "error", "logout", "gov", "jsp", "tel", "signin", "header", "post", "story", "policy", "pinterest",
        "billing", "success", "register", "edu", "css", "news", "signout", "banner", "thread", "update", "conditions",
        "youtube", "shipping", "failure", "sign up", "co", "js", "irc", "user", "ad", "forum", "headline", "agreement",
        "vimeo", "payment", "retry", "sign in", "io", "json","file", "account", "advertisement", "discussion", "media",
        "license", "flickr", "subscribe", "refresh", "sign out", "uk", "xml", "profile", "promo", "like",
        "video", "cookie", "reddit", "membership", "reload", "contact", "de", "pdf", "href", "admin", "button",
        "share", "image", "settings", "tumblr", "account", "redirect", "about", "jp", "doc", "src", "dashboard",
        "click", "follow", "photo", "preferences", "snapchat", "profile", "navigate", "home", "fr", "docx", "ref",
        "settings", "read more", "subscribe", "gallery", "options", "tiktok", "wishlist", "submit", "menu", "au",
        "xls", "utm_source", "search", "more info", "unsubscribe", "slideshow", "tools", "whatsapp", "product", "validate",
        "search", "ca", "xlsx", "utm_medium", "results", "next", "notification", "podcast", "utilities", "telegram",
        "service", "authenticate", "next", "us", "ppt", "utm_campaign", "help", "previous", "alert", "episode",
        "resources", "messenger", "pricing", "authorize", "previous", "in", "pptx", "utm_term", "support", "back",
        "update", "stream", "links", "skype", "offer", "encrypt", "back", "cn", "txt", "utm_content", "about", "top",
        "upload", "broadcast", "map", "discord", "discount", "help", "br", "zip", "param", "home", "bottom", "attachment",
        "channel", "navigation", "signal", "coupon", "support", "es", "rar", "sid", "news", "skip", "link", "playlist",
        "sitemap", "medium", "gift", "terms", "tar", "id", "blog", "submit","url", "archive", "blogspot", "buy",
        "conditions", "gz", "key", "post", "reset", "address", "library", "wordpress", "sell", "privacy", "exe", "token",
        "article", "cancel", "contact", "resource", "github", "rent", "policy", "dmg", "hash", "category", "edit",
        "phone", "bitbucket", "lease", "disclaimer", "iso", "index", "tag", "delete", "email", "stackoverflow", "booking",
        "sitemap", "bin", "page", "archive", "save", "support", "quora", "reservation", "faq", "img", "sort", "year",
        "print", "help", "meetup", "feedback", "filter", "month", "close", "faq", "eventbrite", "news", "author",
        "collapse", "tutorial", "blog", "faq", "dropdown", "documentation", "post", "terms", "form", "manual",
        "article", "privacy", "field", "report", "read more", "policy", "checkbox", "feedback", "follow us",
        "conditions", "radio", "survey", "share", "disclaimer", "select", "like", "sitemap", "option",
        "comment", "feedback", "input", "subscribe", "gallery", "text area", "unsubscribe", "media", "captcha",
        "learn more", "video", "view details", "image", "all rights reserved", "photo", "© (copyright)", "download",
        "trademark", "upload", "update", "file", "settings", "attachment", "profile", "resource", "account", "link",
        "admin", "share", "dashboard", "my account", "your account", "preferences", "notifications", "messages", "inbox",
        "outbox", "send", "receive", "cart", "checkout", "order", "payment", "invoice", "billing", "shipping", "address",
        "terms of service", "conditions of use", "user agreement", "cookies", "advertisement", "sponsor", "partnership",
        "careers", "jobs", "vacancies", "apply now", "application", "newsletter", "updates", "events", "calendar",
        "press", "release", "media", "gallery", "videos", "photos", "terms & conditions", "privacy & policy", "contact us",
        "back to top", "accessibility", "languages", "select language", "international", "mobile", "desktop"
    ]

    # English stop words plus the company/web-specific ones above
    stop_words = set(stopwords.words('english'))
    stop_words.update(company_stopwords)

    # Remove stopwords
    words = [word for word in words if word not in stop_words]

    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(lemmatized_words)


In [None]:
def main():
    """Prompt for a company URL, scrape the site, then build the cleaned text files.

    Pipeline: scrape the site and its careers section, then write
    ``processed_text_from_pdfs.txt``, ``processed_html_content.txt`` and
    ``processed_careers_content.txt`` into ``<name>_scraped_data``.
    """
    company_url = input("Enter the company website URL: ").strip().strip('\'"')
    # urlparse() only fills in netloc when the URL carries a scheme.
    if '://' not in company_url:
        company_url = 'https://' + company_url
    parsed_url = urlparse(company_url)
    if not parsed_url.netloc:
        print(f"Could not determine a host name from URL: {company_url!r}")
        return

    # Derive the folder name exactly as scrape_careers_section() does. That
    # function chooses its own output folder, so a different rule here (e.g.
    # for "www.tcs.co.in" or "careers.tcs.com") would make step 4 look in a
    # folder that was never written.
    domain = parsed_url.netloc.replace('www.', '').replace('.com', '').replace('.org', '')
    output_folder = f'{domain}_scraped_data'
    common_tabs = ['#main-nav', '.footer', 'header', 'footer']

    documents_folder = os.path.join(output_folder, 'documents')
    html_folder = os.path.join(output_folder, 'htmls')
    careers_folder = os.path.join(output_folder, f'{domain}_careers_scraped_data')

    # Step 1: Scrape the website and career sections
    create_folder(output_folder)
    scrape_website(company_url, output_folder, common_tabs=common_tabs)
    scrape_careers_section(company_url)

    # scrape_website() creates its sub-folders only after a successful fetch;
    # make sure every input folder exists so the steps below handle an empty
    # folder instead of failing on a missing one.
    for folder in (documents_folder, html_folder, careers_folder):
        create_folder(folder)

    # Step 2: Process PDFs from the documents folder
    pdf_output_file = os.path.join(output_folder, 'processed_text_from_pdfs.txt')
    process_pdfs(documents_folder, pdf_output_file)

    # Step 3: Combine HTML content into a single text file
    combined_html_file = os.path.join(output_folder, 'processed_html_content.txt')
    process_html_files_in_directory(html_folder, combined_html_file)

    # Step 4: Combine Careers content into a single text file
    combined_careers_file = os.path.join(output_folder, 'processed_careers_content.txt')
    process_html_files_in_directory(careers_folder, combined_careers_file)


# Entry point. Inside a notebook kernel __name__ is "__main__", so executing
# this cell runs main() right away, which prompts for the company URL.
if __name__ == "__main__":
    main()


Enter the company website URL: https://www.tcs.com/
Error reading tsc-omnistore-ai-powered-unified-composable-commerce-platform.pdf: No /Root object! - Is this really a PDF?
Error reading report-tcs-ai-for-business-study.pdf: Invalid dictionary construct: [/'I', False, /'K', /b'fa', /b'lse', /'S', /'Transparency', /'Type', /'Group']
Processed text saved to tcs_scraped_data/processed_text_from_pdfs.txt
All content extracted and combined into tcs_scraped_data/processed_html_content.txt


  soup = BeautifulSoup(html_content, 'html.parser')


All content extracted and combined into tcs_scraped_data/processed_Careers_content.txt


In [None]:
# Colab only: mount Google Drive at /content/drive.
# NOTE(review): none of the cells shown above read from or write to
# /content/drive -- confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
