<a href="https://colab.research.google.com/github/cdukedev/AI-Skyy-Project/blob/main/WebCrawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# VERSION 1 (superseded, kept for reference): recursive crawl that prints each URL as it is found
# import requests
# from bs4 import BeautifulSoup
# from urllib.parse import urljoin
# import time

# # Set the base URL
# base_url = "https://www.mdc.edu/"

# # Set to store crawled URLs to avoid duplication
# crawled_urls = set()

# # Function to scrape a single page
# def scrape_page(url):
#     try:
#         # Send a GET request to the URL
#         response = requests.get(url, timeout=5)
#         response.raise_for_status()  # Check for HTTP errors
#     except requests.exceptions.RequestException as e:
#         print(f"Request failed: {e}")
#         return  # Skip this page and return from the function

#     # Try parsing with different parsers in case of errors
#     parsers = ['html.parser', 'lxml', 'html5lib']
#     for parser in parsers:
#         try:
#             soup = BeautifulSoup(response.text, parser)
#             break  # If parsing succeeds, exit the loop
#         except Exception as e:
#             print(f"Failed to parse with {parser} due to {e}")
#             if parser == parsers[-1]:  # If last parser fails
#                 return  # Unable to parse, so skip this page

#     # Find all anchor tags in the HTML
#     for link in soup.find_all('a'):
#         href = link.get('href')
#         if href:
#             # Create an absolute URL
#             full_url = urljoin(url, href)

#             # Ensure the URL is within the same base URL and not already crawled
#             if full_url.startswith(base_url) and full_url not in crawled_urls:
#                 print(full_url)
#                 crawled_urls.add(full_url)
#                 # Recursively scrape the linked page
#                 scrape_page(full_url)  # Recursive call to continue the crawl

# # Start the crawl from the base URL
# scrape_page(base_url)




# VERSION 2 (superseded, kept for reference): recursive crawl that saves the URLs to crawled_urls.json
# import requests
# from bs4 import BeautifulSoup
# from urllib.parse import urljoin
# import time
# import json  # Import JSON library to handle array storage

# # Set the base URL
# base_url = "https://www.mdc.edu/"

# # Set to store crawled URLs to avoid duplication
# crawled_urls = set()

# # Function to scrape a single page
# def scrape_page(url):
#     try:
#         # Send a GET request to the URL
#         response = requests.get(url, timeout=5)
#         response.raise_for_status()  # Check for HTTP errors
#     except requests.exceptions.RequestException as e:
#         print(f"Request failed: {e}")
#         return  # Skip this page and return from the function

#     # Try parsing with different parsers in case of errors
#     parsers = ['html.parser', 'lxml', 'html5lib']
#     for parser in parsers:
#         try:
#             soup = BeautifulSoup(response.text, parser)
#             break  # If parsing succeeds, exit the loop
#         except Exception as e:
#             print(f"Failed to parse with {parser} due to {e}")
#             if parser == parsers[-1]:  # If last parser fails
#                 return  # Unable to parse, so skip this page

#     # Find all anchor tags in the HTML
#     for link in soup.find_all('a'):
#         href = link.get('href')
#         if href:
#             # Create an absolute URL
#             full_url = urljoin(url, href)

#             # Ensure the URL is within the same base URL and not already crawled
#             if full_url.startswith(base_url) and full_url not in crawled_urls:
#                 crawled_urls.add(full_url)
#                 # Recursively scrape the linked page
#                 scrape_page(full_url)  # Recursive call to continue the crawl

# # Start the crawl from the base URL
# scrape_page(base_url)

# # After crawling is complete, write the crawled URLs to a file
# with open('crawled_urls.json', 'w') as file:
#     # Convert the set to a list for JSON serialization
#     json.dump(list(crawled_urls), file)


import requests
from google.colab import files
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import json

# Root of the site being crawled; only links that start with this prefix are followed.
base_url = "https://www.mdc.edu/"

# Every URL that has already been visited, so no page is crawled twice.
crawled_urls = set()

# Work list of URLs still to visit, seeded with the root page.
url_queue = [base_url]

# Function to scrape a single page
def scrape_page(url):
    """Fetch *url* and return the in-site links found on it.

    A link is returned when, after being resolved against *url* and having
    its ``#fragment`` removed, it starts with the module-level ``base_url``
    and is not already in the module-level ``crawled_urls`` set.  Each link
    appears at most once in the result, in document order.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A list of absolute URL strings.  The list is empty when the request
        fails, when no parser can handle the response, or when the page has
        no qualifying links.
    """
    # Imported locally so this function does not depend on a change to the
    # import cell at the top of the notebook.
    from urllib.parse import urldefrag

    try:
        # Send a GET request to the URL
        response = requests.get(url, timeout=5)
        response.raise_for_status()  # Check for HTTP errors
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return []  # Return an empty list for consistency

    # Try each parser in turn; a parser may fail, e.g. because its backing
    # library is not installed.  Only the parsing itself is retried, so a
    # failure cannot leave half-collected links behind.
    parsers = ['html.parser', 'lxml', 'html5lib']
    for parser in parsers:
        try:
            soup = BeautifulSoup(response.text, parser)
            anchors = soup.find_all('a')
        except Exception as e:
            print(f"Failed to parse with {parser} due to {e}")
            continue
        break  # Parsing succeeded
    else:
        return []  # Every parser failed, so skip this page

    new_urls = []
    seen_on_page = set()  # De-duplicates links repeated within this page
    for link in anchors:
        href = link.get('href')
        if not href:
            continue
        try:
            absolute = urljoin(url, href)
        except ValueError:
            # A single malformed href must not discard the rest of the page.
            continue
        # "page.aspx#a" and "page.aspx#b" are the same document; dropping the
        # fragment stops the crawler from fetching it once per anchor.
        full_url = urldefrag(absolute).url
        if not full_url.startswith(base_url):
            continue
        if full_url in crawled_urls or full_url in seen_on_page:
            continue
        seen_on_page.add(full_url)
        new_urls.append(full_url)

    return new_urls

# Breadth-first crawl: take URLs from the front of the queue until none are left.
while url_queue:
    current_url = url_queue.pop(0)
    if current_url in crawled_urls:
        # The same link can be queued more than once; visit it only the first time.
        continue
    print(f"Crawling: {current_url}")
    crawled_urls.add(current_url)  # Mark as visited before fetching
    new_urls = scrape_page(current_url)
    url_queue += new_urls  # Newly found links go to the back of the queue

# After crawling is complete, write the crawled URLs to a file.
# A set is not JSON-serialisable and has no defined order, so emit a sorted
# list: the file content is then reproducible from one run to the next.
with open('crawled_urls.json', 'w', encoding='utf-8') as file:
    json.dump(sorted(crawled_urls), file)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseOne
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFive
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseThree
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqOne
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqTwo
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqThree
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqFour
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqFive
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqSix
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqSeven
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqEight
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqNine
Crawling: https://www.mdc.edu/adult-education/esol.aspx#collapseFaqTen
Crawling: