In [1]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd
import time
import re
import logging
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
import os
import concurrent.futures  # Added for parallel processing
import threading

In [2]:
# Set up logging
log_file_path = 'C:/Users/alex/OneDrive - University at Albany - SUNY/FALL 2024/graduate assistant/course_catalog_extraction_P_MAIN_TS.log'

logging.basicConfig(
    filename=log_file_path,
    filemode='w',
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s")

# Add initial log entry
logging.debug("Logging complete. Starting script execution.")

In [3]:
def find_course_catalog_url(base_url):
    logging.info(f"Attempting to find course catalog for {base_url}")
    # Parse the base URL to ensure correctness
    parsed_url = urlparse(base_url)
    scheme = parsed_url.scheme or 'https'
    netloc = parsed_url.netloc or parsed_url.path
    if not netloc:
        logging.error(f"Invalid base URL: {base_url}")
        return {'catalog_url': None, 'status': 'Invalid base URL'}
    
    # Remove 'www.' from netloc if present
    netloc = netloc.replace('www.', '')
    
    # List of possible prefixes
    prefixes = ['catalog', 'catalogs', 'course', 'courses']
    
    potential_urls = []
    
    # Generate potential URLs by adding prefixes as subdomains
    for prefix in prefixes:
        potential_url = f"{scheme}://{prefix}.{netloc}"
        potential_urls.append(potential_url)
    
    logging.info(f"Generated {len(potential_urls)} potential catalog URLs for {base_url}")
    
    # Verify which URL leads to the course catalog
    for url in potential_urls:
        try:
            logging.debug(f"Trying URL: {url}")
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                page_soup = BeautifulSoup(response.content, 'html.parser')
                title = page_soup.title.string.lower() if page_soup.title else ''
                if any(keyword in title for keyword in ['course', 'catalog', 'bulletin', 'curriculum']):
                    logging.info(f"Found course catalog URL: {url}")
                    return {'catalog_url': url, 'status': 'Catalog found via title'}
                # Additional check: look for specific keywords in the page content
                if page_soup.find(string=re.compile('course catalog', re.I)):
                    logging.info(f"Found course catalog URL by content match: {url}")
                    return {'catalog_url': url, 'status': 'Catalog found via content match'}
        except Exception as e:
            logging.debug(f"Error accessing {url}: {e}")
            continue
    
    logging.warning(f"No course catalog found for {base_url}")
    return {'catalog_url': None, 'status': 'No catalog found'}


In [4]:
def search_courses(args):
    # Unpack arguments
    catalog_url, institution_name, base_url = args

    import logging
    import time
    from bs4 import BeautifulSoup
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service as ChromeService
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import (
        NoSuchElementException, TimeoutException,
        StaleElementReferenceException, ElementClickInterceptedException
    )

    # Configure logging for each thread
    logging.basicConfig(
        level=logging.INFO,
        format=f"%(asctime)s - %(levelname)s - [Thread {threading.current_thread().name}] - %(message)s"
    )

    logging.info(f"Starting course search on {catalog_url} for {institution_name}")
    options = webdriver.ChromeOptions()
    # Uncomment the following line to run Chrome in headless mode
    # options.add_argument('--headless')

    # Initialize the Chrome WebDriver
    try:
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    except Exception as e:
        logging.error(f"WebDriver initialization failed for {institution_name}: {e}")
        return {'courses': [], 'status': f'WebDriver init failed: {e}'}

    courses = []

    try:
        # Navigate to the Advanced Search Page
        advanced_search_url = catalog_url.rstrip('/') + '/search_advanced.php'
        driver.get(advanced_search_url)
        logging.info(f"Navigated to advanced search page: {advanced_search_url}")

        wait = WebDriverWait(driver, 15)  # Increased timeout to 15 seconds

        # Locate the 'database_search' form
        try:
            wait.until(EC.presence_of_element_located((By.NAME, 'database_search')))
            database_search_form = driver.find_element(By.NAME, 'database_search')
            logging.info("Located 'database_search' form")
        except TimeoutException:
            logging.error(f"'database_search' form not found on {advanced_search_url}")
            return {'courses': [], 'status': "'database_search' form not found"}

        # Locate the Keyword Search Input within the form
        try:
            search_box = database_search_form.find_element(By.NAME, 'filter[keyword]')
        except NoSuchElementException:
            logging.error(f"Keyword search box not found in 'database_search' form on {advanced_search_url}")
            return {'courses': [], 'status': "Keyword search box not found"}

        # Fill in the search term
        try:
            search_term = 'environmental health'
            search_box.clear()
            search_box.send_keys(f'"{search_term}"')  # Ensure exact phrase search
            logging.info(f"Entered search term '{search_term}' on {advanced_search_url}")
        except Exception as e:
            logging.error(f"Failed to enter search term on {advanced_search_url}: {e}")
            return {'courses': [], 'status': f"Failed to enter search term: {e}"}

        # Ensure 'Exact Match' Checkbox is Checked within the form
        try:
            exact_match_checkbox = database_search_form.find_element(By.NAME, 'filter[exact_match]')
            if not exact_match_checkbox.is_selected():
                exact_match_checkbox.click()
                logging.info("Checked 'Exact Match' checkbox")
        except NoSuchElementException:
            logging.warning(f"'Exact Match' checkbox not found on {advanced_search_url}")
            # Proceeding even if the checkbox is not found

        # Set Search Categories within the form
        # Check 'Courses' checkbox and uncheck others
        categories = [
            ('filter_course', True, 'Courses'),
            ('filter_program', False, 'Programs'),
            ('filter_entity', False, 'Schools and Colleges'),
            ('filter_other', False, 'Academic Rules')
        ]
        for category_id, should_check, category_name in categories:
            try:
                category_checkbox = database_search_form.find_element(By.ID, category_id)
                if category_checkbox.is_selected() != should_check:
                    category_checkbox.click()
                    action = "Checked" if should_check else "Unchecked"
                    logging.info(f"{action} '{category_name}' checkbox")
            except NoSuchElementException:
                logging.debug(f"'{category_name}' checkbox not found or could not be interacted with")
            except Exception as e:
                logging.error(f"Error interacting with '{category_name}' checkbox: {e}")
                return {'courses': [], 'status': f"Error with '{category_name}' checkbox: {e}"}

        # Submit the 'database_search' Form
        try:
            database_search_form.submit()
            logging.info("Submitted 'database_search' form")
        except Exception as e:
            logging.error(f"Failed to submit 'database_search' form: {e}")
            return {'courses': [], 'status': f"Failed to submit form: {e}"}

        # Wait for and Extract Search Results
        try:
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'table.table_default')))
            logging.info("Search results table found.")
        except TimeoutException:
            logging.warning(f"No search results found for {institution_name} on {advanced_search_url}")
            return {'courses': [], 'status': "No search results found"}

        # Parse Course Entries
        try:
            results_table = driver.find_element(By.CSS_SELECTOR, 'table.table_default')
            course_links = results_table.find_elements(By.XPATH, './/a[starts-with(@onclick, "showCatalogData") or starts-with(@onclick, "showCourse")]')
            logging.info(f"Found {len(course_links)} course links in the results table")
        except NoSuchElementException:
            logging.warning(f"No course links found in the results table for {institution_name}")
            return {'courses': [], 'status': "No course links found"}

        # Extract all course links' texts and their onclick attributes first to avoid StaleElementReferenceException
        course_data = []
        for link in course_links:
            try:
                course_name = link.text.strip()
                onclick_attr = link.get_attribute('onclick')
                course_data.append({'name': course_name, 'onclick': onclick_attr})
            except Exception as e:
                logging.error(f"Error extracting data from course link: {e}")
                continue

        logging.debug(f"Course data extracted: {course_data}")

        # Iterate through each course data
        for index, course in enumerate(course_data, start=1):
            course_name = course['name']
            onclick = course['onclick']
            logging.debug(f"Processing course {index}: {course_name} with onclick: {onclick}")

            # Use updated regex to extract parameters from onclick
            match = re.search(
                r"showCourse\(\s*'(\d+)'\s*,\s*'(\d+)'\s*,\s*this\s*,\s*'([^']*)'\s*\)",
                onclick,
                re.IGNORECASE
            )
            if not match:
                logging.error(f"Could not parse onclick attribute for course '{course_name}': {onclick}")
                continue

            catoid, coid, display_options = match.groups()

            # Construct the AJAX URL dynamically based on the catalog_url
            ajax_url = (
                f"{catalog_url.rstrip('/')}/ajax/preview_course.php"
                f"?catoid={catoid}&coid={coid}&display_options={display_options}&show"
            )
            logging.debug(f"Constructed AJAX URL for course '{course_name}': {ajax_url}")

            # Open the AJAX URL in a new tab
            try:
                driver.execute_script("window.open('');")  # Open a new tab
                driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
                driver.get(ajax_url)
                logging.debug(f"Navigated to AJAX URL: {ajax_url}")

                # Wait for the AJAX content to load
                wait.until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
                time.sleep(1)  # Additional wait to ensure content is fully loaded

                # Get the page source and parse with BeautifulSoup
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')

                # Extract the course description based on the provided HTML structure
                # Locate the <h3> with the course name
                h3_heading = soup.find('h3', text=re.compile(re.escape(course_name), re.I))
                if h3_heading:
                    parent_div = h3_heading.find_parent('div')
                    if parent_div:
                        # Remove the <h3> tag
                        h3_heading.decompose()
                        # Replace <br> tags with newline characters for better formatting
                        for br in parent_div.find_all('br'):
                            br.replace_with('\n')
                        # Extract the text
                        course_description = parent_div.get_text(separator=' ', strip=True)
                    else:
                        logging.warning(f"No parent <div> found for <h3> in course '{course_name}'. Extracting all text.")
                        course_description = soup.get_text(separator=' ', strip=True)
                else:
                    logging.warning(f"<h3> heading with course name '{course_name}' not found. Extracting all text.")
                    course_description = soup.get_text(separator=' ', strip=True)

                logging.debug(f"Extracted description for course '{course_name}': {course_description[:100]}...")  # Log first 100 chars

                # Append the course to the list
                courses.append({
                    'Institution': institution_name,
                    'Website': base_url,
                    'Course Name': course_name,
                    'Description': course_description
                })

                # Close the tab and switch back to the main window
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

                # Brief pause before processing the next course
                time.sleep(0.5)

            except Exception as e:
                logging.error(f"Error fetching description for course '{course_name}': {e}")
                # Ensure the new tab is closed if an error occurs
                if len(driver.window_handles) > 1:
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                continue  # Skip to the next course

    except Exception as e:
        logging.error(f"Error processing {catalog_url}: {e}")
        return {'courses': [], 'status': f"Error processing catalog: {e}"}
    finally:
        driver.quit()

    return {'courses': courses, 'status': 'Success'}


In [5]:
# main loop
def main():
    logging.info("Starting main execution")
    # Read the institutions from the CSV file
    csv_file_path = 'C:/Users/alex/OneDrive - University at Albany - SUNY/FALL 2024/graduate assistant/ny_institutions.csv'
    df_institutions = pd.read_csv(csv_file_path)

    all_courses = []
    failed_institutions = []

    tasks = []

    for idx, row in df_institutions.iterrows():
        institution_name = row['Name']
        base_url = row['Website']

        # Ensure the base URL is complete
        if not base_url.startswith('http'):
            base_url = 'https://' + base_url

        logging.info(f"Processing {institution_name} ({base_url})")

        catalog_info = find_course_catalog_url(base_url)
        catalog_url = catalog_info['catalog_url']
        catalog_status = catalog_info['status']

        if catalog_url:
            tasks.append((catalog_url, institution_name, base_url))
        else:
            # Log the failure
            failed_institutions.append({
                'Institution Name': institution_name,
                'Base URL': base_url,
                'Catalog URL Found': False,
                'Failure Reason': catalog_status
            })
            logging.warning(f"Course catalog not found for {institution_name}: {catalog_status}")

    # Use ThreadPoolExecutor to run tasks in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks and unpack arguments
        future_to_task = {executor.submit(search_courses, task): task for task in tasks}

        for future in concurrent.futures.as_completed(future_to_task):
            task = future_to_task[future]
            institution_name = task[1]
            base_url = task[2]
            try:
                result = future.result()
                courses = result.get('courses', [])
                status = result.get('status', 'Unknown')

                if status == 'Success' and courses:
                    all_courses.extend(courses)
                    logging.info(f"Found {len(courses)} courses for {institution_name}")
                elif status == 'Success' and not courses:
                    logging.info(f"No courses found for {institution_name}")
                    failed_institutions.append({
                        'Institution Name': institution_name,
                        'Base URL': base_url,
                        'Catalog URL Found': True,
                        'Failure Reason': 'No courses found'
                    })
                else:
                    # Status indicates failure
                    failed_institutions.append({
                        'Institution Name': institution_name,
                        'Base URL': base_url,
                        'Catalog URL Found': True,
                        'Failure Reason': status
                    })
                    logging.error(f"Failed to scrape courses for {institution_name}: {status}")
            except Exception as e:
                logging.error(f"Error processing {institution_name}: {e}")
                failed_institutions.append({
                    'Institution Name': institution_name,
                    'Base URL': base_url,
                    'Catalog URL Found': True,
                    'Failure Reason': f"Unhandled exception: {e}"
                })

    # Save the data to CSV files
    try:
        # Save successful courses
        if all_courses:
            df_courses = pd.DataFrame(all_courses)
            output_file = 'C:/Users/alex/OneDrive - University at Albany - SUNY/FALL 2024/graduate assistant/scripts/environmental_health_courses_finalTS.csv'
            df_courses.to_csv(output_file, index=False)
            logging.info(f"All courses saved to {output_file}")
        else:
            logging.info("No courses found.")

        # Save failed institutions
        if failed_institutions:
            df_failed = pd.DataFrame(failed_institutions)
            failed_output_file = 'C:/Users/alex/OneDrive - University at Albany - SUNY/FALL 2024/graduate assistant/scripts/failed_institutions.csv'
            df_failed.to_csv(failed_output_file, index=False)
            logging.info(f"Failed institutions saved to {failed_output_file}")
        else:
            logging.info("No failures encountered.")
    except Exception as e:
        logging.error(f"Error saving data to CSV: {e}")
        raise

    logging.info("Script execution completed.")
    
if __name__ == "__main__":
    main()

  h3_heading = soup.find('h3', text=re.compile(re.escape(course_name), re.I))
