In [1]:
# Import required libraries
import logging
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
import wikipedia
from bs4 import BeautifulSoup

In [None]:
# Set up logging
#
# All messages at DEBUG level and above are sent to a single log file, which
# is opened in write mode ('w'). Each line carries a timestamp, the level
# name, and the message text.
log_file_path = '/mnt/c/Users/WSTATION/Desktop/NEW_ETL_TEST/course_scraping_EHS.log'

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename=log_file_path,
    filemode='w',
)

# First entry in the log: marks the start of the run
logging.debug("Logging complete. Starting script execution.")


In [None]:
# 1. Define the target URL
#
# Wikipedia's list of colleges and universities in New York (state).
wiki_url = (
    "https://en.wikipedia.org/wiki/List_of_colleges_and_universities_in_New_York_(state)"
)
logging.info(f"Target URL set: {wiki_url}")

In [4]:
# 2. Fetch and parse the Wikipedia page
#
# Downloads the page at `wiki_url` and parses it into `soup`. `headers` is
# kept at notebook scope because the per-institution requests reuse it.
# Any failure is logged and re-raised so the run stops here rather than
# continuing without a parsed page.
try:
    # Identify this client with a descriptive User-Agent
    headers = {'User-Agent': 'InstitutionScraper/1.0 (agodinez@albany.edu)'}
    # Always pass a timeout: without one, requests waits indefinitely on a
    # stalled connection and this cell would never finish.
    response = requests.get(wiki_url, headers=headers, timeout=30)
    response.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
    soup = BeautifulSoup(response.content, 'html.parser')
    logging.info("Wikipedia page fetched and parsed successfully.")
except Exception as e:
    logging.error(f"Error fetching or parsing the Wikipedia page: {e}")
    raise

In [5]:
# 3. Identify the desired sections
#
# Titles of the top-level page sections to scrape; section headings are
# compared against these strings exactly.
desired_sections = ['Public', 'Private, not-for-profit, non-sectarian']
logging.info(f"Desired sections: {desired_sections}")


In [None]:
# 4. Extract institutions from desired sections
#
# Walks every level-2 section heading on the parsed page. For each heading
# whose title is in `desired_sections`, the elements that follow it (up to the
# next level-2 heading) are scanned in order: a level-3 heading updates the
# current sub-section title, and any other element contributes one record per
# <li> that contains a link. Records are de-duplicated by institution name, so
# only the first occurrence of a name is kept.
#
# Result: `institutions`, a list of dicts with the keys
# 'Name', 'Wikipedia Link', 'Section' and 'Sub-section'.
institutions = []
seen_names = set()  # names already recorded, for a constant-time duplicate check

# Section headings are <div> wrappers carrying the 'mw-heading2' class. Match
# on that single class (the same test used below to detect the end of a
# section) so that additional classes on the wrapper do not hide a heading.
for div_h2 in soup.find_all('div', class_='mw-heading2'):
    h2 = div_h2.find('h2')
    if h2 is None:
        continue

    section_title = h2.get_text(strip=True).replace('[edit]', '').strip()
    logging.info(f"Found section: '{section_title}'")
    if section_title not in desired_sections:
        logging.debug(f"Skipped section: {section_title}")
        continue
    logging.info(f"Processing section: {section_title}")

    # Stays empty until the first level-3 heading of this section is seen
    sub_section_title = ''

    for element in div_h2.find_next_siblings():
        element_classes = element.get('class', [])
        is_div = element.name == 'div'

        # The next level-2 heading marks the end of this section
        if is_div and 'mw-heading2' in element_classes:
            break

        # A level-3 heading only updates the current sub-section title
        if is_div and 'mw-heading3' in element_classes:
            h3 = element.find('h3')
            if h3 is not None:
                sub_section_title = h3.get_text(strip=True).replace('[edit]', '').strip()
                logging.info(f"  Sub-section: {sub_section_title}")
            continue

        # Anything else may hold list items; the first link inside each <li>
        # is taken as the institution
        for li in element.find_all('li'):
            a_tag = li.find('a', href=True)
            if a_tag is None:
                continue

            institution_name = a_tag.get_text(strip=True)
            if institution_name in seen_names:
                continue  # avoid duplicates
            seen_names.add(institution_name)

            # Resolve the href against the list page so that site-relative,
            # protocol-relative and already-absolute links all yield a valid URL
            institution_wiki_link = urljoin(wiki_url, a_tag['href'])
            institutions.append({
                'Name': institution_name,
                'Wikipedia Link': institution_wiki_link,
                'Section': section_title,
                'Sub-section': sub_section_title
            })
            logging.debug(f"Added institution: {institution_name} from {sub_section_title}")



In [None]:
# 5. Fetch official website links from institution Wikipedia pages
#
# For every record in `institutions`, downloads the institution's Wikipedia
# page, looks for an infobox table, and reads the first link in the first
# infobox row whose header mentions "website" (case-insensitive). The result
# is stored under the 'Website' key; it is an empty string when the page
# cannot be fetched, has no infobox, or has no such link. Errors are logged
# and do not stop the loop, so one bad page does not abort the whole run.
for idx, institution in enumerate(institutions, start=1):
    logging.info(f"Processing institution {idx}/{len(institutions)}: {institution['Name']}")
    website = ''  # default when the page, infobox, or link is missing
    try:
        # One-second pause before each request (simple rate limiting)
        time.sleep(1)
        # Always pass a timeout: without one, a single stalled connection
        # would block the entire loop indefinitely.
        page_response = requests.get(institution['Wikipedia Link'], headers=headers, timeout=30)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.content, 'html.parser')

        # Find the infobox: the first table with a class containing 'infobox'
        infobox = page_soup.find('table', {'class': re.compile('infobox')})
        if infobox is None:
            logging.warning(f"Infobox not found for {institution['Name']}")
        else:
            for row in infobox.find_all('tr'):
                if row.th is None or 'website' not in row.th.get_text().lower():
                    continue
                # href=True already guarantees the matched tag has an href
                link = row.td.find('a', href=True) if row.td is not None else None
                if link is not None:
                    website = link['href']
                    logging.debug(f"Found website for {institution['Name']}: {website}")
                break  # only the first "website" row is considered
    except Exception as e:
        logging.error(f"Error fetching data for {institution['Name']}: {e}")
        website = ''
    institution['Website'] = website

logging.info("Website extraction completed.")

In [None]:
# 6. Save the data to a CSV file
#
# Builds a DataFrame from the scraped records and writes it, without the
# index column, to `output_path`. A failure is logged and then re-raised.
output_path = '/mnt/c/Users/WSTATION/Desktop/NEW_ETL_TEST/ny_institutions.csv'
try:
    df = pd.DataFrame(institutions)
    df.to_csv(output_path, index=False)
except Exception as e:
    logging.error(f"Error saving data to CSV: {e}")
    raise
else:
    logging.info(f"Data saved to {output_path}")
logging.info("Script execution completed.")


In [None]:
# Show how many records were scraped and preview the first few as a table,
# rather than dumping the entire list of dicts as plain text.
print(f"Scraped {len(institutions)} institutions; showing the first 10:")
pd.DataFrame(institutions).head(10)