# Transfer Courses Web Scraping

In [1]:
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from tqdm import tqdm

In [2]:
# Start from an empty results table that already carries the three output columns.
result_columns = [
    "Transfer College Name",
    "Transfer College Course",
    "MSOE Course Equivalent",
]
df = pd.DataFrame(columns=result_columns)

In [3]:
def close_progress_bar(progress_bar):
    """Close the given progress bar by calling its ``close()`` method.

    Args:
        progress_bar: Any object exposing ``close()`` (e.g. a ``tqdm`` bar).
    """
    finish = progress_bar.close
    finish()

_________________________________________

In [4]:
# Scrape the public TES equivalency listing: open each college, read its
# course table, and keep the rows whose MSOE equivalent mentions one of the
# target course codes. Results accumulate in `data` (a list of dicts).

# Locators used more than once are named here so they cannot drift apart.
COLLEGE_LINK_XPATH = "//table[@id='gdvInstWithEQ']//a[contains(@id, 'gdvInstWithEQ_btnCreditFromInstName')]"
LOCATION_CELL_XPATH = "//table[@id='gdvInstWithEQ']//td[2]"
FIRST_COURSE_DETAIL_XPATH = "//*[@id='gdvCourseEQ_btnViewCourseEQDetail_0']"
COURSE_ROW_XPATH = "//table[@id='gdvCourseEQ']//tr"

# MSOE course codes of interest; a row is kept when any code is a substring
# of the "MSOE equivalent" cell.
TARGET_COURSE_CODES = ["CSC 3511", "SWE 4511", "SWE 4540", "CYB 2001", "MTH 2680", "MTH 3340", "MTH 3410", "MTH 4130", "MTH 4150", "ACS 3410", "MTH 2610"]

# Upper bound on refresh-and-rescan passes of the outer loop.
MAX_PASSES = 50
# Seconds to wait for the college list to be present (initial load and after refresh).
LIST_WAIT_SECONDS = 10
# Seconds to wait for the first course row after clicking a college.
COURSE_WAIT_SECONDS = 2

# Initialize an empty list to store course information
data = []
clicked_colleges = set()
count = 0

driver = webdriver.Chrome()
# Created before the try block so the finally clause can always close it.
progress_bar = tqdm(desc="Progress", unit="college")
try:
    # URL of the website
    base_url = "https://tes.collegesource.com/"
    url = base_url + "publicview/TES_publicview01.aspx?rid=d1554a69-83c6-41cc-9dff-3fbffe353f06&aid=96909b9e-7de7-494b-9d7e-a7a9f73f9941"

    # Open the webpage
    driver.get(url)

    while True:
        # Wait for college links to appear (raises TimeoutException if they never do)
        WebDriverWait(driver, LIST_WAIT_SECONDS).until(EC.presence_of_element_located((By.XPATH, COLLEGE_LINK_XPATH)))

        # Find all college links on the current page
        college_links = driver.find_elements(By.XPATH, COLLEGE_LINK_XPATH)
        total_colleges = len(college_links)
        locations = driver.find_elements(By.XPATH, LOCATION_CELL_XPATH)

        if not college_links:
            break

        # Update total steps for tqdm progress bar
        progress_bar.total = total_colleges

        # Iterate over each college link
        for i in range(len(college_links)):
            try:
                college_link = college_links[i]
                college_name = college_link.text.strip()
                # NOTE(review): the +1 offset presumably skips a header/pager cell
                # matched by the td[2] locator — confirm against the live table.
                location = locations[i + 1].text.strip()
                college_tuple = (college_name, location)
                if college_tuple not in clicked_colleges:
                    college_link.click()
                    clicked_colleges.add(college_tuple)

                    # Wait for course rows to appear
                    WebDriverWait(driver, COURSE_WAIT_SECONDS).until(EC.presence_of_element_located((By.XPATH, FIRST_COURSE_DETAIL_XPATH)))

                    # Find all course rows
                    course_rows = driver.find_elements(By.XPATH, COURSE_ROW_XPATH)

                    # Extract course information
                    for row in course_rows[1:]:  # Exclude header row
                        cols = row.find_elements(By.TAG_NAME, "td")
                        if len(cols) >= 2:
                            course_name = cols[0].text.strip()
                            msoe_equivalent = cols[1].text.strip()
                            if any(code in msoe_equivalent for code in TARGET_COURSE_CODES):
                                data.append({"Transfer College Name": college_name,
                                             "Transfer College Course": course_name,
                                             "MSOE Course Equivalent": msoe_equivalent})

                    # Count only colleges that were actually scraped; colleges
                    # skipped as already-visited are re-seen on every pass and
                    # would otherwise inflate the bar.
                    progress_bar.update(1)
            except StaleElementReferenceException:
                # Presumably the click above re-rendered the page, so the
                # remaining link handles are stale; end this pass and rescan
                # after the refresh below.
                break
            except Exception as exc:
                # Keep the best-effort behaviour (end this pass) but report the
                # failure instead of hiding it.
                tqdm.write(f"Stopping pass {count + 1} at link index {i}: {exc!r}")
                break

        # Refresh the page to avoid stale elements
        driver.refresh()
        WebDriverWait(driver, LIST_WAIT_SECONDS).until(EC.presence_of_element_located((By.XPATH, COLLEGE_LINK_XPATH)))
        count += 1
        if count == MAX_PASSES:
            break

finally:
    # Finish the progress display, then close the WebDriver
    progress_bar.close()
    driver.quit()


Progress:   2%|▏         | 1/50 [00:04<03:39,  4.48s/college]

In [None]:
# Build the results table from the scraped records. The column list is given
# explicitly so that an empty `data` still yields a frame with the expected
# three columns instead of a column-less frame.
df = pd.DataFrame(
    data,
    columns=["Transfer College Name", "Transfer College Course", "MSOE Course Equivalent"],
)
df.head()

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
import pandas as pd

class WebsiteScraper:
    """Selenium-driven scraper for the public TES equivalency listing.

    For every college link on each listing page the scraper opens the
    college, reads its course table, and records the rows whose MSOE
    equivalent contains one of ``TARGET_COURSE_CODES``.

    Records are dicts with the keys ``"Transfer College Name"``,
    ``"Transfer College Course"`` and ``"MSOE Course Equivalent"``.
    """

    # XPath locators (plain strings; paired with By.XPATH at the call sites).
    COLLEGE_TABLE_XPATH = "//table[@id='gdvInstWithEQ']"
    COLLEGE_LINK_XPATH = "//table[@id='gdvInstWithEQ']//a[contains(@id, 'gdvInstWithEQ_btnCreditFromInstName')]"
    FIRST_COURSE_DETAIL_XPATH = "//*[@id='gdvCourseEQ_btnViewCourseEQDetail_0']"
    COURSE_ROW_XPATH = "//table[@id='gdvCourseEQ']//tr"
    NEXT_PAGE_XPATH = "//a[contains(@href, 'Page$') and contains(text(), 'Next')]"

    # A course row is kept when any of these codes is a substring of its
    # "MSOE equivalent" cell.
    TARGET_COURSE_CODES = ("CSC 3511", "SWE 4511", "SWE 4540", "CYB 2001", "MTH 2680", "MTH 3340", "MTH 3410", "MTH 4130", "MTH 4150", "ACS 3410", "MTH 2610")

    def __init__(self, records=None, total_pages=33, timeout=10):
        """Create a scraper; no browser is started until ``open_website``.

        Args:
            records: Optional list that receives the scraped records. When
                omitted, records are appended to the module-level ``data``
                list, which is what the existing driver code relies on.
            total_pages: Number of listing pages to walk (33 was the
                hard-coded assumption).
            timeout: Seconds each explicit wait may block before Selenium
                raises ``TimeoutException``. The previous fixed 0.5 s left
                almost no room for a page load.
        """
        self.driver = None
        # Name of the college currently being scraped; set before each click.
        self.college_name = None
        self.records = records
        self.total_pages = total_pages
        self.timeout = timeout

    def scrape_website(self):
        """Run the whole scrape and always shut the browser down afterwards."""
        try:
            self.open_website()
            self.process_all_pages()
        finally:
            self.close_driver()

    def open_website(self):
        """Start Chrome and load the first listing page."""
        self.driver = webdriver.Chrome()
        base_url = "https://tes.collegesource.com/"
        url = base_url + "publicview/TES_publicview01.aspx?rid=d1554a69-83c6-41cc-9dff-3fbffe353f06&aid=96909b9e-7de7-494b-9d7e-a7a9f73f9941"
        self.driver.get(url)

    def process_all_pages(self):
        """Process ``total_pages`` listing pages, advancing between them."""
        for page_num in range(self.total_pages):
            self.process_current_page()
            if page_num < self.total_pages - 1:
                self.go_to_next_page()

    def process_current_page(self):
        """Scrape every college on the listing page currently displayed.

        The page is handled exactly once. The earlier ``while True`` wrapper
        could never reach its ``break`` (the wait raises before the link list
        can be empty), so it either repeated the same page forever or ended
        with an exception.
        """
        self._wait_for_college_links()
        college_links = self.driver.find_elements(By.XPATH, self.COLLEGE_LINK_XPATH)
        if college_links:
            self.process_colleges_on_current_page(college_links)

    def process_colleges_on_current_page(self, college_links):
        """Visit each college on the current page and scrape its courses.

        Args:
            college_links: Link elements found on the current listing page.
                Only the first one is clicked from this list; later ones are
                looked up again by position because clicking a college changes
                the page and invalidates previously found elements.
        """
        total = len(college_links)
        progress_bar = tqdm(desc="Progress", unit="college", total=total)
        try:
            for index in range(total):
                if index > 0:
                    # Re-locate: handles obtained before the previous click are stale.
                    college_links = self.driver.find_elements(By.XPATH, self.COLLEGE_LINK_XPATH)
                    if index >= len(college_links):
                        break
                college_link = college_links[index]
                # Remember the name before clicking; save_course_data reads it.
                self.college_name = college_link.text.strip()
                college_link.click()
                self.process_courses_on_college_page()
                self._return_to_college_list()
                progress_bar.update(1)
        finally:
            # Close the bar even when a wait or click raises.
            progress_bar.close()

    def process_courses_on_college_page(self):
        """Read the course table of the open college and save matching rows."""
        WebDriverWait(self.driver, self.timeout).until(EC.presence_of_element_located((By.XPATH, self.FIRST_COURSE_DETAIL_XPATH)))
        # The first <tr> is the header row.
        course_rows = self.driver.find_elements(By.XPATH, self.COURSE_ROW_XPATH)[1:]
        for row in course_rows:
            cols = row.find_elements(By.TAG_NAME, "td")
            if len(cols) >= 2:
                course_name = cols[0].text.strip()
                msoe_equivalent = cols[1].text.strip()
                if any(code in msoe_equivalent for code in self.TARGET_COURSE_CODES):
                    self.save_course_data(course_name, msoe_equivalent)

    def save_course_data(self, course_name, msoe_equivalent):
        """Append one record for the college currently being scraped.

        Args:
            course_name: Text of the transfer college's course cell.
            msoe_equivalent: Text of the MSOE equivalent cell.
        """
        # Fall back to the module-level `data` list when no sink was supplied,
        # preserving the behaviour existing callers depend on.
        sink = self.records if self.records is not None else data
        sink.append({"Transfer College Name": self.college_name,
                     "Transfer College Course": course_name,
                     "MSOE Course Equivalent": msoe_equivalent})

    def go_to_next_page(self):
        """Click the "Next" pager link and wait for the listing to be replaced.

        Raises:
            ValueError: If no "Next" link is present on the page.
        """
        # find_elements returns an empty list instead of raising, which makes
        # the "not found" branch reachable.
        next_page_links = self.driver.find_elements(By.XPATH, self.NEXT_PAGE_XPATH)
        if not next_page_links:
            raise ValueError("Next page link not found")
        # Grab the current table first: staleness_of needs an element
        # reference, and waiting for it to go stale confirms the old listing
        # has been replaced.
        current_table = self.driver.find_element(By.XPATH, self.COLLEGE_TABLE_XPATH)
        next_page_links[0].click()
        WebDriverWait(self.driver, self.timeout).until(EC.staleness_of(current_table))

    def close_driver(self):
        """Quit the browser if one was started."""
        if self.driver:
            self.driver.quit()

    def _wait_for_college_links(self):
        """Block until at least one college link is present on the page."""
        WebDriverWait(self.driver, self.timeout).until(EC.presence_of_element_located((By.XPATH, self.COLLEGE_LINK_XPATH)))

    def _return_to_college_list(self):
        """Navigate from a college's course view back to the listing page.

        NOTE(review): assumes browser "back" restores the listing (and the
        same pager position) — confirm against the live site; it may need a
        dedicated on-page control instead.
        """
        self.driver.back()
        self._wait_for_college_links()

if __name__ == "__main__":
    # WebsiteScraper.save_course_data appends its records to this module-level list.
    data = []
    scraper = WebsiteScraper()
    scraper.scrape_website()
    # Explicit columns keep the schema even when nothing matched.
    df = pd.DataFrame(
        data,
        columns=["Transfer College Name", "Transfer College Course", "MSOE Course Equivalent"],
    )
    # A bare `df.head()` inside an `if` block is discarded (only a cell's final
    # top-level expression is displayed), so print the preview explicitly.
    print(df.head())


In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Function to scrape text from a webpage
def scrape_page(url, timeout=10, headers=None):
    """Download ``url`` and return its visible text, or ``None`` on failure.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.
        headers: Optional dict of HTTP headers to send with the request.
            NOTE(review): the recorded run of this cell got HTTP 403; a
            browser-like ``User-Agent`` supplied here may be required —
            unverified.

    Returns:
        The page text with one text node per line (whitespace stripped), or
        ``None`` when the request fails or the status code is not 200. A
        message describing the failure is printed in either failure case.
    """
    try:
        response = requests.get(url, timeout=timeout, headers=headers)
    except requests.RequestException as exc:
        # Connection errors and timeouts follow the same "report and return
        # None" contract as a non-200 status.
        print(f"Failed to scrape {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to scrape {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract text from the webpage
    return soup.get_text(separator='\n', strip=True)

# Entry point: fetch the main listing page and store its text in a CSV file
def main():
    """Scrape the main TES page and write a URL/Text row to ``scraped_data.csv``.

    The file is always (re)created with a header row; the data row is written
    only when ``scrape_page`` returns non-empty text.
    """
    main_url = "https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=d1554a69-83c6-41cc-9dff-3fbffe353f06&aid=96909b9e-7de7-494b-9d7e-a7a9f73f9941"
    header_row = ["URL", "Text"]

    with open("scraped_data.csv", "w", newline='', encoding="utf-8") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header_row)

        # Scrape main page
        print(f"Scraping: {main_url}")
        page_text = scrape_page(main_url)
        if not page_text:
            # Nothing to record; the header-only file is left in place.
            return
        writer.writerow([main_url, page_text])

if __name__ == "__main__":
    main()


Scraping: https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=d1554a69-83c6-41cc-9dff-3fbffe353f06&aid=96909b9e-7de7-494b-9d7e-a7a9f73f9941
Failed to scrape https://tes.collegesource.com/publicview/TES_publicview01.aspx?rid=d1554a69-83c6-41cc-9dff-3fbffe353f06&aid=96909b9e-7de7-494b-9d7e-a7a9f73f9941. Status code: 403
