In [None]:
import requests
from bs4 import BeautifulSoup
import json

# URL of the page you want to scrape
url = 'https://www.theuniguide.co.uk/university-of-york-y50/'

# Send an HTTP request to the URL. requests applies no timeout by default, so
# set one explicitly to keep the cell from hanging on an unresponsive server.
response = requests.get(url, timeout=30)

# Check that the request succeeded before trying to parse anything
if response.status_code == 200:
    # Parse the content of the page with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the <script> tag containing the application/json data
    script_tag = soup.find('script', {'type': 'application/json', 'class': 'js-react-on-rails-component', 'data-component-name': 'InstitutionHeader'})

    # Tag.string is None when the tag is empty or has several child nodes,
    # so guard it before handing the value to json.loads.
    json_data = script_tag.string if script_tag is not None else None

    if json_data:
        # Parse the JSON string into a Python object
        parsed_data = json.loads(json_data)

        # Save the parsed data to a JSON file
        with open('institution_header.json', 'w') as json_file:
            json.dump(parsed_data, json_file, indent=4)

        print("JSON data saved to 'institution_header.json'")
    elif script_tag is not None:
        print("Script tag found, but it holds no JSON payload.")
    else:
        print("Script tag not found.")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


JSON data saved to 'institution_header.json'


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page you want to scrape
url = 'https://www.theuniguide.co.uk/university-of-york-y50/'

# Send a GET request to the website (explicit timeout: requests has none by default)
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title of the institution; a page without an <h1> yields None
    # instead of raising AttributeError on the missing tag.
    h1_tag = soup.find('h1')
    institution_title = h1_tag.get_text(strip=True) if h1_tag is not None else None

    # Extract the text of every <h2> heading
    subtitles = [subtitle.get_text(strip=True) for subtitle in soup.find_all('h2')]

    # Extract image URLs. src=True skips <img> tags that have no src attribute,
    # which would otherwise raise KeyError.
    image_urls = [img['src'] for img in soup.find_all('img', src=True)]

    # All <section> elements; collected but not used further in this cell
    sections = soup.find_all('section')

    # Collect data in a dictionary for organization
    scraped_data = {
        'Institution Title': institution_title,
        'Subtitles': subtitles,
        'Image URLs': image_urls,
    }

    # Print the scraped data (you can also save it to a CSV or JSON file)
    print(scraped_data)

    # Save the data as a single-row CSV (list values are written as their string form)
    df = pd.DataFrame([scraped_data])
    df.to_csv('university_of_york_data.csv', index=False)

    print("Data saved to 'university_of_york_data.csv'")

else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


{'Institution Title': 'University of York', 'Subtitles': ['How students describe this university', 'Vital stats', 'What students say about this university', 'University images', 'Related advice articles', 'Explore more universities and colleges', 'Promoted universities', 'Browse expert advice'], 'Image URLs': ['https://cdn.theuniguide.co.uk/assets/logos/the-uni-guide-44a357fe789a61eea42d7e384bd0e37604988caaac627dc809a9bc45288794e3.svg', 'https://cdn.theuniguide.co.uk/assets/logos/the-uni-guide-mobile-034761f39ae32e883e0d790f2599ab6179adda42b695ee68658033765992d69e.svg', 'https://cdn.theuniguide.co.uk/assets/burger-menu-12ac258fbf8cec9d8083ac4b772649d3044674930f58f82c1347f0885dd965b7.svg', 'https://cdn.theuniguide.co.uk/uploads/image/file/11353/TSR_logo_-_The_Uni_Guide__400x200_.png', 'https://cdn.theuniguide.co.uk/uploads/image/file/11353/TSR_logo_-_The_Uni_Guide__400x200_.png', 'https://cdn.theuniguide.co.uk/assets/dialog-a14250dd301297fbbdd946f15682becec1b12a0315458caaf198a8c69c4ac0c

In [None]:
import json
from io import StringIO

import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page to scrape
url = 'https://www.theuniguide.co.uk/university-of-york-y50/'

# Send a GET request to the website (explicit timeout: requests has none by default)
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the institution name; a page without an <h1> yields None
    # instead of raising AttributeError on the missing tag.
    h1_tag = soup.find('h1')
    institution_title = h1_tag.get_text(strip=True) if h1_tag is not None else None

    # Extract all subtitles (h2, h3 and h4 headings, in document order)
    subtitles = [subtitle.get_text(strip=True) for subtitle in soup.find_all(['h2', 'h3', 'h4'])]

    # Extract all paragraph texts (descriptions)
    descriptions = [p.get_text(strip=True) for p in soup.find_all('p')]

    # Extract all the links that carry an href
    links = [{'text': a.get_text(strip=True), 'url': a['href']} for a in soup.find_all('a', href=True)]

    # Extract tables, if any (using pandas)
    tables = []
    try:
        # Passing a literal HTML string to read_html is deprecated; wrap it in
        # a file-like object as the pandas warning requests.
        tables = pd.read_html(StringIO(response.text))
        print(f"Found {len(tables)} table(s).")
    except ValueError:
        # read_html raises ValueError when the document contains no tables
        print("No tables found on the page.")

    # Extract all image URLs. src=True skips <img> tags without a src
    # attribute, which would otherwise raise KeyError.
    image_urls = [img['src'] for img in soup.find_all('img', src=True)]

    # Combine all the data into a structured format (dictionary)
    scraped_data = {
        'Institution Title': institution_title,
        'Subtitles': subtitles,
        'Descriptions': descriptions,
        'Links': links,
        'Image URLs': image_urls,
        'Tables': [table.to_dict() for table in tables]  # Convert tables to dict format if any exist
    }

    # Print the scraped data for inspection (optional)
    print(scraped_data)

    # Single-row DataFrame view of the scraped data (not written to disk in this cell)
    df = pd.DataFrame([scraped_data])

    # Save descriptions, subtitles, and links into separate CSV files
    pd.DataFrame(descriptions, columns=['Description']).to_csv('descriptions.csv', index=False)
    pd.DataFrame(subtitles, columns=['Subtitle']).to_csv('subtitles.csv', index=False)
    pd.DataFrame(links).to_csv('links.csv', index=False)

    # Save all data into a JSON file for detailed storage
    with open('university_of_york_data.json', 'w') as json_file:
        json.dump(scraped_data, json_file, indent=4)

    print("Data saved to 'university_of_york_data.json' and respective CSV files.")

else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


No tables found on the page.
{'Institution Title': 'University of York', 'Subtitles': ['How students describe this university', 'Varied union activities', '73%', 'Sporty', '73%', 'Political', '65%', 'Diverse local nightlife', '68%', 'Creative', '60%', 'Vital stats', 'Number of students', 'Undergraduate / Postgraduate', 'Full-time / Part-time', 'Male / Female', 'Young / Mature', 'UK / Non-UK', 'What students say about this university', 'University images', 'Related advice articles', 'Should you apply for more than one course at the same university?', 'First and insurance university choices explained', 'Calculating Ucas points based on predicted grades', 'Further education college versus university: how do degree studies differ?', 'Top questions to ask at a university open day', 'Six things you need to know before making your final A-level choices', 'How important are GCSE choices when it comes to university?', 'How to get the most out of university open days', 'Explore more universities

  tables = pd.read_html(response.text)  # Attempt to extract tables


In [None]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the courses page
url = 'https://www.theuniguide.co.uk/university-of-york-y50/courses'

# File the results are written to; also used in the confirmation message so
# the message always names the file that was really written.
output_path = 'university_of_york_courses1.csv'

# Send a GET request to fetch the page content (explicit timeout: requests has none by default)
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all course elements using the provided HTML structure
    course_list = soup.find_all('dl', class_='result-card__heading--indented')

    courses = []

    # Iterate over each course and extract details
    for course in course_list:
        # Course Name
        name_tag = course.find('h6', class_='course-name')
        course_name = name_tag.get_text(strip=True) if name_tag else 'No Title'

        # Course Link: resolve the href against the page URL. The 'No Link'
        # sentinel is kept as-is rather than being glued onto the base URL.
        link_tag = course.find('a', href=True)
        course_link = urljoin(url, link_tag['href']) if link_tag else 'No Link'

        # Additional course snippet text
        snippet_tag = course.find('span', class_='snippet')
        course_snippet = snippet_tag.get_text(strip=True) if snippet_tag else 'No Details'

        # Add the extracted data to the courses list
        courses.append({
            'Course Name': course_name,
            'Course Link': course_link,
            'Course Details': course_snippet
        })

    # Check if any courses were found
    if courses:
        # Convert the list of courses to a pandas DataFrame
        df = pd.DataFrame(courses)

        # Save the data to a CSV file
        df.to_csv(output_path, index=False)
        print(f"Data saved to '{output_path}'")

        # Optionally, display the scraped data
        print(df)
    else:
        print("No courses found. Please verify the HTML structure.")

else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Data saved to 'university_of_york_courses.csv'
                                          Course Name  \
0             German and Italian (with a year abroad)   
1   Chemistry, Green Principles and Sustainable Pr...   
2     Music Technology Systems with a Foundation Year   
3                                     Nursing (Child)   
4                                            Genetics   
5                  Physical Geography and Environment   
6                 Electronic and Computer Engineering   
7                            Social and Public Policy   
8                             Business and Management   
9                    Sociology with Social Psychology   
10                              Environmental Science   
11                           Archaeology and Heritage   
12                                Theoretical Physics   
13                                Robotic Engineering   
14          Actuarial Science with a year in industry   
15                                       

In [None]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL for the first page (without pagination)
base_url = 'https://www.theuniguide.co.uk/university-of-york-y50/courses'

# File the results are written to; also used in the confirmation message so
# the message always names the file that was really written.
output_path = 'university_of_york100.csv'

# Initialize an empty list to store all courses
courses = []

# Upper bound on the number of pages to request; the loop stops earlier as
# soon as a page fails to load or contains no course elements.
total_pages = 30

# Loop through each page
for page in range(1, total_pages + 1):
    # Page 1 is the bare URL; later pages use the ?page=N query parameter
    url = base_url if page == 1 else f'{base_url}?page={page}'

    # Send a GET request to the page (explicit timeout: requests has none by default)
    response = requests.get(url, timeout=30)

    # Stop if a page fails to load
    if response.status_code != 200:
        print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
        break

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all course elements
    course_list = soup.find_all('dl', class_='result-card__heading--indented')

    # An empty page means there is nothing further to collect
    if not course_list:
        print(f"No courses found on page {page}; stopping.")
        break

    # Iterate over each course and extract details
    for course in course_list:
        name_tag = course.find('h6', class_='course-name')
        course_name = name_tag.get_text(strip=True) if name_tag else 'No Title'

        # Resolve the href against the page URL; the 'No Link' sentinel is
        # kept as-is rather than being glued onto the base URL.
        link_tag = course.find('a', href=True)
        course_link = urljoin(url, link_tag['href']) if link_tag else 'No Link'

        snippet_tag = course.find('span', class_='snippet')
        course_snippet = snippet_tag.get_text(strip=True) if snippet_tag else 'No Details'

        # Add the extracted data to the courses list
        courses.append({
            'Course Name': course_name,
            'Course Link': course_link,
            'Course Details': course_snippet
        })

    print(f"Scraped page {page} successfully.")

# Convert the list of courses to a pandas DataFrame
df = pd.DataFrame(courses)

# Save the data to a CSV file
df.to_csv(output_path, index=False)

print(f"Data saved to '{output_path}'")


Scraped page 1 successfully.
Scraped page 2 successfully.
Scraped page 3 successfully.
Scraped page 4 successfully.
Scraped page 5 successfully.
Scraped page 6 successfully.
Scraped page 7 successfully.
Scraped page 8 successfully.
Scraped page 9 successfully.
Scraped page 10 successfully.
Scraped page 11 successfully.
Scraped page 12 successfully.
Scraped page 13 successfully.
Scraped page 14 successfully.
Scraped page 15 successfully.
Scraped page 16 successfully.
Scraped page 17 successfully.
Scraped page 18 successfully.
Scraped page 19 successfully.
Scraped page 20 successfully.
Scraped page 21 successfully.
Scraped page 22 successfully.
Scraped page 23 successfully.
Scraped page 24 successfully.
Scraped page 25 successfully.
Scraped page 26 successfully.
Scraped page 27 successfully.
Scraped page 28 successfully.
Scraped page 29 successfully.
Scraped page 30 successfully.
Data saved to 'university_of_york_courses_paginated.csv'


In [None]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL for the first page (without pagination)
base_url = 'https://www.theuniguide.co.uk/university-of-york-y50/courses'

# File the results are written to; also used in the confirmation message so
# the message always names the file that was really written.
output_path = 'university_of_york_courses10.csv'

# Output column -> (CSS selector for the value, text used when it is absent).
# Each selector matches a 'stat-number' div nested inside the named stat div.
stat_fields = {
    'UCAS Points': ('div.entry_points div.stat-number', 'No UCAS Points'),
    'Student Satisfaction Score': ('div.cs_student_score div.stat-number', 'No Satisfaction Score'),
    'Average Graduate Salary': ('div.cs_outcome_salary_6months div.stat-number', 'No Salary Info'),
}

# Initialize an empty list to store all courses
courses = []

# Upper bound on the number of pages to request; the loop stops earlier as
# soon as a page fails to load or contains no course elements.
total_pages = 20

# Loop through each page
for page in range(1, total_pages + 1):
    # Page 1 is the bare URL; later pages use the ?page=N query parameter
    url = base_url if page == 1 else f'{base_url}?page={page}'

    # Send a GET request to the page (explicit timeout: requests has none by default)
    response = requests.get(url, timeout=30)

    # Stop if a page fails to load
    if response.status_code != 200:
        print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
        break

    # Parse the content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all course elements using the provided HTML structure
    course_list = soup.find_all('dl', class_='result-card__heading--indented')

    # An empty page means there is nothing further to collect
    if not course_list:
        print(f"No courses found on page {page}; stopping.")
        break

    # Iterate over each course and extract details
    for course in course_list:
        # Course Name
        name_tag = course.find('h6', class_='course-name')

        # Course Link: resolved against the page URL; the 'No Link' sentinel
        # is kept as-is rather than being glued onto the base URL.
        link_tag = course.find('a', href=True)

        # Additional course snippet text
        snippet_tag = course.find('span', class_='snippet')

        record = {
            'Course Name': name_tag.get_text(strip=True) if name_tag else 'No Title',
            'Course Link': urljoin(url, link_tag['href']) if link_tag else 'No Link',
            'Course Details': snippet_tag.get_text(strip=True) if snippet_tag else 'No Details',
        }

        # select_one returns None when either the outer or the inner div is
        # missing, so a partially rendered stat falls back to its default
        # instead of raising AttributeError.
        for column, (selector, fallback) in stat_fields.items():
            stat_tag = course.select_one(selector)
            record[column] = stat_tag.get_text(strip=True) if stat_tag else fallback

        # Add the extracted data to the courses list
        courses.append(record)

    print(f"Scraped page {page} successfully.")

# Convert the list of courses to a pandas DataFrame
df = pd.DataFrame(courses)

# Drop rows that repeat the same ('Course Name', 'Course Link') pair, keeping the first
df = df.drop_duplicates(subset=['Course Name', 'Course Link'], keep='first')

# Save the data to a CSV file
df.to_csv(output_path, index=False)

print(f"Data saved to '{output_path}'")


Scraped page 1 successfully.
Scraped page 2 successfully.
Scraped page 3 successfully.
Scraped page 4 successfully.
Scraped page 5 successfully.
Scraped page 6 successfully.
Scraped page 7 successfully.
Scraped page 8 successfully.
Scraped page 9 successfully.
Scraped page 10 successfully.
Scraped page 11 successfully.
Scraped page 12 successfully.
Scraped page 13 successfully.
Scraped page 14 successfully.
Scraped page 15 successfully.
Scraped page 16 successfully.
Scraped page 17 successfully.
Scraped page 18 successfully.
Scraped page 19 successfully.
Scraped page 20 successfully.
Data saved to 'university_of_york_courses_paginated_with_details_no_duplicates.csv'


In [None]:
# Install Selenium into the kernel's own environment, pinned to the version
# this notebook was run against so the Selenium 4 API used below stays stable.
%pip install selenium==4.25.0


Collecting selenium
  Downloading selenium-4.25.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.25.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.27.0-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.7/481.7 kB[0m [31m27.

In [None]:
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time

# Courses page to load in the browser
url = 'https://www.theuniguide.co.uk/university-of-york-y50/courses'

# Safety cap on "Load More" clicks so the loop cannot run forever if the
# button never disappears.
max_load_more_clicks = 200

# Output column -> (CSS selector for the value, text used when it is absent).
# Each selector matches a 'stat-number' div nested inside the named stat div.
stat_fields = {
    'UCAS Points': ('div.entry_points div.stat-number', 'No UCAS Points'),
    'Student Satisfaction Score': ('div.cs_student_score div.stat-number', 'No Satisfaction Score'),
    'Average Graduate Salary': ('div.cs_outcome_salary_6months div.stat-number', 'No Salary Info'),
}

# Set up Selenium WebDriver; no driver path is given, so Selenium locates
# chromedriver itself.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run in headless mode (no browser window)
options.add_argument('--no-sandbox')
driver = webdriver.Chrome(options=options)

try:
    # Open the webpage (without this the browser stays on a blank page and
    # nothing is scraped)
    driver.get(url)

    # Give the page some time to load
    time.sleep(5)

    # Click the "Load More" button repeatedly until all content is loaded
    for _ in range(max_load_more_clicks):
        try:
            # Selenium 4 removed find_element_by_class_name; use find_element(By...)
            load_more_button = driver.find_element(By.CLASS_NAME, 'js-search-load-more-control')
            if not load_more_button.is_displayed():
                print("'Load More' button is no longer visible; stopping.")
                break

            # Click through JavaScript so an overlapping element cannot intercept the click
            driver.execute_script("arguments[0].click();", load_more_button)
            print("Clicked 'Load More' button")

            # Wait for new content to load
            time.sleep(3)
        except NoSuchElementException:
            print("No more 'Load More' button found.")
            break
        except WebDriverException as e:
            # Best effort: keep whatever has loaded so far
            print("Stopped clicking 'Load More' after a WebDriver error:", e)
            break

    # Once all content is loaded, pass the page source to BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if navigation or parsing failed
    driver.quit()

# Extract course data
courses = []

# Find all course elements using the provided HTML structure
course_list = soup.find_all('dl', class_='result-card__heading--indented')

# Iterate over each course and extract details
for course in course_list:
    name_tag = course.find('h6', class_='course-name')
    link_tag = course.find('a', href=True)
    snippet_tag = course.find('span', class_='snippet')

    record = {
        'Course Name': name_tag.get_text(strip=True) if name_tag else 'No Title',
        # Resolve the href against the page URL; the 'No Link' sentinel is
        # kept as-is rather than being glued onto the base URL.
        'Course Link': urljoin(url, link_tag['href']) if link_tag else 'No Link',
        'Course Details': snippet_tag.get_text(strip=True) if snippet_tag else 'No Details',
    }

    # select_one returns None when either the outer or the inner div is
    # missing, so a partially rendered stat falls back to its default.
    for column, (selector, fallback) in stat_fields.items():
        stat_tag = course.select_one(selector)
        record[column] = stat_tag.get_text(strip=True) if stat_tag else fallback

    # Add the extracted data to the courses list
    courses.append(record)

# Convert the list of courses to a pandas DataFrame
df = pd.DataFrame(courses)

# Drop rows that repeat the same ('Course Name', 'Course Link') pair, keeping the first
df = df.drop_duplicates(subset=['Course Name', 'Course Link'], keep='first')

# Save the data to a CSV file
df.to_csv('university_of_york_courses_full_data_no_duplicates.csv', index=False)

print(f"Data saved to 'university_of_york_courses_full_data_no_duplicates.csv'")


No more 'Load More' button found or an error occurred: 'WebDriver' object has no attribute 'find_element_by_class_name'
Data saved to 'university_of_york_courses_full_data_no_duplicates.csv'


In [None]:
# Refresh the package index, then install Chromium's chromedriver.
# apt-get is the script-friendly front end, and -y answers the confirmation
# prompt so the install can run unattended.
!apt-get update
!apt-get install -y chromium-chromedriver

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Connecting to archive.ubuntu.com] [1 InRelease 14.2 kB/129 kB 11%] [Connected to cloud.r-project                                                                                                    Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [1 InRelease 73.5 kB/129 kB 57%] [Waiting for h                                                                                                    Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Waiting for headers] [1 InRelease 73.5 kB/129 kB 57%] [Waiting for headers] [Connecting to ppa.l                                                                                                    Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [1 InRelease 118 kB/129 kB 92%] [Waiting f

In [None]:
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time

# Explicit path to the chromedriver binary, or None to let Selenium locate
# (or download) a matching driver on its own. The former placeholder path
# could never resolve and raised NoSuchDriverException.
chromedriver_path = None

# Safety cap on "Load More" clicks so the loop cannot run forever if the
# button never disappears.
max_load_more_clicks = 200

# Output column -> (CSS selector for the value, text used when it is absent).
# Each selector matches a 'stat-number' div nested inside the named stat div.
stat_fields = {
    'UCAS Points': ('div.entry_points div.stat-number', 'No UCAS Points'),
    'Student Satisfaction Score': ('div.cs_student_score div.stat-number', 'No Satisfaction Score'),
    'Average Graduate Salary': ('div.cs_outcome_salary_6months div.stat-number', 'No Salary Info'),
}

# Set up Selenium WebDriver with Service.
# NOTE(review): no --headless option is passed here; on a host without a
# display Chrome will presumably fail to start — confirm, or use the headless
# variant of this cell.
service = Service(chromedriver_path) if chromedriver_path else Service()
driver = webdriver.Chrome(service=service)

# Open the webpage
url = 'https://www.theuniguide.co.uk/university-of-york-y50/courses'

try:
    driver.get(url)

    # Give the page some time to load
    time.sleep(5)

    # Click the "Load More" button repeatedly until all content is loaded
    for _ in range(max_load_more_clicks):
        try:
            # Selenium 4 removed find_element_by_class_name; use find_element(By...)
            load_more_button = driver.find_element(By.CLASS_NAME, 'js-search-load-more-control')
            if not load_more_button.is_displayed():
                print("'Load More' button is no longer visible; stopping.")
                break

            # Click through JavaScript so an overlapping element cannot intercept the click
            driver.execute_script("arguments[0].click();", load_more_button)
            print("Clicked 'Load More' button")

            # Wait for new content to load
            time.sleep(3)
        except NoSuchElementException:
            print("No more 'Load More' button found.")
            break
        except WebDriverException as e:
            # Best effort: keep whatever has loaded so far
            print("Stopped clicking 'Load More' after a WebDriver error:", e)
            break

    # Once all content is loaded, pass the page source to BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if navigation or parsing failed
    driver.quit()

# Extract course data
courses = []

# Find all course elements using the provided HTML structure
course_list = soup.find_all('dl', class_='result-card__heading--indented')

# Iterate over each course and extract details
for course in course_list:
    name_tag = course.find('h6', class_='course-name')
    link_tag = course.find('a', href=True)
    snippet_tag = course.find('span', class_='snippet')

    record = {
        'Course Name': name_tag.get_text(strip=True) if name_tag else 'No Title',
        # Resolve the href against the page URL; the 'No Link' sentinel is
        # kept as-is rather than being glued onto the base URL.
        'Course Link': urljoin(url, link_tag['href']) if link_tag else 'No Link',
        'Course Details': snippet_tag.get_text(strip=True) if snippet_tag else 'No Details',
    }

    # select_one returns None when either the outer or the inner div is
    # missing, so a partially rendered stat falls back to its default.
    for column, (selector, fallback) in stat_fields.items():
        stat_tag = course.select_one(selector)
        record[column] = stat_tag.get_text(strip=True) if stat_tag else fallback

    # Add the extracted data to the courses list
    courses.append(record)

# Convert the list of courses to a pandas DataFrame
df = pd.DataFrame(courses)

# Drop rows that repeat the same ('Course Name', 'Course Link') pair, keeping the first
df = df.drop_duplicates(subset=['Course Name', 'Course Link'], keep='first')

# Save the data to a CSV file
df.to_csv('university_of_york_courses_full_data_no_duplicates.csv', index=False)

print(f"Data saved to 'university_of_york_courses_full_data_no_duplicates.csv'")


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [None]:
import os

# Directories to make searchable through PATH.
# NOTE(review): presumably where the apt chromium-chromedriver package puts
# its binary — confirm for the target image.
extra_dirs = ['/usr/lib/chromium-browser/', '/usr/bin/']

# Tolerate an unset or empty PATH instead of raising KeyError
current_path = os.environ.get('PATH', '')
path_entries = current_path.split(os.pathsep) if current_path else []

# Append only what is missing so re-running the cell does not keep growing PATH
path_entries += [extra_dir for extra_dir in extra_dirs if extra_dir not in path_entries]
os.environ['PATH'] = os.pathsep.join(path_entries)


In [None]:
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import pandas as pd
import time

# Safety cap on "Load More" clicks so the loop cannot run forever if the
# button never disappears.
max_load_more_clicks = 200

# Output column -> (CSS selector for the value, text used when it is absent).
# Each selector matches a 'stat-number' div nested inside the named stat div.
stat_fields = {
    'UCAS Points': ('div.entry_points div.stat-number', 'No UCAS Points'),
    'Student Satisfaction Score': ('div.cs_student_score div.stat-number', 'No Satisfaction Score'),
    'Average Graduate Salary': ('div.cs_outcome_salary_6months div.stat-number', 'No Salary Info'),
}

# Set up options for headless Chrome
chrome_options = Options()
chrome_options.add_argument('--headless')  # Run in headless mode
chrome_options.add_argument('--no-sandbox')  # Bypass OS security model
chrome_options.add_argument('--disable-dev-shm-usage')  # Overcome limited resource problems

# Initialize Chrome WebDriver with headless options (use default ChromeDriver location)
driver = webdriver.Chrome(options=chrome_options)

# Open the webpage
url = 'https://www.theuniguide.co.uk/university-of-york-y50/courses'

try:
    driver.get(url)

    # Wait for the page to load
    time.sleep(5)

    # Click the "Load More" button repeatedly until all content is loaded
    for _ in range(max_load_more_clicks):
        try:
            # Selenium 4 removed find_element_by_class_name; use find_element(By...)
            load_more_button = driver.find_element(By.CLASS_NAME, 'js-search-load-more-control')
            if not load_more_button.is_displayed():
                print("'Load More' button is no longer visible; stopping.")
                break

            # Click through JavaScript so an overlapping element cannot intercept the click
            driver.execute_script("arguments[0].click();", load_more_button)
            print("Clicked 'Load More' button")

            # Wait for new content to load
            time.sleep(3)
        except NoSuchElementException:
            print("No more 'Load More' button found.")
            break
        except WebDriverException as e:
            # Best effort: keep whatever has loaded so far
            print("Stopped clicking 'Load More' after a WebDriver error:", e)
            break

    # Once all content is loaded, pass the page source to BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if navigation or parsing failed
    driver.quit()

# Extract course data
courses = []

# Find all course elements using the provided HTML structure
course_list = soup.find_all('dl', class_='result-card__heading--indented')

# Iterate over each course and extract details
for course in course_list:
    name_tag = course.find('h6', class_='course-name')
    link_tag = course.find('a', href=True)
    snippet_tag = course.find('span', class_='snippet')

    record = {
        'Course Name': name_tag.get_text(strip=True) if name_tag else 'No Title',
        # Resolve the href against the page URL; the 'No Link' sentinel is
        # kept as-is rather than being glued onto the base URL.
        'Course Link': urljoin(url, link_tag['href']) if link_tag else 'No Link',
        'Course Details': snippet_tag.get_text(strip=True) if snippet_tag else 'No Details',
    }

    # select_one returns None when either the outer or the inner div is
    # missing, so a partially rendered stat falls back to its default.
    for column, (selector, fallback) in stat_fields.items():
        stat_tag = course.select_one(selector)
        record[column] = stat_tag.get_text(strip=True) if stat_tag else fallback

    # Add the extracted data to the courses list
    courses.append(record)

# Convert the list of courses to a pandas DataFrame
df = pd.DataFrame(courses)

# Drop rows that repeat the same ('Course Name', 'Course Link') pair, keeping the first
df = df.drop_duplicates(subset=['Course Name', 'Course Link'], keep='first')

# Save the data to a CSV file
df.to_csv('university_of_york_courses_full_data_no_duplicates.csv', index=False)

print(f"Data saved to 'university_of_york_courses_full_data_no_duplicates.csv'")


No more 'Load More' button found or an error occurred: 'WebDriver' object has no attribute 'find_element_by_class_name'
Data saved to 'university_of_york_courses_full_data_no_duplicates.csv'
