In [1]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [3]:
def scrape_coursera():
    coursera_data = []

    for page in range(1, 85):  # Adjust the range based on your needs
        # Use Selenium to load dynamic content
        search_url = f'https://www.coursera.org/search?query=python&page={page}'
        driver = webdriver.Chrome()
        driver.get(search_url)

        # Wait for the page to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'cds-ProductCard-content')))

        # Get the updated HTML content after dynamic loading
        page_source = driver.page_source
        driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser',from_encoding='utf-8')

        # Extract information from all course cards
        course_cards = soup.find_all('div', class_='cds-ProductCard-content')

        for course_card in course_cards:
            # Extracting course name
            course_name_tag = course_card.find('h3', class_='cds-CommonCard-title')
            course_name = course_name_tag.text.strip() if course_name_tag else 'N/A'

            # Extracting course URL
            course_url_tag = course_card.find('a', class_='cds-CommonCard-titleLink')
            course_url = 'https://www.coursera.org' + course_url_tag['href'] if course_url_tag and 'href' in course_url_tag.attrs else 'N/A'

            # Extracting course rating
            rating_container = course_card.find('div', class_='product-reviews')
            course_rating_tag = rating_container.find('p', class_='cds-119 css-11uuo4b cds-121') if rating_container else None
            course_rating = course_rating_tag.text.strip() if course_rating_tag else 'N/A'

            # Extracting course level and duration
            metadata_container = course_card.find('div', class_='cds-CommonCard-metadata')
            metadata_text = metadata_container.text.strip() if metadata_container else 'N/A'

             # Split metadata into level and duration
            metadata_parts = [info.strip() for info in metadata_text.split('·')]

            # Assign default values if needed
            level = metadata_parts[0] if len(metadata_parts) >= 1 else 'N/A'
            typeof = metadata_parts[1] if len(metadata_parts) >=2 else 'N/A'
            duration = metadata_parts[2] if len(metadata_parts) >= 3 else 'N/A'

            coursera_data.append({
                'Course Name': course_name,
                'Course URL': course_url,
                'Rating': course_rating,
                'Level': level,
                'Type': typeof,
                'Duration': duration
            })

    return coursera_data

In [4]:
coursera_data = scrape_coursera()
print(coursera_data)



[{'Course Name': 'Python for Data Science, AI & Development', 'Course URL': 'https://www.coursera.org/learn/python-for-applied-data-science-ai', 'Rating': '4.6', 'Level': 'Beginner', 'Type': 'Course', 'Duration': '1 - 3 Months'}, {'Course Name': 'Crash Course on Python', 'Course URL': 'https://www.coursera.org/learn/python-crash-course', 'Rating': '4.8', 'Level': 'Beginner', 'Type': 'Course', 'Duration': '1 - 3 Months'}, {'Course Name': 'Python for Everybody', 'Course URL': 'https://www.coursera.org/specializations/python', 'Rating': '4.8', 'Level': 'Beginner', 'Type': 'Specialization', 'Duration': '3 - 6 Months'}, {'Course Name': 'Python 3 Programming', 'Course URL': 'https://www.coursera.org/specializations/python-3-programming', 'Rating': '4.7', 'Level': 'Beginner', 'Type': 'Specialization', 'Duration': '3 - 6 Months'}, {'Course Name': 'Google IT Automation with Python', 'Course URL': 'https://www.coursera.org/professional-certificates/google-it-automation', 'Rating': '4.7', 'Level'

In [5]:
from random import choice

for course in coursera_data:
    active=reflective=sensing=intuitive=verbal=visual=sequential=globall=0
    if course['Level'] == 'Beginner':
        sequential+=1
        sensing+=1
    elif course['Level'] == 'Intermediate':
        reflective+=1
        intuitive+=1
    elif course['Level'] == 'Advanced':
        active+=1
        globall+=1
    if course['Type'] == 'Course':
        active+=1
        verbal+=1
    elif course['Type'] == 'Specialization':
        reflective+=1
        visual+=1
    elif course['Type'] == 'Professional Certificate':
            sensing+=1
    if 'weeks' in course['Duration'].lower() or 'hours' in course['Duration'].lower():
        active+=1
    elif 'months' in course['Duration'].lower():
        reflective+=1
     # Determine the top learning style(s)
    max_count = max(active, reflective, sensing, intuitive, verbal, visual, sequential, globall)
    top_learning_styles = [style for style, count in zip(
        ['Active', 'Reflective', 'Sensing', 'Intuitive', 'Verbal', 'Visual', 'Sequential', 'Global'],
        [active, reflective, sensing, intuitive, verbal, visual, sequential, globall]
    ) if count == max_count]
    
    # Handle ties (choose one randomly in this example)
    if len(top_learning_styles) == 1:
        top_learning_style = top_learning_styles[0]
    else:
        top_learning_style = choice(top_learning_styles)
    course['Learning Style'] = top_learning_style

In [6]:
import pandas as pd

def create_and_add_data(data, file_name, sheet_name='Sheet1'):
    # Convert data to DataFrame
    df = pd.DataFrame(data)

    # Create a new Excel file
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

In [7]:
create_and_add_data(coursera_data, 'new_coursera_courses.xlsx')

In [None]:
----edx------

In [174]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

In [175]:
def scrape_edx(pages_to_scrape=5):
    # Set up Chrome WebDriver
    driver = webdriver.Chrome()

    # Base URL for the search query
    base_url = 'https://www.edx.org/search?q=python+programming&tab=course&page={}'

    # Create a list to store the scraped data
    edx_data = []

    for page in range(1, pages_to_scrape + 1):
        # Construct the URL for the current page
        url = base_url.format(page)

        # Open the URL in the browser
        driver.get(url)

        # Wait for the page to load (you might need to adjust the sleep duration based on your internet speed)
        time.sleep(5)

        # Find all course containers
        course_containers = driver.find_elements(By.CLASS_NAME, 'base-card-wrapper')

        # Loop through each course container
        for container in course_containers:
            # Extract course title
            course_title = container.find_element(By.CLASS_NAME, 'pgn__card-header-title-md').text.strip()

            # Extract course URL
            course_url = container.find_element(By.CLASS_NAME, 'base-card-link').get_attribute('href')

            # Extract course subtitle
            course_subtitle = container.find_element(By.CLASS_NAME, 'pgn__card-header-subtitle-md').text.strip()

            # Extract course type
            course_type = container.find_element(By.CLASS_NAME, 'badge').text.strip()

            # Append data to the list
            edx_data.append({
                'Title': course_title,
                'URL': course_url,
                'Subtitle': course_subtitle,
                'Type': course_type,
            })

    # Close the browser
    driver.quit()

    return edx_data

In [176]:
edx_data = scrape_edx()
print(edx_data)

[{'Title': "CS50's\nIntroduction to\nProgramming wit…", 'URL': 'https://www.edx.org/learn/python/harvard-university-cs50-s-introduction-to-programming-with-python?index=product&queryID=18ebe4fb0c4e0db77b183f64b1bd2bd0&position=1&results_level=second-level-results&term=python+programming&objectID=course-2cc794d0-316d-42f7-bbfd-25c34e4cd5df&campaign=CS50%27s+Introduction+to+Programming+with+Python&source=edX&product_category=course&placement_url=https%3A%2F%2Fwww.edx.org%2Fsearch', 'Subtitle': 'Harvard University', 'Type': 'Course'}, {'Title': 'Python\nProgramming:\nBasic Skills', 'URL': 'https://www.edx.org/learn/python/codio-python-programming-basic-skills?index=product&queryID=18ebe4fb0c4e0db77b183f64b1bd2bd0&position=2&results_level=second-level-results&term=python+programming&objectID=course-dcea46cf-96bc-446d-8e23-cb36db4276b4&campaign=Python+Programming%3A+Basic+Skills&source=edX&product_category=course&placement_url=https%3A%2F%2Fwww.edx.org%2Fsearch', 'Subtitle': 'Codio', 'Type'

In [177]:
create_and_add_data(edx_data, 'new_edx_courses.xlsx')

In [None]:
----Udemy------

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

In [2]:
def scrape_udemy():
    udemy_url = 'https://www.udemy.com/courses/search/?q=python+programming&src=sac&kw=python'

    # Use Selenium to load dynamic content
    driver = webdriver.Chrome()
    driver.get(udemy_url)

    try:
        wait = WebDriverWait(driver, 20)  # Adjust the timeout as needed

        # Wait for the course cards to be present
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'course-card-title_title__tvSBS')))

        time.sleep(5)  # Additional delay

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        udemy_data = []

        course_cards = soup.find_all('div', class_='course-card-title_title__tvSBS')

        for card in course_cards:
            course_title_tag = card.find('h3', class_='ud-heading-md course-card-title_course-title___sH9w')
            print(course_title_tag)
            if course_title_tag:
                course_name = course_title_tag.find('a').text.strip()
                course_url = 'https://www.udemy.com' + course_title_tag.find('a')['href']

                # Extracting additional information
                rating_tag = card.find('span', {'data-testid': 'seo-rating'})
                rating = rating_tag.text.strip() if rating_tag else 'N/A'

                level_tag = card.find('span', {'data-testid': 'seo-instructional-level'})
                level = level_tag.text.strip() if level_tag else 'N/A'

                duration_tag = card.find('span', {'data-testid': 'seo-content-info'})
                duration = duration_tag.text.strip() if duration_tag else 'N/A'

                num_lectures_tag = card.find('span', {'data-testid': 'seo-num-lectures'})
                num_lectures = num_lectures_tag.text.strip() if num_lectures_tag else 'N/A'

                udemy_data.append({
                    'Course Name': course_name,
                    'Course URL': course_url,
                    'Rating': rating,
                    'Level': level,
                    'Duration': duration,
                    'Number of Lectures': num_lectures
                })

    finally:
        driver.quit()

    return udemy_data

udemy_data = scrape_udemy()
print(udemy_data)


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF62E4FAD32+56930]
	(No symbol) [0x00007FF62E46F632]
	(No symbol) [0x00007FF62E3242E5]
	(No symbol) [0x00007FF62E3698ED]
	(No symbol) [0x00007FF62E369A2C]
	(No symbol) [0x00007FF62E3AA967]
	(No symbol) [0x00007FF62E38BCDF]
	(No symbol) [0x00007FF62E3A81E2]
	(No symbol) [0x00007FF62E38BA43]
	(No symbol) [0x00007FF62E35D438]
	(No symbol) [0x00007FF62E35E4D1]
	GetHandleVerifier [0x00007FF62E876ABD+3709933]
	GetHandleVerifier [0x00007FF62E8CFFFD+4075821]
	GetHandleVerifier [0x00007FF62E8C818F+4043455]
	GetHandleVerifier [0x00007FF62E599766+706710]
	(No symbol) [0x00007FF62E47B90F]
	(No symbol) [0x00007FF62E476AF4]
	(No symbol) [0x00007FF62E476C4C]
	(No symbol) [0x00007FF62E466904]
	BaseThreadInitThunk [0x00007FF99CED257D+29]
	RtlUserThreadStart [0x00007FF99E5AAA58+40]


In [13]:
create_and_add_data(udemy_data, 'new_udemy_courses.xlsx')