# Scrape Course List and Detail in Udemy

In this project, I built open-source Python packages that scrape courses from the Udemy website with BeautifulSoup. Udemy is an online education platform that offers courses across diverse industries. These packages help people who want the course list for a particular industry or the details of a particular course.

* ScrapeMultiplePageinCategory(category, num_of_page): returns the details of all courses on the first num_of_page pages of that category
* ScrapeDetailofClass(course_name): returns the details of the course with the given name

In [2]:
import requests
import pandas
from lxml import html
import time
from bs4 import BeautifulSoup
from selenium import webdriver

We want to know the parent-child relationship between the categories and subcategories, returned as a mapping. First, we map each category to its ID and sub_category. Note that the ID is not shown anywhere on the Udemy website itself, but it is used in the HTML to link the category, sub_category_level1 and sub_category_level2.

In [126]:
# Download the Udemy home page once and parse it into an lxml tree.
# `h` is the parsed tree; the functions below take a tree of this kind as
# their `h` argument and query it with XPath.
url = 'https://www.udemy.com/'
h = html.fromstring(requests.get(url).text)


def CategorytoSubcatgory1(url, h):
    """Map each level-1 data-id to its ``{category: sub_category_level1}`` pair.

    The category and sub-category names are not read from the link text; they
    are recovered from the link's URL path, with hyphens turned into spaces.

    Args:
        url: Unused; kept so existing callers keep working.
        h: Parsed page exposing ``xpath`` (for example an lxml HTML tree).

    Returns:
        dict: ``{data_id: {category: sub_category_level1}}``. Links whose path
        contains no ``/`` between category and sub-category are skipped. If
        the href and data-id lists differ in length, the extra entries of the
        longer list are ignored.
    """
    anchor = '//a[contains(@class, "cat js-subc")]'
    course_path = h.xpath(anchor + '/@href')
    data_id = h.xpath(anchor + '/@data-id')

    # Debug aid kept from the original; guarded so that a page without any
    # matching link yields {} instead of raising IndexError.
    if course_path:
        print(course_path[0])

    id_level1_ctgy = {}
    for href, data_id_level1 in zip(course_path, data_id):
        # Drop the first 9 characters and the last one -- presumably the
        # "/courses/" prefix and the closing "/"; confirm against the markup.
        path = href[9:-1]

        # Split on the LAST slash: everything before it is the category,
        # everything after it is the level-1 sub-category.
        category, slash, sub_category_level1 = path.rpartition('/')
        if not slash:
            continue

        id_level1_ctgy[data_id_level1] = {
            category.replace('-', ' '): sub_category_level1.replace('-', ' ')
        }
    return id_level1_ctgy

After finding the relationship between category and subcategory_level1, we want to map the subcategory_level1 to subcategory_level2.

In [127]:
# Scrape all the subcategory_level_2 titles
def Subcategory2(url, h):
    """Collect the level-2 sub-category slug of every topic link.

    Args:
        url: Unused.
        h: Parsed page exposing ``xpath`` (for example an lxml HTML tree).

    Returns:
        dict: ``{data_id_level2: slug}`` where the slug is the link's href
        with its first 7 characters and its last character removed. Hyphens
        in the slug are left untouched.
    """
    topic_anchor = '//a[contains(@href, "topic")]'
    topic_ids = h.xpath(topic_anchor + '/@data-id')
    topic_hrefs = h.xpath(topic_anchor + '/@href')

    # Hrefs are looked up by position, so an id without a matching href
    # raises IndexError rather than being dropped silently.
    return {
        topic_id: topic_hrefs[position][7:-1]
        for position, topic_id in enumerate(topic_ids)
    }

In [128]:
# Map the relationship between subcategory_level1 and subcategory_level2
def Subcategory1toSubcategory2(url, h):
    """Group the level-2 topics under the data-id of their popular-topics div.

    Args:
        url: Unused.
        h: Parsed page exposing ``xpath`` (for example an lxml HTML tree).

    Returns:
        dict: ``{div_data_id: {topic_data_id: topic_link_text}}`` with one
        entry per ``js-side-nav-popular-topics`` div.
    """
    container = '//div[contains(@class, "js-side-nav-popular-topics")]'
    topic_divs = h.xpath(container)
    parent_ids = h.xpath(container + '/@data-id')

    mapping = {}
    for div_pos, topic_div in enumerate(topic_divs):
        anchors = topic_div.xpath('a')
        child_ids = topic_div.xpath('a/@data-id')
        # Ids are paired with anchors by position inside the same div.
        mapping[parent_ids[div_pos]] = {
            child_ids[anchor_pos]: anchor.text
            for anchor_pos, anchor in enumerate(anchors)
        }
    return mapping

After finding the parent-child relationship of categories and subcategories, we want to find the class details in a category. The details of each class include its name, URL, instructor, rating and price, collected from the top k pages. We can choose the number of pages we like; if num_of_page is not entered, the course details of the first five pages are returned.

We use the Selenium Chrome driver to scrape the course details because BeautifulSoup on its own only sees the courses rendered on the current screen. We need to scroll the page down so that all the courses on the page are loaded.

In [122]:
# Location of the chromedriver executable handed to webdriver.Chrome
# (machine-specific; adjust for your own setup).
SELENIUM_CHROME_DRIVER_PATH = '/Users/jiongjiangduan/Documents/chromedriver'
# Chrome binary that ChromeOptions.binary_location is pointed at
# (machine-specific; adjust for your own setup).
SELENIUM_CUSTOM_CHROME_PATH = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
# Seconds scroll() sleeps after each scrollTo call.
SCROLL_PAUSE_TIME = 1.75
# Amount added to the scroll target on every scroll() iteration
# (a window.scrollTo y-coordinate, i.e. pixels).
SCROLL_PAGE_SIZE = 600

def ScrapeMultiplePageinCategory(category, num_of_page=0):
    """Scrape the first pages of a Udemy topic listing, one page at a time.

    Args:
        category: Topic name with words separated by spaces, in any letter
            case (for example ``"Web Development"``). It is lower-cased and
            its spaces are turned into hyphens to form the URL slug.
        num_of_page: How many listing pages to scrape. ``0`` (the default)
            means the first five pages.

    Returns:
        None. Each page URL is handed to ``ScrapePageinCategory``.
    """
    first_k_page = 5 if num_of_page == 0 else num_of_page

    # str methods return new strings, so the result has to be kept.
    slug = category.lower().replace(' ', '-')

    # Listing pages are numbered from 1.
    for page in range(1, first_k_page + 1):
        url = "https://www.udemy.com/topic/" + slug + "?p=" + str(page)
        ScrapePageinCategory(url)

def ScrapePageinCategory(url):
    """Print the titles, instructors, ratings and links of one listing page.

    The page is loaded in headless Chrome and scrolled with ``scroll`` so
    that the course cards are present in the HTML. The resulting page source
    is then parsed with BeautifulSoup.

    Args:
        url: Address of the listing page to load.

    Returns:
        None. Everything is written to stdout: course titles, instructors,
        ratings, then the number of course links and the links themselves.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.binary_location = SELENIUM_CUSTOM_CHROME_PATH

    browser = webdriver.Chrome(SELENIUM_CHROME_DRIVER_PATH, options=options)
    try:
        browser.get(url)
        scroll(browser)
        # Named page_source so the lxml `html` module imported at file level
        # is not shadowed.
        page_source = browser.page_source
    finally:
        # Always end the driver session, even when loading or scrolling fails.
        browser.quit()

    soup = BeautifulSoup(page_source, 'lxml')
    main_content = soup.find("div", "main-content")
    if main_content is None:
        # No listing container in the page: nothing to report.
        print("Total Nums: 0")
        return

    # scrape the course link in the Udemy website
    urls = []
    containers = main_content.find_all("div", {"class": "popover--popover--t3rNO popover--popover-hover--14ngr"})
    for div_tag in containers:
        for website_link in div_tag.find_all("a"):
            # .get() returns None for anchors without href; indexing would raise.
            href = website_link.get('href')
            if href is not None:
                urls.append("https://www.udemy.com" + href)

    # scrape the course title in the Udemy website
    titles_content = main_content.find_all("div", {"class": "udlite-heading-sm udlite-focus-visible-target course-card--course-title--2f7tE"})
    for title in titles_content:
        print(title.text)

    # scrape the instructor in the Udemy website
    instructor_content = main_content.find_all("div", {"class": "udlite-text-xs course-card--instructor-list--lIA4f",
                                                       "data-purpose": "safely-set-inner-html:course-card:visible-instructors"})
    for instructor in instructor_content:
        print(instructor.text)

    # scrape the rating of each course
    rating_content = main_content.find_all("span", {"class": "udlite-heading-sm star-rating--rating-number--3lVe8"})
    for rating in rating_content:
        print(rating.text)

    print("Total Nums: " + str(len(urls)))
    print(*urls, sep="\n")


def scroll(browser):
    """Push the scroll target down in steps until it has outrun the document.

    The first target is the document height reported by the browser plus
    ``SCROLL_PAGE_SIZE``; every later target is the previous target plus
    ``SCROLL_PAGE_SIZE``. After each scroll the function sleeps for
    ``SCROLL_PAUSE_TIME`` seconds and re-reads the document height. It stops
    as soon as that height is smaller than the previous target.

    NOTE(review): the starting point is the full document height, so the very
    first scroll already lands at the bottom -- confirm that starting there,
    rather than at the top of the page, is intended.

    Args:
        browser: Driver object exposing ``execute_script``.
    """
    read_height = "return document.body.scrollHeight"

    previous_target = browser.execute_script(read_height)
    print(f"Page Height: {previous_target}")

    while True:
        target = previous_target + SCROLL_PAGE_SIZE
        browser.execute_script(f"window.scrollTo(0, {target});")

        # Give the page time to load whatever the scroll triggered.
        time.sleep(SCROLL_PAUSE_TIME)

        document_height = browser.execute_script(read_height)
        if document_height < previous_target:
            return

        previous_target = target
        print(f"Page Height: {previous_target}")

Now we can take a look at a particular class. We can get the details of a course if we know the class name instead of the class category.

In [7]:
def ScrapeDetailofClass(course_name):
    """Scrape the landing page of a single Udemy course.

    Args:
        course_name: Course name with words separated by spaces, in any
            letter case. It is stripped, lower-cased and its spaces are
            turned into hyphens to form the URL slug.

    Returns:
        list of str: the course URL, followed by whichever of these were
        found on the page, in this order: page title, instructor name(s),
        price text, rating text, enrollment text.

    Raises:
        Exception: ``'Please Enter Another Course Name'`` when the name is
            blank, when the course URL answers with HTTP 404, or when
            nothing besides the URL could be collected.
    """
    # str methods return new strings, so the result has to be kept.
    slug = course_name.strip().lower().replace(' ', '-')
    if not slug:
        raise Exception('Please Enter Another Course Name')

    url = 'https://www.udemy.com/course/' + slug + '/'
    result = requests.get(url)
    if result.status_code == 404:
        # An unknown course still answers with an HTML page that has a
        # <title>; without this check it would be returned as a course.
        raise Exception('Please Enter Another Course Name')
    soup = BeautifulSoup(result.content, 'lxml')

    # url
    ans = [url]

    # course title
    title = soup.find('title')
    if title is not None:
        ans.append(title.text)

    # The instructor
    instructor_info = soup.find_all("a", {"class": "udlite-btn udlite-btn-large udlite-btn-link udlite-heading-md udlite-text-sm udlite-instructor-links",
                                          "data-position": "1",
                                          "href": "#instructor-1"})
    ans.extend(instructor_name.text for instructor_name in instructor_info)

    # course price
    price_info = soup.find_all("div", {"class": "price-text--container--Ws-fP udlite-clp-price-text"})
    ans.extend(price.text for price in price_info)

    # The course rating
    rating_info = soup.find_all("div", {"class": "ud-component--course-landing-page-udlite--rating"})
    ans.extend(score.text for score in rating_info)

    # The number of student enrolled in the course
    # NOTE(review): [1:-2] trims one leading and two trailing characters; the
    # recorded output ends in "student", so a letter may be cut off -- verify
    # against the raw text before changing it.
    num_of_enrollment = soup.find_all("div", {"data-purpose": "enrollment"})
    ans.extend(enrollment.text[1:-2] for enrollment in num_of_enrollment)

    if len(ans) == 1:
        raise Exception('Please Enter Another Course Name')

    return ans


# Example run: look up one course by its name; the list printed below is the
# recorded result of this call.
ScrapeDetailofClass('python for data science and machine learning bootcamp')  

['https://www.udemy.com/course/python-for-data-science-and-machine-learning-bootcamp/',
 'Learn Python for Data Science, Structures, Algorithms, Interviews | Udemy',
 'Jose Portilla',
 'Current price$14.99Original Price$109.99Discount86% off',
 'Rating: 4.6 out of 54.6 (85,526 ratings)',
 '381,085 student']