# How to use this notebook
To search for a specific course title, just change the query term in the following code block.

Use **%20** in place of spaces between the words of the query (for example, `machine%20learning`).

In [2]:
#from selenium.webdriver import Edge
from msedge.selenium_tools import Edge, EdgeOptions
import time
import requests
from bs4 import BeautifulSoup

# Configure the Edge session that getPages() below drives; the two flags are
# passed straight through to the browser.
options = EdgeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
# Raw string: the Windows path contains backslash pairs (\P, \A, \S, \m) that a
# normal string literal treats as invalid escape sequences and warns about.
# The resulting path value is unchanged.
driver = Edge(executable_path=r'C:\ProgramData\Anaconda3\Scripts\msedgedriver.exe', options=options)

# Search term; spaces have to be written as %20 because the value is pasted
# into the URL as-is.
query = 'python' #QUERY
URL = 'https://www.coursera.org/search?query=' + query + '&'

def getPages(URL, num_pages=0): # by default gets only the first page
    """Open a search URL and collect the HTML source of its result pages.

    Uses the module-level ``driver`` and always quits it before returning
    (also when an error occurs), so a new driver has to be created before
    this function can be called a second time.

    Parameters
    ----------
    URL : str
        Address of the first results page.
    num_pages : int, optional
        How many times to click the "next page" button after the first page
        has been captured. ``0`` (the default) captures the first page only.

    Returns
    -------
    str or list of str
        With ``num_pages == 0`` the first page's HTML as a single string.
        Otherwise a list starting with the first page followed by one entry
        per successful click (at most ``num_pages + 1`` entries). Paging stops
        early, with a printed message, when the button cannot be found or
        clicked.
    """
    # CSS path of the pagination button to click; only the child index varies.
    next_button = '#main > div > div > div.rc-SearchTabs > div > div > div > div > div > div > div.rc-PaginationControls.horizontal-box.align-items-right.large-style.cds.css-k2t558 > div > button:nth-child({})'

    try:
        driver.get(URL) # get the page
        time.sleep(2) # wait for the page to load

        if num_pages == 0:
            return driver.page_source

        pages = [driver.page_source] # the first page
        for i in range(num_pages): # subsequent pages
            # The button is the 8th child for the first three clicks and the
            # 9th child from then on.
            selector = next_button.format(8 if i <= 2 else 9)
            try:
                driver.find_element_by_css_selector(selector).click()
            except Exception:
                # Deliberate best effort: keep what was collected so far.
                # Unlike a bare ``except``, Ctrl-C still interrupts the run.
                print(f"Couldn't locate next page button on page {i+2}")
                break

            time.sleep(1)
            pages.append(driver.page_source)
        return pages
    finally:
        driver.quit() # close browser, even if navigation failed

results = getPages(URL, num_pages=9)

In [4]:
import pandas as pd

def getText(l):
    """Return the whitespace-stripped ``.text`` of every element in ``l``."""
    stripped = []
    for element in l:
        stripped.append(element.text.strip())
    return stripped
def getResultURLs(soup):
    """Build absolute course URLs from the result-title links found in ``soup``.

    Every ``<a class="result-title-link">`` that carries an ``href`` is looked
    up and its ``href`` is appended to the Coursera base address.
    """
    base = 'https://www.coursera.org'
    links = soup.find_all('a', {'class':'result-title-link'}, href=True)
    URLs = []
    for link in links:
        URLs.append(base + str(link['href']))
    return URLs
# Class attribute of the <div> that wraps one course card in the search results.
_CARD_INFO_CLASS = 'cds-69 card-info css-0 cds-71 cds-grid-item'
# Placeholder stored when a card lacks the requested element.
_MISSING = 'Nan'

def _getChildTexts(soup, parent_class, child_tag, child_class):
    """Collect one text value per ``<div class=parent_class>`` found in ``soup``.

    For each parent the first ``child_tag`` element with class ``child_class``
    is looked up; its stripped text is recorded, or ``'Nan'`` when the parent
    has no such element. The result has one entry per parent, in document
    order.
    """
    results = []
    for parent in soup.find_all('div', {'class': parent_class}):
        child = parent.find(child_tag, {'class': child_class})
        # find() yields None when nothing matches; test for that explicitly
        # rather than catching every possible exception.
        results.append(_MISSING if child is None else child.text.strip())
    return results

def getRatings(soup):
    """Rating text of every ``rating-enroll-wrapper`` block (``'Nan'`` if absent)."""
    return _getChildTexts(soup, 'rating-enroll-wrapper', 'span', 'ratings-text')

def getCourseNames(soup):
    """Title text of every course card (``'Nan'`` if absent)."""
    return _getChildTexts(soup, _CARD_INFO_CLASS, 'h2', 'cds-1 card-title css-iyr9nj cds-3')

def getCourseDifficulties(soup):
    """Difficulty label of every course card (``'Nan'`` if absent)."""
    return _getChildTexts(soup, _CARD_INFO_CLASS, 'span', 'cds-1 difficulty css-lqm5si cds-3')

def getCoursePartners(soup):
    """Partner name of every course card (``'Nan'`` if absent)."""
    return _getChildTexts(soup, _CARD_INFO_CLASS, 'span', 'cds-1 partner-name css-mx49ok cds-3')

def getCourseTypes(soup):
    """Text of the pill element of every course card (``'Nan'`` if absent)."""
    return _getChildTexts(soup, _CARD_INFO_CLASS, 'span', 'cds-1 withoutGradient pillContainer css-v4ktz5 cds-3')
        

def getDeets(page): # get the details
    """Parse one search-results page into a DataFrame of course details.

    ``page`` is the HTML source of a results page. The returned frame has the
    columns ``name``, ``difficulty``, ``partner``, ``course_type``, ``rating``
    and ``URL``, each filled by the matching extractor function. The frame is
    built from a dict of lists, so the extractors must all return the same
    number of entries.
    """
    soup = BeautifulSoup(page, 'html.parser')

    columns = {
        'name': getCourseNames(soup),              # course names
        'difficulty': getCourseDifficulties(soup), # course difficulties
        'partner': getCoursePartners(soup),        # partner offering the course
        'course_type': getCourseTypes(soup),       # kind of offering (pill text)
        'rating': getRatings(soup),                # course rating
        'URL': getResultURLs(soup),                # absolute link to the course page
    }
    return pd.DataFrame(columns)

# Parse every scraped results page and stack the per-page frames into one table.
# DataFrame.append (deprecated in pandas 1.4, removed in 2.0) copied the whole
# accumulated frame on every iteration; build all frames first and concatenate once.
page_frames = [getDeets(page) for page in results]
if page_frames:
    # Row labels are kept per page (they restart at 0 for each page), as before.
    df = pd.concat(page_frames)
else:
    df = pd.DataFrame(columns=['name', 'difficulty', 'partner', 'course_type', 'rating', 'URL'])

df.to_csv('PythonCourses.csv')
df

Unnamed: 0,name,difficulty,partner,course_type,rating,URL
0,Python for Everybody,Beginner,University of Michigan,Specialization,4.8,https://www.coursera.org/specializations/python
1,Google IT Automation with Python,Beginner,Google,Professional Certificate,4.7,https://www.coursera.org/professional-certific...
2,Python 3 Programming,Beginner,University of Michigan,Specialization,4.7,https://www.coursera.org/specializations/pytho...
3,Building a REST API with Python and Flask,Intermediate,Coursera Project Network,Guided Project,Nan,https://www.coursera.org/projects/building-a-r...
4,Crash Course on Python,Beginner,Google,Course,4.8,https://www.coursera.org/learn/python-crash-co...
...,...,...,...,...,...,...
5,COVID19 Data Visualization Using Python,Intermediate,Coursera Project Network,Guided Project,4.6,https://www.coursera.org/projects/covid19-data...
6,Create Interactive Dashboards with Streamlit a...,Intermediate,Coursera Project Network,Guided Project,4.7,https://www.coursera.org/projects/interactive-...
7,Data Processing Using Python,Beginner,Nanjing University,Course,4.1,https://www.coursera.org/learn/python-data-pro...
8,"Python Project: pillow, tesseract, and opencv",Intermediate,University of Michigan,Course,4,https://www.coursera.org/learn/python-project


In [5]:
# `results` is a list of page sources only when getPages() ran with num_pages != 0;
# with num_pages=0 it is a single HTML string and len() would count characters.
len(results) # use to check number of search pages processed

11

In [51]:
# getting the course description and skill and outcome lists
from tqdm import tqdm


def getPageContent(URL):
    """Fetch one page in its own Edge session and return it parsed.

    A new driver is started for every call and is always shut down again, even
    when loading the page fails, so repeated calls do not leave browser
    processes behind.

    Parameters
    ----------
    URL : str
        Address of the page to load.

    Returns
    -------
    The result of ``getSoup`` applied to the page source.
    """
    # initialize webscraper options
    options = EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # Raw string so the backslashes in the Windows path are not read as escapes.
    driver = Edge(executable_path=r'C:\ProgramData\Anaconda3\Scripts\msedgedriver.exe', options=options)
    try:
        driver.get(URL) # get the page
        time.sleep(1) # wait for the page to load
        page = driver.page_source
    finally:
        driver.quit() # close browser
    return getSoup(page)

def getPageContent_list(url_list):
    """Fetch several pages with one shared Edge session and return them parsed.

    The URLs are visited in order (with a progress bar and a one-second pause
    after each load). The browser is always closed afterwards, even when one
    of the page loads raises.

    Parameters
    ----------
    url_list : iterable of str
        Addresses to visit.

    Returns
    -------
    list
        One ``BeautifulSoup`` object per URL, in input order.
    """
    options = EdgeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # Raw string so the backslashes in the Windows path are not read as escapes.
    driver = Edge(executable_path=r'C:\ProgramData\Anaconda3\Scripts\msedgedriver.exe', options=options)

    sources = []
    try:
        for url in tqdm(url_list):
            driver.get(url)
            time.sleep(1) # wait for the page to load
            sources.append(driver.page_source)
    finally:
        driver.quit()

    # Parsing happens only after the browser has been released.
    return [BeautifulSoup(source, 'html.parser') for source in sources]

def getSoup(page):
    """Parse ``page`` (HTML source) with BeautifulSoup's ``'html.parser'`` backend."""
    parsed = BeautifulSoup(page, 'html.parser')
    return parsed

def getCourseOutcomes(soup):
    """Return the learning-outcome texts listed on a course page.

    Looks for the ``CmlLearningObjectives`` container and returns the stripped
    text of each ``<p>`` inside it; an empty list when the page has no such
    container.
    """
    outcomeContainer = soup.find('div', {'class':'CmlLearningObjectives border-a p-x-2 p-t-1'})
    # Identity test: "no match" is exactly None, and ``is`` cannot be affected
    # by a custom __eq__ on the found element.
    if outcomeContainer is None:
        return []
    return getText(outcomeContainer.find_all('p'))

def getCourseSkills(soup):
    """Return the skill tags listed on a course page.

    Looks for the ``<ul role="list" class="css-uqope5">`` container and returns
    the stripped text of each ``<span class="_ontdeqt">`` inside it; an empty
    list when the page has no such container.
    """
    skillsContainer = soup.find('ul', {'role':'list', 'class':'css-uqope5'})
    # Identity test: "no match" is exactly None, and ``is`` cannot be affected
    # by a custom __eq__ on the found element.
    if skillsContainer is None:
        return []
    return getText(skillsContainer.find_all('span', {'class':'_ontdeqt'}))

def getAllOutcomes(url):
    """Fetch one course page and return ``(outcomes, skills)`` for it.

    The page is loaded through ``getPageContent``, which already returns the
    parsed soup, so no separate parsing step is needed. (The previous version
    called ``getPage``, a name that is not defined in this notebook.)

    Returns
    -------
    tuple of (list of str, list of str)
        The learning-outcome texts and the skill tags of the course.
    """
    soup = getPageContent(url)
    return getCourseOutcomes(soup), getCourseSkills(soup)

In [56]:
from tqdm import tqdm

# Visit every course URL collected in the search-results table and keep the
# parsed page for each one (same order as the rows of df).
url_list = df['URL'].to_list()

soups = getPageContent_list(url_list)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [03:32<00:00,  2.13s/it]


In [57]:
# One inner list per parsed course page: its skill tags and its learning outcomes.
skills = list(map(getCourseSkills, soups))
l_outcomes = list(map(getCourseOutcomes, soups))

In [58]:
# Plain-text dump of the per-page skill lists (one inner list per entry of `soups`).
print(skills)

[['Json', 'Xml', 'Python Programming', 'Database (DBMS)', 'Python Syntax And Semantics', 'Basic Programming Language', 'Computer Programming', 'Data Structure', 'Tuple', 'Web Scraping', 'Sqlite', 'SQL'], ['Using Version Control', 'Troubleshooting & Debugging', 'Python Programming', 'Configuration Management', 'Automation', 'Basic Python Data Structures', 'Fundamental Programming Concepts', 'Basic Python Syntax', 'Object-Oriented Programming (OOP)', 'Setting up your Development Environment', 'Regular Expression (REGEX)', 'Testing in Python'], ['Json', 'Computer Programming', 'Python Programming', 'Sorting'], ['Web Development', 'Web Application', 'Python Programming', 'Flask', 'Software Engineering'], ['Basic Python Data Structures', 'Fundamental Programming Concepts', 'Basic Python Syntax', 'Python Programming', 'Object-Oriented Programming (OOP)'], ['Text Mining', 'Python Programming', 'Pandas', 'Matplotlib', 'Numpy', 'Data Cleansing', 'Data Virtualization', 'Data Visualization (DataV

In [55]:
# Plain-text dump of the per-page learning-outcome lists (one inner list per entry of `soups`).
print(l_outcomes)

[['Install Python and write your first program', 'Describe the basics of the Python programming language', 'Use variables to store, retrieve and calculate information', 'Utilize core programming tools such as functions and loops'], ['Automate tasks by writing Python scripts', 'Use Git and GitHub for version control', 'Manage IT resources at scale, both for physical machines and virtual machines in the cloud', 'Analyze real-world IT problems and implement the appropriate strategies to solve those problems'], ['How to inspect and understand APIs and third party libraries to be used with Python 3', 'How to apply the Python imaging library (pillow) to open, view, and manipulate images, including cropping, resizing, recoloring, and overlaying text', 'How to apply the python tesseract (py-tesseract) library with Python 3 in order to detect text in images through optical character recognition (OCR)', 'How to apply the open source computer vision library (opencv) to detect faces in images, & h

In [34]:


# Accumulators referenced only by the commented-out loop further down in this cell.
courseOutcomes = []
courseSkills = []

# Single-page check: fetch and parse just the first course URL.
c0 = getPageContent(url_list[0])
# NOTE(review): the name is misleading — `skills` is bound to the result of
# getCourseOutcomes(), i.e. the learning-outcome texts, not the skill tags.
skills = getCourseOutcomes(c0)


# for url in tqdm(df.URL):
#     a, b = getAllOutcomes(url)
#     courseOutcomes.append(a)
#     courseSkills.append(b)

https://www.coursera.org/specializations/python


In [36]:
# NOTE(review): the name is misleading — `outcomes` is bound to the result of
# getCourseSkills(), i.e. the skill tags of page c0, not its learning outcomes.
outcomes = getCourseSkills(c0)
print(outcomes)

['Json', 'Xml', 'Python Programming', 'Database (DBMS)', 'Python Syntax And Semantics', 'Basic Programming Language', 'Computer Programming', 'Data Structure', 'Tuple', 'Web Scraping', 'Sqlite', 'SQL']


In [35]:
# NOTE(review): after the single-page cell (`skills = getCourseOutcomes(c0)`) this
# name holds the learning outcomes of c0, which is what the output below shows.
skills

['Install Python and write your first program',
 'Describe the basics of the Python programming language',
 'Use variables to store, retrieve and calculate information',
 'Utilize core programming tools such as functions and loops']