# Installing Packages

In [159]:
!pip install bs4

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,199 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,591 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://security.ubuntu.com/ubuntu jammy-sec

# Importing Libraries


In [160]:
from bs4 import BeautifulSoup
import requests
import os

import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Performing web scraping for coursera's courses

## Testing the parsers

In [102]:
response = requests.get('https://www.coursera.org/courses')
response  # response 200 means valid

<Response [200]>

In [103]:
response.content



In [104]:
html_soup = BeautifulSoup(response.content, 'html.parser')
html_soup

<!DOCTYPE html>
<html dir="ltr" itemtype="http://schema.org" lang="en" xmlns:fb="http://ogp.me/ns/fb#"><head><link crossorigin="" href="https://d3njjcbhbojbot.cloudfront.net" rel="preconnect"/><meta content="IE=Edge,chrome=IE7" http-equiv="X-UA-Compatible"/><meta charset="utf-8"/><meta content="Coursera" property="og:site_name"/><meta content="727836538,4807654" property="fb:admins"/><meta content="823425307723964" property="fb:app_id"/><meta content="Coursera" name="twitter:site"/><meta content="Coursera" name="twitter:app:name:iphone"/><meta content="Coursera" name="twitter:app:name:ipad"/><meta content="Coursera" name="twitter:app:name:googleplay"/><meta content="id736535961" name="twitter:app:id:iphone"/><meta content="id736535961" name="twitter:app:id:ipad"/><meta content="org.coursera.android" name="twitter:app:id:googleplay"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link href="https://d3njjcbhbojbot.cloudfront.net/web/images/favicons/apple-touch-icon

In [105]:
url = f'https://www.coursera.org/courses?page={1}'
page_content = requests.get(url)
soup = BeautifulSoup(page_content.content, 'html.parser')
soup

<!DOCTYPE html>
<html dir="ltr" itemtype="http://schema.org" lang="en" xmlns:fb="http://ogp.me/ns/fb#"><head><link crossorigin="" href="https://d3njjcbhbojbot.cloudfront.net" rel="preconnect"/><meta content="IE=Edge,chrome=IE7" http-equiv="X-UA-Compatible"/><meta charset="utf-8"/><meta content="Coursera" property="og:site_name"/><meta content="727836538,4807654" property="fb:admins"/><meta content="823425307723964" property="fb:app_id"/><meta content="Coursera" name="twitter:site"/><meta content="Coursera" name="twitter:app:name:iphone"/><meta content="Coursera" name="twitter:app:name:ipad"/><meta content="Coursera" name="twitter:app:name:googleplay"/><meta content="id736535961" name="twitter:app:id:iphone"/><meta content="id736535961" name="twitter:app:id:ipad"/><meta content="org.coursera.android" name="twitter:app:id:googleplay"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link href="https://d3njjcbhbojbot.cloudfront.net/web/images/favicons/apple-touch-icon

In [106]:
soup.find_all("div", "css-18juqt8 cds-ProductCard-partners")[4].get_text()

'DSMultiple educators'

In [107]:
element = soup.find_all('div', class_='css-18juqt8 cds-ProductCard-partners')[4]
if element:
    title = element.get('title')
    print(title)

DeepLearning.AI, Stanford University


In [108]:
soup.find_all('a', "cds-119 cds-113 cds-115 cds-CommonCard-titleLink css-si869u cds-142")[4].get('href')

'/specializations/machine-learning-introduction'

In [109]:
full_link = 'https://www.coursera.org' + soup.find_all('a', "cds-119 cds-113 cds-115 cds-CommonCard-titleLink css-si869u cds-142")[4].get('href')
print(full_link)

https://www.coursera.org/specializations/machine-learning-introduction


In [110]:
soup.find_all('h3', 'cds-CommonCard-title css-6ecy9b')[3].get_text()

'Google Digital Marketing & E-commerce'

In [111]:
soup.find_all('div', 'cds-CommonCard-bodyContent')[4].get_text().split(': ')[1]

'Machine Learning, Machine Learning Algorithms, Applied Machine Learning, Algorithms, Deep Learning, Machine Learning Software, Artificial Neural Networks, Human Learning, Python Programming, Regression, Statistical Machine Learning, Tensorflow, Mathematics, Critical Thinking, Network Model, Training, Reinforcement Learning'

In [112]:
float(soup.find_all('div', 'cds-RatingStat-meter')[4].get('aria-valuenow'))

4.9

In [113]:
soup.find_all('div', 'css-vac8rf')[4].get_text().split()[0]

'30K'

In [114]:
soup.find_all('div', 'cds-CommonCard-metadata')[4].get_text().split()[0]

'Beginner'

In [115]:
soup.find_all('div', 'cds-CommonCard-metadata')[4].get_text().split('· ')[2]

'1 - 3 Months'

## Final Scraping Functions

In [206]:
def initialize_scraper(starting_page, ending_page):
  # Empty lists for storing course data
  course_organization = []
  course_link = []
  course_name = []
  skills = []
  course_rating = []
  course_num_reviews = []
  course_difficulty = []
  course_duration = []

  # Iterating through all the pages
  for i in range(starting_page, ending_page):
    url = "https://www.coursera.org/courses?page=" + str(i) + "&sortBy=BEST_MATCH"
    page_response = requests.get(url)
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    scrap_courses(course_organization, course_link, course_name, skills, course_rating, course_num_reviews, course_difficulty, course_duration, page_soup)

  return course_organization, course_link, course_name, skills, course_rating, course_num_reviews, course_difficulty, course_duration

In [207]:
def scrap_courses(course_organization, course_link, course_name, skills, course_rating, course_num_reviews, course_difficulty, course_duration, soup):
    course_cards = soup.find_all('li', "cds-9 css-0 cds-11 cds-grid-item cds-56 cds-64 cds-76 cds-90")
    print(f'No. of courses on current page: {len(course_cards)}')

    for i in range(0,len(course_cards)):
        card = course_cards[i]

        card_data = {
            'organization': 'unk',
            'link': 'unk',
            'name': 'unk',
            'skills': 'unk',
            'rating': 'unk',
            'num_reviews': 'unk',
            'difficulty': 'unk',
            'duration': 'unk'
        }

        # Get organization
        try:
            org = card.find('div', class_='css-18juqt8 cds-ProductCard-partners')
            if org:
                card_data['organization'] = org.get('title')
        except:
            pass

        # Get link
        try:
            link = card.find('a', "cds-119 cds-113 cds-115 cds-CommonCard-titleLink css-si869u cds-142")
            if link and link.get('href'):
                card_data['link'] = 'https://www.coursera.org' + link.get('href')
        except:
            pass

        # Get name
        try:
            name = card.find('h3', 'cds-CommonCard-title css-6ecy9b')
            if name:
                card_data['name'] = name.get_text()
        except:
            pass

        # Get skills
        try:
            skill = card.find('div', 'cds-CommonCard-bodyContent')
            if skill and ': ' in skill.get_text():
                card_data['skills'] = skill.get_text().split(': ')[1]
        except:
            pass

        # Get rating
        try:
            rating = card.find('div', 'cds-RatingStat-meter')
            if rating and rating.get('aria-valuenow'):
                card_data['rating'] = float(rating.get('aria-valuenow'))
        except:
            pass

        # Get number of reviews
        try:
            num_reviews = card.find('div', 'css-vac8rf')
            if num_reviews:
                card_data['num_reviews'] = num_reviews.get_text().split()[0]
        except:
            pass

        # Get difficulty and duration
        try:
            metadata = card.find('div', 'cds-CommonCard-metadata')
            if metadata:
                text = metadata.get_text()
                card_data['difficulty'] = text.split()[0]
                if '· ' in text:
                    card_data['duration'] = text.split('· ')[2]
        except:
            pass

        course_organization.append(card_data['organization'])
        course_link.append(card_data['link'])
        course_name.append(card_data['name'])
        skills.append(card_data['skills'])
        course_rating.append(card_data['rating'])
        course_num_reviews.append(card_data['num_reviews'])
        course_difficulty.append(card_data['difficulty'])
        course_duration.append(card_data['duration'])

## Trying out our scraper

In [208]:
course_organization, course_link, course_name, skills, course_rating, course_num_reviews, course_difficulty, course_duration = initialize_scraper(47, 48)

No. of courses on current page: 12


In [209]:
course_organization, course_link, course_name, skills, course_rating, course_num_reviews, course_difficulty, course_duration

(['Università Bocconi',
  'University of Colorado Boulder',
  'University of London',
  'Microsoft',
  'Universidad Nacional Autónoma de México',
  'Copenhagen Business School',
  'EDHEC Business School',
  'University of Michigan',
  'Imperial College London',
  'MedCerts',
  'SkillUp EdTech',
  'Berklee'],
 ['https://www.coursera.org/learn/food-beverage-management',
  'https://www.coursera.org/specializations/effective-business-communication',
  'https://www.coursera.org/learn/corporatestrategy',
  'https://www.coursera.org/professional-certificates/microsoft-genai-for-data-analysis',
  'https://www.coursera.org/learn/contabilidad',
  'https://www.coursera.org/learn/neuromarketing',
  'https://www.coursera.org/specializations/investment-management-python-machine-learning',
  'https://www.coursera.org/specializations/web-design',
  'https://www.coursera.org/learn/creative-thinking-techniques-and-tools-for-success',
  'https://www.coursera.org/learn/professionalism-allied-health',
  'h

# Generating csv data

In [210]:
%timeit
course_organization, course_link, course_name, skills, course_rating, course_num_reviews, course_difficulty, course_duration = initialize_scraper(1, 85)

No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on current page: 12
No. of courses on cu

In [211]:
coursera = pd.DataFrame({
    'course_org': course_organization,
    'url': course_link,
    'course_title': course_name,
    'subject': skills,
    'Rating': course_rating,
    'num_reviews': course_num_reviews,
    'level': course_difficulty,
    'course_duration': course_duration
})
coursera

Unnamed: 0,course_org,url,course_title,subject,Rating,num_reviews,level,course_duration
0,Google,https://www.coursera.org/professional-certific...,Google Cybersecurity,"Network Security, Python Programming, Linux, C...",4.8,44K,Beginner,3 - 6 Months
1,Google,https://www.coursera.org/professional-certific...,Google Data Analytics,"Data Analysis, R Programming, SQL, Spreadsheet...",4.8,159K,Beginner,3 - 6 Months
2,Google,https://www.coursera.org/professional-certific...,Google Project Management:,"Project Management, Strategy and Operations, L...",4.8,120K,Beginner,3 - 6 Months
3,Google,https://www.coursera.org/professional-certific...,Google Digital Marketing & E-commerce,"Digital Marketing, Marketing, Marketing Manage...",4.8,35K,Beginner,3 - 6 Months
4,"DeepLearning.AI, Stanford University",https://www.coursera.org/specializations/machi...,Machine Learning,"Machine Learning, Machine Learning Algorithms,...",4.9,30K,Beginner,1 - 3 Months
...,...,...,...,...,...,...,...,...
1003,Universiteit Leiden,https://www.coursera.org/learn/eu-making-europ...,EU policy and implementation: making Europe wo...,unk,4.4,185,Intermediate,1 - 3 Months
1004,University of Michigan,https://www.coursera.org/learn/python-data-ana...,Introduction to Data Science in Python,"Computer Programming, Data Analysis, Data Mana...",4.5,27K,Intermediate,1 - 4 Weeks
1005,Meta,https://www.coursera.org/learn/principles-of-u...,Principles of UX/UI Design,"User Experience, Front-End Web Development, Hu...",4.7,959,Beginner,1 - 3 Months
1006,University of Illinois Urbana-Champaign,https://www.coursera.org/specializations/innov...,Innovation: From Creativity to Entrepreneurship,"Entrepreneurship, Leadership and Management, I...",4.8,3.5K,Beginner,3 - 6 Months


In [212]:
coursera.sample(15)

Unnamed: 0,course_org,url,course_title,subject,Rating,num_reviews,level,course_duration
939,Università Bocconi,https://www.coursera.org/learn/corp-sustainabi...,Corporate Sustainability. Understanding and Se...,"Leadership and Management, Organizational Deve...",4.6,867,Mixed,1 - 3 Months
907,Rutgers the State University of New Jersey,https://www.coursera.org/specializations/healt...,Healthcare Organization Operations,"Leadership and Management, Organizational Deve...",4.6,632,Beginner,3 - 6 Months
521,University of Colorado Boulder,https://www.coursera.org/specializations/excel...,Excel/VBA for Creative Problem Solving,"Spreadsheet Software, Computer Programming, Pr...",4.8,4.7K,Beginner,3 - 6 Months
768,Microsoft,https://www.coursera.org/professional-certific...,Microsoft Data Visualization,"Business Intelligence, Data Visualization, Sta...",unk,unk,Beginner,3 - 6 Months
11,IBM,https://www.coursera.org/professional-certific...,IBM Generative AI Engineering,"Machine Learning, Algorithms, Artificial Neura...",4.6,77K,Beginner,3 - 6 Months
806,IESE Business School,https://www.coursera.org/learn/financial-accou...,Accounting: Principles of Financial Accounting,"Accounting, Financial Accounting",4.8,3.1K,Beginner,1 - 4 Weeks
110,University of Pennsylvania,https://www.coursera.org/specializations/busin...,Business Analytics,"Data Analysis, Business Analysis, Probability ...",4.6,18K,Beginner,3 - 6 Months
432,University of Michigan,https://www.coursera.org/learn/python-data,Python Data Structures,"Computational Logic, Computer Programming, Com...",4.9,96K,Beginner,1 - 3 Months
923,Microsoft,https://www.coursera.org/specializations/micro...,Microsoft Copilot: Your Everyday AI Companion,Critical Thinking,4.3,22,Beginner,1 - 3 Months
941,Google,https://www.coursera.org/learn/the-nuts-and-bo...,The Nuts and Bolts of Machine Learning,"Machine Learning, Python Programming",4.8,446,Advanced,1 - 3 Months


In [213]:
coursera.duplicated().sum()

0

In [214]:
coursera.to_csv('coursera.csv')

In [215]:
df = pd.read_csv('/content/coursera.csv')
df

Unnamed: 0.1,Unnamed: 0,course_org,url,course_title,subject,Rating,num_reviews,level,course_duration
0,0,Google,https://www.coursera.org/professional-certific...,Google Cybersecurity,"Network Security, Python Programming, Linux, C...",4.8,44K,Beginner,3 - 6 Months
1,1,Google,https://www.coursera.org/professional-certific...,Google Data Analytics,"Data Analysis, R Programming, SQL, Spreadsheet...",4.8,159K,Beginner,3 - 6 Months
2,2,Google,https://www.coursera.org/professional-certific...,Google Project Management:,"Project Management, Strategy and Operations, L...",4.8,120K,Beginner,3 - 6 Months
3,3,Google,https://www.coursera.org/professional-certific...,Google Digital Marketing & E-commerce,"Digital Marketing, Marketing, Marketing Manage...",4.8,35K,Beginner,3 - 6 Months
4,4,"DeepLearning.AI, Stanford University",https://www.coursera.org/specializations/machi...,Machine Learning,"Machine Learning, Machine Learning Algorithms,...",4.9,30K,Beginner,1 - 3 Months
...,...,...,...,...,...,...,...,...,...
1003,1003,Universiteit Leiden,https://www.coursera.org/learn/eu-making-europ...,EU policy and implementation: making Europe wo...,unk,4.4,185,Intermediate,1 - 3 Months
1004,1004,University of Michigan,https://www.coursera.org/learn/python-data-ana...,Introduction to Data Science in Python,"Computer Programming, Data Analysis, Data Mana...",4.5,27K,Intermediate,1 - 4 Weeks
1005,1005,Meta,https://www.coursera.org/learn/principles-of-u...,Principles of UX/UI Design,"User Experience, Front-End Web Development, Hu...",4.7,959,Beginner,1 - 3 Months
1006,1006,University of Illinois Urbana-Champaign,https://www.coursera.org/specializations/innov...,Innovation: From Creativity to Entrepreneurship,"Entrepreneurship, Leadership and Management, I...",4.8,3.5K,Beginner,3 - 6 Months
