In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Address of the catalog page that is downloaded with requests and parsed for course blocks in the cells below
url = 'https://catalog.uta.edu/business/infosystems/#courseinventory'

### Download all HTML content from the webpage using requests

In [3]:
# Fetch the catalog page. The timeout keeps this cell from hanging indefinitely when the
# server never answers, and raise_for_status() stops here on an HTTP error (4xx/5xx)
# instead of letting an error page be parsed as if it were the course list.
webpage = requests.get(url, timeout=30)
webpage.raise_for_status()
page_content = webpage.content  # raw response body (bytes)
print('Sample of the scraped content from the webpage:')
page_content[:1000]

Sample of the scraped content from the webpage:


b'<!doctype html>\n<html class="no-js" xml:lang="en" lang="en" dir="ltr">\n\n<head>\n<meta http-equiv="X-UA-Compatible" content="IE=Edge" />\n<title>Information Systems and Operations Management &lt; University of Texas Arlington</title>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />\n<link rel="search" type="application/opensearchdescription+xml"\n\t\t\thref="/search/opensearch.xml" title="Catalog" />\n<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0" />\n<link href="/favicon.ico" rel="shortcut icon" />\n<link rel="stylesheet" type="text/css" href="/css/reset.css" />\n<link rel="stylesheet" type="text/css" href="/fonts/font-awesome/font-awesome.min.css" />\n<link rel="stylesheet" type="text/css" href="/css/courseleaf.css" />\n<link rel="stylesheet" type="text/css" href="/css/screen.css" media="screen" />\n<link rel="stylesheet" type="text/css" href="/css/print.css" media="print" />\n<script type="text/javascript" src="/js/

### Parse the HTML content and extract relevant text using BeautifulSoup

In [4]:
# Build a navigable document tree from the downloaded bytes with Python's built-in HTML parser
soup = BeautifulSoup(markup=page_content, features='html.parser')

In [5]:
# Each course (title plus description) sits in its own <div class="courseblock">,
# so collect those divs and keep only the text of each one
div_courses = soup.find_all('div', class_='courseblock')
courses_text = [block.get_text() for block in div_courses]
print(f'Total number of courses: {len(courses_text)}')
print('\n\nSample Course Block:')
courses_text[5]

Total number of courses: 119


Sample Course Block:


'\nBSTAT\xa02305.  INTRODUCTORY STATISTICS FOR BUSINESS ANALYTICS.  3 Hours.\n\nGuiding business and economic decision-making with the use of descriptive and inferential statistical techniques. Topics include the collection, description and summarization of business and economic data; probability as a foundation of business intelligence; discrete and continuous random variables, their probability and sampling distributions, and their application in business analytics; estimation and confidence intervals for (and tests of hypotheses regarding) the population mean in business settings; and correlation and linear regression analysis as business decision-making tools. Software is used to conduct analyses throughout the course.\n\n'

In [6]:
# Every course block has the same layout, so the title and description can be pulled out by pattern;
# this dictionary will hold the result as {course title: course description}
courses_dict = {}

In [7]:
# From the extracted text, find the title and the description of each course and add them to courses_dict
for course_block in courses_text:
    # Drop newline characters and turn non-breaking spaces (\xa0) into regular spaces
    flat_block = course_block.replace('\n', '').replace('\xa0', ' ')
    # A block reads "<CODE>.  <NAME>.  <N> Hours.<description>": the first three periods delimit
    # the title, so split at most three times and leave the description's own periods untouched.
    # NOTE: a course name that itself contains a period would still be cut at the wrong place.
    pieces = flat_block.split('.', 3)
    if len(pieces) < 3:
        # Not in the expected format: report it instead of aborting the whole loop with an IndexError
        print(f'Skipping course block with unexpected format: {flat_block!r}')
        continue
    # The title keeps its inner periods but not the one that follows "Hours"
    title = '.'.join(pieces[:3])
    # Everything after the third period is the description (empty when nothing follows the title)
    description = '.'.join(pieces[3:])
    courses_dict[title] = description

In [8]:
# Report how many distinct course titles ended up in the dictionary (label and count on separate lines)
print('Number of courses found:', len(courses_dict), sep='\n')

Number of courses found:
119


In [9]:
# Display the full mapping of course title -> course description built in the loop above
courses_dict

{'BANA 3308.  INTRODUCTION TO BUSINESS ANALYTICS.  3 Hours': 'This course introduces students to data mining and business analytics techniques that will enable them to draw actionable insights from data. In addition to tracing the evolution of ideas in Artificial Intelligence (AI), Machine Learning (ML), and Deep Learning (DL), the course provides hands-on exposure to state-of-the-art machine learning algorithms-such as linear, ensemble, and neural network models-that organizations rely on to derive business value. Prerequisite: BSTAT 3321 and INSY 3300.',
 'BANA 3309.  DATA VISUALIZATION AND BUSINESS INTELLIGENCE.  3 Hours': 'This course introduces students to cutting-edge techniques for visualizing data and creating dashboards to facilitate data-driven decision making.  Topics include fundamentals of SQL, preprocessing of data, examining principles and concepts underlying visual characteristics of data, exploring graphs and charts to draw insight from data, assessing the quality of d

### Find all courses whose description contains the keyword "machine learning"

In [10]:
# Walk through every course and print, with a running number, those whose description mentions the keyword
course_count = 0
for title, description in courses_dict.items():
    # Case-insensitive match; skip courses that do not mention "machine learning"
    if 'machine learning' not in description.lower():
        continue
    course_count += 1
    print(f'{course_count}\nCourse title: {title}.\nCourse description: {description}')

1
Course title: BANA 3308.  INTRODUCTION TO BUSINESS ANALYTICS.  3 Hours.
Course description: This course introduces students to data mining and business analytics techniques that will enable them to draw actionable insights from data. In addition to tracing the evolution of ideas in Artificial Intelligence (AI), Machine Learning (ML), and Deep Learning (DL), the course provides hands-on exposure to state-of-the-art machine learning algorithms-such as linear, ensemble, and neural network models-that organizations rely on to derive business value. Prerequisite: BSTAT 3321 and INSY 3300.
2
Course title: BANA 4308.  ADVANCED DATA SCIENCE.  3 Hours.
Course description: This course provides an in-depth understanding of machine learning concepts and algorithms using Python. Students will receive hands-on training on supervised learning algorithms such as KNN, Naïve Bayes, Linear and Logistic Regression, Support Vector Machines, Decision Trees and Ensembles, and Artificial Neural Networks (AN