In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Catalog page to scrape (the URL fragment points at the course inventory section).
url = "https://catalog.utsa.edu/graduate/business/managementsciencestatistics/#courseinventory"

# A timeout keeps this cell from hanging forever on an unresponsive server,
# and raise_for_status() prevents silently parsing an HTTP error page (which
# would otherwise just yield zero course blocks further down).
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# Parse the raw bytes with Python's built-in HTML parser.
soup = BeautifulSoup(html, 'html.parser')

# Non-breaking space character (U+00A0).
# NOTE(review): not referenced in the cells visible here — confirm whether it is still needed.
nonBreakSpace = u'\xa0'

In [4]:
# One-time environment setup (run in a shell, not in this cell):
#   pip install -U spacy
#   python -m spacy download en_core_web_sm
import spacy

# Load the "en_core_web_sm" English pipeline (tokenizer, tagger, parser and NER).
# `nlp` is the callable the course-parsing cell uses to turn text into Doc objects.
nlp = spacy.load("en_core_web_sm")

In [5]:
# One (title_doc, description_doc) tuple of spaCy Doc objects per course.
course_list = []

# Each course is looked up as a <div class="courseblock"> element.
for course in soup.find_all('div', {'class': 'courseblock'}):

    title_tag = course.find('p', {'class': 'courseblocktitle'})
    desc_tag = course.find('p', {'class': 'courseblockdesc'})

    # find() returns None when the element is absent. A block without a title
    # cannot be identified, so skip it instead of crashing on None.get_text().
    if title_tag is None:
        continue

    # Extract the visible text and trim surrounding whitespace.
    course_name = title_tag.get_text().strip()
    # If the description paragraph is missing, keep the course with an empty
    # description so it still appears in the per-course counts.
    course_desc = desc_tag.get_text().strip() if desc_tag is not None else ""

    # Run both strings through the spaCy pipeline so later cells can use
    # tokens and noun chunks.
    course_name_nlp = nlp(course_name)
    course_desc_nlp = nlp(course_desc)

    course_list.append((course_name_nlp, course_desc_nlp))

# Debugging aid: to inspect the first parsed description, iterate over
# course_list[0][1] and print token.text, token.pos_, token.dep_ for each token.

In [22]:
# Overall number of parsed courses.
print(len(course_list), "courses in the Department of Management Science and Statistics Graduate Catalog.")

# Tally courses by subject prefix of the title. The two prefixes are mutually
# exclusive, so independent counts match an if/elif chain.
ms_count = sum(1 for name, _ in course_list if name.text.startswith("MS"))
sta_count = sum(1 for name, _ in course_list if name.text.startswith("STA"))

print(ms_count, "being MS courses")
print(sta_count, "being STA courses")

# Collect, in a single pass, the titles of courses whose description contains
# the exact token "Python" (case-sensitive, token-level match). Deriving both
# the count and the listing from the same list guarantees they always agree,
# instead of re-scanning and stopping at a hard-coded number of matches.
python_courses = [
    name.text
    for name, desc in course_list
    if any(token.text == "Python" for token in desc)
]
python_count = len(python_courses)

print("Only", python_count, "courses mention Python in the description, with those courses being:")
for course_title in python_courses:
    print(course_title)

65 courses in the Department of Management Science and Statistics Graduate Catalog
20 being MS courses
45 being STA courses
Only 3 courses mention Python in the description, with those courses being:
STA 6233.  R Programming for Data Science.  (3-0) 3 Credit Hours.
STA 6923.  Introduction to Statistical Learning.  (3-0) 3 Credit Hours.
STA 6933.  Advanced Topics in Statistical Learning.  (3-0) 3 Credit Hours.


In [24]:
from collections import Counter

# Tally every noun chunk appearing in the descriptions of courses whose
# title begins with "MS"
ms_phrases = Counter(
    chunk.text
    for name, desc in course_list
    if name.text.startswith("MS")
    for chunk in desc.noun_chunks
)

# Report the three most frequent noun chunks alongside their counts
print("Top three phrases for courses starting with MS:")
top_three = ms_phrases.most_common(3)
for phrase, count in top_three:
    print(phrase, count)


Top three phrases for courses starting with MS:
Differential Tuition 20
Prerequisite 6
services 6


In [6]:
# How many course descriptions contain the phrase "data mining"
# (case-insensitive substring match on the raw description text)
data_mining_count = sum(
    1
    for _, desc in course_list
    if "data mining" in desc.text.lower()
)

# Report the total
print("Number of courses that mention data mining:", data_mining_count)

Number of courses that mention data mining: 2
