In [1]:
#!pip install bs4
#!pip install urllib
#!pip install plotly

# Imports

In [2]:
# standard library: JSON parsing, retry back-off, 'last updated' timestamp, HTTP requests
import json
import time
from datetime import datetime
from urllib.request import urlopen, Request

# third-party: HTML parsing
from bs4 import BeautifulSoup

# third-party: plotting and data manipulation
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.express as px

# Scrape the Course Reviews Data

Data scraped from OMS Central

Explanation of data scraping available here: 
https://medium.datadriveninvestor.com/sentiment-analysis-of-stocks-from-financial-news-using-python-82ebdcefb638

In [3]:
url = 'https://www.omscentral.com/'

# The request is sent with a browser-like User-Agent header.
req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'})

try:
    response = urlopen(req)
except OSError:
    # urllib's URLError/HTTPError (and socket-level failures) are OSError subclasses.
    # If the request fails or is blocked, wait 10 seconds and retry once; a second
    # failure propagates so the notebook stops instead of continuing without data.
    time.sleep(10)
    response = urlopen(req)

# Read the contents of the page into 'html'. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers happen to be installed,
# and the HTTP response is closed as soon as the body has been read.
with response:
    html = BeautifulSoup(response.read(), 'html.parser')

# Parse JSON Data

The course data is in JavaScript (between <script> tags at the bottom of the HTML code), in JSON format. Parse it and convert it to DataFrame.

In [4]:
# The course data sits inside the last <script> element of the page.
raw = html.find_all("script")[-1]

# Text content of that element: a JSON document.
json_text = raw.get_text()

# Decode the JSON, then drill down to the list of course records.
page_data = json.loads(json_text)
parsed = page_data['props']['pageProps']['courses']

# Preview the first two course records.
parsed[:2]

[{'_createdAt': '2022-08-15T14:49:24Z',
  '_id': '26e53feb-fbd0-4233-8218-dfb7f56cc46d',
  '_rev': '959qsafI8uoLDPJeciuflB',
  '_type': 'course',
  '_updatedAt': '2022-08-25T21:43:35Z',
  'codes': ['CS-8803-O13'],
  'creditHours': 3,
  'description': 'The goal of this course is to provide students in CS and ECE with the fundamental background on quantum computing and equip them with the skills to write code and optimize quantum programs on real quantum computers.',
  'id': '26e53feb-fbd0-4233-8218-dfb7f56cc46d',
  'isDeprecated': False,
  'isFoundational': True,
  'name': 'Special Topics: Quantum Computing',
  'officialURL': 'https://omscs.gatech.edu/cs-8803-o13-quantum-computing',
  'programs': [{'_key': 'd21a69086f85',
    '_ref': 'b6f2bf84-c2ea-405e-8423-c348e1a94051',
    '_type': 'reference'}],
  'slug': 'special-topics-quantum-computing',
  'syllabus': {'file': {'_type': 'file',
    'asset': {'_ref': 'file-f921a6a8952de8270a8002e7cdd20e798252a800-pdf',
     '_type': 'reference'}}

In [5]:
# Build a DataFrame from the parsed course records (one row per course record),
# then preview the first rows.
raw_df = pd.DataFrame(parsed)
raw_df.head()

Unnamed: 0,_createdAt,_id,_rev,_type,_updatedAt,codes,creditHours,description,id,isDeprecated,...,programs,slug,syllabus,tags,textbooks,reviewCount,rating,difficulty,workload,notesURL
0,2022-08-15T14:49:24Z,26e53feb-fbd0-4233-8218-dfb7f56cc46d,959qsafI8uoLDPJeciuflB,course,2022-08-25T21:43:35Z,[CS-8803-O13],3,The goal of this course is to provide students...,26e53feb-fbd0-4233-8218-dfb7f56cc46d,False,...,"[{'_key': 'd21a69086f85', '_ref': 'b6f2bf84-c2...",special-topics-quantum-computing,"{'file': {'_type': 'file', 'asset': {'_ref': '...",[QC],"[{'_key': '2f5517a7bba6eb4a7b282b9e884f13e8', ...",2,3.0,3.0,15.0,
1,2022-08-19T21:31:17Z,270c4d16-25dd-4f69-9e41-b50e73ed74e2,vLev53Cwnp4JOI2CCHppWQ,course,2022-08-24T17:42:05Z,[CS-8001-OSO],1,,270c4d16-25dd-4f69-9e41-b50e73ed74e2,False,...,"[{'_key': '888bee183d55', '_ref': 'b6f2bf84-c2...",seminar-computational-sociology-seminar,,,,0,,,,
2,2022-08-19T21:55:21Z,28df9ae9-8ade-4b02-bd5a-5506544b33f0,959qsafI8uoLDPJecfUter,course,2022-08-24T17:43:44Z,[PUBP-8813],3,,28df9ae9-8ade-4b02-bd5a-5506544b33f0,False,...,"[{'_key': '15388d856ed7', '_ref': 'c647cd7c-c8...",special-topics-digital-public-policy,,,,0,,,,
3,2022-08-19T21:30:48Z,449cba43-6f4c-4fa3-8e5f-09b6ad89429d,vLev53Cwnp4JOI2CCHptei,course,2022-08-24T17:42:19Z,[CS-8001-OED],1,,449cba43-6f4c-4fa3-8e5f-09b6ad89429d,False,...,"[{'_key': '0bfdb161452e', '_ref': 'b6f2bf84-c2...",seminar-cs-educators-seminar,,,,0,,,,
4,2022-08-19T21:40:20Z,5cbd1444-ec69-4092-82b0-2d9ccf39ec6c,vLev53Cwnp4JOI2CCHkcVA,course,2022-08-24T17:23:30Z,[ECE-6374],3,This course provides an introduction to cyber-...,5cbd1444-ec69-4092-82b0-2d9ccf39ec6c,False,...,"[{'_key': 'e05818d20c8d', '_ref': 'c647cd7c-c8...",cyber-physical-security-in-electric-energy-sys...,,,,0,,,,


# Some Data Cleaning

Refer to comments within the code for the data cleaning steps.

In [None]:
def _name_initials(name):
    """Return the first letters of the capitalized words in ``name`` ('' for non-string input)."""
    if not isinstance(name, str):
        return ''
    return ''.join(word[0] for word in name.split() if word[0].isupper())


# The 'codes' column holds a list of course codes per course; keep the first code as a string.
# .str[0] yields NaN (instead of raising) when the list is empty or the value is missing.
raw_df['code'] = raw_df['codes'].str[0]

# The first part of the course code (before the - sign) can be extracted as the department.
raw_df['dept'] = raw_df['code'].str.split('-').str[0]

# Generate own tag using first letter of each capitalized word in the name as some courses are without tags
raw_df['tag'] = raw_df['name'].apply(_name_initials)

# Some courses already have tags stored in the form of a list; for simplicity, the first
# element of a non-empty list replaces the generated tag. Rows whose 'tags' value is
# missing or empty keep the generated one.
has_tags = raw_df['tags'].apply(lambda tags: isinstance(tags, list) and len(tags) > 0)
raw_df.loc[has_tags, 'tag'] = raw_df.loc[has_tags, 'tags'].str[0]
raw_df.head()

In [None]:
# Print pandas' summary of raw_df (columns, dtypes, non-null counts).
raw_df.info()

We only want courses that have reviews. Also, Distributed Computing has a workload of over 60 hours, which is almost three times that of the course with the second-highest workload. This skews the scale when plotting the graphs below, so its workload is replaced with the second-highest workload in the next cell.

In [None]:
# Keep only courses that have at least one review. .copy() makes 'df' an independent
# frame, so the assignment below does not write into a slice of raw_df
# (which would raise SettingWithCopyWarning).
df = raw_df[raw_df['reviewCount'] > 0].copy()

# distributed computing has a ridiculously high amt of workload
#  so it is replaced with the highest workload found among the other courses
is_dc = df["tag"] == "DC"
df.loc[is_dc, 'workload'] = df.loc[~is_dc, 'workload'].max()

# Keep only the columns used by the plots below.
df = df[['name', 'tag', 'dept', 'code', 'description', 'reviewCount', 'rating', 'difficulty', 'workload']]
df

Additionally, you may filter out courses with fewer than a minimum number of reviews. The threshold is set to 5 below; change it as you like.

In [None]:
# Minimum number of reviews a course needs in order to appear in the plots.
min_review_count = 5

# .copy() makes df_plot an independent frame rather than a slice of df, so columns can
# be added to it later without SettingWithCopyWarning.
df_plot = df[df['reviewCount'] >= min_review_count].copy()

# OMSCS Course Rating and Difficulty Plot (size = Review Count, color = Workload)

In [None]:
# Rating vs. difficulty: one labelled bubble per course, sized by review count and
# colored by workload, with reference lines at the mean difficulty and mean rating.
fig_scatter1 = (
    px.scatter(
        df_plot,
        x="difficulty",
        y="rating",
        text='tag',
        size='reviewCount',
        color='workload',
        hover_data=['name', 'reviewCount'],
    )
    .update_traces(textposition='top center')
    .add_vline(x=df_plot["difficulty"].mean(), line_width=0.5, annotation_text='Mean Difficulty')
    .add_hline(y=df_plot["rating"].mean(), line_width=0.5, annotation_text='Mean Rating')
    .update_layout(
        title="OMSCS Course Rating and Difficulty (size = Review Count, color = Workload)",
        xaxis_title="Difficulty",
        yaxis_title="Rating",
        height=800,
        font={'size': 10},
    )
)
fig_scatter1.show()

# OMSCS Course Workload and Difficulty Plot (size = Review Count, color = Rating)

In [None]:
# Workload vs. difficulty: one labelled bubble per course, sized by review count and
# colored by rating, with reference lines at the mean difficulty and mean workload.
fig_scatter2 = (
    px.scatter(
        df_plot,
        x="difficulty",
        y="workload",
        text='tag',
        size='reviewCount',
        color='rating',
        hover_data=['name', 'reviewCount'],
    )
    .update_traces(textposition='top center')
    .add_vline(x=df_plot["difficulty"].mean(), line_width=0.5, annotation_text='Mean Difficulty')
    .add_hline(y=df_plot["workload"].mean(), line_width=0.5, annotation_text='Mean Workload')
    .update_layout(
        title="OMSCS Course Workload and Difficulty (size = Review Count, color = Rating)",
        xaxis_title="Difficulty",
        yaxis_title="Workload",
        height=800,
        font={'size': 10},
    )
)
fig_scatter2.show()

# Treemap Plot of Course Rating

In [None]:
# group data into department at the highest level, breaks it down into courses
# the 'values' parameter uses the value of the column to determine the relative size of each box in the chart
# the color of the chart follows the course rating
#
# The box label (tag + rating rounded to 3 decimals) is added with .assign(), which
# returns a new frame: df_plot itself is left unmodified, so this cell does not leave a
# 'label' column behind for other cells and does not trigger SettingWithCopyWarning.
fig_treemap1 = px.treemap(
    df_plot.assign(
        label=lambda d: d['tag'] + '<br><br>' + d['rating'].apply(lambda x: str(round(x, 3)))
    ),
    path=[px.Constant("OMSCS Course Rating"), 'dept', 'label'], values='reviewCount',
    color='rating', hover_data=['name', 'difficulty'],
    color_continuous_scale=['#FF0000', "#000000", '#00FF00'])

fig_treemap1.update_traces(textposition="middle center")
fig_treemap1.update_layout(margin = dict(t=30, l=10, r=10, b=10), font_size=20)

fig_treemap1.show()

# Treemap Plot of Course Difficulty

In [None]:
# group data into department at the highest level, breaks it down into courses
# the 'values' parameter uses the value of the column to determine the relative size of each box in the chart
# the color of the chart follows the course difficulty
#
# The box label (tag + difficulty rounded to 3 decimals) is added with .assign(), which
# returns a new frame: df_plot itself is left unmodified, so this cell does not overwrite
# or leave behind a 'label' column and does not trigger SettingWithCopyWarning.
fig_treemap2 = px.treemap(
    df_plot.assign(
        label=lambda d: d['tag'] + '<br><br>' + d['difficulty'].apply(lambda x: str(round(x, 3)))
    ),
    path=[px.Constant("OMSCS Course Difficulty"), 'dept', 'label'], values='reviewCount',
    color='difficulty', hover_data=['name', 'rating'],
    color_continuous_scale=['#FF0000', "#000000", '#00FF00'])

fig_treemap2.update_traces(textposition="middle center")
fig_treemap2.update_layout(margin = dict(t=30, l=10, r=10, b=10), font_size=20)

fig_treemap2.show()

# Histogram Plots to Show Distributions

In [None]:
# Distribution of workload across the plotted courses.
fig_hist1 = px.histogram(
    df_plot, x='workload', nbins=30, title='Workload Distribution'
).update_layout(width=800)
fig_hist1.show()

In [None]:
# Distribution of rating across the plotted courses.
fig_hist2 = px.histogram(
    df_plot, x='rating', nbins=30, title='Rating Distribution'
).update_layout(width=800)
fig_hist2.show()

In [None]:
# Distribution of difficulty across the plotted courses.
fig_hist3 = px.histogram(
    df_plot, x='difficulty', nbins=30, title='Difficulty Distribution'
).update_layout(width=800)
fig_hist3.show()

# Correlation Heatmap

In [None]:
# Pairwise correlation between rating, difficulty and workload, drawn as an annotated heatmap.
# NOTE(review): this uses 'df' (every reviewed course), whereas the other figures use
# 'df_plot' (filtered by min_review_count) -- confirm this is intended.
fig_corr = px.imshow(
    df.loc[:, ['rating', 'difficulty', 'workload']].corr(),
    title='Correlation',
    text_auto=True,
)
fig_corr.show()

# Generate the HTML Page

The HTML page of the figures above is generated with the "time updated" info.

Not all the figures above are included below, feel free to change the code to include any figure that you may like.

In [None]:
# Timezone-aware datetime holding the current local date and time. The clock is read
# once, so the formatted timestamp and the timezone name always describe the same instant.
now = datetime.now().astimezone()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")  # e.g. 25/08/2022 21:43:35
timezone_string = now.tzname()
print(dt_string, timezone_string)

In [None]:
# Figures written to the page, in display order.
page_figures = [
    fig_scatter1,
    fig_scatter2,
    fig_treemap1,
    fig_treemap2,
    fig_hist1,
    fig_hist2,
    fig_hist3,
    fig_corr,
]

# Mode 'w' creates the file or clears whatever was written to it before, so no explicit
# truncate is needed. The encoding is fixed so the output does not depend on the
# platform's default encoding.
with open('omscs_courses_rating_difficulty.html', 'w', encoding='utf-8') as f:
    title = "<h1>Georgia Tech OMSCS</h1><h2>Summary of Course Difficulty and Rating</h2>"
    updated = "<h3>Last updated: " + dt_string + " (Timezone: " + timezone_string + ")</h3>"
    description = "The data is pulled from <a href='https://www.omscentral.com/'>OMSCentral</a> daily via a GitHub Actions script to update the summary information in this page.<br><br>"
    credits = "Credits to <a href='https://www.omscentral.com/'>OMSCentral</a> for the information, review and rating of the courses. I do not own any of this data."
    subtitle = "<h3>Explanation and Source Code</h3>"
    code = """<a href="https://medium.com/datadriveninvestor/use-github-actions-to-create-a-live-stock-sentiment-dashboard-online-580a08457650">Explanatory Article</a> | <a href="https://github.com/damianboh/gatech_omscs_live_rating_reviews_plot">Source Code</a>"""
    author = """ | Created by Damian Boh, check out my <a href="https://damianboh.github.io/">GitHub Page</a>"""

    # Page header first, then each figure as an HTML fragment (not a full document).
    f.write(title + updated + description + credits + subtitle + code + author)
    for fig in page_figures:
        f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))