# Berkeley Course Crawler

This crawler extracts detailed information from Berkeley course web pages. Follow the steps below, and have your Excel workbook, Google Sheet, or database ready so that you can paste in the results and organize your short list of courses.

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sat Jul  4 23:32:27 2020

@author: You, Bo-Xiang
"""


def _instructor_name(entry):
    """Return the display name for one ``assignedInstructors`` entry.

    The second item of ``names`` is used when it exists; a lone first item is
    used otherwise instead of raising ``IndexError``.
    """
    names = entry["instructor"]["names"]
    # NOTE(review): index 1 is presumably the preferred/display name -- confirm
    # against the page payload.
    chosen = names[1] if len(names) > 1 else names[0]
    return str(chosen["formattedName"])


def _weekly_schedule(meeting, tz_offset_hours):
    """Render one meeting's weekly pattern as display strings.

    Parameters
    ----------
    meeting : dict
        One entry of the page's ``meetings`` list. Reads the ``meets<Day>``
        flags and, when at least one flag is set, ``startTime`` / ``endTime``
        (``HH:MM:SS`` strings).
    tz_offset_hours : int or float
        Hours added to the listed times to produce the second pair of strings.

    Returns
    -------
    tuple of str
        ``(start, end, shifted_start, shifted_end)``. Each item is a
        ``" / "``-joined run of ``"%a %H:%M"`` stamps, or ``""`` when no day
        flag is set.
    """
    from datetime import datetime, timedelta

    # Every weekday is pinned to the week of Monday 2020-07-06. Only the weekday
    # name and the clock time are ever printed, so the calendar date is just a
    # carrier that lets the offset roll over into the next weekday.
    # The weekend flags are read with .get(), so payloads that lack them behave
    # exactly as they did when only Monday-Friday were inspected.
    day_flags = (
        ("meetsMonday", 6),
        ("meetsTuesday", 7),
        ("meetsWednesday", 8),
        ("meetsThursday", 9),
        ("meetsFriday", 10),
        ("meetsSaturday", 11),
        ("meetsSunday", 12),
    )
    days = [
        datetime(2020, 7, day_of_month).date()
        for flag, day_of_month in day_flags
        if meeting.get(flag)
    ]
    if not days:
        # Nothing to print, so do not require parseable start/end times.
        return "", "", "", ""

    def stamps(clock_text, delta):
        clock = datetime.strptime(clock_text, "%H:%M:%S").time()
        return " / ".join(
            (datetime.combine(day, clock) + delta).strftime("%a %H:%M")
            for day in days
        )

    unshifted = timedelta(0)
    shifted = timedelta(hours=tz_offset_hours)
    start_text, end_text = meeting["startTime"], meeting["endTime"]
    return (
        stamps(start_text, unshifted),
        stamps(end_text, unshifted),
        stamps(start_text, shifted),
        stamps(end_text, shifted),
    )


def course_extract(url, tz_offset_hours=15, timeout=30):
    """Scrape one Berkeley course page into a one-row DataFrame.

    The page embeds its data as JSON in the ``data-json`` attribute of a
    ``div`` with class ``handlebarData theme_is_whitehot``; that JSON is the
    only thing read from the page.

    Parameters
    ----------
    url : str
        Address of the course page.
    tz_offset_hours : int or float, optional
        Hours added to the listed meeting times to fill the ``TW_time``
        columns. Defaults to 15, the value that used to be hard-coded.
    timeout : int or float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``0``) with the columns ``Course_No.``,
        ``Serial_NO.``, ``Course_Name``, ``Level``, ``Type``, ``Mode``,
        ``Instructor(s)``, ``Units``, the four ``Start_/End_Time`` columns,
        ``Final_Examination`` and ``Description``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain the expected data ``div``.
    KeyError
        If a required field is missing from the embedded JSON.
    """
    # Imports stay function-local, as in the original cell, so this definition
    # keeps working when it is copied or run on its own.
    import json

    import pandas as pd
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    holder = soup.find("div", class_="handlebarData theme_is_whitehot")
    if holder is None:
        raise ValueError("No course data block found at {!r}".format(url))
    js_dict = json.loads(holder.attrs["data-json"])

    course = js_dict["course"]

    # The attribute may be missing altogether or present with an empty list.
    try:
        mode = js_dict["attributes"]["WEB"][0]["value"]["formalDescription"]
    except (KeyError, IndexError, TypeError):
        mode = "Pending Reviews"

    # Only the first meeting is reported; a page without meetings yields empty
    # instructor and time fields instead of an IndexError.
    meetings = js_dict.get("meetings") or []
    meeting = meetings[0] if meetings else {}

    instructors = ", ".join(
        _instructor_name(entry)
        for entry in (meeting.get("assignedInstructors") or [])
    )

    # Credit is either a fixed number of units or a min/max range.
    credit = course["credit"]["value"]
    try:
        units = credit["fixed"]["units"]
    except KeyError:
        units = " to ".join(
            [str(credit["range"]["minUnits"]), str(credit["range"]["maxUnits"])]
        )

    sf_start_dt, sf_end_dt, tw_start_dt, tw_end_dt = _weekly_schedule(
        meeting, tz_offset_hours
    )

    information = pd.DataFrame({
            "Course_No." : js_dict["displayName"],
            "Serial_NO." : js_dict["id"],
            "Course_Name" : course["title"],
            "Level" : course["academicCareer"]["description"],
            "Type" : js_dict["component"]["description"],
            "Mode" : mode,
            "Instructor(s)" : instructors,
            "Units" : units,
            "Start_Time(SF_time)" : sf_start_dt,
            "End_Time(SF_time)" : sf_end_dt,
            "Start_Time(TW_time)" : tw_start_dt,
            "End_Time(TW_time)" : tw_end_dt,
            "Final_Examination" : course["finalExam"]["description"],
            "Description" : course["description"]}, index=[0])

    return information

def multiple_extract(url_list):
    """Scrape several course pages and stack them into one table.

    Parameters
    ----------
    url_list : iterable of str
        Course page addresses; each one is passed to ``course_extract``.

    Returns
    -------
    pandas.DataFrame
        One row per URL, in input order, labelled ``1..n``. An empty input
        yields an empty frame that still carries the standard columns.
    """
    import pandas as pd

    columns = [
        "Course_No.",
        "Serial_NO.",
        "Course_Name",
        "Level",
        "Type",
        "Mode",
        "Instructor(s)",
        "Units",
        "Start_Time(SF_time)",
        "End_Time(SF_time)",
        "Start_Time(TW_time)",
        "End_Time(TW_time)",
        "Final_Examination",
        "Description",
    ]

    # Collect the per-course frames first and concatenate once.
    # DataFrame.append was removed in pandas 2.0, and appending inside the loop
    # copied the whole table on every iteration.
    frames = [course_extract(url) for url in url_list]
    if not frames:
        return pd.DataFrame(columns=columns)

    information = pd.concat(frames, ignore_index=True)
    # Rows used to be numbered from 1 (a placeholder row 0 was sliced away);
    # keep that numbering so pasted tables look the same as before.
    information.index = range(1, len(information) + 1)
    return information


In [2]:
%%HTML
<style type="text/css">
/* Draw a solid black border and black text on every cell of tables that
   carry the "dataframe" class. */
table.dataframe th,
table.dataframe td {
    color: black !important;
    border: 1px solid black !important;
}
</style>

## Demonstration

In [3]:
# Example for a single course: paste the course page link between the quotes.
result = course_extract(
    "https://classes.berkeley.edu/content/2020-fall-data-c102-001-lec-001"
)
result

Unnamed: 0,Course_No.,Serial_NO.,Course_Name,Level,Type,Mode,Instructor(s),Units,Start_Time(SF_time),End_Time(SF_time),Start_Time(TW_time),End_Time(TW_time),Final_Examination,Description
0,2020 Fall DATA C102 001 LEC 001,33319,"Data, Inference, and Decisions",Undergraduate,Lecture,Pending Reviews,"Michael Jordan, Jacob Noah Steinhardt",4,Tue 14:00 / Thu 14:00,Tue 15:29 / Thu 15:29,Wed 05:00 / Fri 05:00,Wed 06:29 / Fri 06:29,Written final exam conducted during the schedu...,This course develops the probabilistic foundat...


In [4]:
# Copy the table to the clipboard, then paste it into Excel, Google Sheets, or a database.
result.to_clipboard(sep="\t", excel=True)

In [5]:
# Example for several courses:
# 1. Put every course link you want into the list, separated by ","
url_list = [
    "https://classes.berkeley.edu/content/2020-fall-data-c102-001-lec-001",
    "https://classes.berkeley.edu/content/2020-fall-civeng-199-001-ind-001",
    "https://classes.berkeley.edu/content/2020-fall-indeng-290-004-lec-004",
]
# 2. Extract them into a single table
multiple_results = multiple_extract(url_list)
multiple_results

Unnamed: 0,Course_No.,Serial_NO.,Course_Name,Level,Type,Mode,Instructor(s),Units,Start_Time(SF_time),End_Time(SF_time),Start_Time(TW_time),End_Time(TW_time),Final_Examination,Description
1,2020 Fall DATA C102 001 LEC 001,33319,"Data, Inference, and Decisions",Undergraduate,Lecture,Pending Reviews,"Michael Jordan, Jacob Noah Steinhardt",4,Tue 14:00 / Thu 14:00,Tue 15:29 / Thu 15:29,Wed 05:00 / Fri 05:00,Wed 06:29 / Fri 06:29,Written final exam conducted during the schedu...,This course develops the probabilistic foundat...
2,2020 Fall CIVENG 199 001 IND 001,16638,Supervised Independent Study,Undergraduate,Independent Study,Pending Reviews,Norman A Abrahamson,1 to 4,,,,,No final exam,Supervised independent study.
3,2020 Fall INDENG 290 004 LEC 004,32956,Special Topics in Industrial Engineering and O...,Graduate,Lecture,Asynchronous Instruction,Barna Saha,2 to 3,Tue 14:00 / Thu 14:00,Tue 15:29 / Thu 15:29,Wed 05:00 / Fri 05:00,Wed 06:29 / Fri 06:29,Written final exam conducted during the schedu...,Lectures and appropriate assignments on fundam...


In [6]:
# 3. Copy the table to the clipboard, then paste it into Excel, Google Sheets, or a database
multiple_results.to_clipboard(sep="\t", excel=True)

## Search for the courses that interest you

In [10]:
# Your turn: try it with a single link!
result = course_extract(
    "https://classes.berkeley.edu/content/2020-fall-cyplan-230-001-lec-001"  # insert link
)
# Copy the result so that you can paste it elsewhere.
result.to_clipboard(sep="\t", excel=True)
# Check the result
result

Unnamed: 0,Course_No.,Serial_NO.,Course_Name,Level,Type,Mode,Instructor(s),Units,Start_Time(SF_time),End_Time(SF_time),Start_Time(TW_time),End_Time(TW_time),Final_Examination,Description
0,2020 Fall CYPLAN 230 001 LEC 001,20730,"U.S. Housing, Planning, and Policy",Graduate,Lecture,Pending Reviews,Benjamin Metcalf,3,Tue 09:30 / Thu 09:30,Tue 11:00 / Thu 11:00,Wed 00:30 / Fri 00:30,Wed 02:00 / Fri 02:00,No final exam,Theory of housing markets and empirical method...


In [8]:
# NOW, try it on your own with multiple links!
# Insert your links below; you can add as many as you want.
# List each link only once: a repeated link is fetched again and shows up as a
# repeated row in the table.
url_list = ["https://classes.berkeley.edu/content/2020-fall-data-c104-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-isf-100a-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-comlit-170-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-sociol-c273n-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-info-290-004-lec-004",
            "https://classes.berkeley.edu/content/2020-fall-rhetor-153-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-indeng-290-004-lec-004",
            "https://classes.berkeley.edu/content/2020-fall-geog-170-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-sts-c200-001-sem-001",
            "https://classes.berkeley.edu/content/2020-fall-espm-c252-001-sem-001",
            "https://classes.berkeley.edu/content/2020-fall-rhetor-150-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-cyplan-255-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-cyplan-257-001-lab-001",
            "https://classes.berkeley.edu/content/2020-fall-cyplan-c261-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-africam-134-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-rhetor-115-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-datasci-w231-001-wbl-001",
            "https://classes.berkeley.edu/content/2020-fall-stat-154-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-data-c102-001-lec-001",
            "https://classes.berkeley.edu/content/2020-fall-cyplan-230-001-lec-001"]
multiple_results = multiple_extract(url_list)
multiple_results.to_clipboard(excel=True,sep='\t') # copy the result, so you can paste it elsewhere
# check the result
multiple_results

Unnamed: 0,Course_No.,Serial_NO.,Course_Name,Level,Type,Mode,Instructor(s),Units,Start_Time(SF_time),End_Time(SF_time),Start_Time(TW_time),End_Time(TW_time),Final_Examination,Description
1,2020 Fall DATA C104 001 LEC 001,33150,Human Contexts and Ethics of Data - History/STS,Undergraduate,Lecture,Asynchronous Instruction,"Margarita O Boenig-Liptsin, Ari S Edmundson",4,Mon 15:00 / Wed 15:00 / Fri 15:00,Mon 15:59 / Wed 15:59 / Fri 15:59,Tue 06:00 / Thu 06:00 / Sat 06:00,Tue 06:59 / Thu 06:59 / Sat 06:59,Written final exam conducted during the schedu...,This course teaches you to use the tools of ap...
2,2020 Fall ISF 100A 001 LEC 001,22003,Introduction to Social Theory and Cultural Ana...,Undergraduate,Lecture,Pending Reviews,Amm Quamruzzaman,4,Tue 12:30 / Thu 12:30,Tue 13:59 / Thu 13:59,Wed 03:30 / Fri 03:30,Wed 04:59 / Fri 04:59,Written final exam conducted during the schedu...,"This course, required of all ISF majors but op..."
3,2020 Fall COMLIT 170 001 LEC 001,32839,Special Topics in Comparative Literature,Undergraduate,Lecture,Pending Reviews,"Thomas Patrick McEnaney, David Alexander Bamman",1 to 4,Tue 14:00 / Thu 14:00,Tue 15:29 / Thu 15:29,Wed 05:00 / Fri 05:00,Wed 06:29 / Fri 06:29,No final exam,An independent studies course designed to fulf...
4,2020 Fall SOCIOL C273N 001 LEC 001,31030,Social Networks,Graduate,Lecture,Pending Reviews,Dennis M. Feehan,4,Tue 14:00,Tue 16:59,Wed 05:00,Wed 07:59,No final exam,This course provides a broad introduction to t...
5,2020 Fall INFO 290 004 LEC 004,17441,Special Topics in Information,Graduate,Lecture,Pending Reviews,"Anne Elizabeth Jonas, Jenna Burrell",1 to 4,Tue 12:30 / Thu 12:30,Tue 13:59 / Thu 13:59,Wed 03:30 / Fri 03:30,Wed 04:59 / Fri 04:59,No final exam,"Specific topics, hours, and credit may vary fr..."
6,2020 Fall RHETOR 153 001 LEC 001,33092,American Political Rhetoric,Undergraduate,Lecture,Pending Reviews,Nathan S Atkinson,4,Mon 17:00 / Wed 17:00,Mon 18:29 / Wed 18:29,Tue 08:00 / Thu 08:00,Tue 09:29 / Thu 09:29,Written final exam conducted during the schedu...,A survey of the ways in which Americans have d...
7,2020 Fall INDENG 290 004 LEC 004,32956,Special Topics in Industrial Engineering and O...,Graduate,Lecture,Asynchronous Instruction,Barna Saha,2 to 3,Tue 14:00 / Thu 14:00,Tue 15:29 / Thu 15:29,Wed 05:00 / Fri 05:00,Wed 06:29 / Fri 06:29,Written final exam conducted during the schedu...,Lectures and appropriate assignments on fundam...
8,2020 Fall GEOG 170 001 LEC 001,33480,Special Topics in Geography,Undergraduate,Lecture,Pending Reviews,Desiree Fields,3,Mon 11:00 / Wed 11:00,Mon 12:29 / Wed 12:29,Tue 02:00 / Thu 02:00,Tue 03:29 / Thu 03:29,Written final exam conducted during the schedu...,This course is designed to provide a vehicle f...
9,2020 Fall STS C200 001 SEM 001,27153,Topics in Science and Technology Studies,Graduate,Seminar,Pending Reviews,Massimo Mazzotti,3,Mon 16:00,Mon 17:59,Tue 07:00,Tue 08:59,No final exam,This course provides a strong foundation for g...
10,2020 Fall ESPM C252 001 SEM 001,26996,Topics in Science and Technology Studies,Graduate,Seminar,Pending Reviews,Massimo Mazzotti,3,Mon 16:00,Mon 17:59,Tue 07:00,Tue 08:59,No final exam,This course provides a strong foundation for g...
