In [152]:
import requests
import time
import pandas as pd
import re
from datetime import time
from datetime import datetime
import pytz
import xml.etree.ElementTree as ET
from xml.dom import minidom


In [153]:
# helpers for fetching and parsing the course-schedule XML (subjects, courses, sections, meetings)
def get_subjects(year, semester):
    """Fetch the XML listing of subjects for one term.

    Args:
        year: Term year, interpolated into the schedule URL (e.g. 2024).
        semester: Term name, interpolated into the schedule URL (e.g. "fall").

    Returns:
        The response body as text.

    Raises:
        Exception: If the server answers with a status code other than 200
            (raised by ``get_xml``).
    """
    url = f'http://courses.illinois.edu/cisapp/explorer/schedule/{year}/{semester}.xml'
    # Delegate to the shared fetch helper so the status check and error
    # message live in exactly one place.
    return get_xml(url)
    
def get_xml(url, timeout=30):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a single stalled connection would block the caller
            indefinitely.

    Returns:
        The response body as text.

    Raises:
        Exception: If the response status code is not 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error: Unable to fetch data (Status code: {response.status_code})")
    return response.text
    
def parse_courses(xml_data):
    """Extract every ``<course>`` element from an XML document.

    Args:
        xml_data: XML text containing ``<course>`` elements below the root.

    Returns:
        A list of ``(url, id, title)`` tuples in document order. A URL that
        starts with the internal ``http://cis.local/cisapi`` prefix is
        rewritten to the public explorer address with ``.xml`` appended;
        any other URL is kept as is. The title is the element's stripped
        text, or 'Unknown' when the element has no text.
    """
    internal_prefix = "http://cis.local/cisapi"
    public_prefix = "https://courses.illinois.edu/cisapp/explorer"

    def describe(node):
        link = node.get("href")
        if link.startswith(internal_prefix):
            # The prefix sits at position 0, so swapping it is a plain slice.
            link = public_prefix + link[len(internal_prefix):] + ".xml"
        label = 'Unknown' if node.text is None else node.text.strip()
        return (link, node.get("id"), label)

    return [describe(node) for node in ET.fromstring(xml_data).findall(".//course")]

def parse_sections(xml_data):
    """Extract every ``<section>`` element from an XML document.

    Args:
        xml_data: XML text containing ``<section>`` elements below the root.

    Returns:
        A list of ``(url, id, code)`` tuples in document order. URLs that
        begin with ``http://cis.local/cisapi`` are pointed at the public
        explorer host and given a ``.xml`` suffix; other URLs pass through
        untouched. The code is the element's stripped text, or 'Unknown'
        when the element has no text.
    """
    internal_root = "http://cis.local/cisapi"
    public_root = "https://courses.illinois.edu/cisapp/explorer"

    results = []
    for node in ET.fromstring(xml_data).findall(".//section"):
        href = node.get("href")
        if href.startswith(internal_root):
            href = f"{public_root}{href[len(internal_root):]}.xml"

        raw_code = node.text
        if raw_code is None:
            code = 'Unknown'
        else:
            code = raw_code.strip()

        results.append((href, node.get("id"), code))
    return results

def _child_text(parent, tag, default='Unknown'):
    """Return the stripped text of ``parent``'s first ``tag`` child.

    ``default`` is returned both when the child is absent and when it is
    present but empty (its ``text`` is None), so callers never call
    ``.strip()`` on None.
    """
    child = parent.find(tag)
    if child is None or child.text is None:
        return default
    return child.text.strip()


def parse_section(xml_data):
    """Extract the meetings of one section from its XML document.

    Args:
        xml_data: XML text containing ``<meeting>`` elements below the root.

    Returns:
        A list of tuples
        ``(type, start, end, days, room_number, building, instructors)``,
        one per meeting, in document order. Each text field is the stripped
        text of the matching child element, or 'Unknown' when that element
        is missing or empty. ``instructors`` is a list of stripped
        instructor names; instructor elements without text are skipped.
        Meetings whose room number or building is 'Unknown' are left out.
    """
    root = ET.fromstring(xml_data)
    meetings = []
    for meeting in root.findall(".//meeting"):
        meeting_rm_num = _child_text(meeting, "roomNumber")
        meeting_building = _child_text(meeting, "buildingName")
        # Only meetings with a known physical location are kept.
        if meeting_rm_num == 'Unknown' or meeting_building == 'Unknown':
            continue

        meeting_instructors_arr = [
            instructor.text.strip()
            for instructor in meeting.findall(".//instructor")
            if instructor.text is not None
        ]
        meetings.append((
            _child_text(meeting, "type"),
            _child_text(meeting, "start"),
            _child_text(meeting, "end"),
            _child_text(meeting, "daysOfTheWeek"),
            meeting_rm_num,
            meeting_building,
            meeting_instructors_arr,
        ))
    return meetings

In [156]:
# Term to crawl; both values feed the schedule URL and the output file name.
year = 2024
semester = "fall"

# Root of the schedule tree for the term: lists every subject.
url = f'http://courses.illinois.edu/cisapp/explorer/schedule/{year}/{semester}.xml'

subjects_xml = get_xml(url)
# parse the XML string into an ElementTree
subjects_root = ET.fromstring(subjects_xml)

# one dict per meeting; turned into a DataFrame once the crawl is done
all_meetings = []

# Walk the tree subject -> course -> section -> meeting, issuing one HTTP
# request per subject, per course and per section.
for subject in subjects_root.findall(".//subject"):
    subject_code = subject.get("id")
    subject_url = subject.get("href")
    # progress indicator: one line per subject
    print(subject_code)

    # get all courses for a subject (the subject href is requested as is)
    courses_xml = get_xml(subject_url)
    courses = parse_courses(courses_xml)

    # get all sections for each course
    for course_url, course_id, course_title in courses:
        sections_xml = get_xml(course_url)
        sections = parse_sections(sections_xml)

        # get info for each individual section
        for section_url, section_id, section_code in sections:
            section_xml = get_xml(section_url)
            meetings = parse_section(section_xml)
            for meeting_type, meeting_start, meeting_end, meeting_days, meeting_rm_num, meeting_building, meeting_instructors_arr in meetings:
                all_meetings.append({
                    "Subject": subject_code,
                    "CourseID": course_id,
                    "Name": course_title,
                    "CRN": section_id,
                    "SectionCode": section_code,
                    "Type": meeting_type,
                    "Start": meeting_start,
                    "End": meeting_end,
                    "Days": meeting_days,
                    "RoomNumber": meeting_rm_num,
                    "Building": meeting_building,
                    "Instructors": meeting_instructors_arr
                })

# Create a pandas DataFrame from the list of meetings
df = pd.DataFrame(all_meetings)
# Name the file after the crawled term so that changing `year`/`semester`
# cannot overwrite another term's data (courses_2024_fall.csv for the
# values above).
df.to_csv(f'courses_{year}_{semester}.csv', index=False)


AAS
ABE
ACCY
ACE
ACES
ADV
AE
AFAS
AFRO
AFST
AGCM
AGED
AHS
AIS
ALEC
ANSC
ANTH
ARAB
ARCH
ART
ARTD
ARTE
ARTF
ARTH
ARTJ
ARTS
ASRM
ASST
ASTR
ATMS
BADM
BASQ
BCOG
BCS
BDI
BIOC
BIOE
BIOP
BSE
BTW
BUS
CB
CDB
CEE
CHBE
CHEM
CHIN
CHP
CI
CIC
CLCV
CLE
CMN
CPSC
CS
CSE
CW
CWL
CZCH
DANC
DTX
EALC
ECE
ECON
EDPR
EDUC
EIL
ENG
ENGL
ENSU
ENVS
EPOL
EPSY
ERAM
ESE
ESL
ETMA
EURO
FAA
FIN
FLTE
FR
FSHN
GC
GEOL
GER
GGIS
GLBL
GMC
GRK
GRKM
GS
GSD
GWS
HDFS
HEBR
HIST
HK
HNDI
HORT
HT
HUM
IB
IE
INFO
IS
ITAL
JAPN
JOUR
JS
KOR
LA
LAS
LAST
LAT
LAW
LCTL
LEAD
LER
LING
LLS
MACS
MATH
MBA
MCB
MDIA
MDVL
ME
MICR
MILS
MIP
MSE
MUS
MUSC
MUSE
NE
NEUR
NPRE
NRES
NS
NUTR
PATH
PERS
PHIL
PHYS
PLPA
POL
PORT
PS
PSM
PSYC
QUEC
REES
REL
RHET
RMLG
RST
RUSS
SAME
SBC
SCAN
SE
SHS
SLAV
SLCL
SOC
SOCW
SPAN
SPED
STAT
SWAH
TAM
TE
THEA
TMGT
TRST
TURK
UKR
UP
VCM
VM
WLOF
WRIT
YDSH
ZULU


In [161]:
# Load the crawled meetings and drop those without a clock time: a Start of
# "ARRANGED" cannot be parsed into hours and minutes.
df_courses = pd.read_csv("courses_2024_fall.csv")
# .copy() makes the filtered frame independent of the one read from disk, so
# adding columns to it later does not raise SettingWithCopyWarning.
df_courses = df_courses[df_courses["Start"] != "ARRANGED"].copy()

In [162]:
# convert a time string into a datetime.time object
def getTime(time_str):
    """Convert a clock string such as "09:30 AM" into a ``datetime.time``.

    Args:
        time_str: Text starting with ``H:MM`` or ``HH:MM``, optionally
            followed by ``AM``/``PM`` in any letter case. Leading whitespace
            and anything after the recognized part are ignored.

    Returns:
        A ``datetime.time`` with seconds and microseconds set to zero and
        the "America/Chicago" pytz zone as ``tzinfo``. "12:xx AM" maps to
        hour 0 and "12:xx PM" stays at hour 12; without a meridiem the hour
        is used unchanged.

    Raises:
        ValueError: If the string does not start with a recognizable time,
            or the resulting hour/minute is out of range.
    """
    parsed = re.match(r"\s*(\d{1,2}):(\d{2})\s*([AaPp][Mm])?", time_str)
    if parsed is None:
        raise ValueError(f"Unrecognized time string: {time_str!r}")

    hour = int(parsed.group(1))
    minute = int(parsed.group(2))
    meridiem = (parsed.group(3) or "").upper()

    # 12-hour -> 24-hour: afternoon hours shift by 12, midnight becomes 0.
    if meridiem == "PM" and hour != 12:
        hour += 12
    elif meridiem == "AM" and hour == 12:
        hour = 0

    # get timezone for Urbana-Champaign
    # NOTE(review): a pytz zone attached to a bare time has no date to
    # resolve its UTC offset against, so the result may behave like a naive
    # time — confirm whether downstream code relies on the tzinfo.
    tz = pytz.timezone("America/Chicago")
    return time(hour, minute, 0, 0, tzinfo=tz)

# Parse the textual start/end columns into datetime.time objects. getTime
# returns a scalar, so it is applied directly; wrapping each result in a
# pd.Series would build a one-column DataFrame for no benefit.
df_courses["StartTime"] = df_courses["Start"].apply(getTime)
df_courses["EndTime"] = df_courses["End"].apply(getTime)
df_courses.to_csv("courses_2024_fall_cleaned.csv", index=False)