In [1]:
from bs4 import BeautifulSoup

# Load the saved HTML fragment from disk as text.
# NOTE(review): no encoding is passed, so decoding uses the platform default —
# confirm that matches how ./data/tbody.html was saved.
with open("./data/tbody.html") as f:
    content = f.read()

# Build the parse tree with Python's built-in "html.parser" backend.
soup = BeautifulSoup(content, "html.parser")

# Sanity check: show the first 500 characters of the pretty-printed tree.
soup.prettify()[:500]


'<tbody>\n <tr>\n  <th class="ddlabel" scope="row">\n   <a href="/prod/bwckschd.p_disp_detail_sched?term_in=202203&amp;crn_in=30096">\n    Introduction to Computing - 30096 - CS 201 - 0\n   </a>\n   <a href="http://www.sabanciuniv.edu/syllabus/?crn=30096&amp;term=202203" target="blank">\n    <img alt="Course\xa0Web" border="0" src="https://suisimg.sabanciuniv.edu/images/webc.gif" title="Course\xa0Web"/>\n   </a>\n   <br/>\n   <br/>\n  </th>\n </tr>\n <tr>\n  <td class="dddefault">\n   <span class="fieldlabeltext">\n  '

In [2]:
import re
import json

# Function to extract meeting times
def extract_meeting_times(meeting_table):
    """Convert a meeting-times table into a list of dictionaries.

    The first ``<tr>`` of the table is treated as the header row and skipped.
    For every following row, the text of its first seven ``<td>`` cells is
    stored (whitespace-stripped) under the keys ``type``, ``time``, ``days``,
    ``location``, ``dateRange``, ``scheduleType`` and ``instructors``, in that
    order.

    Args:
        meeting_table: A parsed table element exposing ``find_all('tr')``;
            each row must expose ``find_all('td')`` and each cell
            ``get_text(strip=True)``.

    Returns:
        list[dict[str, str]]: One dictionary per complete row. Rows with fewer
        than seven ``<td>`` cells are skipped instead of raising
        ``IndexError``.
    """
    field_names = (
        "type",
        "time",
        "days",
        "location",
        "dateRange",
        "scheduleType",
        "instructors",
    )

    meeting_times = []
    for row in meeting_table.find_all('tr')[1:]:  # Skip header row
        cells = row.find_all('td')
        if len(cells) < len(field_names):
            # Incomplete row (e.g. an extra header or spacer row): nothing
            # sensible can be mapped onto the seven expected fields.
            continue
        # zip() stops after the seven named fields, so extra cells are ignored.
        meeting_times.append({
            field: cell.get_text(strip=True)
            for field, cell in zip(field_names, cells)
        })
    return meeting_times

# Patterns used while walking the rows. Whitespace around labels is matched
# with \s* rather than literal spaces: the preview of the parsed HTML suggests
# labels sit in their own <span>, so label and value reach us as separately
# stripped text nodes (assumption — confirm against the full file).
TITLE_RE = re.compile(r'(.+?) - (\d+) - (.*?) - (\w+)')
DETAILS_RE = re.compile(
    r'Associated Term:\s*(.*?)\s*Levels:\s*(.*?)\s*Faculty:\s*([^\n]*)',
    re.S,
)
# Credits are searched for separately: as an optional group directly after a
# lazy "Faculty" capture they could never match, leaving faculty empty.
CREDITS_RE = re.compile(r'(\d+\.\d+)\s*Credits')
SEATS_RE = re.compile(r'(\d+)\s*of\s*(\d+)\s*Seats\s*Available')

# Initialize list to hold course data
courses = []
course = {}

# Loop over each row in the table
for row in soup.find_all('tr'):
    # find_all() also yields the rows of nested meeting-times tables; those
    # are consumed by extract_meeting_times(), so skip any row inside a <td>.
    if row.find_parent('td') is not None:
        continue

    heading = row.th
    if heading and 'ddlabel' in heading.get('class', []):
        # Title row: it starts a new course, so store the previous one first.
        # Without this, a course lacking a meeting table would be overwritten.
        if course:
            courses.append(course)
            course = {}

        title = heading.get_text(strip=True)
        match = TITLE_RE.search(title)
        if match is None:
            raise ValueError(f"Unrecognized course title row: {title!r}")
        name, crn_number, course_code, section = match.groups()
        course = {
            "name": name,
            "displayName": name.split(" - ")[0],
            "courseCode": course_code,
            "section": section,
            "crnNumber": crn_number,
        }
        continue

    # Data row: only meaningful once a title row has opened a course.
    cell = row.td
    if cell is None or not course:
        continue

    # Join text nodes with newlines so each value ends at a line break; this
    # is what lets the "Faculty" capture stop at the end of its own value.
    text = cell.get_text("\n", strip=True)

    match = DETAILS_RE.search(text)
    if match:
        credits_match = CREDITS_RE.search(text)
        course.update({
            "term": match.group(1).strip(),
            "levels": [level.strip() for level in match.group(2).split(",")],
            "faculty": match.group(3).strip(),
            "credits": float(credits_match.group(1)) if credits_match else 0.0,
        })

    match = SEATS_RE.search(text)
    if match:
        course["availableSeats"] = int(match.group(1))

    meeting_table = cell.find('table')
    if meeting_table:
        course["meetingTimes"] = extract_meeting_times(meeting_table)

# Add the last course, which has no following title row to flush it
if course:
    courses.append(course)

# Write course data to JSON file
with open('./data/data.json', 'w') as f:
    json.dump(courses, f)

# Return number of courses processed for verification
len(courses)


3