In [3]:
import json

def check_attributes(obj):
    json_data = open("data/example_data.json", encoding="UTF8")
    json_obj = json.load(json_data)
    for key, value in json_obj.items():
        if key not in obj:
            if type(value) == type(""):
                obj[key] = ""
            elif value.isdigit():
                obj[key] = 0
            else:
                obj[key] = None

    return obj

# Example usage
obj = {
    "name": "Introduction to Computing",
    "displayName": "Introduction to Computing",
    "courseCode": "CS 201",
    "section": "0",
    "crnNumber": "30096",
    "meetingTimes": [
        {
            "type": "Class",
            "time": "11:40 am - 2:30 pm",
            "days": "R",
            "location": "Fac. of Engin. and Nat. Sci. L029",
            "dateRange": "Jul 10, 2023 - Aug 25, 2023",
            "scheduleType": "2nd del",
            "instructors": "At\u00c4\u00b1l Utku  Ay (P)"
        },
        {
            "type": "Class",
            "time": "8:40 am - 11:30 am",
            "days": "F",
            "location": "Fac. of Engin. and Nat. Sci. L029",
            "dateRange": "Jul 10, 2023 - Aug 25, 2023",
            "scheduleType": "1st del",
            "instructors": "At\u00c4\u00b1l Utku  Ay (P)"
        }
    ],
    "associatedTerm": "Summer 2022-2023",
    "registrationDates": "No dates available",
    "levels": "Doctorate, Undeclared, Scientific Preparatory, Undergraduate, Masters, Special Student, Exchange - Socrates Erasmus UG, Exchange - Socrates Erasmus MA, Exchange - Socrates Erasmus DR, Exchange - Erasmus Mundus MA, Exchange - Erasmus Mundus DR, Exchange - Erasmus Mundus UG",
    "faculty": "Course Offered by FENS",
    "attributes": "Lang. of Instruction: English, 6 ECTS, Course Offered by FENS"
}

updated_obj = check_attributes(obj)
print(updated_obj)


{'name': 'Introduction to Computing', 'displayName': 'Introduction to Computing', 'courseCode': 'CS 201', 'section': '0', 'crnNumber': '30096', 'meetingTimes': [{'type': 'Class', 'time': '11:40 am - 2:30 pm', 'days': 'R', 'location': 'Fac. of Engin. and Nat. Sci. L029', 'dateRange': 'Jul 10, 2023 - Aug 25, 2023', 'scheduleType': '2nd del', 'instructors': 'AtÄ±l Utku  Ay (P)'}, {'type': 'Class', 'time': '8:40 am - 11:30 am', 'days': 'F', 'location': 'Fac. of Engin. and Nat. Sci. L029', 'dateRange': 'Jul 10, 2023 - Aug 25, 2023', 'scheduleType': '1st del', 'instructors': 'AtÄ±l Utku  Ay (P)'}], 'associatedTerm': 'Summer 2022-2023', 'registrationDates': 'No dates available', 'levels': 'Doctorate, Undeclared, Scientific Preparatory, Undergraduate, Masters, Special Student, Exchange - Socrates Erasmus UG, Exchange - Socrates Erasmus MA, Exchange - Socrates Erasmus DR, Exchange - Erasmus Mundus MA, Exchange - Erasmus Mundus DR, Exchange - Erasmus Mundus UG', 'faculty': 'Course Offered by FEN

In [5]:
import json
import re
from bs4 import BeautifulSoup

# Open and read the HTML file
with open("./data/tbody.html", "r", encoding="UTF8") as f:
    content = f.read()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')


# Function to reduce multiple spaces to single space
def reduce_spaces(string: str) -> str:
    result = re.sub(r'\s+', ' ', string)
    return result


def to_camel_case(phrase: str) -> str:
    """Converts a string to camel case.
    Args:
    phrase: The string to convert.
    Returns:
    The string in camel case.
    """
    # Check if phrase is str
    if not isinstance(phrase, str):
        raise TypeError("phrase must be str")

    # Check if empty
    if not phrase:
        return phrase

    # Split phrase into words
    words = phrase.split(" ")

    # Combine words, capitalizing the first letter of each word after checking length
    if len(words) > 1:
        return words[0].lower() + "".join(w.capitalize() for w in words[1:])
    else:
        return words[0].lower()


# Function to extract meeting times
def extract_meeting_times(meeting_table):
    meeting_times = []
    rows = meeting_table.find_all('tr')[1:]  # Skip header row
    for row in rows:
        cells = row.find_all('td')
        meeting_times.append({
            "type": reduce_spaces(cells[0].get_text(strip=True)),
            "time": reduce_spaces(cells[1].get_text(strip=True)),
            "days": reduce_spaces(cells[2].get_text(strip=True)),
            "location": reduce_spaces(cells[3].get_text(strip=True)),
            "dateRange": reduce_spaces(cells[4].get_text(strip=True)),
            "scheduleType": reduce_spaces(cells[5].get_text(strip=True)),
            "instructors": reduce_spaces(cells[6].get_text(strip=True)),
        })
    return meeting_times


# Initialize list to hold course data
courses = []
course = {}

# Loop over each row in the table
for row in soup.find_all('tr'):
    if row.th and 'ddlabel' in row.th.get('class', []):
        # Title row
        title = row.th.get_text(strip=True)
        match = re.search(r'(.+?) - (\d+) - (.*?) - (\w+)', title)
        name, crn_number, course_code, section = match.groups()
        course = {
            "name": name,
            "displayName": name.split(" - ")[0],
            "courseCode": course_code,
            "section": section,
            "crnNumber": crn_number,
        }
    else:
        # Data row
        cells = row.td
        if cells:
            cells: BeautifulSoup
            meeting_table = cells.find('table')
            if meeting_table:
                course["meetingTimes"] = extract_meeting_times(meeting_table)
                # delete meeting tables from cells
                meeting_table.decompose()
            delimiter = '###'
            for line_break in cells.findAll('br'):
                line_break.replaceWith(delimiter)
            lines = cells.get_text().split(delimiter)
            for line in lines:
                #  slice line as key:value
                splitted = line.split(':', 1)
                key = splitted[0]
                value = splitted[1] if len(splitted) > 1 else None
                # make the key camelCase
                # e.g. from "Associated Term" to "associatedTerm"
                key = to_camel_case(key)
                # if there is key and value, add them to course
                if key and value:
                    key = reduce_spaces(key)
                    value = reduce_spaces(value)
                    course[key.strip()] = value.strip()

        else:
            # End of course data, append course to list
            if course:
                course = check_attributes(course)
                courses.append(course)
                course = {}

# Add the last course if it wasn't added
if course:
    courses.append(course)

# Write course data to JSON file
with open('./data/data.json', 'w') as f:
    json.dump(courses, f)

# Return number of courses processed for verification
print(len(courses))


214


In [8]:
test = """    <td class="dddefault">
<span class="fieldlabeltext">Associated Term: </span>Summer 2022-2023<br /><span class="fieldlabeltext">Registration Dates: </span> No dates
available
<br />
<span class="fieldlabeltext">Levels: </span>Doctorate, Undeclared,
Scientific Preparatory, Undergraduate, Masters, Special Student, Exchange
- Socrates Erasmus UG, Exchange - Socrates Erasmus MA, Exchange - Socrates
Erasmus DR, Exchange - Erasmus Mundus MA, Exchange - Erasmus Mundus DR,
Exchange - Erasmus Mundus UG
<br />
<span class="fieldlabeltext">Faculty: </span>
Course Offered by FENS
<br />
<span class="fieldlabeltext">Attributes: </span>Lang. of Instruction:
English, 6 ECTS, Course Offered by FENS
<br />
<br />
Sabancı University Campus Campus
<br />
Lecture Schedule Type
<br />
3.000 Credits
<br />
<a
  href="/prod/bwckctlg.p_display_courses?term_in=202203&amp;one_subj=CS&amp;sel_crse_strt=201&amp;sel_crse_end=201&amp;sel_subj=&amp;sel_levl=&amp;sel_schd=&amp;sel_coll=&amp;sel_divs=&amp;sel_dept=&amp;sel_attr="
  >View Catalog Entry</a
>
<br />
<br />
      <table
        class="datadisplaytable"
        summary="This table lists the scheduled meeting times and assigned instructors for this class.."
      >
        <caption class="captiontext">
          Scheduled Meeting Times
        </caption>
        <tbody>
          <tr>
            <th class="ddheader" scope="col">Type</th>
            <th class="ddheader" scope="col">Time</th>
            <th class="ddheader" scope="col">Days</th>
            <th class="ddheader" scope="col">Where</th>
            <th class="ddheader" scope="col">Date Range</th>
            <th class="ddheader" scope="col">Schedule Type</th>
            <th class="ddheader" scope="col">Instructors</th>
          </tr>
          <tr>
            <td class="dddefault">Class</td>
            <td class="dddefault">8:40 am - 10:30 am</td>
            <td class="dddefault">M</td>
            <td class="dddefault">Fac. of Engin. and Nat. Sci. L065</td>
            <td class="dddefault">Jul 10, 2023 - Aug 25, 2023</td>
            <td class="dddefault">2nd del</td>
            <td class="dddefault">
              Atıl Utku Ay (<abbr title="Primary">P</abbr>)
            </td>
          </tr>
          <tr>
            <td class="dddefault">Class</td>
            <td class="dddefault">8:40 am - 10:30 am</td>
            <td class="dddefault">W</td>
            <td class="dddefault">Fac. of Engin. and Nat. Sci. L067</td>
            <td class="dddefault">Jul 10, 2023 - Aug 25, 2023</td>
            <td class="dddefault">1st del</td>
            <td class="dddefault">
              Atıl Utku Ay (<abbr title="Primary">P</abbr>)
            </td>
          </tr>
        </tbody>
      </table>
<br />
<br />
    </td>"""
test=test.replace("\n", "")

s = BeautifulSoup(test)

s.find("table").decompose()

delimiter = '###'                       
for line_break in s.findAll('br'):      
    line_break.replaceWith(delimiter)   
strings = s.get_text().split(delimiter) 


print(s.get_text().split(delimiter))

['Associated Term: Summer 2022-2023', 'Registration Dates:  No datesavailable', 'Levels: Doctorate, Undeclared,Scientific Preparatory, Undergraduate, Masters, Special Student, Exchange- Socrates Erasmus UG, Exchange - Socrates Erasmus MA, Exchange - SocratesErasmus DR, Exchange - Erasmus Mundus MA, Exchange - Erasmus Mundus DR,Exchange - Erasmus Mundus UG', 'Faculty: Course Offered by FENS', 'Attributes: Lang. of Instruction:English, 6 ECTS, Course Offered by FENS', '', 'Sabancı University Campus Campus', 'Lecture Schedule Type', '3.000 Credits', 'View Catalog Entry', '', '      ', '', '    ']


In [12]:
temp = {}
if temp: 
    print("yes")
else:
    print("no")

no
