In [206]:
import re

def extract_schedule_info(schedule_string):
    """Parse one schedule row into a dictionary of named fields.

    The row is stripped, then expected to start with a units token
    (``VAR``, an integer, an integer range, a comma-separated list of
    integers, or a decimal) followed by section, days, begin time, end
    time, building/room, location and instructors.

    Args:
        schedule_string: One raw text row of the schedule.

    Returns:
        A dict with keys ``Units``, ``LecSec``, ``Days``, ``Begin``, ``End``,
        ``BldgRoom``, ``Location`` and ``Instructors`` (all strings), or
        ``None`` when the row is a TBA placeholder, has no leading units
        token, or does not fit the expected column layout.
    """
    row = schedule_string.strip()

    # Placeholder rows: "TBA", a wide run of whitespace, then either
    # "DNM DNM" or another "TBA". These are skipped outright.
    placeholder_patterns = (
        r'TBA\s+' + r'\s{20,}' + r'DNM\s+DNM',
        r'TBA\s+' + r'\s{20,}' + r'TBA\s+',
    )
    if any(re.search(pattern, row) for pattern in placeholder_patterns):
        return None

    # The units token must sit at the very start of the stripped row.
    units_re = re.compile(r'(?P<Units>(?:VAR|\d+(?:-\d+)?(?:,\d+)*(?:,\d+)?|\d+\.\d+))\s+')
    units_hit = units_re.match(row)
    if units_hit is None:
        return None

    # Everything after the units token (and its trailing whitespace) is
    # handed to the column pattern.
    columns_re = re.compile(r'(?P<LecSec>\S+)\s+(?P<Days>\S+)\s+(?P<Begin>\d+:\d+[APMapm]+)\s+(?P<End>\d+:\d+[APMapm]+)\s+(?P<BldgRoom>[A-Za-z]+\s*\d{0,3}[A-Za-z]*\d{0,4})\s+(?P<Location>.*?)(?=\s{2,})\s+(?P<Instructors>.+)$')
    columns_hit = columns_re.match(row[units_hit.end():])
    if columns_hit is None:
        return None

    return {'Units': units_hit.group('Units'), **columns_hit.groupdict()}

In [207]:
def get_schedule_dictionary(lines):
    """Group schedule rows under the course header that precedes them.

    A course header is any line that, once stripped, starts with five
    digits. Those five characters are the course number and the text from
    the seventh character onward (stripped) is the course title. Every
    following line, up to the next header, is passed to
    ``extract_schedule_info``; each successful parse is merged into the
    course's entry.

    Args:
        lines: Sequence of raw text lines from a schedule file.

    Returns:
        Dict mapping course number to a dict holding ``"name"`` plus, when
        at least one row parsed, the schedule fields of that row. Because
        rows are merged with ``dict.update``, a course with several
        parseable rows keeps only the fields of the last one.
    """
    course_num_pattern = re.compile(r'^\d{5}')
    s = {}
    current_course = None

    # Single forward pass: each line is classified exactly once, so a header
    # that ends one course's rows is also the line that opens the next
    # course. (The earlier index-based loop advanced past that header and
    # silently dropped every second course.)
    for raw_line in lines:
        line = raw_line.strip()
        if course_num_pattern.match(line):
            current_course = line[:5]
            s[current_course] = {"name": line[6:].strip()}
        elif current_course is not None:
            schedule_info = extract_schedule_info(raw_line)
            if schedule_info:
                s[current_course].update(schedule_info)
        # Lines before the first course header belong to no course and are
        # ignored.
    return s
        
        

In [208]:
def dict_to_nl(s, semester):
    """Render each scheduled course as a two-sentence English description.

    Args:
        s: Dict mapping course number to a dict with ``"name"`` and, for
            scheduled courses, the keys ``Units``, ``LecSec``, ``Days``,
            ``Begin``, ``End``, ``BldgRoom``, ``Location`` and
            ``Instructors``.
        semester: Semester label inserted directly after the word
            "semester" with no separator, so it presumably starts with a
            space -- TODO confirm against the caller.

    Returns:
        List of strings, one per course that has schedule fields, each
        ending in a newline. Courses whose entry holds only one key (the
        name) are skipped.
    """
    # One-letter day codes used in the Days column.
    # NOTE(review): 'S' -> Saturday is assumed by analogy with 'U' -> Sunday;
    # confirm against the schedule's legend.
    day_names = {
        'M': 'Monday',
        'T': 'Tuesday',
        'W': 'Wednesday',
        'R': 'Thursday',
        'F': 'Friday',
        'S': 'Saturday',
        'U': 'Sunday',
    }

    nl = []
    for number, info in s.items():
        name = info['name']
        if len(info) == 1:
            # Only the title is known; there is nothing to describe.
            continue

        units = info['Units']
        section = info['LecSec']
        if section == "Lec":
            section = "A"

        # Expand day codes only when every character is a known code, and
        # join with a uniform separator. Chained str.replace calls produced
        # output such as "ThursdayFriday" or a trailing ", ", and rewrote
        # letters inside non-day tokens such as "TBA".
        days = info['Days']
        if days and all(code in day_names for code in days):
            days = ', '.join(day_names[code] for code in days)

        begin = info['Begin']
        end = info['End']
        bldg_room = info['BldgRoom']
        location = info['Location']
        instructors = info['Instructors']

        sentence = f"In semester{semester}, the course {number} is titled {name}, is taught by {instructors}, and has {units} units. "
        sentence += f"Section {section} for the course {number} {name} meets on {days} from {begin} to {end} at {bldg_room} in {location}.\n"
        nl.append(sentence)
    return nl

In [209]:
import re, json

# Raw schedule dumps to convert. Each one produces a sibling file whose name
# is the input name with "_nl" inserted before the ".txt" extension.
paths = ['../data/cleaned_other/sched_fall23.txt',
         '../data/cleaned_other/sched_spring24.txt',
         '../data/cleaned_other/sched_summer24_1.txt',
         '../data/cleaned_other/sched_summer24_2.txt']

for path in paths:
    print(f"Processing {path}")

    # Read the whole input first so its handle is closed before the output
    # file is opened (the handles were previously nested and shared one name).
    with open(path, 'r') as f:
        data = f.read()
    lines = data.split('\n')

    # The semester label is taken from the third line, after a 9-character
    # prefix. NOTE(review): the f-strings below insert it with no separator,
    # so it presumably keeps a leading space -- confirm against the data.
    semester = lines[2].strip()[9:]

    s = get_schedule_dictionary(lines)
    nl = dict_to_nl(s, semester)
    header = [f"Schedule for{semester} at Carnegie Mellon University\n"]
    text = ''.join(header + nl)

    # Derive the output name from the input path instead of slicing at a
    # hard-coded character offset, so moving the data directory cannot
    # silently produce a wrong file name.
    stem = path[:-len('.txt')] if path.endswith('.txt') else path
    out_path = f'{stem}_nl.txt'
    with open(out_path, 'w') as out_file:
        out_file.write(text)


Processing ../data/cleaned_other/sched_fall23.txt
Processing ../data/cleaned_other/sched_spring24.txt
Processing ../data/cleaned_other/sched_summer24_1.txt
Processing ../data/cleaned_other/sched_summer24_2.txt


In [210]:
import re

def extract_schedule_info(schedule_string):
    """Split a single schedule row into its named columns.

    NOTE(review): this cell redefines ``extract_schedule_info`` with the
    same logic as an earlier cell in this notebook; whichever cell ran last
    is the one in effect. Consider keeping a single definition.

    Args:
        schedule_string: Raw text of one schedule row; surrounding
            whitespace is ignored.

    Returns:
        Dict with ``Units`` first, followed by ``LecSec``, ``Days``,
        ``Begin``, ``End``, ``BldgRoom``, ``Location`` and ``Instructors``;
        ``None`` for TBA placeholder rows and for rows that lack a leading
        units token or do not fit the column layout.
    """
    text = schedule_string.strip()

    # A "TBA" followed by a long whitespace gap and then "DNM DNM" or
    # another "TBA" marks a row with no concrete meeting; skip it.
    is_tba_dnm = re.search(r'TBA\s+' + r'\s{20,}' + r'DNM\s+DNM', text) is not None
    is_tba_tba = re.search(r'TBA\s+' + r'\s{20,}' + r'TBA\s+', text) is not None
    if is_tba_dnm or is_tba_tba:
        return None

    units_regex = re.compile(r'(?P<Units>(?:VAR|\d+(?:-\d+)?(?:,\d+)*(?:,\d+)?|\d+\.\d+))\s+')
    layout_regex = re.compile(r'(?P<LecSec>\S+)\s+(?P<Days>\S+)\s+(?P<Begin>\d+:\d+[APMapm]+)\s+(?P<End>\d+:\d+[APMapm]+)\s+(?P<BldgRoom>[A-Za-z]+\s*\d{0,3}[A-Za-z]*\d{0,4})\s+(?P<Location>.*?)(?=\s{2,})\s+(?P<Instructors>.+)$')

    # Units must lead the row; the first units token is then removed and the
    # remainder is matched against the column layout.
    leading_units = units_regex.match(text)
    remainder = units_regex.sub('', text, count=1)
    fields = layout_regex.match(remainder)

    if not (leading_units and fields):
        return None

    parsed = {'Units': leading_units.group('Units')}
    parsed.update(fields.groupdict())
    return parsed

# Case 1: a fully scheduled row whose Units field is a comma-separated list.
schedule_string_comma = "5,36,48 A       TR      03:00PM 03:50PM CFA A9  Pittsburgh, Pennsylvania        Holmes"
print(schedule_string_comma)

result_comma = extract_schedule_info(schedule_string_comma)

if not result_comma:
    print("No match found (Comma-separated Units).")
else:
    print("Extracted Schedule Information (Comma-separated Units):")
    for key, value in result_comma.items():
        print(f"{key}: {value}")


# Case 2: a row whose day and time columns are both "TBA". Despite the
# variable name, the row does carry a units value ("0,36"); what it lacks is
# a concrete meeting time, which exercises the skip path.
schedule_string_missing_units = "                        0,36    A       TBA                     TBA     Pittsburgh, Pennsylvania        Instructor TBA"
# For comparison, a row with a concrete meeting time looks like:
# "                        10.0    A       TR      01:00PM 02:50PM CFA 211 Pittsburgh, Pennsylvania        Barbuto"
print(schedule_string_missing_units)

result_missing_units = extract_schedule_info(schedule_string_missing_units)

if not result_missing_units:
    # Distinguish an explicit None from any other falsy result.
    if result_missing_units is None:
        print("result_missing_units is none")
    print("Skipped due to mismatch.")
else:
    print("Extracted Schedule Information:")
    for key, value in result_missing_units.items():
        print(f"{key}: {value}")


5,36,48 A       TR      03:00PM 03:50PM CFA A9  Pittsburgh, Pennsylvania        Holmes
Extracted Schedule Information (Comma-separated Units):
Units: 5,36,48
LecSec: A
Days: TR
Begin: 03:00PM
End: 03:50PM
BldgRoom: CFA A9
Location: Pittsburgh, Pennsylvania
Instructors: Holmes
                        0,36    A       TBA                     TBA     Pittsburgh, Pennsylvania        Instructor TBA
result_missing_units is none
Skipped due to mismatch.
