In [113]:
import pandas as pd

In [114]:
import re

def extract_schedule_info(schedule_string):
    schedule_string = schedule_string.strip()
    # Define the regular expression patterns
    units_pattern = re.compile(r'(?P<Units>(?:VAR|\d+(?:-\d+)?(?:,\d+)*(?:,\d+)?|\d+\.\d+))\s+')
    # schedule_pattern = re.compile(r'(?P<LecSec>\S+)\s+(?P<Days>\S+)\s+(?P<Begin>\d+:\d+[APMapm]+)\s+(?P<End>\d+:\d+[APMapm]+)\s+(?P<BldgRoom>[A-Za-z]+\s*\d{0,3}[A-Za-z]*\d{0,4})\s+(?P<Location>.*?)(?=\s{2,})\s+(?P<Instructors>.+)$')

    schedule_pattern = re.compile(
        r'(?P<LecSec>[A-Za-z0-9]{1,3})\s+'
        r'(?P<Days>\S+)\s+'
        r'(?P<Begin>\d+:\d+[APMapm]+)\s+'
        r'(?P<End>\d+:\d+[APMapm]+)\s+'
        r'(?P<BldgRoom>[A-Za-z]+\s*\d{0,3}[A-Za-z]*\d{0,4})?\s+'
        r'(?P<Location>.*?)(?=\s{2,})\s+'
        r'(?P<Instructors>.+)$'
    )

    tba_dnm_dnm_pattern = re.compile(r'TBA\s+' + r'\s{20,}' + r'DNM\s+DNM')
    tba_tba_pattern = re.compile(r'TBA\s+' + r'\s{20,}' + r'TBA\s+')
    tba_dnm_dnm_match = tba_dnm_dnm_pattern.search(schedule_string)
    tba_tba_match = tba_tba_pattern.search(schedule_string)
    if tba_dnm_dnm_match or tba_tba_match:
        return None

    # Extract Units information
    units_match = units_pattern.match(schedule_string)
    units = units_match.group('Units') if units_match else None

    # Extract the remaining schedule information
    remaining_schedule_string = re.sub(units_pattern, '', schedule_string, count=1)
    schedule_match = schedule_pattern.match(remaining_schedule_string)

    if units and schedule_match:
        # Combine information from both matches
        schedule_info = {'Units': units, **schedule_match.groupdict()}
        return schedule_info
    else:
        return None

In [115]:
def dict_to_nl(s, semester):
    columns = ['Question', 'Answer']
    nl = pd.DataFrame(columns=columns)
    i = 0
    for course in s:
        number = course
        name = s[course]['name']
        if (len(s[course].keys()) == 1):
            continue
        units = s[course]['Units']
        section = s[course]['LecSec']
        if section == "Lec":
            section = "A"
        days = s[course]['Days']
        days = days.replace('M', 'Monday, ').replace('T', 'Tuesday, ') \
                    .replace('W', 'Wednesday, ').replace('R', 'Thursday') \
                    .replace('F', 'Friday').replace('U', 'Sunday, ')
        begin = s[course]['Begin']
        end = s[course]['End']
        bldg_room = s[course]['BldgRoom']
        location = s[course]['Location']
        instructors = s[course]['Instructors']
        
        q = f'What is the course number of {name}?'
        a = number
        nl.loc[i] = [q, a]
        i += 1

        q = f'What is the name of course number {number}?'
        a = name
        nl.loc[i] = [q, a]
        i += 1

        q = f'What time does the course {number} {name} begin?'
        a = begin
        nl.loc[i] = [q, a]
        i += 1

        q = f'What time does the course {number} {name} end?'
        a = end
        nl.loc[i] = [q, a]
        i += 1

        q = f'What days does the course {number} {name} meet?'
        a = days
        nl.loc[i] = [q, a]
        i += 1

        q = f'What is the location of the course {number} {name}?'
        a = location
        nl.loc[i] = [q, a]
        i += 1

        q = f'What is the building and room number of the course {number} {name}?'
        a = bldg_room
        nl.loc[i] = [q, a]
        i += 1

        q = f'How many units is the course {number} {name}?'
        a = units
        nl.loc[i] = [q, a]
        i += 1

        q = f'Who are the instructors of the course {number} {name}?'
        a = instructors
        nl.loc[i] = [q, a]
        i += 1

        q = f'Is the course {number} {name} offered in {semester}?'
        a = "Yes"
        nl.loc[i] = [q, a]
        i += 1

        # sentence = f"In semester{semester}, the course {number} is titled {name}, is taught by {instructors}, and has {units} units. "
        # sentence += f"Section {section} for the course {number} {name} meets on {days} from {begin} to {end} at {bldg_room} in {location}.\n"
        # nl.append(sentence)
    return nl

In [116]:
def get_schedule_dictionary(lines):
    i = 0
    s = {}

    while i < len(lines):
        line = lines[i].strip()

        course_num_pattern = re.compile(r'\b\d{5}\b')
        if re.match(course_num_pattern, line):
            course_num = line.strip()[:5]
            course_title = line[6:].strip()
            s[course_num] = {"name": course_title}
            i += 1
            # While the next string is not a course number, add the schedule info to the course
            while i < len(lines) and not re.match(course_num_pattern, lines[i].strip()):
                schedule_info = extract_schedule_info(lines[i])
                if schedule_info:
                    # Add the schedule info to the course
                    s[course_num].update(schedule_info)
                i += 1
        # else:
        #     print("No matched course number in line: ", line)
        i += 1
    return s
        

In [117]:
import re, json

paths = ['../data/cleaned_other/sched_spring24.txt',
         '../data/cleaned_other/sched_fall23.txt',
         '../data/cleaned_other/sched_summer24_1.txt',
         '../data/cleaned_other/sched_summer24_2.txt']

for path in paths:
    print(f"Processing {path}")
    with open(path, 'r') as f:
        data = f.read()
        lines = data.split('\n')
        semester = str(lines[2].strip()[9:])
        s = get_schedule_dictionary(lines)
        nl = dict_to_nl(s, semester)
        # display(nl)
        # Save the text to a file
        sem = path[28:-4]
        path = f'../data/schedule_csv/sched_{sem}_nl.csv'
        nl.to_csv(path, index=False)

Processing ../data/cleaned_other/sched_spring24.txt
Processing ../data/cleaned_other/sched_fall23.txt
Processing ../data/cleaned_other/sched_summer24_1.txt
Processing ../data/cleaned_other/sched_summer24_2.txt


In [118]:
import re

def extract_schedule_info(schedule_string):
    schedule_string = schedule_string.strip()
    # Define the regular expression patterns
    units_pattern = re.compile(r'(?P<Units>(?:VAR|\d+(?:-\d+)?(?:,\d+)*(?:,\d+)?|\d+\.\d+))\s+')
    # schedule_pattern = re.compile(r'(?P<LecSec>\S+)\s+(?P<Days>\S+)\s+(?P<Begin>\d+:\d+[APMapm]+)\s+(?P<End>\d+:\d+[APMapm]+)\s+(?P<BldgRoom>[A-Za-z]+\s*\d{0,3}[A-Za-z]*\d{0,4})\s+(?P<Location>.*?)(?=\s{2,})\s+(?P<Instructors>.+)$')

    schedule_pattern = re.compile(
        r'(?P<LecSec>[A-Za-z0-9]{1,3})\s+'
        r'(?P<Days>\S+)\s+'
        r'(?P<Begin>\d+:\d+[APMapm]+)\s+'
        r'(?P<End>\d+:\d+[APMapm]+)\s+'
        r'(?P<BldgRoom>[A-Za-z]+\s*\d{0,3}[A-Za-z]*\d{0,4})?\s+'
        r'(?P<Location>.*?)(?=\s{2,})\s+'
        r'(?P<Instructors>.+)$'
    )

    tba_dnm_dnm_pattern = re.compile(r'TBA\s+' + r'\s{20,}' + r'DNM\s+DNM')
    tba_tba_pattern = re.compile(r'TBA\s+' + r'\s{20,}' + r'TBA\s+')
    tba_dnm_dnm_match = tba_dnm_dnm_pattern.search(schedule_string)
    tba_tba_match = tba_tba_pattern.search(schedule_string)
    if tba_dnm_dnm_match or tba_tba_match:
        return None

    # Extract Units information
    units_match = units_pattern.match(schedule_string)
    units = units_match.group('Units') if units_match else None

    # Extract the remaining schedule information
    remaining_schedule_string = re.sub(units_pattern, '', schedule_string, count=1)
    schedule_match = schedule_pattern.match(remaining_schedule_string)

    if units and schedule_match:
        # Combine information from both matches
        schedule_info = {'Units': units, **schedule_match.groupdict()}
        return schedule_info
    else:
        return None

# Test string with comma-separated values in the Units field
schedule_string_comma = "                        12.0    A       TR      09:30AM 10:50AM   Pittsburgh, Pennsylvania        Bisk"
print(schedule_string_comma)

# Extract schedule information
result_comma = extract_schedule_info(schedule_string_comma)

if result_comma:
    print("Extracted Schedule Information (Comma-separated Units):")
    for key, value in result_comma.items():
        print(f"{key}: {value}")
else:
    print("No match found.")


# Test string with missing units
# schedule_string_missing_units = "                        2.0     A3      T       02:00PM 03:20PM DNM DNM Pittsburgh, Pennsylvania        Sontag"
schedule_string_missing_units = lines[17]
print(schedule_string_missing_units)

# Extract schedule information
result_missing_units = extract_schedule_info(schedule_string_missing_units)

if result_missing_units:
    print("Extracted Schedule Information:")
    for key, value in result_missing_units.items():
        print(f"{key}: {value}")
else:
    if result_missing_units is None:
        print("result_missing_units is none")
    print("Skipped due to mismatch.")


                        12.0    A       TR      09:30AM 10:50AM   Pittsburgh, Pennsylvania        Bisk
Extracted Schedule Information (Comma-separated Units):
Units: 12.0
LecSec: A
Days: TR
Begin: 09:30AM
End: 10:50AM
BldgRoom: None
Location: Pittsburgh, Pennsylvania
Instructors: Bisk
        03124   Modern Biology Laboratory
result_missing_units is none
Skipped due to mismatch.


In [119]:
example = """
        Carnegie Mellon University - Full Schedule Of Classes
        Run Date: 22-feb-2024
        Semester: Spring 2024
        Course  Title   Units   Lec/Sec Days    Begin   End     Bldg/Room       Location        Instructor(s)
        Architecture                                                            
        48051   Study Abroad
                        0.0     A       TBA                     TBA     Pittsburgh, Pennsylvania        Instructor TBA
        48105   Architecture Design Studio: Poiesis Studio 2
                        15.0    Lec     MWF     02:00PM 04:50PM MM A14  Pittsburgh, Pennsylvania        Yang
                                A       MWF     02:00PM 04:50PM CFA 200 Pittsburgh, Pennsylvania        Abraham
                                B       MWF     02:00PM 04:50PM CFA 200 Pittsburgh, Pennsylvania        McFarland
                                C       MWF     02:00PM 04:50PM CFA 200 Pittsburgh, Pennsylvania        Saleh
                                D       MWF     02:00PM 04:50PM CFA 200 Pittsburgh, Pennsylvania        Jno Baptiste
        48111   Exploring Pittsburgh
                        3.0     A       F       09:00AM 11:50AM CFA 214 Pittsburgh, Pennsylvania        Torello
                                B       F       09:00AM 11:50AM CFA 214 Pittsburgh, Pennsylvania        Torello
        48112   Digital Fabrication Skills
                        2.0     A3      T       02:00PM 03:20PM DNM DNM Pittsburgh, Pennsylvania        Sontag
                                        R       02:00PM 03:20PM GHC 4102        Pittsburgh, Pennsylvania 
                                A4      T       02:00PM 03:20PM DNM DNM Pittsburgh, Pennsylvania        Sontag
                                        R       02:00PM 03:20PM WEH 5415        Pittsburgh, Pennsylvania 
"""

lines = example.split('\n')

In [120]:
get_schedule_dictionary(lines[10:])

{'48111': {'name': 'Exploring Pittsburgh',
  'Units': '3.0',
  'LecSec': 'A',
  'Days': 'F',
  'Begin': '09:00AM',
  'End': '11:50AM',
  'BldgRoom': 'CFA 214',
  'Location': 'Pittsburgh, Pennsylvania',
  'Instructors': 'Torello'}}

In [121]:
string = '''
        48112   Digital Fabrication Skills
                        2.0     A3      T       02:00PM 03:20PM DNM DNM Pittsburgh, Pennsylvania        Sontag
                                        R       02:00PM 03:20PM GHC 4102        Pittsburgh, Pennsylvania 
                                A4      T       02:00PM 03:20PM DNM DNM Pittsburgh, Pennsylvania        Sontag
                                        R       02:00PM 03:20PM WEH 5415        Pittsburgh, Pennsylvania 
'''

lines = string.split('\n')

print(get_schedule_dictionary(lines))
s = get_schedule_dictionary(lines)

{'48112': {'name': 'Digital Fabrication Skills', 'Units': '2.0', 'LecSec': 'A3', 'Days': 'T', 'Begin': '02:00PM', 'End': '03:20PM', 'BldgRoom': 'DNM DNM', 'Location': 'Pittsburgh, Pennsylvania', 'Instructors': 'Sontag'}}


In [122]:
display(dict_to_nl(s, 'fall23'))

Unnamed: 0,Question,Answer
0,What is the course number of Digital Fabricati...,48112
1,What is the name of course number 48112?,Digital Fabrication Skills
2,What time does the course 48112 Digital Fabric...,02:00PM
3,What time does the course 48112 Digital Fabric...,03:20PM
4,What days does the course 48112 Digital Fabric...,"Tuesday,"
5,What is the location of the course 48112 Digit...,"Pittsburgh, Pennsylvania"
6,What is the building and room number of the co...,DNM DNM
7,How many units is the course 48112 Digital Fab...,2.0
8,Who are the instructors of the course 48112 Di...,Sontag
9,Is the course 48112 Digital Fabrication Skills...,Yes


In [123]:
string = '        48112   Digital Fabrication Skills'
line = string.strip()
print(line)

course_num_pattern = re.compile(r'\b\d{5}\b')
if re.match(course_num_pattern, line):
    course_num = line.strip()[:5]
    course_title = line[6:].strip()
    print(course_num)
    print(course_title)

48112   Digital Fabrication Skills
48112
Digital Fabrication Skills
