# Preprocess the Professor names

In [101]:
from bs4 import BeautifulSoup
import json

In [102]:
def read_json(semester):
    """Load and return the parsed contents of ``<semester>.json``.

    Args:
        semester: File name without the ``.json`` extension (for example
            ``'spring'``); a path prefix also works because the extension
            is simply appended.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default encoding.
    with open(semester + '.json', "r", encoding="utf-8") as json_file:
        return json.load(json_file)

In [103]:
# Load the per-semester course lists and the combined list; each one is
# the parsed content of the matching ``<name>.json`` file.
spring_courses = read_json('spring')
fall_courses = read_json('fall')
all_courses = read_json('all_courses')

##### Scraping the prof list HTML

In [104]:
def get_profs(file):
    """Return the professor names found in the HTML file at ``file``.

    For every ``<a itemprop="url">`` element, the text of its first
    ``<span itemprop="name">`` descendant is collected. An anchor without
    such a span raises ``AttributeError``.
    """
    with open(file, 'r') as handle:
        markup = handle.read()

    document = BeautifulSoup(markup, 'html.parser')
    names = []
    for anchor in document.find_all('a', itemprop='url'):
        name_span = anchor.find('span', itemprop='name')
        names.append(name_span.text)
    return names

# Combine the scraped names with the extra names listed one per line in
# addition.txt.
all_profs = get_profs('prof_list.html')
with open('addition.txt', 'r') as f:
    names = f.read().splitlines()
all_profs += names

# Build one lookup record per professor: the stripped full name and its
# last space-separated token. Blank or whitespace-only entries are dropped
# so they do not become records with empty 'full' and 'last' values.
all_profs = [
    {'full': name, 'last': name.split(' ')[-1]}
    for name in (prof.strip() for prof in all_profs)
    if name
]

##### Adding the full names of professors to the data

In [105]:
def preprocess_profs(courses):
    """Add an ``"instructor_full_name"`` entry to every meeting.

    Each meeting's ``"instructor"`` value is split on ``';'``; every
    non-empty piece is resolved with ``get_full_name`` and the results are
    joined with ``"; "``. When no instructor is listed at all, the entry
    is set to ``'N/A'``.

    Args:
        courses: Iterable of course dicts, each with a ``"meetings"`` list
            of dicts that carry an ``"instructor"`` string.

    Returns:
        The same ``courses`` object; the meeting dicts are modified in
        place.
    """
    for course in courses:
        for meeting in course["meetings"]:
            instructors = [ins.strip() for ins in meeting["instructor"].split(';')]
            # Drop *every* empty piece (trailing or repeated ';'), not just
            # the first one, so inputs such as ";" cannot yield '' or "; ".
            full_names = [get_full_name(ins) for ins in instructors if ins]
            meeting["instructor_full_name"] = (
                "; ".join(full_names) if full_names else 'N/A'
            )
    return courses

def get_full_name(ins, profs=None):
    """Resolve an instructor string to a professor's full name.

    ``ins`` is split on single spaces. Its first token is treated as the
    last name and its final token as the first-name initial (for example
    ``'Rosenbaum H'``).

    Matching steps:
      1. Professors whose ``"last"`` equals the first token
         (case-insensitive).
      2. If none match and there are at least two tokens, professors whose
         ``"last"`` equals the second or the second-to-last token, which
         covers multi-word last names.
      3. If several candidates remain, keep those whose full name starts
         with the final token.

    Args:
        ins: Instructor string to resolve.
        profs: Optional list of ``{"full": ..., "last": ...}`` records to
            search. Defaults to the module-level ``all_profs``.

    Returns:
        The ``"full"`` value of the single matching record.

    Raises:
        AssertionError: If zero or more than one professor matches.
    """
    if profs is None:
        profs = all_profs

    tokens = ins.split(' ')
    last = tokens[0].lower()
    matched = [p for p in profs if p["last"].lower() == last]

    # Fallback for multi-word last names. Guarded by the token count so a
    # single-token name reports "no match" instead of raising IndexError.
    if not matched and len(tokens) >= 2:
        alternates = {tokens[1].lower(), tokens[-2].lower()}
        matched = [p for p in profs if p["last"].lower() in alternates]

    if len(matched) > 1:
        initial = tokens[-1].lower()
        # Slicing keeps an empty full name from raising IndexError.
        matched = [p for p in matched if p["full"][:1].lower() == initial]

    # Raised explicitly (rather than via ``assert``) so the check still
    # runs under ``python -O``; the exception type is unchanged.
    if len(matched) != 1:
        raise AssertionError(
            f"expected exactly one professor matching {ins!r}, "
            f"found {len(matched)}"
        )

    return matched[0]["full"]

In [106]:
# Run the professor-name preprocessing over each course list, in the
# order spring, fall, combined.
spring_courses, fall_courses, all_courses = map(
    preprocess_profs, (spring_courses, fall_courses, all_courses)
)

##### Writing the updated info

In [107]:
# Serialize each processed course list to its output file, one after the
# other, in the listed order.
outputs = (
    ('all_courses_latest.json', all_courses),
    ('spring.json', spring_courses),
    ('fall.json', fall_courses),
)
for out_path, payload in outputs:
    with open(out_path, "w") as json_file:
        json.dump(payload, json_file)

In [108]:
# Display the first spring course to inspect the processed record.
spring_courses[0]

{'dept': 'ILS-Z',
 'url': 'https://luddy.indiana.edu/academics/courses/class/iub-spring-2023-ils-z640',
 'course_id': '640',
 'course_title': 'SEMINAR IN INTELLECTUAL FREEDOM',
 'filename': 'iub-spring-2023-ils-z640.html',
 'meetings': [{'component': 'LEC',
   'credits': '3',
   'status': 'Open',
   'time': '12:40 p.m.–3:10 p.m.',
   'day': 'R',
   'facility': 'I 232',
   'instructor': 'Rosenbaum H',
   'course_type': 'Regular Academic Session',
   'mode': 'In Person',
   'total_seats': '15',
   'available_seats': '7',
   'waitlisted_seats': '0',
   'notes': ['Above class meets with ILS-Z 764'],
   'instructor_full_name': 'Howard Rosenbaum'}],
 'description': '',
 'textfile': 'ILS-Z640.txt',
 'semester': 'spring',
 'pk': 'ILS-Z640spring',
 'full_id': 'ILS-Z640'}