# This file creates text data for each course

In [79]:
import json
import os

In [87]:
def read_json(semester):
    """Load and return the parsed contents of ``<semester>.json``.

    Args:
        semester: File stem (for example ``'spring'``). ``'.json'`` is
            appended and the resulting path is opened as given, so a
            relative stem resolves against the current working directory.

    Returns:
        Whatever ``json.load`` produces for the file's contents.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Pin the encoding: without it ``open`` falls back to the locale default,
    # so non-ASCII characters (such as the en dash used in meeting times)
    # could decode differently from one machine to the next.
    with open(semester + '.json', "r", encoding="utf-8") as json_file:
        return json.load(json_file)

def save_json(courses, semester):
    """Serialize *courses* as JSON into ``<semester>.json``.

    An existing file with the same name is overwritten.

    Args:
        courses: Any JSON-serializable object (here, a list of course dicts).
        semester: File stem (for example ``'fall'``). ``'.json'`` is appended
            and the resulting path is opened as given.

    Raises:
        TypeError: If *courses* contains a value ``json.dump`` cannot encode.
    """
    # Pin the encoding so the file is written the same way on every platform
    # instead of depending on the locale default.
    with open(semester + '.json', "w", encoding="utf-8") as json_file:
        json.dump(courses, json_file)

def updated_print_courses(course_data):
    """Print a human-readable dump of every course and its meetings.

    Each course is written to stdout as a run of bold-labelled lines, then
    one tab-indented group per meeting closed by a dashed rule, then a blank
    line.

    Args:
        course_data: Iterable of course dicts. Every course must provide the
            keys listed in ``course_rows`` plus ``'dept'`` and ``'meetings'``;
            every meeting must provide the keys listed in ``meeting_rows``.
    """
    bold_on, bold_off = "\033[1m", "\033[0m"
    # (label, key) pairs, in the order they are printed.
    course_rows = (
        ("Course ID", "course_id"),
        ("Course Title", "course_title"),
        ("Course Description", "description"),
        ("Url", "url"),
        ("Filename", "filename"),
        ("Textfile", "textfile"),
        ("Semester", "semester"),
    )
    meeting_rows = (
        ("component", "component"),
        ("credits", "credits"),
        ("status", "status"),
        ("course type", "course_type"),
        ("mode", "mode"),
        ("day", "day"),
        ("time", "time"),
    )

    for record in course_data:
        # The department line uses a single colon; every other course line
        # uses a double colon.
        print(f"{bold_on}Dept ID{bold_off}: {record['dept']}")
        for label, key in course_rows:
            print(f"{bold_on}{label}{bold_off}:: {record[key]}")
        for session in record['meetings']:
            for label, key in meeting_rows:
                print(f"\t{label}: {session[key]}")
            print("\t------------------------")
        print()

In [93]:
# Accumulators filled in by write_course_text:
#   course_info  - maps a course's primary key to {"text": <paragraph>}
#   course_texts - one {"page_content": ..., "metadata": ...} record per course
course_info, course_texts = {}, []

##### This function builds a text description for each course, appends it to course_texts, and expands shorthand day codes (e.g. "TR") into full day names

In [94]:
# Expansion of the one-letter day codes used in a meeting's "day" field.
_DAY_NAMES = {
    "M": "Monday",
    "T": "Tuesday",
    "W": "Wednesday",
    "R": "Thursday",
    "F": "Friday",
}


def _expand_days(day):
    """Turn a shorthand day string such as ``"TR"`` into ``"Tuesday, Thursday"``."""
    # Letters without a known expansion are kept as-is rather than raising
    # KeyError, so one unexpected code cannot abort the whole run.
    return ", ".join(_DAY_NAMES.get(letter, letter) for letter in day)


def _describe_meeting(position, meeting):
    """Return the numbered text entry describing one meeting of a course."""
    component = meeting["component"]
    time = meeting["time"]
    day = meeting["day"]
    mode = meeting["mode"].lower()
    status = meeting["status"].lower()
    facility = meeting["facility"]
    instructor_short_hand = meeting["instructor"]
    instructor = meeting["instructor_full_name"]
    total_seats = meeting["total_seats"]
    available_seats = meeting["available_seats"]
    waitlisted_seats = meeting["waitlisted_seats"]
    notes = ", ".join(meeting["notes"])

    parts = [f"{position}. The {component} class is a {mode} type class. "]
    # "ARR" meetings (and meetings with no day at all) have no fixed weekly
    # slot, so the schedule sentence is left out for them.
    if day and day != "ARR":
        parts.append(
            f"This class meets on {_expand_days(day)} from {time} at {facility}. "
        )
    parts.append(
        f"The class is taught by the professors {instructor} "
        f"(shorthand name: {instructor_short_hand}) and is currently {status}. "
    )
    parts.append(
        f"The total number of seats available for this component is {total_seats}, "
        f"with {available_seats} seats currently available and "
        f"{waitlisted_seats} seats on the waitlist.\n"
    )
    parts.append(f"Notes: {notes}.\n\n")
    return "".join(parts)


def write_course_text(courses, semester):
    """Build a descriptive paragraph for every course and register it.

    For each course this function:

    * composes a paragraph from the course fields and one numbered entry per
      meeting, expanding shorthand day codes into full day names;
    * mutates the course dict in place, setting ``'textfile'``,
      ``'semester'``, ``'pk'`` and ``'full_id'``;
    * stores ``{"text": paragraph}`` in the module-level ``course_info``
      under the course's ``pk``;
    * appends ``{"page_content": paragraph, "metadata": course}`` to the
      module-level ``course_texts``.

    Args:
        courses: Iterable of course dicts. Each needs ``'course_id'``,
            ``'course_title'``, ``'dept'``, ``'description'`` and
            ``'meetings'``.
        semester: Semester label used for any course that does not already
            carry a non-empty ``'semester'`` value of its own.

    Raises:
        KeyError: If a course or meeting lacks one of the required keys.
    """
    for course in courses:
        course_id = course["course_id"]
        course_title = course["course_title"]
        dept = course["dept"]
        meetings = course["meetings"]
        description = course["description"]
        # Course-level type and credits are taken from the first meeting.
        course_type = meetings[0].get("course_type", "").lower() if meetings else ""
        credit_count = meetings[0].get("credits", "") if meetings else ""
        # A semester already recorded on the course wins; the argument is the
        # fallback, so courses without the key no longer raise KeyError.
        course_semester = course.get("semester") or semester
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the generated text is identical from run to run (a set is not).
        profs = ", ".join(
            dict.fromkeys(met["instructor"] for met in meetings if met["instructor"])
        )

        # NOTE(review): the department name below is hard-coded for every
        # course regardless of its "dept" value - confirm that is intended.
        parts = [
            f"The course with the course ID {dept}{course_id}, titled \"{course_title}\", is offered by the Department of Computer Science and Informatics at Indiana University. The course is part of the {course_type} with {credit_count} course credits. "
        ]
        if profs:
            parts.append(f"This course is taught by the professors {profs}.\n")
        if description:
            parts.append(f"The course description is as follows: \n{description}\n\n")
        parts.append(
            f"The course has the following classes offered in {course_semester} semester:\n"
        )
        parts.extend(
            _describe_meeting(position, meeting)
            for position, meeting in enumerate(meetings, start=1)
        )
        paragraph = "".join(parts)

        full_id = dept + course_id
        pk = full_id + course_semester

        course['textfile'] = f"{full_id}.txt"
        course['semester'] = course_semester
        course['pk'] = pk
        course["full_id"] = full_id

        course_info[pk] = {"text": paragraph}
        course_texts.append({"page_content": paragraph, "metadata": course})

##### Reading JSON data

In [95]:
# Per-semester course lists, read from spring.json and fall.json.
spring_courses = read_json('spring')
fall_courses = read_json('fall')

# Previously saved combined list, read from all_courses_latest.json.
all_courses = read_json('all_courses_latest')

In [96]:
# Generate the text for both semesters. Results accumulate in the
# module-level course_info / course_texts, so running this cell again without
# resetting them first appends a second copy of every record to course_texts.
write_course_text(spring_courses, 'spring')
write_course_text(fall_courses, 'fall')

##### Assert the lengths

In [105]:
# Every course must have produced exactly one entry in each accumulator; a
# mismatch in course_info means two courses shared the same primary key.
assert len(course_info) == len(spring_courses) + len(fall_courses)
assert len(course_texts) == len(spring_courses) + len(fall_courses)

##### Save the data to JSON

In [99]:
# Write the enriched course lists back to their per-semester files.
save_json(spring_courses, 'spring')
save_json(fall_courses, 'fall')
# Rebuild the combined list (spring first, then fall) and save it as well.
all_courses = spring_courses + fall_courses
save_json(all_courses, 'all_courses_latest')

In [100]:
# Persist the primary-key -> {"text": ...} mapping to course_info.json.
save_json(course_info, 'course_info')

In [104]:
# Persist the page_content/metadata records to course_text.json.
save_json(course_texts, 'course_text')

##### Print the JSON

In [57]:
# Dump every spring course and its meetings to the cell output.
updated_print_courses(spring_courses)

[1mDept ID[0m: ILS-Z
[1mCourse ID[0m:: 640
[1mCourse Title[0m:: SEMINAR IN INTELLECTUAL FREEDOM
[1mCourse Description[0m:: 
[1mUrl[0m:: https://luddy.indiana.edu/academics/courses/class/iub-spring-2023-ils-z640
[1mFilename[0m:: iub-spring-2023-ils-z640.html
[1mTextfile[0m:: ILS-Z640.txt
[1mSemester[0m:: spring
	component: LEC
	credits: 3
	status: Open
	course type: Regular Academic Session
	mode: In Person
	day: R
	time: 12:40 p.m.–3:10 p.m.
	------------------------

[1mDept ID[0m: INFO-I
[1mCourse ID[0m:: 301
[1mCourse Title[0m:: PRESENTATIONS FOR IT PROFESSIONALS
[1mCourse Description[0m:: Students present several different types of presentations and engage in developing these "21st Century skills" for their future.  The course utilizes an open studio format that allows students to explore public speaking to better prepare for future educational and professional presentations.
[1mUrl[0m:: https://luddy.indiana.edu/academics/courses/class/iub-spring-2023-inf

In [58]:
# Dump every fall course and its meetings to the cell output.
updated_print_courses(fall_courses)

[1mDept ID[0m: INFO-I
[1mCourse ID[0m:: 540
[1mCourse Title[0m:: HUMAN ROBOT INTERACTION
[1mCourse Description[0m:: This course surveys the field of human-robot interaction (HRI), which involves understanding how people perceive and respond to robots and creating robots that interact naturally with people. We will discuss the design, evaluation and societal significance of interactive robots from a human-centered perspective through readings, discussion and developing HRI prototypes.
[1mUrl[0m:: https://luddy.indiana.edu/academics/courses/class/iub-fall-2023-info-i540
[1mFilename[0m:: iub-fall-2023-info-i540.html
[1mTextfile[0m:: INFO-I540.txt
[1mSemester[0m:: fall
	component: LEC
	credits: 3
	status: Closed
	course type: Regular Academic Session
	mode: In Person
	day: TR
	time: 3:00 p.m.–4:15 p.m.
	------------------------

[1mDept ID[0m: STAT-S
[1mCourse ID[0m:: 626
[1mCourse Title[0m:: BAYESIAN THEORY AND DATA ANALYSIS
[1mCourse Description[0m:: Introduction 

In [59]:
# Dump the combined course list and its meetings to the cell output.
updated_print_courses(all_courses)

[1mDept ID[0m: ILS-Z
[1mCourse ID[0m:: 640
[1mCourse Title[0m:: SEMINAR IN INTELLECTUAL FREEDOM
[1mCourse Description[0m:: 
[1mUrl[0m:: https://luddy.indiana.edu/academics/courses/class/iub-spring-2023-ils-z640
[1mFilename[0m:: iub-spring-2023-ils-z640.html
[1mTextfile[0m:: ILS-Z640.txt
[1mSemester[0m:: spring
	component: LEC
	credits: 3
	status: Open
	course type: Regular Academic Session
	mode: In Person
	day: R
	time: 12:40 p.m.–3:10 p.m.
	------------------------

[1mDept ID[0m: INFO-I
[1mCourse ID[0m:: 301
[1mCourse Title[0m:: PRESENTATIONS FOR IT PROFESSIONALS
[1mCourse Description[0m:: Students present several different types of presentations and engage in developing these "21st Century skills" for their future.  The course utilizes an open studio format that allows students to explore public speaking to better prepare for future educational and professional presentations.
[1mUrl[0m:: https://luddy.indiana.edu/academics/courses/class/iub-spring-2023-inf

### This data stores the professors' names in shorthand format, which causes many problems during embedding. This issue will be addressed later.

In [60]:
# Inventory of the data set: collect the distinct values that occur for each
# course-level and meeting-level field across all_courses.
depts = list(set([c["dept"] for c in all_courses]))
components = set()
days = set()
course_types = set()
modes = set()
status = set()
pk = set()
profs_s = set()
credits = set()
time = set()
for c in all_courses:
    # Requires every course to already carry a "pk" key.
    pk.add(c["pk"])
    for m in c["meetings"]:
        components.add(m["component"])
        days.add(m["day"])
        course_types.add(m["course_type"])
        modes.add(m["mode"])
        status.add(m["status"])
        # This list depends only on the course, yet it is rebuilt and printed
        # once per meeting: a course with k meetings prints the same line k
        # times, and repeated instructors within the line are not removed.
        profs_l = [met["instructor"] for met in c["meetings"] if met["instructor"]]
        profs = ", ".join(profs_l)
        print("profs: ", profs)
        # Only non-empty instructor strings are recorded.
        # NOTE(review): the printed result contains entries such as
        # "Kemp J; Wang X", so several names joined by ";" count as one
        # distinct value here - confirm whether they should be split.
        if m["instructor"]:
            profs_s.add(m["instructor"])
        credits.add(m["credits"])
        time.add(m["time"])
# Convert each set to a list; element order is whatever set iteration yields.
components = list(components)
days = list(days)
course_types = list(course_types)
modes = list(modes)
status = list(status)
pk = list(pk)
profs_s = list(profs_s)
credits = list(credits)
time = list(time)

profs:  Rosenbaum H
profs:  Thacker U
profs:  German D, German D
profs:  German D, German D
profs:  Rosenbaum H
profs:  Bahramian H, Bahramian H, Bahramian H
profs:  Bahramian H, Bahramian H, Bahramian H
profs:  Bahramian H, Bahramian H, Bahramian H
profs:  Morrison A
profs:  Silver J
profs:  Crandall D, Tang W
profs:  Crandall D, Tang W
profs:  Himebaugh B
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Tao C, Tao C, Tao C, Tao C, Tao C, Tao C
profs:  Mejia A
profs:  Press M
profs:  Malbasa V, M

In [61]:
# Show the distinct values collected for each field, one collection per line;
# for the primary keys only the count is printed.
print(components)
print(days)
print(course_types)
print(modes)
print(status)
print(len(pk))
print(time)
print(credits)

['LEC', 'PCT', 'DIS', 'IND', 'LAB', 'PRA', 'ACT', 'RES', 'ITN', 'SEM', 'RDS']
['M', 'T', 'TR', 'MWF', 'MTW', 'ARR', 'MW', 'MTWR', 'W', 'FTR', 'R', 'F']
['Advance College Project', 'Regular Academic Session', 'Eight Week - Second', 'Eight Week - First', 'Three Week', 'Thirteen Week']
['Independent/Directed Study', 'Hybrid-Distance Video & Online', 'Distance Synchronous Video', 'Hybrid-On Campus & Online', '100% Online All', 'Internship/Practica', 'In Person']
['Open', 'Closed']
560
['4:45 p.m.–7:45 p.m.', '4:10 p.m.–6:45 p.m.', '10:00 a.m.–12:30 p.m.', '11:30 a.m.–1:30 p.m.', '9:10 a.m.–12:00 p.m.', '12:30 p.m.–2:30 p.m.', '1:50 p.m.–2:40 p.m.', '9:45 a.m.–11:20 a.m.', '1:15 p.m.–2:30 p.m.', '11:15 a.m.–1:10 p.m.', '5:45 p.m.–8:15 p.m.', '6:30 p.m.–7:50 p.m.', '9:45 a.m.–12:15 p.m.', '3:15 p.m.–4:45 p.m.', '3:50 p.m.–5:05 p.m.', '10:00 a.m.–12:15 p.m.', '1:50 p.m.–4:20 p.m.', '4:05 p.m.–6:00 p.m.', '8:45 a.m.–10:40 a.m.', '4:45 p.m.–6:05 p.m.', '12:30 p.m.–1:30 p.m.', '3:00 p.m.–5:30 p.

In [62]:
# Number of distinct instructor strings, followed by the strings themselves.
print(len(profs_s))
print(profs_s)

310
['Ensmenger N', 'Horton A', 'Manrique-Vallier D', 'Roberts M', 'Housworth E; Kill B', 'Guan Z', 'Leffel T; Koraganji V', 'Borner K', 'Rawlins G', 'Flammini A; Kemp J', 'Jadhao V', 'Koo J; Housworth E', 'Ali S', 'Emigh W', 'Bahramian H', 'An H', 'Wang X', 'Kemp J; Wang X', 'Peirce Caudell A; Lin L', 'Duncan J', 'Cavar D', 'McCoy C', 'Choi H', 'Rosenbaum H; Kemp J', 'Heidari Khoozani M', 'Marru S', 'Kouper I', 'Nippert-Eng C', 'Radicchi F; Kemp J', 'Leivant D', 'Kapadia A; Kemp J', 'Tiganj Z; Kemp J', 'Khardon R', 'Hahn M', 'Hannon C', 'Bondesson Bolin M', 'Wang X; Kemp J', 'Zhou D', 'Shih P; Kemp J', 'Siek K', 'Shan C', 'Brendel V; Kemp J', 'Ferradal S', 'Kemp J; Wang Y', 'Kavousian S', 'Kemp J; Khardon R', 'Lewis G', 'Fortunato S', 'Nti Asare I; Wild D', 'Francisco M', 'Martell A', 'Valdivia A', 'Steup R', 'Granger R; Housworth E', 'Kaziunas E', 'Kemp J; Tang H', 'Van Gucht D', 'Kemp J; Leake D', 'Tang H; Kemp J', 'Bueckle A', 'Roach A', 'Pierce M', 'Zhang X', 'Day R', 'Raphael C; 