# Comparison of University Curricula

In [80]:
import pandas as pd
from typing import List, Dict
from collections import defaultdict


def load_curriculums(curriculum_path: str, universities: List[str]) -> Dict[str, pd.DataFrame]:
    """Load one curriculum sheet per university from a single Excel workbook.

    Args:
        curriculum_path: Path of the workbook to read.
        universities: Sheet names to load; each name becomes a key of the result.

    Returns:
        Mapping of sheet name to the sheet's contents, with every column read
        as ``object`` dtype.
    """
    # Open the workbook once and reuse the handle for every sheet, instead of
    # letting ``read_excel`` reopen and reparse the file for each university.
    with pd.ExcelFile(curriculum_path) as workbook:
        return {
            university: pd.read_excel(workbook, sheet_name=university, dtype='object')
            for university in universities
        }


def compare_curriculums(
        curriculums: Dict[str, pd.DataFrame],
        curriculum_columns: List[str],
        course_priority: pd.DataFrame,
) -> pd.DataFrame:
    """Align the curriculums of several universities side by side by priority.

    Every curriculum is joined with ``course_priority`` on ``kod`` to obtain a
    ``priorita`` value per row. For each priority, the rows of every university
    are listed next to each other; universities with fewer rows for that
    priority are padded with ``None`` so all columns have the same length.

    The inputs are not modified: neither the ``curriculums`` dict, nor the
    frames it holds, nor ``course_priority``.

    Args:
        curriculums: Mapping of university name to its curriculum. Each frame
            must contain ``kod`` and every column in ``curriculum_columns``.
        curriculum_columns: Curriculum columns to carry into the result.
        course_priority: Lookup frame with at least ``kod`` and ``priorita``.

    Returns:
        A frame with a ``priorita`` column followed by one
        ``{university}_{column}`` column per university and requested column.
        Priorities appear in the order they first occur in ``course_priority``.
        Cell values are converted with ``astype(str)``; padding cells are
        ``None``.

    Raises:
        KeyError: If a required column is missing from an input frame.
    """
    # Work on a de-duplicated (kod, priorita) lookup; the caller's frame is left as is.
    priority_lookup = course_priority[['kod', 'priorita']].drop_duplicates()
    sort_columns = ['priorita'] + curriculum_columns

    # Build the prepared frames in a local dict so the caller's mapping keeps
    # its original frames (including 'kod') and the function can be re-run.
    prepared = {}
    for university, curriculum in curriculums.items():
        prepared[university] = (
            pd.merge(curriculum, priority_lookup, on='kod', how='left')  # add course priority
            .drop(columns=['kod'])
            .drop_duplicates(subset=sort_columns)  # deduplicate records
            .sort_values(by=sort_columns)
        )

    # merged curriculums
    data = defaultdict(list)
    for priority in priority_lookup['priorita'].unique():
        # Select each university's rows for this priority once and reuse the
        # selection for both the row count and the values. A NaN priority
        # matches no rows, so it simply contributes nothing.
        rows_by_university = {
            university: curriculum.loc[curriculum['priorita'] == priority]
            for university, curriculum in prepared.items()
        }
        # Height of this priority's section = the longest university listing;
        # default=0 keeps an empty ``curriculums`` mapping from raising.
        max_courses = max((len(rows) for rows in rows_by_university.values()), default=0)

        data['priorita'].extend([priority] * max_courses)
        for university, rows in rows_by_university.items():
            for column in curriculum_columns:
                values = rows[column].astype(str).tolist()
                # Pad shorter listings so every output column stays aligned.
                values += [None] * (max_courses - len(values))
                data[f'{university}_{column}'].extend(values)

    return pd.DataFrame(data)


# Workbook that holds one sheet per university plus the 'kod_dim' lookup sheet;
# the merged comparison is written back into the same workbook.
curriculum_path = '../data/curriculums.xlsx'
curriculum_columns = ['volba', 'predmet']    # optionally also: 'rocnik', 'semester'
universities = ['stu_fei', 'vut_fit', 'ukf_fai', 'utb_fai', 'vut_fsi', 'vut_fekt']
course_priority = pd.read_excel(curriculum_path, sheet_name='kod_dim')

curriculums = load_curriculums(
    universities=universities,
    curriculum_path=curriculum_path
)

df = compare_curriculums(
    curriculums=curriculums,
    curriculum_columns=curriculum_columns,
    course_priority=course_priority,
)

# Append to the source workbook via ``curriculum_path`` so the input and output
# locations cannot drift apart. Append mode needs the openpyxl engine (the
# default becomes xlsxwriter when that package is installed, and it cannot
# append), and 'replace' lets the cell be re-run without failing because the
# 'merged_by_priorita' sheet already exists.
with pd.ExcelWriter(curriculum_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    df.to_excel(writer, sheet_name='merged_by_priorita', index=False)