In [None]:
%pip install xmltodict

# Get course data from FS

Documentation of API:
  * <https://www.fellesstudentsystem.no/brukersider/teknisk/fsws-dok/soap/studinfo2/studinfo2.pdf>
  * <https://www.fellesstudentsystem.no/brukersider/teknisk/fsws-dok/rest/studinfo.html>
  
URL to use: `https://fsws.usit.no/fsrest/rest/studinfo/<tjeneste>/<query-parametre>`

In [None]:
import requests
from requests.auth import HTTPBasicAuth
import json
import os

from dotenv import load_dotenv, find_dotenv
import urllib.parse

import pandas as pd
import numpy as np

# For parsing xml and docbook
import xmltodict
import xml.etree.ElementTree as ET

# Find and load the .env file; getData later reads the FS_PASS password from
# the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

## Libraries

In [None]:
import fnmatch
import collections
import re

def find_pattern(dictionary, pattern, path=''):
    """Collect the paths of entries in a nested dictionary that match a glob pattern.

    An entry matches when its key matches ``pattern`` or when its value is a
    string that matches ``pattern`` (``fnmatch`` rules). Nested dictionaries
    are searched recursively, as are dictionaries that sit directly inside a
    list; any other list item is ignored.

    Args:
        dictionary: The (possibly nested) dictionary to search.
        pattern: Wildcard pattern handed to ``fnmatch.fnmatch``.
        path: Path of ``dictionary`` inside the outer structure; filled in by
            the recursive calls.

    Returns:
        List of paths such as ``"a.b[2].c"``, in traversal order.
    """
    hits = []
    for name, node in dictionary.items():
        here = name if not path else f"{path}.{name}"

        if fnmatch.fnmatch(name, pattern) or (
            isinstance(node, str) and fnmatch.fnmatch(node, pattern)
        ):
            hits.append(here)

        if isinstance(node, dict):
            hits += find_pattern(node, pattern, here)
        elif isinstance(node, list):
            # Only dictionaries inside the list are descended into.
            for pos, element in enumerate(node):
                if isinstance(element, dict):
                    hits += find_pattern(element, pattern, f"{here}[{pos}]")
    return hits

def get_structure(dictionary, max_depth):
    """Summarise the shape of a nested dict/list structure.

    Dictionaries and lists are expanded for ``max_depth`` levels; everything
    else, and anything found once the depth budget is used up, is replaced by
    the name of its type.

    Args:
        dictionary: The object to summarise (typically a nested dictionary).
        max_depth: Number of levels to expand before falling back to type names.

    Returns:
        A dict/list skeleton mirroring the input, with type-name strings as leaves.
    """
    if max_depth > 0:
        remaining = max_depth - 1
        if isinstance(dictionary, dict):
            return {name: get_structure(child, remaining) for name, child in dictionary.items()}
        if isinstance(dictionary, list):
            return [get_structure(child, remaining) for child in dictionary]
    # Depth exhausted, or a leaf value: report only the type.
    return type(dictionary).__name__



def convert_docbook_to_markdown(text):
    """Turn the ``<p>``/``<list>``/``<listItem>`` markup of a text into plain Markdown-like text.

    ``<p>`` and ``<list>`` contents are set off by newlines, each ``<listItem>``
    becomes a ``* `` bullet line, and every remaining tag is removed.

    Args:
        text: String containing the markup.

    Returns:
        The text with the markup converted/stripped.
    """
    # (pattern, replacement) pairs, applied in this order.
    block_rules = (
        (r'<p\b[^>]*>(.*?)</p>', r'\n\1\n'),
        (r'<list\b[^>]*>(.*?)</list>', r'\n\1\n'),
        (r'<listItem\b[^>]*>(.*?)</listItem>', r'* \1\n'),
    )
    for pattern, replacement in block_rules:
        # DOTALL lets "." cross line breaks; without it an element whose content
        # wraps over several lines is never matched and loses its formatting.
        text = re.sub(pattern, replacement, text, flags=re.DOTALL)

    # Drop whatever tags are left.
    return re.sub(r'<[^>]+>', '', text)




## Config

Organizations to query. Each entry gives the FSWS username (`name`) and the institution number (`instnr`) for one organization.

In [None]:
# One entry per organization. The fetch loop further down passes "name" to
# getData as the username and "instnr" as the institution number.
orgs = [
    {"name": "siktai_aho", "instnr": 189},
    {"name": "siktai_ath", "instnr": 255},
    {"name": "siktai_bdm", "instnr": 1526},
    {"name": "siktai_dmmh", "instnr": 253},
    {"name": "siktai_fih", "instnr": 258},
    {"name": "siktai_him", "instnr": 211},
    {"name": "siktai_hinn", "instnr": 209},
    {"name": "siktai_hiø", "instnr": 224},
    {"name": "siktai_hvl", "instnr": 203},
    {"name": "siktai_hvo", "instnr": 223},
    {"name": "siktai_khio", "instnr": 260},
    {"name": "siktai_krus", "instnr": 1661},
    {"name": "siktai_ldh", "instnr": 230},
    {"name": "siktai_mf", "instnr": 190},
    {"name": "siktai_mil", "instnr": 1627},
    {"name": "siktai_nhh", "instnr": 191},
    {"name": "siktai_nih", "instnr": 150},
    {"name": "siktai_nla", "instnr": 254},
    {"name": "siktai_nmbu", "instnr": 192},
    {"name": "siktai_nmh", "instnr": 178},
    {"name": "siktai_nord", "instnr": 204},
    {"name": "siktai_ntnu", "instnr": 194},
    {"name": "siktai_nuc", "instnr": 259},
    {"name": "siktai_oslomet", "instnr": 215},
    {"name": "siktai_phs", "instnr": 233},
    {"name": "siktai_sash", "instnr": 231},
    {"name": "siktai_uia", "instnr": 201},
    {"name": "siktai_uib", "instnr": 184},
    {"name": "siktai_uio", "instnr": 185},
    {"name": "siktai_uis", "instnr": 217},
    {"name": "siktai_uit", "instnr": 186},
    {"name": "siktai_unis", "instnr": 195},
    {"name": "siktai_usn", "instnr": 222},
    {"name": "siktai_vid", "instnr": 251},
]

In [None]:
# Sanity check: number of configured organizations.
len(orgs)

In [None]:
def parseEmner(emner, orgname, aar):
    """Flatten parsed course ("emne") records into a DataFrame, one row per course.

    Args:
        emner: The parsed ``emne`` element(s) as produced by ``xmltodict``: a
            list of dicts, a single dict when the response holds exactly one
            course, or ``None``.
        orgname: Stored in the ``iname`` column of every row.
        aar: Stored in the ``aar`` column of every row.

    Returns:
        pandas.DataFrame with the columns ``inst``, ``iname``, ``aar``,
        ``emneid``, ``studiepoeng``, ``studienivakode``, ``nuskode`` and
        ``sprak``, plus one ``desc-<infotype>`` column for every infotype that
        carries a string ``infotekst`` and is not in the skip list. Records
        that are not dicts, lack ``infotyper`` or have a malformed ``emneid``
        are skipped; the frame is empty when nothing usable is left.
    """
    # Infotypes that are deliberately not turned into description columns.
    skipped_infotyper = {
        'emneansvar', 'sensorordning', 'undform', 'hjelpemidler',
        'vurderingsuttrykk', 'overlapp', 'opptak', 'fagplan', 'arbeidskrav',
        'eksamen',
    }

    if emner is None:
        emner = []
    elif isinstance(emner, dict):
        # xmltodict yields a bare dict instead of a one-element list when the
        # parent holds a single <emne>; iterating that dict would walk its keys
        # and silently drop the course.
        emner = [emner]

    res = []
    for emne in emner:
        if not isinstance(emne, dict):
            print(f"Unexpected value for emne {emne}")
            continue
        if 'infotyper' not in emne:
            continue
        emneid = emne.get('emneid', {})
        if not isinstance(emneid, dict):
            # Formatted rather than concatenated: the value may be None or a list.
            print(f"Unexpected value for emneid {emneid!r}")
            continue

        new_item = {
            'inst': emneid.get('Institusjonsnr', np.nan),
            'iname': orgname,
            'aar': aar,
            'emneid': emneid.get('Emnekode', np.nan),
            'studiepoeng': emne.get('studiepoeng', np.nan),
            'studienivakode': emne.get('studienivakode', np.nan),
            'nuskode': emne.get('nuskode', np.nan),
            'sprak': emne.get('@sprak', np.nan)
        }

        infotyper = emne['infotyper']
        # An empty <infotyper/> element parses to None; the course row is still
        # kept, just without description columns.
        if isinstance(infotyper, dict):
            for infotype, info in infotyper.items():
                if infotype in skipped_infotyper:
                    continue
                if not isinstance(info, dict):
                    continue
                infotekst = info.get('infotekst')
                if not isinstance(infotekst, str):
                    continue
                new_item['desc-' + infotype] = convert_docbook_to_markdown(infotekst)
        res.append(new_item)

    return pd.DataFrame(res)


In [None]:
def getData(username, instnr, aar, timeout=60):
    """Fetch the courses of one institution and year from FSWS as a DataFrame.

    Sends a GET request to the ``studinfo/emne`` REST service with HTTP basic
    auth (``username`` plus the password from the ``FS_PASS`` environment
    variable), parses the XML reply with ``xmltodict`` and hands the ``emne``
    entries to ``parseEmner``.

    Args:
        username: User name for basic auth; also passed to ``parseEmner`` as
            the organization name.
        instnr: Sent as the ``institusjonsnr`` query parameter.
        aar: Sent as the ``arstall`` query parameter and passed to ``parseEmner``.
        timeout: Timeout in seconds given to ``requests.get`` so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The DataFrame built by ``parseEmner``, or ``None`` when ``FS_PASS`` is
        not set, the server answers with a non-OK status, or the reply holds
        no ``fs-studieinfo``/``emne`` data.

    Raises:
        requests.RequestException: Connection errors and timeouts propagate to
            the caller.
    """
    password = os.environ.get("FS_PASS")
    if not password:
        # Without this guard the request would be sent with a bogus password
        # and come back as an opaque HTTP error.
        print("FS_PASS is not set; skipping request for", username)
        return None

    baseURL = "https://fsws.usit.no/fsrest/rest/studinfo/emne/"

    # FSWS Query
    query = {
        'institusjonsnr': instnr,
        'faknr': '-1',
        'instituttnr': '-1',
        'gruppenr': '-1',
        'arstall': aar,
        'terminkode': 'STÅR',
        'sprak': 'B',
        # can be B, N or E
    }
    # Let requests build and encode the query string.
    response = requests.get(
        baseURL,
        params=query,
        auth=(username, password),
        timeout=timeout,
    )
    if response.status_code != requests.codes.ok:
        print("HTTP Error:", response.status_code)
        print("Error Body:", response.text)
        return None

    data_dict = xmltodict.parse(response.text)
    # An empty <fs-studieinfo/> element parses to None, so normalise before
    # looking for the course entries.
    studieinfo = data_dict.get('fs-studieinfo')
    if not isinstance(studieinfo, dict) or 'emne' not in studieinfo:
        return None
    return parseEmner(studieinfo['emne'], username, aar)

In [None]:
# Upper bound on the number of FSWS requests this exploratory cell makes.
MAX_REQUESTS = 49

# Newest year first, every organization within each year, cut off after
# MAX_REQUESTS (organization, year) pairs. Building the plan up front stops
# the loop outright at the cap; the previous counter/break only left the
# inner loop and kept stepping through the remaining years without doing work.
fetch_plan = [
    (org, aar)
    for aar in range(2023, 1980, -1)
    for org in orgs
][:MAX_REQUESTS]

for org, aar in fetch_plan:
    print(f"Processing {org['name']} {org['instnr']} for {aar}")
    emner = getData(org['name'], org['instnr'], aar)
    display(emner)

In [None]:
# parseEmner needs (emner, orgname, aar) and the parsed response only exists
# inside getData, so go through getData to build the DataFrame that the cells
# below inspect. The first configured organization and 2023 are used as the
# sample; getData returns None if the request fails.
sample_org = orgs[0]
emnr = getData(sample_org['name'], sample_org['instnr'], 2023)
emnr

In [None]:
from IPython.display import display, Markdown



# For the first ten rows of emnr: render a Markdown horizontal rule, then print
# the row's 'desc-utbytte' text as plain text.
for index, row in emnr.head(10).iterrows():
    #print('-----: ' + row['emneid'])
    display(Markdown('-----'))
    print(row['desc-utbytte'])
    # Alternative kept for reference: render the text as Markdown instead of printing it.
    #display(Markdown(convert_docbook_to_markdown(row['desc-utbytte'])))

In [None]:
# Number of missing (NaN) values in each column of emnr.
na_counts = emnr.isna().sum()

print(na_counts)

In [None]:
# Keep only the rows whose 'infotyper' value is not null.
# NOTE(review): parseEmner as defined in this notebook does not create an
# 'infotyper' column (it writes 'desc-<infotype>' columns instead) -- confirm
# the intended column before using filtered_df.
filtered_df = emnr[emnr['infotyper'].notna()]

In [None]:
# Assuming 'emnr' is your DataFrame
# NOTE(review): parseEmner as defined in this notebook never emits an
# 'infotyper' column, so the emnr['infotyper'] lookups below look like
# leftovers from an earlier version -- confirm before relying on this cell.
#for index, row in emnr.iterrows():
#    infotyper_value = row['infotyper']

print(emnr['infotyper'])
# The return value of this call is not used.
get_structure(emnr, 10)
# Expand the first 'infotyper' entry into its own DataFrame for inspection.
x = pd.DataFrame(emnr['infotyper'][0])
x

In [None]:
# Display the emnr DataFrame.
emnr

In [None]:
# Show the structure of data_dict, expanded four levels deep.
# NOTE(review): in the cells visible here data_dict is only assigned inside
# getData, so at notebook scope it is presumably left over from an earlier
# interactive session -- confirm where it should come from.
x = get_structure(data_dict, 4)
x