In [1]:
%pip install xmltodict

Note: you may need to restart the kernel to use updated packages.


# Get course data from FS

Documentation of API:
  * <https://www.fellesstudentsystem.no/brukersider/teknisk/fsws-dok/soap/studinfo2/studinfo2.pdf>
  * <https://www.fellesstudentsystem.no/brukersider/teknisk/fsws-dok/rest/studinfo.html>
  
URL to use: `https://fsws.usit.no/fsrest/rest/studinfo/<tjeneste>/<query-parametre>`

In [54]:
import requests
from requests.auth import HTTPBasicAuth
import json
import os
import time


from dotenv import load_dotenv, find_dotenv
import urllib.parse

import pandas as pd
import numpy as np

# For parsing xml and docbook
import xmltodict
import xml.etree.ElementTree as ET

# Find and load the .env file
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

## Libraries

In [55]:
import fnmatch
import collections
import re

def find_pattern(dictionary, pattern, path=''):
    """Find keys or values in nested dictionary that match a given pattern."""
    matches = []
    for key, value in dictionary.items():
        new_path = f"{path}.{key}" if path else key
        if fnmatch.fnmatch(key, pattern) or (isinstance(value, str) and fnmatch.fnmatch(value, pattern)):
            matches.append(new_path)
        if isinstance(value, dict):
            matches.extend(find_pattern(value, pattern, new_path))
        elif isinstance(value, list):
            for i, item in enumerate(value):
                if isinstance(item, dict):
                    matches.extend(find_pattern(item, pattern, f"{new_path}[{i}]"))
    return matches

def get_structure(dictionary, max_depth):
    """Get the structure of a nested dictionary up to a certain depth."""
    if max_depth <= 0:
        return type(dictionary).__name__
    elif isinstance(dictionary, dict):
        return {key: get_structure(value, max_depth - 1) for key, value in dictionary.items()}
    elif isinstance(dictionary, list):
        return [get_structure(item, max_depth - 1) for item in dictionary]
    else:
        return type(dictionary).__name__



def convert_docbook_to_markdown(text):
    # Convert <p> tags
    text = re.sub(r'<p\b[^>]*>(.*?)</p>', r'\n\1\n', text)

    # Convert <list> tags
    text = re.sub(r'<list\b[^>]*>(.*?)</list>', r'\n\1\n', text)

    # Convert <listItem> tags
    text = re.sub(r'<listItem\b[^>]*>(.*?)</listItem>', r'* \1\n', text)

    text = re.sub(r'<[^>]+>', '', text)

    return text




## Config

Configuration of organizations with username and org nr

In [56]:
orgs = [
#    {"name": "siktai_hiø", "instnr": 224, "key": "hioe"}   - usikker på denne med ø i brukernavn.
    {"name": "siktai_ldh", "instnr": 230, "key": "ldh"},
    {"name": "siktai_mf", "instnr": 190, "key": "mf"},
    {"name": "siktai_mil", "instnr": 1627, "key": "mil"},
    {"name": "siktai_nih", "instnr": 150, "key": "nih"},
    {"name": "siktai_nla", "instnr": 254, "key": "nla"},
    {"name": "siktai_nmbu", "instnr": 192, "key": "nmbu"},
    {"name": "siktai_nmh", "instnr": 178, "key": "nmh"},
    {"name": "siktai_nuc", "instnr": 259, "key": "nuc"},
    {"name": "siktai_oslomet", "instnr": 215, "key": "oslomet"},
    {"name": "siktai_phs", "instnr": 233, "key": "phs"},
    {"name": "siktai_sash", "instnr": 231, "key": "sash"},
    {"name": "siktai_unis", "instnr": 195, "key": "unis"},
    {"name": "siktai_usn", "instnr": 222, "key": "usn"},
    {"name": "siktai_vid", "instnr": 251, "key": "vid"},
    {"name": "siktai_krus", "instnr": 1661, "key": "krus"},# No data
    {"name": "siktai_hvo", "instnr": 223, "key": "hvo"},# No data
    {"name": "siktai_hinn", "instnr": 209, "key": "hinn"},# No data
    {"name": "siktai_fih", "instnr": 258, "key": "fih"},# No data
    {"name": "siktai_bdm", "instnr": 1526, "key": "bdm"},# No data
    {"name": "siktai_ath", "instnr": 255, "key": "ath"},# No data
    {"name": "siktai_aho", "instnr": 189, "key": "aho"},# No data
    {"name": "siktai_khio", "instnr": 260, "key": "khio"}, # No data
    {"name": "siktai_ntnu", "instnr": 194, "key": "ntnu"},# Having troubles
    {"name": "siktai_nord", "instnr": 204, "key": "nord"},# Having troubles
    {"name": "siktai_uib", "instnr": 184, "key": "uib"}, # Having troubles
    {"name": "siktai_uit", "instnr": 186, "key": "uit"}, # Having troubles
    {"name": "siktai_dmmh", "instnr": 253, "key": "dmmh"},# Done
    {"name": "siktai_him", "instnr": 211, "key": "him"}, # Done
    {"name": "siktai_hvl", "instnr": 203, "key": "hvl"}, # Done
    {"name": "siktai_uio", "instnr": 185, "key": "uio"}, # Done
    {"name": "siktai_uis", "instnr": 217, "key": "uis"}, # Done
    {"name": "siktai_nhh", "instnr": 191, "key": "nhh"}, # Done
    {"name": "siktai_uia", "instnr": 201, "key": "uia"}, # Done
]


In [57]:
len(orgs)

33

In [58]:
def parseEmner(emner, orgname, aar):
    res = []
    columns_to_include = ['infotyper']
    for emne in emner:
        if isinstance(emne, str):
            print("Unwexpected value for emne " + emne)
            continue
        if 'infotyper' not in emne:
            continue
        if not isinstance(emne.get('emneid', {}), dict):
            print("Unwexpected value for emneid " + emne.get('emneid', {}))
            continue
        new_item = {
            'inst': emne.get('emneid', {}).get('Institusjonsnr', np.nan),
            'iname': orgname,
            'aar': aar,
            'emneid': emne.get('emneid', {}).get('Emnekode', np.nan),
            'studiepoeng': emne.get('studiepoeng', np.nan),
            'studienivakode': emne.get('studienivakode', np.nan),
            'nuskode': emne.get('nuskode', np.nan),
            'sprak': emne.get('@sprak', np.nan)
        }
        for ii in emne['infotyper']:
            if ii in ['emneansvar', 'sensorordning', 'undform', 'hjelpemidler', 'vurderingsuttrykk', 'overlapp', 'opptak', 'fagplan', 'arbeidskrav', 'eksamen']:
                continue
            if 'infotekst' not in emne['infotyper'][ii]:
                continue
            if not isinstance(emne['infotyper'][ii]['infotekst'], str):
                continue
            new_item['desc-' + ii] = convert_docbook_to_markdown(emne['infotyper'][ii]['infotekst'])
            #new_item['desc-' + ii] = emne['infotyper'][ii]['infotekst']
        res.append(new_item)

    return pd.DataFrame(res)


In [59]:
def getData(username, instnr, aar, spraak):
    password = os.environ.get("FS_PASS")
    auth_details = (username, password)
    baseURL = "https://fsws.usit.no/fsrest/rest/studinfo/emne/"

    # FSWS Query
    query = {
        'institusjonsnr': instnr,
        'faknr': '-1',
        'instituttnr': '-1',
        'gruppenr': '-1',
        'arstall': aar,
        'terminkode': 'STÅR',
        'sprak': spraak, 
        # can be B, N or E
    }
    url = baseURL + "?" + urllib.parse.urlencode(query)
    print(f"Getting data from {url}")
          
    response = requests.get(baseURL + "?" + urllib.parse.urlencode(query), auth=auth_details)
    if response.status_code == requests.codes.ok:
        data_dict = xmltodict.parse(response.text)
        if 'fs-studieinfo' in data_dict:
            if 'emne' in data_dict['fs-studieinfo']:
                return parseEmner(data_dict['fs-studieinfo']['emne'], username, aar)
    else:
        print("HTTP Error:", response.status_code)
        print("Error Body:", response.text)
    return None

In [None]:
i = 0
for org in orgs:
    emner = pd.DataFrame()
    try:
        for aar in range(2022, 2002, -1):
            for spraak in ["B", "N", "E"]:
                i += 1
                if i >= 120:
                    break
                start_time = time.time()
                print(f"Processing {org['name']} {org['instnr']} for {aar} and lang {spraak}")
                try:
                    nye_emner = getData(org['name'], org['instnr'], aar, spraak)
                    emner = pd.concat([emner, nye_emner], ignore_index=True)
                    runtime = time.time() - start_time
                    print(f" Runtime: {runtime:.1f} seconds. Rows received {len(nye_emner)}")
                except Exception as e:
                    print(f"An exception occurred when processing {org['name']} {org['instnr']} for {aar} and lang {spraak}: {str(e)}")
                    continue
                # display(nye_emner)
    except Exception as e:
        print(f"An exception occurred for organization {org['name']} {org['instnr']}: {str(e)}")
        continue
    emner.to_parquet("data2/" + org['key'] + ".parquet")
display(emner)


Processing siktai_krus 1661 for 2022 and lang B
Getting data from https://fsws.usit.no/fsrest/rest/studinfo/emne/?institusjonsnr=1661&faknr=-1&instituttnr=-1&gruppenr=-1&arstall=2022&terminkode=ST%C3%85R&sprak=B
An exception occurred when processing siktai_krus 1661 for 2022 and lang B: object of type 'NoneType' has no len()
Processing siktai_krus 1661 for 2022 and lang N
Getting data from https://fsws.usit.no/fsrest/rest/studinfo/emne/?institusjonsnr=1661&faknr=-1&instituttnr=-1&gruppenr=-1&arstall=2022&terminkode=ST%C3%85R&sprak=N
An exception occurred when processing siktai_krus 1661 for 2022 and lang N: object of type 'NoneType' has no len()
Processing siktai_krus 1661 for 2022 and lang E
Getting data from https://fsws.usit.no/fsrest/rest/studinfo/emne/?institusjonsnr=1661&faknr=-1&instituttnr=-1&gruppenr=-1&arstall=2022&terminkode=ST%C3%85R&sprak=E
An exception occurred when processing siktai_krus 1661 for 2022 and lang E: object of type 'NoneType' has no len()
Processing siktai_k

In [None]:
emner

In [None]:
emnr = parseEmner(data_dict['fs-studieinfo']['emne'])
emnr
#emnr[emnr['desc-innhold'].notna()]
#len(emnr)

In [None]:
from IPython.display import display, Markdown



# Assuming 'emnr' is the name of your dataframe
for index, row in emnr.head(10).iterrows():
    #print('-----: ' + row['emneid'])
    display(Markdown('-----'))
    print(row['desc-utbytte'])
    #display(Markdown(convert_docbook_to_markdown(row['desc-utbytte'])))

In [None]:
na_counts = emnr.isna().sum()

print(na_counts)

In [None]:
filtered_df = emnr[emnr['infotyper'].notna()]

In [None]:
# Assuming 'emnr' is your DataFrame
#for index, row in emnr.iterrows():
#    infotyper_value = row['infotyper']

print(emnr['infotyper'])
get_structure(emnr, 10)
x = pd.DataFrame(emnr['infotyper'][0])
x

In [None]:
emnr

In [None]:
x = get_structure(data_dict, 4)
x