In [15]:
import os
import pandas as pd

In [6]:
# os.walk yields (dirpath, dirnames, filenames) tuples; only the first tuple,
# i.e. the listing of "./ALTO" itself, is consumed here.
# NOTE: despite the short names, `f` receives the sub-directory names and `d`
# the plain file names found directly under "./ALTO".
r,f,d = next(os.walk("./ALTO"))

In [11]:
# Map every top-level ALTO folder name to the name of a sub-folder inside it.
# Later cells store this sub-folder name in the 'sesamid' column and send it to
# the catalogue API.
idmap = {}
for name in f:
    # Index 1 of the os.walk tuple is the list of immediate sub-directories.
    sf = next(os.walk(os.path.join(r, name)))[1]
    if not sf:
        # Nothing to look up for this folder; skipping it avoids an IndexError
        # that would otherwise abort the whole scan.
        print(f"No sub-folder found in {os.path.join(r, name)}; skipped")
        continue
    # os.walk lists entries in arbitrary order, so pick the smallest name to
    # keep the choice reproducible if a folder ever holds several sub-folders.
    idmap[name] = min(sf)

In [20]:
import requests

In [34]:
def get_urn(sesamid, timeout=30):
    """Fetch the "struct" metadata document for one catalogue item.

    Despite the name, this returns the raw response body of the
    ``/metadata/{sesamid}/struct`` endpoint; the URN still has to be extracted
    from that text by the caller.

    Args:
        sesamid: Identifier inserted into the API path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (the status code or the connection error is printed).
    """
    url = f"https://api.nb.no/catalog/v1/metadata/{sesamid}/struct"
    try:
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network problems like a non-200 answer so that one bad lookup
        # does not abort a whole DataFrame.apply run.
        print(f"Request failed for {sesamid}: {exc}")
        return None
    if res.status_code == 200:
        return res.text
    print(res.status_code)
    return None

In [35]:
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd

def extract_resource_info(xml_content: str) -> dict:
    """Extract the URN base and scan attributes from a structMap XML document.

    The first ``resource`` element that is a direct child of a ``div`` element
    (in any namespace) is inspected. Its ``xlink:href`` attribute is taken as
    the URN, and everything after the last underscore (the page suffix) is
    stripped to obtain the URN base.

    Args:
        xml_content: XML text to parse. ``None`` or an empty string is accepted
            and treated as "no document", e.g. after a failed download.

    Returns:
        A dict with the keys ``'urn_base'``, ``'scan_resolution'``,
        ``'height'`` and ``'width'``. Values are the attribute strings found in
        the XML, or ``None`` when the input is missing, cannot be parsed, or
        does not contain the element/attribute.
    """
    info = {
        'urn_base': None,
        'scan_resolution': None,
        'height': None,
        'width': None
    }

    # No document at all: return the empty result quietly instead of reporting
    # a misleading XML parsing error.
    if not xml_content:
        return info

    try:
        root = ET.fromstring(xml_content)
    except Exception as e:
        # Best effort: a malformed document yields the empty result.
        print(f"Error processing XML: {str(e)}")
        return info

    # '{*}' matches the tag in any (or no) namespace.
    resource = root.find('.//{*}div/{*}resource')
    if resource is None:
        return info

    # Drop the trailing "_<page>" part of the href to get the URN base.
    urn = resource.get('{http://www.w3.org/1999/xlink}href', '')
    info['urn_base'] = urn.rsplit('_', 1)[0] if urn else None
    info['scan_resolution'] = resource.get('SCANRESOLUTION')
    info['height'] = resource.get('HEIGHT')
    info['width'] = resource.get('WIDTH')
    return info

In [39]:
# Convenience wrapper: sesamid -> URN base
def urn_base(sesamid):
    """Return the 'urn_base' entry extracted from the struct document of ``sesamid``.

    The document is fetched with ``get_urn`` and parsed with
    ``extract_resource_info``; the result is ``None`` when no URN was found.
    """
    return extract_resource_info(get_urn(sesamid))['urn_base']

In [41]:
# Turn the folder -> sub-folder mapping into a table: the dict keys end up in
# the 'index' column (after reset_index) and the values in 'sesamid'.
df_ids = (
    pd.DataFrame
    .from_dict(idmap, orient='index', columns=['sesamid'])
    .reset_index()
)

In [43]:
# Look up the URN base for every sesamid (one catalogue API request per row).
df_ids['urn'] = df_ids['sesamid'].map(urn_base)

In [45]:
# Build the item URL for each URN. Rows whose URN is missing (failed lookup)
# get no URL instead of a bogus "https://nb.no/items/None" link.
df_ids['url'] = df_ids.urn.apply(
    lambda x: f"https://nb.no/items/{x}" if pd.notna(x) else None
)

In [47]:
# Display the id table through its pandas Styler object.
df_ids.style

Unnamed: 0,index,sesamid,urn,url
0,1581443,c2910cf5e44d09b996c62a1f4417fb57,URN:NBN:no-nb_digimanus_279683,https://nb.no/items/URN:NBN:no-nb_digimanus_279683
1,1594278,6b9108fc26177727be1ecf382c988b16,URN:NBN:no-nb_digimanus_279883,https://nb.no/items/URN:NBN:no-nb_digimanus_279883
2,1579385,e8ceee63c94b380060d42d52cefb26ee,URN:NBN:no-nb_digimanus_279697,https://nb.no/items/URN:NBN:no-nb_digimanus_279697
3,1563964,5f27d5b9ffdef6adf0e3a80d439f2d6c,URN:NBN:no-nb_digimanus_280121,https://nb.no/items/URN:NBN:no-nb_digimanus_280121
4,1558161,92285008828e0e5cb8eb002b7f84c421,URN:NBN:no-nb_digimanus_279817,https://nb.no/items/URN:NBN:no-nb_digimanus_279817
5,3971423,9faba46158e1c7f87ef07d235a12b0df,URN:NBN:no-nb_digimanus_279923,https://nb.no/items/URN:NBN:no-nb_digimanus_279923
6,1587674,42480c1a3c07b4adbff5438a13afa8a1,URN:NBN:no-nb_digimanus_280336,https://nb.no/items/URN:NBN:no-nb_digimanus_280336
7,1594283,136c4f5e3f4e3062f23ef71a48df293a,URN:NBN:no-nb_digimanus_279749,https://nb.no/items/URN:NBN:no-nb_digimanus_279749
8,1594255,1a64cb0fe040c09a01cf7b3c6f384705,URN:NBN:no-nb_digimanus_279698,https://nb.no/items/URN:NBN:no-nb_digimanus_279698
9,3971323,d7588fb66077bb4d852cf97d67795d64,URN:NBN:no-nb_digimanus_280402,https://nb.no/items/URN:NBN:no-nb_digimanus_280402


In [49]:
# Load metadata.xlsx; the first spreadsheet column is used as the row index.
metadata = pd.read_excel("metadata.xlsx", index_col=0)

In [53]:
# Derive the document id: the second '_'-separated part of the file name.
# The vectorised .str accessor yields a missing value (instead of raising) for
# empty cells and for names that contain no underscore.
metadata['doc_id'] = metadata['filename'].str.split('_').str[1]

In [55]:
# Attach sesamid / urn / url to the metadata rows by matching 'doc_id' against
# the folder name stored in df_ids['index']. This is an inner join, so rows
# without a match on either side are dropped.
df_metadata = pd.merge(
    metadata,
    df_ids,
    how='inner',
    left_on='doc_id',
    right_on='index',
)

In [58]:
# Make 'year' a nullable integer column: values that cannot be parsed as a
# number become <NA> rather than raising.
df_metadata['year'] = (
    pd.to_numeric(df_metadata['year'], errors='coerce')
    .astype(pd.Int64Dtype())
)

In [61]:
# Write the selected metadata and link columns to metadata_links.xlsx
# (the row index is not written).
df_metadata.loc[
    :, "title author year acts genre doc_id sesamid urn url".split()
].to_excel("metadata_links.xlsx", index=False)

In [65]:
# Load Karakteranalyse.xlsx; the first spreadsheet column is used as the row index.
characters = pd.read_excel("Karakteranalyse.xlsx", index_col = 0)

In [67]:
# Derive the document id: the second '_'-separated part of the 'title' value.
# The vectorised .str accessor yields a missing value (instead of raising) for
# empty cells and for titles that contain no underscore.
characters['doc_id'] = characters['title'].str.split('_').str[1]

In [69]:
# Attach sesamid / urn / url to every character row by matching 'doc_id'
# against the folder name stored in df_ids['index'] (inner join).
df_characters = pd.merge(
    characters,
    df_ids,
    how='inner',
    left_on="doc_id",
    right_on='index',
)

In [73]:
# Inspect which columns the merged character table contains.
df_characters.columns

Index(['title', 'name', 'gender', 'status', 'description', 'doc_id', 'index',
       'sesamid', 'urn', 'url'],
      dtype='object')

In [75]:
import urllib.parse

In [77]:
# Check the dtype of the 'name' column before it is used to build search URLs.
print("Data type of 'name' column:", df_characters['name'].dtype)


Data type of 'name' column: object


In [89]:
# Build a full-text search link per character: the item URL followed by a
# searchText query holding the percent-encoded name in double quotes.
# Only the two columns that are needed are read. Missing names become an empty
# search phrase, names are passed through str() so that non-text cells do not
# make urllib.parse.quote raise, and rows without an item URL get no link.
df_characters['search_url'] = [
    f"{url}?searchText=\"{urllib.parse.quote(str(name))}\"" if pd.notna(url) else None
    for url, name in zip(df_characters['url'], df_characters['name'].fillna(''))
]

In [93]:
# Preview the columns that are exported to the characters link sheet.
df_characters.loc[:, ["name", "gender", "status", "doc_id", "search_url"]]

Unnamed: 0,name,gender,status,doc_id,search_url
0,Sigrid,F,,3971483,https://nb.no/items/URN:NBN:no-nb_digimanus_28...
1,Ragnhild,F,,3971483,https://nb.no/items/URN:NBN:no-nb_digimanus_28...
2,Asmund,M,,3971483,https://nb.no/items/URN:NBN:no-nb_digimanus_28...
3,Nordal,M,,3971483,https://nb.no/items/URN:NBN:no-nb_digimanus_28...
4,Steenby,M,Studenter,3971483,https://nb.no/items/URN:NBN:no-nb_digimanus_28...
...,...,...,...,...,...
569,Johan,M,hans Bontignedring,1563988,https://nb.no/items/URN:NBN:no-nb_digimanus_27...
570,Joseph,M,Falkenskjolds Tjener,1563988,https://nb.no/items/URN:NBN:no-nb_digimanus_27...
571,Berg,M,ham en Hofmand,1563988,https://nb.no/items/URN:NBN:no-nb_digimanus_27...
572,Præsidenten,M,i høieste Ret,1563988,https://nb.no/items/URN:NBN:no-nb_digimanus_27...


In [92]:
# Write the character table with its search links to characters_link.xlsx
# (the row index is not written).
df_characters.loc[
    :, ["name", "gender", "status", "doc_id", "search_url"]
].to_excel("characters_link.xlsx", index=False)