In [13]:
import requests 
from bs4 import BeautifulSoup
import sys
import re

base_URL = "https://authority.bibsys.no/authority/rest"


def get_authority(identifier):
    res = requests.get( f"{base_URL}/authorities/v2/{identifier}")
    result = ""
    if res.status_code == 200:
        result = res.json()
    gender = ''
    life = ''
    name = ''
    for x in result['marcdata']:
        if '375' == x['tag']:
            if 'subfields' in x:
                sub = x['subfields']
                for s in sub:
                    if 'subcode' in s and 'value' in s:
                        if s['subcode'] == 'a':
                            gender = s['value']
        if '100' == x['tag']:
            if 'subfields' in x:
                sub = x['subfields']
                (b,d) = ('','')
                for s in sub:
                    if 'subcode' in s and 'value' in s:
                        if s['subcode'] == 'a':
                            name = s['value']
                        if s['subcode'] == 'd':
                            life = s['value']
                            x = life.split('-')
                            if len(x) == 2:
                                (b,d) = x    
    returnvalue = (name, gender, b,d, identifier)
    return returnvalue

def get_author(name, category = "person"):
    res = requests.get(f"https://api.nb.no/catalog/v1/authority/authorityid?size=20&authorityName={name}")
    if res.status_code != 200:
        return {}
    result = res.json()
    returnvalue = []
    for entry in result['content']:
        is_cat = entry['category'] == category
        has_id = entry['id'].startswith('bibsys')
        #is_quality = entry['qualityLevel'] == 'kat3' or entry['qualityLevel'] == 'kat2'
        if is_cat and has_id :
            entrycode = {
                "authorityid": entry['id'].split(':')[-1],
                "name": entry["primaryName"]
            }
            returnvalue.append(entrycode)
    if returnvalue != []:
        returnvalue = returnvalue[0]
    else:
        returnvalue = {}
    return returnvalue
    
def author_info(person):
    x = get_author(person)
    result = ()
    if 'authorityid' in x:
        result = get_authority(x['authorityid'])
    return result

def author_gender(person):
    """ 
        Find five of the records matching input parameter person.
        Return value is a 3-tuple with name in standard format, gender and life-span.
        Only returns a value if the characters in person is a subset of the characters in a name found in bibsys.
        assumes
            import requests 
            from bs4 import BeautifulSoup
            import sys
            import re
        input
            person a string
        output
            (name string, gender char, birth-death as string)
        
    """
    # remove punctuation characters - appears to be a problem for bibsys or ...
    person = re.sub('[^(\w|\s)]','', person)

    #print(person)
    
    # set up request parameter string

    req = "https://authority.bibsys.no/authority/rest/sru?operation=searchRetrieve&query={author}&startRecord=1&maximumRecords=50&recordPacking=xml&recordSchema=marcxchange" 
    
    # res is the return value, will contain a list of tuples if anything.
    res = []

    # issue request
    r = requests.get(req.format(author = person))

    
    # if everything is ok, go ahead and pick out the information
    if r.status_code == 200:
        # extract character symbols from person, used below to compare
        person_char = set(person.lower())
        #print(person_char)
        
        soup = BeautifulSoup(r.text, 'lxml')

        # name recides in tag 100, gender in tag 375
        # loop through all query matches
        for n in soup.find_all("srw:recorddata"):
            print(n, n.text)
            # initialise the components of the return tuple
            name = ""
            year = ""
            gender = ""
            
            # then try to extract som useful information
            namedata = n.find("marc:datafield",{'tag':100})
            if not namedata is None: 
                name_n = namedata.find("marc:subfield", {'code':'a'})
                if not name_n is None:
                    name = name_n.text
                year_n = namedata.find("marc:subfield", {'code':'d'})
                if not year_n is None:
                    year = year_n.text

            genderdata = n.find("marc:datafield", {'tag':375})
            if not genderdata is None:
                gender_n = genderdata.find("marc:subfield", {'code':'a'})
                if not gender_n is None:
                    gender = gender_n.text

            # Pseudonyms and other names for person related to query can get returned - retain only those 
            # that match the query here implemented as a subset relation. 
            # Bibsys will not match ó to o for example. If it does, this has to be replace by
            # some form of jaccard formula.

            #if person_char.issubset(set(name.lower())):
            res.append((name, gender, year))
            
    return res


#if __name__ == "__main__":
#    author_gender(' '.join(x for x in sys.argv))

In [2]:
import sqlite3
import pandas as pd

In [3]:
def query(sql, params = ()):
    with sqlite3.connect("/mnt/disk1/metadata.db") as con:
        cur = con.cursor()
        cur.execute(sql, params)
    return cur

In [4]:
def dbquery(db, sql, params = ()):
    with sqlite3.connect(db) as con:
        cur = con.cursor()
        cur.execute(sql, params)
    return cur.fetchall()

In [178]:
cur = query("select authors from metadata where doctype = 'digibok'")

In [154]:
names = set()
for x in cur:
    for n in [y.strip() for y in x[0].split('/')]:
        names |= {n}

In [155]:
len(names)

192190

In [158]:
with open('namex.txt', "w") as f:
    for x in names:
        f.write(f"{x}\n")

In [5]:
with open('namex.txt') as f:
    names = f.read().split("\n")

In [15]:
author_info("anne ragde")

('Ragde, Anne B.', 'f', '1957', '', '90097114')

In [16]:
import time

In [62]:
x = time.time()

In [63]:
round((time.time() - x)/60,2)

0.01

In [71]:
dbquery("names_life_gender.db", "create table names (name, gender, born, dead, identifier)")

[]

In [72]:
start_time = time.time()
with sqlite3.connect("names_life_gender.db") as con:
    cur = con.cursor()
    for i,x in enumerate(names):
        try:
            nom = author_info(x)
            #print(nom)
            cur.execute("insert into names values (?,?,?,?,?)", nom)
        except KeyboardInterrupt:
            break
        except:
            print(f" {i} feil ved {round((time.time() - start_time)/60,2)} minutter for {x}")
        if i % 10000 == 0:
            print(f"{i} logg ved {round((time.time() - start_time)/60,2)} minutter for {nom}")

 0 feil ved 0.0 minutter for 
0 logg ved 0.0 minutter for ()
 488 feil ved 1.09 minutter for Жадан , Сергiй Вiкторович
 767 feil ved 1.72 minutter for Ræder , A[nton]
 1689 feil ved 4.8 minutter for Norges tekniske høgskole
 1791 feil ved 5.04 minutter for Løvberg , [Alf]
 2175 feil ved 5.91 minutter for Dahl , Kjell-P[etter]
 3946 feil ved 13.95 minutter for Ebbel]l] , Clara Thue
 4000 feil ved 14.08 minutter for Tidemand-Johannessen , F[rithjof]
 4829 feil ved 15.94 minutter for Fomitsjenko
 5924 feil ved 18.44 minutter for Rødsli , Ingegrigt
 7628 feil ved 23.3 minutter for Seeberg , A[xel] S[egelcke]
 8329 feil ved 25.88 minutter for Hvidsteen , Chr[istian]
 8475 feil ved 26.21 minutter for Øyer
 8562 feil ved 26.41 minutter for Wilhelmsen , Leif J[armann]
 9040 feil ved 28.5 minutter for B[ing] , M[imi] G[rieg]
 9075 feil ved 28.57 minutter for Høstmark , [Sverre] Halfdan
10000 logg ved 31.67 minutter for ('Berg, Vally', '', '', '', '7001537')
 10172 feil ved 32.05 minutter for St

In [177]:
len(names_curated)

192190