# Script to automate the export and manipulation of the VICAV-library

## Import Package eTree to parse XML Files

In [1]:
import requests
import json
import logging
import os
import xml.etree.ElementTree as ET
import asyncio
import aiohttp
# this module is needed to make asyncio.run work inside the notebook as well as in the generated python script
import nest_asyncio
nest_asyncio.apply()
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
#logging.basicConfig(level=logging.DEBUG)

## Define name-space for xml-parsing

In [2]:
xmlns = {"tei": "http://www.tei-c.org/ns/1.0", "xml":"http://www.w3.org/XML/1998/namespace" }

## Access to the VICAV Zotero library

* Use API_TOKEN from environment to access Zotero
* Set the Zotero group id for VICAV here

In [3]:
request_headers = {'Authorization': 'Bearer ' + os.environ['API_TOKEN']}
group_id = "2165756"
limit_downloads_to = None
conn_limit=16
total_timeout=600 #s

## Read all items in the library

Load items from Zotero group library

    Args: 
        group_id (str): ID of a Zotero group
        limit (int): number of items to retrieve from library, maximum is 100.
        start (int): item number to start with

In [4]:
async def get_items(session, group_id:str,limit:int,start:int,itemType = None):
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/" + "?limit=" + str(limit) + "&start=" + str(start) + ("&itemType="+itemType if itemType is not None else "")
    async with session.get(url=request_url, headers=request_headers) as response:
        if response.status == 200:
            parsed = json.loads(await response.text())
            response_headers = response.headers
        
    return parsed, response_headers

Get total number of items in group library

    Args:  
        group_id (str): ID of a Zotero group
    
    Returns:
        int: number of items in the library

In [5]:
def total_number_items(group_id) -> int:
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/"
    response = requests.get(request_url, headers=request_headers)
    
    return int(response.headers["Total-Results"])

Get headers of Zotero-Api-Calls

    Args:  
        group_id (str): ID of a Zotero group

In [6]:
def get_headers(group_id):
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/"
    response = requests.get(request_url, headers=request_headers)
    
    return response.headers

Get links from headers

    Args:
        headers: http-headers of a response

    Returns:
        dict

In [7]:
def get_links_from_headers(headers) -> dict:
    link_list = headers["Link"].split(",")
    links = {}
    for link_item in link_list:
        #print(link_item)
        link_type = link_item.split('; rel="')[1].replace('"','').strip()
        link_value = link_item.split('; rel="')[0].replace("<","").replace(">","").strip()
        links[link_type] = link_value
    
    return links

Get all items of a collection

In [8]:
async def get_all_items(session, group_id, itemType = None):
    logging.info("Getting all items" + ((" of type " + itemType) if itemType is not None else "") + " now.")
    # empty list that will hold all items of the library
    allitems=[]
    
    # settings to be used in the function to get the items (limit is max 100)
    limit=100
    start=0
    
    # get the first 200 items to start with
    first_round=await get_items(session, group_id,limit,start,itemType)
    allitems=allitems+first_round[0]
    
    # get the next link from the headers
    next_url = get_links_from_headers(first_round[1])["next"]
    last_url = get_links_from_headers(first_round[1])["last"]
    # get items until next url is last url, then all items are fetched
    while next_url != last_url:
        logging.info("Getting items from " + next_url)
        async with session.get(url=next_url, headers=request_headers) as response:
            if response.status == 200:
                parsed = json.loads(await response.text())
                response_headers = response.headers

                allitems=allitems + parsed
                urls = get_links_from_headers(response_headers)
                if "next" in urls:
                    next_url = urls["next"]
                else:
                    break
            else:
                break

    # get the last items of the group
    response = requests.get(last_url)
    if response.status_code == 200:
        parsed = json.loads(response.text)
        allitems=allitems + parsed
    
    return allitems

Store all items of a group library in a json file

    Args:
        group_id (str): ID of a Zotero group
        filename (str): name of the export file including file-extension

    Returns:
        bool: True if successful

In [9]:
def export_all_items_to_file(group_id,filename) ->bool: 
    allitems = get_all_items(group_id)
    with open(filename,"w") as f:
        json.dump(allitems, f)
    return True

Store export in a file and get all item ids

The export contains also the note items. These are child items of some other item in this export. They have a parent reference.

In [27]:
json_file = "export_grouplib.json"
item_ids = []
note_ids = []
async def get_generic_items(session):
    if os.path.isfile(json_file):
        logging.info("Grouplib export json already exists. Delete to fetch again (time consuming).")
        with open(json_file, 'r') as f:
            all_items = json.load(f)    
    else:
        all_items = await get_all_items(session, group_id)
    return all_items

async def get_export_json():
    conn = aiohttp.TCPConnector(limit=conn_limit)
    timeout = aiohttp.ClientTimeout(total=total_timeout)
    async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
        all_items = await get_generic_items(session)
    # all_items = test5
    with open(json_file,"w") as f:
        json.dump(all_items, f)
        logging.info("Exported json.")
    
    for item in all_items:
        item_id = item["key"]
        item_type = item["data"]["itemType"]
        if item_type == 'note':
            note_ids.append(item_id)
        else:
            item_ids.append(item_id)
    return all_items
all_items = asyncio.run(get_export_json())
all_items_map = {data["key"]:data for data in all_items}
all_notes_map = {data["data"]["parentItem"]:data for data in [all_items_map[id] for id in note_ids]}

2023-02-22 20:30:40,286 - Grouplib export json already exists. Delete to fetch again (time consuming).
2023-02-22 20:30:49,319 - Exported json.


## Get all TEIs from Zotero

man nimmt die Liste mit den IDs der entries, baut für jeden entry die URL nach dem Muster  
https://api.zotero.org/groups/2165756/items/944KQVKQ?format=tei  
man lädt das mit GET requesst  
dann aus dem response den body und parsed das mit ET from string, nimmt daraus das  
`<biblStruct>` Element;  
baut eine gemeinsame `<listBibl>` und fügt das geparste Element ein,  
dann dumpt man den ganzen Element-Tree

### Retrieves TEI of an item generated by Zotero

In [61]:
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
async def get_item_tei(group_id,item_id,session):
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/" + item_id + "?format=tei"
    bibl = None
    try:
        async with session.get(url=request_url, headers=request_headers) as response:
            list_bibl = ET.fromstring(await response.text())
        bibl = list_bibl.find("tei:biblStruct",xmlns)
        if item_id in all_notes_map:
            note_el = ET.SubElement(bibl, "note")
            note_el.append(ET.fromstring(all_notes_map[item_id]["data"]["note"]))
        tags_el = ET.SubElement(bibl, "note", type="tags")
        for o in all_items_map[item_id]["data"]["tags"]:
            ET.SubElement(tags_el, "note", type="tag").text = o["tag"]
    except asyncio.TimeoutError:
        logging.info("Timeout fetching " + item_id)
    if bibl is None:
        logging.debug("No biblStruct in item " + item_id)
    logging.info("Fetched TEI for " + item_id)
    return bibl

In [62]:
async def get_item_tei_test():
    conn = aiohttp.TCPConnector(limit=conn_limit)
    timeout = aiohttp.ClientTimeout(total=total_timeout)
    async with aiohttp.ClientSession(connector=conn) as session:
        test = await get_item_tei(group_id,"944KQVKQ",session)
    ET.indent(test)
    ET.dump(test)
asyncio.run(get_item_tei_test())

2023-02-22 21:00:13,580 - Fetched TEI for 944KQVKQ


<biblStruct xmlns="http://www.tei-c.org/ns/1.0" type="journalArticle" xml:id="Harahsheh2020" corresp="http://zotero.org/groups/2165756/items/944KQVKQ">
  <analytic>
    <title level="a">Animal names used to address people in Jordanian spoken Arabic</title>
    <author>
      <forename>Ahmad Mohammad Ahmad al-</forename>
      <surname>Harahsheh</surname>
    </author>
    <author>
      <forename>Rafat M. al</forename>
      <surname>Rousan</surname>
    </author>
  </analytic>
  <monogr>
    <title level="j">Dirasat: Human and Social Sciences</title>
    <idno type="ISSN">10263721</idno>
    <imprint>
      <biblScope unit="volume">47 i</biblScope>
      <biblScope unit="page">328-336</biblScope>
      <date>2020</date>
    </imprint>
  </monogr>
  <note>
    <p>Accession Number: ICHA1094316. Harahsheh, Ahmad Mohammad Ahmad al-; Rousan, Rafat M. al. Issue Info: 47 i. Publication Date: 20200101. Number of Pages: 9. Document Type: Article. Language: English.</p>
  </note>
  <note type="

In [64]:
print("Notes:\n", all_notes_map["944KQVKQ"]["data"]["note"], "\nTags:\n", all_items_map["944KQVKQ"]["data"]["tags"])

Notes:
 <p>Accession Number: ICHA1094316. Harahsheh, Ahmad Mohammad Ahmad al-; Rousan, Rafat M. al. Issue Info: 47 i. Publication Date: 20200101. Number of Pages: 9. Document Type: Article. Language: English.</p> 
Tags:
 [{'tag': 'Animals (in Islamic arts, literatures, folklore, traditions, cultures, law)', 'type': 1}, {'tag': 'Anthropology & ethnography', 'type': 1}, {'tag': 'Arabic language: colloquial', 'type': 1}, {'tag': 'Colloquial Arabic dialects Shami', 'type': 1}, {'tag': 'Jordan', 'type': 1}, {'tag': 'Jordan Social studies', 'type': 1}]


### Load template containing a listBibl-element that will be filled with the retrieved biblStruct elements

In [65]:
template = ET.parse("listbibl_template.xml")
list_bibl = template.find("tei:text/tei:body/tei:listBibl",xmlns)

# Get the TEI

* For each item-id get the TEI and append it to list-bibl
* Save the resulting XML
* Save errors for further inspection

We need to consider https://www.zotero.org/support/dev/web_api/v3/basics#rate_limiting

In [66]:
errors = []
max_fetch = limit_downloads_to if limit_downloads_to is not None else len(item_ids)
logging.info("Fetching at most " + str(max_fetch) + " TEI/XML.")
async def get_item_tei_from_group(item_id, session):
    bibl_struct = await get_item_tei(group_id, item_id, session)
    if bibl_struct:
        list_bibl.append(bibl_struct)
    else:
        logging.debug("Can not append " + item_id)
        errors.append(item_id)

async def async_download():
    conn = aiohttp.TCPConnector(limit=conn_limit)
    timeout = aiohttp.ClientTimeout(total=total_timeout)
    async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
        await asyncio.gather(*[get_item_tei_from_group(item_id, session) for item_id in item_ids[:max_fetch]])

asyncio.run(async_download())

with open('TEI_export.xml', 'wb') as f:
    ET.indent(template)
    template.write(f, encoding='utf-8')
    logging.info("TEI export done.")

# Export IDs of items with errors
with open("errors.json","w") as f:
    json.dump(errors, f)
    logging.info("Exported errors.json.")

2023-02-22 21:02:36,903 - Fetching at most 10 TEI/XML.
2023-02-22 21:02:37,497 - Fetched TEI for E9GJGZZ7
2023-02-22 21:02:37,505 - Fetched TEI for NPJ679RJ
2023-02-22 21:02:37,569 - Fetched TEI for RCI9I9AM
2023-02-22 21:02:37,599 - Fetched TEI for QUCSQL79
2023-02-22 21:02:37,664 - Fetched TEI for N2NCTC8C
2023-02-22 21:02:37,670 - Fetched TEI for EYRYK2V5
2023-02-22 21:02:37,703 - Fetched TEI for 6KWTRZHW
2023-02-22 21:02:37,706 - Fetched TEI for B6L9AIMZ
2023-02-22 21:02:37,739 - Fetched TEI for 6EPNLRXU
2023-02-22 21:02:37,754 - Fetched TEI for JYWMA77R
2023-02-22 21:02:37,774 - TEI export done.
2023-02-22 21:02:37,797 - Exported errors.json.
