# Script to automate the export and manipulation of the VICAV-library

## Import packages (HTTP clients, JSON, logging and ElementTree for parsing XML files)

In [1]:
import asyncio
import json
import logging
import os
import re
import xml.etree.ElementTree as ET

import aiohttp
import requests
# Emit timestamped log messages at INFO level so the progress of the
# export cells below is visible in the cell output.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
# Alternative configuration that also shows the DEBUG messages
# (e.g. items for which no biblStruct was found):
#logging.basicConfig(level=logging.DEBUG)

## Define name-space for xml-parsing

In [2]:
# Prefix -> namespace-URI map handed to ElementTree's find() calls.
xmlns = dict(
    tei="http://www.tei-c.org/ns/1.0",
    xml="http://www.w3.org/XML/1998/namespace",
)

## Access to the VICAV Zotero library

* Use API_TOKEN from environment to access Zotero
* Set the Zotero group id for VICAV here

In [3]:
# ID of the VICAV Zotero group whose library is exported.
group_id = "2165756"
# Authorization header sent with the Zotero requests; the token is read from
# the API_TOKEN environment variable (a KeyError is raised if it is not set).
request_headers = {'Authorization': f"Bearer {os.environ['API_TOKEN']}"}

## Read all items in the library

Load items from Zotero group library

    Args: 
        group_id (str): ID of a Zotero group
        limit (int): number of items to retrieve from library, maximum is 100.
        start (int): item number to start with

In [4]:
def get_items(group_id:str,limit:int,start:int):
    """Load one page of items from a Zotero group library.

    Args:
        group_id (str): ID of a Zotero group
        limit (int): number of items to retrieve from library, maximum is 100.
        start (int): item number to start with

    Returns:
        tuple: ``(items, headers)`` - the parsed JSON list of items and the
            HTTP headers of the response.

    Raises:
        requests.HTTPError: if the API does not answer with status 200.
    """
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/"
    response = requests.get(
        request_url,
        headers=request_headers,
        params={"limit": limit, "start": start},
    )
    if response.status_code != 200:
        # Raises for 4xx/5xx with the server's reason phrase ...
        response.raise_for_status()
        # ... any other non-200 status carries no item list either.
        raise requests.HTTPError(
            "Unexpected status " + str(response.status_code) + " for " + response.url,
            response=response,
        )

    return response.json(), response.headers

Get total number of items in group library

    Args:  
        group_id (str): ID of a Zotero group
    
    Returns:
        int: number of items in the library

In [5]:
def total_number_items(group_id) -> int:
    """Return the number of items in a Zotero group library.

    The count is read from the ``Total-Results`` header of a request to the
    group's items endpoint.

    Args:
        group_id (str): ID of a Zotero group

    Returns:
        int: number of items in the library
    """
    items_endpoint = "https://api.zotero.org/groups/" + group_id + "/items/"
    reply_headers = requests.get(items_endpoint, headers=request_headers).headers
    total = reply_headers["Total-Results"]
    return int(total)

Get the response headers of a Zotero API call to the items endpoint

    Args:  
        group_id (str): ID of a Zotero group

In [6]:
def get_headers(group_id):
    """Return the HTTP response headers of a request to a group's items endpoint.

    Args:
        group_id (str): ID of a Zotero group

    Returns:
        The headers object of the response.
    """
    items_endpoint = "https://api.zotero.org/groups/" + group_id + "/items/"
    return requests.get(items_endpoint, headers=request_headers).headers

Get links from headers

    Args:
        headers: http-headers of a response

    Returns:
        dict

In [7]:
def get_links_from_headers(headers) -> dict:
    """Extract the links of an HTTP ``Link`` header.

    Args:
        headers: http-headers of a response (any mapping with a ``get`` method)

    Returns:
        dict: maps each link relation (e.g. ``"next"``, ``"last"``) to its URL.
            Empty if the response carries no ``Link`` header.
    """
    link_header = headers.get("Link")
    if not link_header:
        return {}

    links = {}
    # Match each `<url>; ... rel="type"` entry as a whole instead of splitting
    # the header on commas, because a URL may itself contain commas.
    for match in re.finditer(r'<([^>]*)>\s*;[^<]*?rel="([^"]*)"', link_header):
        link_value, link_type = match.groups()
        links[link_type.strip()] = link_value.strip()

    return links

Get all items of a group library

In [8]:
def get_all_items(group_id):
    """Fetch all items of a Zotero group library, page by page.

    Starting with the first page, the ``next`` link of each response is
    followed until a response no longer announces one. Every request is sent
    with the authorization headers.

    Args:
        group_id (str): ID of a Zotero group

    Returns:
        list: the parsed JSON objects of all retrieved items. If a page does
            not come back with status 200, an error is logged and the items
            collected up to that point are returned.
    """
    logging.info("Getting all items now.")
    # list that will hold all items of the library
    allitems = []

    # page size used for the requests (limit is max 100)
    limit = 100
    next_url = "https://api.zotero.org/groups/" + group_id + "/items/" + "?limit=" + str(limit) + "&start=0"

    while next_url:
        logging.info("Getting items from " + next_url)
        response = requests.get(next_url, headers=request_headers)
        if response.status_code != 200:
            logging.error(
                "Stopped after %d items: status %d for %s",
                len(allitems), response.status_code, next_url,
            )
            break

        allitems.extend(response.json())
        # response.links is the parsed Link header; the last page (and a
        # library that fits on one page) has no "next" entry, which ends the loop.
        next_url = response.links.get("next", {}).get("url")

    return allitems

Store all items of a group library in a json file

    Args:
        group_id (str): ID of a Zotero group
        filename (str): name of the export file including file-extension

    Returns:
        bool: True if successful

In [9]:
def export_all_items_to_file(group_id,filename) ->bool:
    """Write all items of a group library to a JSON file.

    Args:
        group_id (str): ID of a Zotero group
        filename (str): name of the export file including file-extension

    Returns:
        bool: True once the file has been written
    """
    serialized = json.dumps(get_all_items(group_id))
    with open(filename, "w") as export_file:
        export_file.write(serialized)
    return True

Store export in a file and get all item ids

In [10]:
json_file = "export_grouplib.json"

# An existing export is used as a cache so that re-running the notebook does
# not download the whole library again; delete the file to force a fresh
# download. Without the file (fresh checkout) the library is fetched and stored.
if os.path.exists(json_file):
    with open(json_file, 'r') as f:
        all_items = json.load(f)
    logging.info("Loaded items from " + json_file)
else:
    all_items = get_all_items(group_id)
    with open(json_file,"w") as f:
        json.dump(all_items, f)
    logging.info("Exported json.")

# keys of all items, used below to request the TEI of each item
item_ids = [item["data"]["key"] for item in all_items]

2023-02-21 16:53:45,215 - Exported json.


## Get all TEIs from Zotero

Take the list with the IDs of the entries and build, for each entry, the URL following the pattern  
https://api.zotero.org/groups/2165756/items/M7UJPP23?format=tei  
Load it with a GET request,  
then take the body of the response, parse it with `ET.fromstring` and extract the  
`<biblStruct>` element from it.  
Build a common `<listBibl>` and insert the parsed element into it,  
then dump the whole element tree.

### Retrieve the TEI that Zotero generates for an item

In [11]:
ET.register_namespace("tei", "http://www.tei-c.org/ns/1.0")
async def get_item_tei(group_id,item_id,session):
    """Retrieve the TEI ``biblStruct`` element of a single Zotero item.

    Args:
        group_id (str): ID of a Zotero group
        item_id (str): key of the item
        session: aiohttp client session used for the request

    Returns:
        The ``tei:biblStruct`` element, or None if the request timed out or
        failed, the status was not 200 (this includes a rate-limit response),
        the body was not well-formed XML, or it contained no biblStruct.
    """
    request_url = "https://api.zotero.org/groups/" + group_id + "/items/" + item_id + "?format=tei"
    try:
        async with session.get(url=request_url, headers=request_headers) as response:
            status = response.status
            body = await response.text()
    except asyncio.TimeoutError:
        logging.info("Timeout fetching " + item_id)
        return None
    except aiohttp.ClientError as err:
        logging.warning("Request for " + item_id + " failed: " + str(err))
        return None

    if status != 200:
        # An error body is not TEI; do not try to parse it.
        logging.warning("Status " + str(status) + " fetching " + item_id)
        return None

    try:
        tei_root = ET.fromstring(body)
    except ET.ParseError as err:
        logging.warning("Invalid XML for " + item_id + ": " + str(err))
        return None

    bibl = tei_root.find("tei:biblStruct",xmlns)
    if bibl is None:
        logging.debug("No biblStruct in item " + item_id)
        return None

    # Only reached when a biblStruct was actually retrieved.
    logging.info("Fetched TEI for " + item_id)
    return bibl

In [12]:
conn = aiohttp.TCPConnector(limit=4)
async with aiohttp.ClientSession(connector=conn) as session:
    test = await get_item_tei(group_id,"M7UJPP23",session)
ET.dump(test)

2023-02-21 16:53:46,198 - Fetched TEI for M7UJPP23


<tei:biblStruct xmlns:tei="http://www.tei-c.org/ns/1.0" type="book" xml:id="Durand1994" corresp="http://zotero.org/groups/2165756/items/M7UJPP23"><tei:monogr><tei:title level="m">Profilo di arabo marocchino (Varietà urbane centro-meriodionali</tei:title><tei:author><tei:forename>Olivier</tei:forename><tei:surname>Durand</tei:surname></tei:author><tei:imprint><tei:pubPlace>Roma</tei:pubPlace><tei:publisher>Università degli studi "La Sapienza"</tei:publisher><tei:date>1994</tei:date></tei:imprint></tei:monogr></tei:biblStruct>


### Load template containing a listBibl-element that will be filled with the retrieved biblStruct elements

In [13]:
# Parse the template document and locate the listBibl element that will
# receive the biblStruct elements retrieved from Zotero.
template = ET.parse("listbibl_template.xml")
list_bibl = template.find("tei:text/tei:body/tei:listBibl",xmlns)
# Fail here with a clear message rather than later, when the first
# biblStruct is appended.
if list_bibl is None:
    raise ValueError("listbibl_template.xml contains no tei:text/tei:body/tei:listBibl element")

## Get the TEI for all items

* For each item-id get the TEI and append it to list-bibl
* Save the resulting XML
* Save errors for further inspection

We need to consider https://www.zotero.org/support/dev/web_api/v3/basics#rate_limiting

In [18]:
# IDs of the items for which no biblStruct could be retrieved
errors = []
async def get_item_tei_from_group(item_id, session):
    """Fetch the TEI of one item and append it to the shared listBibl.

    Items for which get_item_tei returns None are recorded in ``errors``.

    Args:
        item_id (str): key of the item
        session: aiohttp client session used for the request
    """
    bibl_struct = await get_item_tei(group_id, item_id, session)
    # Compare with None explicitly: the truth value of an Element depends on
    # whether it has child elements, so a childless biblStruct would be
    # treated as an error by a plain `if bibl_struct:`.
    if bibl_struct is not None:
        list_bibl.append(bibl_struct)
    else:
        logging.debug("Can not append " + item_id)
        errors.append(item_id)

conn = aiohttp.TCPConnector(limit=16)
timeout = aiohttp.ClientTimeout(total=600) # 10 min
async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
    await asyncio.gather(*[get_item_tei_from_group(item_id, session) for item_id in item_ids])

with open('TEI_export.xml', 'wb') as f:
    ET.indent(template)
    template.write(f, encoding='utf-8')
    logging.info("TEI export done.")

# Export IDs of items with errors
with open("errors.json","w") as f:
    json.dump(errors, f)
    logging.info("Exported errors.json.")

2023-02-21 17:05:55,957 - Fetched TEI for QUCSQL79
2023-02-21 17:05:55,962 - Fetched TEI for EYRYK2V5
2023-02-21 17:05:55,979 - Fetched TEI for B6L9AIMZ
2023-02-21 17:05:56,181 - Fetched TEI for N2NCTC8C
2023-02-21 17:05:56,193 - Fetched TEI for RCI9I9AM
2023-02-21 17:05:56,209 - Fetched TEI for E9GJGZZ7
2023-02-21 17:05:56,212 - Fetched TEI for 6EPNLRXU
2023-02-21 17:05:56,221 - Fetched TEI for JYWMA77R
2023-02-21 17:05:56,242 - Fetched TEI for 6KWTRZHW
2023-02-21 17:05:56,273 - Fetched TEI for NPJ679RJ
2023-02-21 17:05:56,294 - TEI export done.
2023-02-21 17:05:56,298 - Exported errors.json.
