# ELAN to TEI conversion 

**Author:** Daniel Schopper    
**Description:** This notebook automates the ELAN to TEI conversion in the SHAWI Project. When finished, it should be ported to a non-interactive script which is triggered by a github action.    
**Last Change:** 2022-03-23     
**History:**    
* 2022-03-10: Initial setup
* 2022-03-11: added XSLT transformation via saxonpy
* 2022-03-12: added ELAN to TEI conversion
* 2022-03-23: integrated merge metadata XSL

## TODOS and Open Questions

* merge documents, making sure to not overwrite manual changes
* replace TEI documents with x-includes
* tokenize
* validate

In [1]:
import io
import sharepy
import os
import requests
import pathlib
#import filetype – not used
from pathlib import Path
from saxonpy  import PySaxonProcessor, PyXdmValue
from zipfile import ZipFile
import subprocess
from datetime import datetime

## Configuration

In [2]:
# --- SharePoint access -------------------------------------------------------

# host name of the SharePoint installation
sp_baseURL = "oeawacat.sharepoint.com"

# user name and password are read from the environment, so no credentials
# are stored in the notebook; `pwd` is bound to the same value as `sp_pwd`
sp_username = os.environ['SP_USERNAME']
sp_pwd = pwd = os.environ['SP_PWD']

# name of the SharePoint site
sp_siteName = "ACDH-CH_p_ShawiTypeArabicDialects_Shawi"

# path of the Excel file below the site
sp_pathToRecordingsXLSX = "Shared Documents/General/Shawi_Recordings.xlsx"


# --- local working directories -----------------------------------------------

# downloaded data ends up here
dataDir = "data"

# downloaded libraries and other auxiliary code end up here
libDir = "lib"


# --- repository layout -------------------------------------------------------

# root of the git repository (relative to this notebook)
shawidataHomeDir = "../.."

# project-specific stylesheets
pathToShawiStylesheetsDir = shawidataHomeDir+"/082_scripts_xsl"

# ELAN transcription files
pathToELANDir = shawidataHomeDir+"/122_elan"

# TEI transcription files
pathToTEIDir = shawidataHomeDir+"/102_derived_tei"

# TEI corpus document produced by this notebook
pathToTeiCorpus = pathToTEIDir+"/shawiCorpus.xml"

# audio files (placeholder value; the former path is kept in the trailing comment)
pathToRecordingsDir = "THIS_IS_NOT_USED"#"/mnt/univie_orientalistik/SHAWI/Recordings"


# --- create the working directories if they are missing ----------------------
print("** setting up directories **")
for directory in (dataDir, libDir):
    if not os.path.exists(directory):
        os.mkdir(directory)
        print("created directory '"+directory+"'")
    else:
        print("skipped existing directory '"+directory+"'")


# --- processing steps to skip ------------------------------------------------
# Step names checked further down in this notebook:
# "downloadExcelSheet", "xlsx2teitable", "runTEICorpo"
SKIP_PROCESSING = []#["runTEICorpo"]

** setting up directories **
skipped existing directory 'data'
created directory 'lib'


## Setup

### Step 1: get the latest release of the TEI Stylesheets 

In [3]:
# Setup

# fetch the TEI Stylesheets    
def installFromGithub(libraryName):
    """Download and unpack the latest GitHub release of a repository.

    The first asset of the latest release is downloaded into
    ``<libDir>/<libraryName>/<tag>`` and extracted there. If that directory
    already exists, nothing is downloaded.

    Parameters:
        libraryName: the repository in the form "owner/name", e.g. "TEIC/TEI".

    Returns:
        The path of the release directory, or ``1`` if the latest release
        could not be looked up (the HTTP response body is printed then).

    Raises:
        requests.HTTPError: if the asset download returns an error status.
        Any error raised while writing or extracting the archive; the
        partially populated release directory is removed before re-raising.
    """
    # local import: only needed to clean up after a failed installation
    import shutil

    headers = {"Accept" : "application/vnd.github.v3+json"}
    repo = libraryName
    print("** Fetching library "+repo+" **")
    libBasePath = libDir+"/"+repo

    # First we check which tag name the latest release has
    r = requests.get("https://api.github.com/repos/"+repo+"/releases/latest", headers=headers)
    if r.status_code != 200:
        print("An error occured fetching the latest release. Maybe there isn't any release? ")
        print(r.content)
        return 1
    release = r.json()
    tag = release["tag_name"]

    # we check whether we have the latest version already
    # by checking if the respective path is already installed
    libReleasePath = libBasePath+"/"+tag
    if os.path.exists(libReleasePath):
        print("We have already the latest version ("+tag+"). Exiting")
        print("")
        return libReleasePath

    url = release["assets"][0]["browser_download_url"]
    download = requests.get(url)
    # fail before anything is written, so an error page is never stored as a zip
    download.raise_for_status()

    zipfilename = os.path.basename(url)
    os.makedirs(libReleasePath, exist_ok=True)
    zipfilePath = libReleasePath +"/"+zipfilename
    try:
        with open(zipfilePath, 'wb') as zipfileHandle:
            zipfileHandle.write(download.content)
        with ZipFile(zipfilePath) as archive:
            archive.extractall(path=libReleasePath)
    except Exception:
        # The existence of the release directory is what marks a version as
        # installed, so a half-finished installation must not be left behind.
        shutil.rmtree(libReleasePath, ignore_errors=True)
        raise

    print("Downloaded latest version ("+tag+") to "+libReleasePath)
    print("")
    return libReleasePath


# Install (or reuse) the latest releases of the TEI Guidelines and the TEI Stylesheets.
# Each variable holds the value returned by installFromGithub: the local release
# directory, or 1 if the release lookup failed.
pathToTEIGuidelines=installFromGithub("TEIC/TEI")
pathToTEIStylesheets=installFromGithub("TEIC/Stylesheets")


** Fetching library TEIC/TEI **


NameError: name 'u' is not defined

### Step 2: Download the latest version of the Excel Sheet

In [None]:
# TODO will need to add credentials if this is run in non-interactive mode
def downloadFromSP(sp_filepath, force=False):
    """Fetch a file from the project's SharePoint site into ``dataDir``.

    Parameters:
        sp_filepath: path of the file below the SharePoint site.
        force: when true, download even if a local copy already exists.

    Returns:
        The local path of the file (``dataDir`` + "/" + file name).
    """
    url = "https://"+sp_baseURL+"/sites/"+sp_siteName+"/"+sp_filepath
    print("attempting to download file from "+url)
    downloadPath = dataDir+"/"+os.path.basename(sp_filepath)

    localCopyExists = os.path.exists(downloadPath)
    if force or not localCopyExists:
        # authenticate with the credentials from the configuration cell
        session = sharepy.connect(sp_baseURL, username=sp_username, password=sp_pwd)
        session.getfile(url, filename=downloadPath)
    else:
        print("skipping existing file "+downloadPath)
    return downloadPath


# A fresh download is forced unless "downloadExcelSheet" is listed in SKIP_PROCESSING;
# in that case an existing local copy is reused.
pathToExcelSheet = downloadFromSP(sp_pathToRecordingsXLSX, force="downloadExcelSheet" not in SKIP_PROCESSING)
print(pathToExcelSheet)

## Step 3: transform xlsx to TEI table

In [None]:
def transform(input, stylesheet, output, parameters=None):
    """Run an XSLT transformation with Saxon and write the result to a file.

    Parameters:
        input: path to the source document.
        stylesheet: path to the XSLT stylesheet.
        output: path of the file the result is written to.
        parameters: optional mapping of stylesheet parameter names to values;
            every value is passed to the stylesheet as a string.

    Returns:
        ``output``, unchanged.
    """
    proc = PySaxonProcessor(license=False)
    saxon = proc.new_xslt_processor()
    saxon.set_source(file_name=os.path.abspath(input))
    # `parameters or {}` also covers the former default of an empty list
    for name, value in (parameters or {}).items():
        # build the value with the processor that runs the transformation
        # instead of spinning up a new processor for every parameter
        saxon.set_parameter(name=name, value=proc.make_string_value(value))
    saxon.compile_stylesheet(stylesheet_file=os.path.abspath(stylesheet))
    saxon.set_output_file(os.path.abspath(output))
    saxon.transform_to_file()
    return output

In [None]:
def xlsx2teitable(xlsx, output):
    """Convert an Excel workbook to a TEI table using the TEIC Stylesheets.

    The workbook is unpacked into a directory named like the workbook without
    its ".xlsx" extension; its "_rels/.rels" file is then transformed with
    the "xlsxtotei.xsl" stylesheet.

    Parameters:
        xlsx: path to the .xlsx file.
        output: path of the TEI document to write.

    Returns:
        None.
    """
    # first, extract contents of XLSX document to a temp directory
    unzipPath=xlsx.replace(".xlsx","")
    os.makedirs(unzipPath, exist_ok=True)
    # the context manager closes the archive even if extraction fails
    with ZipFile(xlsx) as workbook:
        workbook.extractall(path=unzipPath)

    # then transform the .rels file using the TEIC Stylesheets
    pathToXlsxtoteiXSL=pathToTEIStylesheets+"/xml/tei/stylesheet/xlsx/xlsxtotei.xsl"

    # both stylesheet parameters point to the unpacked workbook, as a file URI
    unzipUri = pathlib.Path(os.path.abspath(unzipPath)).as_uri()
    params = {
        "inputDir" : unzipUri,
        "workDir" : unzipUri
    }

    transform(
        input = unzipPath+"/_rels/.rels",
        stylesheet = pathToXlsxtoteiXSL,
        output = output,
        parameters=params
    )
    return

In [None]:
# The TEI table gets the path of the downloaded Excel sheet, with ".xlsx" replaced by ".xml".
pathToTEItable=pathToExcelSheet.replace(".xlsx",".xml")

# The conversion (and the insertion of the debugging notice below) is skipped
# when "xlsx2teitable" is listed in SKIP_PROCESSING.
if not "xlsx2teitable" in SKIP_PROCESSING:    
    xlsx2teitable(xlsx=pathToExcelSheet, output=pathToTEItable)
    debugstring="""<!-- 
   THIS FILE IS INCLUDED IN THE GIT REPOSITORY ONLY FOR DEBUGGING PURPOSES. 
   
   The source of this file is constantly being edited at 
   https://oeawacat.sharepoint.com/sites/ACDH-CH_p_ShawiTypeArabicDialects_Shawi/_layouts/15/Doc.aspx?sourcedoc={F01FF43B-2409-4E31-A5BF-653E0559B160}&file=SHAWI%20Recordings.xlsx&action=default&mobileredirect=true&cid=f7311564-c2b6-4b08-9a52-468547688408
   So this copy is most probably already outdated.
   
  To update it, you can either run https://gitlab.com/acdh-oeaw/shawibarab/shawi-data/-/blob/main/080_scripts_generic/080_01_ELAN2TEI/ELAN2TEI.ipyn
   *OR*  
   1) download the Excel file manually from Sharepoint
   2) and tranform it to TEI using oxgarage.tei-c.org/ 
   
-->
    """
    # Insert the debugging notice right after the XML declaration of the
    # generated TEI table. `with` closes the file handles even on error.
    xmlDeclaration = '<?xml version="1.0" encoding="UTF-8"?>'
    with open(pathToTEItable,mode="r",encoding="UTF8") as f:
        src = f.read()
    if xmlDeclaration not in src:
        # without the declaration the replace below is a no-op; say so
        print("warning: no XML declaration found in "+pathToTEItable+", debugging notice not inserted")
    new = src.replace(xmlDeclaration,xmlDeclaration+'\n'+debugstring)
    with open(pathToTEItable, mode="wt",encoding="UTF8") as f:
        f.write(new)
        
    print(pathToTEItable)

## transform TEI table to corpus header

In [None]:
# Transform the TEI table into the TEI corpus document (pathToTeiCorpus).
pathToTeitableToCorpusXSL=pathToShawiStylesheetsDir+"/table2corpus.xsl"
params = {
    # the recordings directory is handed to the stylesheet as a file URI
    "pathToRecordings" : pathlib.Path(os.path.abspath(pathToRecordingsDir)).as_uri()
}
try:
    transform(pathToTEItable, pathToTeitableToCorpusXSL, pathToTeiCorpus, params)
except Exception as error:
    # Still best effort (the notebook keeps running), but report the cause.
    # Catching Exception instead of everything lets a keyboard interrupt
    # stop the cell.
    print("an error occured")
    print(repr(error))
print(pathToTeiCorpus)

## Run TEICorpo

In [None]:
def installFromUrl(url, force=False):
    """Download a single file into ``libDir``.

    Parameters:
        url: the URL of the file; its last path segment is used as file name.
        force: when true, download even if the file already exists locally.

    Returns:
        The local path of the file (``libDir`` + "/" + file name).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    filename = os.path.basename(url)
    downloadpath = libDir+"/"+filename
    if os.path.exists(downloadpath) and not force:
        # check first, so nothing is fetched when the local copy is reused
        print("skipping download")
        return downloadpath

    r = requests.get(url)
    # do not store an error page under the name of the library
    r.raise_for_status()
    with open(downloadpath, 'wb') as target:
        target.write(r.content)
    # NOTE(review): wording kept from the original; it is printed after a fresh download
    print("file already downloaded")
    return downloadpath

# TODO check for filetype and automatically extract zip file 
# so this can be re-used for the installation of the libraries fetched from GitHub
 
# Download the TEICorpo jar into libDir (an existing local copy is reused)
# and remember its local path for the conversion step.
pathToTeiCorpo=installFromUrl("http://ct3.ortolang.fr/tei-corpo/teicorpo.jar")
print(pathToTeiCorpo)

Collect all ELAN documents from pathToELANDir

In [None]:
# One dictionary per ELAN document: absolute path, file name and file name
# without extension.
ELANDocs = []

# Only regular *.eaf files are collected, so sub-directories and other files
# in the directory are not handed to the converter. Entries are sorted by name
# to get the same processing order on every run, and the directory iterator is
# closed by the context manager.
with os.scandir(pathToELANDir) as entries:
    for i in sorted(entries, key=lambda entry: entry.name):
        if not (i.is_file() and i.name.lower().endswith(".eaf")):
            continue
        ELANDocs.append({
            "filepath" : os.path.abspath(i),
            "filename" : os.path.basename(i),
            "basename" : Path(i).stem
        })
print(ELANDocs)

In [None]:
def runTEICorpo(docs=None):
    """Convert ELAN documents to TEI with TEICorpo.

    The jar at ``pathToTeiCorpo`` is run once per document. All results of one
    call are written as "ELAN_<basename>.xml" into a directory below
    ``pathToTEIDir`` that is named after the time the call started.

    Parameters:
        docs: list of dictionaries with the keys "filepath" and "basename".
            The absolute path of the TEI output is written back to each
            dictionary under the key "filepath_tmp_TEI". Nothing happens
            if the list is empty or omitted.

    Returns:
        None.
    """
    if not docs:
        return

    # One timestamp per call: computing it per document could split the
    # output over two directories when the minute changes during the run.
    runtime = datetime.now().strftime("%Y-%m-%d_%H-%M")
    tmpDir = pathToTEIDir+"/"+runtime
    os.makedirs(tmpDir, exist_ok=True)

    for i in docs:
        pathToInput = i["filepath"]
        filenameTEI = i["basename"]+".xml"
        pathToOutput = tmpDir+"/"+"ELAN_"+filenameTEI
        i["filepath_tmp_TEI"] = os.path.abspath(pathToOutput)
        result = subprocess.run(["java", "-jar", pathToTeiCorpo, "-from","elan", "-to","tei", "-o",pathToOutput, pathToInput], capture_output=True)
        if result.returncode != 0:
            # the output is captured, so a failure would otherwise go unnoticed
            print("TEICorpo failed for "+pathToInput+" (exit code "+str(result.returncode)+")")
            print(result.stderr.decode("utf-8", errors="replace"))
        print(pathToOutput)

run TEI Corpo on all ELANDocs, writing the path to the TEI output back to the variable

In [None]:
# The conversion is skipped when "runTEICorpo" is listed in SKIP_PROCESSING;
# otherwise each entry of ELANDocs gets the key "filepath_tmp_TEI".
if not "runTEICorpo" in SKIP_PROCESSING:
    runTEICorpo(docs=ELANDocs)

## Merge metadata and TEICorpo output

In [None]:
# Merge the TEICorpo output of every ELAN document with the metadata from the
# corpus document; the result is written to pathToTEIDir as "<basename>.xml".
xsl = pathToShawiStylesheetsDir+"/mergeHeaderAndTranscription.xsl"
print(os.path.abspath(pathToTeiCorpus))
params = {
    # the corpus document is handed to the stylesheet as a file URI
    "pathToCorpusDoc" : pathlib.Path(os.path.abspath(pathToTeiCorpus)).as_uri()
}

for i in ELANDocs:
    # The key is missing when the TEICorpo step was skipped, and the file is
    # missing when the conversion failed: report and continue instead of
    # aborting the whole cell.
    pathToTmpTEI = i.get("filepath_tmp_TEI")
    if pathToTmpTEI is None or not os.path.exists(pathToTmpTEI):
        print("skipping '"+i["basename"]+"': no TEICorpo output available")
        continue
    output = os.path.abspath(pathToTEIDir + "/" + i["basename"] + ".xml")
    i["TEI"] = output
    transform(pathToTmpTEI, xsl, output, params)
    print(output)

## Replace TEI elements with x-includes in corpus document

## Tokenize