# Creating one Standardized Dataframe from Perseus, Gorman, Pedalion, and Proiel Trees

In this notebook, I will demonstrate the code used to standardize Perseus, Gorman, Pedalion, and Proiel Treebanks encoded in xml. 

These treebanks total approximately one million words of Ancient Greek, but in their current formats it is impossible to examine the treebanked words as a whole. Each treebank uses a different encoding pattern, so the morphological and structural data can only be accessed by studying the differences in encoding and tailoring queries to each separately encoded xml file. 
My aim is to bring together the work that has gone into Ancient Greek treebanks, so that the one million treebanked words are available in one standardized format.

(Note that this is here for replication purposes.)

In [23]:
import requests
import pandas as pd
import numpy as np
import plotly.express as px

from bs4 import BeautifulSoup, NavigableString
from collections import defaultdict

import re
import os

In [24]:
# mappings of POStags to the meanings they stand for
# taken directly from Dr. Crane's 'TreebankCount.ipynb'
#
# Each dict maps the code found at one position of a morphological tag to a
# human-readable label.
tenses = {
    'p': 'present', 'f': 'future', 'i': 'imperfect', 'a': 'aorist',
    'r': 'perfect', 'l': 'pluperfect', 't': 'future perfect',
    's': 'resultative', 'x': 'uncertain',
}
# NOTE(review): 'i' -> 'imperfect' looks like a tense label in the voice table;
# kept as-is because the intended meaning cannot be confirmed from this file.
voices = {
    'a': 'active', 'm': 'middle', 'p': 'passive', 'i': 'imperfect',
    'e': 'middle-passive',
}
moods = {
    'i': 'indicative', 's': 'subjunctive', 'o': 'optative', 'm': 'imperative',
    'p': 'participle', 'n': 'infinitive', 'g': 'verbal_adjective',
    'x': 'uncertain',
}
degrees = {'c': 'comparative', 's': 'superlative', 'p': 'positive'}
numbers = {'s': 'singular', 'p': 'plural', 'd': 'dual', 'x': 'uncertain'}
persons = {'1': '1st', '2': '2nd', '3': '3rd'}
genders = {
    'c': 'common', 'f': 'feminine', 'n': 'neuter', 'm': 'masculine',
    'x': 'uncertain', 'p': 'masculine or feminine', 'r': 'feminine or neuter',
    'q': 'masculine, feminine or neuter', 'o': 'masculine or neuter',
}
cases = {
    'n': 'nominative', 'l': 'locative', 'v': 'vocative', 'g': 'genitive',
    'd': 'dative', 'a': 'accusative', 'x': 'uncertain',
}
POSes = {
    # single-letter tags (first character of a 9-character postag)
    'n': 'noun', 'v': 'verb', 't': 'participle', 'a': 'adjective',
    'd': 'adverb', 'l': 'article', 'g': 'particle', 'c': 'conjunction',
    'r': 'preposition', 'p': 'pronoun', 'm': 'numeral', 'i': 'interjection',
    'e': 'exclamation', 'u': 'punctuation', 'x': 'irregular',
    # proiel POStags (two-character part-of-speech attribute)
    'A-': 'adjective', 'Df': 'adverb', 'S-': 'article',
    'Ma': 'cardinal numeral', 'Nb': 'common noun', 'C-': 'conjunction',
    'Pd': 'demonstrative pronoun', 'F-': 'foreign word',
    'Px': 'indefinite pronoun', 'N-': 'infinitive marker',
    'I-': 'interjection', 'Du': 'interrogative adverb',
    'Pi': 'interrogative pronoun', 'Mo': 'ordinal numeral',
    'Pp': 'personal pronoun', 'Pk': 'personal reflexive pronoun',
    'Ps': 'possessive pronoun', 'Pt': 'possessive reflexive pronoun',
    'R-': 'preposition', 'Ne': 'proper noun', 'Py': 'quantifier',
    'Pc': 'reciprocal pronoun', 'Dq': 'relative adverb',
    'Pr': 'relative pronoun', 'G-': 'subjunction', 'V-': 'verb',
    'X-': 'unassigned',
}

strengths = {'w': 'weak', 's': 'strong', 't': 'weak or strong'}
inflections = {'n': 'non-inflecting', 'i': 'inflecting'}

# mappings of strings in the Proiel Greek NT to TLG works
ntworks = {
    "MATT": "tlg001", "MARK": "tlg002", "LUKE": "tlg003", "JOHN": "tlg004",
    "ACTS": "tlg005", "ROM": "tlg006", "1COR": "tlg007", "2COR": "tlg008",
    "GAL": "tlg009", "EPH": "tlg010", "PHIL": "tlg011", "COL": "tlg012",
    "1THESS": "tlg013", "2THESS": "tlg014", "1TIM": "tlg015",
    "2TIM": "tlg016", "TIT": "tlg017", "PHILEM": "tlg018", "HEB": "tlg019",
    "JAS": "tlg020", "1PET": "tlg021",
    # tlg022-tlg024 were missing, leaving a gap in the sequence; the
    # abbreviations follow the 1PET / 3JOHN pattern used above
    # (presumably what the treebank uses -- verify against the data).
    "2PET": "tlg022", "1JOHN": "tlg023", "2JOHN": "tlg024",
    "3JOHN": "tlg025", "JUDE": "tlg026", "REV": "tlg027",
}

# Author metadata keyed by author id.  Each non-empty value is a
# comma-separated record with six fields:
#   name, timeline, start date, end date, poetry/prose, genre
# The timeline values look like centuries and a trailing 'c' on a date
# presumably means "circa" -- TODO confirm.  An empty string means that no
# author metadata is available for that collection.
authinfo = {
    'tlg0011': 'Sophocles,-5,-497,-406,poetry,drama',
    'tlg1220': 'Batrachomyomachia,-1,-100,-1,poetry,hexameter',
    'tlg0013': 'Homeric Hymns,-6,-650,-450,poetry,hexameter',
    'tlg0020': 'Hesiod,-8,-750,-650,poetry,hexameter',
    'tlg0026': 'Aeschines,-4,-389,-314,prose,orator',
    'tlg0058': 'Aeneas Tacticus,4,301,400,prose,misc',
    'tlg0096': 'Aesop,-6,-620,-564,prose,misc',
    'tlg0085': 'Aeschylus,-5,-525c,-455c,poetry,drama',
    'tlg0028': 'Antiphon,-5,-490,-411,prose,orator',
    'tlg0551': 'Appian,2,95,165,prose,history',
    'tlg0086': 'Aristotle,-4,-384,-322,prose,philosophy',
    'tlg0019': 'Aristophanes,-5,-446,-386,poetry,drama',
    'tlg0008': 'Athenaeus,3,170,223,prose,misc',
    'tlg0554': 'Chariton,2,101,200,prose,misc',
    'tlg0041': 'Chionis Epistulae,1,1,200,prose,misc',
    'tlg0627': 'Corpus Hippocraticum,-5,-450,-350,prose,misc',
    'tlg0014': 'Demosthenes,-4,-384,-322,prose,orator',
    'tlg0060': 'Diodorus Siculus,-1,-90c,-30c,prose,history',
    'tlg0081': 'Dionysius of Halicarnassus,-1,-60c,-7c,prose,history',
    # genre was misspelled 'philosopy', which split it from 'philosophy'
    'tlg0557': 'Epictetus,2,50,135,prose,philosophy',
    'tlg0537': 'Epicurus,-3,-341,-270,prose,philosophy',
    'tlg0343': 'Ezechiel the Tragic Poet,-2,-200,-101,poetry,drama',
    'tlg0006': 'Euripides,-5,-480c,-406c,poetry,drama',
    'tlg0016': 'Herodotus,-5,-484c,-425c,prose,history',
    'tlg0559': 'Heron of Alexandria,1,10,70,prose,misc',
    'tlg0010': 'Isocrates,-4,-436,-338,prose,orator',
    'tlg0526': 'Josephus,1,37,100c,prose,history',
    'tlg2003': 'Julian the Apostate,4,331,363,prose,misc',
    'tlg0561': 'Longus,2,101,200,prose,misc',
    'tlg0061': 'Pseudo-Lucian,3,201,400,prose,misc',
    'tlg0062': 'Lucian,2,125,180,prose,misc',
    'tlg0540': 'Lysias,-4,-445c,-380c,prose,orator',
    'tlg0541': 'Menander,-3,-342,-291,poetry,drama',
    'tlg0255': 'Mimnermus,-7,-650,-600,poetry,lyreleg',
    'tlgX208': 'Paeanius,4,301,400,prose,misc',
    'tlg0585': 'Phlegon,2,151,200,prose,misc',
    'tlg0059': 'Plato,-4,-428c,-347c,prose,philosophy',
    'tlg0007': 'Plutarch,2,46,119,prose,history',
    'tlg0543': 'Polybius,-2,-200,-118,prose,history',
    'tlg4029': 'Procopius,6,500,565,prose,history',
    'tlg0009': 'Sappho,-7,-630,-570,poetry,lyreleg',
    'tlg0260': 'Semonides,-7,-700,-601,poetry,lyreleg',
    'tlg0527': 'Septuagint,-2,-250,-100,prose,bible',
    'tlg0544': 'Sextus Empiricus,3,150,250,prose,philosophy',
    'tlg0032': 'Xenophon,-4,-430c,-354,prose,history',
    'tlg0005': 'Theocritus,-3,-300,-255,poetry,hexameter',
    'tlg0003': 'Thucydides,-5,-460c,-400c,prose,history',
    'tlg0093': 'Theophrastus,-4,-371,-287,prose,philosophy',
    'tlg0012': 'Homer,-8,-775c,-700c,poetry,hexameter',
    'tlg0031': 'New Testament,1,80,100,prose,bible',
    'tlg3143': 'Georgius Sphrantzes,16,1480,1550,prose,history',
    'papyri': 'papyri,?,?,?,prose,misc',
    'Chilia': '',
    'Pedalion': '',
}

In [25]:
# change ROOT to local root
# wget repository
# hack to see whose computer we are: the working directory is checked for the
# substring 'gcrane' and the matching home/checkout layout is selected
curwd = os.getcwd()
if 'gcrane' in curwd:
    ROOT, _checkout_dir = '/Users/gcrane/', 'github'
else:
    ROOT, _checkout_dir = "/Users/bellahwang/Documents", 'GitHub'
LOCALPATH = os.path.join(ROOT, _checkout_dir)

# locations of the treebank checkouts (and the stats folder) below LOCALPATH
_xml_subdir = ('public', 'xml')
AGDPATH = os.path.join(LOCALPATH, 'gAGDT', 'data', 'xml')
GORPATH = os.path.join(LOCALPATH, 'gorman-trees', *_xml_subdir)
PEDPATH = os.path.join(LOCALPATH, 'pedalion-trees', *_xml_subdir)
PROPATH = os.path.join(LOCALPATH, 'proiel-treebank')
FILEPATH = os.path.join(LOCALPATH, 'treebankstats')

In [26]:
# taken from Dr. Crane's 'TreebankCount.ipynb' + some modifications
def addfiles(dirname,flist):
    """Append the treebank xml files found in `dirname` to `flist`.

    Directory entries are visited in sorted order.  When the directory path
    contains 'proiel', only entries whose name contains 'chron.xml',
    'greek-nt.xml' or 'hdt.xml' are kept; for any other directory every entry
    whose name ends in '.xml' is kept.

    Args:
        dirname: directory to scan (not recursive).
        flist: list that the matching paths are appended to, in place.

    Returns:
        The same `flist` object, with `dirname` joined onto each kept name.

    Raises:
        OSError: propagated from os.listdir when `dirname` cannot be read.
    """
    # The pattern depends only on the directory, so choose it once.  Raw
    # strings keep '\.' a regex escape instead of an invalid string escape.
    if re.search('proiel', dirname):
        wanted = r'(chron|greek-nt|hdt)\.xml'
    else:
        wanted = r'\.xml$'

    for foo in sorted(os.listdir(dirname)):
        if re.search(wanted, foo):
            flist.append(os.path.join(dirname, foo))
    return flist

# Collect the xml files of every treebank, in the order
# gAGDT, Pedalion, Gorman, Proiel.
searchfiles = []
for treebank_dir in (AGDPATH, PEDPATH, GORPATH, PROPATH):
    searchfiles = addfiles(treebank_dir, searchfiles)

In [27]:
# author-keyed accumulator of lists
authList = defaultdict(list)

# postag values treated as "no tag at all": a pair of literal double quotes,
# an underscore, a single dash, the empty string and a run of eight dashes
PostagExcept = ['""', '_', '-', '', '-' * 8]

# malformed postag values that only carry a part-of-speech letter:
# the letter followed by seven dashes, plus the bare letter 'c'
PostagMistake = [letter + '-' * 7 for letter in 'mcbd'] + ['c']

# docID substrings that identify an author when no TLG identifier can be read
# out of the document id itself; checked in this order, first match wins.
_DOCID_AUTHOR_MARKERS = (
    ("NT", 'tlg0031'),
    ("Ps-Luc", 'tlg0061'),
    ("Paean", 'tlgX208'),
    ("Genesis", 'tlg0527'),
    ("Chilia", 'Chilia'),
    ("Pedalion", 'Pedalion'),
    ("Mimn", 'tlg0255'),
    ("0260", 'tlg0260'),
    ("0005", 'tlg0005'),
)


def _attr(tag, name):
    """Return attribute `name` of `tag`, or '-' when the attribute is absent."""
    return tag[name] if tag.has_attr(name) else '-'


def _author_columns(authTLG, workTLG, fallback_genre):
    """Return the eight leading row values (author name ... genre).

    Raises KeyError when `authTLG` has no entry in `authinfo`.
    """
    info = authinfo[authTLG]
    if info == '':
        # collection without author metadata
        return ['', authTLG, workTLG, '-', '-', '-', '-', fallback_genre]
    fields = info.split(',')
    return [fields[0], authTLG, workTLG,
            fields[1], fields[2], fields[3], fields[4], fields[5]]


def _ids_from_docid(docID, authTLG, workTLG):
    """Derive (authTLG, workTLG) from a sentence's document id.

    Whatever cannot be derived keeps the value passed in, i.e. the value left
    over from the previous sentence of the same file.
    """
    if "urn:cts:greekLit:" in docID:
        parts = docID.split(':')
        ident = parts[3] if "tlg" in parts[3] else parts[-1]
        pieces = ident.split('.')
        return pieces[0], pieces[1]
    if "Perseus:text:" in docID:
        return 'tlg0008', 'tlg001'
    if re.search(r'....-...', docID):
        pieces = docID.split('-')
        return 'tlg' + pieces[0], 'tlg' + pieces[1]
    for marker, author_id in _DOCID_AUTHOR_MARKERS:
        if marker in docID:
            return author_id, workTLG
    return authTLG, workTLG


def _proiel_rows(soup, FILENAME):
    """Build one row per <token> of a parsed Proiel treebank file."""
    authTLG = '-'
    workTLG = '-'
    if re.search(r"greek-nt\.xml", FILENAME):
        authTLG = 'tlg0031'
    elif re.search(r"chron\.xml", FILENAME):
        authTLG, workTLG = 'tlg3143', 'tlg001'
    elif re.search(r"hdt\.xml", FILENAME):
        authTLG, workTLG = 'tlg0016', 'tlg001'

    rows = []
    for sentence in soup('sentence'):
        sentID = sentence['id']
        for token in sentence('token'):
            wordID = token['id']

            if token.has_attr('citation-part'):
                subdoc = token['citation-part']
                args = subdoc.split()
                if len(args) > 1:
                    if authTLG == 'tlg0031':
                        # NT citations start with the book abbreviation
                        workname = args[0]
                        if workname in ntworks:
                            workTLG = ntworks[workname]
                        else:
                            print('bad NT ref:', subdoc)
                    else:
                        # `args` is now always bound when this is reported
                        print('doublearg', args)
                # ignores book 1 of herodotus
                if authTLG == 'tlg0016' and subdoc.split('.')[0] == '1':
                    continue
            else:
                subdoc = '-'

            # Secondary edges are collected as [target-id, relation] pairs.
            # They use their own names so that the token's own `relation`
            # attribute is not overwritten by the last slash.
            slashList = [[slash['target-id'], slash['relation']]
                         for slash in token('slash')]
            if not slashList:
                slashList = '-'

            if token.has_attr('morphology'):
                pos = _attr(token, 'part-of-speech')
                # positions 0-9: person, number, tense, mood, voice, gender,
                # case, degree, strength, inflection; a short tag is padded
                # with '-' instead of raising IndexError
                postag = token['morphology'].ljust(10, '-')
                morph = list(postag[:10])
            else:
                # the part of speech is blanked together with the morphology
                postag = '-'
                pos = '-'
                morph = ['-'] * 10

            row = _author_columns(authTLG, workTLG, 'misc')
            row += [
                sentID,
                '-',                                  # docID
                subdoc,
                '-',                                  # author name
                wordID,
                _attr(token, 'head-id'),
                _attr(token, 'form'),
                _attr(token, 'lemma'),
                _attr(token, 'relation'),
                '-',                                  # ref
                _attr(token, 'presentation-after'),
                '-',                                  # insertion id
                '-',                                  # artificial
                _attr(token, 'gloss'),
                '-',                                  # sem
                slashList,
                postag,
                pos,
            ]
            row += morph
            rows.append(row)
    return rows


def _agdt_rows(soup, FILENAME):
    """Build one row per <word> of a parsed gAGDT / Gorman / Pedalion file."""
    authTLG = 'papyri' if re.search("papyri", FILENAME) else '-'
    workTLG = '-'

    rows = []
    for sentence in soup('sentence'):
        sentID = sentence['id']
        docID = sentence['document_id']
        subdoc = _attr(sentence, 'subdoc')
        author = _attr(sentence, 'Author')
        # find author ID
        authTLG, workTLG = _ids_from_docid(docID, authTLG, workTLG)

        for word in sentence('word'):
            wordID = word['id']
            head = word['head']

            postag = _attr(word, 'postag')
            if postag in PostagExcept:
                postag = '-'
                morph = ['-'] * 11
            else:
                # positions 0-8: pos, person, number, tense, mood, voice,
                # gender, case, degree.  Padding to nine characters with '-'
                # reproduces every special case that used to be listed by
                # hand (the PostagMistake values, 'm-p---na', 'v2pasm') and
                # also covers any other short tag, which used to raise
                # IndexError.
                postag = postag.ljust(9, '-')
                # strength and inflection are not encoded in these tags
                morph = list(postag[:9]) + ['-', '-']

            row = _author_columns(authTLG, workTLG, '-')
            row += [
                sentID,
                docID,
                subdoc,
                author,
                wordID,
                head,
                _attr(word, 'form'),
                _attr(word, 'lemma'),
                _attr(word, 'relation'),
                _attr(word, 'ref'),
                '-',                                  # presentation_after
                _attr(word, 'insertion_id'),
                _attr(word, 'artificial'),
                _attr(word, 'gloss'),
                _attr(word, 'sem'),
                '-',                                  # slash
                postag,
            ]
            row += morph
            rows.append(row)
    return rows


def findAll(FILENAME):
    """Parse one treebank xml file into a list of per-word rows.

    Files whose path contains 'proiel' are read as Proiel treebanks (one row
    per <token>); every other file is read in the gAGDT / Gorman / Pedalion
    layout (one row per <word>).  Non-Proiel files whose path contains
    'example-sentences' are skipped.

    Each row is a list of 36 values in this order:
        author name, author id, work id, timeline, start date, end date,
        poetry/prose, genre, sentence id, document id, subdoc, author
        attribute, word id, head, form, lemma, relation, ref,
        presentation-after, insertion id, artificial, gloss, sem, slash list,
        postag, pos, person, number, tense, mood, voice, gender, case,
        degree, strength, inflection
    Values that are not available are the string '-'.

    Args:
        FILENAME: path of the xml file to read (opened as UTF-8).

    Returns:
        A list of rows; an empty list for skipped files.

    Raises:
        KeyError: when the author id derived for a word has no entry in
            `authinfo`, or a required xml attribute (e.g. a sentence 'id') is
            missing.
        OSError: when the file cannot be opened.
    """
    is_proiel = "proiel" in FILENAME
    if not is_proiel and re.search('example-sentences', FILENAME):
        # skipped before parsing, so no time is spent on a file that is dropped
        return []

    with open(FILENAME, 'r', encoding="utf-8") as f:
        soup = BeautifulSoup(f, "xml")

    if is_proiel:
        return _proiel_rows(soup, FILENAME)
    return _agdt_rows(soup, FILENAME)

In [28]:
# Column names for the per-file frames, in the order findAll fills each row.
_author_columns_ = ['Author', 'AuthorTLG', 'WorkTLG', 'Timeline', 'StartDate', 'EndDate',
                    'Poetry/Prose', 'Genre']
_location_columns_ = ['sentID', 'docID', 'subdoc', 'AuthorName', 'wordID']
_annotation_columns_ = ['head', 'form', 'lemma', 'relation', 'ref', 'presentation_after',
                        'insertionID', 'artificial', 'gloss', 'sem', 'slash']
_morphology_columns_ = ['postag', 'pos', 'person', 'number', 'tense', 'mood', 'voice',
                        'gender', 'case', 'degree', 'strength', 'inflection']

# one DataFrame per treebank file; each path is echoed as it is processed
allDFList = []
for i in searchfiles:
    print(i)
    data = findAll(i)
    df = pd.DataFrame(
        data,
        columns=(_author_columns_ + _location_columns_
                 + _annotation_columns_ + _morphology_columns_),
    )
    allDFList.append(df)

/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0003.tlg001.perseus-grc1.1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg001.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg002.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg003.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg004.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg005.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0012.tlg001.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0012.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0013.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg001.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg00

/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-80-89-2019.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-80-89-jan-15.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-90-95-2019.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/athen13-90-95-jan-15.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/dem-59-neaira-2019.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-1-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-1-50-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-101-150-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-151-200-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-201-275-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/demosthenes-18-276-324-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml

/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-1-81-100-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-3-1-20-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/thuc-3-21-40-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-1-1-2-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-1-3-4-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-1-5-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-1-6-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-7-1-3-tree.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-7-4-5-tree.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-8-1-8-4-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-8-5-7-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/xen-cyr-8-8-bu1.xml
/Users/bellahwang/Documents/GitHu

In [29]:
# gets rid of ... in dataframe and displays all of it
pd.options.display.max_columns = None

# stack the per-file frames into a single table with one continuous index
master_df = pd.concat(objs=allDFList, ignore_index=True)

master_df

Unnamed: 0,Author,AuthorTLG,WorkTLG,Timeline,StartDate,EndDate,Poetry/Prose,Genre,sentID,docID,subdoc,AuthorName,wordID,head,form,lemma,relation,ref,presentation_after,insertionID,artificial,gloss,sem,slash,postag,pos,person,number,tense,mood,voice,gender,case,degree,strength,inflection
0,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,1,3,Θουκυδίδης,Θουκυδίδης,SBJ,-,-,-,-,-,-,-,n-s---mn-,n,-,s,-,-,-,m,n,-,-,-
1,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,2,1,Ἀθηναῖος,Ἀθηναῖος,ATR,-,-,-,-,-,-,-,n-s---mn-,n,-,s,-,-,-,m,n,-,-,-
2,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,3,0,ξυνέγραψε,συγγράφω,PRED,-,-,-,-,-,-,-,v3saia---,v,3,s,a,i,a,-,-,-,-,-
3,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,4,5,τὸν,ὁ,ATR,-,-,-,-,-,-,-,l-s---ma-,l,-,s,-,-,-,m,a,-,-,-
4,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,5,10,πόλεμον,πόλεμος,OBJ_AP,-,-,-,-,-,-,-,n-s---ma-,n,-,s,-,-,-,m,a,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495803,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022394,1022397,ἐν,ἐν,adv,-,,-,-,-,-,-,---------n,R-,-,-,-,-,-,-,-,-,-,n
1495804,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022395,1022396,τῇσι,ὁ,aux,-,,-,-,-,-,-,-p---fd--i,S-,-,p,-,-,-,f,d,-,-,i
1495805,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022396,1022394,Ἀφέτῃσι,Ἀφέται,obl,-,,-,-,-,-,-,-p---fd--i,Ne,-,p,-,-,-,f,d,-,-,i
1495806,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022397,-,ἐποιεῦντο,ποιέω,pred,-,,-,-,-,-,-,3piim----i,V-,3,p,i,i,m,-,-,-,-,i


In [30]:
# replaces inconsistent null values with NaN
# Rows whose author name is empty (the collections without author metadata,
# i.e. Chilia and Pedalion) end up with a NaN Author and are dropped below.
# Assigning the result back avoids calling an in-place method on the
# temporary object returned by master_df['Author'], which does not update
# master_df when pandas Copy-on-Write is active.
master_df['Author'] = master_df['Author'].replace('', np.nan)

# NOTE(review): these replacements apply to every column, so a token whose
# form or lemma is literally '-', '_' or '?' is blanked as well -- confirm
# that this is intended.
master_df.replace('-', np.nan, inplace=True)
master_df.replace('_', np.nan, inplace=True)
master_df.replace('?', np.nan, inplace=True)
master_df.dropna(subset=['Author'], inplace=True)

# added this to prevent visualizations from complaining about NaN values
master_df.fillna('-', inplace=True)

master_df

Unnamed: 0,Author,AuthorTLG,WorkTLG,Timeline,StartDate,EndDate,Poetry/Prose,Genre,sentID,docID,subdoc,AuthorName,wordID,head,form,lemma,relation,ref,presentation_after,insertionID,artificial,gloss,sem,slash,postag,pos,person,number,tense,mood,voice,gender,case,degree,strength,inflection
0,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,1,3,Θουκυδίδης,Θουκυδίδης,SBJ,-,-,-,-,-,-,-,n-s---mn-,n,-,s,-,-,-,m,n,-,-,-
1,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,2,1,Ἀθηναῖος,Ἀθηναῖος,ATR,-,-,-,-,-,-,-,n-s---mn-,n,-,s,-,-,-,m,n,-,-,-
2,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,3,0,ξυνέγραψε,συγγράφω,PRED,-,-,-,-,-,-,-,v3saia---,v,3,s,a,i,a,-,-,-,-,-
3,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,4,5,τὸν,ὁ,ATR,-,-,-,-,-,-,-,l-s---ma-,l,-,s,-,-,-,m,a,-,-,-
4,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,5,10,πόλεμον,πόλεμος,OBJ_AP,-,-,-,-,-,-,-,n-s---ma-,n,-,s,-,-,-,m,a,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495803,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022394,1022397,ἐν,ἐν,adv,-,,-,-,-,-,-,---------n,R-,-,-,-,-,-,-,-,-,-,n
1495804,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022395,1022396,τῇσι,ὁ,aux,-,,-,-,-,-,-,-p---fd--i,S-,-,p,-,-,-,f,d,-,-,i
1495805,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022396,1022394,Ἀφέτῃσι,Ἀφέται,obl,-,,-,-,-,-,-,-p---fd--i,Ne,-,p,-,-,-,f,d,-,-,i
1495806,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022397,-,ἐποιεῦντο,ποιέω,pred,-,,-,-,-,-,-,3piim----i,V-,3,p,i,i,m,-,-,-,-,i


In [31]:
# Replaces single letter identifiers with readable labels.
# Each morphology column is paired with the lookup dict that expands its codes.
label_maps = {
    'pos': POSes,
    'person': persons,
    'number': numbers,
    'tense': tenses,
    'mood': moods,
    'voice': voices,
    'gender': genders,
    'case': cases,
    'degree': degrees,
    'strength': strengths,
    'inflection': inflections,
}

# Assign each replaced column back to the frame. Calling an inplace method on
# `master_df[col]` is chained assignment: pandas warns about it, and with
# Copy-on-Write enabled the frame would be left unchanged. Values that have no
# entry in the mapping are kept as they are.
for col_name, col_labels in label_maps.items():
    master_df[col_name] = master_df[col_name].replace(col_labels)

# includes total word counts across authors: every row receives the number of
# rows that share its Author value
master_df['authWordCount'] = master_df.groupby('Author')['Author'].transform('count')

master_df

Unnamed: 0,Author,AuthorTLG,WorkTLG,Timeline,StartDate,EndDate,Poetry/Prose,Genre,sentID,docID,subdoc,AuthorName,wordID,head,form,lemma,relation,ref,presentation_after,insertionID,artificial,gloss,sem,slash,postag,pos,person,number,tense,mood,voice,gender,case,degree,strength,inflection,authWordCount
0,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,1,3,Θουκυδίδης,Θουκυδίδης,SBJ,-,-,-,-,-,-,-,n-s---mn-,noun,-,singular,-,-,-,masculine,nominative,-,-,-,57795
1,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,2,1,Ἀθηναῖος,Ἀθηναῖος,ATR,-,-,-,-,-,-,-,n-s---mn-,noun,-,singular,-,-,-,masculine,nominative,-,-,-,57795
2,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,3,0,ξυνέγραψε,συγγράφω,PRED,-,-,-,-,-,-,-,v3saia---,verb,3rd,singular,aorist,indicative,active,-,-,-,-,-,57795
3,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,4,5,τὸν,ὁ,ATR,-,-,-,-,-,-,-,l-s---ma-,article,-,singular,-,-,-,masculine,accusative,-,-,-,57795
4,Thucydides,tlg0003,tlg001,-5,-460c,-400c,prose,history,1,urn:cts:greekLit:tlg0003.tlg001.perseus-grc1,1.1.1,-,5,10,πόλεμον,πόλεμος,OBJ_AP,-,-,-,-,-,-,-,n-s---ma-,noun,-,singular,-,-,-,masculine,accusative,-,-,-,57795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495803,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022394,1022397,ἐν,ἐν,adv,-,,-,-,-,-,-,---------n,preposition,-,-,-,-,-,-,-,-,-,non-inflecting,91864
1495804,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022395,1022396,τῇσι,ὁ,aux,-,,-,-,-,-,-,-p---fd--i,article,-,plural,-,-,-,feminine,dative,-,-,inflecting,91864
1495805,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022396,1022394,Ἀφέτῃσι,Ἀφέται,obl,-,,-,-,-,-,-,-p---fd--i,proper noun,-,plural,-,-,-,feminine,dative,-,-,inflecting,91864
1495806,Herodotus,tlg0016,tlg001,-5,-484c,-425c,prose,history,225174,-,8.7.2,-,1022397,-,ἐποιεῦντο,ποιέω,pred,-,,-,-,-,-,-,3piim----i,verb,3rd,plural,imperfect,indicative,middle,-,-,-,-,inflecting,91864


In [32]:
# Writes the standardized dataframe to 'allauthors.csv' inside FILEPATH.
# index=False leaves the row index out of the file.
FILENAME = os.path.join(FILEPATH, 'allauthors.csv')
master_df.to_csv(path_or_buf=FILENAME, index=False)