In [68]:
import requests
import pandas as pd
import numpy as np
import plotly.express as px

from bs4 import BeautifulSoup, NavigableString
from collections import defaultdict

import re
import os

In [190]:
# taken directly from Dr. Crane's 'TreebankCount.ipynb'
tenses = {'p':'present','f':'future', 'i':'imperfect','a':'aorist','r':'perfect','l':'pluperfect','t':'future perfect'}
voices = {'a':'active','m':'middle','p':'passive', 'i':'imperfect','e':'middle-passive'}
moods = {'i':'indicative','s':'subjunctive', 'o':'optative','m':'imperative','p':'participle','n':'infinitive'}

authinfo = defaultdict()
authinfo['tlg0011'] = 'Sophocles,-497,-406,poetry,drama'
authinfo['tlg1220'] = 'Batrachomyomachia,-100,-1,poetry,hexameter'
authinfo['tlg0013'] = 'Homeric Hymns,-650,-450,poetry,hexameter'
authinfo['tlg0020'] = 'Hesiod,-750,-650,poetry,hexameter'
authinfo['tlg0026'] = 'Aeschines,-389,-314,prose,orator'
authinfo['tlg0058'] = 'Aeneas Tacticus,301,400,prose,misc'
authinfo['tlg0096'] = 'Aesop,-620,-564,prose,misc'
authinfo['tlg0085'] = 'Aeschylus,-525c,-455c,poetry,drama'
authinfo['tlg0028'] = 'Antiphon,-490,-411,prose,orator'
authinfo['tlg0551'] = 'Appian,-490,-411,prose,history'
authinfo['tlg0086'] = 'Aristotle,-384,-322,prose,philosophy'
authinfo['tlg0019'] = 'Aristophanes,-446,-386,poetry,drama'
authinfo['tlg0008'] = 'Athenaeus,170,223,prose,misc'
authinfo['tlg0554'] = 'Chariton,101,200,prose,misc'
authinfo['tlg0041'] = 'Chionis Epistulae,1,200,prose,misc'
authinfo['tlg0627'] = 'Corpus Hippocraticum,-450,-350,prose,misc'
authinfo['tlg0014'] = 'Demosthenes,-384,-322,prose,orator'
authinfo['tlg0060'] = 'Diodorus Siculus,-90c,-30c,prose,history'
authinfo['tlg0081'] = 'Dionysius of Halicarnassus,-60c,-7c,prose,history'
authinfo['tlg0557'] = 'Epictetus,50,135,prose,philosopy'
authinfo['tlg0537'] = 'Epicurus,-341,-270,prose,philosophy'
authinfo['tlg0343'] = 'Ezechiel the Tragic Poet,-200,-101,poetry,drama'
authinfo['tlg0006'] = 'Euripides,-480c,-406c,poetry,drama'
authinfo['tlg0016'] = 'Herodotus,-484c,-425c,prose,history'
authinfo['tlg0559'] = 'Heron of Alexandria,10,70,prose,misc'
authinfo['tlg0010'] = 'Isocrates,-436,-338,prose,orator'
authinfo['tlg0526'] = 'Josephus,37,100c,prose,history'
authinfo['tlg2003'] = 'Julian the Apostate,331,363,prose,misc'
authinfo['tlg0561'] = 'Longus,101,200,prose,misc'
authinfo['tlg0061'] = 'Pseudo-Lucian,201,400,prose,misc'
authinfo['tlg0062'] = 'Lucian,125,180,prose,misc'
authinfo['tlg0540'] = 'Lysias,-445c,-380c,prose,orator'
authinfo['tlg0541'] = 'Menander,-342,-291,poetry,drama'
authinfo['tlg0255'] = 'Mimnermus,-650,-600,poetry,lyreleg'
authinfo['tlgX208'] = 'Paeanius,301,400,prose,misc'
authinfo['tlg0585'] = 'Phlegon,151,200,prose,misc'
authinfo['tlg0059'] = 'Plato,-428c,-347c,prose,philosophy'
authinfo['tlg0007'] = 'Plutarch,46,119,prose,history'
authinfo['tlg0543'] = 'Polybius,-200,-118,prose,history'
authinfo['tlg4029'] = 'Procopius,500,565,prose,history'
authinfo['tlg0009'] = 'Sappho,-630,-570,poetry,lyreleg'
authinfo['tlg0260'] = 'Semonides,-700,-601,poetry,lyreleg'
authinfo['tlg0527'] = 'Septuagint,-250,-100,prose,bible'
authinfo['tlg0544'] = 'Sextus Empiricus,150,250,prose,philosophy'
authinfo['tlg0032'] = 'Xenophon,-430c,-354,prose,history'
authinfo['tlg0005'] = 'Theocritus,-300,-255,poetry,hexameter'
authinfo['tlg0003'] = 'Thucydides,-460c,-400c,prose,history'
authinfo['tlg0093'] = 'Theophrastus,-371,-287,prose,philosophy'
authinfo['tlg0012'] = 'Homer,-775c,-700c,poetry,hexameter'
authinfo['tlg0031'] = 'New Testament,80,100,prose,bible'
authinfo['tlg3143'] = 'Georgius Sphrantzes,1480,1050,prose,history'
authinfo['papyri']  = 'papyri,x,x,x,x'
authinfo['Chilia']  = ''
authinfo['Pedalion'] = ''

In [167]:
# change ROOT to local root
#ROOT = "c:\\Users\\bella\\OneDrive\\Documents"
ROOT = "/Users/bellahwang/Documents"

LOCALPATH = os.path.join(ROOT, 'GitHub')
GORPATH = os.path.join(LOCALPATH, 'gorman-trees', 'public', 'xml')
PEDPATH = os.path.join(LOCALPATH, 'pedalion-trees', 'public', 'xml')
MAMPATH = os.path.join(LOCALPATH, 'gAGDT', 'data', 'xml')
PROPATH = os.path.join(LOCALPATH, 'proiel-treebank')
BELPATH = os.path.join(LOCALPATH, 'treebankstats')

In [75]:
# taken directly from Dr. Crane's 'TreebankCount.ipynb'
def addfiles(dirname,flist):
    for foo in sorted(os.listdir(dirname)):
        if(re.search('proiel', dirname)):
            if(not re.search('(chron|greek-nt|hdt)\.xml', foo)):
                continue
        elif(not re.search('\.xml$', foo)):
            continue
        newpath = os.path.join(dirname, foo)
        flist.append(newpath)
    return(flist)

def cleanFiles(FILENAME):
    with open(FILENAME, 'r', encoding="utf-8") as f:
        f = re.sub('λέγω3','λέγω', str(f))
        f = re.sub('part-of-speech="(.)." morphology="(........).."','postag="\g<1>\g<2>"', str(f))
        f = re.sub("(word|id|form|lemma|postag|relation|head)='([^']+)'",'\g<1>="\g<2>"', str(f))
        f = re.sub('postag="--------', 'postag="---------', f)

In [76]:
searchfiles = []
searchfiles = addfiles(GORPATH, searchfiles)
searchfiles = addfiles(PEDPATH, searchfiles)
searchfiles = addfiles(MAMPATH, searchfiles)
searchfiles = addfiles(PROPATH, searchfiles)

for i in searchfiles:
    cleanFiles(i)

In [174]:
authList = defaultdict(list)
PostagExcept = ['""', '_', '-', 'c']

def findVerbs(FILENAME):
    authTLG = 'dummy'
    workTLG = 'N/A'
    returnList = []
    with open(FILENAME, 'r', encoding="utf-8") as f:
        soup = BeautifulSoup(f, "xml")
        
        # for proiel trees
        if (re.search("proiel", FILENAME)):
            if (re.search("greek-nt\.xml", FILENAME)):
                authTLG = 'tlg0031'
                for token in soup('token'):
                    if token.has_attr('part-of-speech'):
                        pos   = token['part-of-speech']
                        morph = token['morphology']
                        tense = morph[2]
                        mood  = morph[3]
                        voice = morph[4]

                        if (pos == 'V-'):
                            wordList = []
                            wordList.append(authinfo[authTLG].split(',')[0])
                            wordList.append(authTLG)
                            wordList.append(workTLG)
                            wordList.append(token['form'])
                            wordList.append(token['lemma'])
                            wordList.append(tense)
                            wordList.append(mood)
                            wordList.append(voice)
                            wordList.append(tense + mood + voice)
                            wordList.append(morph)
                            returnList.append(wordList)
            elif (re.search("chron\.xml", FILENAME)):
                authTLG = 'tlg3143'
                for token in soup('token'):
                    if token.has_attr('part-of-speech'):
                        pos   = token['part-of-speech']
                        morph = token['morphology']
                        tense = morph[2]
                        mood  = morph[3]
                        voice = morph[4]

                        if (pos == 'V-'):
                            wordList = []
                            wordList.append(authinfo[authTLG].split(',')[0])
                            wordList.append(authTLG)
                            wordList.append(workTLG)
                            wordList.append(token['form'])
                            wordList.append(token['lemma'])
                            wordList.append(tense)
                            wordList.append(mood)
                            wordList.append(voice)
                            wordList.append(tense + mood + voice)
                            wordList.append(morph)
                            returnList.append(wordList)
            elif (re.search("hdt\.xml", FILENAME)):
                authTLG = 'tlg0016'
                for token in soup('token'):
                    if token.has_attr('citation-part'):
                        cite  = token['citation-part']
                        book  = cite.split('.')[0]
                        if book == '1':
                            continue
                        
                    if token.has_attr('part-of-speech'):
                        pos   = token['part-of-speech']
                        morph = token['morphology']
                        tense = morph[2]
                        mood  = morph[3]
                        voice = morph[4]
                    
                    if (pos == 'V-' and token.has_attr('form')):
                        wordList = []
                        wordList.append(authinfo[authTLG].split(',')[0])
                        wordList.append(authTLG)
                        wordList.append(workTLG)
                        wordList.append(token['form'])
                        wordList.append(token['lemma'])
                        wordList.append(tense)
                        wordList.append(mood)
                        wordList.append(voice)
                        wordList.append(tense + mood + voice)
                        wordList.append(morph)
                        returnList.append(wordList)
                
        # for gorman, pedalion, and gAGDT trees
        else:
            if(re.search("papyri", FILENAME)):
                authTLG = 'papyri'
                for word in soup('word'):    
                    # find form, lemma, postag
                    if word.has_attr('postag'):
                        postag = word['postag']
                        if not postag in PostagExcept and postag:
                            #print(postag)
                            pos    = postag[0]
                            #person = postag[1]
                            #number = postag[2]
                            tense  = postag[3]
                            mood   = postag[4]
                            voice  = postag[5]
                            #gender = postag[6]
                            #case   = postag[7]
                            if (pos == 'v'):
                                wordList = []
                                wordList.append(authinfo[authTLG].split(',')[0])
                                wordList.append(authTLG)
                                wordList.append(workTLG)
                                wordList.append(word['form'])
                                wordList.append(word['lemma'])
                                wordList.append(tense)
                                wordList.append(mood)
                                wordList.append(voice)
                                wordList.append(tense + mood + voice)
                                wordList.append(postag)
                                returnList.append(wordList)
            elif(re.search('example-sentences', FILENAME)):
                return
            else:
                for sentence in soup('sentence'):
                    docID = sentence['document_id']
                    # find author ID
                    if ("urn:cts:greekLit:" in docID):
                        docIDList = docID.split(':')
                        if ("tlg" in docIDList[3]):
                            authTLG = docIDList[3].split('.')[0]
                            workTLG = docIDList[3].split('.')[1]
                        else:
                            authTLG = docIDList[-1].split('.')[0]
                            workTLG = docIDList[-1].split('.')[1]
                    elif ("Perseus:text:" in docID):
                        authTLG = 'tlg0008'
                    elif (re.search(r'....-...', docID)):
                        authTLG = 'tlg' + docID.split('-')[0]
                        workTLG = 'tlg' + docID.split('-')[1]
                    elif ("NT" in docID):
                        authTLG = 'tlg0031'
                    elif ("Ps-Luc" in docID):
                        authTLG = 'tlg0061'
                    elif ("Paean" in docID):
                        authTLG = 'tlgX208'
                    elif ("Genesis" in docID):
                        authTLG = 'tlg0527'
                    elif ("Chilia" in docID):
                        authTLG = 'Chilia'
                    elif ("Pedalion" in docID):
                        authTLG = 'Pedalion'
                    elif ("Mimn" in docID):
                        authTLG = 'tlg0255'
                    elif ("0260" in docID):
                        authTLG = 'tlg0260'
                    elif ("0005" in docID):
                        authTLG = 'tlg0005'
                    else:
                        print(docID)
                        
                    for word in sentence('word'):    
                        # find form, lemma, postag
                        if word.has_attr('postag'):
                            postag = word['postag']
                            if not postag in PostagExcept and postag:
                                #print(postag)
                                pos    = postag[0]
                                #person = postag[1]
                                #number = postag[2]
                                tense  = postag[3]
                                mood   = postag[4]
                                voice  = postag[5]
                                #gender = postag[6]
                                #case   = postag[7]
                                if (pos == 'v'):
                                    wordList = []
                                    wordList.append(authinfo[authTLG].split(',')[0])
                                    wordList.append(authTLG)
                                    wordList.append(workTLG)
                                    wordList.append(word['form'])
                                    wordList.append(word['lemma'])
                                    wordList.append(tense)
                                    wordList.append(mood)
                                    wordList.append(voice)
                                    wordList.append(tense + mood + voice)
                                    wordList.append(postag)
                                    returnList.append(wordList)
                                    #print(wordList)
                                    #authList[authTLG].append(wordList)
    return returnList

In [175]:
searchfiles = []
searchfiles = addfiles(GORPATH, searchfiles)
searchfiles = addfiles(PEDPATH, searchfiles)
searchfiles = addfiles(MAMPATH, searchfiles)
searchfiles = addfiles(PROPATH, searchfiles)
#FILENAME = os.path.join(GORPATH, 'dem-59-neaira-2019.xml')


DFList = []
for i in searchfiles:
    print(i)
    data = findVerbs(i)
    df = pd.DataFrame(data, columns = ['Author', 'AuthorTLG', 'WorkTLG', 'Form', 'Lemma', 'Tense', 'Mood', 'Voice', 'Tense+Mood+Voice','POStag'])
    DFList.append(df)

/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-1-50-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-101-150-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-151-196-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-51-100-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-1-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-2-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-5-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-6-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-0-1-4-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-11-14-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-5-7-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-8-10-bu1.xml
/Users/

/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plut-fortuna-romanorum-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-alex-fort-aut-virt-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-lycurgus-1-15-bu4.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-lycurgus-16-31-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-1-10-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-11-20-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-21-35-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-36-49-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-1-10-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-11-20-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-21-30-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-tr

/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg005.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0012.tlg001.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0012.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0013.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg001.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg003.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg001.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg002.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg003.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg004.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg005.

In [176]:
master_df = pd.concat(DFList, ignore_index=True)
print(master_df)

           Author AuthorTLG WorkTLG          Form         Lemma Tense Mood  \
0       Aeschines   tlg0026  tlg001    γραψάμενος         γράφω     a    p   
1       Aeschines   tlg0026  tlg001       λυπήσας         λυπέω     a    p   
2       Aeschines   tlg0026  tlg001        νομίζω        νομίζω     p    i   
3       Aeschines   tlg0026  tlg001    παρεσχηκώς        παρέχω     r    p   
4       Aeschines   tlg0026  tlg001          ὁρῶν          ὁράω     p    p   
...           ...       ...     ...           ...           ...   ...  ...   
268045  Herodotus   tlg0016     N/A     ἐκφυγόντα       ἐκφεύγω     a    p   
268046  Herodotus   tlg0016     N/A  περιγενέσθαι  περιγίγνομαι     a    n   
268047  Herodotus   tlg0016     N/A    ἐμηχανῶντο    μηχανάομαι     i    i   
268048  Herodotus   tlg0016     N/A   περιέπεμπον     περιπέμπω     i    i   
268049  Herodotus   tlg0016     N/A     ἐποιεῦντο         ποιέω     i    i   

       Voice Tense+Mood+Voice      POStag  
0          m       

In [21]:
master_df['Author'].replace('', np.nan, inplace=True)
master_df.dropna(subset=['Author'], inplace=True)

In [22]:
#general verb counts
master_df['Tense+Mood+Voice'].value_counts(normalize = True)

pia    0.114140
aia    0.106509
ppa    0.103722
iia    0.084729
pna    0.065274
         ...   
-pa    0.000004
pnd    0.000004
-d-    0.000004
ti-    0.000004
-ie    0.000004
Name: Tense+Mood+Voice, Length: 138, dtype: float64

In [23]:
# verb counts
master_df.groupby("Author", sort=False)["Tense+Mood+Voice"].value_counts(normalize = True)

Author               Tense+Mood+Voice
Aeschines            pia                 0.137268
                     pna                 0.096088
                     ppa                 0.095402
                     aia                 0.057310
                     pie                 0.048387
                                           ...   
Georgius Sphrantzes  fpp                 0.000286
                     ppe                 0.000286
                     psp                 0.000286
                     rna                 0.000286
                     rpe                 0.000286
Name: Tense+Mood+Voice, Length: 2740, dtype: float64

In [35]:
# verb counts based on tense+mood+voice
verbFormCt = master_df.groupby(["Author", "Tense+Mood+Voice", "Tense", "Mood", "Voice"], sort=False)["Tense+Mood+Voice"].size().reset_index(name='count')

verbFormCt['norm_ct'] = verbFormCt['count'] / verbFormCt.groupby("Author")['count'].transform('sum')
verbFormCt

Unnamed: 0,Author,Tense+Mood+Voice,Tense,Mood,Voice,count,norm_ct
0,Aeschines,apm,a,p,m,39,0.013384
1,Aeschines,apa,a,p,a,132,0.045299
2,Aeschines,pia,p,i,a,400,0.137268
3,Aeschines,rpa,r,p,a,90,0.030885
4,Aeschines,ppa,p,p,a,278,0.095402
...,...,...,...,...,...,...,...
2735,New Testament,aop,a,o,p,4,0.000147
2736,New Testament,fpp,f,p,p,1,0.000037
2737,New Testament,rpe,r,p,e,1,0.000037
2738,New Testament,ppe,p,p,e,3,0.000111


In [32]:
fig1 = px.treemap(verbFormCt, path=[px.Constant('Verb Tenses'), 'Author', 'Tense+Mood+Voice'], values='norm_ct')
fig1.show()

In [33]:
fig1 = px.treemap(verbFormCt, path=[px.Constant('Verb Tenses'), 'Author', 'Tense+Mood+Voice'], values='count')
fig1.show()

In [134]:
fig2 = px.icicle(verbFormCt, path=[px.Constant('Verb Tenses'), 'Author', 'Tense+Mood+Voice'], values='count')
fig2.show()

In [97]:
fig3 = px.sunburst(verbFormCt, path=[px.Constant('Verb Tenses'), 'Author', 'Tense+Mood+Voice'], values='count')
fig3.show()

In [191]:
authList = defaultdict(list)
PostagExcept = ['""', '_', '-', '', '--------']
PostagMistake = ['m-------', 'c-------', 'b-------', 'd-------', 'c']

def findAll(FILENAME):
    authTLG = 'dummy'
    workTLG = 'NaN'
    returnList = []
    with open(FILENAME, 'r', encoding="utf-8") as f:
        soup = BeautifulSoup(f, "xml")
        
        # for proiel trees
        if (re.search("proiel", FILENAME)):
            if (re.search("greek-nt\.xml", FILENAME)):
                authTLG = 'tlg0031'
            elif (re.search("chron\.xml", FILENAME)):
                authTLG = 'tlg3143'
            elif (re.search("hdt\.xml", FILENAME)):
                authTLG = 'tlg0016'
                
            for sentence in soup('sentence'):
                sentID = sentence['id']
                docID  = 'NaN'
                author = 'NaN'
                for token in sentence('token'):    
                    wordID = token['id']
                    if token.has_attr('head-id'):
                        head = token['head-id']
                    else:
                        head = 'NaN'
                    if token.has_attr('form'):
                        form = token['form']
                    else:
                        form = 'NaN'
                    if token.has_attr('lemma'):
                        lemma = token['lemma']
                    else:
                        lemma = 'NaN'
                    if token.has_attr('citation-part'):
                        subdoc = token['citation-part']
                        book   = subdoc.split('.')[0]
                        if book == '1' and authTLG == 'tlg0016':
                            continue
                    else:
                        lemma = 'NaN'
                    if token.has_attr('relation'):
                        relation = token['relation']
                    else:
                        relation = 'NaN'
                    ref = 'NaN'
                    if token.has_attr('presentation-after'):
                        presentation_after = token['presentation-after']
                    else:
                        presentation_after = 'NaN'
                    if token.has_attr('empty-token-sort'):
                        emptyTokenSort = token['empty-token-sort']
                    else:
                        emptyTokenSort = 'NaN'
                    insertionID = 'NaN'
                    artificial = 'NaN'
                    if token.has_attr('part-of-speech'):
                        pos = token['part-of-speech']
                    else:
                        pos = 'NaN'
                    if token.has_attr('morphology'):
                        postag     = token['morphology']
                        person     = postag[0]
                        number     = postag[1]
                        tense      = postag[2]
                        mood       = postag[3]
                        voice      = postag[4]
                        gender     = postag[5]
                        case       = postag[6]
                        degree     = postag[7]
                        strength   = postag[8]
                        inflection = postag[9]
                    else:
                        postag     = 'NaN'
                        pos        = 'NaN'
                        person     = 'NaN'
                        number     = 'NaN'
                        tense      = 'NaN'
                        mood       = 'NaN'
                        voice      = 'NaN'
                        gender     = 'NaN'
                        case       = 'NaN'
                        degree     = 'NaN'
                        strength   = 'NaN'
                        inflection = 'NaN'
        
                    wordList = []
                    wordList.append(authinfo[authTLG].split(',')[0])
                    wordList.append(authTLG)
                    wordList.append(workTLG)
                    if not authinfo[authTLG] == '':
                        wordList.append(authinfo[authTLG].split(',')[1])
                        wordList.append(authinfo[authTLG].split(',')[2])
                        wordList.append(authinfo[authTLG].split(',')[3])
                        wordList.append(authinfo[authTLG].split(',')[4])
                    else:
                        wordList.append('NaN')
                        wordList.append('NaN')
                        wordList.append('NaN')
                        wordList.append('NaN')
                    wordList.append(sentID)
                    wordList.append(docID)
                    wordList.append(subdoc)
                    wordList.append(author)
                    wordList.append(wordID)
                    wordList.append(head)
                    wordList.append(form)
                    wordList.append(lemma)
                    wordList.append(relation)
                    wordList.append(ref)
                    wordList.append(presentation_after)
                    wordList.append(insertionID)
                    wordList.append(artificial)
                    wordList.append(postag)
                    wordList.append(pos)
                    wordList.append(person)
                    wordList.append(number)
                    wordList.append(tense)
                    wordList.append(mood)
                    wordList.append(voice)
                    wordList.append(gender)
                    wordList.append(case)
                    wordList.append(degree)
                    wordList.append(strength)
                    wordList.append(inflection)
                    #print(wordList)
                    returnList.append(wordList)
        # for gorman, pedalion, and gAGDT trees
            
        else:
            if(re.search('example-sentences', FILENAME)):
                return
            else:
                if(re.search("papyri", FILENAME)):
                    authTLG = 'papyri'
                    
                for sentence in soup('sentence'):
                    sentID = sentence['id']
                    docID  = sentence['document_id']
                    if sentence.has_attr('subdoc'):
                        subdoc = sentence['subdoc']
                    else:
                        subdoc = 'NaN'
                    if sentence.has_attr('Author'):
                        author = sentence['Author']
                    else:
                        author = 'NaN'
                    # find author ID
                    if ("urn:cts:greekLit:" in docID):
                        docIDList = docID.split(':')
                        if ("tlg" in docIDList[3]):
                            authTLG = docIDList[3].split('.')[0]
                            workTLG = docIDList[3].split('.')[1]
                        else:
                            authTLG = docIDList[-1].split('.')[0]
                            workTLG = docIDList[-1].split('.')[1]
                    elif ("Perseus:text:" in docID):
                        authTLG = 'tlg0008'
                        workTLG = 'tlg001'
                    elif (re.search(r'....-...', docID)):
                        authTLG = 'tlg' + docID.split('-')[0]
                        workTLG = 'tlg' + docID.split('-')[1]
                    elif ("NT" in docID):
                        authTLG = 'tlg0031'
                    elif ("Ps-Luc" in docID):
                        authTLG = 'tlg0061'
                    elif ("Paean" in docID):
                        authTLG = 'tlgX208'
                    elif ("Genesis" in docID):
                        authTLG = 'tlg0527'
                    elif ("Chilia" in docID):
                        authTLG = 'Chilia'
                    elif ("Pedalion" in docID):
                        authTLG = 'Pedalion'
                    elif ("Mimn" in docID):
                        authTLG = 'tlg0255'
                    elif ("0260" in docID):
                        authTLG = 'tlg0260'
                    elif ("0005" in docID):
                        authTLG = 'tlg0005'

                    for word in sentence('word'):    
                        # find form, lemma, postag
                        wordID = word['id']
                        head = word['head']
                        if word.has_attr('form'):
                            form = word['form']
                        else:
                            form = 'NaN'
                        if word.has_attr('lemma'):
                            lemma = word['lemma']
                        else:
                            lemma = 'NaN'
                        if word.has_attr('relation'):
                            relation = word['relation']
                        else:
                            relation = 'NaN'
                        if word.has_attr('ref'):
                            ref = word['ref']
                        else:
                            ref = 'NaN'
                        presentation_after = 'NaN'
                        if word.has_attr('insertion_id'):
                            insertionID = word['insertion_id']
                        else:
                            insertionID = 'NaN'
                        if word.has_attr('artificial'):
                            artificial = word['artificial']
                        else:
                            artificial = 'NaN'
                        if word.has_attr('postag'):
                            postag = word['postag']
                            if postag in PostagExcept:
                                postag     = 'NaN'
                                pos        = 'NaN'
                                person     = 'NaN'
                                number     = 'NaN'
                                tense      = 'NaN'
                                mood       = 'NaN'
                                voice      = 'NaN'
                                gender     = 'NaN'
                                case       = 'NaN'
                                degree     = 'NaN'
                                strength   = 'NaN'
                                inflection = 'NaN'
                            elif postag in PostagMistake:
                                postag     = postag[0] + '--------'
                                pos        = postag[0]
                                person     = 'NaN'
                                number     = 'NaN'
                                tense      = 'NaN'
                                mood       = 'NaN'
                                voice      = 'NaN'
                                gender     = 'NaN'
                                case       = 'NaN'
                                degree     = 'NaN'
                                strength   = 'NaN'
                                inflection = 'NaN'
                            elif postag == 'm-p---na':
                                postag     = 'm-p---na-'
                                pos        = postag[0]
                                person     = postag[1]
                                number     = postag[2]
                                tense      = postag[3]
                                mood       = postag[4]
                                voice      = postag[5]
                                gender     = postag[6]
                                case       = postag[7]
                                degree     = postag[8]
                                strength   = 'NaN'
                                inflection = 'NaN'
                            elif postag == 'v2pasm':
                                postag     = 'v2pasm---'
                                pos        = postag[0]
                                person     = postag[1]
                                number     = postag[2]
                                tense      = postag[3]
                                mood       = postag[4]
                                voice      = postag[5]
                                gender     = postag[6]
                                case       = postag[7]
                                degree     = postag[8]
                                strength   = 'NaN'
                                inflection = 'NaN'
                            else:
                                #print(postag)
                                pos        = postag[0]
                                person     = postag[1]
                                number     = postag[2]
                                tense      = postag[3]
                                mood       = postag[4]
                                voice      = postag[5]
                                gender     = postag[6]
                                case       = postag[7]
                                degree     = postag[8]
                                strength   = 'NaN'
                                inflection = 'NaN'
                        else:
                            postag     = 'NaN'
                            pos        = 'NaN'
                            person     = 'NaN'
                            number     = 'NaN'
                            tense      = 'NaN'
                            mood       = 'NaN'
                            voice      = 'NaN'
                            gender     = 'NaN'
                            case       = 'NaN'
                            degree     = 'NaN'
                            strength   = 'NaN'
                            inflection = 'NaN'

                        wordList = []
                        wordList.append(authinfo[authTLG].split(',')[0])
                        wordList.append(authTLG)
                        wordList.append(workTLG)
                        if not authinfo[authTLG] == '':
                            wordList.append(authinfo[authTLG].split(',')[1])
                            wordList.append(authinfo[authTLG].split(',')[2])
                            wordList.append(authinfo[authTLG].split(',')[3])
                            wordList.append(authinfo[authTLG].split(',')[4])
                        else:
                            wordList.append('NaN')
                            wordList.append('NaN')
                            wordList.append('NaN')
                            wordList.append('NaN')
                        wordList.append(sentID)
                        wordList.append(docID)
                        wordList.append(subdoc)
                        wordList.append(author)
                        wordList.append(wordID)
                        wordList.append(head)
                        wordList.append(form)
                        wordList.append(lemma)
                        wordList.append(relation)
                        wordList.append(ref)
                        wordList.append(presentation_after)
                        wordList.append(insertionID)
                        wordList.append(artificial)
                        wordList.append(postag)
                        wordList.append(pos)
                        wordList.append(person)
                        wordList.append(number)
                        wordList.append(tense)
                        wordList.append(mood)
                        wordList.append(voice)
                        wordList.append(gender)
                        wordList.append(case)
                        wordList.append(degree)
                        wordList.append(strength)
                        wordList.append(inflection)
                        #print(wordList)
                        returnList.append(wordList)

    return returnList

In [192]:
searchfiles = []
searchfiles = addfiles(GORPATH, searchfiles)
searchfiles = addfiles(PEDPATH, searchfiles)
searchfiles = addfiles(MAMPATH, searchfiles)
searchfiles = addfiles(PROPATH, searchfiles)
FILENAME = os.path.join(PEDPATH, 'euripides_medea.xml')


# data = findAll(FILENAME)
# print(data)

allDFList = []
for i in searchfiles:
    print(i)
    data = findAll(i)
    df = pd.DataFrame(data, columns = ['Author', 'AuthorTLG', 'WorkTLG', 'StartDate', 'EndDate', 'Poetry/Prose', 'Genre', 'sentID', 'docID', 'subdoc', \
        'AuthorName', 'wordID', 'head', 'form', 'lemma', 'relation', 'ref', 'presentation_after', 'insertionID', \
        'artificial', 'postag', 'pos', 'person', 'number', 'tense', 'mood', 'voice', 'gender', 'case', \
        'degree', 'strength', 'inflection'])
    allDFList.append(df)

/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-1-50-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-101-150-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-151-196-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/aeschines-1-51-100-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-1-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-2-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-5-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/antiphon-6-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-0-1-4-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-11-14-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-5-7-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/appian-bc-1-8-10-bu1.xml
/Users/

/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plut-alcib-18-39-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plut-fortuna-romanorum-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-alex-fort-aut-virt-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-lycurgus-1-15-bu4.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/plutarch-lycurgus-16-31-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-1-10-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-11-20-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-21-35-bu2.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-10-36-49-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-1-10-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-trees/public/xml/polybius-2-11-20-bu1.xml
/Users/bellahwang/Documents/GitHub/gorman-tr

/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg004.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0011.tlg005.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0012.tlg001.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0012.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0013.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg001.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg002.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0020.tlg003.perseus-grc1.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg001.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg002.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg003.perseus-grc2.tb.xml
/Users/bellahwang/Documents/GitHub/gAGDT/data/xml/tlg0085.tlg004.

In [193]:
master_alldf = pd.concat(allDFList, ignore_index=True)
print(master_alldf)

            Author AuthorTLG WorkTLG StartDate EndDate Poetry/Prose    Genre  \
0        Aeschines   tlg0026  tlg001      -389    -314        prose   orator   
1        Aeschines   tlg0026  tlg001      -389    -314        prose   orator   
2        Aeschines   tlg0026  tlg001      -389    -314        prose   orator   
3        Aeschines   tlg0026  tlg001      -389    -314        prose   orator   
4        Aeschines   tlg0026  tlg001      -389    -314        prose   orator   
...            ...       ...     ...       ...     ...          ...      ...   
1495803  Herodotus   tlg0016     NaN     -484c   -425c        prose  history   
1495804  Herodotus   tlg0016     NaN     -484c   -425c        prose  history   
1495805  Herodotus   tlg0016     NaN     -484c   -425c        prose  history   
1495806  Herodotus   tlg0016     NaN     -484c   -425c        prose  history   
1495807  Herodotus   tlg0016     NaN     -484c   -425c        prose  history   

         sentID                        

In [194]:
master_alldf['Author'].replace('', np.nan, inplace=True)
master_alldf.replace('-', np.nan, inplace=True)
master_alldf.dropna(subset=['Author'], inplace=True)

In [195]:
master_alldf

Unnamed: 0,Author,AuthorTLG,WorkTLG,StartDate,EndDate,Poetry/Prose,Genre,sentID,docID,subdoc,...,person,number,tense,mood,voice,gender,case,degree,strength,inflection
0,Aeschines,tlg0026,tlg001,-389,-314,prose,orator,1,urn:cts:greekLit:tlg0026.tlg001.perseus-grc1,1,...,,s,,,,m,a,,,
1,Aeschines,tlg0026,tlg001,-389,-314,prose,orator,1,urn:cts:greekLit:tlg0026.tlg001.perseus-grc1,1,...,,,,,,,,,,
2,Aeschines,tlg0026,tlg001,-389,-314,prose,orator,1,urn:cts:greekLit:tlg0026.tlg001.perseus-grc1,1,...,,p,,,,m,g,,,
3,Aeschines,tlg0026,tlg001,-389,-314,prose,orator,1,urn:cts:greekLit:tlg0026.tlg001.perseus-grc1,1,...,,p,,,,m,g,,,
4,Aeschines,tlg0026,tlg001,-389,-314,prose,orator,1,urn:cts:greekLit:tlg0026.tlg001.perseus-grc1,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495803,Herodotus,tlg0016,,-484c,-425c,prose,history,225174,,8.7.2,...,,,,,,,,,,n
1495804,Herodotus,tlg0016,,-484c,-425c,prose,history,225174,,8.7.2,...,,p,,,,f,d,,,i
1495805,Herodotus,tlg0016,,-484c,-425c,prose,history,225174,,8.7.2,...,,p,,,,f,d,,,i
1495806,Herodotus,tlg0016,,-484c,-425c,prose,history,225174,,8.7.2,...,3,p,i,i,m,,,,,i


In [196]:
FILENAME = os.path.join(BELPATH, 'allauthors.csv')
master_alldf.to_csv(FILENAME)

In [184]:
#verbFormCt = master_alldf.groupby(["Author", "Tense", "Mood", "Voice"], sort=False)["Tense", "Mood", "Voice"].size().reset_index(name='count')

verbFormCt = master_alldf.groupby(["Author", "tense", "mood", "voice"], sort=False).size().reset_index(name='count')
verbFormCt['norm_count'] = verbFormCt['count'] / verbFormCt.groupby("Author")['count'].transform('sum')
verbFormCt

Unnamed: 0,Author,tense,mood,voice,count,norm_count
0,Aeschines,a,p,m,39,0.012965
1,Aeschines,a,p,a,132,0.043883
2,Aeschines,p,i,a,400,0.132979
3,Aeschines,r,p,a,90,0.029920
4,Aeschines,p,p,a,278,0.092420
...,...,...,...,...,...,...
2721,Herodotus,f,i,p,3,0.000160
2722,Herodotus,i,o,m,1,0.000053
2723,Herodotus,p,m,p,1,0.000053
2724,Herodotus,r,s,m,1,0.000053


In [189]:
fig = px.sunburst(verbFormCt, path=[px.Constant('Verb Tenses'), 'Author', 'tense'], values='norm_count')
fig.show()