In [79]:
import requests
from bs4 import BeautifulSoup, NavigableString
import pandas as pd
from collections import defaultdict

import re
import os

In [80]:
# taken directly from Dr. Crane's 'TreebankCount.ipynb'
tenses = {'p':'present','f':'future', 'i':'imperfect','a':'aorist','r':'perfect','l':'pluperfect','t':'future perfect'}
voices = {'a':'active','m':'middle','p':'passive', 'i':'imperfect','e':'middle-passive'}
moods = {'i':'indicative','s':'subjunctive', 'o':'optative','m':'imperative','p':'participle','n':'infinitive'}

authinfo = {}
authinfo['tlg0011'] = 'Sophocles,-497,-406,poetry,drama'
authinfo['tlg1220'] = 'Batrachomyomachia,-100,-1,poetry,hexameter'
authinfo['tlg0013'] = 'Homeric Hymns,-650,-450,poetry,hexameter'
authinfo['tlg0020'] = 'Hesiod,-750,-650,poetry,hexameter'
authinfo['tlg0026'] = 'Aeschines,-389,-314,prose,orator'
authinfo['tlg0058'] = 'Aeneas Tacticus,301,400,prose,misc'
authinfo['tlg0096'] = 'Aesop,-620,-564,prose'
authinfo['tlg0085'] = 'Aeschylus,-525c,-455c,poetry,drama'
authinfo['tlg0028'] = 'Antiphon,-490,-411,prose,orator'
authinfo['tlg0551'] = 'Appian,-490,-411,prose,history'
authinfo['tlg0086'] = 'Aristotle,-384,-322,prose,philosophy'
authinfo['tlg0019'] = 'Aristophanes,-446,-386,poetry,drama'
authinfo['tlg0008'] = 'Athenaeus,170,223,prose,misc'
authinfo['tlg0554'] = 'Chariton,101,200,prose,misc'
authinfo['tlg0041'] = 'Chionis Epistulae,1,200,prose,misc'
authinfo['tlg0627'] = 'Corpus Hippocraticum,-450,-350,prose,misc'
authinfo['tlg0014'] = 'Demosthenes,-384,-322,prose,orator'
authinfo['tlg0060'] = 'Diodorus Siculus,-90c,-30c,prose,history'
authinfo['tlg0081'] = 'Dionysius of Halicarnassus,-60c,-7c,prose,history'
authinfo['tlg0557'] = 'Epictetus,50,135,prose,philosopy'
authinfo['tlg0537'] = 'Epicurus,-341,-270,prose,philosophy'
authinfo['tlg0343'] = 'Ezechiel the Tragic Poet,-200,-101,poetry'
authinfo['tlg0006'] = 'Euripides,-480c,-406c,poetry,drama'
authinfo['tlg0016'] = 'Herodotus,-484c,-425c,prose,history'
authinfo['tlg0559'] = 'Heron of Alexandria,10,70,prose,misc'
authinfo['tlg0010'] = 'Isocrates,-436,-338,prose,orator'
authinfo['tlg0526'] = 'Josephus,37,100c,prose,history'
authinfo['tlg2003'] = 'Julian the Apostate,331,363,prose,misc'
authinfo['tlg0561'] = 'Longus,101,200,prose,misc'
authinfo['tlg0061'] = 'Pseudo-Lucian,201,400,prose,misc'
authinfo['tlg0062'] = 'Lucian,125,180,prose,misc'
authinfo['tlg0540'] = 'Lysias,-445c,-380c,prose,orator'
authinfo['tlg0541'] = 'Menander,-342,-291,poetry,drama'
authinfo['tlg0255'] = 'Mimnermus,-650,-600,poetry,lyreleg'
authinfo['tlgX208'] = 'Paeanius,301,400,prose,misc'
authinfo['tlg0585'] = 'Phlegon,151,200,prose,misc'
authinfo['tlg0059'] = 'Plato,-428c,-347c,prose,philosophy'
authinfo['tlg0007'] = 'Plutarch,46,119,prose,history'
authinfo['tlg0543'] = 'Polybius,-200,-118,prose,history'
authinfo['tlg4029'] = 'Procopius,500,565,prose,history'
authinfo['tlg0009'] = 'Sappho,-630,-570,poetry,lyreleg'
authinfo['tlg0260'] = 'Semonides,-700,-601,poetry,lyreleg'
authinfo['tlg0527'] = 'Septuagint,-250,-100,prose,bible'
authinfo['tlg0544'] = 'Sextus Empiricus,150,250,prose,philosophy'
authinfo['tlg0032'] = 'Xenophon,-430c,-354,prose,history'
authinfo['tlg0005'] = 'Theocritus,-300,-255,poetry,hexameter'
authinfo['tlg0003'] = 'Thucydides,-460c,-400c,prose,history'
authinfo['tlg0093'] = 'Theophrastus,-371,-287,prose,philosophy'
authinfo['tlg0012'] = 'Homer,-775c,-700c,poetry,hexameter'

In [81]:
# change ROOT to local root
ROOT = "c:\\Users\\bella\\OneDrive\\Documents"

LOCALPATH = os.path.join(ROOT, 'GitHub')
GORPATH = os.path.join(LOCALPATH, 'gorman-trees', 'public', 'xml')
PEDPATH = os.path.join(LOCALPATH, 'pedalion-trees', 'public', 'xml')
MAMPATH = os.path.join(LOCALPATH, 'gAGDT', 'data', 'xml')

In [90]:
FILENAME = os.path.join(PEDPATH, 'chion.xml')

def VerbTenses(FILENAME):
    verbForms = defaultdict(int)
    
    with open(FILENAME, 'r', encoding="utf-8") as f:
        soup = BeautifulSoup(f, "xml")
        if soup.treebank.sentence['document_id']:
            docID = soup.treebank.sentence['document_id']

            TLG = docID.split(':')[-1]
            author = TLG.split('.')[0]
            work = TLG.split('.')[1]
            print(TLG)
        '''
        elif soup.treebank.sentence.has_attr('Author'):
            Author = soup.treebank.sentence['Author']
            print(Author)
        '''
        for sentence in soup('sentence'):
            for word in sentence('word'):
                if word.has_attr('postag'):
                    postag = word['postag']
                    #print(postag)
                    pos    = postag[0]
                    person = postag[1]
                    number = postag[2]
                    tense  = postag[3]
                    mood   = postag[4]
                    voice  = postag[5]
                    gender = postag[6]
                    case   = postag[7]
                    '''
                    lemma = word['lemma']
                    strLemma = re.sub(r'\d+', '', lemma)
                    if (strLemma.endswith('ω')):
                        verbEnd = 'o'
                    elif (strLemma.endswith(''))
                    '''
                    if (pos == 'v'):
                        category = tense + mood + voice
                        verbForms[category] += 1
        print(verbForms)
        
#VerbTenses(FILENAME)

IndexError: list index out of range

In [15]:
def addfiles(dirname,flist):
    for foo in sorted(os.listdir(dirname)):
        if(not re.search('\.xml$',foo)):
            continue
        flist.append(dirname+foo)
    return(flist)

None
