# Metrical Analysis of Sanskrit Ninth Class Verb Forms

## Getting Verbal Roots 

In [None]:
# Create the working directories used by the rest of the notebook
# (-p: no error if they already exist).
!mkdir -p downloads
!mkdir -p data

In [None]:
# Download the PDF of Whitney's Roots from GRETIL into downloads/whitney_roots.pdf.
!wget -O downloads/whitney_roots.pdf http://gretil.sub.uni-goettingen.de/gretil_elib/Whi885__Whitney_Roots-ACCENTED.pdf

In [4]:
# install pdftk if not already there. eg: for ubuntu: sudo apt install pdftk
# Extract page 229 of the downloaded PDF into its own single-page PDF.
!pdftk downloads/whitney_roots.pdf cat 229 output data/whitney_roots_ninth_class.pdf

In [5]:
# Convert the extracted page to plain text.
# produces data/whitney_roots_ninth_class.txt
!pdftotext data/whitney_roots_ninth_class.pdf

Clean up the text version manually, fixing formatting and diacritics.

Final results are in [data/whitney_roots_ninth_class_cleaned.txt](data/whitney_roots_ninth_class_cleaned.txt)

In [225]:
# TODO try to get the 9th class forms/roots directly from Lubotsky's concordance?

## Parsing Verbal Roots Info

In [486]:
# Section headers found in the cleaned-up text file.
CLASS_HEADER = "6. nā-class"
EARLIER_LANGUAGE_HEADER = "A. Earlier Language"
EARLIER_AND_LATER_LANGUAGE_HEADER = "B. Earlier and Later Language"
LATER_LANGUAGE_HEADER = "C. Later Language"

# Final vowel of the strong / weak ninth-class stem.
NINTH_CLASS_STRONG_MARKER = "ā"
NINTH_CLASS_WEAK_MARKER = "ī"

# Roots whose vowel length differs from what is left after stripping the last
# two characters of the stem.
# TODO implement overrides for others
ROOT_OVERRIDES = {
    "pu": "pū",
    "ju": "jū",
    "ji": "jī",
}

# One dict per stem line of the file, in file order.
whitney_roots = []

# Set by the most recent period header seen; applies to the stem lines after it.
language_period = None

# The file is full of diacritics, so read it as UTF-8 explicitly instead of
# relying on the platform's default encoding.
with open("data/whitney_roots_ninth_class_cleaned.txt", 'r', encoding="utf-8") as whitney_file:
    for line in whitney_file:
        line = line.rstrip()

        # Blank lines and the class header carry no data.
        if not line or CLASS_HEADER in line:
            continue
        elif EARLIER_LANGUAGE_HEADER in line:
            language_period = "Earlier"
            continue
        elif EARLIER_AND_LATER_LANGUAGE_HEADER in line:
            language_period = "Earlier & Later"
            continue
        elif LATER_LANGUAGE_HEADER in line:
            language_period = "Later"
            continue

        # A stem line is: [variant number] stem [attestation texts ...]
        line_parts = line.split()

        variant_no = None
        if line_parts[0].isdigit():
            variant_no = line_parts.pop(0)

        if not line_parts:
            # Only a variant number on the line: report which line needs fixing
            # instead of failing with a bare IndexError.
            raise ValueError(f"No stem found on line: {line!r}")
        stem = line_parts.pop(0)

        attestation_texts = " ".join(line_parts) if line_parts else None

        # A stem listed with the weak marker is recorded as weak-only; in both
        # cases the other grade is derived by swapping the final vowel.
        weak_only = stem.endswith(NINTH_CLASS_WEAK_MARKER)
        if weak_only:
            weak_stem = stem
            strong_stem = stem[:-1] + NINTH_CLASS_STRONG_MARKER
        else:
            weak_stem = stem[:-1] + NINTH_CLASS_WEAK_MARKER
            strong_stem = stem

        # removes the last two chars
        root = stem[:-2]
        root = ROOT_OVERRIDES.get(root, root)

        whitney_roots.append({
            "root": root,
            "variant_no": variant_no,
            "strong_stem": strong_stem,
            "weak_stem": weak_stem,
            "weak_only": weak_only,
            "attestation_texts": attestation_texts,
            "language_period": language_period,
        })

In [17]:
import pandas

In [487]:
# One row per parsed stem; saved without the index column so the CSV can be
# annotated by hand.
df_whitney_roots = pandas.DataFrame(whitney_roots)
df_whitney_roots.to_csv("data/whitney_roots_ninth_class.csv", index=False)
df_whitney_roots.head()

Unnamed: 0,root,variant_no,strong_stem,weak_stem,weak_only,attestation_texts,language_period
0,i,,inā,inī,True,V.,Earlier
1,iṣ,,iṣṇā,iṣṇī,False,,Earlier
2,ubh,,ubhnā,ubhnī,False,V.,Earlier
3,uṣ,,uṣṇā,uṣṇī,False,V.,Earlier
4,kṣi,,kṣiṇā,kṣiṇī,False,V.B.,Earlier


## Annotating Verbal Roots with Rig Veda Attestations (Manual)

Using Lubotsky's concordance, attestation info is manually added to [data/whitney_roots_ninth_class.csv](data/whitney_roots_ninth_class.csv).

Final results are in [data/roots_ninth_class_manual.csv](data/roots_ninth_class_manual.csv).

In [439]:
# keep_default_na=False: load empty cells as '' rather than NaN, so the string
# operations applied to these columns further down work on every row.
df_roots_manual = pandas.read_csv("data/roots_ninth_class_manual.csv", keep_default_na=False)

In [441]:
# TODO remove test df once we have all the annotations
# Until then, work only with the rows that already carry a manual note.
has_notes = df_roots_manual["notes"].str.len().gt(0)
df_roots_test_manual = df_roots_manual[has_notes]
df_roots_test_manual.head()

Unnamed: 0,root,variant_no,stem,weak_only,attestation_texts,language_period,rig_veda_weak_attestations,rig_veda_strong_attestations,lubotsky_page_no,notes
1,iṣ,,iṣṇā,False,,Earlier,,1.63.2d,1:,iSnAsi
18,vr̥,1.0,vr̥ṇī,True,V.,Earlier,1.180.4b 1.67.1b 4.25.3a,,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll)
37,pu,,punā,False,,Earlier & Later,9.16.3c 9.67.27d,1.133.1a 10.13.3d,1:900-,punIhi puNAmi


## Annotating Verbal Roots with Rig Veda Attestations (Own Search)

### Getting Rig Veda padapatha text (Eichler)

In [None]:
# Download the Rig Veda padapatha as a single UTF-8 HTML page.
# Related pages on the same site:
# http://www.detlef108.de/Rigveda.htm 
# http://www.detlef108.de/Notes-to-the-Rigveda-Page.htm 
!wget -O downloads/rv_padapatha_eichler.html http://www.detlef108.de/RV-Padapatha-TA3-paada-NA-UTF8.html 

In [224]:
# Extract the plain text of the padapatha from the downloaded HTML page.
#
# Command-line alternative (sudo apt install html2text):
#!html2text -utf8 -width 3000 -o rv_padapatha.txt rv_padapatha.html

from bs4 import BeautifulSoup

# The download URL advertises the page as UTF-8; read it as such instead of
# relying on the platform's default encoding.
with open("downloads/rv_padapatha_eichler.html", "r", encoding="utf-8") as input_file:
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    soup = BeautifulSoup(input_file, "lxml")

hymns = []

for para in soup.find_all("p"):
    # an empty <p> has no text to contribute (and no first child to inspect)
    if not para.contents:
        continue

    # ignore the ending notes
    if para.contents[0].name == "span":
        continue

    # The text is deliberately not stripped: .rstrip() here would remove the
    # extra lines between hymns.
    hymns.append(para.text)

with open("data/rv_padapatha_eichler.txt", 'w', encoding="utf-8") as f:
    f.write("".join(hymns))

In [222]:
# TODO break the padapatha verse into sub-lines

### Getting Rig Veda padapatha / metrically restored texts

In [None]:
# Download two Rig Veda text versions from the VedaWeb sources repository;
# they are saved as the padapatha (lubotsky) and samhitapatha (vnh) JSON files.
# https://github.com/cceh/c-salt_vedaweb_sources/tree/master/rigveda/versions
# description of the sources here:
# https://github.com/cceh/c-salt_vedaweb_tei/blob/master/vedaweb_corpus.tei
# https://vedaweb.uni-koeln.de/rigveda/help

!wget -O downloads/rv_padapatha_lubotsky.json https://raw.githubusercontent.com/cceh/c-salt_vedaweb_sources/master/rigveda/versions/lubotsky.json

!wget -O downloads/rv_samhitapatha_vnh.json https://raw.githubusercontent.com/cceh/c-salt_vedaweb_sources/master/rigveda/versions/vnh.json

In [217]:
# TODO make text version from the jsons, with line numbers at the beginning

### Searching text for ninth-class verbal forms

In [359]:
# TODO search the text for ninth-class verbal forms, replicating the vedaweb search below?
# Use vidyut to identify only the finite verbal forms?

## Annotating Verbal Roots with Rig Veda Attestations

In [513]:
# TODO
#!curl -H "Content-Type: application/json" -XPOST https://vedaweb.uni-koeln.de/rigveda/api/search/grammar -d '{}'

import requests
import time

from bs4 import BeautifulSoup

VEDAWEB_API_URL = "https://vedaweb.uni-koeln.de/rigveda/api"

def parse_vedaweb_search_highlight_text(text):
    """Parse one highlight string of a grammar search hit into word glosses.

    After markup is stripped, the text is treated as '/'-separated word
    instances, each a ';'-separated list of 'name: value' properties.

    Args:
        text: highlight string (may contain markup).

    Returns:
        A list with one dict per word instance, mapping property name to
        value. The "lemma" and "lemma type" properties are left out.

    Raises:
        ValueError: if a non-blank property has no ':' separator.
    """
    ignored_props = {"lemma", "lemma type"}
    word_instances = []

    # Going through the HTML parser drops any markup in the highlight.
    plain_text = BeautifulSoup(text, "lxml").text

    for instance_text in plain_text.split('/'):
        # a leading/trailing '/' leaves a blank instance behind
        if not instance_text.strip():
            continue

        word_gloss = {}
        for prop in instance_text.split(';'):
            # same for a trailing ';'
            if not prop.strip():
                continue

            # Split on the first ':' only, so a value that itself contains a
            # colon is kept whole.
            prop_name, separator, prop_value = prop.partition(':')
            if not separator:
                raise ValueError(f"Malformed grammar property (missing ':'): {prop!r}")

            prop_name = prop_name.strip()
            if prop_name in ignored_props:
                continue

            word_gloss[prop_name] = prop_value.strip()

        word_instances.append(word_gloss)

    return word_instances
        
def search_verb_form_attestations_vedaweb(root, stem=None, results_no=10, results_from=0, timeout=30):
    """Search the VedaWeb grammar API for finite verb forms of a root.

    Args:
        root: root lemma to search for, as spelled by VedaWeb (the caller
            appends a variant number where needed, e.g. "iṣ 1").
        stem: optional stem; when given, only word forms containing it match.
        results_no: maximum number of hits to request ("size").
        results_from: offset of the first hit to request ("from").
        timeout: seconds to wait for the server before giving up.

    Returns:
        A dict mapping each hit's stanza id ("docId") to a list of
        {word: gloss} dicts, one per highlighted word instance.

    Raises:
        requests.HTTPError: on a non-200 response.
        requests.Timeout: if the server does not answer within `timeout`.
        RuntimeError: if the same stanza id appears twice in the hits.
    """
    search_block = {
        "lemma type": "root",
        "lemma": root,
        # make sure we get verbal forms only
        # (i.e. ignore nominal forms like participles which are not marked for person)
        "person": "*", # is present
        "required": True,
        "distance": 0
    }

    if stem:
        search_block["term"] = '*' + stem + '*'

    response = requests.post(
        VEDAWEB_API_URL + "/search/grammar",
        headers = {"Content-Type": "application/json"},
        json = {
            "mode": "grammar",
            "accents": False,
            "blocks": [search_block],
            "scopes": [],
            # possible filters: hymnAddressee, hymnGroup, strata, stanzaType, lateAdditions
            "meta": {},
            "size": results_no,
            "from": results_from
        },
        # requests waits forever by default; a stalled connection would hang the notebook
        timeout = timeout
    )

    # raises an exception on non-200 responses, since we want to know and act on it
    response.raise_for_status()

    results = {}
    for hit in response.json()["hits"]:
        stanza_no = hit["docId"]

        if stanza_no in results:
            # shouldn't happen in our case at all, but just in case
            raise RuntimeError(f"Unexpected, duplicate stanza number found: {stanza_no}")

        results[stanza_no] = [
            {word: word_gloss}
            for word, highlight_text in hit["highlight"].items()
            for word_gloss in parse_vedaweb_search_highlight_text(highlight_text)
        ]

    return results
    

# TODO need to add variant number for roots like is- (whitney 2, vedaweb 1, lubotsky 1 'to send')
# TODO also fix root vowel length for roots like pU

# Only these roots are looked up for now; every other root gets empty
# attestation fields.
TEST_ROOTS = ["iṣ", "pū", "vr̥"]

# Number of hits requested per search. NOTE: only this first batch is fetched,
# so longer attestation lists are cut off here.
VEDAWEB_RESULTS_NO = 10

roots = []

for whitney_root in whitney_roots:
    # Work on a copy: the parsed Whitney records stay untouched, so re-running
    # this cell (or the cells above it) always starts from the same data.
    root = dict(whitney_root)

    if root["root"] not in TEST_ROOTS:
        root["rig_veda_strong_attestations"] = ''
        root["rig_veda_weak_attestations"] = ''

        root["rig_veda_strong_attestations_data"] = {}
        root["rig_veda_weak_attestations_data"] = {}

        roots.append(root)

        continue

    print(f"Getting data for stem: {root['root']} {root['variant_no']}")

    # Map the Whitney root onto the lemma spelling used by VedaWeb.
    # TODO automate this logic when no hits are found for the root
    # (e.g. try variant numbers up to 5 when the bare root gives no hits)
    root_variant = root["root"]
    if root_variant == "iṣ":
        root_variant += " 1"
    # variant_no 1 'cover' is attested only in AV. 2 is 'choose'
    elif root_variant == "vr̥" and root["variant_no"] == '2':
        root_variant = "vr̥ vr̥̄"

    results_strong = search_verb_form_attestations_vedaweb(
        root_variant, root["strong_stem"], VEDAWEB_RESULTS_NO
    )
    results_weak = search_verb_form_attestations_vedaweb(
        root_variant, root["weak_stem"], VEDAWEB_RESULTS_NO
    )

    # Space-separated stanza ids for the CSV, plus the full glosses per stanza.
    root["rig_veda_strong_attestations"] = " ".join(results_strong)
    root["rig_veda_weak_attestations"] = " ".join(results_weak)

    root["rig_veda_strong_attestations_data"] = results_strong
    root["rig_veda_weak_attestations_data"] = results_weak

    roots.append(root)

    # so that we don't hammer the api
    time.sleep(0.5)

Getting data for stem: iṣ None
Getting data for stem: vr̥ 1
Getting data for stem: pū None
Getting data for stem: vr̥ 2


In [514]:
# One row per root, including the attestations collected above; saved without
# the index column.
df_roots = pandas.DataFrame(roots)
df_roots.to_csv("data/roots_ninth_class.csv", index=False)
df_roots.head()

Unnamed: 0,root,variant_no,strong_stem,weak_stem,weak_only,attestation_texts,language_period,rig_veda_strong_attestations_data,rig_veda_weak_attestations_data,rig_veda_strong_attestations,rig_veda_weak_attestations
0,i,,inā,inī,True,V.,Earlier,{},{},,
1,iṣ,,iṣṇā,iṣṇī,False,,Earlier,"{'01.063.02': [{'iṣṇā́si': {'voice': 'ACT', 'n...",{},01.063.02,
2,ubh,,ubhnā,ubhnī,False,V.,Earlier,{},{},,
3,uṣ,,uṣṇā,uṣṇī,False,V.,Earlier,{},{},,
4,kṣi,,kṣiṇā,kṣiṇī,False,V.B.,Earlier,{},{},,


## Organizing Data by Verse Lines (pādas)

In [527]:
# Reload the roots from the CSV written above.
# keep_default_na=False: load empty cells as '' rather than NaN, so rows without
# attestations can be filtered with a plain comparison against "".
df_roots = pandas.read_csv("data/roots_ninth_class.csv", keep_default_na=False)

In [526]:
# TODO remove test df once we have all the annotations
# Until then, keep only the roots with at least one strong or weak attestation.
has_strong = df_roots["rig_veda_strong_attestations"] != ""
has_weak = df_roots["rig_veda_weak_attestations"] != ""
df_roots_test = df_roots[has_strong | has_weak]
df_roots_test.head()

Unnamed: 0,root,variant_no,strong_stem,weak_stem,weak_only,attestation_texts,language_period,rig_veda_strong_attestations_data,rig_veda_weak_attestations_data,rig_veda_strong_attestations,rig_veda_weak_attestations
1,iṣ,,iṣṇā,iṣṇī,False,,Earlier,"{'01.063.02': [{'iṣṇā́si': {'voice': 'ACT', 'n...",{},01.063.02,
37,pū,,punā,punī,False,,Earlier & Later,"{'01.133.01': [{'punāmi': {'voice': 'ACT', 'nu...","{'09.067.24': [{'punīhi': {'voice': 'ACT', 'nu...",01.133.01 01.160.03 09.001.06 09.067.22 09.104...,09.067.24 09.067.27 01.015.02 07.085.01 08.012...
44,vr̥,2.0,vr̥ṇā,vr̥ṇī,False,,Earlier & Later,{},"{'01.139.01': [{'vr̥ṇīmahe': {'voice': 'MED', ...",,01.139.01 09.066.18 10.013.04 10.036.11 01.012...


In [135]:
# pprint is called by the cells below, so this import has to be live for the
# notebook to run on a fresh kernel.
from pprint import pprint

In [307]:
# Flatten the per-root attestation lists into one record per attested line:
# the root's remaining fields plus the line reference and the stem grade.
rv_lines = []

roots = df_roots_test_manual.to_dict("records")

for root in roots:
    # The attestation columns hold space-separated line references; they are
    # taken out of the record so they are not repeated on every line.
    weak_refs = root.pop("rig_veda_weak_attestations").split()
    strong_refs = root.pop("rig_veda_strong_attestations").split()

    # update stem vowel (since stem won't have weak marker in whitney when strong
    # stem is attested)
    weak_stem_form = {"stem": root["stem"][:-1] + NINTH_CLASS_WEAK_MARKER}

    rv_lines.extend(
        {"line_no": ref, "stem_type": "weak"} | root | weak_stem_form
        for ref in weak_refs
    )
    rv_lines.extend(
        {"line_no": ref, "stem_type": "strong"} | root
        for ref in strong_refs
    )

pprint(rv_lines[0])

{'attestation_texts': '',
 'language_period': 'Earlier',
 'line_no': '1.63.2d',
 'lubotsky_page_no': '1:',
 'notes': 'iSnAsi',
 'root': 'iṣ',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'variant_no': '',
 'weak_only': False}


### Parsing line numbers

In [308]:
# "1.1.1b" > "01" "001" "01" "b"
def parse_rv_line_no(string):
    """Split a Rig Veda line reference into zero-padded, cumulative ids.

    The reference has the form "book.hymn.stanza" with an optional pada letter
    directly after the stanza number, e.g. "1.63.2d".

    Args:
        string: the line reference; surrounding whitespace is ignored.

    Returns:
        A dict with the keys "book", "hymn", "stanza" and "pada", each value
        containing the levels above it:
        "1.63.2d" -> "01", "01.063", "01.063.02", "01.063.02.d".
        Without a pada letter the "pada" value ends in a bare dot.

    Raises:
        ValueError: if the reference does not have exactly three numeric,
            dot-separated parts (plus the optional pada letter).
    """
    line_no_parts = string.strip().split(".")
    if len(line_no_parts) != 3 or not all(line_no_parts):
        raise ValueError(f"Invalid Rig Veda line number: {string!r}")

    book, hymn, stanza = line_no_parts

    # A trailing letter on the last part is the pada.
    pada = ""
    if stanza[-1].isalpha():
        stanza, pada = stanza[:-1], stanza[-1]

    # Whatever is left must be plain numbers; otherwise zero-padding would
    # quietly produce an id that matches no stanza.
    if not (book.isdigit() and hymn.isdigit() and stanza.isdigit()):
        raise ValueError(f"Invalid Rig Veda line number: {string!r}")

    book = book.zfill(2)
    hymn = hymn.zfill(3)
    stanza = stanza.zfill(2)

    return {
        "book"  : book,
        "hymn"  : f"{book}.{hymn}",
        "stanza": f"{book}.{hymn}.{stanza}",
        "pada"  : f"{book}.{hymn}.{stanza}.{pada}"
    }

# Add the parsed book / hymn / stanza / pada ids to every line record.
rv_lines = [{**line, **parse_rv_line_no(line["line_no"])} for line in rv_lines]

pprint(rv_lines[0])

{'attestation_texts': '',
 'book': '01',
 'hymn': '01.063',
 'language_period': 'Earlier',
 'line_no': '1.63.2d',
 'lubotsky_page_no': '1:',
 'notes': 'iSnAsi',
 'pada': '01.063.02.d',
 'root': 'iṣ',
 'stanza': '01.063.02',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'variant_no': '',
 'weak_only': False}


## Annotating Verse Lines

### Downloading annotation data

In [282]:
# Directory for the per-stanza JSON documents downloaded below.
!mkdir -p downloads/vedaweb

In [291]:
import requests
import time

VEDAWEB_API_URL = "https://vedaweb.uni-koeln.de/rigveda/api"

# Seconds to wait for the server; requests waits forever by default, so a
# stalled connection would otherwise hang the notebook.
VEDAWEB_REQUEST_TIMEOUT = 30

# Each stanza is downloaded once, however many of its lines are attested.
rv_stanza_nos = sorted({line["stanza"] for line in rv_lines})

for stanza_no in rv_stanza_nos:
    print(f"Getting data for stanza: {stanza_no}")

    # The document id is the stanza id without the dots.
    # eg: https://vedaweb.uni-koeln.de/rigveda/api/document/id/0100102
    vedaweb_doc_id = stanza_no.replace('.', '')
    vedaweb_doc_url = f"{VEDAWEB_API_URL}/document/id/{vedaweb_doc_id}"

    response = requests.get(vedaweb_doc_url, timeout=VEDAWEB_REQUEST_TIMEOUT)
    # raises an exception on non-200 responses, since we want to know and act on it
    response.raise_for_status()

    # The documents contain diacritics: write them as UTF-8 explicitly rather
    # than in the platform's default encoding.
    with open(f"downloads/vedaweb/{stanza_no}.json", 'w', encoding="utf-8") as f:
        f.write(response.text)

    # so that we don't hammer the api
    time.sleep(0.5)

Getting data for stanza: 01.063.02
Getting data for stanza: 01.067.01
Getting data for stanza: 01.133.01
Getting data for stanza: 01.180.04
Getting data for stanza: 04.025.03
Getting data for stanza: 09.016.03
Getting data for stanza: 09.067.27
Getting data for stanza: 10.013.03


### Enriching the lines with text and metrical info

In [324]:
import json

def pada_char_to_no(char):
    """Return the zero-based index of a pada letter ('a' -> 0 ... 'd' -> 3).

    Raises:
        Exception: if `char` is not one of 'a', 'b', 'c', 'd'.
    """
    for pada_no, pada_char in enumerate(('a', 'b', 'c', 'd')):
        if char == pada_char:
            return pada_no

    raise Exception(f"Invalid pada char: {char}")

def annotate_line(line):
    """Add text and metrical information to a line record.

    Reads the stanza's JSON document from downloads/vedaweb/ and picks the
    entries for the line's pada out of it.

    Args:
        line: line record with at least a "stanza" id (used as the file name)
            and a "pada" id whose last character is the pada letter.

    Returns:
        The same dict, updated in place with:
        - "text_padapatha": from the "version_lubotsky" version, if present
        - "text_samhitapatha", "meter_scansion": from the
          "version_vannootenholland" version, if present
        - "meter_name": the stanza's "stanzaType", or '' when it is empty

    Raises:
        Exception: if the pada letter is not one of a-d (from pada_char_to_no).
    """
    pada_no = pada_char_to_no(line["pada"][-1])

    # The documents contain diacritics: read them as UTF-8 explicitly rather
    # than in the platform's default encoding.
    with open(f"downloads/vedaweb/{line['stanza']}.json", encoding="utf-8") as f:
        stanza = json.load(f)

    stanza_versions = stanza["versions"]

    # For each text version, use the first entry with the matching id.
    padapatha = next(
        (version for version in stanza_versions if version["id"] == "version_lubotsky"),
        None
    )
    if padapatha is not None:
        line["text_padapatha"] = padapatha["form"][pada_no]

    samhitapatha = next(
        (version for version in stanza_versions if version["id"] == "version_vannootenholland"),
        None
    )
    if samhitapatha is not None:
        # TODO deal with * at the beginning of the text here?
        line["text_samhitapatha"] = samhitapatha["form"][pada_no]
        line["meter_scansion"] = samhitapatha["metricalData"][pada_no]

    line["meter_name"] = stanza["stanzaType"] or ''

    # TODO add these too:
    # get these info from hellewig too?
    # strata, late_addition
    #
    # pada_labels
    #
    # hymn_absolute_no, hymn_addressee, hymn_group
    #
    # present in search results too: 'hymnAddressee', 'hymnGroup', 'stanzaStrata'

    return line

# Annotate every line record in turn.
rv_lines_annotated = list(map(annotate_line, rv_lines))
pprint(rv_lines_annotated[0])

{'attestation_texts': '',
 'book': '01',
 'hymn': '01.063',
 'language_period': 'Earlier',
 'line_no': '1.63.2d',
 'lubotsky_page_no': '1:',
 'meter_name': 'Triṣṭubh',
 'meter_scansion': 'SS LLS SSLS LL',
 'notes': 'iSnAsi',
 'pada': '01.063.02.d',
 'root': 'iṣ',
 'stanza': '01.063.02',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'text_padapatha': 'púraḥ iṣṇā́si= puruhūta pūrvī́ḥ',
 'text_samhitapatha': 'púra iṣṇā́si puruhūta pūrvī́ḥ',
 'variant_no': '',
 'weak_only': False}


## Saving the Final Line Results

In [325]:
# One row per annotated line; saved without the index column.
df_rv_lines = pandas.DataFrame(rv_lines_annotated)
df_rv_lines.to_csv("data/rv_lines_ninth_class.csv", index=False)
df_rv_lines.head(100)

Unnamed: 0,line_no,stem_type,root,variant_no,stem,weak_only,attestation_texts,language_period,lubotsky_page_no,notes,book,hymn,stanza,pada,text_padapatha,text_samhitapatha,meter_scansion,meter_name
0,1.63.2d,strong,iṣ,,iṣṇā,False,,Earlier,1:,iSnAsi,1,1.063,01.063.02,01.063.02.d,púraḥ iṣṇā́si= puruhūta pūrvī́ḥ,púra iṣṇā́si puruhūta pūrvī́ḥ,SS LLS SSLS LL,Triṣṭubh
1,1.180.4b,weak,vr̥,1.0,vr̥ṇī,True,V.,Earlier,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll),1,1.18,01.180.04,01.180.04.b,apáḥ ná+_ kṣódaḥ= avr̥ṇītam eṣé?_,apó ná kṣódo avr̥ṇītam eṣé,SL L LL SSLS LL,
2,1.67.1b,weak,vr̥,1.0,vr̥ṇī,True,V.,Earlier,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll),1,1.067,01.067.01,01.067.01.b,vr̥ṇīté?_ śruṣṭím rā́jā iva ajuryám,vr̥ṇīté śruṣṭíṁ rā́jevājuryám,SLL LL LLLLL,
3,4.25.3a,weak,vr̥,1.0,vr̥ṇī,True,V.,Earlier,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll),4,4.025,04.025.03,04.025.03.a,káḥ devā́nām= ávaḥ adyā́+} vr̥ṇīte?_,kó devā́nām ávo adyā́ vr̥ṇīte,L LLL SL LL SLL,Triṣṭubh
4,9.16.3c,weak,pu,,punī,False,,Earlier & Later,1:900-,punIhi puNAmi,9,9.016,09.016.03,09.016.03.c,punīhí índrāya pā́tave-_,punīhī́ndrāya pā́tave,SLLLS LSL,Gāyatrī
5,9.67.27d,weak,pu,,punī,False,,Earlier & Later,1:900-,punIhi puNAmi,9,9.067,09.067.27,09.067.27.d,jā́tavedaḥ punīhí mā,*jā́tavedaḥ punīhí mā,LSLL SLS L,Aṇuṣṭubh
6,1.133.1a,strong,pu,,punā,False,,Earlier & Later,1:900-,punIhi puNAmi,1,1.133,01.133.01,01.133.01.a,ubhé+_ punāmi= ródasī+_} r̥téna,ubhé punāmi ródasī r̥téna,SL SLS LSL SLS,Triṣṭubh
7,10.13.3d,strong,pu,,punā,False,,Earlier & Later,1:900-,punIhi puNAmi,10,10.013,10.013.03,10.013.03.d,r̥tásya nā́bhau= ádhi sám} punāmi,r̥tásya nā́bhāv ádhi sám punāmi,SLS LL SS L SLS,Triṣṭubh
