# Metrical Analysis of Sanskrit Ninth Class Verb Forms

## Getting Verbal Roots 

In [None]:
# Working directories used throughout the notebook:
# downloads/ receives fetched source files, data/ receives derived files.
!mkdir -p downloads
!mkdir -p data

In [None]:
# Fetch the accented Whitney "Roots" PDF from the GRETIL e-library into downloads/.
!wget -O downloads/whitney_roots.pdf http://gretil.sub.uni-goettingen.de/gretil_elib/Whi885__Whitney_Roots-ACCENTED.pdf

In [4]:
# install pdftk if not already there. eg: for ubuntu: sudo apt install pdftk
# Keep only page 229 of the PDF (presumably the page listing the nā-class roots -- confirm).
!pdftk downloads/whitney_roots.pdf cat 229 output data/whitney_roots_ninth_class.pdf

In [5]:
# produces data/whitney_roots_ninth_class.txt
!pdftotext data/whitney_roots_ninth_class.pdf

Clean up the text version manually, fixing formatting and diacritics.

Final results are in [data/whitney_roots_ninth_class_cleaned.txt](data/whitney_roots_ninth_class_cleaned.txt)

In [225]:
# TODO try to get the 9th class forms/roots directly from Lubotsky's concordance?

## Parsing Verbal Roots Info

In [2]:
# Section headers as they appear in the cleaned text file.
CLASS_HEADER = "6. nā-class"
EARLIER_LANGUAGE_HEADER = "A. Earlier Language"
EARLIER_AND_LATER_LANGUAGE_HEADER = "B. Earlier and Later Language"
LATER_LANGUAGE_HEADER = "C. Later Language"

# Final vowel of a listed stem: strong stems end in -ā, weak stems in -ī.
NINTH_CLASS_STRONG_MARKER = "ā"
NINTH_CLASS_WEAK_MARKER = "ī"

# One dict per root entry, in file order.
whitney_roots = []

# Period of the section currently being read; set whenever a section header is passed.
language_period = None

# The file is full of diacritics, so the encoding is stated explicitly instead
# of being left to the platform default.
with open("data/whitney_roots_ninth_class_cleaned.txt", 'r', encoding="utf-8") as whitney_file:
    for line in whitney_file:
        variant_no = None
        attestation_texts = None
        weak_only = False

        line = line.rstrip()
        if not line or CLASS_HEADER in line:
            continue
        elif EARLIER_LANGUAGE_HEADER in line:
            language_period = "Earlier"
            continue
        elif EARLIER_AND_LATER_LANGUAGE_HEADER in line:
            language_period = "Earlier & Later"
            continue
        elif LATER_LANGUAGE_HEADER in line:
            language_period = "Later"
            continue

        # entry layout: [variant number] stem [attestation texts ...]
        line_parts = line.split()
        if line_parts[0].isdigit():
            variant_no = line_parts.pop(0)
        if not line_parts:
            # a variant number with nothing after it is a broken entry; say so
            # clearly instead of failing with a bare IndexError
            raise ValueError(f"No stem found after variant number in line: {line!r}")
        stem = line_parts.pop(0)
        if line_parts:
            attestation_texts = " ".join(line_parts)

        # a stem listed in its weak form (-ī) is recorded as weak-only; the
        # other stem form is derived by swapping the final vowel
        if stem.endswith(NINTH_CLASS_WEAK_MARKER):
            weak_only = True
            weak_stem = stem
            strong_stem = stem[:-1] + NINTH_CLASS_STRONG_MARKER
        else:
            weak_stem = stem[:-1] + NINTH_CLASS_WEAK_MARKER
            strong_stem = stem

        # removes the last two chars
        root = stem[:-2]
        # FIXME implement overrides for others
        if root == "pu":
            root = "pū"
        elif root == "ju":
            root = "jū"
        elif root == "ji":
            root = "jī"
        elif root == "iṣ":
            # following vedaweb/lubotsky no 'send'(whitney has this as no 2)
            # TODO automate this logic when no hits are found for the root
            root += " 1"
        elif root == "vr̥" and variant_no == '2':
            # variant_no 1 'cover' is attested only in AV. 2 is 'choose'
            root = "vr̥ ~ vr̥̄"
        elif variant_no:
            # FIXME assign vedaweb equivalent variant numbers here
            root += ' ' + variant_no

        whitney_roots.append({
            "root": root,
            "variant_no": variant_no,
            "strong_stem": strong_stem,
            "weak_stem": weak_stem,
            "weak_only": weak_only,
            "attestation_texts": attestation_texts,
            "language_period": language_period,
        })

In [3]:
import pandas

In [4]:
# Tabulate the parsed roots and save them; this CSV is the starting point
# for the manual annotation step described in the next section.
df_whitney_roots = pandas.DataFrame(whitney_roots)
df_whitney_roots.to_csv("data/whitney_roots_ninth_class.csv", index=None)
df_whitney_roots.head()

Unnamed: 0,root,variant_no,strong_stem,weak_stem,weak_only,attestation_texts,language_period
0,i,,inā,inī,True,V.,Earlier
1,iṣ 1,,iṣṇā,iṣṇī,False,,Earlier
2,ubh,,ubhnā,ubhnī,False,V.,Earlier
3,uṣ,,uṣṇā,uṣṇī,False,V.,Earlier
4,kṣi,,kṣiṇā,kṣiṇī,False,V.B.,Earlier


## Annotating Verbal Roots with Rig Veda Attestations (Manual)

Using Lubotsky's concordance, attestation info is manually added to [data/whitney_roots_ninth_class.csv](data/whitney_roots_ninth_class.csv).

Final results are in [data/roots_ninth_class_manual.csv](data/roots_ninth_class_manual.csv).

In [528]:
#df_roots_manual = pandas.read_csv("data/roots_ninth_class_manual.csv")
df_roots_manual = pandas.read_csv("data/roots_ninth_class_manual.csv", keep_default_na=False)

In [529]:
# TODO remove test df once we have all the annotations
#df_roots_test_manual = df_roots_manual[~df_roots["notes"].isna()]
df_roots_test_manual = df_roots_manual[df_roots_manual["notes"].str.len() > 0]
df_roots_test_manual.head()

Unnamed: 0,root,variant_no,stem,weak_only,attestation_texts,language_period,rig_veda_weak_attestations,rig_veda_strong_attestations,lubotsky_page_no,notes
1,iṣ,,iṣṇā,False,,Earlier,,1.63.2d,1:,iSnAsi
18,vr̥,1.0,vr̥ṇī,True,V.,Earlier,1.180.4b 1.67.1b 4.25.3a,,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll)
37,pu,,punā,False,,Earlier & Later,9.16.3c 9.67.27d,1.133.1a 10.13.3d,1:900-,punIhi puNAmi


## Annotating Verbal Roots with Rig Veda Attestations (Own Search)

### Getting Rig Veda padapatha text (Eichler)

In [None]:
# http://www.detlef108.de/Rigveda.htm 
# http://www.detlef108.de/Notes-to-the-Rigveda-Page.htm 
!wget -O downloads/rv_padapatha_eichler.html http://www.detlef108.de/RV-Padapatha-TA3-paada-NA-UTF8.html 

In [224]:
# sudo apt install html2text
#!html2text -utf8 -width 3000 -o rv_padapatha.txt rv_padaptaha.html

from bs4 import BeautifulSoup

# The download is a UTF-8 page (see its file name), so the encoding is stated
# explicitly instead of being left to the platform default.
with open("downloads/rv_padapatha_eichler.html", "r", encoding="utf-8") as input_file:
    # Name the parser so the result does not depend on which parsers happen
    # to be installed ("lxml" is the one used elsewhere in this notebook).
    soup = BeautifulSoup(input_file, "lxml")

hymns = []

for para in soup.find_all("p"):
    # an empty paragraph has no first child to inspect and contributes no text
    if not para.contents:
        continue

    # ignore the ending notes
    if para.contents[0].name == "span":
        continue

    #hymns.append(para.text.rstrip()) # no extra lines between hymns
    hymns.append(para.text)

with open("data/rv_padapatha_eichler.txt", 'w', encoding="utf-8") as f:
    f.write("".join(hymns))

In [222]:
# TODO break the padapatha verse into sub-lines

### Getting Rig Veda padapatha / metrically restored texts

In [None]:
# https://github.com/cceh/c-salt_vedaweb_sources/tree/master/rigveda/versions
# description of the sources here:
# https://github.com/cceh/c-salt_vedaweb_tei/blob/master/vedaweb_corpus.tei
# https://vedaweb.uni-koeln.de/rigveda/help

!wget -O downloads/rv_padapatha_lubotsky.json https://raw.githubusercontent.com/cceh/c-salt_vedaweb_sources/master/rigveda/versions/lubotsky.json

!wget -O downloads/rv_samhitapatha_vnh.json https://raw.githubusercontent.com/cceh/c-salt_vedaweb_sources/master/rigveda/versions/vnh.json

In [217]:
# TODO make text version from the jsons, with line numbers at the beginning

### Searching text for ninth-class verbal forms

In [359]:
# TODO search text for ninth-class verbal forms , replicating vedaweb search below?
# use vidyut to identify only finite verbal forms?

## Annotating Verbal Roots with Rig Veda Attestations

In [202]:
# TODO
#!curl -H "Content-Type: application/json" -XPOST https://vedaweb.uni-koeln.de/rigveda/api/search/grammar -d '{}'

import requests
import time

from bs4 import BeautifulSoup
from pprint import pprint

VEDAWEB_API_URL = "https://vedaweb.uni-koeln.de/rigveda/api"

def parse_vedaweb_search_highlight_text(text):
    """Parse a VedaWeb search "highlight" string into one gloss dict per word instance.

    After markup is stripped, the text is read as instances separated by '/',
    each instance as properties separated by ';', and each property as
    "name: value". The "lemma" and "lemma type" properties are dropped.

    Args:
        text: highlight markup string from a search hit.

    Returns:
        A list with one dict (property name -> value) per instance.
    """
    word_instances = []

    for instance_text in BeautifulSoup(text, "lxml").text.split('/'):
        word_gloss = {}
        for prop in instance_text.split(';'):
            prop_parts = prop.split(':')
            if len(prop_parts) < 2:
                # no "name: value" pair here (e.g. the empty piece left by a
                # trailing ';'); skip it instead of failing on prop_parts[1]
                continue

            prop_name = prop_parts[0].strip()
            prop_value = prop_parts[1].strip()

            if prop_name in ["lemma", "lemma type"]:
                continue

            word_gloss[prop_name] = prop_value

        word_instances.append(word_gloss)

    return word_instances
        
def search_verb_form_attestations_vedaweb(root, stem=None, results_no=10, results_from=0, timeout=30):
    """Search the VedaWeb grammar API for finite verb forms of a root.

    Args:
        root: lemma to search for (sent with lemma type "root").
        stem: optional stem; when given, only forms containing it are matched
            (sent as the wildcard term "*stem*").
        results_no: maximum number of hits requested ("size").
        results_from: offset of the first hit requested ("from").
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A dict mapping each hit's stanza number ("docId") to a list of
        {"word": ..., "gloss": ...} dicts built from the hit's highlights.

    Raises:
        requests.HTTPError: on a non-200 response.
        Exception: if the same stanza number appears in more than one hit.
    """
    search_block = {
        "lemma type": "root",
        "lemma": root,
        # make sure we get verbal forms only
        # (i.e. ignore nominal forms like participles which are not marked for person)
        "person": "*", # is present
        "required": True,
        "distance": 0
    }

    if stem:
        search_block["term"] = '*' + stem + '*'

    response = requests.post(
        VEDAWEB_API_URL + "/search/grammar",
        headers = {"Content-Type": "application/json"},
        json = {
            "mode": "grammar",
            "accents": False,
            "blocks": [search_block],
            "scopes": [],
            "meta": {
                #"hymnAddressee": [],
                #"hymnGroup": [],
                #"strata": [],
                #"stanzaType": [],
                #"lateAdditions": []
            },
            "size": results_no,
            "from": results_from
        },
        timeout = timeout
    )

    # raises an exception on non-200 responses, since we want to know and act on it
    response.raise_for_status()

    #pprint(response.request.body)
    #pprint(response.json()["hits"][0])

    results = {}
    for hit in response.json()["hits"]:
        stanza_no = hit["docId"]

        # flatten every highlighted word's instances into one list for the stanza
        words = []
        for word, highlight_text in hit["highlight"].items():
            word_instances = parse_vedaweb_search_highlight_text(highlight_text)
            for word_gloss in word_instances:
                words.append({
                    "word": word,
                    "gloss": word_gloss
                })

        if stanza_no in results:
            # shouldn't happen in our case at all, but just in case
            raise Exception(f"Unexpected, duplicate stanza number found: {stanza_no}")
        else:
            results[stanza_no] = words

    return results
    

# TODO need to add variant number for roots like is- (whitney 2, vedaweb 1, lubotsky 1 'to send')

# Root records enriched with the stanza numbers found for each stem.
roots = []

# Full search results per root, kept for the later per-pāda annotation:
# {root: {"strong": {stanza_no: [words]}, "weak": {stanza_no: [words]}}}
roots_attested_words_by_stanza = {}

for whitney_root in whitney_roots:
    # Work on a copy: adding the attestation keys to the dicts inside
    # whitney_roots would change what the earlier cells see (and write to
    # data/whitney_roots_ninth_class.csv) if they are re-run.
    root = dict(whitney_root)

    # manual test cases:
    #results = search_verb_form_attestations_vedaweb("iṣ 1", "iṣṇā", 10)
    #results = search_verb_form_attestations_vedaweb("pū", "pun", 10)

    print(f"Getting data for stem: {root['root']}")

    # FIXME automate the variant-number logic when no hits are found for the root
    # (TODO try variant numbers up to 5 and see if we get hits)

    # a root written as "x ~ y" is searched with the alternatives separated by a space
    root_variant = root["root"].replace(" ~ ", ' ')

    # FIXME implement fetching of more than the result limit
    results_strong = search_verb_form_attestations_vedaweb(
        root_variant, root["strong_stem"], 1
    )
    results_weak = search_verb_form_attestations_vedaweb(
        root_variant, root["weak_stem"], 1
    )

    #pprint(results_strong)
    #pprint(results_weak)

    # the tabular record keeps only the stanza numbers, space-separated
    root["rig_veda_strong_attestations"] = " ".join(list(results_strong.keys()))
    root["rig_veda_weak_attestations"] = " ".join(list(results_weak.keys()))

    roots.append(root)

    # save full results data, for use later
    roots_attested_words_by_stanza[root["root"]] = {
        "strong": results_strong,
        "weak": results_weak
    }

    # so that we don't hammer the api
    time.sleep(0.5)

#pprint(roots)

Getting data for stem: i
Getting data for stem: iṣ 1
Getting data for stem: ubh
Getting data for stem: uṣ
Getting data for stem: kṣi
Getting data for stem: gr̥ 2
Getting data for stem: gr̥bh
Getting data for stem: jū
Getting data for stem: jī
Getting data for stem: dr̥
Getting data for stem: drū
Getting data for stem: pr̥
Getting data for stem: pruṣ
Getting data for stem: bhrī
Getting data for stem: mī̆
Getting data for stem: mr̥
Getting data for stem: ram
Getting data for stem: ri
Getting data for stem: vr̥ 1
Getting data for stem: vlī̆
Getting data for stem: śam
Getting data for stem: ścam
Getting data for stem: śrath
Getting data for stem: śrī 1
Getting data for stem: śrī 2
Getting data for stem: si
Getting data for stem: subh
Getting data for stem: skabh
Getting data for stem: spr̥
Getting data for stem: hr̥
Getting data for stem: hru
Getting data for stem: aś 2
Getting data for stem: krī
Getting data for stem: gr̥ 1
Getting data for stem: grath
Getting data for stem: gr̥h
Getting 

In [248]:
import json

# ensure_ascii=False writes the diacritics as real characters, so the file
# encoding is stated explicitly instead of being left to the platform default.
with open("data/roots_ninth_class_attestations.json", 'w', encoding="utf-8") as f:
    # TODO dump after prettifying?
    json.dump(roots_attested_words_by_stanza, f, indent=2, ensure_ascii=False)

df_roots = pandas.DataFrame.from_dict(roots)
df_roots.to_csv("data/roots_ninth_class.csv", index=None)
df_roots.head()

Unnamed: 0,root,variant_no,strong_stem,weak_stem,weak_only,attestation_texts,language_period,rig_veda_strong_attestations,rig_veda_weak_attestations
0,i,,inā,inī,True,V.,Earlier,,
1,iṣ 1,,iṣṇā,iṣṇī,False,,Earlier,01.063.02,
2,ubh,,ubhnā,ubhnī,False,V.,Earlier,01.063.04,
3,uṣ,,uṣṇā,uṣṇī,False,V.,Earlier,,
4,kṣi,,kṣiṇā,kṣiṇī,False,V.B.,Earlier,,


## Organizing Data by Verse Lines (pādas)

In [204]:
#df_roots = pandas.read_csv("data/roots_ninth_class.csv")
df_roots = pandas.read_csv("data/roots_ninth_class.csv", keep_default_na=False)
#df_roots.head()

In [205]:
# TODO remove test df once we have all the annotations
#df_roots_test = df_roots[~df_roots["rig_veda_strong_attestations"].isna()]
#df_roots_test = df_roots[df_roots["rig_veda_strong_attestations"].str.len() > 0]
df_roots_test = df_roots.query('rig_veda_strong_attestations != "" or rig_veda_weak_attestations != ""')
df_roots_test.head()

Unnamed: 0,root,variant_no,strong_stem,weak_stem,weak_only,attestation_texts,language_period,rig_veda_strong_attestations,rig_veda_weak_attestations
1,iṣ 1,,iṣṇā,iṣṇī,False,,Earlier,01.063.02,
2,ubh,,ubhnā,ubhnī,False,V.,Earlier,01.063.04,
7,jū,,junā,junī,False,V.,Earlier,01.027.07,09.079.02
16,ram,,ramṇā,ramṇī,False,V.B.,Earlier,02.012.02,
32,krī,,krīṇā,krīṇī,False,,Earlier & Later,04.024.10,


In [135]:
#from pprint import pprint

In [206]:
# One record per (root, stem type, attested stanza): the per-root attestation
# columns are unpacked into separate rows, weak stem first, then strong.
roots_data = df_roots_test.to_dict("records")

rv_lines = []

for root_record in roots_data:
    # these four fields are removed from the record; whatever remains is
    # copied onto every row produced for this root
    attested_stanzas = {
        "weak": root_record.pop("rig_veda_weak_attestations").split(),
        "strong": root_record.pop("rig_veda_strong_attestations").split(),
    }
    stems = {
        "weak": root_record.pop("weak_stem"),
        "strong": root_record.pop("strong_stem"),
    }

    for stem_type in ("weak", "strong"):
        rv_lines.extend(
            {"line_no": stanza_no, "stem": stems[stem_type], "stem_type": stem_type} | root_record
            for stanza_no in attested_stanzas[stem_type]
        )

pprint(rv_lines[0])

{'attestation_texts': '',
 'language_period': 'Earlier',
 'line_no': '01.063.02',
 'root': 'iṣ 1',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'variant_no': '',
 'weak_only': False}


### Parsing line numbers

In [207]:
def parse_rv_line_no(string):
    """Split a Rig Veda location string into zero-padded, cumulative ids.

    The input is "book.hymn.stanza", optionally followed by a pāda letter.
    Book and stanza are padded to two digits, hymn to three.

    Examples:
        "1.1.1b"    -> book "01", hymn "01.001", stanza "01.001.01", pada "01.001.01.b"
        "01.063.02" -> book "01", hymn "01.063", stanza "01.063.02", pada ""

    Returns:
        A dict with the keys "book", "hymn", "stanza" and "pada" (in that
        order); "pada" is an empty string when no pāda letter was given.
    """
    parts = string.split(".")

    book = parts[0].zfill(2)
    hymn = f"{book}.{parts[1].zfill(3)}"

    # the third field is the stanza number, possibly with a trailing pāda letter
    stanza_field = parts[2]
    has_pada = stanza_field[-1].isalpha()
    pada_letter = stanza_field[-1] if has_pada else ""
    stanza_digits = stanza_field[:-1] if has_pada else stanza_field

    stanza = f"{hymn}.{stanza_digits.zfill(2)}"

    return {
        "book"    : book,
        "hymn"    : hymn,
        "stanza"  : stanza,
        "pada"    : f"{stanza}.{pada_letter}" if pada_letter else ""
    }

# Add the parsed location fields to every line; re-running is harmless
# because "line_no" itself is left untouched.
rv_lines = [{**line, **parse_rv_line_no(line["line_no"])} for line in rv_lines]

pprint(rv_lines[0])

{'attestation_texts': '',
 'book': '01',
 'hymn': '01.063',
 'language_period': 'Earlier',
 'line_no': '01.063.02',
 'pada': '',
 'root': 'iṣ 1',
 'stanza': '01.063.02',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'variant_no': '',
 'weak_only': False}


## Annotating Verse Lines

### Downloading annotation data

In [282]:
# Directory for the per-stanza JSON documents downloaded from VedaWeb below.
!mkdir -p downloads/vedaweb

In [211]:
import requests
import time

VEDAWEB_API_URL = "https://vedaweb.uni-koeln.de/rigveda/api"

# Distinct stanzas to download, in sorted order.
rv_stanza_nos = sorted({line["stanza"] for line in rv_lines})
#print(rv_stanza_nos)

In [209]:
# Download one VedaWeb JSON document per stanza into downloads/vedaweb/.
for stanza_no in rv_stanza_nos:
    print(f"Getting data for stanza: {stanza_no}")

    # eg: https://vedaweb.uni-koeln.de/rigveda/api/document/id/0100102
    vedaweb_doc_id = stanza_no.replace('.', '')
    vedaweb_doc_url = f"{VEDAWEB_API_URL}/document/id/{vedaweb_doc_id}"

    # a timeout keeps one stalled connection from hanging the whole run
    response = requests.get(vedaweb_doc_url, timeout=30)
    # raises an exception on non-200 responses, since we want to know and act on it
    response.raise_for_status()

    # the document contains accented transliteration; write it as UTF-8
    # regardless of the platform default encoding
    with open(f"downloads/vedaweb/{stanza_no}.json", 'w', encoding="utf-8") as f:
        f.write(response.text)

    # so that we don't hammer the api
    time.sleep(0.5)

Getting data for stanza: 01.027.07
Getting data for stanza: 01.063.02
Getting data for stanza: 01.063.04
Getting data for stanza: 01.133.01
Getting data for stanza: 01.139.01
Getting data for stanza: 02.012.02
Getting data for stanza: 04.024.10
Getting data for stanza: 07.007.03
Getting data for stanza: 08.023.16
Getting data for stanza: 09.067.24
Getting data for stanza: 09.079.02


### Enriching the lines with text and metrical info

In [212]:
# Reload the saved search results; the file holds non-ASCII text, so the
# encoding is stated explicitly instead of being left to the platform default.
with open("data/roots_ninth_class_attestations.json", encoding="utf-8") as f:
    roots_attested_words_by_stanza = json.load(f)
    
#print(roots_attested_words_by_stanza)

In [213]:
import json


def pada_char_to_no(char):
    """Convert a pāda letter to its zero-based index ('a' -> 0, 'b' -> 1, ...).

    Any single lowercase letter a-z is accepted, not just a-d, so that
    stanzas with more than four pādas can be indexed as well.

    Raises:
        Exception: if char is not a single lowercase ASCII letter.
    """
    if isinstance(char, str) and len(char) == 1 and 'a' <= char <= 'z':
        return ord(char) - ord('a')

    raise Exception(f"Invalid pada char: {char}")
            
            
def get_stanza_words(stanza_padas):
    """Index every word of a stanza by its surface form.

    Args:
        stanza_padas: list of pāda dicts, each with "id", "label" and a
            "grammarData" list of word dicts ("form", "index", "props").
            Note: "position" and "lemma type" are removed from each word's
            "props" dict as a side effect.

    Returns:
        A dict mapping each word form to a list of occurrence dicts
        {"found": False, "data": {...}}, one per occurrence in stanza order.
        "found" is a tracker for the later search for the attested words.
    """
    words_index = {}

    for pada in stanza_padas:
        for grammar_entry in pada["grammarData"]:
            form = grammar_entry["form"]

            props = grammar_entry["props"]
            position = props.pop("position", '')
            lemma_type = props.pop("lemma type", '')

            occurrence = {
                "found": False,
                "data": {
                    "pada_id": pada["id"],
                    "pada_label": pada["label"],
                    "word": form,
                    # not zero-indexed!
                    "word_position_no": grammar_entry["index"],
                    # TODO be careful of this, does not seem to be accurate
                    # (eg: for "punīhi" for 9.67.24 ); use it for checks only
                    "word_position": position,
                    "word_lemma_type": lemma_type,
                    # TODO enable later: the remaining props ("word_props")
                }
            }

            # a list per form, since the word may appear multiple times in the stanza
            words_index.setdefault(form, []).append(occurrence)

    return words_index
    

def get_words_by_pada(stanza_attested_words, stanza_padas, stanza_no=None):
    """Locate each attested word in the stanza and return its pāda-level data.

    Every attested word is matched to the first not-yet-used occurrence of the
    same form whose lemma type is "root" (or unset) and whose position agrees
    with the attested gloss (when both sides state one).

    Args:
        stanza_attested_words: list of {"word": ..., "gloss": {...}} dicts
            from the search results for this stanza.
        stanza_padas: the stanza's pāda list (see get_stanza_words).
        stanza_no: stanza number, used only in the error message.

    Returns:
        A list of occurrence data dicts (pada_id, pada_label, word,
        word_position_no, word_position), one per matched attested word.
        An attested word whose occurrences are all rejected is dropped silently.

    Raises:
        Exception: if an attested word form does not occur in the stanza at all.
    """
    words_by_pada = []

    # transform data in stanza_padas to be amenable for searching the attested words
    stanza_words = get_stanza_words(stanza_padas)

    for attested_word_data in stanza_attested_words:
        attested_word = attested_word_data["word"]
        attested_word_gloss = attested_word_data["gloss"]

        if attested_word in stanza_words:
            for word_instance in stanza_words[attested_word]:
                # if this word instance was already found, skip to the next one
                if word_instance["found"]:
                    continue

                word_instance_data = word_instance["data"]
                # Read without removing: an occurrence that is skipped below
                # stays unmatched and can be visited again for a later attested
                # word, so the key must still be there on that second visit.
                word_instance_lemma_type = word_instance_data["word_lemma_type"]

                # TODO check for lemma too?
                # TODO also ensure this is not causing us to drop valid lines
                if (word_instance_lemma_type and word_instance_lemma_type != "root"):
                    continue

                if (word_instance_data["word_position"] and
                        "position" in attested_word_gloss and
                        # python "and" operator is short-circuiting so can access "position" below
                        # TODO can we trust this?
                        word_instance_data["word_position"] != attested_word_gloss["position"]
                    ):
                    continue

                # the lemma type was only needed for the check above; leave it
                # out of the returned record
                words_by_pada.append({
                    key: value
                    for key, value in word_instance_data.items()
                    if key != "word_lemma_type"
                })

                word_instance["found"] = True
                break
        else:
            # TODO handle this better? ok to let go maybe
            raise Exception(
                f"Word {attested_word} was not found in the stanza {stanza_no}: {stanza_padas}"
            )

    return words_by_pada
            
      
def annotate_line(line):
    """Expand one stanza-level attestation into one record per attested pāda.

    Reads the stanza's downloaded VedaWeb document, finds the pāda(s) that
    contain the attested word(s) for this line's root and stem type, and adds
    the pāda text and metrical information to a copy of the line.

    Args:
        line: dict with at least "stanza", "root", "stem_type" and "pada".

    Returns:
        A list of dicts (possibly empty), one per matched word occurrence.
        Each is the line merged with the word data, with "line_no" rewritten
        to "<stanza>.<pada_id>", "pada" removed, and "text_padapatha",
        "text_samhitapatha", "meter_scansion" (where the corresponding text
        version exists in the document) and "meter_name" added.
    """
    # the downloaded document holds accented transliteration, so the encoding
    # is stated explicitly; the file is only needed for the load itself
    with open(f"downloads/vedaweb/{line['stanza']}.json", encoding="utf-8") as f:
        stanza = json.load(f)

    stanza_attested_words = roots_attested_words_by_stanza[line["root"]][line["stem_type"]][line["stanza"]]

    words_by_pada = get_words_by_pada(stanza_attested_words, stanza["padas"], line["stanza"])
    #pprint(words_by_pada)

    padas = []

    for word in words_by_pada:
        pada = line | word

        # TODO rename line_no to location everywhere
        pada["line_no"] = pada["stanza"] + "." + pada["pada_id"]
        # this is not needed now
        pada.pop("pada")

        # TODO try out getting index from stanza info directly and see if we still
        # get the same results
        pada_no = pada_char_to_no(pada["pada_id"])

        for version in stanza["versions"]:
            if version["id"] == "version_lubotsky":
                pada["text_padapatha"] = version["form"][pada_no]
                break

        for version in stanza["versions"]:
            if version["id"] == "version_vannootenholland":
                # TODO deal with * at the beginning of the text here?
                pada["text_samhitapatha"] = version["form"][pada_no]
                pada["meter_scansion"] = version["metricalData"][pada_no]
                break

        pada["meter_name"] = stanza["stanzaType"] or ''

        # TODO add these too:
        # get these info from hellewig too?
        # strata, late_addition
        #
        # pada_labels
        #
        # hymn_absolute_no, hymn_addressee, hymn_group
        #
        # present in search results too: 'hymnAddressee', 'hymnGroup', 'stanzaStrata'

        padas.append(pada)

    return padas

    
#line_annotated = annotate_line(rv_lines[1])
#pprint(line_annotated)

#rv_lines_annotated = [annotate_line(line) for line in rv_lines]

# TODO rename the annotate function here
# annotate_line returns a list of pāda records per line, so flatten while collecting
rv_lines_annotated = [pada for line in rv_lines for pada in annotate_line(line)]

pprint(rv_lines_annotated[0])
print(f"\nTotal number of lines: {len(rv_lines_annotated)}")

{'attestation_texts': '',
 'book': '01',
 'hymn': '01.063',
 'language_period': 'Earlier',
 'line_no': '01.063.02.d',
 'meter_name': 'Triṣṭubh',
 'meter_scansion': 'SS LLS SSLS LL',
 'pada_id': 'd',
 'pada_label': 'M',
 'root': 'iṣ 1',
 'stanza': '01.063.02',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'text_padapatha': 'púraḥ iṣṇā́si= puruhūta pūrvī́ḥ',
 'text_samhitapatha': 'púra iṣṇā́si puruhūta pūrvī́ḥ',
 'variant_no': '',
 'weak_only': False,
 'word': 'iṣṇā́si',
 'word_position': 'intermediate',
 'word_position_no': 2}

Total number of lines: 13


### Saving the Final Line Results

In [216]:
# Tabulate the annotated pāda records and save them as the final result.
df_rv_lines = pandas.DataFrame(rv_lines_annotated)
df_rv_lines.to_csv("data/rv_lines_ninth_class.csv", index=None)
df_rv_lines.head(100)

Unnamed: 0,line_no,stem,stem_type,root,variant_no,weak_only,attestation_texts,language_period,book,hymn,stanza,pada_id,pada_label,word,word_position_no,word_position,text_padapatha,text_samhitapatha,meter_scansion,meter_name
0,01.063.02.d,iṣṇā,strong,iṣ 1,,False,,Earlier,1,1.063,01.063.02,d,M,iṣṇā́si,2,intermediate,púraḥ iṣṇā́si= puruhūta pūrvī́ḥ,púra iṣṇā́si puruhūta pūrvī́ḥ,SS LLS SSLS LL,Triṣṭubh
1,01.063.04.b,ubhnā,strong,ubh,,False,V.,Earlier,1,1.063,01.063.04,b,Mh,ubhnā́ḥ,5,,vr̥trám yát vajrin= vr̥ṣakarman ubhnā́ḥ,vr̥tráṁ yád vajrin vr̥ṣakarman ubhnā́ḥ,LL L LL SSLS LL,Triṣṭubh
2,09.079.02.b,junī,weak,jū,,False,V.,Earlier,9,9.079,09.079.02,b,M,junīmási,5,,dhánā vā yébhiḥ= árvataḥ} junīmási,dhánā vā yébhir árvato junīmási,SL L LS LSL SLSS,Jagatī
3,01.027.07.b,junā,strong,jū,,False,V.,Earlier,1,1.027,01.027.07,b,M,junā́ḥ,4,,ávāḥ vā́jeṣu yám junā́ḥ,ávā vā́jeṣu yáṁ junā́ḥ,SL LLS L SL,Gāyatrī
4,02.012.02.b,ramṇā,strong,ram,,False,V.B.,Earlier,2,2.012,02.012.02,b,M,áramṇāt,4,,yáḥ párvatān= prákupitān} áramṇāt,yáḥ párvatān prákupitām̐ áramṇāt,L LSL SSSL SLL,Triṣṭubh
5,04.024.10.b,krīṇā,strong,krī,,False,,Earlier & Later,4,4.024,04.024.10,b,PE2,krīṇāti,2,verse final,índram krīṇāti dhenúbhiḥ,índraṁ krīṇāti dhenúbhiḥ,LL LLS LSL,Aṇuṣṭubh
6,09.067.24.b,punī,weak,pū,,False,,Earlier & Later,9,9.067,09.067.24,b,MO,punīhi,3,verse final,ágne-_ téna punīhi naḥ,ágne téna punīhi naḥ,LL LS SLS L,Gāyatrī
7,09.067.24.c,punī,weak,pū,,False,,Earlier & Later,9,9.067,09.067.24,c,MO,punīhi,2,verse final,brahmasavaíḥ punīhi naḥ,brahmasavaíḥ punīhi naḥ,LSSL SLS L,Gāyatrī
8,01.133.01.a,punā,strong,pū,,False,,Earlier & Later,1,1.133,01.133.01,a,P,punāmi,2,intermediate,ubhé+_ punāmi= ródasī+_} r̥téna,ubhé punāmi ródasī r̥téna,SL SLS LSL SLS,Triṣṭubh
9,07.007.03.b,prīṇī,weak,prī,,False,,Earlier & Later,7,7.007,07.007.03,b,M,prīṇīté,1,intermediate,prīṇīté?_ agníḥ= īḷitáḥ} ná+_ hótā,prīṇīté agnír īḷitó ná hótā,LLL LS LSL S LL,Triṣṭubh


## Validating the data

### Checking for missing roots

In [None]:
import numpy as np

In [244]:
# our starting list of roots
#print(np.sort(df_roots["root"].unique()))

Present roots:

In [242]:
print(np.sort(df_rv_lines["root"].unique()))

['iṣ 1' 'jū' 'krī' 'prī' 'pū' 'ram' 'ubh' 'vr̥ ~ vr̥̄']


Missing roots:

In [243]:
print(np.setdiff1d(df_roots["root"].unique(), df_rv_lines["root"].unique()))

['aś 1' 'aś 2' 'badh' 'bhrī' 'dhū̆' 'drū' 'dr̥' 'grath' 'gr̥ 1' 'gr̥ 2'
 'gr̥bh' 'gr̥h' 'hru' 'hr̥' 'i' 'jā' 'jī' 'kliś' 'kuṣ' 'kṣi' 'lu' 'math'
 'mr̥' 'mr̥d' 'muṣ' 'mī̆' 'pruṣ' 'pr̥' 'puṣ' 'ri' 'si' 'skabh' 'spr̥'
 'stabh' 'str̥' 'subh' 'uṣ' 'vlī̆' 'vr̥ 1' 'śam' 'ścam' 'śrath' 'śrī 1'
 'śrī 2' 'śr̥']


### Checking found word against padapatha text

In [245]:
# Records of words that do not match the padapāṭha text at their stated position.
mismatches = []

no_of_mismatches = 0

# Marker characters attached to words in the Lubotsky padapāṭha text.
# '}' is included because it occurs in the text as well (e.g. "árvataḥ}");
# left in place it would cause a false mismatch whenever it is attached to the
# word being checked.
LUBOTSKY_PADAPATHA_SPECIAL_CHARACTERS = ['-', '_', '=', '?', '+', '}']

def clean_lubotsky_padapatha(string):
    """Return string with all Lubotsky padapāṭha marker characters removed."""
    return ''.join(c for c in string if c not in LUBOTSKY_PADAPATHA_SPECIAL_CHARACTERS)


# Check that each found word is the word at its stated position in the
# cleaned padapāṭha text of its pāda.
for line in rv_lines_annotated:
    padapatha_text_cleaned = clean_lubotsky_padapatha(line["text_padapatha"])
    padapatha_parts = padapatha_text_cleaned.split(' ')

    # word_position_no is one-based. A position outside the text is reported
    # as a mismatch rather than raising IndexError or (for position 0)
    # silently wrapping around to the last word.
    word_index = line["word_position_no"] - 1
    if 0 <= word_index < len(padapatha_parts):
        word_from_padapatha = padapatha_parts[word_index]
    else:
        word_from_padapatha = None

    if line["word"] != word_from_padapatha:
        # keep the details so the mismatches can be inspected after the run
        mismatches.append({
            "line_no": line["line_no"],
            "word_position_no": line["word_position_no"],
            "stem": line["stem"],
            "word": line["word"],
            "word_from_padapatha": word_from_padapatha,
            "text_padapatha": line["text_padapatha"],
        })
        no_of_mismatches += 1
        print(
            f"{line['line_no']}:{line['word_position_no']}",
            f"({line['stem']}) {line['word']} ≠ {word_from_padapatha}",
            #f"[{padapatha_text_cleaned}]",
            f"[{line['text_padapatha']}]"
        )

print(f"\nFound {no_of_mismatches} mismatches.")


Found 0 mismatches.


Ignore cases of final vowel lengthening above (especially with imperatives) -- they are not really mismatches.