# Metrical Analysis of Sanskrit Ninth Class Verb Forms

## Getting Verbal Roots 

In [None]:
# Download the accented PDF of Whitney's "Roots" from GRETIL and save it as
# data/whitney_roots.pdf (wget -O writes to exactly that path, so data/ must exist).
!wget -O data/whitney_roots.pdf http://gretil.sub.uni-goettingen.de/gretil_elib/Whi885__Whitney_Roots-ACCENTED.pdf

In [4]:
# install pdftk if not already there. eg: for ubuntu: sudo apt install pdftk
# `cat 229 output ...` copies only page 229 of the PDF into a new one-page PDF.
!pdftk data/whitney_roots.pdf cat 229 output data/whitney_roots_ninth_class.pdf

In [5]:
# produces data/whitney_roots_ninth_class.txt
# (with no output name given, pdftotext writes a .txt file next to the input PDF)
!pdftotext data/whitney_roots_ninth_class.pdf

Clean up the text version manually, fixing formatting and diacritics.

Final results are in [data/whitney_roots_ninth_class_cleaned.txt](data/whitney_roots_ninth_class_cleaned.txt)

In [225]:
# TODO try to get the 9th class forms/roots directly from Lubotsky's concordance?

## Parsing Verbal Roots Info

In [206]:
# Headings that appear in the cleaned Whitney text file. The class heading is skipped;
# the three section headings set the period label for the stems listed below them.
CLASS_HEADER = "6. nā-class"
EARLIER_LANGUAGE_HEADER = "A. Earlier Language"
EARLIER_AND_LATER_LANGUAGE_HEADER = "B. Earlier and Later Language"
LATER_LANGUAGE_HEADER = "C. Later Language"

# Final vowel of a stem as listed: "ā" for the strong form, "ī" for the weak form.
NINTH_CLASS_STRONG_MARKER = "ā"
NINTH_CLASS_WEAK_MARKER = "ī"

# One dict per stem line of the file, in file order.
whitney_roots = []

# Period label of the section currently being read (None until a section heading is seen).
language_period = None

# The file contains IAST diacritics, so decode it explicitly as UTF-8. Relying on the
# platform default encoding can garble the text (or fail outright), which would silently
# break the endswith() / slicing logic below.
with open("data/whitney_roots_ninth_class_cleaned.txt", 'r', encoding="utf-8") as whitney_file:
    for line in whitney_file:
        line = line.rstrip()

        # Blank lines and the class heading carry no data.
        if not line or CLASS_HEADER in line:
            continue
        elif EARLIER_LANGUAGE_HEADER in line:
            language_period = "Earlier"
            continue
        elif EARLIER_AND_LATER_LANGUAGE_HEADER in line:
            language_period = "Earlier & Later"
            continue
        elif LATER_LANGUAGE_HEADER in line:
            language_period = "Later"
            continue

        # Data line layout: [variant number] stem [attestation texts ...]
        line_parts = line.split()
        variant_no = line_parts.pop(0) if line_parts[0].isdigit() else None
        stem = line_parts.pop(0)
        # Whatever follows the stem is kept verbatim (single-space joined); None if absent.
        attestation_texts = " ".join(line_parts) if line_parts else None

        whitney_roots.append({
            # Drop the last two characters of the stem (the class suffix).
            # NOTE(review): assumes precomposed (NFC) diacritics in the cleaned file,
            # so that the suffix is exactly two code points — confirm.
            "root": stem[:-2],
            "variant_no": variant_no,
            "stem": stem,
            # A stem listed with the weak marker is recorded as weak-only.
            "weak_only": stem.endswith(NINTH_CLASS_WEAK_MARKER),
            "attestation_texts": attestation_texts,
            "language_period": language_period,
        })

In [17]:
import pandas

In [207]:
# Tabulate the parsed roots, write them out as CSV (without the index column),
# and preview the first rows.
df_whitney_roots = pandas.DataFrame(whitney_roots)
df_whitney_roots.to_csv(
    "data/whitney_roots_ninth_class.csv",
    index=None,
)
df_whitney_roots.head()

Unnamed: 0,root,variant_no,stem,weak_only,attestation_texts,language_period
0,i,,inī,True,V.,Earlier
1,iṣ,,iṣṇā,False,,Earlier
2,ubh,,ubhnā,False,V.,Earlier
3,uṣ,,uṣṇā,False,V.,Earlier
4,kṣi,,kṣiṇā,False,V.B.,Earlier


## Annotating Verbal Roots with Rig Veda Attestations (Manual)

Using Lubotsky's concordance, attestation info is manually added to [data/whitney_roots_ninth_class.csv](data/whitney_roots_ninth_class.csv).

Final results are in [data/roots_ninth_class.csv](data/roots_ninth_class.csv).

In [210]:
#df_roots = pandas.read_csv("data/roots_ninth_class.csv")
# keep_default_na=False: empty cells are read as "" rather than NaN, so the string
# operations applied to these columns further down (.str.len(), .split()) work on every row.
df_roots = pandas.read_csv("data/roots_ninth_class.csv", keep_default_na=False)

In [211]:
# TODO remove test df once we have all the annotations
# Until then, keep only the rows whose "notes" cell is non-empty.
has_notes = df_roots["notes"].str.len() > 0
df_roots_test = df_roots[has_notes]
df_roots_test.head()

Unnamed: 0,root,variant_no,stem,weak_only,attestation_texts,language_period,rig_veda_weak_attestations,rig_veda_strong_attestations,lubotsky_page_no,notes
1,iṣ,,iṣṇā,False,,Earlier,,1.63.2d,1:,iSnAsi
18,vr̥,1.0,vr̥ṇī,True,V.,Earlier,1.180.4b 1.67.1b 4.25.3a,,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll)
37,pu,,punā,False,,Earlier & Later,9.16.3c 9.67.27d,1.133.1a 10.13.3d,1:900-,punIhi puNAmi


## Annotating Verbal Roots with Rig Veda Attestations

### Getting Rig Veda padapatha text (Eichler)

In [None]:
# Source site and its notes page:
# http://www.detlef108.de/Rigveda.htm 
# http://www.detlef108.de/Notes-to-the-Rigveda-Page.htm 
# Download the padapatha HTML page and save it as data/rv_padapatha_eichler.html.
!wget -O data/rv_padapatha_eichler.html http://www.detlef108.de/RV-Padapatha-TA3-paada-NA-UTF8.html 

In [224]:
from bs4 import BeautifulSoup

# Parse the downloaded page. The file is opened explicitly as UTF-8 so decoding does not
# depend on the platform default, and the parser is named explicitly: without it bs4
# picks whichever parser happens to be installed (and warns about it), so the extracted
# text could differ from one machine to the next.
with open("data/rv_padapatha_eichler.html", "r", encoding="utf-8") as input_file:
    soup = BeautifulSoup(input_file, "html.parser")

hymns = []

for para in soup.find_all("p"):
    # ignore the ending notes (paragraphs whose first child is a <span>);
    # the emptiness check keeps an empty <p></p> from raising IndexError
    if para.contents and para.contents[0].name == "span":
        continue

    # The paragraph text is appended unstripped; stripping it here (rstrip) would
    # remove the extra lines between hymns.
    hymns.append(para.text)

# Write all kept paragraphs back to back into a single UTF-8 text file.
with open("data/rv_padapatha_eichler.txt", 'w', encoding="utf-8") as f:
    f.write("".join(hymns))

In [222]:
# TODO break the padapatha verse into sub-lines

### Getting Rig Veda padapatha / metrically restored texts

In [None]:
# Source repository for the text versions:
# https://github.com/cceh/c-salt_vedaweb_sources/tree/master/rigveda/versions
# description of the sources here:
# https://github.com/cceh/c-salt_vedaweb_tei/blob/master/vedaweb_corpus.tei
# https://vedaweb.uni-koeln.de/rigveda/help

# "lubotsky" version, saved locally as the padapatha JSON.
!wget -O data/rv_padapatha_lubotsky.json https://raw.githubusercontent.com/cceh/c-salt_vedaweb_sources/master/rigveda/versions/lubotsky.json

# "vnh" version, saved locally as the samhitapatha JSON
# (presumably the metrically restored text named in the section heading — TODO confirm).
!wget -O data/rv_samhitapatha_vnh.json https://raw.githubusercontent.com/cceh/c-salt_vedaweb_sources/master/rigveda/versions/vnh.json

In [217]:
# TODO make text version from the jsons, with line numbers at the beginning

### Searching text for ninth-class verbal forms

In [196]:
# TODO

## Organizing Data by Verse Lines (pādas)

In [135]:
#from pprint import pprint

In [251]:
# pprint is used for the preview at the end of this cell. The earlier import of it is
# commented out, so on a fresh kernel ("Restart & Run All") the call would raise NameError.
from pprint import pprint

# One record per attested Rig Veda line: the root's columns plus the line number and
# whether the attested form uses the weak or the strong stem.
rv_lines = []

roots = df_roots_test.to_dict("records")

for root in roots:
    # Pop the two attestation columns (whitespace-separated line numbers) so they are
    # not copied onto every line record built from this root.
    rv_weak_line_nos = root.pop("rig_veda_weak_attestations").split()
    rv_strong_line_nos = root.pop("rig_veda_strong_attestations").split()

    for line_no in rv_weak_line_nos:
        line = {"line_no": line_no, "stem_type": "weak"} | root
        # update stem vowel (since stem won't have weak marker in whitney when strong
        # stem is attested)
        line["stem"] = line["stem"][:-1] + NINTH_CLASS_WEAK_MARKER
        rv_lines.append(line)

    for line_no in rv_strong_line_nos:
        rv_lines.append({"line_no": line_no, "stem_type": "strong"} | root)

pprint(rv_lines[0])

{'attestation_texts': '',
 'language_period': 'Earlier',
 'line_no': '1.63.2d',
 'lubotsky_page_no': '1:',
 'notes': 'iSnAsi',
 'root': 'iṣ',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'variant_no': '',
 'weak_only': False}


### Parsing line numbers

In [253]:
def parse_rv_line_no(string):
    """Split a Rig Veda line reference into zero-padded string components.

    Example: "1.63.2d" -> book "01", hymn "063", stanza "02", pada "d".

    Args:
        string: Reference of the form "book.hymn.stanza", optionally followed
            directly by a single pada letter (e.g. "10.13.3d" or "10.13.3").
            Any dot-separated parts after the third are ignored.

    Returns:
        dict with the keys "book" (left-padded with zeros to 2 characters),
        "hymn" (3), "stanza" (2) and "pada" (the trailing letter, or "" when
        the reference has none).

    Raises:
        ValueError: if the reference has fewer than three dot-separated parts
            or its stanza part is empty.
    """
    line_no_parts = string.split(".")

    # Fail with a message that names the bad reference instead of an opaque
    # IndexError; the references are typed in by hand, so typos are expected.
    if len(line_no_parts) < 3 or not line_no_parts[2]:
        raise ValueError(
            f"expected a 'book.hymn.stanza[pada]' reference, got {string!r}"
        )

    book, hymn, stanza_and_pada = line_no_parts[:3]

    # A trailing letter on the third part is the pada; everything before it is the stanza.
    last_char = stanza_and_pada[-1]
    if last_char.isalpha():
        stanza = stanza_and_pada[:-1]
        pada = last_char
    else:
        stanza = stanza_and_pada
        pada = ""

    return {
        "book": book.zfill(2),
        "hymn": hymn.zfill(3),
        "stanza": stanza.zfill(2),
        "pada": pada
    }

# Add the parsed book / hymn / stanza / pada fields to every line record
# (each record is rebuilt as a new dict; the original keys come first).
rv_lines = [
    {**line, **parse_rv_line_no(line["line_no"])}
    for line in rv_lines
]

print(rv_lines[0])

{'line_no': '1.63.2d', 'stem_type': 'strong', 'root': 'iṣ', 'variant_no': '', 'stem': 'iṣṇā', 'weak_only': False, 'attestation_texts': '', 'language_period': 'Earlier', 'lubotsky_page_no': '1:', 'notes': 'iSnAsi', 'book': '01', 'hymn': '063', 'stanza': '02', 'pada': 'd'}


## Annotating Verse Lines

In [254]:
# TODO
def annotate_line(line):
    """Add the text and meter fields to a line record (placeholder values for now).

    The dict passed in is modified in place and is also the return value.
    The keys are added in the order "text", "meter_scansion", "meter_name".
    """
    # presumably the endpoint the real text will be fetched from — TODO confirm:
    #https://vedaweb.uni-koeln.de/rigveda/api/document/id/0100102
    line.update(
        text="PLACEHOLDER",
        meter_scansion="PLACEHOLDER",
        meter_name="PLACEHOLDER",
    )
    return line

# Annotate every line record and preview the first one.
rv_lines_annotated = list(map(annotate_line, rv_lines))
pprint(rv_lines_annotated[0])

{'attestation_texts': '',
 'book': '01',
 'hymn': '063',
 'language_period': 'Earlier',
 'line_no': '1.63.2d',
 'lubotsky_page_no': '1:',
 'meter_name': 'PLACEHOLDER',
 'meter_scansion': 'PLACEHOLDER',
 'notes': 'iSnAsi',
 'pada': 'd',
 'root': 'iṣ',
 'stanza': '02',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'text': 'PLACEHOLDER',
 'variant_no': '',
 'weak_only': False}


## Saving the Final Line Results

In [255]:
# Tabulate the annotated lines, write them out as CSV (without the index column),
# and preview up to the first 100 rows.
df_rv_lines = pandas.DataFrame(rv_lines_annotated)
df_rv_lines.to_csv(
    "data/rv_lines_ninth_class.csv",
    index=None,
)
df_rv_lines.head(100)

Unnamed: 0,line_no,stem_type,root,variant_no,stem,weak_only,attestation_texts,language_period,lubotsky_page_no,notes,book,hymn,stanza,pada,text,meter_scansion,meter_name
0,1.63.2d,strong,iṣ,,iṣṇā,False,,Earlier,1:,iSnAsi,1,63,2,d,PLACEHOLDER,PLACEHOLDER,PLACEHOLDER
1,1.180.4b,weak,vr̥,1.0,vr̥ṇī,True,V.,Earlier,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll),1,180,4,b,PLACEHOLDER,PLACEHOLDER,PLACEHOLDER
2,1.67.1b,weak,vr̥,1.0,vr̥ṇī,True,V.,Earlier,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll),1,67,1,b,PLACEHOLDER,PLACEHOLDER,PLACEHOLDER
3,4.25.3a,weak,vr̥,1.0,vr̥ṇī,True,V.,Earlier,2:1338-1339,avRNItam vRnIte vRnIte(accented - last syll),4,25,3,a,PLACEHOLDER,PLACEHOLDER,PLACEHOLDER
4,9.16.3c,weak,pu,,punī,False,,Earlier & Later,1:900-,punIhi puNAmi,9,16,3,c,PLACEHOLDER,PLACEHOLDER,PLACEHOLDER
5,9.67.27d,weak,pu,,punī,False,,Earlier & Later,1:900-,punIhi puNAmi,9,67,27,d,PLACEHOLDER,PLACEHOLDER,PLACEHOLDER
6,1.133.1a,strong,pu,,punā,False,,Earlier & Later,1:900-,punIhi puNAmi,1,133,1,a,PLACEHOLDER,PLACEHOLDER,PLACEHOLDER
7,10.13.3d,strong,pu,,punā,False,,Earlier & Later,1:900-,punIhi puNAmi,10,13,3,d,PLACEHOLDER,PLACEHOLDER,PLACEHOLDER
