# Verse Lines with Roots and Stems

## Organizing Root Attestations Data by Verse Lines (pādas)

In [1]:
import pandas

# see 3_roots_with_attestations.ipynb for details on this file
# keep_default_na=False: empty cells are loaded as "" instead of NaN, which is
# what lets the attestation columns be compared against "" and split as plain
# strings further down
df_roots = pandas.read_csv("data/roots_with_attestations.csv", keep_default_na=False)
#df_roots.head()

In [2]:
# Keep only the roots that have at least one attestation, strong or weak.
# Empty attestation cells are "" (the CSV is read with keep_default_na=False),
# so the check is a string comparison rather than an isna() test.
df_roots_attested = df_roots[
    (df_roots["strong_attestations"] != "")
    | (df_roots["weak_attestations"] != "")
]

df_roots_attested.head()

Unnamed: 0,root_guess,variant_no,strong_stem,weak_stem,weak_only,attestation_texts,language_period,present_class,root,strong_attestations,strong_attestations_total,weak_attestations,weak_attestations_total
1,iṣ,,iṣṇā,iṣṇī,False,,Earlier,ninth,iṣ 1,01.063.02,1,,0
2,ubh,,ubhnā,ubhnī,False,V.,Earlier,ninth,ubh,01.063.04 04.019.04,2,,0
4,kṣi,,kṣiṇā,kṣiṇī,False,V.B.,Earlier,ninth,kṣī,04.018.12 10.027.04 10.027.13,3,,0
6,gr̥bh,,gr̥bhṇā,gr̥bhṇī,False,V.B.,Earlier,ninth,gr̥bhⁱ,01.055.02 01.163.02 03.030.05 05.031.07 07.101...,10,09.046.04 09.106.03 10.062.01 10.062.02 10.062...,6
7,jū,,junā,junī,False,V.,Earlier,ninth,jū,01.027.07 01.071.06 01.186.05 07.086.07,4,09.079.02,1


In [3]:
roots_attested = df_roots_attested.to_dict("records")

rv_lines = []
for root in roots_attested:
    # Detach the stem-specific fields from the record; whatever is left in
    # `root` afterwards is the metadata copied onto every line of this root.
    stanza_nos_by_stem_type = {
        "weak": root.pop("weak_attestations").split(),
        "strong": root.pop("strong_attestations").split(),
    }
    stem_by_stem_type = {
        "weak": root.pop("weak_stem"),
        "strong": root.pop("strong_stem"),
    }
    # FIXME just eliminate these? since these are only stanza
    del root["weak_attestations_total"], root["strong_attestations_total"]

    # line_no starts out as the stanza number (it is updated to the actual
    # line number later); weak attestations are emitted first, then strong ones
    rv_lines.extend(
        {"line_no": stanza_no, "stem": stem_by_stem_type[stem_type], "stem_type": stem_type} | root
        for stem_type, stanza_nos in stanza_nos_by_stem_type.items()
        for stanza_no in stanza_nos
    )

rv_lines[0]

{'line_no': '01.063.02',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'root_guess': 'iṣ',
 'variant_no': '',
 'weak_only': False,
 'attestation_texts': '',
 'language_period': 'Earlier',
 'present_class': 'ninth',
 'root': 'iṣ 1'}

## Parsing Line numbers into Constituents

In [4]:
# "1.1.1b"    > "01" "001" "01" "b"
# "01.063.02" > "01" "063" "02" ""
def parse_rv_line_no(string):
    """Split a Rigveda line reference into zero-padded, fully qualified ids.

    Accepted forms (numeric parts may be given with or without zero padding):

    * ``book.hymn.stanza``         e.g. ``"01.063.02"``
    * ``book.hymn.stanza<pada>``   e.g. ``"1.1.1b"``
    * ``book.hymn.stanza.<pada>``  e.g. ``"01.063.02.d"`` -- the form this
      function itself produces under ``"pada"``, so its output parses again

    Args:
        string: the line reference to parse.

    Returns:
        A dict with the keys ``"book"`` (2 digits), ``"hymn"``
        (``book.hymn``, hymn padded to 3 digits), ``"stanza"``
        (``book.hymn.stanza``, stanza padded to 2 digits) and ``"pada"``
        (``book.hymn.stanza.pada``, or ``""`` when no pada was given).

    Raises:
        ValueError: if the reference does not have three or four dot-separated
            parts, or if the book, hymn or stanza part is empty.
    """
    line_no_parts = string.split(".")
    if len(line_no_parts) not in (3, 4) or not all(line_no_parts[:3]):
        raise ValueError(f"Cannot parse Rigveda line number: {string!r}")

    book, hymn, stanza = line_no_parts[:3]
    # an explicit fourth part is the pada ("01.063.02.d")
    pada = line_no_parts[3] if len(line_no_parts) == 4 else ""

    # otherwise the pada may be glued onto the stanza number ("1.1.1b")
    if not pada and stanza[-1].isalpha():
        stanza, pada = stanza[:-1], stanza[-1]

    book = book.zfill(2)
    hymn = hymn.zfill(3)
    stanza = stanza.zfill(2)

    return {
        "book"    : book,
        "hymn"    : f"{book}.{hymn}",
        "stanza"  : f"{book}.{hymn}.{stanza}",
        "pada"    : f"{book}.{hymn}.{stanza}.{pada}" if pada else ""
    }

# add the parsed ids (book, hymn, stanza, pada) to every line record;
# `|` is a dict union, so the keys already on the line are kept
rv_lines = [line | (parse_rv_line_no(line["line_no"])) for line in rv_lines]

rv_lines[0]

{'line_no': '01.063.02',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'root_guess': 'iṣ',
 'variant_no': '',
 'weak_only': False,
 'attestation_texts': '',
 'language_period': 'Earlier',
 'present_class': 'ninth',
 'root': 'iṣ 1',
 'book': '01',
 'hymn': '01.063',
 'stanza': '01.063.02',
 'pada': ''}

## Enriching Verse Lines with Stanza Metadata

### Downloading annotation data

In [5]:
# folder in which the downloaded stanza files are stored
# (-p: create parent folders too, and do not fail if it already exists)
!mkdir -p downloads/vedaweb

In [6]:
import os
import requests
import time

VEDAWEB_API_URL = "https://vedaweb.uni-koeln.de/rigveda/api"

# the distinct stanza ids referenced by the lines, in ascending order
rv_stanza_nos = sorted({line["stanza"] for line in rv_lines})

In [7]:
# number of stanzas for which to download the stanza data
len(rv_stanza_nos)

814

In [8]:
# Download the VedaWeb document of every needed stanza into downloads/vedaweb.
# A stanza whose file already exists is skipped, so this cell can be re-run
# (e.g. after a network error) without fetching everything again.
REQUEST_TIMEOUT_SECONDS = 30

# do not depend on a separate shell cell having created the folder
os.makedirs("downloads/vedaweb", exist_ok=True)

for stanza_no in rv_stanza_nos:
    print(f"Getting data for stanza: {stanza_no}")

    stanza_file = f"downloads/vedaweb/{stanza_no}.json"
    if os.path.exists(stanza_file):
        # we already downloaded this stanza so continue to the next one
        continue

    # eg: https://vedaweb.uni-koeln.de/rigveda/api/document/id/0100102
    vedaweb_doc_id = stanza_no.replace('.', '')
    vedaweb_doc_url = f"{VEDAWEB_API_URL}/document/id/{vedaweb_doc_id}"

    # requests never times out by default; without this a stalled connection
    # would block the notebook indefinitely
    response = requests.get(vedaweb_doc_url, timeout=REQUEST_TIMEOUT_SECONDS)
    # raises an exception on non-200 responses, since we want to know and act on it
    response.raise_for_status()

    # Write to a temporary name and rename once complete: an interrupted write
    # must not leave a truncated file behind, because the existence check above
    # would treat it as a finished download on the next run.
    # The encoding is explicit so the result does not depend on the platform default.
    partial_file = f"{stanza_file}.part"
    with open(partial_file, 'w', encoding="utf-8") as f:
        f.write(response.text)
    os.replace(partial_file, stanza_file)

    # so that we don't hammer the api
    time.sleep(0.5)

print("Done!")

Getting data for stanza: 01.010.04
Getting data for stanza: 01.010.07
Getting data for stanza: 01.010.08
Getting data for stanza: 01.012.01
Getting data for stanza: 01.013.02
Getting data for stanza: 01.013.05
Getting data for stanza: 01.013.12
Getting data for stanza: 01.015.02
Getting data for stanza: 01.015.03
Getting data for stanza: 01.017.09
Getting data for stanza: 01.018.01
Getting data for stanza: 01.018.04
Getting data for stanza: 01.018.08
Getting data for stanza: 01.023.21
Getting data for stanza: 01.025.01
Getting data for stanza: 01.027.07
Getting data for stanza: 01.027.12
Getting data for stanza: 01.028.06
Getting data for stanza: 01.030.12
Getting data for stanza: 01.030.14
Getting data for stanza: 01.030.15
Getting data for stanza: 01.031.03
Getting data for stanza: 01.031.07
Getting data for stanza: 01.031.08
Getting data for stanza: 01.032.03
Getting data for stanza: 01.032.04
Getting data for stanza: 01.035.09
Getting data for stanza: 01.036.03
Getting data for sta

### Enriching the lines with text and metrical info

In [9]:
import json

# JSON text is UTF-8, so say so explicitly instead of relying on the
# platform's default encoding
with open("data/roots_with_attested_words.json", encoding="utf-8") as f:
    roots_with_attested_words = json.load(f)

In [18]:
# in the same folder as this notebook
import src.lib.verse_lines as verse_lines

# useful during testing to pick up changes in the file
import importlib
importlib.reload(verse_lines)

# a single stanza attestation can yield several pada lines, so the results
# of annotate_line are flattened into one list
rv_lines_annotated = [
    annotated_line
    for line in rv_lines
    for annotated_line in verse_lines.annotate_line(line, roots_with_attested_words)
]

print(f"\nTotal number of lines: {len(rv_lines_annotated)}")

rv_lines_annotated[0] # sample


Total number of lines: 881


{'line_no': '01.063.02.d',
 'stem': 'iṣṇā',
 'stem_type': 'strong',
 'root_guess': 'iṣ',
 'variant_no': '',
 'weak_only': False,
 'attestation_texts': '',
 'language_period': 'Earlier',
 'present_class': 'ninth',
 'root': 'iṣ 1',
 'book': '01',
 'hymn': '01.063',
 'stanza': '01.063.02',
 'pada_id': 'd',
 'pada_label': 'M',
 'word': 'iṣṇā́si',
 'word_position_no': 2,
 'word_position': 'intermediate',
 'word_gloss': '2.SG.PRS.IND.ACT',
 'text_padapatha': 'púraḥ iṣṇā́si= puruhūta pūrvī́ḥ',
 'text_samhitapatha': 'púra iṣṇā́si puruhūta pūrvī́ḥ',
 'meter_scansion': 'SS LLS SSLS LL',
 'stanza_meter': 'Triṣṭubh',
 'stanza_strata': 'A',
 'stanza_strata_based_on_meter_only': False,
 'stanza_late_addition': '',
 'hymn_absolute_no': 63,
 'hymn_addressee': 'Indra',
 'hymn_group': 'Hymns of Nodhas, Descendant of Gotama'}

## Saving the Results

In [19]:
# one row per annotated line; the positional row index is not written to the file
df_rv_lines = pandas.DataFrame(rv_lines_annotated)
df_rv_lines.to_csv("data/rv_lines.csv", index=False)
df_rv_lines.head()

Unnamed: 0,line_no,stem,stem_type,root_guess,variant_no,weak_only,attestation_texts,language_period,present_class,root,...,text_padapatha,text_samhitapatha,meter_scansion,stanza_meter,stanza_strata,stanza_strata_based_on_meter_only,stanza_late_addition,hymn_absolute_no,hymn_addressee,hymn_group
0,01.063.02.d,iṣṇā,strong,iṣ,,False,,Earlier,ninth,iṣ 1,...,púraḥ iṣṇā́si= puruhūta pūrvī́ḥ,púra iṣṇā́si puruhūta pūrvī́ḥ,SS LLS SSLS LL,Triṣṭubh,A,False,,63,Indra,"Hymns of Nodhas, Descendant of Gotama"
1,01.063.04.b,ubhnā,strong,ubh,,False,V.,Earlier,ninth,ubh,...,vr̥trám yát vajrin= vr̥ṣakarman ubhnā́ḥ,vr̥tráṁ yád vajrin vr̥ṣakarman ubhnā́ḥ,LL L LL SSLS LL,Triṣṭubh,A,False,,63,Indra,"Hymns of Nodhas, Descendant of Gotama"
2,04.019.04.c,ubhnā,strong,ubh,,False,V.,Earlier,ninth,ubh,...,dr̥ḷhā́ni aubhnāt= uśámānaḥ ójaḥ,dr̥r̥ḷhā́ni+ aubhnād uśámāna ójo,SLLS LL SSLS LL,,S,False,,315,Indra,Hymns to Indra
3,04.018.12.d,kṣiṇā,strong,kṣi,,False,V.B.,Earlier,ninth,kṣī,...,yát prá ákṣiṇāḥ= pitáram pādagŕ̥hya,yát prā́kṣiṇāḥ pitáram pādagŕ̥hya,L LSL SSL LSLS,Triṣṭubh,P,False,"[Grassmann (G), Arnold (C1)]",314,"Dialogue Between Indra, Aditi and Vamadeva",Hymns to Indra
4,10.027.04.d,kṣiṇā,strong,kṣi,,False,V.B.,Earlier,ninth,kṣī,...,prá tám kṣiṇām= párvate-_ pādagŕ̥hya,prá táṁ kṣiṇām párvate pādagŕ̥hya,S L SL LSL LSLS,Triṣṭubh,P,False,[Arnold (C1)],853,Indra,The Vasukra Hymns


## Validation

### Checking for missing roots

In [12]:
import numpy as np

def _sorted_unique_roots(df, present_class):
    """Return the sorted, de-duplicated "root" values of `df` for one present class."""
    # A boolean mask instead of a query string assembled by interpolation:
    # a class name containing a quote character cannot break the expression.
    roots = df.loc[df["present_class"] == present_class, "root"]
    return np.sort(roots.unique())


def print_roots_attestation_info(df_roots, df_rv_lines, present_class):
    """Print the roots of one present class: all, attested, and missing.

    Args:
        df_roots: frame with "present_class" and "root" columns, holding the
            starting list of roots.
        df_rv_lines: frame with the same two columns, holding one row per
            attested line.
        present_class: value of "present_class" to report on, e.g. "ninth".

    Returns:
        None. Three sections are printed: the starting list, the roots found
        in `df_rv_lines`, and the roots of the starting list that are not
        found there, each with its count.
    """
    # our starting list of roots
    roots_initial = _sorted_unique_roots(df_roots, present_class)
    print(f"Starting list ({len(roots_initial)}):\n{roots_initial}\n")

    # attested roots
    roots_present = _sorted_unique_roots(df_rv_lines, present_class)
    print(f"Attested ({len(roots_present)}):\n{roots_present}\n")

    # missing roots (setdiff1d already returns the sorted unique difference)
    roots_absent = np.setdiff1d(roots_initial, roots_present)
    print(f"Missing ({len(roots_absent)}):\n{roots_absent}\n")

In [13]:
# coverage report for the roots of the ninth present class
print_roots_attestation_info(df_roots, df_rv_lines, "ninth")

Starting list (53):
['aś' 'aśⁱ' 'bandh' 'bhrī' 'dhu dhū' 'drū' 'dr̥' 'grath' 'gr̥' 'gr̥bhⁱ'
 'gr̥hⁱ' 'gr̥̄ 1' 'hr̥̄' 'hvr̥ hru' 'i 2' 'iṣ 1' 'jyā' 'jñā' 'jū' 'kliś'
 'krī' 'kuṣ' 'kṣī' 'lu' 'mathⁱ' 'mr̥d' 'mr̥̄ 1' 'muṣⁱ' 'mī 1' 'pruṣ' 'prī'
 'pr̥̄ 1' 'puṣ' 'pū' 'ram' 'rī' 'skambhⁱ' 'spr̥' 'stambhⁱ' 'str̥̄' 'subh'
 'sā si' 'ubh' 'uṣ' 'vli vlī' 'vr̥' 'vr̥ vr̥̄' 'śam' 'ścam' 'śrathⁱ' 'śrī'
 'śrī 2' 'śr̥̄ 1']

Attested (31):
['aśⁱ' 'bandh' 'gr̥bhⁱ' 'gr̥hⁱ' 'gr̥̄ 1' 'hr̥̄' 'hvr̥ hru' 'iṣ 1' 'jyā'
 'jñā' 'jū' 'krī' 'kṣī' 'mathⁱ' 'mr̥̄ 1' 'muṣⁱ' 'mī 1' 'prī' 'pr̥̄ 1' 'pū'
 'ram' 'rī' 'skambhⁱ' 'stambhⁱ' 'str̥̄' 'sā si' 'ubh' 'vr̥ vr̥̄' 'śrathⁱ'
 'śrī' 'śr̥̄ 1']

Missing (22):
['aś' 'bhrī' 'dhu dhū' 'drū' 'dr̥' 'grath' 'gr̥' 'i 2' 'kliś' 'kuṣ' 'lu'
 'mr̥d' 'pruṣ' 'puṣ' 'spr̥' 'subh' 'uṣ' 'vli vlī' 'vr̥' 'śam' 'ścam'
 'śrī 2']



In [14]:
# coverage report for the roots of the fifth present class
print_roots_attestation_info(df_roots, df_rv_lines, "fifth")

Starting list (49):
['akṣ' 'ci' 'ci 1' 'dabh' 'dagh' 'dhi' 'dhr̥ṣ' 'dhū' 'du' 'dāś' 'hi' 'i 2'
 'jagh' 'ji 2 jinv' 'kr̥' 'kṣi' 'kṣubh' 'lu' 'mi' 'mi 1' 'naś 1' 'pi'
 'pruṣⁱ' 'pr̥' 'pr̥ 1' 'ri' 'rādh' 'r̥ 1' 'r̥dh' 'sadh' 'sagh' 'si'
 'skabh' 'sku' 'spr̥' 'stabh' 'stigh' 'str̥' 'stu' 'su' 'takṣ' 'ti' 'tr̥p'
 'u 1' 'vr̥' 'vr̥ vr̥̄' 'āp' 'śak' 'śru']

Attested (23):
['ci 1' 'dabh' 'dhr̥ṣ' 'dhū' 'dāś' 'hi' 'i 2' 'ji 2 jinv' 'kr̥' 'mi 1'
 'naś 1' 'pruṣⁱ' 'r̥ 1' 'r̥dh' 'sagh' 'spr̥' 'str̥' 'su' 'tr̥p' 'u 1'
 'vr̥' 'śak' 'śru']

Missing (26):
['akṣ' 'ci' 'dagh' 'dhi' 'du' 'jagh' 'kṣi' 'kṣubh' 'lu' 'mi' 'pi' 'pr̥'
 'pr̥ 1' 'ri' 'rādh' 'sadh' 'si' 'skabh' 'sku' 'stabh' 'stigh' 'stu'
 'takṣ' 'ti' 'vr̥ vr̥̄' 'āp']



TODO explain that these roots not being in RV is ok (and expected):
    
* Some roots are attested only with the weak stem before a vowel (i.e. no _nī_ form), e.g. [_uṣ_](https://vedaweb.uni-koeln.de/rigveda/view/id/09.097.39).
* Some roots are attested in later Vedic texts.
* Some roots are already marked as later language in Whitney (to be found only in Classical / Epic Sanskrit).

TODO also try searching by alternate stem/root vowel forms for the missing roots (and also for the attested?), to see if they are actually recorded in those forms

### Checking found word against padapatha text

In [15]:
# from src/lib/meter.py
from src.lib.meter import clean_lubotsky_padapatha

# Cross-check every annotated line: the word recorded for the line must be the
# word found at `word_position_no` in the cleaned padapatha text.
# Every disagreement is printed and also collected in `mismatches`.
mismatches = []

for line in rv_lines_annotated:
    padapatha_text_cleaned = clean_lubotsky_padapatha(line["text_padapatha"])
    padapatha_parts = padapatha_text_cleaned.split(' ')

    # word_position_no counts from 1. A position outside the text is reported
    # as a mismatch (shown as None) instead of raising IndexError or, for
    # position 0, silently wrapping around to the last word.
    word_index = line["word_position_no"] - 1
    if 0 <= word_index < len(padapatha_parts):
        word_from_padapatha = padapatha_parts[word_index]
    else:
        word_from_padapatha = None

    if line["word"] != word_from_padapatha:
        mismatches.append({
            "line_no": line["line_no"],
            "word_position_no": line["word_position_no"],
            "stem": line["stem"],
            "word": line["word"],
            "word_from_padapatha": word_from_padapatha,
            "text_padapatha": line["text_padapatha"],
        })
        print(
            f"{line['line_no']}:{line['word_position_no']}",
            f"({line['stem']}) {line['word']} ≠ {word_from_padapatha}",
            f"[{line['text_padapatha']}]"
        )

no_of_mismatches = len(mismatches)

print(f"\nFound {no_of_mismatches} mismatches.")

09.104.03.a:1 (punā) punā́ta ≠ punā́tā [punā́tā+ dakṣasā́dhanam]
02.033.13.c:3 (vr̥ṇī) ávr̥ṇīta ≠ ávr̥ṇītā [yā́ni mánuḥ= ávr̥ṇītā+} pitā́ naḥ]
07.033.02.d:3 (vr̥ṇī) avr̥ṇīta ≠ avr̥ṇītā [sutā́t índraḥ= avr̥ṇītā+} vásiṣṭhān]
06.028.06.b:3 (kr̥ṇu) kr̥ṇutha ≠ kr̥ṇuthā [aśrīrám cit= kr̥ṇuthā+ suprátīkam]
01.110.03.d:3 (kr̥ṇu) akr̥ṇuta ≠ akr̥ṇutā [ékam sántam= akr̥ṇutā+} cáturvayam]
05.049.05.c:4 (kr̥ṇu) kr̥ṇutá ≠ kr̥ṇutā́ [áva etu ábhvam= kr̥ṇutā́+} várīyaḥ]
06.025.03.d:3 (kr̥ṇu) kr̥ṇuhí ≠ kr̥ṇuhī́ [jahí vŕ̥ṣṇyāni= kr̥ṇuhī́?_+} párācaḥ]
08.027.18.a:4 (kr̥ṇu) kr̥ṇutha ≠ kr̥ṇuthā [ájre?_ cit asmai= kr̥ṇuthā+} nyáñcanam]
10.067.11.a:3 (kr̥ṇu) kr̥ṇuta ≠ kr̥ṇutā [satyā́m āśíṣam= kr̥ṇutā+} vayodhaí]
10.078.08.a:4 (kr̥ṇu) kr̥ṇuta ≠ kr̥ṇutā [subhāgā́n naḥ= devāḥ kr̥ṇutā+ surátnān]
01.161.11.a:3 (kr̥ṇo) akr̥ṇotana ≠ akr̥ṇotanā [udvátsu asmai= akr̥ṇotanā+ tŕ̥ṇam]
08.045.22.c:3 (aśnu) aśnuhi ≠ aśnuhī [tr̥mpā́+ ví aśnuhī?_+ mádam]
10.066.14.d:4 (dhunu dhūnu) dhūnuta ≠ dhūnutā [asmé?_ devāsaḥ= áva dhūnu

Ignore cases of final vowel lengthening above (especially with imperatives) -- they are not really mismatches.

TODO look at vowel positions for these? but not really in the scope of our investigation

## Summary of Results

In [16]:
import pandas
# read the saved CSV back in, so the summary is computed from what is on disk;
# keep_default_na=False keeps empty cells as "" instead of NaN
res_df_rv_lines = pandas.read_csv("data/rv_lines.csv", keep_default_na=False)
res_df_rv_lines.head()

Unnamed: 0,line_no,stem,stem_type,root_guess,variant_no,weak_only,attestation_texts,language_period,present_class,root,...,text_padapatha,text_samhitapatha,meter_scansion,stanza_meter,stanza_strata,strata_based_on_meter_only,stanza_late_addition,hymn_absolute_no,hymn_addressee,hymn_group
0,01.063.02.d,iṣṇā,strong,iṣ,,False,,Earlier,ninth,iṣ 1,...,púraḥ iṣṇā́si= puruhūta pūrvī́ḥ,púra iṣṇā́si puruhūta pūrvī́ḥ,SS LLS SSLS LL,Triṣṭubh,A,False,,63,Indra,"Hymns of Nodhas, Descendant of Gotama"
1,01.063.04.b,ubhnā,strong,ubh,,False,V.,Earlier,ninth,ubh,...,vr̥trám yát vajrin= vr̥ṣakarman ubhnā́ḥ,vr̥tráṁ yád vajrin vr̥ṣakarman ubhnā́ḥ,LL L LL SSLS LL,Triṣṭubh,A,False,,63,Indra,"Hymns of Nodhas, Descendant of Gotama"
2,04.019.04.c,ubhnā,strong,ubh,,False,V.,Earlier,ninth,ubh,...,dr̥ḷhā́ni aubhnāt= uśámānaḥ ójaḥ,dr̥r̥ḷhā́ni+ aubhnād uśámāna ójo,SLLS LL SSLS LL,,S,False,,315,Indra,Hymns to Indra
3,04.018.12.d,kṣiṇā,strong,kṣi,,False,V.B.,Earlier,ninth,kṣī,...,yát prá ákṣiṇāḥ= pitáram pādagŕ̥hya,yát prā́kṣiṇāḥ pitáram pādagŕ̥hya,L LSL SSL LSLS,Triṣṭubh,P,False,"['Grassmann (G)', 'Arnold (C1)']",314,"Dialogue Between Indra, Aditi and Vamadeva",Hymns to Indra
4,10.027.04.d,kṣiṇā,strong,kṣi,,False,V.B.,Earlier,ninth,kṣī,...,prá tám kṣiṇām= párvate-_ pādagŕ̥hya,prá táṁ kṣiṇām párvate pādagŕ̥hya,S L SL LSL LSLS,Triṣṭubh,P,False,['Arnold (C1)'],853,Indra,The Vasukra Hymns


In [17]:
# number of lines per present class and stem type, both keys in descending order
(
    res_df_rv_lines
    .groupby(["present_class", "stem_type"])
    .size()
    .to_frame("count")
    .reset_index()
    .sort_values(["present_class", "stem_type"], ascending=[False, False])
)

Unnamed: 0,present_class,stem_type,count
3,ninth,weak,207
2,ninth,strong,141
1,fifth,weak,271
0,fifth,strong,262
