# In [2]:  (Jupyter cell prompt left over from the notebook export)
#!/usr/bin/env python3.10

import pandas as pd
import re
import pickle
import sys

from bs4 import BeautifulSoup
from functions import transliterate_xml
from clean_text import text_cleaner

sys.path.insert(1, '../utilities/')
from timeis import timeis, yellow, line, tic, toc

# Folder the source XML files are read from (they are opened as UTF-16 below).
cst_dir = "../Cst4/Xml/"
# Base folder name for generated files.
output_dir = "output/"

# XML files to process, in this order; the sutta counter keeps running
# from one file to the next.
file_list = ["s0201m.mul.xml", "s0202m.mul.xml", "s0203m.mul.xml"]

# Word list for the Majjhima mūla: first column of a tab-separated,
# header-less csv.
mn_df = pd.read_csv(
	"../frequency maps/output/word count csvs/sutta_majjhima_mūla.csv",
	sep="\t", header=None)
mn_words_list = mn_df[0].tolist()

# All-inflections dict, unpickled from the inflection generator's output.
# The function below reads an "inflections" and a "sutta1" entry per headword.
# NOTE(review): pickle.load executes whatever the file contains - only load
# a pickle this project generated itself.
with open(
		"../inflection generator/output/all inflections dict",
		"rb") as pickle_file:
	all_inflection_dict = pickle.load(pickle_file)


def find_inflections_with_no_eg1(word, inflection_dict=None):
	"""Return the inflections of every flagged headword that `word` may belong to.

	A headword qualifies when `word` is one of its "inflections" and its
	"sutta1" entry equals False (presumably: no first sutta example has been
	recorded for it yet - confirm against the code that builds the dict).

	Args:
		word: the inflected form to look up.
		inflection_dict: mapping of headword -> {"inflections": ...,
			"sutta1": ...}. Defaults to the module-level
			`all_inflection_dict`, which is what existing callers rely on.

	Returns:
		A list with the inflections of all qualifying headwords, concatenated
		in dict order. Forms shared by several headwords appear once per
		headword. Empty when nothing qualifies.
	"""
	if inflection_dict is None:
		inflection_dict = all_inflection_dict

	search_list = []
	for entry in inflection_dict.values():
		# "sutta1" is only read for headwords that contain the word.
		# `== False` is kept on purpose: truthiness (`not x`) would also
		# accept None / "" and `is False` would reject numpy booleans.
		if word in entry["inflections"] and entry["sutta1"] == False:  # noqa: E712
			search_list += entry["inflections"]
	return search_list


# Take the first word of the MN word list for which
# find_inflections_with_no_eg1() returns anything; the forms it returns
# are the ones searched for below.
search_list = []  # stays empty if the word list is empty or nothing qualifies
for word in mn_words_list:
	search_list = find_inflections_with_no_eg1(word)
	if search_list:
		break

print(search_list)

# Build ONE pattern that matches a sentence containing ANY of the forms.
# (Compiling inside a loop over the forms kept only the last form's pattern.)
# The alternation sits in a non-capturing group so the capture groups stay
# 1 = leading delimiter, 2 = sentence, 3 = trailing delimiter, which is what
# the code indexing result[1] relies on.
# NOTE(review): forms are matched as substrings, not whole words - confirm
# that this is intended.
if search_list:
	# dict.fromkeys drops repeated forms while keeping their order
	search_forms = "|".join(
		re.escape(form) for form in dict.fromkeys(search_list))
	search_sentence = re.compile(
		rf"(\. |! |\? |^)(.[^.]*?(?:{search_forms}).*?($|\. |! |\? ))")
else:
	# nothing to look for: a pattern that can never match
	search_sentence = re.compile(r"(?!)")

sutta_counter = 0

# Defaults, so that a match found before the first chapter head or the
# first numbered subhead is written out instead of raising NameError.
vagga_name = ""
sutta_name = ""
sutta_name_clean = ""
subtitle = ""

# Open the results file once for the whole run. It used to be re-opened in
# append mode for every child node, leaking a file handle each time.
with open("output/temp/test.txt", "w", encoding="utf-8") as txt:

	for file_name in file_list:

		# transliterate the xml file into roman and save it
		with open(f"{cst_dir}{file_name}", "r", encoding="UTF-16") as f:
			xml = f.read()

		xml = transliterate_xml(xml)

		with open(
				f"../Cst4/xml roman/{file_name}", "w", encoding="utf-8") as w:
			w.write(xml)

		# make the soup
		soup = BeautifulSoup(xml, "xml")

		# remove all the "pb" tags together with their content
		for pb in soup.find_all("pb"):
			pb.decompose()

		# remove all the notes together with their content
		for note in soup.find_all("note"):
			note.decompose()

		# drop the "n" attribute from every <p> (the tags themselves stay)
		for pn in soup.find_all("p"):
			del pn["n"]

		# remove the hi tags but keep their content
		for hi in soup.find_all("hi"):
			hi.unwrap()

		# save the stripped-down xml for inspection
		with open(
				f"output/temp/{file_name}.xml", "w", encoding="utf-8") as w:
			w.write(soup.prettify())

		# paṇṇāsa = head.rend book
		# vagga = head.chapter
		# sutta name = p.subhead(with number)
		# subtitle = p.subhead(no number)

		nikaya = soup.find_all("p", rend="nikaya")[0].string
		pannasa = soup.find_all("head", rend="book")[0].string

		for div in soup.find_all("div", type="vagga"):

			if div.head["rend"] == "chapter":
				vagga_name = div.head.string

			# every direct child of the vagga div, in document order
			for para in div.children:

				# get sutta and subtitle name and numbers
				if "subhead" in str(para):
					# .string is None when a tag has more than one child,
					# e.g. text left in pieces by unwrap(); use the
					# concatenated text in that case
					heading = para.string
					if heading is None:
						heading = para.get_text()

					if re.match(r"\d", heading):
						# numbered subhead = start of a new sutta
						sutta_counter += 1
						sutta_name = heading
						sutta_name_clean = re.sub(r"^\d*\. ", "", sutta_name)
						subtitle = ""  # close subtitle on each new sutta
					else:
						subtitle = heading

				# get text and search it; each result is a tuple of
				# (leading delimiter, sentence, trailing delimiter)
				text = para.get_text()
				results = search_sentence.findall(text)

				# clean up results and write them out
				for result in results:
					clean_result = text_cleaner(result[1])
					txt.write(f"MN {sutta_counter}\t{sutta_name_clean}")
					if subtitle != "":
						txt.write(f", {subtitle}")
					txt.write(f"\t{clean_result}")
					txt.write(
						f"\n({nikaya} {pannasa} {vagga_name} {sutta_name})\n\n")


# Output of print(search_list) from the notebook run:
# ['homa', 'hotu', 'hontu', 'hosi', 'hoti', 'homi', 'hohi', 'hotha', 'honti']
