In [9]:
import sys,re
sys.path.append('../')
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd 
import numpy as np 

In [150]:
# Build `bible`: a dict mapping a verse label to that verse's text, where the
# text is the verse's lemmas joined by single spaces.
#
# The input is read as tab-separated rows; column 0 is used as the token,
# column 2 as the part-of-speech tag and column 4 as the lemma. Rows whose
# token contains "VERSE-" start a new verse instead of contributing a word.
with open('../assets/bible/kjv-adorned.txt', 'r') as file:
    kjv_tokens = file.readlines()

bible = {}
current_ver = None
for line in kjv_tokens:
    # Drop the line terminator first so it cannot end up inside the last
    # column (which is the lemma whenever a row has exactly five columns).
    fields = line.rstrip("\n").split("\t")
    if len(fields) < 5:
        # Blank or truncated rows carry no usable token/pos/lemma triple.
        continue
    token, pos, lemma = fields[0], fields[2], fields[4]

    # Capitalised tokens tagged with a "vv" part of speech keep their surface
    # form instead of the lemma. `token[:1]` tolerates an empty token.
    if token[:1].isupper() and "vv" in pos:
        lemma = token

    if "VERSE-" in token:
        current_ver = token.replace("VERSE-", "")
        if current_ver[:1].islower():
            # NOTE(review): presumably repairs labels whose leading "J" was
            # lost upstream -- confirm against the source file.
            current_ver = "J" + current_ver
        elif "Acts" in current_ver:
            # Keep only the last two dash-separated fields of an Acts label.
            n = current_ver.split("-")[-2:]
            current_ver = f"Acts-{n[0]}-{n[1]}"
        # A repeated label restarts that verse, as before.
        bible[current_ver] = []
    elif current_ver is not None:
        # Tokens seen before the first verse marker belong to no verse and
        # are ignored rather than raising KeyError on `bible[None]`.
        bible[current_ver].append(lemma)
bible = {k: " ".join(v) for k, v in bible.items()}

In [197]:
import re

# Punctuation that always ends a sub-segment.
_SENTENCE_BREAK = re.compile(r"\.|\;|\:|\?")

# Literal clause openers at which a sub-segment is split further. The leading
# ", " is dropped from the text that is carried over to the next part.
_CLAUSE_MARKERS = [", while", ", let", ", they", ", NONLATINALPHABET",
                   ", then", ", yet", ", than", ', and yet', ', and though',
                   ', at least', ', and to', ', this be', ', for', ', therefore',
                   ', that', ', and we', ', and i ', ', when', ', and say', ', and this',
                   ', and then', ', and than', ', and they', ', i say', ', as the apostle',
                   ', otherwise', ', how', ', according', ', accordi^^', ', say', ', and when',
                   ', and he', ', and she', ', he say', ', she say', ', lest', ', and where',
                   ', and how', ', and what', ', and there', ', and therefore', ', and thus',
                   ', and if', ', and because', ', and I ', ', he will', ', they will', ', she will']

# "but" is the only marker without a leading comma, so it must be anchored on
# word boundaries; a bare "but" also matches inside words such as "tribute"
# or "butter" and would cut them in two.
_CLAUSE_PATTERN = re.compile(
    r"\bbut\b|" + "|".join(map(re.escape, _CLAUSE_MARKERS))
)

# Parts that are discarded after splitting.
# NOTE(review): the last two alternatives have no `$`, so every part that
# *starts with* "say" / "and say" is dropped, not only parts consisting of
# those words alone. Kept as-is to preserve current output -- confirm intent.
_DISCARD = re.compile(
    r"^\)\s*$|^\(\s*$|^[bB]ehold[\s\,]*$|^say[\s\,]*|^and say[\s\,]*"
)


def split_verse(segment):
    """Split one verse string into a list of clause-like parts.

    The verse is first cut at ``.``, ``;``, ``:`` and ``?``. Each resulting
    piece is then cut at every clause marker (the whole word "but" or one of
    the ``", <words>"`` openers); the marker text, minus its leading ``", "``,
    is prepended to the part that follows it.

    Parameters
    ----------
    segment : str
        The verse text.

    Returns
    -------
    list of str
        The parts in their original order, stripped of surrounding spaces.
        Empty parts and parts matching the discard pattern are omitted, so
        the list may be empty.
    """
    pieces = []
    for chunk in _SENTENCE_BREAK.split(segment):
        # split() and findall() use the same group-free pattern, so there is
        # exactly one more part than there are markers.
        parts = _CLAUSE_PATTERN.split(chunk)
        markers = _CLAUSE_PATTERN.findall(chunk)
        pieces.append(parts[0])
        for marker, tail in zip(markers, parts[1:]):
            pieces.append(marker.replace(", ", "") + tail)

    kept = []
    for piece in pieces:
        piece = piece.strip(" ")
        if piece and not _DISCARD.search(piece):
            kept.append(piece)
    return kept

In [198]:
# Split every verse into parts, then flatten into two parallel lists:
# bible_parts[i] is one part and bible_labels[i] is the label of its verse.
segmented_bible = {label: split_verse(verse) for label, verse in bible.items()}

bible_labels, bible_parts = [], []
for label, parts in segmented_bible.items():
    bible_parts.extend(parts)
    # Repeat the label once per part so both lists stay aligned.
    bible_labels.extend([label] * len(parts))

In [199]:
# Fit a word-level TF-IDF model on the verse parts. Rows are left
# un-normalised (norm=None) and term frequencies are sublinearly scaled.
vector = TfidfVectorizer(
    analyzer='word',
    norm=None,
    sublinear_tf=True,
)
tfidf_bible = vector.fit_transform(bible_parts)

# One row per verse part, one column per vocabulary term.
# NOTE(review): toarray() materialises the full dense matrix; fine only
# while parts x vocabulary fits in memory.
df = pd.DataFrame(
    data=tfidf_bible.toarray(),
    columns=vector.get_feature_names_out(),
)

In [200]:
# Total TF-IDF weight of each verse part (sum across the term columns).
doc_scores = df.sum(axis="columns")

In [201]:
# Average TF-IDF weight per space-separated token of each verse part.
#
# The scores are recomputed from `df` rather than divided in place, so
# re-running this cell gives the same result instead of dividing the
# already-averaged scores a second time.
part_lengths = np.array([len(part.split(" ")) for part in bible_parts])
doc_scores = df.sum(axis=1) / part_lengths

# Positions of the parts ordered from highest to lowest average score.
sorted_docs = np.argsort(doc_scores)[::-1]

In [202]:
# Dump the ranked parts: for each part, its verse label on one line and the
# part text on the next, in descending score order.
with open('../assets/bible_parts.txt', 'w') as file:
    for idx in sorted_docs:
        file.write(f"{bible_labels[idx]}\n{bible_parts[idx]}\n")