Additional notebook that builds the most accurate pinpoint references for each word and handles the export to a non-local MongoDB collection.

In [1]:
import json
import os
from pprint import pprint
from statistics import mean, median, mode, stdev, variance, pstdev, pvariance
from copy import deepcopy, copy
from os import listdir
from urllib.parse import quote_plus

from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from pymongo.collation import Collation

In [2]:
# MongoDB connection settings.
# Supply them through the environment variables named below so that real
# credentials are never saved inside the notebook. When a variable is unset the
# value falls back to "", which can still be replaced by hand for a one-off run.
user = os.environ.get("MONGODB_USER", "")
# `uri` is inserted after the '@' of the connection string, i.e. it is the host part.
uri = os.environ.get("MONGODB_URI", "")
password = os.environ.get("MONGODB_PASSWORD", "")

In [3]:
def extract_ids(strongs):
    """Extract the Strong's Concordance ids from
       a string representing the extended strongs
       of Step Bible data.

    An id is the run of decimal digits that immediately follows an 'H'
    marker (for example ``H7225`` or ``H3470a``). An 'H' that is not directly
    followed by a digit (such as the first letter of an English gloss) is
    ignored.

    Args:
        strongs: The raw strongs string of one word.

    Returns:
        A list of ints, in order of appearance. Duplicates are kept and no
        upper bound is applied.
    """
    ids = []
    length = len(strongs)
    pos = 0
    while pos < length:
        if strongs[pos] != 'H':
            pos += 1
            continue
        # Collect the digit run that directly follows the marker. The bound
        # check lets an id at the very end of the string be recorded as well.
        start = pos + 1
        end = start
        while end < length and strongs[end].isdecimal():
            end += 1
        if end > start:
            ids.append(int(strongs[start:end]))
        # Resume at the first unconsumed character, so an 'H' that terminates
        # a digit run is itself examined as a marker.
        pos = end
    return ids

#Is this necessary? Take out if needed
def reduce_strongs(word):
    """Trim the trailing '@...' elaboration from ``word['strongs']`` in place.

    Everything from the last '@' onwards is removed; a value without any '@'
    is left untouched.
        Example: H3470a=יְשַׁעְיָ֫הוּ=Isaiah_§Isaiah@2Ki.19.2 => H3470a=יְשַׁעְיָ֫הוּ=Isaiah_§Isaiah
    """
    head, separator, _ = word['strongs'].rpartition('@')
    # rpartition yields an empty separator when no '@' is present.
    if separator:
        word['strongs'] = head


def remove_extended_strongs(id_list):
    """Return a new list with the ids of ``id_list`` that are at most 8674.

    Order is preserved; ids above 8674 are the ones this notebook treats as
    extended and are dropped.
    """
    kept = []
    for strongs_id in id_list:
        if strongs_id <= 8674:
            kept.append(strongs_id)
    return kept


def add_strongs_ids(word):
    """Attach the non-extended Strong's ids of ``word`` under ``word['ids']``.

    The strongs string is first trimmed by ``reduce_strongs``. When more than
    one id remains and the strongs string contains no '«', the word is also
    flagged with ``word['compound'] = True``.
    """
    reduce_strongs(word)
    strongs = word['strongs']
    word['ids'] = remove_extended_strongs(extract_ids(strongs))
    has_several_ids = len(word['ids']) > 1
    if has_several_ids and '«' not in strongs:
        word['compound'] = True


def fuse_ketiv_qere(ketiv, qere, ref_parts):
    """Merge a ketiv word and its qere counterpart into a single word dict.

    Args:
        ketiv: Word dict of the ketiv reading; must provide 'heb', 'accented',
            'morph', 'strongs' and 'ids'.
        qere: Word dict of the qere reading; same keys plus 'ref'.
        ref_parts: Sequence whose first four items are book, chapter, verse
            and word position (the position is converted with ``int``).

    Returns:
        A new dict with 'ref' (the qere ref minus its last two characters),
        one ``{'<field>_ketiv': ..., '<field>_qere': ...}`` sub-dict for each
        of heb / accented / morph / strongs / ids, and a 'ref_obj'.
    """
    fused = {"ref": qere['ref'][:-2]}
    # Every paired field follows the same '<field>_ketiv' / '<field>_qere' layout.
    for field in ("heb", "accented", "morph", "strongs", "ids"):
        fused[field] = {
            f"{field}_ketiv": ketiv[field],
            f"{field}_qere": qere[field]
        }
    fused["ref_obj"] = {
        "book": ref_parts[0],
        "chapter": ref_parts[1],
        "verse": ref_parts[2],
        "word": int(ref_parts[3])
    }
    return fused


def create_word(line_parts, ref_parts):
    """Build the base word dict from one split data line.

    ``line_parts[0..4]`` become 'ref', 'heb', 'accented', 'morph' and
    'strongs'; ``ref_parts[0..3]`` become the 'ref_obj' entries book, chapter,
    verse and word, the last one converted to ``int``.
    """
    field_names = ("ref", "heb", "accented", "morph", "strongs")
    word = {name: line_parts[index] for index, name in enumerate(field_names)}
    book, chapter, verse, position = ref_parts[0], ref_parts[1], ref_parts[2], ref_parts[3]
    word["ref_obj"] = {
        "book": book,
        "chapter": chapter,
        "verse": verse,
        "word": int(position)
    }
    return word

def add_non_cantillation(word):
    """Store a filtered copy of the longest accented segment on ``word``.

    ``word['accented']`` is split on '/', the longest segment is selected and
    only its characters with a code point above U+05AF are kept. The result is
    written to ``word['non_cantillation']``.
    """
    segments = word['accented'].split('/')
    # Scanning in reverse makes max() settle length ties on the LAST segment.
    longest = max(reversed(segments), key=len)
    kept = [symbol for symbol in longest if ord(symbol) > 0x05AF]
    word['non_cantillation'] = ''.join(kept)

def extract_tl_strongs(strongs):
    """Collect the distinct ids of at most 8674 that follow an 'H' in ``strongs``.

    Ids are returned in order of first appearance. When more than one id was
    collected, the first one is left out of the result.
    """
    found = []
    total = len(strongs)
    pos = 0
    while pos < total:
        if strongs[pos] == 'H':
            pos += 1
            digits_start = pos
            while pos < total and strongs[pos].isnumeric():
                pos += 1
            digits = strongs[digits_start:pos]
            if digits:
                number = int(digits)
                if number <= 8674 and number not in found:
                    found.append(number)
        # Note: this also steps over the character right after a digit run.
        pos += 1
    return found[1:] if len(found) > 1 else found


def remove_niqqud(text):
    """Return ``text`` reduced to its characters with a code point of U+05D0 or above."""
    kept = []
    for symbol in text:
        if ord(symbol) >= 0x05D0:
            kept.append(symbol)
    return "".join(kept)

def read_bible_text_file(file_path):
    """Parse one tab-separated bible text file into a list of word dicts.

    The first three lines of the file are skipped. On every following line the
    second column is discarded and the remaining columns are read as ref, heb,
    accented, morph and strongs. Each word gets 'ids', 'non_cantillation' and
    'tls' added. A word whose ref ends in 'Q' is fused with the word before it
    (see ``fuse_ketiv_qere``) and replaces that previous entry in the result.

    Args:
        file_path: Path of the UTF-8 encoded text file.

    Returns:
        The list of word dicts in file order.
    """
    batch_list = []
    with open(file_path, 'r', encoding='utf-8') as file:
        prev_word = None
        #Ignore schema
        for _ in range(3):
            file.readline()
        for line in file:
            stripped = line.strip()
            # A blank line (e.g. at the end of the file) carries no word and
            # would otherwise fail on the column removal below.
            if not stripped:
                continue
            line_parts = stripped.split('\t')
            #Remove KJV Mapping
            del line_parts[1]
            ref_parts = line_parts[0].replace('-', '.').split('.')
            word = create_word(line_parts, ref_parts)
            add_strongs_ids(word)
            add_non_cantillation(word)
            if word['ref'][-1] == 'Q':
                # The qere line is expected right after its ketiv line: fuse
                # both and drop the ketiv entry that was appended last round.
                word = fuse_ketiv_qere(prev_word, word, ref_parts)
                del batch_list[-1]
            strongs = word['strongs']
            if isinstance(strongs, dict):
                # Fused ketiv/qere words carry one strongs string per reading.
                word['tls'] = {
                    'tls_ketiv' : extract_tl_strongs(strongs['strongs_ketiv']),
                    'tls_qere' : extract_tl_strongs(strongs['strongs_qere'])
                }
            else:
                word['tls'] = extract_tl_strongs(strongs)
            batch_list.append(word)
            prev_word = word
    return batch_list

In [4]:
bible_words = []
bible_defs = {}
bible_parts_location = '../hebrew_sources/stepbible/STEPBible-Data/parts/'

# clean_defs.json holds one JSON object per line; each definition is keyed by
# its 1-based line number and its 'total_freq' entry is dropped.
with open('../definitions/clean_defs.json', 'r', encoding='utf-8') as definitions:
    for i, definition in enumerate(definitions, start = 1):
        bible_defs[i] = json.loads(definition)
        del bible_defs[i]['total_freq']

with open('../definitions/book_map.json', 'r', encoding='utf-8') as book_map_file:
    book_lookup = json.load(book_map_file)

# listdir() returns entries in arbitrary order; sorting keeps the word order
# (and everything derived from it) identical from run to run.
for bible_part in sorted(listdir(bible_parts_location)):
    bible_words += read_bible_text_file(bible_parts_location + bible_part)

print(f'Processed {len(bible_words)} words. Expected value is 305495')


Processed 305495 words. Expected value is 305495


In [5]:
def insert_ref(tls_id, ref):
    """Record one occurrence of definition ``tls_id`` at position ``ref``.

    Args:
        tls_id: Key of the definition inside ``bible_defs``.
        ref: Mapping with the keys 'book', 'chapter', 'verse' and 'word'.

    Side effects:
        Appends the word position to
        ``bible_defs[tls_id]['refs'][book][chapter][verse]`` (creating missing
        levels) and increments ``bible_defs[tls_id]['frequency']``. Both
        entries are created when the definition has no 'refs' yet.
    """
    definition = bible_defs[tls_id]
    # Read the parts by key rather than by position so the result does not
    # depend on the insertion order of `ref`.
    book, chapter, verse, word = ref['book'], ref['chapter'], ref['verse'], ref['word']
    if 'refs' not in definition:
        definition['refs'] = {}
        definition['frequency'] = 0
    verses = definition['refs'].setdefault(book, {}).setdefault(chapter, {})
    verses.setdefault(verse, []).append(word)
    definition['frequency'] += 1
    
# Register every word position under each of its definition ids.
# NOTE: insert_ref appends and increments, so re-running this cell without
# rebuilding bible_defs first records every reference a second time.
for word in bible_words:
    tls = word['tls']
    ref = word['ref_obj']
    if isinstance(tls, dict):
        # Fused ketiv/qere word: read both id lists by key, not by dict order.
        ketiv_ids = tls['tls_ketiv']
        qere_ids = tls['tls_qere']
        for tls_id in ketiv_ids:
            insert_ref(tls_id, ref)
        # An id shared by both readings is recorded once for this position,
        # even when the two lists only partially overlap.
        for tls_id in qere_ids:
            if tls_id not in ketiv_ids:
                insert_ref(tls_id, ref)
    else:
        for tls_id in tls:
            insert_ref(tls_id, ref)

In [6]:
# Replace each definition's list of variant records by the distinct 'niqqud'
# strings it contains.
for definition in bible_defs.values():
    # A dict is used as an ordered set: first-seen order is kept, so the
    # exported file is identical from run to run.
    variants = {}
    for variant in definition['variants']:
        # Entries that are already plain strings (cell re-run) are kept as-is.
        niqqud = variant['niqqud'] if isinstance(variant, dict) else variant
        variants[niqqud] = None
    definition['variants'] = list(variants)

In [None]:
# Export every definition as one JSON document per line (non-ASCII kept as-is).
with open('../definitions/hebrew_reference.json', 'w', encoding='utf-8') as dictionary:
    dictionary.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n"
        for entry in bible_defs.values()
    )

In [None]:
# Build the SRV connection string. The username and password are
# percent-escaped so that reserved characters such as '@', ':' or '/' in the
# credentials cannot corrupt the URI.
conn_str = "mongodb+srv://{}:{}@{}/?retryWrites=true&w=majority".format(
    quote_plus(user), quote_plus(password), uri
)
client = MongoClient(conn_str, server_api=ServerApi('1'))
try:
    # server_info() forces a round trip, so it doubles as a connectivity check.
    print(client.server_info())
except Exception as error:
    # Keep the notebook running, but show why the connection failed.
    print(f"Unable to connect to server: {type(error).__name__}: {error}")

In [None]:
# Ask the server for the names of its databases and show them as a sanity
# check before selecting one.
databases = client.list_database_names()
print(f'These are the databases: {databases}')

In [None]:
# Handles for the "dictionaries" database and its "hebrew" collection.
db = client["dictionaries"]
collection = db["hebrew"]

In [None]:
# Materialise the definition dicts (without their bible_defs keys) as a list.
defs_list = list(bible_defs.values())