# Embedding the Documents

In [1]:
import os
import json
import numpy as np
import threading
import time
from cltk import NLP


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class NumpyArrayEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy arrays and NumPy scalars.

    Pass it as ``cls=NumpyArrayEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert objects the stock encoder cannot serialise.

        Args:
            obj: The object ``json`` did not know how to encode.

        Returns:
            A nested list for ``np.ndarray`` input, or the equivalent Python
            scalar for a NumPy scalar (``np.float32``, ``np.int64``, ...).

        Raises:
            TypeError: For every other unsupported type (raised by the base
                class).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # NumPy scalars such as np.float32 are not Python floats/ints, so the
        # stock encoder rejects them; .item() yields the plain Python value.
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)

In [3]:
# Corpus root: the JSON files found directly in this folder are processed.
doc_dir = "./data/"
# Destination folder for the generated "<name>_embedding.json" files.
embedding_dir = "./data/embedding/"

In [4]:
def nlp_file(filename, fpath, tpath):
    """Run the CLTK Latin pipeline over one JSON work and save the result.

    The input file must contain a JSON array of objects that each provide the
    keys ``"texts"``, ``"title"`` and ``"bks"``. Every object gains two keys,
    ``"sentences"`` and ``"sentence_embeddings"``, taken from the CLTK
    document returned for its ``"texts"`` value. The enriched array is then
    written to ``tpath``.

    Args:
        filename: Label used only in the progress messages.
        fpath: Path of the JSON file to read.
        tpath: Path of the JSON file to write; its parent folder is created
            when missing.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"--- Start Process for {filename}, {time.ctime()}")

    # Create the output folder before the long-running analysis, so a missing
    # folder cannot make the final write fail after all the work is done.
    out_dir = os.path.dirname(tpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(fpath, encoding="utf-8") as jf:
        wdoc = json.load(jf)
    cltk_nlp = NLP(language='lat', suppress_banner=True)

    for bk in wdoc:
        cltk_doc = cltk_nlp.analyze(bk["texts"])
        bk["sentences"] = cltk_doc.sentences_strings
        # NOTE(review): presumably holds NumPy arrays, hence the custom
        # encoder used below -- confirm against the CLTK Doc API.
        bk["sentence_embeddings"] = cltk_doc.sentence_embeddings
        print(f"------ Finished {filename}, {bk['title']}, {bk['bks']}")
    with open(tpath, 'w', encoding="utf-8") as jf:
        json.dump(wdoc, jf, cls=NumpyArrayEncoder)
    print(f"--- End Process for {filename}, {time.ctime()}")


In [5]:
from concurrent.futures import ThreadPoolExecutor


In [6]:
# Queue one embedding job per JSON file found directly in doc_dir. Leaving the
# `with` block waits until every queued job has finished.
with ThreadPoolExecutor(max_workers=16) as pool:
    # Maps each future to the work it processes so problems can be attributed.
    job_names = {}

    def get_result(future):
        """Done-callback: report a job that was cancelled or raised.

        Successful jobs print nothing here; nlp_file already logs its own
        start/end lines, and its return value is always None.
        """
        name = job_names.get(future, "<unknown>")
        if future.cancelled():
            print(f"!!! Cancelled {name}")
            return
        error = future.exception()
        if error is not None:
            print(f"!!! Failed {name}: {error!r}")

    # Sorted so the submission order does not depend on the file system.
    for tname in sorted(os.listdir(doc_dir)):
        fpath = os.path.join(doc_dir, tname)
        fname, ext = os.path.splitext(tname)

        # Skip sub-folders (such as the embedding output) and non-JSON files.
        if not os.path.isfile(fpath) or ext != '.json':
            continue
        tpath = os.path.join(embedding_dir, fname+'_embedding'+ext)
        future = pool.submit(nlp_file, fname, fpath, tpath)
        # Register the name before attaching the callback, which may fire
        # immediately if the job has already completed.
        job_names[future] = fname
        future.add_done_callback(get_result)
        # NOTE(review): presumably staggers job start-up so the workers do not
        # all initialise the NLP pipeline at once -- confirm it is still needed.
        time.sleep(1)
        

    


--- Start Process for Collectanea_rerum_mirabilium, Fri Jun 24 22:41:54 2022
--- Start Process for De_agri_cultura, Fri Jun 24 22:41:55 2022
--- Start Process for De_rerum_naturis, Fri Jun 24 22:41:56 2022
--- Start Process for Etymologiarum_libri_XX, Fri Jun 24 22:41:57 2022
--- Start Process for Natura_Histori, Fri Jun 24 22:41:58 2022
--- Start Process for Physica_Bingensis, Fri Jun 24 22:41:59 2022
--- Start Process for Res_rustica, Fri Jun 24 22:42:00 2022
------ Finished Etymologiarum_libri_XX,  , Praefatio
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultura-0
------ Finished Collectanea_rerum_mirabilium, SOLINVS ADVENTO SALVTEM, Collectanea rerum mirabilium-0
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultura-1
------ Finished Collectanea_rerum_mirabilium, SOLINVS ADVENTO SALVTEM, Collectanea rerum mirabilium-1
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultu