# Embedding the Documents

In [3]:
import os
import json
import numpy as np
import threading
import time
from cltk import NLP


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class NumpyArrayEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy arrays and NumPy scalars.

    ``np.ndarray`` values are emitted as (nested) lists via ``tolist()``;
    NumPy scalars (``np.generic`` subclasses such as ``np.float32`` or
    ``np.int64``) are emitted as the matching Python scalar via ``item()``.
    Every other unsupported object is delegated to
    ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # e.g. np.float32 / np.int64 are not subclasses of the Python
            # numeric types, so the stock encoder rejects them.
            return obj.item()
        return super().default(obj)

In [2]:
# Directory holding the source JSON documents.
doc_dir = "./data/"
# Directory holding the JSON files enriched with sentence embeddings.
embedding_dir = "./data/embedding/"

In [4]:
def nlp_file(filename, fpath, tpath):
    """Run the CLTK Latin pipeline over one JSON document and save the result.

    The JSON at ``fpath`` is iterated entry by entry; each entry is indexed
    with the keys ``"texts"``, ``"title"`` and ``"bks"`` (presumably a list
    of dicts, one per book/section -- confirm against the data files). Each
    entry gains two keys, ``"sentences"`` and ``"sentence_embeddings"``,
    taken from the analysed CLTK document, and the whole structure is then
    written to ``tpath`` using ``NumpyArrayEncoder``.

    Args:
        filename: Label used only in the progress messages.
        fpath: Path of the JSON file to read.
        tpath: Path of the JSON file to write. Missing parent directories
            are created.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"--- Start Process for {filename}, {time.ctime()}")

    # Create the output location up front so that a bad path fails before
    # the expensive analysis rather than after it.
    out_dir = os.path.dirname(tpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit UTF-8: without it open() falls back to the locale encoding,
    # which is not guaranteed to be UTF-8 on every platform.
    with open(fpath, encoding='utf-8') as jf:
        wdoc = json.load(jf)
    cltk_nlp = NLP(language='lat', suppress_banner=True)

    for bk in wdoc:
        cltk_doc = cltk_nlp.analyze(bk["texts"])
        bk["sentences"] = cltk_doc.sentences_strings
        bk["sentence_embeddings"] = cltk_doc.sentence_embeddings
        print(f"------ Finished {filename}, {bk['title']}, {bk['bks']}")

    with open(tpath, 'w', encoding='utf-8') as jf:
        json.dump(wdoc, jf, cls=NumpyArrayEncoder)
    print(f"--- End Process for {filename}, {time.ctime()}")


In [5]:
from concurrent.futures import ThreadPoolExecutor


In [6]:
# Maps each submitted future to the file name it processes, so that a
# failure can be reported together with the document that caused it.
task_names = {}


def get_result(future):
    """Done-callback: report the outcome of one ``nlp_file`` task.

    An exception raised inside a done-callback is only logged and ignored by
    ``concurrent.futures``, so the worker's exception is inspected here
    explicitly instead of being re-raised through ``future.result()``.
    """
    error = future.exception()
    if error is not None:
        print(f"!!! Failed {task_names.get(future, '<unknown>')}: {error!r}")
        return
    result = future.result()
    if result is not None:
        print(result)


# The workers write into embedding_dir, so make sure it exists first.
os.makedirs(embedding_dir, exist_ok=True)

with ThreadPoolExecutor(max_workers=16) as pool:
    for tname in os.listdir(doc_dir):
        fpath = os.path.join(doc_dir, tname)
        fname, ext = os.path.splitext(tname)

        # Only regular *.json files are documents; skip everything else
        # (including sub-directories).
        if not os.path.isfile(fpath) or ext != '.json':
            continue
        tpath = os.path.join(embedding_dir, fname + '_embedding' + ext)

        future = pool.submit(nlp_file, fname, fpath, tpath)
        # Register the name before attaching the callback: the callback
        # runs immediately if the task has already finished.
        task_names[future] = fname
        future.add_done_callback(get_result)
        # NOTE(review): the one-second pause staggers task start-up; the
        # reason is not visible here -- confirm before removing.
        time.sleep(1)
        

    


--- Start Process for Collectanea_rerum_mirabilium, Fri Jun 24 22:41:54 2022
--- Start Process for De_agri_cultura, Fri Jun 24 22:41:55 2022
--- Start Process for De_rerum_naturis, Fri Jun 24 22:41:56 2022
--- Start Process for Etymologiarum_libri_XX, Fri Jun 24 22:41:57 2022
--- Start Process for Natura_Histori, Fri Jun 24 22:41:58 2022
--- Start Process for Physica_Bingensis, Fri Jun 24 22:41:59 2022
--- Start Process for Res_rustica, Fri Jun 24 22:42:00 2022
------ Finished Etymologiarum_libri_XX,  , Praefatio
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultura-0
------ Finished Collectanea_rerum_mirabilium, SOLINVS ADVENTO SALVTEM, Collectanea rerum mirabilium-0
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultura-1
------ Finished Collectanea_rerum_mirabilium, SOLINVS ADVENTO SALVTEM, Collectanea rerum mirabilium-1
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultu

# Data Filter

In [6]:
import plotly.figure_factory as ff
import plotly.graph_objects as go
import tqdm
import numpy as np
import pandas as pd

In [5]:
# Tally the sentences of every embedded document and collect the character
# length of each sentence for the distribution analysis below.
count_sum = 0
stn_len = []
for file_name in os.listdir(embedding_dir):
    fn, ext = os.path.splitext(file_name)
    if ext == '.json':
        with open(os.path.join(embedding_dir, file_name)) as jf:
            data = json.load(jf)

        # One length per sentence, in document order.
        lengths = [len(stn) for bk in data for stn in bk["sentences"]]
        count = len(lengths)
        stn_len.extend(lengths)
        count_sum += count
        print(fn, count)

Collectanea_rerum_mirabilium_embedding 2413
De_agri_cultura_embedding 1607
De_rerum_naturis_embedding 14257
Etymologiarum_libri_XX_embedding 17891
Natura_Histori_embedding 31245
Physica_Bingensis_embedding 908
Res_rustica_embedding 7871


In [9]:
# Lower / upper sentence-length cut-offs: the 10th and 90th percentiles of
# the per-sentence character counts.
lp, hp = np.percentile(stn_len, [10, 90])
print(lp, hp)

21.0 179.0


In [16]:
# Axis limits shared by the highlighted band and the visible plot area.
x_max = 300
y_max = 0.03

# Distribution of sentence lengths; the series label reads "character count".
fig = ff.create_distplot([stn_len], ['字符数'], show_rug=False)

# Filled rectangle spanning [lp, hp] over the full plot height; its legend
# entry reads "valid interval".
valid_band = go.Scatter(
    x=[lp, hp, hp, lp, lp],
    y=[0, 0, y_max, y_max, 0],
    name='有效区间',
    fill='toself',
)
fig.add_trace(valid_band)

# The title reads "Quality control - characters per sentence".
fig.update_layout(width=800, height=500, title="质量控制-每句字符数")
fig.update_xaxes(range=(0, x_max))
fig.update_yaxes(range=(0, y_max))
fig.show()

In [None]:
# Keep only the sentences whose character length lies within [lp, hp].
# sentences_info[i] describes the sentence whose embedding is
# embedding_matrix[i]; the two lists are appended to in lock-step.
sentences_info = []
embedding_matrix = []
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of both lists reproducible across runs and machines.
for file_name in sorted(os.listdir(embedding_dir)):
    fn, ext = os.path.splitext(file_name)
    if ext != '.json':
        continue
    file_path = os.path.join(embedding_dir, file_name)
    # Explicit UTF-8 so that reading does not depend on the locale encoding;
    # the handle is released before the (longer) filtering loop starts.
    with open(file_path, encoding='utf-8') as jf:
        data = json.load(jf)

    # Human-readable work title derived from the file name; identical for
    # every sentence of the file, so it is computed once here.
    title = fn.removesuffix('_embedding').replace('_', ' ').title()

    for bk in tqdm.tqdm(data, desc=fn):
        for sti, stn in enumerate(bk["sentences"]):
            if not lp <= len(stn) <= hp:
                continue
            sentences_info.append(dict(
                path=file_path,
                title=title,
                bki=bk['bks'],
                bkt=bk['title'],
                url=bk['url'],
                sti=sti,
                stn=stn,
            ))
            # Assumes sentence_embeddings is index-aligned with sentences.
            embedding_matrix.append(bk["sentence_embeddings"][sti])

