# Embedding the Documents

In [1]:
import os
import json
import numpy as np
import threading
import time
from cltk import NLP

ModuleNotFoundError: No module named 'cltk'

In [2]:
class NumpyArrayEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy arrays and NumPy scalars.

    Pass it as ``cls=NumpyArrayEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert NumPy objects to built-in Python values.

        Args:
            obj: The object the stock encoder could not serialise.

        Returns:
            A nested list for ``np.ndarray`` input, or the equivalent
            built-in scalar for NumPy scalar input (e.g. ``np.float32``).

        Raises:
            TypeError: For any other unsupported type (raised by the
                base-class implementation).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # NumPy scalars (np.float32, np.int64, np.bool_, ...) are not
        # JSON-serialisable on their own; unwrap them to Python scalars.
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)

In [7]:
# Directory that receives the "<name>_embedding.json" files written by the embedding step.
embedding_dir = "./data/embedding/"
# Directory that is scanned for the input "*.json" documents.
doc_dir = "./data/raw/"

In [4]:
def nlp_file(filename, fpath, tpath):
    """Run the CLTK Latin pipeline over one JSON document and save the result.

    The JSON at ``fpath`` is iterated item by item; every item must provide
    the keys ``"texts"``, ``"title"`` and ``"bks"``. Each item gains two new
    keys, ``"sentences"`` and ``"sentence_embeddings"``, and the enriched
    document is written to ``tpath`` as JSON.

    Args:
        filename: Label used only in the progress messages.
        fpath: Path of the input JSON file.
        tpath: Path of the output JSON file. Its parent directory is created
            if it does not exist yet.
    """
    print(f"--- Start Process for {filename}, {time.ctime()}")

    # Create the output directory up front so a missing folder is reported
    # before the analysis runs instead of after it.
    out_dir = os.path.dirname(tpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(fpath, encoding="utf-8") as jf:
        wdoc = json.load(jf)
    cltk_nlp = NLP(language='lat', suppress_banner=True)

    for bk in wdoc:
        cltk_doc = cltk_nlp.analyze(bk["texts"])
        bk["sentences"] = cltk_doc.sentences_strings
        bk["sentence_embeddings"] = cltk_doc.sentence_embeddings
        print(f"------ Finished {filename}, {bk['title']}, {bk['bks']}")

    # NumpyArrayEncoder turns the embedding arrays into plain lists.
    with open(tpath, 'w', encoding="utf-8") as jf:
        json.dump(wdoc, jf, cls=NumpyArrayEncoder)
    print(f"--- End Process for {filename}, {time.ctime()}")


In [5]:
from concurrent.futures import ThreadPoolExecutor


In [6]:
def get_result(future):
    """Done-callback: print whatever the finished task returned."""
    print(future.result())


# Queue one embedding job per "*.json" file found directly inside doc_dir.
with ThreadPoolExecutor(max_workers=16) as pool:
    for entry_name in os.listdir(doc_dir):
        src_path = os.path.join(doc_dir, entry_name)
        stem, suffix = os.path.splitext(entry_name)

        if os.path.isfile(src_path) and suffix == '.json':
            dst_path = os.path.join(embedding_dir, stem + '_embedding' + suffix)
            task = pool.submit(nlp_file, stem, src_path, dst_path)
            task.add_done_callback(get_result)
            # Wait one second before queuing the next file.
            time.sleep(1)
        

    


--- Start Process for Collectanea_rerum_mirabilium, Fri Jun 24 22:41:54 2022
--- Start Process for De_agri_cultura, Fri Jun 24 22:41:55 2022
--- Start Process for De_rerum_naturis, Fri Jun 24 22:41:56 2022
--- Start Process for Etymologiarum_libri_XX, Fri Jun 24 22:41:57 2022
--- Start Process for Natura_Histori, Fri Jun 24 22:41:58 2022
--- Start Process for Physica_Bingensis, Fri Jun 24 22:41:59 2022
--- Start Process for Res_rustica, Fri Jun 24 22:42:00 2022
------ Finished Etymologiarum_libri_XX,  , Praefatio
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultura-0
------ Finished Collectanea_rerum_mirabilium, SOLINVS ADVENTO SALVTEM, Collectanea rerum mirabilium-0
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultura-1
------ Finished Collectanea_rerum_mirabilium, SOLINVS ADVENTO SALVTEM, Collectanea rerum mirabilium-1
------ Finished De_agri_cultura, M. PORCI CATONIS CENSORIS DE AGRI CVLTVRA, De agri cultu

# Data Filter

In [15]:
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import tqdm
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchmetrics.functional as Fm
import json
import umap
# import networkx as nx

In [5]:
# Directory holding the "*_embedding.json" files that the cells below read.
embedding_dir = "./data/embedding/"
# Directory of the raw input documents.
doc_dir = "./data/raw/"

In [6]:
# Tally sentences per embedding file and collect every sentence's character length.
count_sum = 0
stn_len = []
for file_name in os.listdir(embedding_dir):
    fn, ext = os.path.splitext(file_name)
    if ext == '.json':
        with open(os.path.join(embedding_dir, file_name)) as jf:
            data = json.load(jf)

        # One entry per sentence, across all books of this file.
        lengths = [len(stn) for bk in data for stn in bk["sentences"]]
        stn_len.extend(lengths)

        count = len(lengths)
        count_sum += count
        print(fn, count)

Collectanea_rerum_mirabilium_embedding 2413
De_agri_cultura_embedding 1607
De_rerum_naturis_embedding 14257
Etymologiarum_libri_XX_embedding 17891
Natura_Histori_embedding 31245
Physica_Bingensis_embedding 908
Res_rustica_embedding 7871


In [7]:
# 10th and 90th percentile of the per-sentence character counts;
# later cells use them as lower / upper length bounds.
lp, hp = np.percentile(stn_len, [10, 90])
print(lp, hp)

21.0 179.0


In [11]:
# Distribution of characters per sentence with the accepted [lp, hp] band
# drawn on top (Chinese labels: "character count", "valid range",
# "quality control - characters per sentence").
y_top = 0.03  # upper limit of the y axis and height of the shaded band

valid_band = go.Scatter(
    x=[lp, hp, hp, lp, lp],
    y=[0, 0, y_top, y_top, 0],
    name='有效区间',
    fill='toself',
)

fig = ff.create_distplot([stn_len], ['字符数'], show_rug=False)
fig.add_trace(valid_band)
fig.update_layout(width=800, height=500, title="质量控制-每句字符数")
fig.update_xaxes(range=(0, 300))
fig.update_yaxes(range=(0, y_top))
fig.write_json("data/result/fig/qc_control.json")

In [9]:
# Collect metadata and embeddings of every sentence that passes the filters:
# length within [lp, hp] and a non-zero embedding vector.
sentence_records = []
embedding_matrix = []
for file_name in os.listdir(embedding_dir):
    fn, ext = os.path.splitext(file_name)
    if ext != '.json':
        continue
    file_path = os.path.join(embedding_dir, file_name)
    with open(file_path) as jf:
        data = json.load(jf)

    # Human-readable work title derived from the file name.
    work_title = fn.removesuffix('_embedding').replace('_', ' ').title()

    for bk in tqdm.tqdm(data):
        for sti, stn in enumerate(bk["sentences"]):
            # Embeddings are keyed by the sentence position as a string.
            emb = bk["sentence_embeddings"][str(sti)]
            if not (lp <= len(stn) <= hp) or np.linalg.norm(emb) < 1e-8:
                continue
            sentence_records.append({
                "path": file_path,
                "title": work_title,
                "bki": bk['bks'],
                "bkt": bk['title'],
                "sti": sti,
                "stn": stn,
            })
            embedding_matrix.append(emb)

sentences_info = pd.DataFrame(sentence_records)
embedding_tensor = torch.FloatTensor(embedding_matrix)


100%|██████████| 276/276 [00:00<00:00, 8903.19it/s]
100%|██████████| 182/182 [00:00<00:00, 7279.97it/s]
100%|██████████| 310/310 [00:00<00:00, 1802.32it/s]
100%|██████████| 21/21 [00:00<00:00, 84.00it/s]
100%|██████████| 38/38 [00:00<00:00, 68.10it/s]
100%|██████████| 9/9 [00:00<00:00, 2249.89it/s]
100%|██████████| 13/13 [00:00<00:00, 126.21it/s]


In [13]:
# Set to True to overwrite the cached tensor / table with the freshly built
# ones; leave False to load the cached copies instead.
update = False
if update:
    os.makedirs("data/result/embeds", exist_ok=True)
    torch.save(embedding_tensor, "data/result/embeds/embedding.pt")
    sentences_info.to_csv("data/result/embeds/sentences_info.csv")
else:
    embedding_tensor = torch.load("data/result/embeds/embedding.pt")
    # to_csv above stores the DataFrame index as the first (unnamed) column;
    # read it back as the index so no stray "Unnamed: 0" column appears.
    sentences_info = pd.read_csv("data/result/embeds/sentences_info.csv", index_col=0)

In [16]:
# Pairwise cosine similarity between all sentence embeddings.
# Recomputed and cached when `update` is set; otherwise loaded from the
# cache so that `cos_sim` is defined for the cells that use it later.
if update:
    cos_sim = Fm.pairwise_cosine_similarity(embedding_tensor)
    torch.save(cos_sim, "data/cosine_similarity.pt")
else:
    cos_sim = torch.load("data/cosine_similarity.pt")

In [None]:
# Project the sentence embeddings to 3 dimensions with UMAP (cosine metric).
umap_3d = umap.UMAP(n_neighbors=20, n_components=3, min_dist=0.1, metric='cosine')
uembedding = umap_3d.fit_transform(embedding_tensor)

In [57]:
# 3-D scatter of the UMAP projection, coloured by work title; exported as
# HTML and JSON. (Title reads "Unsupervised sentence-embedding space (UMAP)".)
fig_umap3d = px.scatter_3d(
    data_frame=sentences_info,
    x=uembedding[:, 0],
    y=uembedding[:, 1],
    z=uembedding[:, 2],
    color='title',
    hover_data=['bkt', 'stn'],
    title="无监督句向量嵌入空间(UMAP)",
    width=1000,
    height=800,
)
fig_umap3d.write_html("data/result/fig/umap_embedding3d.html")
fig_umap3d.write_json("data/result/fig/umap_embedding3d.json")

In [30]:
# Project the sentence embeddings to 2 dimensions with UMAP (cosine metric).
umap_2d = umap.UMAP(n_neighbors=20, n_components=2, min_dist=0.1, metric='cosine')
uembedding2d = umap_2d.fit_transform(embedding_tensor)

In [48]:
# For every column of the similarity matrix, keep the 20 largest values
# (`vals`) and the row indices they come from (`inds`).
vals, inds = cos_sim.topk(20, dim=0)

In [35]:
# 2-D scatter of the UMAP projection, coloured by work title; exported as
# HTML and JSON. (Title reads "Unsupervised sentence-embedding space (UMAP)".)
fig_umap = px.scatter(
    data_frame=sentences_info,
    x=uembedding2d[:, 0],
    y=uembedding2d[:, 1],
    color='title',
    hover_data=['bkt', 'stn'],
    title="无监督句向量嵌入空间(UMAP)",
    width=1200,
    height=1000,
)

fig_umap.write_html("data/result/fig/umap_embedding.html")
fig_umap.write_json("data/result/fig/umap_embedding.json")

In [65]:
from torch.utils.data import random_split
from torchnlp.encoders.label_encoder import LabelEncoder


In [None]:
# (A stray bare `to_list` expression was removed here: the name is not
# defined anywhere in the notebook, so the cell raised NameError when run.)

In [86]:
# Start from an empty graph that the following cell fills with edges.
# NOTE(review): `nx` is not bound anywhere in the visible notebook; the
# `import networkx as nx` line in the import cell is commented out, so this
# raises NameError on a fresh kernel. Re-enable that import before running.
G = nx.Graph()

In [85]:
# `inds` comes from topk(..., 20, dim=0), so column j holds the 20 row
# indices most similar to sentence j and each row of `inds.T` has 20 entries.
# add_edges_from only accepts 2-tuples (or 3-tuples with edge data), so expand
# every row into (sentence, neighbour) pairs of plain Python ints.
# NOTE(review): assumes the intended graph links each sentence to its top-k
# neighbours; confirm.
G.add_edges_from(
    (node, neighbour)
    for node, neighbours in enumerate(inds.T.tolist())
    for neighbour in neighbours
)

60450

## 生成章目索引

In [68]:
# Build a chapter index per work and dump each chapter's sentences to its own
# JSON file under data/result/split/<work>/, then write the overall index.
contents_ref = {}

# Make sure the output root exists even when no work directory gets created.
os.makedirs('data/result/split/', exist_ok=True)

# Running counter over every sentence of every work.
# NOTE(review): it also advances for rejected sentences, so the stored index
# is a position among ALL sentences, not a row number in the filtered
# embedding tensor / sentences_info table built earlier. Confirm that this
# is what consumers of these files expect.
i = 0
for file_name in os.listdir(embedding_dir):
    fn, ext = os.path.splitext(file_name)
    if ext != '.json':
        continue
    file_path = os.path.join(embedding_dir, file_name)
    with open(file_path) as jf:
        data = json.load(jf)
    # NOTE(review): currently unused; directory and index below are keyed by `fn`.
    work_name = fn.removesuffix('_embedding')
    temp_book_list = []
    work_dir = os.path.join('data/result/split/', fn)
    # makedirs creates missing parents and tolerates an existing directory.
    os.makedirs(work_dir, exist_ok=True)

    for bk in tqdm.tqdm(data):
        temp_book_list.append(dict(
            bki = bk['bks'],
            bkt = bk['title']))
        sentences = []
        for sti, stn in enumerate(bk["sentences"]):
            sentence_length = len(stn)
            emb = bk["sentence_embeddings"][str(sti)]
            # Same filters as the embedding-matrix cell: length outside
            # [lp, hp] or an all-zero embedding marks the sentence as rejected.
            rejected = (
                sentence_length < lp
                or sentence_length > hp
                or np.linalg.norm(emb) < 1e-8
            )
            gi = -1 if rejected else i

            # NOTE(review): the key is spelled "gloabl_ind" (sic). It is left
            # unchanged because files already written and their readers may
            # depend on this exact spelling.
            sentences.append(dict(
                text = stn,
                gloabl_ind = gi))
            i += 1
        with open(os.path.join(work_dir, bk['bks']+'.json'), 'w') as bf:
            json.dump(sentences, bf)
    contents_ref[fn] = temp_book_list
with open('data/result/split/contents_ref.json','w') as cf:
    json.dump(contents_ref, cf)
                


100%|██████████████████████████████████████████████████████████████████████████████| 276/276 [00:00<00:00, 1117.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 182/182 [00:00<00:00, 1119.82it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 310/310 [00:00<00:00, 521.01it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 21/21 [00:00<00:00, 42.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [00:00<00:00, 42.13it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 375.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:00<00:00, 61.61it/s]
