# 配置项

In [1]:
# Root document to process — notebook path: Courses / Artificial Intelligence and Machine Learning
root_id = '20220909082530-85ixym4'
# Alternative root (sub-document: ... / Intelligent Cyberspace Security):
# root_id = '20221125094240-jdfn6ce'

# Data directory; every path below is derived from it
path_root = './data/prod'

# Directory for the node tree files
path_tree_dir = path_root + '/tree'

# Node tree file
path_tree_file = path_tree_dir + '/' + root_id + '.json'

# Node tree file after key phrases have been extracted
path_tree_file_with_keys = path_tree_dir + '/' + root_id + '-keys.json'

# Node tree file after nodes inherited the keys of their upper-level nodes
path_tree_file_with_full_keys = path_tree_dir + '/' + root_id + '-full-keys.json'

# Directory where the index files are saved (keeps its trailing slash)
path_index = path_root + '/index/' + root_id + '/'

In [2]:
import os
import json
import datetime

In [3]:
# Make sure the node tree directory exists (no error if it is already there).
os.makedirs(path_tree_dir, exist_ok=True)

In [4]:
# Timing / progress report
def progress(start, current, final):
    """Print a one-line progress report that redraws itself in place.

    The line shows the elapsed time, the estimated remaining time,
    ``current/final`` and the completion percentage. It ends with a carriage
    return instead of a newline, so the next call overwrites the same line.

    Args:
        start: ``datetime.datetime`` taken when the work began.
        current: number of items finished so far.
        final: total number of items.

    The remaining time is a linear extrapolation of the elapsed time. It
    cannot be computed before the first item is finished (or when ``final``
    is 0), so ``N/A`` is shown in that case instead of raising
    ``ZeroDivisionError``.
    """
    elapsed = datetime.datetime.now() - start
    # Guard the empty-workload case: no items means no measurable progress.
    rate = current / final if final else 0.0
    # Extrapolating needs a non-zero completion rate to divide by.
    remaining = elapsed * (1 - rate) / rate if rate > 0 else 'N/A'
    print(f"{str(elapsed):<15} {str(remaining):<15} {current:8}/{final} {rate*100:8.3f}%", end='\r')

# 构造节点树并保存

In [5]:
import pkg.env as env

from pkg.api import API
from pkg.notebook import Notebook, Notebooks
from pkg.client import Client
from pkg.tree import Tree

In [6]:
# Create the API client from the connection settings exposed by pkg.env
# (token, host, port, ssl and proxies).
client = Client(
    token=env.token,
    host=env.host,
    port=env.port,
    ssl=env.ssl,
    proxies=env.proxies,
)

In [7]:
# Ask the client for its notebooks; the result is passed to Tree in the next cell.
notebooks = client.getNotebooks()

In [8]:
# Create the Tree object from the API client and the notebooks fetched above.
tree = Tree(
    client=client,
    notebooks=notebooks,
)

In [9]:
# Build the tree for root_id (set in the configuration cell).
# NOTE(review): presumably walks the document hierarchy below that node — confirm in pkg.tree.
tree.buildTree(
    root_id,
)

20220909082530-85ixym4: 课程/人工智能与机器学习


In [10]:
# Save the built node tree as indented JSON.
# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open(path_tree_file, 'w', encoding='utf-8') as f:
    json.dump(tree.__dict__(), f, indent=2, ensure_ascii=False)

# 提取各节点关键短语并保存

In [11]:
from pkg.extractor import Extractor
from pkg.tree import Tree

In [12]:
# Rebuild the Tree from the JSON file at path_tree_file and report its size.
# The file is decoded as UTF-8 explicitly rather than with the platform default.
with open(path_tree_file, 'r', encoding='utf-8') as f:
    tree = Tree.fromDict(json.load(f))
node_count = len(tree)
print('node count:', node_count)

node count: 3677


In [13]:
# Instantiate the extractor that is handed to node.data.extractKeys in the next cell.
extractor = Extractor()

  from .autonotebook import tqdm as notebook_tqdm
                                             

In [14]:
# Run key extraction on every node, refreshing the progress line after each one.
start = datetime.datetime.now()
for done, node in enumerate(tree, 1):
    node.data.extractKeys(extractor)
    progress(start, done, node_count)

0:02:26.557324  0:00:00             3677/3677  100.000%

In [15]:
# Save the tree, now carrying the extracted keys, as indented JSON.
# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open(path_tree_file_with_keys, 'w', encoding='utf-8') as f:
    json.dump(tree.__dict__(), f, indent=2, ensure_ascii=False)

# 各节点继承关键短语并保存

In [16]:
from pkg.tree import Tree

In [17]:
# Rebuild the Tree from the JSON file at path_tree_file_with_keys and report its size.
# The file is decoded as UTF-8 explicitly rather than with the platform default.
with open(path_tree_file_with_keys, 'r', encoding='utf-8') as f:
    tree = Tree.fromDict(json.load(f))
node_count = len(tree)
print('node count:', node_count)

node count: 3677


In [18]:
# Let every node inherit keys from its parent's data.
# NOTE(review): assumes every node yielded by the tree (root included) has a
# usable `.parent` — confirm in pkg.tree.
for node in tree:
    node.data.inheritKeys(node.parent.data)

In [19]:
# Save the tree, now carrying the inherited keys, as indented JSON.
# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open(path_tree_file_with_full_keys, 'w', encoding='utf-8') as f:
    json.dump(tree.__dict__(), f, indent=2, ensure_ascii=False)

# 索引各节点

In [24]:
from pkg.fts import FTS
from pkg.tree import Tree

In [25]:
# Rebuild the Tree from the JSON file at path_tree_file_with_full_keys and report its size.
# The file is decoded as UTF-8 explicitly rather than with the platform default.
with open(path_tree_file_with_full_keys, 'r', encoding='utf-8') as f:
    tree = Tree.fromDict(json.load(f))
node_count = len(tree)
print('node count:', node_count)

node count: 3677


In [26]:
# Create the FTS object (presumably "full-text search" — confirm in pkg.fts),
# pointing it at the index directory configured for this root.
fts = FTS(
    index_dir=path_index
)
# NOTE(review): initIndex presumably creates a fresh index, and the commented-out
# openIndex call looks like the alternative for reusing an existing one — confirm.
fts.initIndex()
# fts.openIndex()

<pkg.fts.FTS at 0x7f87fc1e2160>

In [27]:
# Add one document per node to the index, then commit once after the loop.
for node in tree:
    fts.add_document(
        id=node.id,
        # the node's keys_with_inherit entries are joined into one comma-separated string
        keys=','.join(node.data.keys_with_inherit),
        content=node.data.content,
    )
fts.commit()

<pkg.fts.FTS at 0x7f87fc1e2160>