In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import os
import json
import spacy

# 加载spaCy的英语NER模型
nlp = spacy.load("en_core_web_sm")

# 假设JSON文件存放目录
json_dir = "/content/drive/MyDrive/6698video/KG/search_space_train"

# 输出三元组文件
output_triple_file = "/content/drive/MyDrive/6698video/KG/triples.tsv"

# 简单关系抽取占位函数（示例）
# 实际中应使用一个已训练好的RE模型对 (实体1, 实体2) 返回关系
def dummy_relation_extractor(e1, e2, sentence):
    # 简单规则示例：如果句子中包含"comments about"且e2紧接，是"comments_about"关系
    # 这只是一个示例，需要根据实际情况定制
    text = sentence.lower()
    if "comments about" in text:
        return "comments_about"
    elif "executioner" in text or "execute" in text:
        return "wants_to_execute"
    elif "damage to him" in text or "would do more damage" in text:
        return "threatens"
    # 没有匹配到的关系返回None
    return None

triples = []

# 遍历JSON文件夹
for filename in os.listdir(json_dir):
    print("处理文件:"+filename)
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(json_dir, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # 假定"q"和"r"下都有若干数据段
    all_paragraphs = []
    if "q" in data:
        for item in data["q"]:
            if "document" in item:
                all_paragraphs.append(item["document"])
    if "r" in data:
        for item in data["r"]:
            if "document" in item:
                all_paragraphs.append(item["document"])

    # 对每个段落进行NER和关系抽取
    for paragraph in all_paragraphs:
        doc = nlp(paragraph)
        # 提取实体
        entities = [(ent.text, ent.label_, ent.start_char, ent.end_char) for ent in doc.ents]

        # 简化处理：尝试对同一句子中的实体对进行关系抽取
        # 实际中应该基于句子级处理，这里以doc.sents为句子边界
        for sent in doc.sents:
            sent_text = sent.text.strip()
            sent_ents = [ent for ent in entities if ent[2]>=sent.start_char and ent[3]<=sent.end_char]

            # 两两实体组合，尝试抽取关系
            for i in range(len(sent_ents)):
                for j in range(i+1, len(sent_ents)):
                    e1, e1_label, _, _ = sent_ents[i]
                    e2, e2_label, _, _ = sent_ents[j]

                    # 尝试关系抽取
                    relation = dummy_relation_extractor(e1, e2, sent_text)
                    if relation:
                        triples.append((e1.replace(" ", "_"), relation, e2.replace(" ", "_")))

                    # 尝试方向反转（有时关系方向不同）
                    relation = dummy_relation_extractor(e2, e1, sent_text)
                    if relation:
                        triples.append((e2.replace(" ", "_"), relation, e1.replace(" ", "_")))

# 去重
triples = list(set(triples))

# 将三元组写入文件
with open(output_triple_file, "w", encoding="utf-8") as fw:
    for h, r, t in triples:
        fw.write(f"{h}\t{r}\t{t}\n")

print("三元组提取完成，已写入：", output_triple_file)


处理文件:0000000.json
处理文件:0000001.json
处理文件:0000002.json
处理文件:0000003.json
处理文件:0000004.json
处理文件:0000005.json
处理文件:0000006.json
处理文件:0000007.json
处理文件:0000008.json
处理文件:0000009.json
处理文件:0000010.json
处理文件:0000011.json
处理文件:0000012.json
处理文件:0000013.json
处理文件:0000014.json
处理文件:0000015.json
处理文件:0000016.json
处理文件:0000017.json
处理文件:0000018.json
处理文件:0000019.json
处理文件:0000020.json
处理文件:0000021.json
处理文件:0000022.json
处理文件:0000023.json
处理文件:0000024.json
处理文件:0000025.json
处理文件:0000026.json
处理文件:0000027.json
处理文件:0000028.json
处理文件:0000029.json
处理文件:0000030.json
处理文件:0000031.json
处理文件:0000032.json
处理文件:0000033.json
处理文件:0000034.json
处理文件:0000035.json
处理文件:0000036.json
处理文件:0000037.json
处理文件:0000038.json
处理文件:0000039.json
处理文件:0000040.json
处理文件:0000041.json
处理文件:0000042.json
处理文件:0000043.json
处理文件:0000044.json
处理文件:0000045.json
处理文件:0000046.json
处理文件:0000047.json
处理文件:0000048.json
处理文件:0000049.json
处理文件:0000050.json
处理文件:0000051.json
处理文件:0000052.json
处理文件:0000053.json
处理文件:0000054.json
处理文件:00000

In [8]:

!pip install --upgrade pip setuptools wheel

!pip install models


Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Downloading setuptools-75.6.0-py3-none-any.whl.metadata (6.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading setuptools-75.6.0-py3-none-any.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 75.1.0
    Uninstalling setuptools-75.1.0:
      Successfully uninstalled setuptools-75.1.0
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behavio

Collecting models
  Using cached models-0.9.3.tar.gz (16 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [3]:
import random

# 输入和输出文件路径（可根据需要修改）
triple_file = "/content/drive/MyDrive/6698video/KG/triples.tsv"
entity2id_file = "/content/drive/MyDrive/6698video/KG/entity2id.txt"
relation2id_file = "/content/drive/MyDrive/6698video/KG/relation2id.txt"
train2id_file = "/content/drive/MyDrive/6698video/KG/train2id.txt"
valid2id_file = "/content/drive/MyDrive/6698video/KG/valid2id.txt"
test2id_file = "/content/drive/MyDrive/6698video/KG/test2id.txt"

# 读取三元组
triples = []
with open(triple_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        h, r, t = line.split('\t')
        triples.append((h, r, t))

# 抽取实体和关系集合
entities = set()
relations = set()
for h, r, t in triples:
    entities.add(h)
    entities.add(t)
    relations.add(r)

entities = list(entities)
relations = list(relations)

# 给实体和关系分配ID
entity2id = {e: idx for idx, e in enumerate(entities)}
relation2id = {r: idx for idx, r in enumerate(relations)}

# 将三元组映射为ID格式
triples_id = [(entity2id[h], entity2id[t], relation2id[r]) for h, r, t in triples]

# 划分数据集，假设80%训练，10%验证，10%测试
random.shuffle(triples_id)
total = len(triples_id)
train_count = int(total * 0.8)
valid_count = int(total * 0.1)
test_count = total - train_count - valid_count

train_data = triples_id[:train_count]
valid_data = triples_id[train_count:train_count+valid_count]
test_data = triples_id[train_count+valid_count:]

# 写出entity2id.txt
with open(entity2id_file, "w", encoding="utf-8") as f:
    f.write(str(len(entities)) + "\n")
    for e, idx in entity2id.items():
        f.write(f"{e}\t{idx}\n")

# 写出relation2id.txt
with open(relation2id_file, "w", encoding="utf-8") as f:
    f.write(str(len(relations)) + "\n")
    for r, idx in relation2id.items():
        f.write(f"{r}\t{idx}\n")

# 写出train2id.txt
with open(train2id_file, "w", encoding="utf-8") as f:
    f.write(str(len(train_data)) + "\n")
    for h_id, t_id, r_id in train_data:
        f.write(f"{h_id}\t{t_id}\t{r_id}\n")

# 写出valid2id.txt
with open(valid2id_file, "w", encoding="utf-8") as f:
    f.write(str(len(valid_data)) + "\n")
    for h_id, t_id, r_id in valid_data:
        f.write(f"{h_id}\t{t_id}\t{r_id}\n")

# 写出test2id.txt
with open(test2id_file, "w", encoding="utf-8") as f:
    f.write(str(len(test_data)) + "\n")
    for h_id, t_id, r_id in test_data:
        f.write(f"{h_id}\t{t_id}\t{r_id}\n")

print("转换完成！")
print("Entities:", len(entities))
print("Relations:", len(relations))
print("Triples total:", total)
print("Train:", len(train_data), "Valid:", len(valid_data), "Test:", len(test_data))


转换完成！
Entities: 122
Relations: 2
Triples total: 549
Train: 439 Valid: 54 Test: 56


In [4]:
!pip install pykeen


Collecting pykeen
  Downloading pykeen-1.11.0-py3-none-any.whl.metadata (85 kB)
Collecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting click-default-group (from pykeen)
  Downloading click_default_group-1.2.4-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting more-click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl.metadata (4.3 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.6.1-py3-none-any.whl.metadata (17 kB)
Collecting docdata (from pykeen)
  Downloading docdata-0.0.4-py3-none-any.whl.metadata (13 kB)
Collecting class-resolver>=0.5.1 (from pykeen)
  Downloading class_resolver-0.5.4-py3-none-any.whl.metadata (14 kB)
Collecting torch-max-mem>=0.1.1 (from pykeen)
  Downloading torch_max_mem-0.1.3-py3-none-any.whl.metadata (7.4 kB)
Collecting torch-ppr>=0.0.7 (from pykeen)
  Downloading

In [5]:
import os

# 输入文件路径（根据需要修改）
entity2id_path = "/content/drive/MyDrive/6698video/KG/data/entity2id.txt"
relation2id_path = "/content/drive/MyDrive/6698video/KG/data/relation2id.txt"
train2id_path = "/content/drive/MyDrive/6698video/KG/data/train2id.txt"
valid2id_path = "/content/drive/MyDrive/6698video/KG/data/valid2id.txt"
test2id_path = "/content/drive/MyDrive/6698video/KG/data/test2id.txt"

# 输出文件路径
train_out_path = "/content/drive/MyDrive/6698video/KG/data/train.txt"
valid_out_path = "/content/drive/MyDrive/6698video/KG/data/valid.txt"
test_out_path = "/content/drive/MyDrive/6698video/KG/data/test.txt"

# 读取实体和关系映射
entity2id = {}
relation2id = {}

with open(entity2id_path, 'r', encoding='utf-8') as f:
    lines = f.read().strip().split('\n')
    num_entities = int(lines[0])
    for line in lines[1:]:
        ent, eid = line.split('\t')
        entity2id[int(eid)] = ent

with open(relation2id_path, 'r', encoding='utf-8') as f:
    lines = f.read().strip().split('\n')
    num_relations = int(lines[0])
    for line in lines[1:]:
        rel, rid = line.split('\t')
        relation2id[int(rid)] = rel

def convert_file(openke_path, out_path):
    with open(openke_path, 'r', encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
        count = int(lines[0])
        triple_lines = lines[1:]

    with open(out_path, 'w', encoding='utf-8') as fw:
        for line in triple_lines:
            h_id, t_id, r_id = line.split()
            h_id = int(h_id)
            t_id = int(t_id)
            r_id = int(r_id)
            head = entity2id[h_id]
            tail = entity2id[t_id]
            rel = relation2id[r_id]
            fw.write(f"{head}\t{rel}\t{tail}\n")

# 转换训练集、验证集和测试集
convert_file(train2id_path, train_out_path)
convert_file(valid2id_path, valid_out_path)
convert_file(test2id_path, test_out_path)

print("转换完成！请查看 train.txt, valid.txt, test.txt 文件。")


转换完成！请查看 train.txt, valid.txt, test.txt 文件。


In [8]:
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory

# 从刚刚生成的文件加载数据
train = TriplesFactory.from_path('/content/drive/MyDrive/6698video/KG/data/train.txt')
valid = TriplesFactory.from_path('/content/drive/MyDrive/6698video/KG/data/valid.txt', entity_to_id=train.entity_to_id, relation_to_id=train.relation_to_id)
test = TriplesFactory.from_path('/content/drive/MyDrive/6698video/KG/data/test.txt', entity_to_id=train.entity_to_id, relation_to_id=train.relation_to_id)

# 使用TransE训练
result = pipeline(
    training=train,
    validation=valid,
    testing=test,
    model='TransR',
    training_kwargs=dict(num_epochs=100),
)

print(result.metric_results)

INFO:pykeen.pipeline.api:Using device: None


Training epochs on cpu:   0%|          | 0/100 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/2 [00:00<?, ?batch/s]



Evaluating on cpu:   0%|          | 0.00/56.0 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.18s seconds


<pykeen.evaluation.rank_based_evaluator.RankBasedMetricResults object at 0x7a2e90679ea0>
