In [2]:
import gc
import random
import pickle
import multiprocessing
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [8]:
def word_bag_processer(tuple_dict): # 只挑选所有元素仅包含唯一单词的三元组
    
    newer_dict = defaultdict(list)
    total_word_set = set()
    for key, values in tuple_dict.items():
        for word_lists, full_sent in values:
            # 过滤词组
            if (len(word_lists[0]) > 1) or (len(word_lists[1]) > 1) or (len(word_lists[2]) > 1):
                continue
            # 重新整理元组，同时搜集所有出现词
            eliments_without_inner_list = [w_l[0].lower() for w_l in word_lists]
            newer_dict[key].append((eliments_without_inner_list, full_sent))
            get_word_set(eliments_without_inner_list, total_word_set)
            
    return newer_dict, list(total_word_set)
            
def get_word_set(tuples, container_set):
    
    for eliment in tuples:
        container_set.add(eliment)
    
    return True

def encoding_process(tuple_dict, encoder): # 将三元组转换为模型可接受的编码
        
    encoded_tp = []
    for species in tqdm(tuple_dict.values()):
        for word_lists, _ in species:
            encoded_tp.append(encoder.transform(word_lists))
        
    return encoded_tp

In [4]:
class TupleDataset(Dataset):
    # 构建输入神经网路的数据集
    def __init__(self, input_data):
        self.first_eliment = input_data[0]
        self.second_eliment = input_data[1]
        self.third_eliment = input_data[2]
    
    def __len__(self):
        return len(self.first_eliment)
    
    def __getitem__(self, idx):
        return (self.first_eliment[idx], self.second_eliment[idx], self.third_eliment[idx])

class GraphNet(nn.Module): # 图嵌入网络
    
    def __init__(self, n_words, embedding_dim=150):
        # 初始化
        super(GraphNet, self).__init__()
        self.embedding_layer1 = nn.Embedding(n_words+1, embedding_dim=embedding_dim)
        self.embedding_layer2 = nn.Embedding(n_words+1, embedding_dim=embedding_dim)
        self.embedding_layer3 = nn.Embedding(n_words+1, embedding_dim=embedding_dim)
        
    def forward(self, x, y, z):
        # 前向传播
        return (self.embedding_layer1(x), self.embedding_layer2(y), self.embedding_layer3(z))

In [5]:
with open("../datas/tuple/wiki_tri_tuple.pkl", 'rb') as bf:
    tri_tp = pickle.load(bf)

In [13]:
# 建立单词索引
new_dict, total_word_list = word_bag_processer(tri_tp)
word_encoder = LabelEncoder()
word_encoder.fit(total_word_list)
            
# 编码
input_data = encoding_process(new_dict, word_encoder)
input_data = np.expand_dims(np.array(input_data), axis=2)

100%|██████████| 14000/14000 [56:58<00:00,  4.10it/s]  


In [15]:
with open('../datas/encoded/word_encoder.pkl', 'wb') as bfile:
    pickle.dump(word_encoder, bfile, protocol=4)
np.save('../datas/encoded/encoded_word.npy', input_data)

In [14]:
# 数据输入准备
tuple_dataset = TupleDataset(input_data)
tuple_loader = DataLoader(tuple_dataset, batch_size=128, shuffle=True, num_workers=multiprocessing.cpu_count())

net = GraphNet(len(word_encoder.classes_)).to('cuda')
optimizer = torch.optim.Adam(net.parameters(), lr=3e-3)
mse_loss = nn.MSELoss().to("cuda")

# 训练（使用GPU）
EPOCHS = 30000
eta =  3e-5# 早停步长
mean_mse = 0
for epk in range(EPOCHS):
    last_mse = mean_mse
    mean_mse = 0
    for pos, (x, y, z) in enumerate(tuple_loader):
        x = x.long().to('cuda')
        y = y.long().to('cuda')
        z = z.long().to('cuda')
        
        optimizer.zero_grad() #优化器梯度清零
        x, y, z = net(x, y, z) #前向传播
        loss = mse_loss(x+y, z) #损失函数
        loss.backward() #反向传播
        optimizer.step() #更新参数
        
        mean_mse += loss
    mean_mse /= (pos + 1)
    print("Loss on epoch %d: %.5f" % (epk, mean_mse))
    
    torch.cuda.empty_cache()
    
    if (last_mse - mean_mse < eta) & (epk > 20):
        break

Loss on epoch 0: 3.06641
Loss on epoch 1: 3.04126
Loss on epoch 2: 3.01627
Loss on epoch 3: 2.99145
Loss on epoch 4: 2.96680
Loss on epoch 5: 2.94232
Loss on epoch 6: 2.91802
Loss on epoch 7: 2.89389
Loss on epoch 8: 2.86993
Loss on epoch 9: 2.84616
Loss on epoch 10: 2.82256
Loss on epoch 11: 2.79915
Loss on epoch 12: 2.77592
Loss on epoch 13: 2.75286
Loss on epoch 14: 2.72999
Loss on epoch 15: 2.70731
Loss on epoch 16: 2.68480
Loss on epoch 17: 2.66248
Loss on epoch 18: 2.64034
Loss on epoch 19: 2.61838
Loss on epoch 20: 2.59661
Loss on epoch 21: 2.57502
Loss on epoch 22: 2.55361
Loss on epoch 23: 2.53238
Loss on epoch 24: 2.51133
Loss on epoch 25: 2.49047
Loss on epoch 26: 2.46978
Loss on epoch 27: 2.44927
Loss on epoch 28: 2.42894
Loss on epoch 29: 2.40879
Loss on epoch 30: 2.38882
Loss on epoch 31: 2.36902
Loss on epoch 32: 2.34939
Loss on epoch 33: 2.32994
Loss on epoch 34: 2.31066
Loss on epoch 35: 2.29155
Loss on epoch 36: 2.27261
Loss on epoch 37: 2.25384
Loss on epoch 38: 2.23

Loss on epoch 308: 0.27643
Loss on epoch 309: 0.27437
Loss on epoch 310: 0.27232
Loss on epoch 311: 0.27030
Loss on epoch 312: 0.26828
Loss on epoch 313: 0.26628
Loss on epoch 314: 0.26430
Loss on epoch 315: 0.26233
Loss on epoch 316: 0.26038
Loss on epoch 317: 0.25844
Loss on epoch 318: 0.25651
Loss on epoch 319: 0.25460
Loss on epoch 320: 0.25271
Loss on epoch 321: 0.25083
Loss on epoch 322: 0.24896
Loss on epoch 323: 0.24710
Loss on epoch 324: 0.24526
Loss on epoch 325: 0.24344
Loss on epoch 326: 0.24162
Loss on epoch 327: 0.23982
Loss on epoch 328: 0.23804
Loss on epoch 329: 0.23626
Loss on epoch 330: 0.23451
Loss on epoch 331: 0.23276
Loss on epoch 332: 0.23102
Loss on epoch 333: 0.22930
Loss on epoch 334: 0.22760
Loss on epoch 335: 0.22590
Loss on epoch 336: 0.22422
Loss on epoch 337: 0.22255
Loss on epoch 338: 0.22089
Loss on epoch 339: 0.21925
Loss on epoch 340: 0.21761
Loss on epoch 341: 0.21599
Loss on epoch 342: 0.21438
Loss on epoch 343: 0.21278
Loss on epoch 344: 0.21120
L

Loss on epoch 612: 0.02739
Loss on epoch 613: 0.02717
Loss on epoch 614: 0.02696
Loss on epoch 615: 0.02675
Loss on epoch 616: 0.02654
Loss on epoch 617: 0.02633
Loss on epoch 618: 0.02613
Loss on epoch 619: 0.02592
Loss on epoch 620: 0.02572
Loss on epoch 621: 0.02552
Loss on epoch 622: 0.02532
Loss on epoch 623: 0.02512
Loss on epoch 624: 0.02492
Loss on epoch 625: 0.02473
Loss on epoch 626: 0.02453
Loss on epoch 627: 0.02434
Loss on epoch 628: 0.02415
Loss on epoch 629: 0.02396
Loss on epoch 630: 0.02377
Loss on epoch 631: 0.02358
Loss on epoch 632: 0.02340
Loss on epoch 633: 0.02321
Loss on epoch 634: 0.02303
Loss on epoch 635: 0.02285
Loss on epoch 636: 0.02267
Loss on epoch 637: 0.02249
Loss on epoch 638: 0.02232
Loss on epoch 639: 0.02214
Loss on epoch 640: 0.02197
Loss on epoch 641: 0.02179
Loss on epoch 642: 0.02162
Loss on epoch 643: 0.02145
Loss on epoch 644: 0.02128
Loss on epoch 645: 0.02111
Loss on epoch 646: 0.02095
Loss on epoch 647: 0.02078
Loss on epoch 648: 0.02062
L

In [16]:
torch.save(net.state_dict(), "../models/graphembedding")

In [20]:
# 读取模型
model = torch.load("../models/graphembedding")
net = GraphNet(len(word_encoder.classes_))
net.load_state_dict(model)
net.eval()

GraphNet(
  (embedding_layer1): Embedding(36038, 150)
  (embedding_layer2): Embedding(36038, 150)
  (embedding_layer3): Embedding(36038, 150)
)

In [35]:
tuple_raw_vector_dict = defaultdict(list)
label_ptr = 0
input_data = torch.from_numpy(input_data).long().to('cuda')

# 三元组向量化
net = net.to('cuda')
with torch.no_grad():
    x, y, z = net(input_data[:, 0], input_data[:, 1], input_data[:, 2])

x = np.squeeze(x.to('cpu').numpy(), axis=1)
y = np.squeeze(y.to('cpu').numpy(), axis=1)
z = np.squeeze(z.to('cpu').numpy(), axis=1)

In [59]:
# 记录完整图谱
tuple_yielder = zip(x, y, z)
tuple_raw_vector_dict = defaultdict(list)

for key, value in new_dict.items():
    for t, s in value:
        tuple_raw_vector_dict[key].append([t, s, next(iter(tuple_yielder))])

In [66]:
with open("../datas/knowledge/graph.pkl", 'wb') as g:
    pickle.dump(tuple_raw_vector_dict, g, protocol=4)