In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [2]:
def get_entities(sent):
    """Extract a rough (subject, object) entity pair from one sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    scanned token by token:

    * a ``compound`` token is remembered as ``prefix`` (joined with the
      previous token when that one was a compound as well);
    * a token whose dependency label ends in ``mod`` is remembered as
      ``modifier`` (joined with the previous token when that one was a
      compound);
    * a token whose label contains ``subj`` becomes the first entity, one
      whose label contains ``obj`` becomes the second; in both cases the
      pending modifier and prefix are prepended.

    When several tokens qualify, the last one wins.

    Args:
        sent: Sentence text handed to ``nlp``.

    Returns:
        ``[subject_text, object_text]``; either string is empty when no
        matching token was found.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
    prv_tok_text = ""   # text of the previous non-punctuation token

    prefix = ""
    modifier = ""

    for tok in nlp(sent):
        # Punctuation never contributes to an entity.
        if tok.dep_ == "punct":
            continue

        # Compound words: keep at most the two most recent parts.
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # Modifiers (amod, nummod, ...).
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # Substring test instead of `find(...) == True`: the old form was only
        # true when the substring started at index 1 (e.g. "nsubj"), so a bare
        # "obj" label was silently ignored.
        if "subj" in tok.dep_:
            # Join only the non-empty parts so no doubled spaces appear.
            ent1 = " ".join(part for part in (modifier, prefix, tok.text) if part)
            prefix = ""
            modifier = ""

        if "obj" in tok.dep_:
            ent2 = " ".join(part for part in (modifier, prefix, tok.text) if part)

        # Remember this token for the compound checks on the next one.
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

In [3]:
# Smoke test: run get_entities on a sample headline (the recorded result follows).
get_entities("The War Between Apple and Google Has Just Begun")

['War', 'Apple']

In [4]:
def get_relation(sent):
    """Return the relation phrase anchored on the sentence's ROOT token.

    A spaCy ``Matcher`` looks for the ROOT token optionally followed by a
    preposition, an agent and an adjective. The text of the last match
    returned by the matcher is used.

    Args:
        sent: Sentence text handed to the module-level ``nlp`` pipeline.

    Returns:
        The matched span's text, or ``""`` when the matcher finds nothing
        (for example for an empty sentence).
    """
    doc = nlp(sent)

    matcher = Matcher(nlp.vocab)

    # ROOT, then optional prep / agent / adjective tokens.
    pattern = [{'DEP': 'ROOT'},
               {'DEP': 'prep', 'OP': "?"},
               {'DEP': 'agent', 'OP': "?"},
               {'POS': 'ADJ', 'OP': "?"}]

    # List-of-patterns form: accepted by spaCy >= 2.2.2 and required by v3,
    # where the old `add(key, None, pattern)` call raises.
    matcher.add("matching_1", [pattern])

    matches = matcher(doc)
    if not matches:
        # Previously this fell through to matches[-1] and raised IndexError.
        return ""

    _, start, end = matches[-1]
    return doc[start:end].text

In [10]:
def givemetuple(sent):
    """Print the entity pair and then the relation extracted from ``sent``."""
    found_entities = get_entities(sent)
    print('entity is : ', found_entities, sep='')
    found_relation = get_relation(sent)
    print('relation is : ', found_relation, sep='')
    

In [17]:
# Demo: print the entity pair and the relation for one sample headline.
givemetuple('Apple Sold 300000 iPads on Day One')

entity is : ['Apple', '300000  Day']
relation is : Sold


In [3]:
from stanfordcorenlp import StanfordCoreNLP

In [6]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [1]:
#下載檔案 Stanford Named Entity Recognizer version 4.2.0

import requests as req
from tqdm import tqdm
url = 'https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip'
def download(url, chunk_size=1024):
    """Stream ``url`` to a file in the current directory and return its name.

    Args:
        url: Address to fetch; the text after the last ``/`` is used as the
            local file name.
        chunk_size: Number of bytes requested per chunk (default 1024, the
            value that was previously hard-coded).

    Returns:
        The local file name.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check runs before the file is opened, so an error page is never
            saved under the archive's name.
    """
    filename = url.split('/')[-1]
    # Context manager releases the connection even if writing fails.
    with req.get(url, stream=True) as r:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            # The progress bar counts chunks, not bytes.
            for data in tqdm(r.iter_content(chunk_size)):
                f.write(data)
    return filename
download(url)

176209it [01:12, 2438.69it/s]


'stanford-ner-4.2.0.zip'

In [2]:
# 解壓縮zip檔

import os
import zipfile


# zipfile example
def zip_list(file_path, dest_dir=None):
    """Extract every member of the zip archive at ``file_path``.

    Args:
        file_path: Path of the archive to unpack.
        dest_dir: Directory to extract into; ``None`` (the default) extracts
            into the current working directory, as before.
    """
    # `with` closes the archive handle, which was previously left open.
    with zipfile.ZipFile(file_path, 'r') as zf:
        zf.extractall(dest_dir)



file_path = 'stanford-ner-4.2.0.zip'
zip_list(file_path)


In [6]:
import nltk
from nltk.tag.stanford import StanfordNERTagger

# Sample headline to tag.
sentence = u"Apple's rivals hope its iWatch makes wearable work"
# Locations of the Stanford NER jar and the 3-class English model.
# NOTE(review): both paths point inside ./stanford-ner-tagger/, which must
# already exist on disk; confirm it matches the directory produced when the
# downloaded archive is unpacked.
jar = './stanford-ner-tagger/stanford-ner.jar'
model = './stanford-ner-tagger/classifiers/english.all.3class.distsim.crf.ser.gz'

# Prepare NER tagger with english model
ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

# Tokenize: Split sentence into words
words = nltk.word_tokenize(sentence)

# Run NER tagger on words
print(ner_tagger.tag(words))

[('Apple', 'ORGANIZATION'), ("'s", 'O'), ('rivals', 'O'), ('hope', 'O'), ('its', 'O'), ('iWatch', 'O'), ('makes', 'O'), ('wearable', 'O'), ('work', 'O')]


In [22]:
# (Example) Extract OpenIE triples from a single sentence.

from openie import StanfordOpenIE

with StanfordOpenIE() as client:

    text = "Apple Sold 300000 iPads on Day One."
    # `text` already ends with a period; the old format string added a second one.
    print('Text: %s' % text)
    for triple in client.annotate(text):
        # Each triple is indexed by 'subject', 'relation' and 'object'.
        print('subject is : ' + triple['subject'])
        print('relation is : ' + triple['relation'])
        print('object is : ' + triple['object'])

        print(triple)

        print('-' * 50)

Text: Apple Sold 300000 iPads on Day One..
Starting server with command: java -Xmx8G -cp /home/zihjie/stanfordnlp_resources/stanford-corenlp-full-2018-10-05/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 60000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-c9e5818ec1604062.props -preload openie


In [None]:
# (Example) Extract OpenIE triples for every line of a text file.

from openie import StanfordOpenIE

with StanfordOpenIE()as client:    
    # Read the whole file up front; each line is processed as one record.
    with open('corpus_test.txt', 'r', encoding='utf8') as r:
        corpuss = r.readlines()
    
    for corpus in corpuss:
        print('corpus is : ' + corpus)
        
        # Everything from index 11 on is sent to OpenIE; the first characters
        # are printed below as the date.
        # NOTE(review): presumably each line is "<10-char date><separator><text>";
        # confirm against the file, since [0:11] would then include the separator.
        triples_corpus = client.annotate(corpus[11:])     
        print('Corpus: %s [...].' % corpus[11:])     
        print('Found %s triples in the corpus.' % len(triples_corpus))  
        print('corpus date is ' + corpus[0:11])
        for triple in triples_corpus:
            print('|-', triple)
        
        print('-'*50)
        

In [None]:
# (Workday run) Extract OpenIE triples for every line of News_workday.txt
# and append them to News_tuple.txt.

from openie import StanfordOpenIE

with StanfordOpenIE() as client:
    with open('News_workday.txt', 'r', encoding='utf8') as r:
        corpuss = r.readlines()

    # `with` closes the output file even if annotation fails midway; the
    # explicit encoding matches the utf8 input instead of the platform default.
    with open('News_tuple.txt', 'a', encoding='utf8') as f:
        for corpus in corpuss:
            # First 10 characters are used as the date, the text starts at index 11.
            date = corpus[:10]
            triples_corpus = client.annotate(corpus[11:])
            print('Corpus: %s.' % corpus[11:])
            print('Found %s triples in the corpus.' % len(triples_corpus))
            for triple in triples_corpus:
                # One output row per triple: date, subject, relation, object,
                # separated by four spaces.
                f.write(date + "    " + triple['subject'] + "    " + triple['relation'] + "    " + triple['object'] + "\n")

            print('finish day is : ' + corpus[:10])

            print('-' * 50)
        

In [24]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/zihjie/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [55]:
#詞性還原的範例
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# 獲取單詞的詞性
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; any
    other tag yields the empty string.
    """
    attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attr_name = attr_by_initial.get(tag[:1])
    if attr_name is None:
        return ''
    # Look the constant up only when it is needed.
    return getattr(wordnet, attr_name)

# Lemmatize one sample sentence and print every intermediate step.
sentence = ' Apple buys Quattro'
tokens = word_tokenize(sentence)  # split the sentence into tokens
tagged_sent = pos_tag(tokens)     # tag the tokens with their part of speech

wnl = WordNetLemmatizer()
lemmas_sent = []
for tag in tagged_sent:
    # tag[0] and tag[1] are printed as the two fields of each tagged item.
    print('tag[0] is : '+tag[0]+'    tag[1] is : '+tag[1] )
    # Fall back to NOUN when get_wordnet_pos returns the empty string.
    wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
    print('wordnet_pos is : '+wordnet_pos)
    lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos)) # lemmatize with that POS

print(lemmas_sent)

tag[0] is : Apple    tag[1] is : NNP
wordnet_pos is : n
tag[0] is : buys    tag[1] is : VBZ
wordnet_pos is : v
tag[0] is : Quattro    tag[1] is : NNP
wordnet_pos is : n
['Apple', 'buy', 'Quattro']


In [None]:
# (Improved example) Lemmatize every line of a file, then extract OpenIE triples.
# Relies on word_tokenize, pos_tag, WordNetLemmatizer, wordnet and
# get_wordnet_pos being defined by an earlier cell.

from openie import StanfordOpenIE

with StanfordOpenIE() as client:
    with open('corpus_test.txt', 'r', encoding='utf8') as r:
        corpuss = r.readlines()

    # One lemmatizer serves the whole file; it was rebuilt for every line before.
    wnl = WordNetLemmatizer()

    for corpus in corpuss:
        print('corpus is : ' + corpus)

        # Text starts at index 11. capitalize() upper-cases the first
        # character and lower-cases the rest.
        sentence = str(corpus[11:]).capitalize()

        tokens = word_tokenize(sentence)  # split into tokens
        tagged_sent = pos_tag(tokens)     # tag each token with its part of speech

        lemmas_sent = []
        for tag in tagged_sent:
            # Fall back to NOUN when the tag has no WordNet equivalent.
            wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
            lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))

        # Re-join the lemmas into one sentence for OpenIE.
        sen_corpus = ' '.join(lemmas_sent)

        print('after sentence is : ' + sen_corpus)

        triples_corpus = client.annotate(sen_corpus)
        print('Corpus: %s [...].' % sen_corpus)
        # Count the triples; the old code printed the sentence's character count.
        print('Found %s triples in the corpus.' % len(triples_corpus))
        print('corpus date is ' + corpus[0:11])

        for triple in triples_corpus:
            print('triple is : ' + triple['subject'] + "    " + triple['relation'] + "    " + triple['object'] + "\n")
        print('-' * 50)
        

## 正式開始做詞性轉換並獲得三元組

In [None]:
# (Production run) Lemmatize every line of News_workday.txt, extract OpenIE
# triples and append them to News_tuple_ex.txt.
# Relies on word_tokenize, pos_tag, WordNetLemmatizer, wordnet and
# get_wordnet_pos being defined by an earlier cell.

from openie import StanfordOpenIE

with StanfordOpenIE() as client:
    with open('News_workday.txt', 'r', encoding='utf8') as r:
        corpuss = r.readlines()

    # One lemmatizer serves the whole file; it was rebuilt for every line before.
    wnl = WordNetLemmatizer()

    # `with` closes the output file even if annotation fails midway; the
    # explicit encoding matches the utf8 input instead of the platform default.
    with open('News_tuple_ex.txt', 'a', encoding='utf8') as f:
        for corpus in corpuss:
            print('originally sentence is : ' + corpus)

            # Text starts at index 11. capitalize() upper-cases the first
            # character and lower-cases the rest.
            sentence = str(corpus[11:]).capitalize()

            tokens = word_tokenize(sentence)  # split into tokens
            tagged_sent = pos_tag(tokens)     # tag each token with its part of speech

            lemmas_sent = []
            for tag in tagged_sent:
                # Fall back to NOUN when the tag has no WordNet equivalent.
                wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
                lemmas_sent.append(wnl.lemmatize(tag[0], pos=wordnet_pos))

            # Re-join the lemmas into one sentence for OpenIE.
            sen_corpus = ' '.join(lemmas_sent)

            print('after sentence is : ' + sen_corpus)

            # Split the sentence into (subject, relation, object) triples.
            triples_corpus = client.annotate(sen_corpus)
            print('Corpus: %s [...].' % sen_corpus)
            # Count the triples; the old code printed the sentence's character count.
            print('Found %s triples in the corpus.' % len(triples_corpus))
            print('corpus date is ' + corpus[0:10])

            for triple in triples_corpus:
                print('triple is : ' + triple['subject'] + "    " + triple['relation'] + "    " + triple['object'] + "\n")
                f.write(corpus[:10] + "    " + triple['subject'] + "    " + triple['relation'] + "    " + triple['object'] + "\n")

            print('-' * 50)

## 將三元組分成等等要用的訓練檔案

In [4]:
# Split every line of Workday_tuple_ex.txt into head / relation / tail lists.
# The entity and relation vocabularies are written by the next cell.

head = []
relation = []
tail = []

entity = []
trainfile = []

# train.txt is still opened in append mode (which creates it when missing),
# although the write below is currently disabled. `with` closes both handles
# even when a line fails to parse.
with open('Workday_tuple_ex.txt', 'r') as f, open('train.txt', 'a') as t:
    for lines in f:
        # Drop the first 15 characters, lower-case the rest and split on "-----".
        # NOTE(review): presumably the prefix is a 10-char date plus one
        # "-----" separator; confirm against the input file.
        triple = lines[15:].replace('\n', '').lower().split("-----")

        # Skip malformed lines instead of raising IndexError, in line with
        # get_details_of_triplets_list.
        if len(triple) < 3:
            continue

        head.append(triple[0])
        relation.append(triple[1])
        tail.append(triple[2])

        entity.append(triple[0])
        entity.append(triple[2])

        # Disabled: train.txt rows would be written as head, tail, relation.
        #trainfile.append(triple[0]+"\t"+triple[2]+"\t"+triple[1])
        #t.write(triple[0] + "\t" + triple[2] + "\t" + triple[1] + "\n")

print("It's finish~")





It's finish~


In [5]:
# Write the distinct entities and relations, one per line, in first-seen order.
# Uses the `entity` and `relation` lists built by the previous cell.
# Both files are opened in append mode, so re-running this cell adds the
# same names again.

# dict.fromkeys keeps the first occurrence of each name and preserves order.
with open('entity_noorder.txt', 'a') as e:
    for subject in dict.fromkeys(entity):
        e.write(subject + "\n")

with open('relation_noorder.txt', 'a') as r:
    for rela in dict.fromkeys(relation):
        r.write(rela + "\n")

In [8]:
# Build the text file later used to train word2vec.
# Note: this cell resets `head`, `relation`, `tail`, `entity` and `trainfile`.

head = []
relation = []
tail = []

entity = []
trainfile = []

# Source triples in, word2vec corpus out (append mode); `with` closes both
# handles even when a line fails to parse.
with open('Workday_tuple_ex.txt', 'r') as f, open('word2vec_corpus.txt', 'a') as w:
    for lines in f:
        # Drop the first 15 characters, lower-case the rest and split on "-----".
        triple = lines[15:].replace('\n', '').lower().split("-----")

        # Skip malformed lines instead of raising IndexError.
        if len(triple) < 3:
            continue

        head.append(triple[0])
        relation.append(triple[1])
        tail.append(triple[2])

        # Every triple is written as ['head','relation','tail'], on one line.
        # Quotes inside the text are not escaped.
        w.write("['{}','{}','{}'],".format(triple[0], triple[1], triple[2]))

print("It's finish~")



It's finish~


## TransE的部分

In [1]:
from random import uniform, sample, choice
import numpy as np
from copy import deepcopy


def get_details_of_entityOrRels_list(file_path, split_delimeter="\t"):
    """Read the first column of a delimited entity or relation file.

    Args:
        file_path: Path of the text file to read.
        split_delimeter: Column separator (tab by default).

    Returns:
        ``(count, names)`` where ``names`` holds the first field of every
        non-blank line, in file order, and ``count`` is ``len(names)``.
    """
    lyst = []
    with open(file_path) as file:
        for line in file:
            stripped = line.strip()
            # Blank lines used to be loaded as an empty-string name.
            if not stripped:
                continue
            lyst.append(stripped.split(split_delimeter)[0])
    return len(lyst), lyst


def get_details_of_triplets_list(file_path, split_delimeter="\t"):
    """Load the delimited rows of a triple file as tuples.

    Each line is stripped and split on ``split_delimeter``; rows with fewer
    than three fields are dropped. Returns ``(count, rows)`` where every row
    is a tuple of all its fields.
    """
    with open(file_path) as handle:
        split_rows = (raw.strip().split(split_delimeter) for raw in handle)
        kept = [tuple(fields) for fields in split_rows if len(fields) >= 3]
    return len(kept), kept


def norm(lyst):
    """Return ``lyst`` scaled to unit L2 length as a new NumPy array.

    Args:
        lyst: A list or array of numbers. It is not modified; the old
            implementation overwrote its elements in place.

    Returns:
        A float ``ndarray`` (an array rather than a list so callers can do
        vector arithmetic on it). A zero vector is returned as-is instead of
        being divided by zero.
    """
    vector = np.array(lyst, dtype=float)  # always a fresh copy
    length = np.linalg.norm(vector)
    if length == 0:
        return vector
    return vector / length


def dist_L1(h, t, l):
    """Manhattan (L1) size of ``h + l - t``: the sum of absolute components."""
    residual = h + l - t
    return np.sum(np.fabs(residual))


def dist_L2(h, t, l):
    """Squared Euclidean size of ``h + l - t``.

    This is the sum of squared components with no square root taken.
    """
    residual = h + l - t
    return np.sum(residual * residual)


In [2]:
class TransE(object):
    """Minimal TransE trainer for knowledge-graph embeddings.

    Triples are indexed as ``(head, tail, relation)``: position 0 is the head
    entity, position 1 the tail entity and position 2 the relation.
    Embeddings live in ``entity_vector_dict`` / ``rels_vector_dict`` (name ->
    ``ndarray``) once :meth:`initialize` has run.
    """

    def __init__(self, entity_list, rels_list, triplets_list, margin=1, learing_rate=0.01, dim=20, normal_form="L1"):
        """Store the data and hyper-parameters; no vectors are created yet.

        Args:
            entity_list: Entity names.
            rels_list: Relation names.
            triplets_list: Training triples as ``(head, tail, relation)``.
            margin: Margin of the ranking loss.
            learing_rate: Step size (the misspelt keyword is kept so existing
                callers keep working).
            dim: Embedding dimension.
            normal_form: ``"L1"`` for the L1 distance; anything else selects
                the squared L2 distance.
        """
        self.learning_rate = learing_rate
        self.loss = 0
        self.entity_list = entity_list
        self.rels_list = rels_list
        self.triplets_list = triplets_list
        self.margin = margin
        self.dim = dim
        self.normal_form = normal_form
        self.entity_vector_dict = {}
        self.rels_vector_dict = {}
        self.loss_list = []  # loss snapshots recorded by transE()

    def initialize(self):
        """Give every entity and relation a random unit-length vector.

        Components are drawn uniformly from ``[-6/sqrt(dim), 6/sqrt(dim)]``
        and the resulting vector is normalised with ``norm``.
        """
        bound = 6 / (self.dim ** 0.5)

        def random_unit_vectors(names):
            vectors = {}
            for name in names:
                components = [uniform(-bound, bound) for _ in range(self.dim)]
                vectors[name] = norm(components)
            return vectors

        # Entities are drawn before relations.
        entity_vector_dict = random_unit_vectors(self.entity_list)
        print("The " + "entity_vector_dict" + "'s initialization is over. It's number is %d." % len(entity_vector_dict))
        rels_vector_dict = random_unit_vectors(self.rels_list)
        print("The " + "rels_vector_dict" + "'s initialization is over. It's number is %d." % len(rels_vector_dict))

        self.entity_vector_dict = entity_vector_dict
        self.rels_vector_dict = rels_vector_dict

    def transE(self, cycle_index=20, batch_size=150):
        """Run ``cycle_index`` training batches.

        Args:
            cycle_index: Number of batches to run.
            batch_size: Triples sampled per batch (default 150, the value
                that was previously hard-coded).

        Every 100 batches the accumulated loss is printed, appended to
        ``loss_list`` and reset to 0.
        """
        print("\n********** Start TransE training **********")
        for i in range(cycle_index):

            if i % 100 == 0:
                print("----------------The {} batchs----------------".format(i))
                print("The loss is: %.4f" % self.loss)
                # Keep the snapshot so convergence can be inspected later.
                self.loss_list.append(self.loss)
                self.loss = 0

            Sbatch = self.sample(batch_size)
            # Pairs of (training triple, corrupted triple); duplicates are dropped.
            Tbatch = []
            seen = set()
            for sbatch in Sbatch:
                pair = (sbatch, self.get_corrupted_triplets(sbatch))
                if pair not in seen:
                    seen.add(pair)
                    Tbatch.append(pair)
            self.update(Tbatch)

    def sample(self, size):
        """Draw up to ``size`` distinct training triples at random.

        The request is capped at the number of available triples, so a small
        training set no longer makes ``random.sample`` raise ``ValueError``.
        """
        return sample(self.triplets_list, min(size, len(self.triplets_list)))

    def get_corrupted_triplets(self, triplets):
        """Return ``triplets`` with either its head or its tail replaced.

        A coin flip picks the slot; the replacement is drawn uniformly from
        all known entities that differ from the entity being replaced.

        Args:
            triplets: One ``(head, tail, relation)`` triple.

        Returns:
            The corrupted ``(head, tail, relation)`` tuple.

        Raises:
            ValueError: If no other entity is available (the old rejection
                loop would spin forever in that case).
        """
        corrupt_head = choice([True, False])
        slot = 0 if corrupt_head else 1
        # Materialise the keys as a list: sampling straight from a dict view
        # is rejected by `random` on Python 3.11+.
        candidates = [name for name in self.entity_vector_dict if name != triplets[slot]]
        if not candidates:
            raise ValueError("no alternative entity available to corrupt %r" % (triplets,))
        searching_entity = choice(candidates)
        if corrupt_head:
            return (searching_entity, triplets[1], triplets[2])
        return (triplets[0], searching_entity, triplets[2])

    def update(self, Tbatch):
        """Apply one gradient step per (triple, corrupted triple) pair.

        For each pair the margin loss ``margin + d(pos) - d(neg)`` is
        evaluated; when it is positive the five vectors involved are moved
        and re-normalised.
        """
        entity_vector_copy = deepcopy(self.entity_vector_dict)
        rels_vector_copy = deepcopy(self.rels_vector_dict)
        distance = dist_L1 if self.normal_form == "L1" else dist_L2

        for positive, corrupted in Tbatch:
            head, tail, rel = positive[0], positive[1], positive[2]
            neg_head, neg_tail = corrupted[0], corrupted[1]

            # Vectors that receive the update.
            head_vec = entity_vector_copy[head]
            tail_vec = entity_vector_copy[tail]
            rel_vec = rels_vector_copy[rel]
            neg_head_vec = entity_vector_copy[neg_head]
            neg_tail_vec = entity_vector_copy[neg_tail]

            # Vectors the loss and gradient are computed from.
            head_ref = self.entity_vector_dict[head]
            tail_ref = self.entity_vector_dict[tail]
            rel_ref = self.rels_vector_dict[rel]
            neg_head_ref = self.entity_vector_dict[neg_head]
            neg_tail_ref = self.entity_vector_dict[neg_tail]

            dist_triplets = distance(head_ref, tail_ref, rel_ref)
            dist_corrupted_triplets = distance(neg_head_ref, neg_tail_ref, rel_ref)

            # Hinge loss: only a positive value contributes and triggers an update.
            eg = self.margin + dist_triplets - dist_corrupted_triplets
            if eg > 0:
                self.loss += eg
                temp_positive = 2 * self.learning_rate * (tail_ref - head_ref - rel_ref)
                temp_negative = 2 * self.learning_rate * (neg_tail_ref - neg_head_ref - rel_ref)
                if self.normal_form == "L1":
                    # L1 uses only the sign of each component (+1 for >= 0).
                    temp_positive = np.where(temp_positive >= 0, 1.0, -1.0) * self.learning_rate
                    temp_negative = np.where(temp_negative >= 0, 1.0, -1.0) * self.learning_rate

                # The entity vectors are adjusted in place.
                head_vec += temp_positive
                tail_vec -= temp_positive
                rel_vec = rel_vec + temp_positive - temp_negative
                neg_head_vec -= temp_negative
                neg_tail_vec += temp_negative

                # Re-normalise the vectors that were just moved.
                entity_vector_copy[head] = norm(head_vec)
                entity_vector_copy[tail] = norm(tail_vec)
                rels_vector_copy[rel] = norm(rel_vec)
                entity_vector_copy[neg_head] = norm(neg_head_vec)
                entity_vector_copy[neg_tail] = norm(neg_tail_vec)

            # NOTE(review): this runs once per pair, so from the second pair
            # on the reference vectors are the ones already being updated.
            # Confirm whether a once-per-batch swap after the loop was intended.
            self.entity_vector_dict = entity_vector_copy
            self.rels_vector_dict = rels_vector_copy

    def write_vector(self, file_path, option):
        """Write one embedding table to ``file_path`` (overwriting it).

        Args:
            file_path: Destination file.
            option: Text starting with ``"entit"`` selects the entity
                vectors, text starting with ``"rel"`` the relation vectors.

        Raises:
            ValueError: If ``option`` selects neither table; this is checked
                before the file is opened.
        """
        kind = option.strip()
        if kind.startswith("entit"):
            print("Write entities vetor into file      : {}".format(file_path))
            dyct = self.entity_vector_dict
        elif kind.startswith("rel"):
            print("Write relationships vector into file: {}".format(file_path))
            dyct = self.rels_vector_dict
        else:
            raise ValueError("option must start with 'entit' or 'rel', got %r" % (option,))

        # One line per name: "<name>\t<vector as a Python list>".
        with open(file_path, 'w') as file:
            for dyct_key, vector in dyct.items():
                file.write(dyct_key + "\t")
                file.write(str(vector.tolist()))
                file.write("\n")

    def write_loss(self, file_path, num_of_col):
        """Write the recorded losses, truncated to 4 decimals, to ``file_path``.

        Args:
            file_path: Destination file (overwritten).
            num_of_col: ``1`` writes one value per line; any other value
                writes values separated by four spaces with a line break
                after every ``num_of_col`` values.
        """
        with open(file_path, 'w') as file:
            for i, loss in enumerate(self.loss_list):
                # Truncate (not round) to four decimal places.
                truncated = str(int(loss * 10000) / 10000)
                if num_of_col == 1:
                    file.write(truncated + "\n")
                else:
                    file.write(truncated + "    ")
                    if (i + 1) % num_of_col == 0 and i != 0:
                        file.write("\n")

In [3]:
# Entry point: load the vocabularies and training triples, train TransE and
# write the learned vectors to disk.
if __name__ == "__main__":
    entity_file_path = "./entity2id.txt"
    num_of_entity, entity_list = get_details_of_entityOrRels_list(entity_file_path)
    rels_file_path = "./relation2id.txt"
    num_of_rels, rels_list = get_details_of_entityOrRels_list(rels_file_path)
    train_file_path = "./train.txt"
    num_of_triplets, triplets_list = get_details_of_triplets_list(train_file_path)

    transE = TransE(entity_list, rels_list, triplets_list, margin=1, dim=50)
    print("\nTransE is initializing...")
    transE.initialize()
    transE.transE(5000)
    print("********** End TransE training ***********\n")
    # The number of batches need not be a multiple of 100, so write the final vectors to file here.
    # NOTE(review): the file names say "(20d)" while the model above is built
    # with dim=50; confirm which one is intended.
    transE.write_vector("./entityVector(20d).txt", "entity")
    transE.write_vector("./relationVector(20d).txt", "relationship")


TransE is initializing...
The entity_vector_dict's initialization is over. It's number is 1366.
The rels_vector_dict's initialization is over. It's number is 676.

********** Start TransE training **********
----------------The 0 batchs----------------
The loss is: 0.0000
----------------The 100 batchs----------------
The loss is: 4186.5572
----------------The 200 batchs----------------
The loss is: 485.5670
----------------The 300 batchs----------------
The loss is: 250.9241
----------------The 400 batchs----------------
The loss is: 154.5309
----------------The 500 batchs----------------
The loss is: 126.7138
----------------The 600 batchs----------------
The loss is: 98.0635
----------------The 700 batchs----------------
The loss is: 64.2522
----------------The 800 batchs----------------
The loss is: 65.9256
----------------The 900 batchs----------------
The loss is: 46.6238
----------------The 1000 batchs----------------
The loss is: 43.9631
----------------The 1100 batchs--------

In [None]:
#num_of_entity, entity_list = get_details_of_entityOrRels_list(entity_file_path)
#num_of_rels, rels_list = get_details_of_entityOrRels_list(rels_file_path)
#num_of_triplets, triplets_list = get_details_of_triplets_list(train_file_path)
