In [1]:
from pymongo import MongoClient
db_name = 'freebase'
collection_name = 'fb15k237'
MONGO_DB_URL = "mongodb://localhost:27017/" # currently just using a local db
client = MongoClient(MONGO_DB_URL)
coll = client[db_name][collection_name]

In [4]:
coll.find({}, {'@id':1, '@type':1, 'name':1, 'description':1})[0]

{'_id': ObjectId('5cca66da1e8e3de846b9b879'),
 '@id': 'kg:/m/029jpy',
 'name': 'New England',
 '@type': ['Thing', 'Place'],
 'description': 'Region in the United States of America'}

In [2]:
with open('../benchmarks/FB15K237/entity2id.txt') as ef:
    lines = [line.split('\t') for line in ef.readlines()[1:-1]]
    ent2id = {"kg:"+line[0]:int(line[1]) for line in lines}
    id2ent = {int(line[1]):"kg:"+line[0] for line in lines}

In [14]:
with open('../benchmarks/FB15K237/train2id.txt') as tf:
    all_lines = tf.readlines()[1:-1]

In [57]:
# get valid ent ids
# get a map of id to name

noname_ids = set()
id2name = {}
id2types = {}
id2desc = {}
for id_, ent_id in id2ent.items():
    entry = coll.find_one({'@id': ent_id}, {'@id':1, '@type':1, 'name':1, 'description':1})
    if entry != None:
#         print(entry)
        if 'name' in entry:
            id2name[id_] = entry['name']
        else:
            id2name[id_] = ent_id
        id2types[id_] = entry['@type']
        if 'description' in entry:
            id2desc[id_] = entry['description']
    else:
        id2name[id_] = ent_id
        noname_ids.add(id_)


In [158]:
unique_types = sorted([ 'tp:' + typ for typ in coll.distinct('@type')])
unique_types

['tp:AdministrativeArea',
 'tp:Airline',
 'tp:Airport',
 'tp:AmusementPark',
 'tp:BodyOfWater',
 'tp:Book',
 'tp:BookSeries',
 'tp:Brand',
 'tp:Bridge',
 'tp:BroadcastChannel',
 'tp:BroadcastService',
 'tp:BusStation',
 'tp:Cemetery',
 'tp:City',
 'tp:CivicStructure',
 'tp:CollegeOrUniversity',
 'tp:Continent',
 'tp:Corporation',
 'tp:Country',
 'tp:CreativeWork',
 'tp:DefenceEstablishment',
 'tp:EducationalOrganization',
 'tp:Event',
 'tp:GovernmentOrganization',
 'tp:GovernmentPermit',
 'tp:ItemList',
 'tp:LakeBodyOfWater',
 'tp:LandmarksOrHistoricalBuildings',
 'tp:LocalBusiness',
 'tp:LodgingBusiness',
 'tp:Mountain',
 'tp:Movie',
 'tp:MovieSeries',
 'tp:MovieTheater',
 'tp:Museum',
 'tp:MusicAlbum',
 'tp:MusicComposition',
 'tp:MusicGroup',
 'tp:Organization',
 'tp:Person',
 'tp:Place',
 'tp:PlaceOfWorship',
 'tp:Product',
 'tp:ProductModel',
 'tp:RadioStation',
 'tp:Restaurant',
 'tp:RiverBodyOfWater',
 'tp:School',
 'tp:SingleFamilyResidence',
 'tp:SportsOrganization',
 'tp:Spor

In [160]:
docs = []

docs.extend(id2ent.values())
docs.extend(unique_types)
docs.extend([desc for desc in id2desc.values()])
docs.extend([n for n in id2name.values() if not n.startswith('kg:')])


In [161]:
sub_docs = [' '.join(transform_into_substring(doc.strip())) for doc in docs if doc.strip() != '']

In [162]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(sub_docs)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [163]:
X = vectorizer.transform([' '.join(transform_into_substring('Barack Obama'))])
Y = vectorizer.transform([' '.join(transform_into_substring('Obama'))])

In [164]:
X

<1x32041 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [106]:
import torch.nn as nn
import torch

def get_model():
    

In [107]:
char_embeddings = nn.Linear(32055, 200)
nn.init.xavier_uniform_(char_embeddings.weight.data)

tensor([[-0.0046, -0.0096, -0.0120,  ...,  0.0016, -0.0068,  0.0040],
        [ 0.0022, -0.0099,  0.0131,  ..., -0.0053,  0.0135,  0.0015],
        [-0.0087, -0.0041, -0.0028,  ...,  0.0019,  0.0081,  0.0059],
        ...,
        [-0.0090, -0.0090,  0.0024,  ...,  0.0072,  0.0133, -0.0089],
        [ 0.0016, -0.0091,  0.0015,  ..., -0.0065, -0.0131,  0.0066],
        [-0.0098,  0.0126, -0.0133,  ..., -0.0013, -0.0062,  0.0089]])

In [135]:
import numpy as np
from scipy.sparse import coo_matrix

def sparse_tensor(vector):
    coo = coo_matrix(vector)
    values = coo.data
    indices = np.vstack((coo.row, coo.col))

    i = torch.LongTensor(indices)
    v = torch.FloatTensor(values)
    shape = coo.shape

    return torch.sparse.FloatTensor(i, v, torch.Size(shape))

In [139]:
text_x  = [' '.join(transform_into_substring('Barack Obama')), ' '.join(transform_into_substring('Hello Obama'))]
set_X = vectorizer.transform(text_x)
sparse_x = sparse_tensor(set_X)
sparse_x

tensor(indices=tensor([[    0,     0,     0,     0,     0,     0,     0,     0,
                            0,     0,     0,     0,     1,     1,     1,     1,
                            1,     1,     1,     1,     1,     1,     1],
                       [27289, 25884, 25883, 24141, 22804, 17786, 16348, 16347,
                        16321, 15840, 15624, 15247, 25884, 25883, 25848, 24141,
                        23844, 23823, 21440, 21430, 19439, 16321, 15624]]),
       values=tensor([0.2270, 0.3950, 0.3128, 0.2353, 0.3161, 0.2151, 0.3950,
                      0.2634, 0.3011, 0.2404, 0.2420, 0.2461, 0.3973, 0.3146,
                      0.3162, 0.2367, 0.2773, 0.2789, 0.3973, 0.2673, 0.2285,
                      0.3028, 0.2434]),
       size=(2, 32055), nnz=23, layout=torch.sparse_coo)

In [136]:
X = vectorizer.transform([' '.join(transform_into_substring('Barack Obama'))])
Y = vectorizer.transform([' '.join(transform_into_substring('Alabama'))])
e1 = char_embeddings(sparse_tensor(X))
e2 = char_embeddings(sparse_tensor(Y))

nn.functional.cosine_similarity(e1, e2)

tensor([0.3903], grad_fn=<DivBackward0>)

In [51]:
len(invalid_ids)

1855

In [63]:
# parse through all the training data
# eliminate the entries that are not in the freebase database
# save cooccuring name somewhere for trying out char embeddings

to_be_written = []
docs = []
docs

for line in all_lines:
#   try:
    l = line.strip().split(' ')
    id_1 = int(l[0])
    id_2 = int(l[2])
#     ent1 = id2ent[id_1]
    rel = l[1]
#     ent2 = id2ent[id_2]
    docs.append(id2name[id_1])
    docs.append(id2name[id_2])
#     entry1 = coll.find_one({'@id': ent1}, {'@id':1, '@name':1})
#     entry2 = coll.find_one({'@id': ent2}, {'@id':1, '@name':1})
#         print('hello')
#     if entry1 != None and entry2 != None:
#         to_be_written.append(' '.join(l))

# with open('../benchmarks/FB15K237/train2id_fixed.txt', 'w') as bf:
#     bf.write(str(len(to_be_written)) + '\n')
#     bf.write('\n'.join(to_be_written))
#     bf.write('\n')


In [44]:
a = coll.find_one({'@id': id2ent[2]}, {'@id':1, 'name':1})
a['name']

'Mighty Morphin Power Rangers'

In [26]:
id2ent[4]

'kg:/m/07s9rl0'

In [29]:
print(coll.find_one())

{'_id': ObjectId('5cca66da1e8e3de846b9b879'), '@id': 'kg:/m/029jpy', 'name': 'New England', '@type': ['Thing', 'Place'], 'description': 'Region in the United States of America', 'detailedDescription': {'articleBody': 'New England is a region composed of six states of the northeastern United States: Maine, Vermont, New Hampshire, Massachusetts, Rhode Island, and Connecticut. ', 'url': 'https://en.wikipedia.org/wiki/New_England', 'license': 'https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License'}, 'image': {'contentUrl': 'http://t2.gstatic.com/images?q=tbn:ANd9GcRnyvsB5xpjgdid8Z5o6XfZ_xFKgtLLQK_lLMJUJo6xUZUc6MPl', 'url': 'https://commons.wikimedia.org/wiki/File:NewEngland_Fall.jpg'}}


In [2]:
import re

def partition(some_list, length=3):
    for i in range(0, len(some_list)):  
        yield some_list[i:i + length] 

def transform_into_substring(some_string, sub_len = 3):
    if some_string.startswith('kg:') or some_string.startswith('tp:'):
        return [some_string]
    all_subs = []
    clean_string = re.sub('[\.\,\:]', '', some_string)
    clean_string = re.sub('\s', '_', clean_string)
    words = [w.strip() for w in clean_string.split('_') if w.strip() != '']
    all_subs.extend(words)
    for word in words:
        all_subs.extend(partition(word, sub_len))
    if len(words) > 1:
        bridge_subs = ['_'.join([first[-1],second[0]]) for first,second in zip(words[:-1], words[1:])]
        all_subs.extend(bridge_subs)
    return all_subs
print(transform_into_substring('Barack Obama'))

['Barack', 'Obama', 'Bar', 'ara', 'rac', 'ack', 'ck', 'k', 'Oba', 'bam', 'ama', 'ma', 'a', 'k_O']


In [53]:
' '.join(['jkk'])

'jkk'

In [329]:
class CharTransE(nn.Module):
    def __init__(self, entCharTotal, relTotal, hidden_size=100, margin=1.0, p_norm = 2):
        super(CharTransE, self).__init__()
        self.ent_char_weights = nn.Linear(entCharTotal, hidden_size, bias=False)
        self.rel_embeddings = nn.Embedding(relTotal, hidden_size)
        self.p_norm = p_norm
        self.criterion = nn.MarginRankingLoss(margin, False)
        self.init_weights()

    def init_weights(self):
        nn.init.xavier_uniform_(self.ent_char_weights.weight.data)
        nn.init.xavier_uniform_(self.rel_embeddings.weight.data)

    def _calc(self, h, t, r):
        return torch.norm((h + r) - t, self.p_norm, -1)

    def loss(self, p_score, n_score):
        y = Variable(torch.Tensor([-1]).cpu())
        return self.criterion(p_score, n_score, y)
    
#     def get_positive_score(self, score):
#         return score[0:self.config.batch_size]

#     def get_negative_score(self, score):
#         negative_score = score[self.config.batch_size:self.config.batch_seq_size]
#         negative_score = negative_score.view(-1, self.config.batch_size)
#         negative_score = torch.mean(negative_score, 0)
#         return negative_score
    
    def forward(self, batch_h_p, batch_r_p, batch_t_p, batch_h_n, batch_r_n, batch_t_n):
        h_p = self.ent_char_weights(batch_h_p)
        t_p = self.ent_char_weights(batch_t_p)
        r_p = self.rel_embeddings(batch_r_p)
        p_score = self._calc(h_p, t_p, r_p)
        h_n = self.ent_char_weights(batch_h_n)
        t_n = self.ent_char_weights(batch_t_n)
        r_n = self.rel_embeddings(batch_r_n)
        n_score = self._calc(h_n, t_n, r_n)
        return self.loss(p_score, n_score)
    
    def predict(self, batch_h, batch_r, batch_t):
        h = self.ent_char_weights(batch_h)
        t = self.ent_char_weights(batch_t)
        r = self.rel_embeddings(batch_r)
        score = self._calc(h, t, r)
        return score.cpu().data.numpy()


In [238]:
charTotal = len(vectorizer.vocabulary_)
relTotal = 3
rel_dict = {'name':0, '@type':1, 'sameAs':2}
model = CharTransE(charTotal, relTotal)



In [458]:
from random import choice
pos_examples = []
neg_examples = []
ids_sub = list(id2types.keys())[:100]

for id_ in ids_sub:
    random_id = choice(ids_sub)
    while random_id == id_:
        random_id = choice(ids_sub)
    
    types = set(id2types[id_])
    name = id2name[id_]
    random_name = id2name[random_id]
    
    fb_id = id2ent[id_]
    random_fb_id = id2ent[random_id]
    
    pos_examples.append((fb_id, 2, fb_id)) #sameAs relation
    pos_examples.append((fb_id, 0, name)) #name relation
    pos_examples.append((name, 2, name)) #sameAs relation
        
    neg_examples.append((fb_id, 2, random_fb_id)) #not sameAs relation
    neg_examples.append((fb_id, 0, random_name)) #not name relation
    neg_examples.append((name, 2, random_name)) #not sameAs relation
    
    for typ in types:
        pos_examples.append((name, 1, typ))
        pos_examples.append((fb_id, 1, typ))
        random_type = choice(unique_types)
        while random_type in types:
            random_type = choice(unique_types)
        neg_examples.append((name, 1, random_type))
        neg_examples.append((fb_id, 1, random_type))

In [459]:
h_batch_pos, r_batch_pos, t_batch_pos = zip(*pos_examples)
h_batch_neg, r_batch_neg, t_batch_neg = zip(*neg_examples)

In [460]:
assert (len(h_batch_pos) == len(h_batch_neg) and \
len(r_batch_pos) == len(r_batch_neg) and \
len(t_batch_pos) == len(t_batch_neg)) == True

In [461]:
def vectorize_batch(batch):
    batch_docs = [' '.join(transform_into_substring(ent)) for ent in batch]
    tfidf_vector = vectorizer.transform(batch_docs)
    return sparse_tensor(tfidf_vector)

In [462]:
hbp_X = vectorize_batch(h_batch_pos)
tbp_X = vectorize_batch(t_batch_pos)
hbn_X = vectorize_batch(h_batch_neg)
tbn_X = vectorize_batch(t_batch_neg)
rp_X = Variable(torch.from_numpy(np.array(r_batch_pos).reshape((-1,))).cpu())
rn_X = Variable(torch.from_numpy(np.array(r_batch_neg).reshape((-1,))).cpu())

In [463]:
len(pos_examples) + len(neg_examples)

1576

In [464]:
model.forward(hbp_X, rp_X, tbp_X, hbn_X, rn_X, tbn_X) 

tensor(79.4508, grad_fn=<SumBackward0>)

In [465]:
import torch.optim as optim

In [466]:
model = CharTransE(charTotal, relTotal)
optimizer = optim.SGD(
                model.parameters(),
                lr=0.01,
                weight_decay=0,
            )
for _ in range(100):
    optimizer.zero_grad()
    loss = model.forward(hbp_X, rp_X, tbp_X, hbn_X, rn_X, tbn_X)
    loss.backward()
    optimizer.step()
    print(loss)



tensor(782.9592, grad_fn=<SumBackward0>)
tensor(296.2975, grad_fn=<SumBackward0>)
tensor(293.6457, grad_fn=<SumBackward0>)
tensor(290.8419, grad_fn=<SumBackward0>)
tensor(287.8528, grad_fn=<SumBackward0>)
tensor(284.6504, grad_fn=<SumBackward0>)
tensor(281.2131, grad_fn=<SumBackward0>)
tensor(277.5279, grad_fn=<SumBackward0>)
tensor(273.5897, grad_fn=<SumBackward0>)
tensor(269.4014, grad_fn=<SumBackward0>)
tensor(264.9705, grad_fn=<SumBackward0>)
tensor(260.3057, grad_fn=<SumBackward0>)
tensor(255.4108, grad_fn=<SumBackward0>)
tensor(250.2756, grad_fn=<SumBackward0>)
tensor(244.8629, grad_fn=<SumBackward0>)
tensor(239.1570, grad_fn=<SumBackward0>)
tensor(233.4622, grad_fn=<SumBackward0>)
tensor(227.4662, grad_fn=<SumBackward0>)
tensor(219.9946, grad_fn=<SumBackward0>)
tensor(207.9036, grad_fn=<SumBackward0>)
tensor(171.7464, grad_fn=<SumBackward0>)
tensor(248.8409, grad_fn=<SumBackward0>)
tensor(236.7564, grad_fn=<SumBackward0>)
tensor(224.8380, grad_fn=<SumBackward0>)
tensor(214.1040,

In [246]:
h_batch_pos

('kg:/m/027rn',
 'kg:/m/027rn',
 'Dominican Republic',
 'Dominican Republic',
 'Dominican Republic',
 'Dominican Republic',
 'Dominican Republic',
 'kg:/m/017dcd',
 'kg:/m/017dcd',
 'Mighty Morphin Power Rangers',
 'Mighty Morphin Power Rangers',
 'Mighty Morphin Power Rangers',
 'kg:/m/06v8s0',
 'kg:/m/06v8s0',
 'Wendee Lee',
 'Wendee Lee',
 'Wendee Lee',
 'kg:/m/0170z3',
 'kg:/m/0170z3',
 'American History X',
 'American History X',
 'American History X',
 'kg:/m/01sl1q',
 'kg:/m/01sl1q',
 'Michelle Rodriguez',
 'Michelle Rodriguez',
 'Michelle Rodriguez',
 'kg:/m/044mz_',
 'kg:/m/044mz_',
 'Naveen Andrews',
 'Naveen Andrews',
 'Naveen Andrews',
 'kg:/m/0cnk2q',
 'kg:/m/0cnk2q',
 'Australia national football team',
 'Australia national football team',
 'Australia national football team',
 'kg:/m/04nrcg',
 'kg:/m/04nrcg',
 'Maldives national football team',
 'Maldives national football team',
 'Maldives national football team',
 'Maldives national football team',
 'kg:/m/07nznf',
 'kg

In [314]:
def generate_batch(tuple_array):
    h, r, t = zip(*tuple_array)
    return vectorize_batch(h), Variable(torch.from_numpy(np.array(r).reshape((-1,))).cpu()), vectorize_batch(t)

In [454]:
sorted(pos_examples, key=lambda x: x[0])

[('52nd Annual Grammy Awards', 2, '52nd Annual Grammy Awards'),
 ('52nd Annual Grammy Awards', 1, 'Event'),
 ('52nd Annual Grammy Awards', 1, 'Thing'),
 ('61st Academy Awards', 2, '61st Academy Awards'),
 ('61st Academy Awards', 1, 'Event'),
 ('61st Academy Awards', 1, 'Thing'),
 ('ACF Fiorentina', 2, 'ACF Fiorentina'),
 ('ACF Fiorentina', 1, 'Corporation'),
 ('ACF Fiorentina', 1, 'SportsTeam'),
 ('ACF Fiorentina', 1, 'Organization'),
 ('ACF Fiorentina', 1, 'Thing'),
 ('Academy Award for Best International Feature Film',
  2,
  'Academy Award for Best International Feature Film'),
 ('Academy Award for Best International Feature Film', 1, 'Thing'),
 ('Alan Bennett', 2, 'Alan Bennett'),
 ('Alan Bennett', 1, 'Thing'),
 ('Alan Bennett', 1, 'Person'),
 ('Alan Burnett', 2, 'Alan Burnett'),
 ('Alan Burnett', 1, 'Thing'),
 ('Alan Burnett', 1, 'Person'),
 ('American History X', 2, 'American History X'),
 ('American History X', 1, 'Thing'),
 ('American History X', 1, 'Movie'),
 ('American Reunio

In [453]:
pos_examples

[('kg:/m/027rn', 2, 'kg:/m/027rn'),
 ('kg:/m/027rn', 0, 'Dominican Republic'),
 ('Dominican Republic', 2, 'Dominican Republic'),
 ('Dominican Republic', 1, 'AdministrativeArea'),
 ('Dominican Republic', 1, 'Thing'),
 ('Dominican Republic', 1, 'Place'),
 ('Dominican Republic', 1, 'Country'),
 ('kg:/m/017dcd', 2, 'kg:/m/017dcd'),
 ('kg:/m/017dcd', 0, 'Mighty Morphin Power Rangers'),
 ('Mighty Morphin Power Rangers', 2, 'Mighty Morphin Power Rangers'),
 ('Mighty Morphin Power Rangers', 1, 'TVSeries'),
 ('Mighty Morphin Power Rangers', 1, 'Thing'),
 ('kg:/m/06v8s0', 2, 'kg:/m/06v8s0'),
 ('kg:/m/06v8s0', 0, 'Wendee Lee'),
 ('Wendee Lee', 2, 'Wendee Lee'),
 ('Wendee Lee', 1, 'Thing'),
 ('Wendee Lee', 1, 'Person'),
 ('kg:/m/0170z3', 2, 'kg:/m/0170z3'),
 ('kg:/m/0170z3', 0, 'American History X'),
 ('American History X', 2, 'American History X'),
 ('American History X', 1, 'Thing'),
 ('American History X', 1, 'Movie'),
 ('kg:/m/01sl1q', 2, 'kg:/m/01sl1q'),
 ('kg:/m/01sl1q', 0, 'Michelle Rodrigu

In [480]:
e1,_,e2 = generate_batch([('kg:/m/027dtxw', 2, 'kg:/m/040njc')])
e1 = model.ent_char_weights(e1)
e2 = model.ent_char_weights(e2)
nn.functional.cosine_similarity(e1, e2)

tensor([0.4600], grad_fn=<DivBackward0>)

In [284]:
rel_dict

{'name': 0, '@type': 1, 'sameAs': 2}

In [435]:
test_batch = [('James Franco', 0, 'James Franco'), ('kg:/m/073hkh', 0, '62nd Wtf')]
t_h, t_r, t_t = generate_batch(test_batch)
h_x = model.ent_char_weights(t_h)
t_x = model.ent_char_weights(t_t)
r_x = model.rel_embeddings(t_r)
print(model._calc(h_x, t_x, r_x))

tensor([0.1569, 0.4634], grad_fn=<NormBackward1>)
