In [1]:
from sfe.sfe import Graph, SFE
import pandas as pd
import numpy as np
import os, time

In [2]:
# Integer-encoded triples. The first line of each *2id file is skipped (skiprows=1).
id_columns = ['head', 'tail', 'relation']
train2id = pd.read_csv('./benchmarks/FB13/train2id.txt', sep=' ', skiprows=1, names=id_columns)
valid2id = pd.read_csv('./benchmarks/FB13/valid2id.txt', sep=' ', skiprows=1, names=id_columns)

# Lookup tables in both directions (name -> id and id -> name) for entities and relations.
from tools import dataset_tools
entity2id, id2entity     = dataset_tools.read_name2id_file('./benchmarks/FB13/entity2id.txt')
relation2id, id2relation = dataset_tools.read_name2id_file('./benchmarks/FB13/relation2id.txt')

# Name-based triples, tab separated, read from the first line on.
name_columns = ['head', 'relation', 'tail']
train = pd.read_csv('./benchmarks/FB13/train.txt', sep='\t', names=name_columns)
valid = pd.read_csv('./benchmarks/FB13/valid.txt', sep='\t', names=name_columns + ['label'])
# Keep only the rows labelled 1 (presumably the positive triples -- TODO confirm).
has_label_one = valid['label'] == 1
valid = valid[has_label_one]

In [3]:
# Build the name-based graph from the train and (filtered) valid triples, timing the build.
start_time = time.time()
g = Graph()
for triples in (train, valid):
    g.partial_build_from_df(triples)
elapsed = time.time() - start_time
print("Elapsed time: {}".format(elapsed))

Elapsed time: 43.9185209274


In [4]:
# SFE instance over the name-based graph `g`; later cells call search_paths / extract_features on it.
sfe = SFE(g)

In [5]:
%%time
# Search for relation paths between 'martin_van_buren' and 'male' with max_depth=2.
# `paths` is grouped by length and printed in the following cell.
paths = sfe.search_paths('martin_van_buren', 'male', max_depth=2)

time get nodes: 5.96046447754e-06
time to find node sequences: 5.3416159153
time to merge node sequences: 0.355556964874
time to get paths: 0.19667506218
CPU times: user 6.02 s, sys: 121 ms, total: 6.14 s
Wall time: 6.05 s


In [6]:
# Group the paths by their length (number of relations in the path) and print each group.
lpaths = {}
for path in paths:
    # setdefault + append grows the bucket in place instead of rebuilding the list each time.
    lpaths.setdefault(len(path), []).append(path)
# sorted() fixes the display order (shortest paths first) regardless of dict ordering;
# .items() and print(...) with a single argument behave the same on Python 2 and 3.
for idx, ps in sorted(lpaths.items()):
    print("{} {}".format(idx, ps))
print("\nNumber of paths: {}\n\n".format(len(paths)))

1 [('gender',)]
2 [('children', 'gender'), ('_parents', 'gender')]
3 [('profession', '_profession', 'gender'), ('place_of_death', '_place_of_birth', 'gender'), ('nationality', '_location', 'gender'), ('cause_of_death', '_cause_of_death', 'gender'), ('nationality', '_place_of_death', 'gender'), ('nationality', '_profession', 'gender'), ('nationality', '_place_of_birth', 'gender'), ('nationality', '_nationality', 'gender')]
4 [('nationality', '_nationality', '_spouse', 'gender'), ('profession', '_profession', '_spouse', 'gender'), ('nationality', '_place_of_death', '_parents', 'gender'), ('cause_of_death', '_cause_of_death', 'parents', 'gender'), ('spouse', 'nationality', '_nationality', 'gender'), ('_spouse', 'location', '_location', 'gender'), ('cause_of_death', '_cause_of_death', '_children', 'gender'), ('_spouse', 'place_of_death', '_location', 'gender'), ('nationality', '_nationality', 'spouse', 'gender'), ('children', 'place_of_birth', '_place_of_death', 'gender'), ('_spouse', 'pla

In [7]:
# df = train[1230:1231]
# feats = sfe.generate_features(df, max_depth=2)
# feats = next(feats)

In [8]:
# for idx,f_ in enumerate(feats):
#     f = f_[1]
#     row = df.iloc[idx]
#     print(row['head'], row['relation'], row['tail'], f)
#     print "\nNumber of features: {}\n\n".format(len(f))

Experiment: build the graph from integer IDs (`train2id` / `valid2id`) instead of entity and relation names. This did not help much.

In [9]:
# Build a second graph from the integer-ID triples, timing the build.
# NOTE: the saved output of this cell shows a KeyboardInterrupt, i.e. the run was stopped by hand.
start_time = time.time()
g2 = Graph()
for triples in (train2id, valid2id):
    g2.partial_build_from_df(triples)
elapsed = time.time() - start_time
print("Elapsed time: {}".format(elapsed))

KeyboardInterrupt: 

In [None]:
# SFE instance over the ID-based graph `g2`; used by the timing cells below.
sfe2 = SFE(g2)

In [None]:
%%timeit
df = train2id[1230:1231]
feats = sfe2.generate_features(df, max_depth=2)
feats = next(feats)

In [None]:
%%timeit
# Benchmark one path search on the ID-based graph; entity names are mapped to IDs via entity2id.
paths = sfe2.search_paths(entity2id['beals_wright'], entity2id['male'], max_depth=2)

In [None]:
# Show each (positional index, features) pair from `feats` next to the triple it belongs to.
for idx, f in feats:
    print(idx)
    row = df.iloc[idx]
    # Printing one explicit tuple keeps the Python 2 output (a tuple repr) unchanged
    # and makes it identical under Python 3.
    print((row['head'], row['relation'], row['tail'], f))

In [None]:
# Hand-written ID triple and the relation-ID paths that the next cell decodes into labels.
head = 29639
rel = 6
tail = 67575
paths = [
    ['10', '_5'],
    ['_9', '_5'],
    ['9', '_5'],
    ['_10', '_5'],
    ['2', '_2', '_5'],
    ['_11', '_5'],
    ['11', '_5'],
]

In [None]:
# Translate the ID triple and each relation-ID path back into readable labels.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(id2entity[head])
print(id2relation[rel])
print(id2entity[tail])
for path in paths:
    p = []
    for r in path:
        # A path item is a relation ID as a string, optionally prefixed with '_'.
        relation_id = int(r.replace('_', ''))
        relation_label = id2relation[relation_id]
        # Swap the digits for the label, keeping any '_' prefix in place.
        p.append(r.replace(str(relation_id), relation_label))
    print(p)

PRA (Path Ranking Algorithm) results for the triple (`camilo_jose_cela`, `nationality`, `spain`):

```
-gender-_gender-place_of_birth-,1.0 -#-
-gender-_gender-location-,1.0 -#-
-gender-_gender-place_of_death-,1.0 -#-
-profession-_profession-place_of_birth-,1.0 -#-
-profession-_profession-place_of_death-,1.0 -#-
-place_of_death-_place_of_death-place_of_birth-,1.0 -#-
-place_of_death-_location-place_of_birth-,1.0 -#-
-place_of_death-_place_of_birth-place_of_death-,1.0 -#-
-place_of_death-_place_of_death-location-,1.0 -#-
-place_of_death-_location-location-,1.0 -#-
-profession-_profession-location-,1.0 -#-
-gender-_gender-institution-,1.0
```

In [None]:
# Search paths between 'spain' and 'camilo_jose_cela' and print each path as a list of strings.
# NOTE(review): the PRA triple quoted above has 'camilo_jose_cela' as head and 'spain' as tail;
# here the arguments are in the opposite order -- confirm this is intended.
paths = sfe.search_paths('spain', 'camilo_jose_cela', max_depth=2)
for path in paths:
    # str(r) is the idiomatic spelling of r.__str__(); print(...) works on Python 2 and 3.
    path_strings = [str(r) for r in path]
    print(path_strings)

In [None]:
# Fetch the graph node for one entity; its `neighbors` are listed in the next cell.
node = sfe.graph.get_node('menasseh_ben_israel')

In [None]:
# List the neighbors of the current `node`; print(...) works on both Python 2 and 3.
for n in node.neighbors:
    print(n)

In [None]:
# Fetch the node for 'spain' and list its neighbors; print(...) works on both Python 2 and 3.
node = sfe.graph.get_node('spain')
for n in node.neighbors:
    print(n)

In [None]:
# Check whether the node for 'richard_baxter' is among the neighbors of the current `node`.
sfe.graph.get_node('richard_baxter') in node.neighbors

In [None]:
# run for one triple
# NOTE(review): integer arguments are passed here, while the loop over `train` in the
# following cell passes the name-based 'head'/'tail' values -- confirm which form
# extract_features expects.
start_time = time.time()
paths = sfe.extract_features(0, 67393, 2)
for p in paths:
    print("-----------path-----------")
    for e in p:
        # str(e) is the idiomatic spelling of e.__str__(); print(...) works on Python 2 and 3.
        print(str(e))
print("Elapsed time: {}".format(time.time() - start_time))

In [None]:
# run for n triples
# Extract features for the first 100 training triples and time the whole batch.
start_time = time.time()
res = [
    sfe.extract_features(row['head'], row['tail'], 2)
    for idx, row in train[:100].iterrows()
]
elapsed = time.time() - start_time
print("Elapsed time: {}".format(elapsed))

In [None]:
# Write one line per edge sequence: every edge is followed by a comma, then a newline.
with open('output_test.txt', 'w') as f:
    for row in res:
        for seq in row:
            line = "".join(str(edge) + ',' for edge in seq)
            f.write(line + "\n")

In [None]:
# Show the contents of the file written by the previous cell.
!cat output_test.txt

# Debug python

In [None]:
# Scratch list 1..6 for the list.remove experiment below.
l = list(range(1, 7))

In [None]:
# list.remove deletes the first element equal to 3, modifying `l` in place.
l.remove(3)

In [None]:
# Display `l` to inspect the effect of the removal.
l

In [None]:
# Scratch set {1, 2, 3, 4} for the set-difference experiment below.
s = set(range(1, 5))

In [None]:
# Set difference builds a new set; `s` itself is left unchanged.
s - {4}

In [None]:
# Display `s` to confirm the difference above did not modify it.
s

In [None]:
from itertools import product
# Cartesian product of the inner lists, collected into a set of tuples.
l = [[1], [5, 90, 84]]
res = product(*l)
s = set(res)  # consumes the `res` iterator
s