In [2]:
# --- start make file run from another dir ---
#
# Note: File should be run from XKE root dir. E.g.:
#
#       $ cd Projects/XKE
#       $ python examples/emb_grid_search/grid_search_TransE_FB13.py
#
# (The note above applies to the command-line variant that uses `__file__`.)
# Inside a notebook the root is taken to be the parent of the current working
# directory. The working directory is used rather than sys.path[0] because this
# cell itself inserts a new sys.path[0]; resolving from it would climb one
# directory higher every time the cell is re-run.
import os, sys
file_ = os.path.abspath(os.getcwd()) # use this when inside a jupyter notebook
# file_ = __file__ # use this when running from command line
xke_root = os.path.abspath(os.path.join(file_, "../"))
if xke_root not in sys.path: # keep re-runs from stacking duplicate entries
    sys.path.insert(0, xke_root)
# --- end make file run from another dir ---

In [3]:
%load_ext autoreload
%autoreload 2

from sfe.sfe import Graph, SFE
import pandas as pd
import numpy as np
import os, time

In [3]:
def debug_print(list_of_els):
    """Print one list holding the `__str__()` text of every element.

    Arguments:
    - `list_of_els`: any iterable; each element's `__str__()` is called.
    """
    print([el.__str__() for el in list_of_els])
    
def print_node_seqs(node_seqs):
    """Prints a node sequences dict.

    For every key the output is a `<node>:` header line, then one line per
    sequence (rendered by `debug_print`), then an empty line.

    Arguments:
    - `node_seqs`: dict mapping a node to an iterable of sequences.
    """
    # `.items()` and single-argument `print(...)` behave the same on Python 2
    # and Python 3; `iteritems()` and the bare `print` statement do not exist
    # on Python 3.
    for node, seqs in node_seqs.items():
        print("{}:".format(node))
        for seq in seqs:
            debug_print(seq)
        print("")
        
def print_paths_per_length(paths):
    """Prints paths between a node pair separated by length.

    Paths are grouped by `len(path)`; groups are printed in ascending length,
    each followed by an empty line, and the total number of paths is printed
    at the end.

    Arguments:
    - `paths`: a sized iterable (e.g. a set or list) of paths, each supporting `len()`.
    """
    lpaths = {}
    for path in paths:
        # append in place instead of rebuilding the bucket on every insert
        lpaths.setdefault(len(path), []).append(path)
    # sorted lengths give a stable output regardless of dict iteration order
    for length in sorted(lpaths):
        # two spaces after the colon reproduce what `print "...: ", idx` emitted
        print("Paths of length:  {}".format(length))
        for p in lpaths[length]:
            print(p)
        print("")
    print("\nNumber of paths: {}\n\n".format(len(paths)))

# FB13

In [4]:
# Directory holding the FB13 benchmark files (relative to the working directory).
dataset_path = '../benchmarks/FB13'

# ID-encoded triples: space-separated, first line skipped, columns read as
# head, tail, relation (note: a different column order than the named files below).
train2id = pd.read_csv(dataset_path + '/train2id.txt', sep=' ', skiprows=1, names=['head', 'tail', 'relation'])
valid2id = pd.read_csv(dataset_path + '/valid2id.txt', sep=' ', skiprows=1, names=['head', 'tail', 'relation'])
test2id = pd.read_csv(dataset_path + '/test2id.txt', sep=' ', skiprows=1, names=['head', 'tail', 'relation'])

from tools import dataset_tools
# Each call returns a pair unpacked as (name -> id, id -> name) lookups —
# presumably dicts; TODO confirm against tools.dataset_tools.
entity2id, id2entity     = dataset_tools.read_name2id_file(dataset_path + '/entity2id.txt')
relation2id, id2relation = dataset_tools.read_name2id_file(dataset_path + '/relation2id.txt')

# Name-encoded triples: tab-separated, no rows skipped, columns read as
# head, relation, tail; valid/test carry an extra `label` column.
train = pd.read_csv(dataset_path + '/train.txt', sep='\t', skiprows=0, names=['head', 'relation', 'tail'])
valid = pd.read_csv(dataset_path + '/valid.txt', sep='\t', skiprows=0, names=['head', 'relation', 'tail', 'label'])
test = pd.read_csv(dataset_path + '/test.txt', sep='\t', skiprows=0, names=['head', 'relation', 'tail', 'label'])

# Keep only the validation rows whose label equals 1 (presumably the positive examples).
valid_pos = valid.loc[valid['label'] == 1]

In [5]:
# Build the graph from the training triples plus the label == 1 validation
# triples (`valid_pos`), and report the wall-clock time of the whole build.
# The saved output below shows this took roughly 207 seconds.
start_time = time.time()
g = Graph()
g.partial_build_from_df(train)
g.partial_build_from_df(valid_pos)
print("Elapsed time to build G: {}".format(time.time() - start_time))

Elapsed time to build G: 206.842651844


# Search Paths

In [30]:
sfe = SFE(g, max_depth=2, max_fan_out=9999999)

In [37]:
%%time
paths = sfe.search_paths('antoine_barnave', 'guillotine')

CPU times: user 74.1 ms, sys: 16.1 ms, total: 90.2 ms
Wall time: 72.8 ms


In [32]:
paths

{('cause_of_death',),
 ('gender', '_gender', '_children', 'cause_of_death'),
 ('gender', '_gender', '_parents', 'cause_of_death'),
 ('gender', '_gender', '_spouse', 'cause_of_death'),
 ('gender', '_gender', 'cause_of_death'),
 ('gender', '_gender', 'children', 'cause_of_death'),
 ('gender', '_gender', 'parents', 'cause_of_death'),
 ('gender', '_gender', 'spouse', 'cause_of_death'),
 ('nationality', '_nationality', '_children', 'cause_of_death'),
 ('nationality', '_nationality', '_parents', 'cause_of_death'),
 ('nationality', '_nationality', '_spouse', 'cause_of_death'),
 ('nationality', '_nationality', 'cause_of_death'),
 ('nationality', '_nationality', 'children', 'cause_of_death'),
 ('nationality', '_nationality', 'parents', 'cause_of_death'),
 ('nationality', '_nationality', 'spouse', 'cause_of_death'),
 ('place_of_death', '_location', '_children', 'cause_of_death'),
 ('place_of_death', '_location', 'cause_of_death'),
 ('place_of_death', '_location', 'parents', 'cause_of_death'),
 (

In [33]:
# Group the found paths by length and print them, using the helper defined
# near the top of the notebook instead of an inline copy of its body.
print_paths_per_length(paths)

Paths of length:  1
('cause_of_death',)

Paths of length:  3
('place_of_death', '_place_of_death', 'cause_of_death')
('place_of_death', '_location', 'cause_of_death')
('gender', '_gender', 'cause_of_death')
('nationality', '_nationality', 'cause_of_death')
('place_of_death', '_place_of_birth', 'cause_of_death')

Paths of length:  4
('nationality', '_nationality', 'spouse', 'cause_of_death')
('place_of_death', '_location', '_children', 'cause_of_death')
('gender', '_gender', '_children', 'cause_of_death')
('nationality', '_nationality', 'parents', 'cause_of_death')
('place_of_death', '_place_of_birth', '_parents', 'cause_of_death')
('nationality', '_nationality', '_spouse', 'cause_of_death')
('gender', '_gender', '_parents', 'cause_of_death')
('place_of_death', '_place_of_birth', 'parents', 'cause_of_death')
('nationality', '_nationality', '_parents', 'cause_of_death')
('gender', '_gender', 'parents', 'cause_of_death')
('gender', '_gender', 'children', 'cause_of_death')
('gender', '_gen

# Extract Features

In [38]:
df = train[1230:1241]
df

Unnamed: 0,head,relation,tail
1230,beals_wright,gender,male
1231,warring_kennedy,gender,male
1232,jackson_c_pharris,nationality,united_states
1233,halford_john_mackinder,institution,christ_church_oxford
1234,tony_snow,cause_of_death,cancer
1235,ernst_lecher,gender,male
1236,antoine_barnave,cause_of_death,guillotine
1237,jack_warden,place_of_birth,newark
1238,charles_jackson_paine,nationality,united_states
1239,big_moe,ethnicity,african_american


In [39]:
res = sfe.extract_features(df)

In [40]:
%%time
res = next(res)

CPU times: user 9.35 s, sys: 385 ms, total: 9.74 s
Wall time: 9.65 s


In [41]:
res

{'cause_of_death': [{'entity_pair': ('antoine_barnave', 'guillotine'),
   'features': {('gender', '_gender', '_children', 'cause_of_death'),
    ('gender', '_gender', '_parents', 'cause_of_death'),
    ('gender', '_gender', '_spouse', 'cause_of_death'),
    ('gender', '_gender', 'cause_of_death'),
    ('gender', '_gender', 'children', 'cause_of_death'),
    ('gender', '_gender', 'parents', 'cause_of_death'),
    ('gender', '_gender', 'spouse', 'cause_of_death'),
    ('nationality', '_nationality', '_children', 'cause_of_death'),
    ('nationality', '_nationality', '_parents', 'cause_of_death'),
    ('nationality', '_nationality', '_spouse', 'cause_of_death'),
    ('nationality', '_nationality', 'cause_of_death'),
    ('nationality', '_nationality', 'children', 'cause_of_death'),
    ('nationality', '_nationality', 'parents', 'cause_of_death'),
    ('nationality', '_nationality', 'spouse', 'cause_of_death'),
    ('place_of_death', '_location', '_children', 'cause_of_death'),
    ('place

In [77]:
def stringify_features(res):
    """Replace each instance's `features` collection by one string, in place.

    Every path becomes `-<edge>-<edge>-...-,1.0` and the paths are joined with
    ` -#- `. Instances whose `features` is already a `str` are left alone, so
    calling this twice is harmless. Returns None.

    Arguments:
    - `res`: mapping from relation to a list of instance dicts that each have
      a `features` entry (an iterable of paths, each an iterable of strings).
    """
    for relation in res:
        for instance in res[relation]:
            feats = instance['features']
            if type(feats) != str:
                instance['features'] = ' -#- '.join(
                    '-' + '-'.join(edge_path) + '-,1.0' for edge_path in feats
                )
            
def stringify_ent_pair(res):
    """Replace each instance's `entity_pair` by its comma-joined string, in place.

    Instances whose `entity_pair` is already a `str` are left alone. Returns None.

    Arguments:
    - `res`: mapping from relation to a list of instance dicts that each have
      an `entity_pair` entry (an iterable of strings).
    """
    for relation in res:
        for instance in res[relation]:
            pair = instance['entity_pair']
            if type(pair) != str:
                instance['entity_pair'] = ','.join(pair)
            
stringify_features(res)
stringify_ent_pair(res)

In [89]:
inst = res['cause_of_death'][0]

In [93]:
inst['label'] = '?'

In [98]:
# Append one tab-separated line for `inst`: entity pair, label, features.
# NOTE(review): the file is opened in append mode, so every re-run of this
# cell adds the same line to ./test/rel/train.tsv again.
with open('./test/rel/train.tsv', 'a') as f:
    f.write('{}\t{}\t{}\n'.format(inst['entity_pair'], inst['label'], inst['features']))

In [21]:
df.loc[1233]

head        halford_john_mackinder
relation               institution
tail          christ_church_oxford
Name: 1233, dtype: object

In [38]:
# Print, for every (row index, feature set) pair in `res`, the triple stored at
# that index of `df` followed by its features and their count.
# NOTE(review): earlier cells rebind `res` to a dict keyed by relation name;
# iterating a dict yields its keys, so the `idx, feats` unpacking presumably no
# longer works on the current `res`. The saved output looks like it came from an
# older extract_features result — confirm before relying on this cell.
for idx,feats in res:
    row = df.loc[idx]
    print(row['head'], row['relation'], row['tail'], feats)
    print "\nNumber of features: {}\n\n".format(len(feats))

('antoine_barnave', 'cause_of_death', 'guillotine', set([('nationality', '_nationality', 'spouse', 'cause_of_death'), ('place_of_death', '_location', '_children', 'cause_of_death'), ('gender', '_gender', '_children', 'cause_of_death'), ('place_of_death', '_place_of_birth', '_parents', 'cause_of_death'), ('gender', '_gender', '_parents', 'cause_of_death'), ('place_of_death', '_place_of_birth', 'parents', 'cause_of_death'), ('nationality', '_nationality', '_parents', 'cause_of_death'), ('place_of_death', '_place_of_death', 'cause_of_death'), ('gender', '_gender', 'parents', 'cause_of_death'), ('gender', '_gender', '_spouse', 'cause_of_death'), ('gender', '_gender', 'spouse', 'cause_of_death'), ('place_of_death', '_place_of_birth', 'children', 'cause_of_death'), ('nationality', '_nationality', '_children', 'cause_of_death'), ('gender', '_gender', 'cause_of_death'), ('place_of_death', '_location', 'parents', 'cause_of_death'), ('nationality', '_nationality', 'parents', 'cause_of_death'), (

In [49]:
for idx,row in df.iterrows():
    print row
#     print row['label'].get(None)
    print row.get('relation', 98024370)
    break

head        beals_wright
relation          gender
tail                male
Name: 1230, dtype: object
gender


Experimenting with IDs instead of names: it did not help much.

In [None]:
start_time = time.time()
g2 = Graph()
g2.partial_build_from_df(train2id)
g2.partial_build_from_df(valid2id)
print("Elapsed time: {}".format(time.time() - start_time))

In [None]:
sfe2 = SFE(g2)

In [None]:
%%timeit
df = train2id[1230:1231]
feats = sfe2.extract_features(df, max_depth=2)
feats = next(feats)

In [None]:
%%timeit
paths = sfe2.search_paths(entity2id['beals_wright'], entity2id['male'], max_depth=2)

In [None]:
for idx,f in feats:
    print idx
    row = df.iloc[idx]
    print(row['head'], row['relation'], row['tail'], f)

In [None]:
head, rel, tail, paths = (29639, 6, 67575, [['10', '_5'], ['_9', '_5'], ['9', '_5'], ['_10', '_5'], ['2', '_2', '_5'], ['_11', '_5'], ['11', '_5']])

In [None]:
# Translate the ID-encoded triple and its paths back to readable labels.
# Single-argument print(...) is used so the cell behaves the same on
# Python 2 and Python 3, matching the print(...) calls used elsewhere here.
print(id2entity[head])
print(id2relation[rel])
print(id2entity[tail])
for path in paths:
    p = []
    for r in path:
        # strip the '_' marker to get the numeric id, then substitute the
        # label for the id so a leading '_' (if any) is kept in front of it
        relation_id = int(r.replace('_', ''))
        relation_label = id2relation[relation_id]
        p.append(r.replace(str(relation_id), relation_label))
    print(p)

PRA results for `camilo_jose_cela` `nationality` `spain`:

```
-gender-_gender-place_of_birth-,1.0 -#-
-gender-_gender-location-,1.0 -#-
-gender-_gender-place_of_death-,1.0 -#-
-profession-_profession-place_of_birth-,1.0 -#-
-profession-_profession-place_of_death-,1.0 -#-
-place_of_death-_place_of_death-place_of_birth-,1.0 -#-
-place_of_death-_location-place_of_birth-,1.0 -#-
-place_of_death-_place_of_birth-place_of_death-,1.0 -#-
-place_of_death-_place_of_death-location-,1.0 -#-
-place_of_death-_location-location-,1.0 -#-
-profession-_profession-location-,1.0 -#-
-gender-_gender-institution-,1.0
```

In [None]:
paths = sfe.search_paths('spain', 'camilo_jose_cela', max_depth=2)
# debug_print (defined near the top of the notebook) prints the __str__() of
# every element of a sequence as one list, which is exactly what the inline
# loop did for each path.
for path in paths:
    debug_print(path)

In [None]:
node = sfe.graph.get_node('menasseh_ben_israel')

In [None]:
for n in node.neighbors: print n

In [None]:
node = sfe.graph.get_node('spain')
for n in node.neighbors: print n

In [None]:
sfe.graph.get_node('richard_baxter') in node.neighbors

In [None]:
# run for one triple
start_time = time.time()
paths = sfe.extract_features(0, 67393, 2)
for p in paths:
    print "-----------path-----------"
    for e in p:
        print e.__str__()
print("Elapsed time: {}".format(time.time() - start_time))

In [None]:
# run for n triples
start_time = time.time()
res = []
for idx,row in train[:100].iterrows():
    res.append(sfe.extract_features(row['head'], row['tail'], 2))
print("Elapsed time: {}".format(time.time() - start_time))

In [None]:
with open('output_test.txt', 'w') as f:
    for row in res:
        for seq in row:
            for edge in seq:
                f.write(edge.__str__())
                f.write(',')
            f.write("\n")

In [None]:
!cat output_test.txt

---

# NELL186

In [None]:
# Directory holding the NELL186 benchmark files (relative to the working directory).
# NOTE(review): the FB13 cell earlier in this notebook uses '../benchmarks/...'
# while this one uses './benchmarks/...' — confirm which is right for the
# directory the notebook is run from.
dataset_path = './benchmarks/NELL186'

# ID-encoded triples: space-separated, first line skipped, columns read as
# head, tail, relation (note: a different column order than the named files below).
train2id = pd.read_csv(dataset_path + '/train2id.txt', sep=' ', skiprows=1, names=['head', 'tail', 'relation'])
valid2id = pd.read_csv(dataset_path + '/valid2id.txt', sep=' ', skiprows=1, names=['head', 'tail', 'relation'])
test2id = pd.read_csv(dataset_path + '/test2id.txt', sep=' ', skiprows=1, names=['head', 'tail', 'relation'])

from tools import dataset_tools
# Each call returns a pair unpacked as (name -> id, id -> name) lookups —
# presumably dicts; TODO confirm against tools.dataset_tools.
entity2id, id2entity     = dataset_tools.read_name2id_file(dataset_path + '/entity2id.txt')
relation2id, id2relation = dataset_tools.read_name2id_file(dataset_path + '/relation2id.txt')

# Name-encoded triples: tab-separated, no rows skipped, columns read as
# head, relation, tail; valid/test carry an extra `label` column.
train = pd.read_csv(dataset_path + '/train.txt', sep='\t', skiprows=0, names=['head', 'relation', 'tail'])
valid = pd.read_csv(dataset_path + '/valid.txt', sep='\t', skiprows=0, names=['head', 'relation', 'tail', 'label'])
test = pd.read_csv(dataset_path + '/test.txt', sep='\t', skiprows=0, names=['head', 'relation', 'tail', 'label'])

# Keep only the validation rows whose label equals 1 (presumably the positive examples).
valid_pos = valid.loc[valid['label'] == 1]

In [None]:
start_time = time.time()
g = Graph()
g.partial_build_from_df(train)
g.partial_build_from_df(valid_pos)
print("Elapsed time: {}".format(time.time() - start_time))

In [None]:
sfe = SFE(g)

In [None]:
train.head()

## Find paths between an instance (node pair)

In [None]:
%%time
paths = sfe.search_paths('concept:academicfield:applied_science', 'concept:academicfield:engineering_technology', max_depth=2)

In [None]:
# Group the found paths by length and print them, using the helper defined
# near the top of the notebook instead of an inline copy of its body.
print_paths_per_length(paths)

## Try to run SFE for the entire dataset

In [None]:
valid.head()

In [None]:
valid.label.unique()

In [None]:
len(valid.sort_values(by=['head', 'tail']))

In [None]:
%%time
res = sfe.extract_features(valid, max_depth=2, batch_size=999999)

In [None]:
%%time
boo = next(res)

In [None]:
len(boo)

In [None]:
len(boo)

# Debug python

In [None]:
l = [1,2,3,4,5,6]

In [None]:
l.remove(3)

In [None]:
l

In [None]:
s = {1,2,3,4}

In [None]:
s - {4}

In [None]:
s

In [None]:
from itertools import product
l = [[1], [5,90,84]]
res = product(*l)
s = set()
s.update(res)
s

In [None]:
import math
float('inf') > 10

# Investigating the difference between:

1. Max Depth = 2 and Max Fan Out = Infinite
2. Max Depth = 4 and Max Fan Out = 1000

In [23]:
sfe.max_depth = 2
sfe.max_fan_out = 99999999999
paths2inf = sfe.search_paths('camilo_jose_cela', 'male')
paths2inf

time get nodes: 6.91413879395e-06
time to perform BFS on both nodes: 3.06642198563
time to merge edge sequences: 0.157758235931


{('ethnicity', '_ethnicity', '_children', 'gender'),
 ('ethnicity', '_ethnicity', '_parents', 'gender'),
 ('ethnicity', '_ethnicity', '_spouse', 'gender'),
 ('ethnicity', '_ethnicity', 'children', 'gender'),
 ('ethnicity', '_ethnicity', 'gender'),
 ('ethnicity', '_ethnicity', 'parents', 'gender'),
 ('ethnicity', '_ethnicity', 'spouse', 'gender'),
 ('gender',),
 ('nationality', '_institution', '_children', 'gender'),
 ('nationality', '_institution', 'gender'),
 ('nationality', '_institution', 'parents', 'gender'),
 ('nationality', '_location', 'gender'),
 ('nationality', '_nationality', '_children', 'gender'),
 ('nationality', '_nationality', '_parents', 'gender'),
 ('nationality', '_nationality', '_place_of_death', 'gender'),
 ('nationality', '_nationality', '_spouse', 'gender'),
 ('nationality', '_nationality', 'children', 'gender'),
 ('nationality', '_nationality', 'gender'),
 ('nationality', '_nationality', 'parents', 'gender'),
 ('nationality', '_nationality', 'spouse', 'gender'),


In [None]:
sfe.max_depth = 4
sfe.max_fan_out = 1494
paths4none = sfe.search_paths('camilo_jose_cela', 'male')
paths4none

In [None]:
paths2inf.difference(paths4none)

In [None]:
camilo_jose_celta = g.get_node('camilo_jose_cela')

In [None]:
debug_print(camilo_jose_celta.neighbors)

In [None]:
# For each node, print the number of neighbors followed by its `fan_out`
# attribute, so the two values can be compared side by side.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
for node_name in ('writer', 'male', 'novelist'):
    inspected_node = g.get_node(node_name)
    print(len(inspected_node.neighbors))
    print(inspected_node.fan_out)

In [None]:
sfe.bfs_node_seqs(sfe.graph.get_node('male'))

In [None]:
camilo_jose_celta.neighbor2edgesstr[sfe.graph.get_node('novelist')]

---

# Compare my SFE with Gardner's (PRA repo)

Features found by PRA for (`yongzheng_emperor`, `maria_elisabeth_of_austria`):

```
_children, gender, _gender
parents, gender, _gender
_spouse, gender, _gender
spouse, gender, _gender
```

In [24]:
sfe.max_depth = 2
sfe.max_fan_out = 100

In [25]:
sfe.search_paths('yongzheng_emperor', 'maria_elisabeth_of_austria')

time get nodes: 6.19888305664e-06
time to perform BFS on both nodes: 0.00113296508789
time to merge edge sequences: 0.000262975692749


{('_children', 'gender', '_gender'),
 ('_spouse', 'gender', '_gender'),
 ('parents', 'gender', '_gender'),
 ('spouse', 'gender', '_gender')}

---
Features found by PRA for (`jimmy_mcculloch`, `lady_caroline_lennox`):

```
-gender-_gender-_spouse-,1.0
-gender-_gender-spouse-,1.0
```

In [26]:
sfe.search_paths('jimmy_mcculloch', 'lady_caroline_lennox')

time get nodes: 4.05311584473e-06
time to perform BFS on both nodes: 0.000751972198486
time to merge edge sequences: 0.000103950500488


{('gender', '_gender', '_spouse'), ('gender', '_gender', 'spouse')}

---
PRA results for `camilo_jose_cela` `nationality` `spain`:

In [27]:
res_gar = {('gender','_gender','place_of_birth'),
('gender','_gender','location'),
('gender','_gender','place_of_death'),
('profession','_profession','place_of_birth'),
('profession','_profession','place_of_death'),
('place_of_death','_place_of_death','place_of_birth'),
('place_of_death','_location','place_of_birth'),
('place_of_death','_place_of_birth','place_of_death'),
('place_of_death','_place_of_death','location'),
('place_of_death','_location','location'),
('profession','_profession','location'),
('gender','_gender','institution')}

In [28]:
sfe.max_depth = 2
sfe.max_fan_out = 1000
res_art = sfe.search_paths('camilo_jose_cela', 'spain')
res_art

time get nodes: 5.00679016113e-06
time to perform BFS on both nodes: 0.0253939628601
time to merge edge sequences: 0.00680613517761


{('ethnicity', '_ethnicity', '_spouse', 'nationality'),
 ('ethnicity', '_ethnicity', 'nationality'),
 ('ethnicity', '_ethnicity', 'spouse', 'nationality'),
 ('gender', '_gender', 'institution'),
 ('gender', '_gender', 'location'),
 ('gender', '_gender', 'nationality'),
 ('gender', '_gender', 'place_of_birth'),
 ('gender', '_gender', 'place_of_death'),
 ('nationality',),
 ('place_of_death', '_location', '_children', 'nationality'),
 ('place_of_death', '_location', '_parents', 'nationality'),
 ('place_of_death', '_location', '_spouse', 'nationality'),
 ('place_of_death', '_location', 'children', 'nationality'),
 ('place_of_death', '_location', 'location'),
 ('place_of_death', '_location', 'nationality'),
 ('place_of_death', '_location', 'parents', 'nationality'),
 ('place_of_death', '_location', 'place_of_birth'),
 ('place_of_death', '_location', 'spouse', 'nationality'),
 ('place_of_death', '_place_of_birth', '_children', 'nationality'),
 ('place_of_death', '_place_of_birth', '_parents'

In [29]:
res_gar - res_art

set()

In [30]:
res_art - res_gar

{('ethnicity', '_ethnicity', '_spouse', 'nationality'),
 ('ethnicity', '_ethnicity', 'nationality'),
 ('ethnicity', '_ethnicity', 'spouse', 'nationality'),
 ('gender', '_gender', 'nationality'),
 ('nationality',),
 ('place_of_death', '_location', '_children', 'nationality'),
 ('place_of_death', '_location', '_parents', 'nationality'),
 ('place_of_death', '_location', '_spouse', 'nationality'),
 ('place_of_death', '_location', 'children', 'nationality'),
 ('place_of_death', '_location', 'nationality'),
 ('place_of_death', '_location', 'parents', 'nationality'),
 ('place_of_death', '_location', 'spouse', 'nationality'),
 ('place_of_death', '_place_of_birth', '_children', 'nationality'),
 ('place_of_death', '_place_of_birth', '_parents', 'nationality'),
 ('place_of_death', '_place_of_birth', '_spouse', 'nationality'),
 ('place_of_death', '_place_of_birth', '_spouse', 'place_of_birth'),
 ('place_of_death', '_place_of_birth', '_spouse', 'place_of_death'),
 ('place_of_death', '_place_of_birt

In [61]:
from sfe.helpers import dfs_node_sequence_from_path
gen = dfs_node_sequence_from_path(
    sfe.graph.get_node('camilo_jose_cela'),
    sfe.graph.get_node('spain'),
    ('place_of_death', '_location', 'spouse', 'nationality')
)
debug_print(next(gen))

['Node(camilo_jose_cela)', 'Node(madrid)', 'Node(alfonso_xii_of_spain)', 'Node(mercedes_of_orleans)', 'Node(spain)']


In [63]:
debug_print(next(gen))

StopIteration: 

In [None]:
sfe.graph.get_node('camilo_jose_cela').get_edgestr2neighbors('ethnicity')[0].name

In [None]:
sfe.graph.get_node('galician_people').get_edgestr2neighbors('_ethnicity')[0].name

In [None]:
sfe.graph.get_node('francisco_franco').get_edgestr2neighbors('_spouse')[0].name

In [None]:
sfe.graph.get_node('carmen_polo').get_edgestr2neighbors('nationality')[0].name

---
PRA results for `frank_g_slaughter,johns_hopkins_university`:

```
-gender-_gender-institution-,1.0
-profession-_profession-institution-,1.0
-nationality-_nationality-institution-,1.0
-nationality-_location-institution-,1.0
-place_of_death-_place_of_birth-institution-,1.0
-place_of_death-_location-institution-,1.0
-location-_place_of_death-institution-,1.0
-location-_location-institution-,1.0
```

In [None]:
sfe.search_paths('frank_g_slaughter','johns_hopkins_university')

---
PRA results for `emily_donelson,united_states`:

In [64]:
res_gar = {('spouse','nationality'),
('_spouse','nationality'),
('gender','_gender','profession'),
('gender','_gender','place_of_birth'),
('spouse','gender','_gender','place_of_birth'),
('spouse','nationality','_place_of_birth','place_of_birth'),
('_spouse','nationality','_place_of_birth','place_of_birth'),
('_spouse','gender','_gender','place_of_birth'),
('_spouse','gender','_gender','profession'),
('spouse','nationality','_profession','profession'),
('_spouse','nationality','_location','place_of_birth'),
('_spouse','nationality','_nationality','place_of_birth'),
('spouse','nationality','_nationality','place_of_birth'),
('spouse','nationality','_place_of_death','place_of_birth'),
('spouse','nationality','_nationality','profession'),
('spouse','nationality','_location','place_of_birth'),
('spouse','gender','_gender','profession'),
('_spouse','nationality','_nationality','profession'),
('_spouse','nationality','_profession','profession'),
('_spouse','nationality','_place_of_death','place_of_birth'),
('location','_location','place_of_birth'),
('_spouse','location','_location','place_of_birth'),
('spouse','location','_location','place_of_birth')}

In [72]:
sfe.max_fan_out = 100
res_art = sfe.search_paths('emily_donelson','united_states')
print_paths_per_length(res_art)

time get nodes: 3.81469726562e-06
time to perform BFS on both nodes: 0.00343179702759
time to merge edge sequences: 0.000920057296753
Paths of length:  1
('nationality',)

Paths of length:  2
('spouse', 'nationality')
('_spouse', 'nationality')

Paths of length:  3
('gender', '_gender', 'profession')
('location', '_location', 'place_of_birth')
('gender', '_gender', 'place_of_birth')

Paths of length:  4
('spouse', 'gender', '_gender', 'profession')
('spouse', 'location', '_location', 'place_of_birth')
('_spouse', 'gender', '_gender', 'profession')
('_spouse', 'location', '_location', 'place_of_birth')
('_spouse', 'gender', '_gender', 'place_of_birth')
('spouse', 'gender', '_gender', 'place_of_birth')


Number of paths: 12




In [73]:
res_gar - res_art

{('_spouse', 'nationality', '_location', 'place_of_birth'),
 ('_spouse', 'nationality', '_nationality', 'place_of_birth'),
 ('_spouse', 'nationality', '_nationality', 'profession'),
 ('_spouse', 'nationality', '_place_of_birth', 'place_of_birth'),
 ('_spouse', 'nationality', '_place_of_death', 'place_of_birth'),
 ('_spouse', 'nationality', '_profession', 'profession'),
 ('spouse', 'nationality', '_location', 'place_of_birth'),
 ('spouse', 'nationality', '_nationality', 'place_of_birth'),
 ('spouse', 'nationality', '_nationality', 'profession'),
 ('spouse', 'nationality', '_place_of_birth', 'place_of_birth'),
 ('spouse', 'nationality', '_place_of_death', 'place_of_birth'),
 ('spouse', 'nationality', '_profession', 'profession')}

In [74]:
res_art - res_gar

{('nationality',)}

In [75]:
from sfe.helpers import dfs_node_sequence_from_path

gen = dfs_node_sequence_from_path(
    sfe.graph.get_node('emily_donelson'),
    sfe.graph.get_node('united_states'),
    ('location', '_place_of_birth', 'place_of_death')
)
debug_print(next(gen))

['Node(emily_donelson)', 'Node(tennessee)', 'Node(david_lipscomb)', 'Node(united_states)']


In [71]:
debug_print(next(gen))

StopIteration: 

In [None]:
usa = sfe.graph.get_node('united_states')

In [None]:
usa.fan_out

In [None]:
list(usa.neighbors)[0].name

In [None]:
# usa.edgesstr2neighborsname = {val: key.name for usa.neighbor2edgesstr}

# Attach to the `usa` node a reverse lookup: edge name -> list of the names of
# the neighbors that `neighbor2edgesstr` associates with that edge name.
usa.edgesstr2neighborsname = {}
# `.items()` works on both Python 2 and Python 3 (`iteritems()` is Python 2 only).
for key, val in usa.neighbor2edgesstr.items():
    for edge_name in val:
        # append in place rather than rebuilding the list on every insert
        usa.edgesstr2neighborsname.setdefault(edge_name, []).append(key.name)

In [None]:
usa.edgesstr2neighborsname['_profession']

In [None]:
# def follow_path_between_entities(head_name, tail_name, path):
#     # get node objects
#     head = sfe.graph.get_node(head_name)
#     tail = sfe.graph.get_node(tail_name)
#     # expand
    
#     level2nodes = [set(head)] # this list will store at each position a set of nodes that can be reached at the level of the path
#     visited = set(head)
#     for level,edge_str in enumerate(path):
#         nodes = level2nodes[level]
#         next_nodes = set()
#         for node in nodes:
#             next_nodes.add(node.edgestr2neighbors(edge_str))
#         level2nodes.append(next_nodes - visited)
#         visited.add(level2nodes[level])
        
# def dfs(head, path, visited=None):
#     if visited is None:
#         visited = set()
#     visited.add(head)
#     edge_str = path[0]
#     for next_ in set(node.edgestr2neighbors(edge_str)) - visited:
#         dfs(next_, path[1:], visited)
#     return visited

def dfs_node_sequence_from_path(start, goal, edges_path, nodes_path=None):
    """Yield every node sequence that follows `edges_path` from `start` to `goal`.

    The search is a depth-first walk restricted to the given edge names, in
    order. A node already present in the sequence being built is never visited
    again. Use it to see which nodes are traversed when a given sequence of
    edges is followed.

    Note: this definition shadows the function of the same name imported from
    `sfe.helpers` earlier in the notebook (whether the two are identical is
    not visible from here).

    Arguments:
    - `start`: start node; must provide `get_edgestr2neighbors(edge_name)`.
    - `goal`: goal node.
    - `edges_path`: a sequence containing the edge names to follow, in order.
    - `nodes_path` (optional): the nodes that have been visited so far.
    """
    if nodes_path is None:
        nodes_path = [start]

    if len(edges_path) == 0:
        # every edge has been consumed: the walk counts only if it ended on goal
        if start == goal:
            yield nodes_path
        return

    remaining_edges = edges_path[1:]
    candidates = set(start.get_edgestr2neighbors(edges_path[0])) - set(nodes_path)
    for neighbor in candidates:
        extended_path = nodes_path + [neighbor]
        for node_seq in dfs_node_sequence_from_path(neighbor, goal, remaining_edges, extended_path):
            yield node_seq

# def dfs_node_sequence_from_path(start, goal, edges_path):
#     stack = [(start, [start])]
#     while stack:
#         (vertex, nodes_path) = stack.pop()
# #         print vertex
#         for next_ in set(start.get_edgestr2neighbors(edges_path[0])) - set(nodes_path):
#             if next_ == goal:
#                 yield nodes_path + [next_]
#             else:
#                 stack.append((next_, nodes_path + [next_]))
                
head = sfe.graph.get_node('emily_donelson')
tail = sfe.graph.get_node('united_states')
path = ('spouse', 'nationality', '_profession', 'profession')
# path = ('nationality',)
# path = ('spouse',)

gen = dfs_node_sequence_from_path(head, tail, path)
debug_print(next(gen))

In [None]:
debug_print(next(gen))

In [None]:
edges_path = ('spouse', 'nationality', '_profession', 'profession')
neighbors = head.get_edgestr2neighbors(edges_path[0])
neighbors[0].name

In [None]:
# Sanity check: iterating over an empty set runs the loop body zero times,
# so this cell prints nothing.
for next_ in set():
    print('unreachable')