Skip to content

Commit

Permalink
Merge pull request #26 from anhaidgroup/bug_appveyor
Browse files: browse the repository at this point in the history
Bug appveyor
  • Loading branch information
pjmartinkus committed May 17, 2019
2 parents c446c62 + 2faff6a commit 291c769
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 28 deletions.
5 changes: 2 additions & 3 deletions appveyor.yml
@@ -1,14 +1,13 @@
environment:

matrix:
- python : 27
- python : 27-x64
- python : 34
- python : 34-x64
- python : 35
- python : 35-x64
- python : 36
- python : 36-x64
- python : 37
- python : 37-x64

install:

Expand Down
29 changes: 14 additions & 15 deletions py_stringsimjoin/join/set_sim_join_cy.pyx
Expand Up @@ -41,7 +41,7 @@ def set_sim_join_cy(ltable, rtable,
r_join_attr_index = r_columns.index(r_join_attr)
r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs)


cdef vector[vector[int]] ltokens, rtokens
tokenize_lists(ltable, rtable, l_join_attr_index, r_join_attr_index,
tokenizer, ltokens, rtokens)
Expand All @@ -50,8 +50,7 @@ def set_sim_join_cy(ltable, rtable,
sim_type = get_sim_type(sim_measure)

cdef PositionIndexCy index = PositionIndexCy()
index = build_position_index(ltokens, sim_type, threshold, allow_empty)

index = build_position_index(ltokens, sim_type, threshold, allow_empty)

cdef omap[int, int] candidate_overlap, overlap_threshold_cache
cdef vector[pair[int, int]] candidates
Expand All @@ -61,7 +60,7 @@ def set_sim_join_cy(ltable, rtable,
cdef int size, size_lower_bound, size_upper_bound
cdef double sim_score, overlap_score
cdef fnptr sim_fn
cdef compfnptr comp_fn
cdef compfnptr comp_fn
sim_fn = get_sim_function(sim_type)
comp_fn = get_comparison_function(get_comp_type(comp_op))

Expand All @@ -72,9 +71,9 @@ def set_sim_join_cy(ltable, rtable,
if show_progress:
prog_bar = pyprind.ProgBar(len(rtable))

for i in range(rtokens.size()):
for i in range(rtokens.size()):
tokens = rtokens[i]
m = tokens.size()
m = tokens.size()

if allow_empty and m == 0:
for j in index.l_empty_ids:
Expand All @@ -97,12 +96,12 @@ def set_sim_join_cy(ltable, rtable,
size_lower_bound = int_max(get_size_lower_bound(m, sim_type, threshold),
index.min_len)
size_upper_bound = int_min(get_size_upper_bound(m, sim_type, threshold),
index.max_len)
index.max_len)

for size in range(size_lower_bound, size_upper_bound + 1):
overlap_threshold_cache[size] = get_overlap_threshold(size, m, sim_type, threshold)

for j in range(prefix_length):
for j in range(min(m, prefix_length)):
if index.index.find(tokens[j]) == index.index.end():
continue
candidates = index.index[tokens[j]]
Expand Down Expand Up @@ -171,14 +170,14 @@ cdef PositionIndexCy build_position_index(vector[vector[int]]& token_vectors,
cdef vector[int] tokens, size_vector
cdef int prefix_length, token, i, j, m, n=token_vectors.size(), min_len=100000, max_len=0
cdef omap[int, vector[pair[int, int]]] index
cdef vector[int] empty_l_ids
for i in range(n):
tokens = token_vectors[i]
m = tokens.size()
cdef vector[int] empty_l_ids
for i in range(n):
tokens = token_vectors[i]
m = tokens.size()
size_vector.push_back(m)
prefix_length = get_prefix_length(m, sim_type, threshold)
for j in range(prefix_length):
index[tokens[j]].push_back(pair[int, int](i, j))
prefix_length = get_prefix_length(m, sim_type, threshold)
for j in range(min(m, prefix_length)):
index[tokens[j]].push_back(pair[int, int](i, j))
if m > max_len:
max_len = m
if m < min_len:
Expand Down
6 changes: 3 additions & 3 deletions py_stringsimjoin/tests/test_join.py
Expand Up @@ -170,7 +170,7 @@ def test_set_sim_join():

# similarity thresholds to be tested.
thresholds = {'JACCARD' : [0.3, 0.5, 0.7, 0.85, 1],
'COSINE' : [0.3, 0.5, 0.7, 0.85, 1],
'COSINE' : [0.3, 0.5, 0.7, 0.85, 1],
'DICE' : [0.3, 0.5, 0.7, 0.85, 1],
'OVERLAP_COEFFICIENT' : [0.3, 0.5, 0.7, 0.85, 1]}

Expand All @@ -180,8 +180,7 @@ def test_set_sim_join():
'2_GRAM': QgramTokenizer(qval=2, return_set=True),
'3_GRAM': QgramTokenizer(qval=3, return_set=True)}

# Test each combination of similarity measure, threshold and tokenizer
# for different test scenarios.
# Test each combination of similarity measure, threshold and tokenizer for different test scenarios.
for label, scenario in iteritems(data):
for sim_measure_type in sim_measure_types:
for threshold in thresholds.get(sim_measure_type):
Expand All @@ -193,6 +192,7 @@ def test_set_sim_join():
tok_type + ' tokenizer for ' + label + '.'
yield test_function,


# Test each similarity measure with different comparison operators.
for sim_measure_type in sim_measure_types:
for comp_op in ['>', '=']:
Expand Down
14 changes: 7 additions & 7 deletions py_stringsimjoin/utils/cython_utils.pyx
Expand Up @@ -27,15 +27,15 @@ cdef void tokenize_lists(ltable, rtable,
for lrow in ltable:
lstr = lrow[l_join_attr_index]
py_tokens = order_using_token_ordering(
tokenizer.tokenize(lstr), token_ordering)
ltokens.push_back(py_tokens)
for rrow in rtable:
tokenizer.tokenize(lstr), token_ordering)
ltokens.push_back(<vector[int]>py_tokens)

for rrow in rtable:
rstr = rrow[r_join_attr_index]
py_tokens = order_using_token_ordering(
tokenizer.tokenize(rstr), token_ordering)
rtokens.push_back(py_tokens)

tokenizer.tokenize(rstr), token_ordering)
rtokens.push_back(<vector[int]>py_tokens)

cdef generate_output_table(ltable_array, rtable_array,
vector[vector[pair[int, int]]]& output_pairs,
Expand Down

0 comments on commit 291c769

Please sign in to comment.