Merge pull request #26 from anhaidgroup/bug_appveyor

Bug appveyor
anhaidgroup · May 17, 2019 · 291c769
2 parents c446c62 + 2faff6a
commit 291c769
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 28 deletions.
diff --git a/appveyor.yml b/appveyor.yml
@@ -1,14 +1,13 @@
 environment:
-
   matrix:
     - python : 27
     - python : 27-x64
-    - python : 34
-    - python : 34-x64
     - python : 35
     - python : 35-x64
     - python : 36
     - python : 36-x64
+    - python : 37
+    - python : 37-x64
 
 install:
 

diff --git a/py_stringsimjoin/join/set_sim_join_cy.pyx b/py_stringsimjoin/join/set_sim_join_cy.pyx
@@ -41,7 +41,7 @@ def set_sim_join_cy(ltable, rtable,
     r_join_attr_index = r_columns.index(r_join_attr)                            
     r_out_attrs_indices = find_output_attribute_indices(r_columns, r_out_attrs) 
 
-                     
+
     cdef vector[vector[int]] ltokens, rtokens
     tokenize_lists(ltable, rtable, l_join_attr_index, r_join_attr_index, 
                    tokenizer, ltokens, rtokens)
@@ -50,8 +50,7 @@ def set_sim_join_cy(ltable, rtable,
     sim_type = get_sim_type(sim_measure)
 
     cdef PositionIndexCy index = PositionIndexCy()                                
-    index = build_position_index(ltokens, sim_type, threshold, allow_empty)                            
-
+    index = build_position_index(ltokens, sim_type, threshold, allow_empty)     
 
     cdef omap[int, int] candidate_overlap, overlap_threshold_cache              
     cdef vector[pair[int, int]] candidates                                      
@@ -61,7 +60,7 @@ def set_sim_join_cy(ltable, rtable,
     cdef int size, size_lower_bound, size_upper_bound                           
     cdef double sim_score, overlap_score                                        
     cdef fnptr sim_fn                                           
-    cdef compfnptr comp_fn                
+    cdef compfnptr comp_fn               
     sim_fn = get_sim_function(sim_type)                                         
     comp_fn = get_comparison_function(get_comp_type(comp_op))
 
@@ -72,9 +71,9 @@ def set_sim_join_cy(ltable, rtable,
     if show_progress:                                                           
         prog_bar = pyprind.ProgBar(len(rtable))
 
-    for i in range(rtokens.size()):                          
+    for i in range(rtokens.size()):                        
         tokens = rtokens[i]                                                     
-        m = tokens.size()                                                       
+        m = tokens.size()                                                    
 
         if allow_empty and m == 0:
             for j in index.l_empty_ids:
@@ -97,12 +96,12 @@ def set_sim_join_cy(ltable, rtable,
         size_lower_bound = int_max(get_size_lower_bound(m, sim_type, threshold),
                                    index.min_len)                               
         size_upper_bound = int_min(get_size_upper_bound(m, sim_type, threshold),
-                                   index.max_len)                               
+                                   index.max_len)                         
 
         for size in range(size_lower_bound, size_upper_bound + 1):              
             overlap_threshold_cache[size] = get_overlap_threshold(size, m, sim_type, threshold)
 
-        for j in range(prefix_length):                                          
+        for j in range(min(m, prefix_length)):                                          
             if index.index.find(tokens[j]) == index.index.end():                
                 continue                                                        
             candidates = index.index[tokens[j]]                                 
@@ -171,14 +170,14 @@ cdef PositionIndexCy build_position_index(vector[vector[int]]& token_vectors,
     cdef vector[int] tokens, size_vector                                        
     cdef int prefix_length, token, i, j, m, n=token_vectors.size(), min_len=100000, max_len=0
     cdef omap[int, vector[pair[int, int]]] index
-    cdef vector[int] empty_l_ids                                
-    for i in range(n):                                                          
-        tokens = token_vectors[i]                                               
-        m = tokens.size()                                                       
+    cdef vector[int] empty_l_ids                               
+    for i in range(n):                                                         
+        tokens = token_vectors[i]                                           
+        m = tokens.size()                                                     
         size_vector.push_back(m)                                                
-        prefix_length = get_prefix_length(m, sim_type, threshold)               
-        for j in range(prefix_length):                                          
-            index[tokens[j]].push_back(pair[int, int](i, j))                    
+        prefix_length = get_prefix_length(m, sim_type, threshold)         
+        for j in range(min(m, prefix_length)):                                          
+            index[tokens[j]].push_back(pair[int, int](i, j))                  
         if m > max_len:                                                         
             max_len = m                                                         
         if m < min_len:                                                         

diff --git a/py_stringsimjoin/tests/test_join.py b/py_stringsimjoin/tests/test_join.py
@@ -170,7 +170,7 @@ def test_set_sim_join():
 
     # similarity thresholds to be tested.
     thresholds = {'JACCARD' : [0.3, 0.5, 0.7, 0.85, 1],
-                  'COSINE' : [0.3, 0.5, 0.7, 0.85, 1],
+                  'COSINE' : [0.3, 0.5, 0.7, 0.85, 1], 
                   'DICE' : [0.3, 0.5, 0.7, 0.85, 1],
                   'OVERLAP_COEFFICIENT' : [0.3, 0.5, 0.7, 0.85, 1]}
 
@@ -180,8 +180,7 @@ def test_set_sim_join():
                   '2_GRAM': QgramTokenizer(qval=2, return_set=True),
                   '3_GRAM': QgramTokenizer(qval=3, return_set=True)}    
 
-    # Test each combination of similarity measure, threshold and tokenizer
-    # for different test scenarios.
+    # Test each combination of similarity measure, threshold and tokenizer for different test scenarios.
     for label, scenario in iteritems(data):
         for sim_measure_type in sim_measure_types:
             for threshold in thresholds.get(sim_measure_type):
@@ -193,6 +192,7 @@ def test_set_sim_join():
                         tok_type + ' tokenizer for ' + label + '.'
                     yield test_function,
 
+
    # Test each similarity measure with different comparison operators.
     for sim_measure_type in sim_measure_types:
         for comp_op in ['>', '=']:

diff --git a/py_stringsimjoin/utils/cython_utils.pyx b/py_stringsimjoin/utils/cython_utils.pyx
@@ -27,15 +27,15 @@ cdef void tokenize_lists(ltable, rtable,
     for lrow in ltable:                                                         
         lstr = lrow[l_join_attr_index]                                               
         py_tokens = order_using_token_ordering(                                 
-                        tokenizer.tokenize(lstr), token_ordering)               
-        ltokens.push_back(py_tokens)                                            
-                                                                                
-    for rrow in rtable:                                                         
+                        tokenizer.tokenize(lstr), token_ordering)
+        ltokens.push_back(<vector[int]>py_tokens)                         
+
+    for rrow in rtable:
         rstr = rrow[r_join_attr_index]                                               
         py_tokens = order_using_token_ordering(                                 
-                        tokenizer.tokenize(rstr), token_ordering)               
-        rtokens.push_back(py_tokens)                                            
-
+                        tokenizer.tokenize(rstr), token_ordering)   
+        rtokens.push_back(<vector[int]>py_tokens)         
+            
 
 cdef generate_output_table(ltable_array, rtable_array, 
                            vector[vector[pair[int, int]]]& output_pairs,