Skip to content

Commit

Permalink
Merge pull request #41 from anhaidgroup/dev
Browse files Browse the repository at this point in the history
0.3.4 changes
  • Loading branch information
Anson-Doan committed Feb 2, 2024
2 parents db0bb5b + 658aeb7 commit 2195839
Show file tree
Hide file tree
Showing 17 changed files with 62 additions and 55 deletions.
9 changes: 5 additions & 4 deletions .github/workflows/testing.yml
@@ -1,4 +1,4 @@
# Testing on linux, windows, macos, for python versions 3.7, 3.8, 3.9, 3.10
# Testing on linux, windows, macos, for python versions 3.7, 3.8, 3.9, 3.10, 3.11, 3.12

name: Testing

Expand All @@ -12,7 +12,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]
os: ["ubuntu-latest", "windows-latest", "macos-latest"]
runs-on: ${{ matrix.os }}
env:
Expand All @@ -30,8 +30,9 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install numpy
pip install pandas joblib six Cython pyprind==2.9.8 py_stringmatching coveralls
python setup.py build_ext --inplace
# Quote the version specifier: an unquoted ">=" is parsed by the shell as an output redirect, silently dropping the constraint.
pip install pandas joblib six "Cython>=0.29.23" pyprind coveralls py-stringmatching
- name: Install package
run: python setup.py build_ext --inplace
- name: Run tests
run: |
python -m unittest -v
6 changes: 6 additions & 0 deletions CHANGES.txt
@@ -1,3 +1,9 @@
v0.3.4 - 2/2/2024
* Added support for Python 3.12
* Discontinued usage of cythonize.py during setup due to Python 3.12 compatibility issues
* Adjusted setuptools.setup project name to match name on PyPI
* Fixed compatibility issues with the latest versions of Pandas

v0.3.3 - 3/16/2023
* Dropped support for Python 3.6.
* Added support for Python 3.10 and 3.11.
Expand Down
2 changes: 1 addition & 1 deletion README.rst
Expand Up @@ -17,7 +17,7 @@ Important links
Dependencies
============

py_stringsimjoin has been tested on each Python version between 3.7 and 3.11, inclusive.
py_stringsimjoin has been tested on each Python version between 3.7 and 3.12, inclusive.

The required dependencies to build the package are pandas 0.16.0 or higher, py_stringmatching 0.2.1 or higher,
joblib, pyprind, six and a C++ compiler. For the development version, you will also need Cython.
Expand Down
2 changes: 1 addition & 1 deletion py_stringsimjoin/__init__.py
@@ -1,5 +1,5 @@

__version__ = '0.3.2'
__version__ = '0.3.4'

# determine whether to use available cython implementations
__use_cython__ = True
Expand Down
6 changes: 3 additions & 3 deletions py_stringsimjoin/tests/test_apply_matcher.py
Expand Up @@ -61,9 +61,9 @@ def setUp(self):
self.r_join_attr,
'B.zipcode',
'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
self.ltable.drop('tmp_join_key', 1)
self.rtable.drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)
self.ltable.drop('tmp_join_key', axis=1)
self.rtable.drop('tmp_join_key', axis=1)

def test_apply_matcher(self):
tok = QgramTokenizer(qval=2, return_set=True)
Expand Down
32 changes: 15 additions & 17 deletions py_stringsimjoin/tests/test_converter_utils.py
Expand Up @@ -13,14 +13,13 @@

class DataframeColumnToStrTestCases(unittest.TestCase):
def setUp(self):
float_col = pd.Series(np.random.randn(10)).append(
pd.Series([np.NaN for _ in range(10)], index=range(10, 20)))
float_col_with_int_val = pd.Series(
np.random.randint(1, 100, 10)).append(
pd.Series([np.NaN for _ in range(10)], index=range(10, 20)))
str_col = pd.Series([random.choice(string.ascii_lowercase)
for _ in range(10)]).append(
pd.Series([np.NaN for _ in range(10)], index=range(10, 20)))
float_col = pd.concat([pd.Series(np.random.randn(10)),
pd.Series([np.NaN for _ in range(10)], index=range(10, 20))])
float_col_with_int_val = pd.concat([pd.Series(np.random.randint(1, 100, 10)),
pd.Series([np.NaN for _ in range(10)], index=range(10, 20))])
str_col = pd.concat([pd.Series([random.choice(string.ascii_lowercase)
for _ in range(10)]),
pd.Series([np.NaN for _ in range(10)], index=range(10, 20))])
int_col = pd.Series(np.random.randint(1, 100, 20))
nan_col = pd.Series([np.NaN for _ in range(20)])

Expand Down Expand Up @@ -168,14 +167,13 @@ def test_invalid_flag_combination(self):

class SeriesToStrTestCases(unittest.TestCase):
def setUp(self):
self.float_col = pd.Series(np.random.randn(10)).append(
pd.Series([np.NaN for _ in range(10)], index=range(10, 20)))
self.float_col_with_int_val = pd.Series(
np.random.randint(1, 100, 10)).append(
pd.Series([np.NaN for _ in range(10)], index=range(10, 20)))
self.str_col = pd.Series([random.choice(string.ascii_lowercase)
for _ in range(10)]).append(
pd.Series([np.NaN for _ in range(10)], index=range(10, 20)))
self.float_col = pd.concat([pd.Series(np.random.randn(10)),
pd.Series([np.NaN for _ in range(10)], index=range(10, 20))])
self.float_col_with_int_val = pd.concat([pd.Series(np.random.randint(1, 100, 10)),
pd.Series([np.NaN for _ in range(10)], index=range(10, 20))])
self.str_col = pd.concat([pd.Series([random.choice(string.ascii_lowercase)
for _ in range(10)]),
pd.Series([np.NaN for _ in range(10)], index=range(10, 20))])
self.int_col = pd.Series(np.random.randint(1, 100, 20))
self.nan_col = pd.Series([np.NaN for _ in range(20)])

Expand Down Expand Up @@ -213,7 +211,7 @@ def test_float_col_with_int_val(self):
self.assertEqual(self.float_col_with_int_val.dtype, float)
self.assertEqual(sum(pd.isnull(self.float_col_with_int_val)),
sum(pd.isnull(out_series)))
for idx, val in self.float_col_with_int_val.iteritems():
for idx, val in self.float_col_with_int_val.items():
if pd.isnull(val):
continue
self.assertEqual(str(int(val)), out_series.loc[idx])
Expand Down
6 changes: 3 additions & 3 deletions py_stringsimjoin/tests/test_disk_edit_dist_join.py
Expand Up @@ -65,9 +65,9 @@ def test_valid_join(scenario, tok, threshold,comp_op=DEFAULT_COMP_OP, args=(),
rtable_not_missing[[r_key_attr,
r_join_attr,
'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
ltable_not_missing.drop('tmp_join_key', 1)
rtable_not_missing.drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)
ltable_not_missing.drop('tmp_join_key', axis=1)
rtable_not_missing.drop('tmp_join_key', axis=1)

sim_measure_type = 'EDIT_DISTANCE'
sim_func = get_sim_function(sim_measure_type)
Expand Down
6 changes: 3 additions & 3 deletions py_stringsimjoin/tests/test_edit_dist_join.py
Expand Up @@ -60,9 +60,9 @@ def test_valid_join(scenario, tok, threshold, comp_op=DEFAULT_COMP_OP, args=(),
rtable_not_missing[[r_key_attr,
r_join_attr,
'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
ltable_not_missing.drop('tmp_join_key', 1)
rtable_not_missing.drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)
ltable_not_missing.drop('tmp_join_key', axis=1)
rtable_not_missing.drop('tmp_join_key', axis=1)

sim_measure_type = 'EDIT_DISTANCE'
sim_func = get_sim_function(sim_measure_type)
Expand Down
6 changes: 3 additions & 3 deletions py_stringsimjoin/tests/test_join.py
Expand Up @@ -75,9 +75,9 @@ def test_valid_join(scenario, sim_measure_type, args, convert_to_str=False):
rtable_not_missing[[r_key_attr,
r_join_attr,
'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
ltable_not_missing.drop('tmp_join_key', 1)
rtable_not_missing.drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)
ltable_not_missing.drop('tmp_join_key', axis=1)
rtable_not_missing.drop('tmp_join_key', axis=1)

sim_func = get_sim_function(sim_measure_type)

Expand Down
8 changes: 4 additions & 4 deletions py_stringsimjoin/tests/test_overlap_filter.py
Expand Up @@ -241,7 +241,7 @@ def setUp(self):
self.B['tmp_join_key'] = 1
self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
self.B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)

self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
Expand Down Expand Up @@ -292,7 +292,7 @@ def test_candset_with_join_attr_of_type_int(self):
B['tmp_join_key'] = 1
C = pd.merge(A[['l_id', 'tmp_join_key']],
B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)

qg2_tok = QgramTokenizer(2, return_set=True)
expected_pairs = set(['1,2', '1,3', '2,1', '2,4', '2,5',
Expand Down Expand Up @@ -409,7 +409,7 @@ def test_candset_with_numeric_l_filter_attr(self):
B['tmp_join_key'] = 1
C = pd.merge(A[['l_id', 'tmp_join_key']],
B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)

qg2_tok = QgramTokenizer(2, return_set=True)
overlap_filter = OverlapFilter(qg2_tok)
Expand All @@ -426,7 +426,7 @@ def test_candset_with_numeric_r_filter_attr(self):
B['tmp_join_key'] = 1
C = pd.merge(A[['l_id', 'tmp_join_key']],
B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)

qg2_tok = QgramTokenizer(2, return_set=True)
overlap_filter = OverlapFilter(qg2_tok)
Expand Down
2 changes: 1 addition & 1 deletion py_stringsimjoin/tests/test_position_filter.py
Expand Up @@ -370,7 +370,7 @@ def setUp(self):
self.B['tmp_join_key'] = 1
self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
self.B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)

self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
Expand Down
2 changes: 1 addition & 1 deletion py_stringsimjoin/tests/test_prefix_filter.py
Expand Up @@ -372,7 +372,7 @@ def setUp(self):
self.B['tmp_join_key'] = 1
self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
self.B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)

self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
Expand Down
2 changes: 1 addition & 1 deletion py_stringsimjoin/tests/test_size_filter.py
Expand Up @@ -380,7 +380,7 @@ def setUp(self):
self.B['tmp_join_key'] = 1
self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
self.B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)

self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
Expand Down
2 changes: 1 addition & 1 deletion py_stringsimjoin/tests/test_suffix_filter.py
Expand Up @@ -376,7 +376,7 @@ def setUp(self):
self.B['tmp_join_key'] = 1
self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
self.B[['r_id', 'tmp_join_key']],
on='tmp_join_key').drop('tmp_join_key', 1)
on='tmp_join_key').drop('tmp_join_key', axis=1)

self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
Expand Down
2 changes: 1 addition & 1 deletion py_stringsimjoin/utils/generic_helper.py
Expand Up @@ -73,7 +73,7 @@ def convert_dataframe_to_list(table, join_attr_index,
def convert_dataframe_to_array(dataframe, proj_attrs, join_attr,
remove_nan=True):
if remove_nan:
projected_dataframe = dataframe[proj_attrs].dropna(0,
projected_dataframe = dataframe[proj_attrs].dropna(axis=0,
subset=[join_attr])
else:
projected_dataframe = dataframe[proj_attrs]
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Expand Up @@ -2,4 +2,4 @@ pandas>=0.16.0
six
joblib
pyprind>=2.9.3
py_stringmatching>=0.2.1
py-stringmatching>=0.2.1
22 changes: 12 additions & 10 deletions setup.py
Expand Up @@ -67,16 +67,17 @@ def build_extensions(self):


def generate_cython():
cwd = os.path.abspath(os.path.dirname(__file__))
print("Cythonizing sources")
p = subprocess.call([sys.executable, os.path.join(cwd,
'build_tools',
'cythonize.py'),
'py_stringsimjoin'],
cwd=cwd)
if p != 0:
from Cython.Build import cythonize

module_list = [MODULES[key]['sources'][0] for key in MODULES.keys()]
print(module_list)

p = cythonize(module_list)

if not p:
raise RuntimeError("Running cythonize failed!")


MODULES = {
"py_stringsimjoin.index.inverted_index_cy": {'sources':["py_stringsimjoin/index/inverted_index_cy.pyx"],
'comargs':[]
Expand Down Expand Up @@ -213,8 +214,8 @@ def setup_package():

cmdclass = {"build_ext": build_ext}
setuptools.setup(
name='py_stringsimjoin',
version='0.3.3',
name='py-stringsimjoin',
version='0.3.4',
description='Python library for performing string similarity joins.',
long_description=LONG_DESCRIPTION,
url='https://sites.google.com/site/anhaidgroup/projects/magellan/py_stringsimjoin',
Expand All @@ -239,6 +240,7 @@ def setup_package():
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
'Topic :: Scientific/Engineering',
'Topic :: Utilities',
'Topic :: Software Development :: Libraries',
Expand Down

0 comments on commit 2195839

Please sign in to comment.