diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6cf74a9ab4..9df3974b79 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v2 @@ -71,20 +71,30 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' + - name: Install dependencies + run: | + pip install poetry + pip install -e '.[dev]' pytest-xdist pip + if: matrix.python-version != '3.9' && matrix.python-version != '3.10' && matrix.python-version != '3.11' && matrix.python-version != '3.12' +# uv venv +# source .venv/bin/activate +# uv pip install -e '.[dev]' pytest-xdist pip + - name: Install dependencies run: | pip install poetry pip install -e '.[dev,setup]' pytest-xdist pip - if: matrix.python-version != '3.10' + if: matrix.python-version == '3.9' # uv venv # source .venv/bin/activate -# uv pip install -e '.[dev,setup]' pytest-xdist pip +# uv pip install -e '.[dev]' pytest-xdist pip - name: Install dependencies run: | pip install poetry - pip install -e '.[dev-no-ml,setup]' pytest-xdist pip - if: matrix.python-version == '3.10' + pip install -e '.[dev-no-ml]' pytest-xdist pip + # skip ML tests for 3.10, 3.11 and 3.12 + if: matrix.python-version == '3.10' || matrix.python-version == '3.11' || matrix.python-version == '3.12' - name: Test with Pytest on Python ${{ matrix.python-version }} env: diff --git a/changelog.md b/changelog.md index 41aad3cee4..9bbc576346 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,16 @@ # Changelog +## Unreleased + +### Added + +- Support for numpy>2.0, and formal support for Python 3.11 and Python 3.12 + +### Fixed + +- `edsnlp.package` now correctly detects if a project uses an old-style poetry pyproject or a PEP621 pyproject.toml. 
+- PEP621 projects containing nested directories (e.g., "my_project/pipes/foo.py") are now supported. + ## v0.16.0 (2025-0.3-26) ### Added diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py index d92f999251..1c71e91fb9 100644 --- a/edsnlp/core/pipeline.py +++ b/edsnlp/core/pipeline.py @@ -102,7 +102,7 @@ def __init__( create_tokenizer: Optional[Callable[[Self], Tokenizer]] = None, vocab: Union[bool, Vocab] = True, batch_size: Optional[int] = None, - vocab_config: Type[BaseDefaults] = None, + vocab_config: Optional[Type[BaseDefaults]] = None, meta: Dict[str, Any] = None, pipeline: Optional[Sequence[str]] = None, components: Dict[str, Any] = {}, diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py index 4cca047b00..c1247a14d0 100644 --- a/edsnlp/data/converters.py +++ b/edsnlp/data/converters.py @@ -89,7 +89,9 @@ def validate_kwargs(func, kwargs): model = vd.init_model_instance( **{k: v for k, v in kwargs.items() if k in spec.args} ) - fields = model.__fields__ if pydantic.__version__ < "2" else model.model_fields + fields = ( + model.__fields__ if pydantic.__version__ < "2" else vd.model.model_fields + ) d = { k: v for k, v in model.__dict__.items() diff --git a/edsnlp/package.py b/edsnlp/package.py index 375a16905e..3c0c837d08 100644 --- a/edsnlp/package.py +++ b/edsnlp/package.py @@ -6,15 +6,7 @@ import tempfile import warnings from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Mapping, - Optional, - Sequence, - Union, -) +from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence, Union import build import confit @@ -288,7 +280,7 @@ def make_src_dir(self): logger.info(f"SKIP {rel}") -class PoetryPackager(Packager): +class OldStylePoetryPackager(Packager): def __init__( self, *, @@ -326,7 +318,13 @@ def __init__( "requires": ["hatchling"], "build-backend": "hatchling.build", }, - "tool": {"hatch": {"build": {}}}, + "tool": { + "hatch": { + "build": {}, + # in case the user provides a git 
dependency for example + "metadata": {"allow-direct-references": True}, + } + }, "project": { "name": model_package, "version": version, @@ -458,7 +456,7 @@ def __init__( ) -class SetuptoolsPackager(Packager): +class StandardPackager(Packager): def __init__( self, *, @@ -499,7 +497,13 @@ def __init__( "requires": ["hatchling"], "build-backend": "hatchling.build", }, - "tool": {"hatch": {"build": {}}}, + "tool": { + "hatch": { + "build": {}, + # in case the user provides a git dependency for example + "metadata": {"allow-direct-references": True}, + }, + }, "project": { "name": model_package, "version": version, @@ -523,7 +527,10 @@ def __init__( packages = sorted([p for p in packages if p]) file_paths = [] for package in packages: - file_paths.extend((root_dir / package).rglob("*")) + for path in (root_dir / package).rglob("*"): + if "__pycache__" in path.parts or path.is_dir(): + continue + file_paths.append(path) new_pyproject["tool"]["hatch"]["build"] = { "packages": [*packages, artifacts_name], @@ -570,7 +577,7 @@ def package( dist_dir: Path = Path("dist"), artifacts_name: ModuleName = "artifacts", check_dependencies: bool = False, - project_type: Optional[Literal["poetry", "setuptools"]] = None, + project_type: Optional[str] = None, version: Optional[str] = None, metadata: Optional[Dict[str, Any]] = {}, distributions: Optional[AsList[Literal["wheel", "sdist"]]] = ["wheel"], @@ -601,21 +608,26 @@ def package( if pyproject_path.exists(): pyproject = toml.loads((root_dir / "pyproject.toml").read_text()) - package_managers = {"setuptools", "poetry", "hatch", "pdm"} & set( - (pyproject or {}).get("tool", {}) - ) - package_managers = package_managers or {"setuptools"} # default + try: + _ = pyproject["tool"]["poetry"]["name"] + inferred_project_type = "old-style-poetry" + except (KeyError, TypeError): + inferred_project_type = "standard" + try: if project_type is None: - [project_type] = package_managers + project_type = inferred_project_type packager_cls = { - 
"poetry": PoetryPackager, - "setuptools": SetuptoolsPackager, + "old-style-poetry": OldStylePoetryPackager, + "standard": StandardPackager, + # for backward compatibility + "poetry": OldStylePoetryPackager, + "setuptools": StandardPackager, }[project_type] except Exception: # pragma: no cover raise ValueError( - "Could not infer project type, only poetry and setuptools based projects " - "are supported for now" + f"Could not process project type {project_type!r} only old-style poetry " + f"and PEP 621 pyproject.toml formats are supported for now." ) packager = packager_cls( pyproject=pyproject, diff --git a/edsnlp/pipes/core/endlines/model.py b/edsnlp/pipes/core/endlines/model.py index 86fb00eb53..0fcabac1f9 100644 --- a/edsnlp/pipes/core/endlines/model.py +++ b/edsnlp/pipes/core/endlines/model.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -from numpy.lib.function_base import iterable from pandas.api.types import CategoricalDtype from pandas.core.groupby import DataFrameGroupBy from spacy.strings import StringStore @@ -239,9 +238,9 @@ def _convert_A(self, df: pd.DataFrame, col: str) -> pd.DataFrame: df[new_col] = df[col].astype(cat_type_A) df[new_col] = df[new_col].cat.codes # Ensure that not known values are coded as OTHER - df.loc[ - ~df[col].isin(self.vocabulary["A3A4"].keys()), new_col - ] = self.vocabulary["A3A4"]["OTHER"] + df.loc[~df[col].isin(self.vocabulary["A3A4"].keys()), new_col] = ( + self.vocabulary["A3A4"]["OTHER"] + ) return df def _convert_B(self, df: pd.DataFrame, col: str) -> pd.DataFrame: @@ -594,7 +593,7 @@ def _retrieve_lines(cls, dfg: DataFrameGroupBy) -> DataFrameGroupBy: return dfg @classmethod - def _create_vocabulary(cls, x: iterable) -> dict: + def _create_vocabulary(cls, x: Iterable) -> dict: """Function to create a vocabulary for attributes in the training set. 
Parameters diff --git a/edsnlp/pipes/misc/dates/models.py b/edsnlp/pipes/misc/dates/models.py index 8af090bb2b..506af3f958 100644 --- a/edsnlp/pipes/misc/dates/models.py +++ b/edsnlp/pipes/misc/dates/models.py @@ -4,12 +4,24 @@ import pydantic from pandas._libs.tslibs.nattype import NaTType -from pydantic import BaseModel, Field, root_validator, validator +from pydantic import BaseModel, Field from pytz import timezone from spacy.tokens import Span from edsnlp.pipes.misc.dates.patterns.relative import specific_dict +try: + from pydantic import field_validator, model_validator + + def validator(x, allow_reuse=True, pre=False): + return field_validator(x, mode="before" if pre else "after") + + def root_validator(allow_reuse=True, pre=False): + return model_validator(mode="before" if pre else "after") + +except ImportError: + from pydantic import root_validator, validator + class Direction(str, Enum): FUTURE = "future" diff --git a/edsnlp/pipes/ner/scores/charlson/factory.py b/edsnlp/pipes/ner/scores/charlson/factory.py index 0faa1b3736..276580fbf2 100644 --- a/edsnlp/pipes/ner/scores/charlson/factory.py +++ b/edsnlp/pipes/ner/scores/charlson/factory.py @@ -24,7 +24,6 @@ "eds.charlson", assigns=["doc.ents", "doc.spans"], deprecated=[ - "eds.charlson", "charlson", ], ) diff --git a/edsnlp/tune.py b/edsnlp/tune.py index 20a21e33da..1dd6188e39 100644 --- a/edsnlp/tune.py +++ b/edsnlp/tune.py @@ -10,6 +10,7 @@ import joblib import optuna import optuna.visualization as vis +import pydantic from configobj import ConfigObj from confit import Cli, Config from confit.utils.collections import split_path @@ -49,6 +50,9 @@ class HyperparameterConfig(BaseModel): class Config: extra = "forbid" + if pydantic.VERSION < "2": + model_dump = BaseModel.dict + def to_dict(self) -> dict: """ Convert the hyperparameter configuration to a dictionary. @@ -57,7 +61,7 @@ def to_dict(self) -> dict: Returns: dict: A dictionary representation of the hyperparameter configuration. 
""" - return self.dict(exclude_unset=True, exclude_defaults=True) + return self.model_dump(exclude_unset=True, exclude_defaults=True) def setup_logging(): @@ -598,7 +602,7 @@ def tune( output_dir: str, checkpoint_dir: str, gpu_hours: confloat(gt=0) = DEFAULT_GPU_HOUR, - n_trials: conint(gt=0) = None, + n_trials: Optional[conint(gt=0)] = None, two_phase_tuning: bool = False, seed: int = 42, metric="ner.micro.f", diff --git a/pyproject.toml b/pyproject.toml index f0d016bd5b..fb7db54584 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,13 +13,16 @@ dependencies = [ "pytz", "pysimstring>=1.2.1", "regex", - "spacy>=3.2,<3.8", - "thinc<8.2.5", # we don't need thinc but spacy depdends on it 8.2.5 cause binary issues + # spacy doesn't provide binaries for python<3.9 from 3.8.2 so we need to cap it ourself + "spacy>=3.2,<3.8.2; python_version<'3.9'", + "spacy>=3.8.5,<4.0.0; python_version>='3.9'", + # thinc doesn't provide binaries for python<3.9 from 8.2.5 so we need to cap it ourself + "thinc<8.2.5; python_version<'3.9'", + "thinc>=8.2.5; python_version>='3.9'", "confit>=0.7.3", "tqdm", "umls-downloader>=0.1.1", - "numpy>=1.15.0,<1.23.2; python_version<'3.8'", - "numpy>=1.15.0,<2.0.0; python_version>='3.8'", + "numpy>=1.15.0", "pandas>=1.1.0; python_version<'3.8'", "pandas>=1.4.0; python_version>='3.8'", "typing-extensions>=4.0.0", @@ -49,8 +52,7 @@ dev-no-ml = [ "pyspark", "polars", - "mlconjug3<3.9.0", - "scikit-learn>=1.0.0", + "scikit-learn", "edsnlp[docs-no-ml]", ] @@ -75,7 +77,7 @@ docs-no-ml = [ ml = [ "rich-logger>=0.3.1", "torch>=1.13.0", - "foldedtensor>=0.3.4", + "foldedtensor>=0.4.0", "safetensors>=0.3.0; python_version>='3.8'", "safetensors>=0.3.0,<0.5.0; python_version<'3.8'", "transformers>=4.0.0,<5.0.0", @@ -92,10 +94,11 @@ dev = [ "plotly>=5.18.0", # required by optuna viz "ruamel.yaml>=0.18.0", "configobj>=5.0.9", - + "scikit-learn", ] setup = [ - "typer" + "mlconjug3<3.9.0", # bug https://github.com/Ars-Linguistica/mlconjug3/pull/506 + "numpy<2", 
# mlconjug has a scikit-learn dependency, which doesn't support numpy 2 yet ] [project.urls] @@ -312,7 +315,11 @@ where = ["."] requires = [ "setuptools", "cython>=0.25", - "spacy>=3.2,<3.8", + "spacy>=3.2,!=3.8.2; python_version<'3.9'", + "spacy>=3.2,!=3.8.2,<4.0.0; python_version>='3.9'", + # thinc doesn't provide binaries for python<3.9 from 8.2.5 so we need to cap it ourselves + "thinc<8.2.5; python_version<'3.9'", + "thinc>=8.2.5; python_version>='3.9'", # to update from https://github.com/scipy/oldest-supported-numpy/blob/main/setup.cfg # while setting numpy >= 1.15.0 due to spacy reqs "numpy==1.15.0; python_version=='3.7' and platform_machine not in 'arm64|aarch64|loongarch64' and platform_system!='AIX' and platform_python_implementation != 'PyPy'", @@ -324,19 +331,12 @@ requires = [ "numpy==1.19.0; python_version=='3.6' and platform_machine!='loongarch64' and platform_python_implementation=='PyPy'", "numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64' and platform_system!='AIX' and platform_python_implementation != 'PyPy'", "numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' and platform_python_implementation != 'PyPy'", - "numpy==1.19.3; python_version=='3.9' and platform_machine=='arm64' and platform_system=='Windows' and platform_python_implementation != 'PyPy'", - "numpy==1.19.3; python_version=='3.9' and platform_system not in 'OS400' and platform_machine not in 'arm64|loongarch64' and platform_python_implementation != 'PyPy'", "numpy==1.20.0; python_version=='3.7' and platform_machine!='loongarch64' and platform_python_implementation=='PyPy'", "numpy==1.21.0; python_version=='3.7' and platform_machine=='arm64' and platform_system=='Darwin' and platform_python_implementation!='PyPy'", "numpy==1.21.0; python_version=='3.8' and platform_machine=='arm64' and platform_system=='Darwin' and platform_python_implementation!='PyPy'", - "numpy==1.21.0; python_version=='3.9' and platform_machine=='arm64' and platform_system=='Darwin' 
and platform_python_implementation!='PyPy'", - "numpy==1.21.6; python_version=='3.10' and platform_machine!='loongarch64'", - "numpy==1.22.2; platform_machine=='loongarch64' and python_version>='3.8' and python_version<'3.11' and platform_python_implementation!='PyPy'", + "numpy==1.22.2; python_version>='3.8' and python_version<'3.9' and platform_machine=='loongarch64' and platform_python_implementation!='PyPy'", "numpy==1.22.2; python_version=='3.8' and platform_machine!='loongarch64' and platform_python_implementation=='PyPy'", - "numpy==1.23.2; python_version=='3.11'", - "numpy==1.23.3; python_version=='3.9' and platform_system=='OS400' and platform_machine!='loongarch64' and platform_python_implementation!='PyPy'", - "numpy==1.25.0; python_version=='3.9' and platform_python_implementation=='PyPy'", - "numpy==1.26.1; python_version=='3.12'", + "numpy>=2.0; python_version>='3.9'", ] build-backend = "setuptools.build_meta" diff --git a/tests/tuning/config.yml b/tests/tuning/config.yml index 08a3896ebe..e38655cd4e 100644 --- a/tests/tuning/config.yml +++ b/tests/tuning/config.yml @@ -125,3 +125,4 @@ train: scorer: ${ scorer } num_workers: 0 optimizer: ${ optimizer } + cpu: true diff --git a/tests/utils/test_package.py b/tests/utils/test_package.py index cb153234d5..b4290402bc 100644 --- a/tests/utils/test_package.py +++ b/tests/utils/test_package.py @@ -48,6 +48,7 @@ def test_package_with_files(nlp, tmp_path, package_name, manager): ((tmp_path / "test_model").mkdir(parents=True)) (tmp_path / "test_model" / "__init__.py").write_text('print("Hello World!")\n') + (tmp_path / "test_model" / "empty_folder").mkdir() (tmp_path / "README.md").write_text( """\