From da938acff3cc933eefdfc5fe5e9f8be4bc5897ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Tue, 1 Apr 2025 21:49:30 +0200 Subject: [PATCH 1/3] fix: add missing type hints and suppress pydantic warnings --- edsnlp/core/pipeline.py | 2 +- edsnlp/data/converters.py | 4 +- edsnlp/package.py | 460 ++++---------------- edsnlp/pipes/misc/dates/models.py | 14 +- edsnlp/pipes/ner/scores/charlson/factory.py | 1 - edsnlp/tune.py | 8 +- 6 files changed, 118 insertions(+), 371 deletions(-) diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py index d92f999251..1c71e91fb9 100644 --- a/edsnlp/core/pipeline.py +++ b/edsnlp/core/pipeline.py @@ -102,7 +102,7 @@ def __init__( create_tokenizer: Optional[Callable[[Self], Tokenizer]] = None, vocab: Union[bool, Vocab] = True, batch_size: Optional[int] = None, - vocab_config: Type[BaseDefaults] = None, + vocab_config: Optional[Type[BaseDefaults]] = None, meta: Dict[str, Any] = None, pipeline: Optional[Sequence[str]] = None, components: Dict[str, Any] = {}, diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py index 4cca047b00..c1247a14d0 100644 --- a/edsnlp/data/converters.py +++ b/edsnlp/data/converters.py @@ -89,7 +89,9 @@ def validate_kwargs(func, kwargs): model = vd.init_model_instance( **{k: v for k, v in kwargs.items() if k in spec.args} ) - fields = model.__fields__ if pydantic.__version__ < "2" else model.model_fields + fields = ( + model.__fields__ if pydantic.__version__ < "2" else vd.model.model_fields + ) d = { k: v for k, v in model.__dict__.items() diff --git a/edsnlp/package.py b/edsnlp/package.py index 375a16905e..ea7f877a28 100644 --- a/edsnlp/package.py +++ b/edsnlp/package.py @@ -1,20 +1,11 @@ import os import re import shutil -import subprocess import sys import tempfile import warnings from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Dict, - Mapping, - Optional, - Sequence, - Union, -) +from typing import TYPE_CHECKING, Any, Dict, Mapping, 
Optional, Sequence, Union import build import confit @@ -23,27 +14,11 @@ from build.__main__ import build_package, build_package_via_sdist from confit import Cli from loguru import logger -from typing_extensions import Literal, TypedDict +from typing_extensions import Literal import edsnlp from edsnlp.utils.typing import AsList, Validated -PoetryConstraint = TypedDict( - "PoetryConstraint", - { - "version": str, - "extras": Optional[Sequence[str]], - "markers": Optional[str], - "url": Optional[str], - "path": Optional[str], - "git": Optional[str], - "ref": Optional[str], - "branch": Optional[str], - "tag": Optional[str], - }, - total=False, -) - logger.remove() logger.add( sys.stdout, @@ -83,41 +58,6 @@ def validate(cls, value, config=None): if TYPE_CHECKING: ModuleName = str # noqa F811 -POETRY_SNIPPET = """\ -from poetry.core.masonry.builders.sdist import SdistBuilder -from poetry.factory import Factory -try: - from poetry.core.masonry.utils.module import ModuleOrPackageNotFound -except ImportError: - from poetry.core.masonry.utils.module import ModuleOrPackageNotFoundError as ModuleOrPackageNotFound -import sys -# Initialize the Poetry object for the current project -poetry = Factory().create_poetry("__root_dir__") - -# Initialize the builder -try: - builder = SdistBuilder(poetry) - # Get the list of files to include - files = builder.find_files_to_add() -except ModuleOrPackageNotFound: - if not poetry.package.packages: - print([]) - sys.exit(0) - -print([ - {k: v for k, v in { - "include": getattr(include, '_include'), - "from": getattr(include, 'source', None), - "formats": getattr(include, 'formats', None), - }.items() if v} - for include in builder._module.includes -]) - - -# Print the list of files -for file in files: - print(file.path) -""" # noqa E501 INIT_PY = """ # ----------------------------------------- @@ -173,13 +113,87 @@ def __init__( build_dir: Optional[Path] = None, dist_dir: Path, artifacts_name: ModuleName, + metadata: Optional[Dict[str, 
Any]] = {}, exclude: AsList[str], readme_replacements: Dict[str, str] = {}, - file_paths: Sequence[Path], ): - self.name = name + try: + version = version or pyproject["project"]["version"] + except (KeyError, TypeError): + version = "0.1.0" + name = name or pyproject["project"]["name"] + if pyproject is not None: + main_package = snake_case(pyproject["project"]["name"].lower()) + else: + main_package = None + model_package = snake_case(name.lower()) + + root_dir = root_dir.resolve() + dist_dir = dist_dir if Path(dist_dir).is_absolute() else root_dir / dist_dir + + build_dir = Path(tempfile.mkdtemp()) if build_dir is None else build_dir + + new_pyproject: confit.Config = confit.Config() + if pyproject is not None: + new_pyproject["project"] = pyproject["project"] + new_pyproject = new_pyproject.merge( + { + "build-system": { + "requires": ["hatchling"], + "build-backend": "hatchling.build", + }, + "tool": {"hatch": {"build": {}}}, + "project": { + "name": model_package, + "version": version, + "requires-python": ">=3.7", + }, + } + ) + + try: + find = dict(pyproject["tool"].pop("setuptools", {})["packages"]["find"]) + except Exception: + find = {} + where = find.pop("where", ["."]) + where = [where] if not isinstance(where, list) else where + packages = {main_package, model_package} + for w in where: + # TODO Should we handle namespaces ? 
+ # if find.pop("namespace", None) is not None: + # packages.extend(setuptools.find_namespace_packages(**find)) + packages.update(setuptools.find_packages(w, **find)) + packages = sorted([p for p in packages if p]) + file_paths = [] + for package in packages: + for path in (root_dir / package).rglob("*"): + if "__pycache__" in path.parts or path.is_dir(): + continue + file_paths.append(path) + + new_pyproject["tool"]["hatch"]["build"] = { + "packages": [*packages, artifacts_name], + "exclude": ["__pycache__/", "*.pyc", "*.pyo", ".ipynb_checkpoints"], + "artifacts": [artifacts_name], + "targets": { + "wheel": { + "sources": { + f"{artifacts_name}": f"{model_package}/{artifacts_name}" + }, + }, + }, + } + + if "authors" in metadata: + metadata["authors"] = parse_authors(metadata["authors"]) + metadata["name"] = model_package + metadata["version"] = version + + pyproject = new_pyproject.merge({"project": metadata}) + + self.name = model_package self.version = version - assert name == pyproject["project"]["name"] + assert model_package == pyproject["project"]["name"] assert version == pyproject["project"]["version"] self.root_dir = root_dir.resolve() self.pipeline = pipeline @@ -195,7 +209,7 @@ def __init__( logger.info(f"root_dir: {root_dir}") logger.info(f"artifacts_name: {artifacts_name}") - logger.info(f"name: {name}") + logger.info(f"name: {model_package}") def build( self, @@ -220,15 +234,6 @@ def build( skip_dependency_check=skip_dependency_check, ) - # def update_pyproject(self): - # # Adding artifacts to include in pyproject.toml - # snake_name = snake_case(self.name.lower()) - # included = self.pyproject["tool"]["poetry"].setdefault("include", []) - # included.append(f"{snake_name}/{self.artifacts_name}/**") - # packages = list(self.packages) - # packages.append({"include": snake_name}) - # self.pyproject["tool"]["poetry"]["packages"] = packages - def make_src_dir(self): snake_name = snake_case(self.name.lower()) package_dir = self.build_dir / snake_name @@ 
-288,278 +293,6 @@ def make_src_dir(self): logger.info(f"SKIP {rel}") -class PoetryPackager(Packager): - def __init__( - self, - *, - name: ModuleName, - pyproject: Optional[Dict[str, Any]], - pipeline: Union[Path, "edsnlp.Pipeline"], - version: Optional[str], - root_dir: Path = ".", - build_dir: Optional[Path] = None, - dist_dir: Path, - artifacts_name: ModuleName, - metadata: Optional[Dict[str, Any]] = {}, - exclude: AsList[str], - readme_replacements: Dict[str, str] = {}, - ): - try: - version = version or pyproject["tool"]["poetry"]["version"] - except (KeyError, TypeError): # pragma: no cover - version = "0.1.0" - name = name or pyproject["tool"]["poetry"]["name"] - main_package = ( - snake_case(pyproject["tool"]["poetry"]["name"].lower()) - if pyproject is not None - else None - ) - model_package = snake_case(name.lower()) - - root_dir = root_dir.resolve() - dist_dir = dist_dir if Path(dist_dir).is_absolute() else root_dir / dist_dir - - build_dir = Path(tempfile.mkdtemp()) if build_dir is None else build_dir - - new_pyproject: Dict[str, Any] = { - "build-system": { - "requires": ["hatchling"], - "build-backend": "hatchling.build", - }, - "tool": {"hatch": {"build": {}}}, - "project": { - "name": model_package, - "version": version, - "requires-python": ">=3.7", - }, - } - file_paths = [] - - if pyproject is not None: - poetry = pyproject["tool"]["poetry"] - - # Extract packages - poetry_bin_path = ( - subprocess.run(["which", "poetry"], stdout=subprocess.PIPE) - .stdout.decode() - .strip() - ) - python_executable = Path(poetry_bin_path).read_text().split("\n")[0][2:] - result = subprocess.run( - [ - *python_executable.split(), - "-c", - POETRY_SNIPPET.replace("__root_dir__", str(root_dir)), - ], - stdout=subprocess.PIPE, - cwd=root_dir, - ) - if result.returncode != 0: - raise Exception() - out = result.stdout.decode().strip().split("\n") - file_paths = [root_dir / file_path for file_path in out[1:]] - packages = { - main_package, - model_package, - 
*(package["include"] for package in eval(out[0])), - } - packages = sorted([p for p in packages if p]) - new_pyproject["tool"]["hatch"]["build"] = { - "packages": [*packages, artifacts_name], - "exclude": ["__pycache__/", "*.pyc", "*.pyo", ".ipynb_checkpoints"], - "artifacts": [artifacts_name], - "targets": { - "wheel": { - "sources": { - f"{artifacts_name}": f"{model_package}/{artifacts_name}" - }, - }, - }, - } - if "description" in poetry: # pragma: no cover - new_pyproject["project"]["description"] = poetry["description"] - if "classifiers" in poetry: # pragma: no cover - new_pyproject["project"]["classifiers"] = poetry["classifiers"] - if "keywords" in poetry: # pragma: no cover - new_pyproject["project"]["keywords"] = poetry["keywords"] - if "license" in poetry: # pragma: no cover - new_pyproject["project"]["license"] = {"text": poetry["license"]} - if "readme" in poetry: # pragma: no cover - new_pyproject["project"]["readme"] = poetry["readme"] - if "authors" in poetry: # pragma: no cover - new_pyproject["project"]["authors"] = parse_authors(poetry["authors"]) - if "plugins" in poetry: # pragma: no cover - new_pyproject["project"]["entry-points"] = poetry["plugins"] - if "scripts" in poetry: # pragma: no cover - new_pyproject["project"]["scripts"] = poetry["scripts"] - - # Dependencies - deps = [] - poetry_deps = poetry["dependencies"] - for dep_name, constraint in poetry_deps.items(): - dep = dep_name - constraint: PoetryConstraint = ( - dict(constraint) - if isinstance(constraint, dict) - else {"version": constraint} - ) - try: - dep += f"[{','.join(constraint.pop('extras'))}]" - except KeyError: - pass - if "version" in constraint: - dep_version = constraint.pop("version") - assert not dep_version.startswith( - "^" - ), "Packaging models with ^ dependencies is not supported" - dep += ( - "" - if dep_version == "*" - else dep_version - if not dep_version[0].isdigit() - else f"=={dep_version}" - ) - try: - dep += f"; {constraint.pop('markers')}" - except 
KeyError: - pass - assert ( - not constraint - ), f"Unsupported constraints for dependency {dep_name}: {constraint}" - if dep_name == "python": - new_pyproject["project"]["requires-python"] = dep.replace( - "python", "" - ) - continue - deps.append(dep) - - new_pyproject["project"]["dependencies"] = deps - - if "authors" in metadata: - metadata["authors"] = parse_authors(metadata["authors"]) - metadata["name"] = model_package - metadata["version"] = version - - new_pyproject = confit.Config(new_pyproject).merge({"project": metadata}) - - # Use hatch - super().__init__( - name=model_package, - pyproject=new_pyproject, - pipeline=pipeline, - version=version, - root_dir=root_dir, - build_dir=build_dir, - dist_dir=dist_dir, - artifacts_name=artifacts_name, - exclude=exclude, - readme_replacements=readme_replacements, - file_paths=file_paths, - ) - - -class SetuptoolsPackager(Packager): - def __init__( - self, - *, - name: ModuleName, - pyproject: Optional[Dict[str, Any]], - pipeline: Union[Path, "edsnlp.Pipeline"], - version: Optional[str], - root_dir: Path = ".", - build_dir: Optional[Path] = None, - dist_dir: Path, - artifacts_name: ModuleName, - metadata: Optional[Dict[str, Any]] = {}, - exclude: AsList[str], - readme_replacements: Dict[str, str] = {}, - ): - try: - version = version or pyproject["project"]["version"] - except (KeyError, TypeError): - version = "0.1.0" - name = name or pyproject["project"]["name"] - if pyproject is not None: - main_package = snake_case(pyproject["project"]["name"].lower()) - else: - main_package = None - model_package = snake_case(name.lower()) - - root_dir = root_dir.resolve() - dist_dir = dist_dir if Path(dist_dir).is_absolute() else root_dir / dist_dir - - build_dir = Path(tempfile.mkdtemp()) if build_dir is None else build_dir - - new_pyproject: confit.Config = confit.Config() - if pyproject is not None: - new_pyproject["project"] = pyproject["project"] - new_pyproject = new_pyproject.merge( - { - "build-system": { - "requires": 
["hatchling"], - "build-backend": "hatchling.build", - }, - "tool": {"hatch": {"build": {}}}, - "project": { - "name": model_package, - "version": version, - "requires-python": ">=3.7", - }, - } - ) - - try: - find = dict(pyproject["tool"].pop("setuptools", {})["packages"]["find"]) - except Exception: - find = {} - where = find.pop("where", ["."]) - where = [where] if not isinstance(where, list) else where - packages = {main_package, model_package} - for w in where: - # TODO Should we handle namespaces ? - # if find.pop("namespace", None) is not None: - # packages.extend(setuptools.find_namespace_packages(**find)) - packages.update(setuptools.find_packages(w, **find)) - packages = sorted([p for p in packages if p]) - file_paths = [] - for package in packages: - file_paths.extend((root_dir / package).rglob("*")) - - new_pyproject["tool"]["hatch"]["build"] = { - "packages": [*packages, artifacts_name], - "exclude": ["__pycache__/", "*.pyc", "*.pyo", ".ipynb_checkpoints"], - "artifacts": [artifacts_name], - "targets": { - "wheel": { - "sources": { - f"{artifacts_name}": f"{model_package}/{artifacts_name}" - }, - }, - }, - } - - if "authors" in metadata: - metadata["authors"] = parse_authors(metadata["authors"]) - metadata["name"] = model_package - metadata["version"] = version - - new_pyproject = new_pyproject.merge({"project": metadata}) - - super().__init__( - name=model_package, - pyproject=new_pyproject, - pipeline=pipeline, - version=version, - root_dir=root_dir, - build_dir=build_dir, - dist_dir=dist_dir, - artifacts_name=artifacts_name, - exclude=exclude, - readme_replacements=readme_replacements, - file_paths=file_paths, - ) - - @app.command(name="package") def package( pipeline: Union[Path, "edsnlp.Pipeline"], @@ -580,7 +313,20 @@ def package( exclude: Optional[AsList[str]] = None, readme_replacements: Dict[str, str] = {}, ): - # root_dir = Path(".").resolve() + """ + + Parameters + ---------- + + Returns + ------- + + """ + if project_type is not None: + 
warnings.warn( + "Project_type is deprecated, only PEP621 pyproject.toml is supported", + DeprecationWarning, + ) exclude = exclude or ["artifacts/vocab/*"] pyproject_path = root_dir / "pyproject.toml" @@ -601,23 +347,7 @@ def package( if pyproject_path.exists(): pyproject = toml.loads((root_dir / "pyproject.toml").read_text()) - package_managers = {"setuptools", "poetry", "hatch", "pdm"} & set( - (pyproject or {}).get("tool", {}) - ) - package_managers = package_managers or {"setuptools"} # default - try: - if project_type is None: - [project_type] = package_managers - packager_cls = { - "poetry": PoetryPackager, - "setuptools": SetuptoolsPackager, - }[project_type] - except Exception: # pragma: no cover - raise ValueError( - "Could not infer project type, only poetry and setuptools based projects " - "are supported for now" - ) - packager = packager_cls( + packager = Packager( pyproject=pyproject, pipeline=pipeline, name=name, diff --git a/edsnlp/pipes/misc/dates/models.py b/edsnlp/pipes/misc/dates/models.py index 8af090bb2b..506af3f958 100644 --- a/edsnlp/pipes/misc/dates/models.py +++ b/edsnlp/pipes/misc/dates/models.py @@ -4,12 +4,24 @@ import pydantic from pandas._libs.tslibs.nattype import NaTType -from pydantic import BaseModel, Field, root_validator, validator +from pydantic import BaseModel, Field from pytz import timezone from spacy.tokens import Span from edsnlp.pipes.misc.dates.patterns.relative import specific_dict +try: + from pydantic import field_validator, model_validator + + def validator(x, allow_reuse=True, pre=False): + return field_validator(x, mode="before" if pre else "after") + + def root_validator(allow_reuse=True, pre=False): + return model_validator(mode="before" if pre else "after") + +except ImportError: + from pydantic import root_validator, validator + class Direction(str, Enum): FUTURE = "future" diff --git a/edsnlp/pipes/ner/scores/charlson/factory.py b/edsnlp/pipes/ner/scores/charlson/factory.py index 0faa1b3736..276580fbf2 
100644 --- a/edsnlp/pipes/ner/scores/charlson/factory.py +++ b/edsnlp/pipes/ner/scores/charlson/factory.py @@ -24,7 +24,6 @@ "eds.charlson", assigns=["doc.ents", "doc.spans"], deprecated=[ - "eds.charlson", "charlson", ], ) diff --git a/edsnlp/tune.py b/edsnlp/tune.py index 20a21e33da..1dd6188e39 100644 --- a/edsnlp/tune.py +++ b/edsnlp/tune.py @@ -10,6 +10,7 @@ import joblib import optuna import optuna.visualization as vis +import pydantic from configobj import ConfigObj from confit import Cli, Config from confit.utils.collections import split_path @@ -49,6 +50,9 @@ class HyperparameterConfig(BaseModel): class Config: extra = "forbid" + if pydantic.VERSION < "2": + model_dump = BaseModel.dict + def to_dict(self) -> dict: """ Convert the hyperparameter configuration to a dictionary. @@ -57,7 +61,7 @@ def to_dict(self) -> dict: Returns: dict: A dictionary representation of the hyperparameter configuration. """ - return self.dict(exclude_unset=True, exclude_defaults=True) + return self.model_dump(exclude_unset=True, exclude_defaults=True) def setup_logging(): @@ -598,7 +602,7 @@ def tune( output_dir: str, checkpoint_dir: str, gpu_hours: confloat(gt=0) = DEFAULT_GPU_HOUR, - n_trials: conint(gt=0) = None, + n_trials: Optional[conint(gt=0)] = None, two_phase_tuning: bool = False, seed: int = 42, metric="ner.micro.f", From e135db767ee017f5e8cf7eee4f98448247ff95ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Wed, 2 Apr 2025 09:34:25 +0200 Subject: [PATCH 2/3] fix: support packaging pep621 poetry projects and fix standard packager --- changelog.md | 7 + edsnlp/package.py | 472 ++++++++++++++++++++++++++++-------- pyproject.toml | 2 +- tests/utils/test_package.py | 1 + 4 files changed, 386 insertions(+), 96 deletions(-) diff --git a/changelog.md b/changelog.md index 41aad3cee4..0b723af988 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,12 @@ # Changelog +## Unreleased + +### Fixed + +- `edsnlp.package` now correctly detects if a project
uses an old-style poetry pyproject or a PEP621 pyproject.toml. +- PEP621 projects containing nested directories (e.g., "my_project/pipes/foo.py") are now supported. + ## v0.16.0 (2025-0.3-26) ### Added diff --git a/edsnlp/package.py b/edsnlp/package.py index ea7f877a28..3c0c837d08 100644 --- a/edsnlp/package.py +++ b/edsnlp/package.py @@ -1,6 +1,7 @@ import os import re import shutil +import subprocess import sys import tempfile import warnings @@ -14,11 +15,27 @@ from build.__main__ import build_package, build_package_via_sdist from confit import Cli from loguru import logger -from typing_extensions import Literal +from typing_extensions import Literal, TypedDict import edsnlp from edsnlp.utils.typing import AsList, Validated +PoetryConstraint = TypedDict( + "PoetryConstraint", + { + "version": str, + "extras": Optional[Sequence[str]], + "markers": Optional[str], + "url": Optional[str], + "path": Optional[str], + "git": Optional[str], + "ref": Optional[str], + "branch": Optional[str], + "tag": Optional[str], + }, + total=False, +) + logger.remove() logger.add( sys.stdout, @@ -58,6 +75,41 @@ def validate(cls, value, config=None): if TYPE_CHECKING: ModuleName = str # noqa F811 +POETRY_SNIPPET = """\ +from poetry.core.masonry.builders.sdist import SdistBuilder +from poetry.factory import Factory +try: + from poetry.core.masonry.utils.module import ModuleOrPackageNotFound +except ImportError: + from poetry.core.masonry.utils.module import ModuleOrPackageNotFoundError as ModuleOrPackageNotFound +import sys +# Initialize the Poetry object for the current project +poetry = Factory().create_poetry("__root_dir__") + +# Initialize the builder +try: + builder = SdistBuilder(poetry) + # Get the list of files to include + files = builder.find_files_to_add() +except ModuleOrPackageNotFound: + if not poetry.package.packages: + print([]) + sys.exit(0) + +print([ + {k: v for k, v in { + "include": getattr(include, '_include'), + "from": getattr(include, 'source', None), + 
"formats": getattr(include, 'formats', None), + }.items() if v} + for include in builder._module.includes +]) + + +# Print the list of files +for file in files: + print(file.path) +""" # noqa E501 INIT_PY = """ # ----------------------------------------- @@ -113,87 +165,13 @@ def __init__( build_dir: Optional[Path] = None, dist_dir: Path, artifacts_name: ModuleName, - metadata: Optional[Dict[str, Any]] = {}, exclude: AsList[str], readme_replacements: Dict[str, str] = {}, + file_paths: Sequence[Path], ): - try: - version = version or pyproject["project"]["version"] - except (KeyError, TypeError): - version = "0.1.0" - name = name or pyproject["project"]["name"] - if pyproject is not None: - main_package = snake_case(pyproject["project"]["name"].lower()) - else: - main_package = None - model_package = snake_case(name.lower()) - - root_dir = root_dir.resolve() - dist_dir = dist_dir if Path(dist_dir).is_absolute() else root_dir / dist_dir - - build_dir = Path(tempfile.mkdtemp()) if build_dir is None else build_dir - - new_pyproject: confit.Config = confit.Config() - if pyproject is not None: - new_pyproject["project"] = pyproject["project"] - new_pyproject = new_pyproject.merge( - { - "build-system": { - "requires": ["hatchling"], - "build-backend": "hatchling.build", - }, - "tool": {"hatch": {"build": {}}}, - "project": { - "name": model_package, - "version": version, - "requires-python": ">=3.7", - }, - } - ) - - try: - find = dict(pyproject["tool"].pop("setuptools", {})["packages"]["find"]) - except Exception: - find = {} - where = find.pop("where", ["."]) - where = [where] if not isinstance(where, list) else where - packages = {main_package, model_package} - for w in where: - # TODO Should we handle namespaces ? 
- # if find.pop("namespace", None) is not None: - # packages.extend(setuptools.find_namespace_packages(**find)) - packages.update(setuptools.find_packages(w, **find)) - packages = sorted([p for p in packages if p]) - file_paths = [] - for package in packages: - for path in (root_dir / package).rglob("*"): - if "__pycache__" in path.parts or path.is_dir(): - continue - file_paths.append(path) - - new_pyproject["tool"]["hatch"]["build"] = { - "packages": [*packages, artifacts_name], - "exclude": ["__pycache__/", "*.pyc", "*.pyo", ".ipynb_checkpoints"], - "artifacts": [artifacts_name], - "targets": { - "wheel": { - "sources": { - f"{artifacts_name}": f"{model_package}/{artifacts_name}" - }, - }, - }, - } - - if "authors" in metadata: - metadata["authors"] = parse_authors(metadata["authors"]) - metadata["name"] = model_package - metadata["version"] = version - - pyproject = new_pyproject.merge({"project": metadata}) - - self.name = model_package + self.name = name self.version = version - assert model_package == pyproject["project"]["name"] + assert name == pyproject["project"]["name"] assert version == pyproject["project"]["version"] self.root_dir = root_dir.resolve() self.pipeline = pipeline @@ -209,7 +187,7 @@ def __init__( logger.info(f"root_dir: {root_dir}") logger.info(f"artifacts_name: {artifacts_name}") - logger.info(f"name: {model_package}") + logger.info(f"name: {name}") def build( self, @@ -234,6 +212,15 @@ def build( skip_dependency_check=skip_dependency_check, ) + # def update_pyproject(self): + # # Adding artifacts to include in pyproject.toml + # snake_name = snake_case(self.name.lower()) + # included = self.pyproject["tool"]["poetry"].setdefault("include", []) + # included.append(f"{snake_name}/{self.artifacts_name}/**") + # packages = list(self.packages) + # packages.append({"include": snake_name}) + # self.pyproject["tool"]["poetry"]["packages"] = packages + def make_src_dir(self): snake_name = snake_case(self.name.lower()) package_dir = 
self.build_dir / snake_name @@ -293,6 +280,293 @@ def make_src_dir(self): logger.info(f"SKIP {rel}") +class OldStylePoetryPackager(Packager): + def __init__( + self, + *, + name: ModuleName, + pyproject: Optional[Dict[str, Any]], + pipeline: Union[Path, "edsnlp.Pipeline"], + version: Optional[str], + root_dir: Path = ".", + build_dir: Optional[Path] = None, + dist_dir: Path, + artifacts_name: ModuleName, + metadata: Optional[Dict[str, Any]] = {}, + exclude: AsList[str], + readme_replacements: Dict[str, str] = {}, + ): + try: + version = version or pyproject["tool"]["poetry"]["version"] + except (KeyError, TypeError): # pragma: no cover + version = "0.1.0" + name = name or pyproject["tool"]["poetry"]["name"] + main_package = ( + snake_case(pyproject["tool"]["poetry"]["name"].lower()) + if pyproject is not None + else None + ) + model_package = snake_case(name.lower()) + + root_dir = root_dir.resolve() + dist_dir = dist_dir if Path(dist_dir).is_absolute() else root_dir / dist_dir + + build_dir = Path(tempfile.mkdtemp()) if build_dir is None else build_dir + + new_pyproject: Dict[str, Any] = { + "build-system": { + "requires": ["hatchling"], + "build-backend": "hatchling.build", + }, + "tool": { + "hatch": { + "build": {}, + # in case the user provides a git dependency for example + "metadata": {"allow-direct-references": True}, + } + }, + "project": { + "name": model_package, + "version": version, + "requires-python": ">=3.7", + }, + } + file_paths = [] + + if pyproject is not None: + poetry = pyproject["tool"]["poetry"] + + # Extract packages + poetry_bin_path = ( + subprocess.run(["which", "poetry"], stdout=subprocess.PIPE) + .stdout.decode() + .strip() + ) + python_executable = Path(poetry_bin_path).read_text().split("\n")[0][2:] + result = subprocess.run( + [ + *python_executable.split(), + "-c", + POETRY_SNIPPET.replace("__root_dir__", str(root_dir)), + ], + stdout=subprocess.PIPE, + cwd=root_dir, + ) + if result.returncode != 0: + raise Exception() + out = 
result.stdout.decode().strip().split("\n") + file_paths = [root_dir / file_path for file_path in out[1:]] + packages = { + main_package, + model_package, + *(package["include"] for package in eval(out[0])), + } + packages = sorted([p for p in packages if p]) + new_pyproject["tool"]["hatch"]["build"] = { + "packages": [*packages, artifacts_name], + "exclude": ["__pycache__/", "*.pyc", "*.pyo", ".ipynb_checkpoints"], + "artifacts": [artifacts_name], + "targets": { + "wheel": { + "sources": { + f"{artifacts_name}": f"{model_package}/{artifacts_name}" + }, + }, + }, + } + if "description" in poetry: # pragma: no cover + new_pyproject["project"]["description"] = poetry["description"] + if "classifiers" in poetry: # pragma: no cover + new_pyproject["project"]["classifiers"] = poetry["classifiers"] + if "keywords" in poetry: # pragma: no cover + new_pyproject["project"]["keywords"] = poetry["keywords"] + if "license" in poetry: # pragma: no cover + new_pyproject["project"]["license"] = {"text": poetry["license"]} + if "readme" in poetry: # pragma: no cover + new_pyproject["project"]["readme"] = poetry["readme"] + if "authors" in poetry: # pragma: no cover + new_pyproject["project"]["authors"] = parse_authors(poetry["authors"]) + if "plugins" in poetry: # pragma: no cover + new_pyproject["project"]["entry-points"] = poetry["plugins"] + if "scripts" in poetry: # pragma: no cover + new_pyproject["project"]["scripts"] = poetry["scripts"] + + # Dependencies + deps = [] + poetry_deps = poetry["dependencies"] + for dep_name, constraint in poetry_deps.items(): + dep = dep_name + constraint: PoetryConstraint = ( + dict(constraint) + if isinstance(constraint, dict) + else {"version": constraint} + ) + try: + dep += f"[{','.join(constraint.pop('extras'))}]" + except KeyError: + pass + if "version" in constraint: + dep_version = constraint.pop("version") + assert not dep_version.startswith( + "^" + ), "Packaging models with ^ dependencies is not supported" + dep += ( + "" + if 
dep_version == "*" + else dep_version + if not dep_version[0].isdigit() + else f"=={dep_version}" + ) + try: + dep += f"; {constraint.pop('markers')}" + except KeyError: + pass + assert ( + not constraint + ), f"Unsupported constraints for dependency {dep_name}: {constraint}" + if dep_name == "python": + new_pyproject["project"]["requires-python"] = dep.replace( + "python", "" + ) + continue + deps.append(dep) + + new_pyproject["project"]["dependencies"] = deps + + if "authors" in metadata: + metadata["authors"] = parse_authors(metadata["authors"]) + metadata["name"] = model_package + metadata["version"] = version + + new_pyproject = confit.Config(new_pyproject).merge({"project": metadata}) + + # Use hatch + super().__init__( + name=model_package, + pyproject=new_pyproject, + pipeline=pipeline, + version=version, + root_dir=root_dir, + build_dir=build_dir, + dist_dir=dist_dir, + artifacts_name=artifacts_name, + exclude=exclude, + readme_replacements=readme_replacements, + file_paths=file_paths, + ) + + +class StandardPackager(Packager): + def __init__( + self, + *, + name: ModuleName, + pyproject: Optional[Dict[str, Any]], + pipeline: Union[Path, "edsnlp.Pipeline"], + version: Optional[str], + root_dir: Path = ".", + build_dir: Optional[Path] = None, + dist_dir: Path, + artifacts_name: ModuleName, + metadata: Optional[Dict[str, Any]] = {}, + exclude: AsList[str], + readme_replacements: Dict[str, str] = {}, + ): + try: + version = version or pyproject["project"]["version"] + except (KeyError, TypeError): + version = "0.1.0" + name = name or pyproject["project"]["name"] + if pyproject is not None: + main_package = snake_case(pyproject["project"]["name"].lower()) + else: + main_package = None + model_package = snake_case(name.lower()) + + root_dir = root_dir.resolve() + dist_dir = dist_dir if Path(dist_dir).is_absolute() else root_dir / dist_dir + + build_dir = Path(tempfile.mkdtemp()) if build_dir is None else build_dir + + new_pyproject: confit.Config = 
confit.Config() + if pyproject is not None: + new_pyproject["project"] = pyproject["project"] + new_pyproject = new_pyproject.merge( + { + "build-system": { + "requires": ["hatchling"], + "build-backend": "hatchling.build", + }, + "tool": { + "hatch": { + "build": {}, + # in case the user provides a git dependency for example + "metadata": {"allow-direct-references": True}, + }, + }, + "project": { + "name": model_package, + "version": version, + "requires-python": ">=3.7", + }, + } + ) + + try: + find = dict(pyproject["tool"].pop("setuptools", {})["packages"]["find"]) + except Exception: + find = {} + where = find.pop("where", ["."]) + where = [where] if not isinstance(where, list) else where + packages = {main_package, model_package} + for w in where: + # TODO Should we handle namespaces ? + # if find.pop("namespace", None) is not None: + # packages.extend(setuptools.find_namespace_packages(**find)) + packages.update(setuptools.find_packages(w, **find)) + packages = sorted([p for p in packages if p]) + file_paths = [] + for package in packages: + for path in (root_dir / package).rglob("*"): + if "__pycache__" in path.parts or path.is_dir(): + continue + file_paths.append(path) + + new_pyproject["tool"]["hatch"]["build"] = { + "packages": [*packages, artifacts_name], + "exclude": ["__pycache__/", "*.pyc", "*.pyo", ".ipynb_checkpoints"], + "artifacts": [artifacts_name], + "targets": { + "wheel": { + "sources": { + f"{artifacts_name}": f"{model_package}/{artifacts_name}" + }, + }, + }, + } + + if "authors" in metadata: + metadata["authors"] = parse_authors(metadata["authors"]) + metadata["name"] = model_package + metadata["version"] = version + + new_pyproject = new_pyproject.merge({"project": metadata}) + + super().__init__( + name=model_package, + pyproject=new_pyproject, + pipeline=pipeline, + version=version, + root_dir=root_dir, + build_dir=build_dir, + dist_dir=dist_dir, + artifacts_name=artifacts_name, + exclude=exclude, + 
readme_replacements=readme_replacements, + file_paths=file_paths, + ) + + @app.command(name="package") def package( pipeline: Union[Path, "edsnlp.Pipeline"], @@ -303,7 +577,7 @@ def package( dist_dir: Path = Path("dist"), artifacts_name: ModuleName = "artifacts", check_dependencies: bool = False, - project_type: Optional[Literal["poetry", "setuptools"]] = None, + project_type: Optional[str] = None, version: Optional[str] = None, metadata: Optional[Dict[str, Any]] = {}, distributions: Optional[AsList[Literal["wheel", "sdist"]]] = ["wheel"], @@ -313,20 +587,7 @@ def package( exclude: Optional[AsList[str]] = None, readme_replacements: Dict[str, str] = {}, ): - """ - - Parameters - ---------- - - Returns - ------- - - """ - if project_type is not None: - warnings.warn( - "Project_type is deprecated, only PEP621 pyproject.toml is supported", - DeprecationWarning, - ) + # root_dir = Path(".").resolve() exclude = exclude or ["artifacts/vocab/*"] pyproject_path = root_dir / "pyproject.toml" @@ -347,7 +608,28 @@ def package( if pyproject_path.exists(): pyproject = toml.loads((root_dir / "pyproject.toml").read_text()) - packager = Packager( + try: + _ = pyproject["tool"]["poetry"]["name"] + inferred_project_type = "old-style-poetry" + except (KeyError, TypeError): + inferred_project_type = "standard" + + try: + if project_type is None: + project_type = inferred_project_type + packager_cls = { + "old-style-poetry": OldStylePoetryPackager, + "standard": StandardPackager, + # for backward compatibility + "poetry": OldStylePoetryPackager, + "setuptools": StandardPackager, + }[project_type] + except Exception: # pragma: no cover + raise ValueError( + f"Could not process project type {project_type!r} only old-style poetry " + f"and PEP 621 pyproject.toml formats are supported for now." 
+ ) + packager = packager_cls( pyproject=pyproject, pipeline=pipeline, name=name, diff --git a/pyproject.toml b/pyproject.toml index f0d016bd5b..3fd66f11b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dev-no-ml = [ "pyspark", "polars", - "mlconjug3<3.9.0", + "mlconjug3<3.9.0; python_version<'3.12'", "scikit-learn>=1.0.0", "edsnlp[docs-no-ml]", diff --git a/tests/utils/test_package.py b/tests/utils/test_package.py index cb153234d5..b4290402bc 100644 --- a/tests/utils/test_package.py +++ b/tests/utils/test_package.py @@ -48,6 +48,7 @@ def test_package_with_files(nlp, tmp_path, package_name, manager): ((tmp_path / "test_model").mkdir(parents=True)) (tmp_path / "test_model" / "__init__.py").write_text('print("Hello World!")\n') + (tmp_path / "test_model" / "empty_folder").mkdir() (tmp_path / "README.md").write_text( """\ From ec7bee00023b37892369702b00f859e946bc374c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Wed, 2 Apr 2025 09:42:47 +0200 Subject: [PATCH 3/3] test: support in py311 and py312 and fix deps --- .github/workflows/tests.yml | 20 +++++++++++---- changelog.md | 4 +++ edsnlp/pipes/core/endlines/model.py | 9 +++---- pyproject.toml | 38 ++++++++++++++--------------- tests/tuning/config.yml | 1 + 5 files changed, 43 insertions(+), 29 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6cf74a9ab4..9df3974b79 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: true matrix: - python-version: ["3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@v2 @@ -71,20 +71,30 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' + - name: Install dependencies + run: | + pip install poetry + pip install -e '.[dev]' pytest-xdist pip + if: matrix.python-version != '3.9' && matrix.python-version != '3.10' && matrix.python-version != '3.11' && 
matrix.python-version != '3.12' +# uv venv +# source .venv/bin/activate +# uv pip install -e '.[dev]' pytest-xdist pip + - name: Install dependencies run: | pip install poetry pip install -e '.[dev,setup]' pytest-xdist pip - if: matrix.python-version != '3.10' + if: matrix.python-version == '3.9' # uv venv # source .venv/bin/activate -# uv pip install -e '.[dev,setup]' pytest-xdist pip +# uv pip install -e '.[dev]' pytest-xdist pip - name: Install dependencies run: | pip install poetry - pip install -e '.[dev-no-ml,setup]' pytest-xdist pip - if: matrix.python-version == '3.10' + pip install -e '.[dev-no-ml]' pytest-xdist pip + # skip ML tests for 3.10, 3.11 and 3.12 + if: matrix.python-version == '3.10' || matrix.python-version == '3.11' || matrix.python-version == '3.12' - name: Test with Pytest on Python ${{ matrix.python-version }} env: diff --git a/changelog.md b/changelog.md index 0b723af988..9bbc576346 100644 --- a/changelog.md +++ b/changelog.md @@ -2,6 +2,10 @@ ## Unreleased +### Added + +- Support for numpy>2.0, and formal support for Python 3.11 and Python 3.12 + ### Fixed - `edsnlp.package` now correctly detect if a project uses an old-style poetry pyproject or a PEP621 pyproject.toml. 
diff --git a/edsnlp/pipes/core/endlines/model.py b/edsnlp/pipes/core/endlines/model.py index 86fb00eb53..0fcabac1f9 100644 --- a/edsnlp/pipes/core/endlines/model.py +++ b/edsnlp/pipes/core/endlines/model.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -from numpy.lib.function_base import iterable from pandas.api.types import CategoricalDtype from pandas.core.groupby import DataFrameGroupBy from spacy.strings import StringStore @@ -239,9 +238,9 @@ def _convert_A(self, df: pd.DataFrame, col: str) -> pd.DataFrame: df[new_col] = df[col].astype(cat_type_A) df[new_col] = df[new_col].cat.codes # Ensure that not known values are coded as OTHER - df.loc[ - ~df[col].isin(self.vocabulary["A3A4"].keys()), new_col - ] = self.vocabulary["A3A4"]["OTHER"] + df.loc[~df[col].isin(self.vocabulary["A3A4"].keys()), new_col] = ( + self.vocabulary["A3A4"]["OTHER"] + ) return df def _convert_B(self, df: pd.DataFrame, col: str) -> pd.DataFrame: @@ -594,7 +593,7 @@ def _retrieve_lines(cls, dfg: DataFrameGroupBy) -> DataFrameGroupBy: return dfg @classmethod - def _create_vocabulary(cls, x: iterable) -> dict: + def _create_vocabulary(cls, x: Iterable) -> dict: """Function to create a vocabulary for attributes in the training set. 
Parameters diff --git a/pyproject.toml b/pyproject.toml index 3fd66f11b6..fb7db54584 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,13 +13,16 @@ dependencies = [ "pytz", "pysimstring>=1.2.1", "regex", - "spacy>=3.2,<3.8", - "thinc<8.2.5", # we don't need thinc but spacy depdends on it 8.2.5 cause binary issues + # spacy doesn't provide binaries for python<3.9 from 3.8.2 so we need to cap it ourselves + "spacy>=3.2,<3.8.2; python_version<'3.9'", + "spacy>=3.8.5,<4.0.0; python_version>='3.9'", + # thinc doesn't provide binaries for python<3.9 from 8.2.5 so we need to cap it ourselves + "thinc<8.2.5; python_version<'3.9'", + "thinc>=8.2.5; python_version>='3.9'", "confit>=0.7.3", "tqdm", "umls-downloader>=0.1.1", - "numpy>=1.15.0,<1.23.2; python_version<'3.8'", - "numpy>=1.15.0,<2.0.0; python_version>='3.8'", + "numpy>=1.15.0", "pandas>=1.1.0; python_version<'3.8'", "pandas>=1.4.0; python_version>='3.8'", "typing-extensions>=4.0.0", @@ -49,8 +52,7 @@ dev-no-ml = [ "pyspark", "polars", - "mlconjug3<3.9.0; python_version<'3.12'", - "scikit-learn>=1.0.0", + "scikit-learn", "edsnlp[docs-no-ml]", ] @@ -75,7 +77,7 @@ docs-no-ml = [ ml = [ "rich-logger>=0.3.1", "torch>=1.13.0", - "foldedtensor>=0.3.4", + "foldedtensor>=0.4.0", "safetensors>=0.3.0; python_version>='3.8'", "safetensors>=0.3.0,<0.5.0; python_version<'3.8'", "transformers>=4.0.0,<5.0.0", @@ -92,10 +94,11 @@ dev = [ "plotly>=5.18.0", # required by optuna viz "ruamel.yaml>=0.18.0", "configobj>=5.0.9", - + "scikit-learn", ] setup = [ - "typer" + "mlconjug3<3.9.0", # bug https://github.com/Ars-Linguistica/mlconjug3/pull/506 + "numpy<2", # mlconjug has scikit-learn dep which doesn't support numpy 2 yet ] [project.urls] @@ -312,7 +315,11 @@ where = ["."] requires = [ "setuptools", "cython>=0.25", - "spacy>=3.2,<3.8", + "spacy>=3.2,!=3.8.2; python_version<'3.9'", + "spacy>=3.2,!=3.8.2,<4.0.0; python_version>='3.9'", + # thinc doesn't provide binaries for python<3.9 from 8.2.5 so we need to cap it ourselves + 
"thinc<8.2.5; python_version<'3.9'", + "thinc>=8.2.5; python_version>='3.9'", # to update from https://github.com/scipy/oldest-supported-numpy/blob/main/setup.cfg # while setting numpy >= 1.15.0 due to spacy reqs "numpy==1.15.0; python_version=='3.7' and platform_machine not in 'arm64|aarch64|loongarch64' and platform_system!='AIX' and platform_python_implementation != 'PyPy'", @@ -324,19 +331,12 @@ requires = [ "numpy==1.19.0; python_version=='3.6' and platform_machine!='loongarch64' and platform_python_implementation=='PyPy'", "numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64' and platform_system!='AIX' and platform_python_implementation != 'PyPy'", "numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64' and platform_python_implementation != 'PyPy'", - "numpy==1.19.3; python_version=='3.9' and platform_machine=='arm64' and platform_system=='Windows' and platform_python_implementation != 'PyPy'", - "numpy==1.19.3; python_version=='3.9' and platform_system not in 'OS400' and platform_machine not in 'arm64|loongarch64' and platform_python_implementation != 'PyPy'", "numpy==1.20.0; python_version=='3.7' and platform_machine!='loongarch64' and platform_python_implementation=='PyPy'", "numpy==1.21.0; python_version=='3.7' and platform_machine=='arm64' and platform_system=='Darwin' and platform_python_implementation!='PyPy'", "numpy==1.21.0; python_version=='3.8' and platform_machine=='arm64' and platform_system=='Darwin' and platform_python_implementation!='PyPy'", - "numpy==1.21.0; python_version=='3.9' and platform_machine=='arm64' and platform_system=='Darwin' and platform_python_implementation!='PyPy'", - "numpy==1.21.6; python_version=='3.10' and platform_machine!='loongarch64'", - "numpy==1.22.2; platform_machine=='loongarch64' and python_version>='3.8' and python_version<'3.11' and platform_python_implementation!='PyPy'", + "numpy==1.22.2; python_version>='3.8' and python_version<'3.9' and platform_machine=='loongarch64' and 
platform_python_implementation!='PyPy'", "numpy==1.22.2; python_version=='3.8' and platform_machine!='loongarch64' and platform_python_implementation=='PyPy'", - "numpy==1.23.2; python_version=='3.11'", - "numpy==1.23.3; python_version=='3.9' and platform_system=='OS400' and platform_machine!='loongarch64' and platform_python_implementation!='PyPy'", - "numpy==1.25.0; python_version=='3.9' and platform_python_implementation=='PyPy'", - "numpy==1.26.1; python_version=='3.12'", + "numpy>=2.0; python_version>='3.9'", ] build-backend = "setuptools.build_meta" diff --git a/tests/tuning/config.yml b/tests/tuning/config.yml index 08a3896ebe..e38655cd4e 100644 --- a/tests/tuning/config.yml +++ b/tests/tuning/config.yml @@ -125,3 +125,4 @@ train: scorer: ${ scorer } num_workers: 0 optimizer: ${ optimizer } + cpu: true