diff --git a/providers/git/docs/bundles/index.rst b/providers/git/docs/bundles/index.rst index f9e9efe9a1720..d551235bb860a 100644 --- a/providers/git/docs/bundles/index.rst +++ b/providers/git/docs/bundles/index.rst @@ -34,9 +34,10 @@ Example of using the GitDagBundle: "kwargs": { "subdir": "dags", "tracking_ref": "main", - "refresh_interval": 3600 - "submodules": False, - "prune_dotgit_folder": True + "refresh_interval": 3600, + "submodules": false, + "prune_dotgit_folder": true, + "sparse_dirs": ["dags", "includes"] } } ]' diff --git a/providers/git/src/airflow/providers/git/bundles/git.py b/providers/git/src/airflow/providers/git/bundles/git.py index 33d3c9979d5c1..2f03ed48cf134 100644 --- a/providers/git/src/airflow/providers/git/bundles/git.py +++ b/providers/git/src/airflow/providers/git/bundles/git.py @@ -52,6 +52,11 @@ class GitDagBundle(BaseDagBundle): to share the object directory via hard links, but if you have a lot of current versions running, or an especially large git repo leaving this as True will save some disk space at the expense of `git` operations not working in the bundle that Tasks run from. + :param sparse_dirs: List of directories to include when cloning the repository. Requires git 2.25 or higher. + + The sparse checkout places only the files and subfolders of the listed directories + into the working tree. Cone mode is used, so paths are matched by whole directory, which keeps filtering fast. + See https://git-scm.com/docs/git-sparse-checkout for more information on the sparse checkout feature. 
""" supports_versioning = True @@ -65,6 +70,7 @@ def __init__( repo_url: str | None = None, submodules: bool = False, prune_dotgit_folder: bool = True, + sparse_dirs: list[str] | None = None, **kwargs, ) -> None: super().__init__(**kwargs) @@ -78,7 +84,7 @@ def __init__( self.git_conn_id = git_conn_id self.repo_url = repo_url self.submodules = submodules - + self.sparse_dirs = sparse_dirs # Force prune to False if submodules are used, otherwise git links break if self.submodules: self.prune_dotgit_folder = False @@ -93,6 +99,7 @@ def __init__( versions_path=self.versions_dir, git_conn_id=self.git_conn_id, submodules=self.submodules, + sparse_dirs=self.sparse_dirs, ) self._log.debug("bundle configured") @@ -247,7 +254,14 @@ def _clone_repo_if_required(self) -> None: Repo.clone_from( url=self.bare_repo_path, to_path=self.repo_path, + multi_options=["--sparse", "--no-checkout"] if self.sparse_dirs else None, ) + if self.sparse_dirs: + self._log.info("Setting up sparse checkout") + repo = Repo(self.repo_path) + repo.git.sparse_checkout("init", "--cone") + repo.git.sparse_checkout("set", *self.sparse_dirs) + repo.git.checkout(self.tracking_ref) else: self._log.debug("repo exists", repo_path=self.repo_path) self.repo = Repo(self.repo_path) diff --git a/providers/git/tests/unit/git/bundles/test_git.py b/providers/git/tests/unit/git/bundles/test_git.py index 6297ab675d9bf..43808ffdf0977 100644 --- a/providers/git/tests/unit/git/bundles/test_git.py +++ b/providers/git/tests/unit/git/bundles/test_git.py @@ -21,6 +21,7 @@ import os import re import types +from pathlib import Path from unittest import mock from unittest.mock import patch @@ -54,8 +55,8 @@ def bundle_temp_dir(tmp_path): @pytest.fixture -def git_repo(tmp_path_factory): - directory = tmp_path_factory.mktemp("repo") +def git_repo(tmp_path_factory) -> tuple[Path, Repo]: + directory: Path = tmp_path_factory.mktemp("repo") repo = Repo.init(directory) repo.git.symbolic_ref("HEAD", f"refs/heads/{GIT_DEFAULT_BRANCH}") 
file_path = directory / "test_dag.py" @@ -839,6 +840,70 @@ def test_subdir(self, mock_githook, git_repo): assert str(bundle.path).endswith(subdir) assert {"some_new_file.py"} == files_in_repo + @mock.patch("airflow.providers.git.bundles.git.GitHook") + def test_sparse_checkout(self, mock_githook, git_repo): + repo_path, repo = git_repo + mock_githook.return_value.repo_url = repo_path + + subdir = "some/subdir" + subdir_path = repo_path / subdir + subdir_path.mkdir(parents=True) + file_path = subdir_path / "some_relevant_file.py" + with open(file_path, "w") as f: + f.write("hello world") + otherdir = "other/dir" + otherdir_path = repo_path / otherdir + otherdir_path.mkdir(parents=True) + otherfile_path = otherdir_path / "some_other_file.py" + with open(otherfile_path, "w") as f: + f.write("hello world") + + repo.index.add([file_path, otherfile_path]) + repo.index.commit("Other commit") + + bundle = GitDagBundle( + name="test-sparse", + git_conn_id=CONN_HTTPS, + tracking_ref=GIT_DEFAULT_BRANCH, + sparse_dirs=[subdir], + ) + bundle.initialize() + + files_in_repo = {f.name for f in bundle.path.glob("**/*.py") if f.is_file()} + assert "some_other_file.py" not in files_in_repo + assert "some_relevant_file.py" in files_in_repo + + @mock.patch("airflow.providers.git.bundles.git.GitHook") + def test_sparse_checkout_with_version_prunes_dotgit(self, mock_githook, git_repo): + repo_path, repo = git_repo + mock_githook.return_value.repo_url = repo_path + subdir = "some/subdir" + subdir_path = repo_path / subdir + subdir_path.mkdir(parents=True) + file_path = subdir_path / "some_relevant_file.py" + with open(file_path, "w") as f: + f.write("hello world") + otherdir = "other/dir" + otherdir_path = repo_path / otherdir + otherdir_path.mkdir(parents=True) + otherfile_path = otherdir_path / "some_other_file.py" + with open(otherfile_path, "w") as f: + f.write("hello world") + repo.index.add([file_path, otherfile_path]) + commit = repo.index.commit("Other commit") + bundle = 
GitDagBundle( + name="test-sparse-version", + git_conn_id=CONN_HTTPS, + tracking_ref=GIT_DEFAULT_BRANCH, + version=commit.hexsha, + sparse_dirs=[subdir], + ) + bundle.initialize() + files_in_repo = {f.name for f in bundle.path.glob("**/*.py") if f.is_file()} + assert "some_other_file.py" not in files_in_repo + assert "some_relevant_file.py" in files_in_repo + assert not (bundle.path / ".git").exists() + def test_raises_when_no_repo_url(self): bundle = GitDagBundle( name="test",