Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions providers/git/docs/bundles/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ Example of using the GitDagBundle:
"kwargs": {
"subdir": "dags",
"tracking_ref": "main",
"refresh_interval": 3600
"submodules": False,
"prune_dotgit_folder": True
"refresh_interval": 3600,
"submodules": false,
"prune_dotgit_folder": true,
"sparse_dirs": ["dags", "includes"]
}
}
]'
16 changes: 15 additions & 1 deletion providers/git/src/airflow/providers/git/bundles/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ class GitDagBundle(BaseDagBundle):
to share the object directory via hard links, but if you have a lot of current versions
running, or an especially large git repo leaving this as True will save some disk space
at the expense of `git` operations not working in the bundle that Tasks run from.
:param sparse_dirs: List of directories to include when cloning the repository. Needs git version 2.25 or higher.

Sparse checkout places only the files and subfolders of the listed directories into the working tree.
"Cone" mode is used, which makes the filtering efficient and fast.
See https://git-scm.com/docs/git-sparse-checkout for more information on the sparse checkout feature.
"""

supports_versioning = True
Expand All @@ -65,6 +70,7 @@ def __init__(
repo_url: str | None = None,
submodules: bool = False,
prune_dotgit_folder: bool = True,
sparse_dirs: list[str] | None = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
Expand All @@ -78,7 +84,7 @@ def __init__(
self.git_conn_id = git_conn_id
self.repo_url = repo_url
self.submodules = submodules

self.sparse_dirs = sparse_dirs
# Force prune to False if submodules are used, otherwise git links break
if self.submodules:
self.prune_dotgit_folder = False
Expand All @@ -93,6 +99,7 @@ def __init__(
versions_path=self.versions_dir,
git_conn_id=self.git_conn_id,
submodules=self.submodules,
sparse_dirs=self.sparse_dirs,
)

self._log.debug("bundle configured")
Expand Down Expand Up @@ -247,7 +254,14 @@ def _clone_repo_if_required(self) -> None:
Repo.clone_from(
url=self.bare_repo_path,
to_path=self.repo_path,
multi_options=["--sparse", "--no-checkout"] if self.sparse_dirs else None,
)
if self.sparse_dirs:
self._log.info("Setting up sparse checkout")
repo = Repo(self.repo_path)
repo.git.sparse_checkout("init", "--cone")
repo.git.sparse_checkout("set", *self.sparse_dirs)
repo.git.checkout(self.tracking_ref)
else:
self._log.debug("repo exists", repo_path=self.repo_path)
self.repo = Repo(self.repo_path)
Expand Down
69 changes: 67 additions & 2 deletions providers/git/tests/unit/git/bundles/test_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import os
import re
import types
from pathlib import Path
from unittest import mock
from unittest.mock import patch

Expand Down Expand Up @@ -54,8 +55,8 @@ def bundle_temp_dir(tmp_path):


@pytest.fixture
def git_repo(tmp_path_factory):
directory = tmp_path_factory.mktemp("repo")
def git_repo(tmp_path_factory) -> tuple[Path, Repo]:
directory: Path = tmp_path_factory.mktemp("repo")
repo = Repo.init(directory)
repo.git.symbolic_ref("HEAD", f"refs/heads/{GIT_DEFAULT_BRANCH}")
file_path = directory / "test_dag.py"
Expand Down Expand Up @@ -839,6 +840,70 @@ def test_subdir(self, mock_githook, git_repo):
assert str(bundle.path).endswith(subdir)
assert {"some_new_file.py"} == files_in_repo

@mock.patch("airflow.providers.git.bundles.git.GitHook")
def test_sparse_checkout(self, mock_githook, git_repo):
    """Check that only files under ``sparse_dirs`` show up in the bundle's working tree."""
    source_dir, source_repo = git_repo
    mock_githook.return_value.repo_url = source_dir

    # Commit one file inside the sparse directory and one outside of it.
    subdir = "some/subdir"
    wanted_file = source_dir / subdir / "some_relevant_file.py"
    unwanted_file = source_dir / "other/dir" / "some_other_file.py"
    for new_file in (wanted_file, unwanted_file):
        new_file.parent.mkdir(parents=True)
        new_file.write_text("hello world")
    source_repo.index.add([wanted_file, unwanted_file])
    source_repo.index.commit("Other commit")

    bundle = GitDagBundle(
        name="test-sparse",
        git_conn_id=CONN_HTTPS,
        tracking_ref=GIT_DEFAULT_BRANCH,
        sparse_dirs=[subdir],
    )
    bundle.initialize()

    checked_out = {entry.name for entry in bundle.path.glob("**/*.py") if entry.is_file()}
    assert "some_other_file.py" not in checked_out
    assert "some_relevant_file.py" in checked_out

Comment thread
jscheffl marked this conversation as resolved.
@mock.patch("airflow.providers.git.bundles.git.GitHook")
def test_sparse_checkout_with_version_prunes_dotgit(self, mock_githook, git_repo):
    """Check a sparse bundle pinned to a commit: only sparse files present, no ``.git`` in the bundle path."""
    source_dir, source_repo = git_repo
    mock_githook.return_value.repo_url = source_dir

    # Commit one file inside the sparse directory and one outside of it.
    subdir = "some/subdir"
    wanted_file = source_dir / subdir / "some_relevant_file.py"
    unwanted_file = source_dir / "other/dir" / "some_other_file.py"
    for new_file in (wanted_file, unwanted_file):
        new_file.parent.mkdir(parents=True)
        new_file.write_text("hello world")
    source_repo.index.add([wanted_file, unwanted_file])
    pinned_commit = source_repo.index.commit("Other commit")

    bundle = GitDagBundle(
        name="test-sparse-version",
        git_conn_id=CONN_HTTPS,
        tracking_ref=GIT_DEFAULT_BRANCH,
        version=pinned_commit.hexsha,
        sparse_dirs=[subdir],
    )
    bundle.initialize()

    checked_out = {entry.name for entry in bundle.path.glob("**/*.py") if entry.is_file()}
    assert "some_other_file.py" not in checked_out
    assert "some_relevant_file.py" in checked_out
    dotgit = bundle.path / ".git"
    assert not dotgit.exists()

def test_raises_when_no_repo_url(self):
bundle = GitDagBundle(
name="test",
Expand Down
Loading