From ac68acb18be1d5d3f39cb4c57febafe4236685ce Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 6 Aug 2025 12:11:28 +0530 Subject: [PATCH 1/6] Add support to fetch purl Signed-off-by: Tushar Goel --- requirements.txt | 4 ++-- src/fetchcode/__init__.py | 26 +++++++++++++++++++++----- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/requirements.txt b/requirements.txt index b7daca1..1485f8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -41,8 +41,8 @@ MarkupSafe==2.0.1 more-itertools==8.13.0 normality==2.3.3 packagedcode-msitools==0.101.210706 -packageurl-python==0.9.9 -packaging==21.3 +packageurl-python==0.17.4 +packaging==24.0 parameter-expansion-patched==0.3.1 patch==1.16 pdfminer-six==20220506 diff --git a/src/fetchcode/__init__.py b/src/fetchcode/__init__.py index 82523d6..9e5973b 100644 --- a/src/fetchcode/__init__.py +++ b/src/fetchcode/__init__.py @@ -21,6 +21,7 @@ from urllib.parse import urlparse import requests +from packageurl.contrib.purl2url import get_download_url class Response: @@ -89,19 +90,34 @@ def fetch_ftp(url, location): return resp +def fetch_purl(purl, location=None): + """ + Return a `Response` object built from fetching the content at a PURL based `purl` URL string + saving the content in a file at `location` + """ + from fetchcode.download_urls import download_url as get_download_url_from_fetchcode + + for resolver in (get_download_url, get_download_url_from_fetchcode): + url = resolver(purl) + if url: + return fetch(url=url) + return + + def fetch(url): """ Return a `Response` object built from fetching the content at the `url` URL string and store content at a temporary file. 
""" - - temp = tempfile.NamedTemporaryFile(delete=False) - location = temp.name - url_parts = urlparse(url) scheme = url_parts.scheme + location = None + + if scheme != "purl": + temp = tempfile.NamedTemporaryFile(delete=False) + location = temp.name - fetchers = {"ftp": fetch_ftp, "http": fetch_http, "https": fetch_http} + fetchers = {"ftp": fetch_ftp, "http": fetch_http, "https": fetch_http, "pkg": fetch_purl} if scheme in fetchers: return fetchers.get(scheme)(url, location) From 299d366574cd148b1546e6dc3d27db7f6a3d85de Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 6 Aug 2025 12:33:00 +0530 Subject: [PATCH 2/6] Fix purl scheme Signed-off-by: Tushar Goel --- src/fetchcode/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fetchcode/__init__.py b/src/fetchcode/__init__.py index 9e5973b..56bd795 100644 --- a/src/fetchcode/__init__.py +++ b/src/fetchcode/__init__.py @@ -113,7 +113,7 @@ def fetch(url): scheme = url_parts.scheme location = None - if scheme != "purl": + if scheme != "pkg": temp = tempfile.NamedTemporaryFile(delete=False) location = temp.name From 0838dafa586eacb2af21db2beeffee0bd25dbf98 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 6 Aug 2025 14:08:17 +0530 Subject: [PATCH 3/6] Address review comments Signed-off-by: Tushar Goel --- src/fetchcode/__init__.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/fetchcode/__init__.py b/src/fetchcode/__init__.py index 56bd795..65bfa9e 100644 --- a/src/fetchcode/__init__.py +++ b/src/fetchcode/__init__.py @@ -21,7 +21,9 @@ from urllib.parse import urlparse import requests -from packageurl.contrib.purl2url import get_download_url +from packageurl.contrib import purl2url + +from fetchcode.utils import _http_exists class Response: @@ -90,18 +92,19 @@ def fetch_ftp(url, location): return resp -def fetch_purl(purl, location=None): +def resolve_purl(purl): """ - Return a `Response` object built from fetching the 
content at a PURL based `purl` URL string - saving the content in a file at `location` + Resolve a Package URL (PURL) to a download URL. + + This function attempts to resolve the PURL using both the purl2url library and + the fetchcode.download_urls module. It returns the first valid download URL found. """ from fetchcode.download_urls import download_url as get_download_url_from_fetchcode - for resolver in (get_download_url, get_download_url_from_fetchcode): + for resolver in (purl2url.get_download_url, get_download_url_from_fetchcode): url = resolver(purl) - if url: - return fetch(url=url) - return + if url and _http_exists(url): + return url def fetch(url): @@ -111,13 +114,16 @@ def fetch(url): """ url_parts = urlparse(url) scheme = url_parts.scheme - location = None - if scheme != "pkg": - temp = tempfile.NamedTemporaryFile(delete=False) - location = temp.name + if scheme == "pkg": + url = resolve_purl(url) + url_parts = urlparse(url) + scheme = url_parts.scheme + + temp = tempfile.NamedTemporaryFile(delete=False) + location = temp.name - fetchers = {"ftp": fetch_ftp, "http": fetch_http, "https": fetch_http, "pkg": fetch_purl} + fetchers = {"ftp": fetch_ftp, "http": fetch_http, "https": fetch_http} if scheme in fetchers: return fetchers.get(scheme)(url, location) From 02b9671f88f8eece35cec0c20c221828962a10df Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Wed, 6 Aug 2025 14:28:49 +0530 Subject: [PATCH 4/6] Add tests Signed-off-by: Tushar Goel --- src/fetchcode/__init__.py | 47 +++++++++++++++++++++++++++++++-------- tests/test_fetch.py | 21 +++++++++++++++++ 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/fetchcode/__init__.py b/src/fetchcode/__init__.py index 65bfa9e..e1b9fee 100644 --- a/src/fetchcode/__init__.py +++ b/src/fetchcode/__init__.py @@ -96,8 +96,8 @@ def resolve_purl(purl): """ Resolve a Package URL (PURL) to a download URL. 
- This function attempts to resolve the PURL using both the purl2url library and - the fetchcode.download_urls module. It returns the first valid download URL found. + This function first attempts to resolve the PURL using the purl2url library and, + if that fails, falls back to fetchcode's download_urls module. """ from fetchcode.download_urls import download_url as get_download_url_from_fetchcode @@ -107,18 +107,47 @@ def resolve_purl(purl): return url +def get_resolved_url(url, scheme): + resoltion_by_scheme = { + "pkg": resolve_url_from_purl, + } + resolution_handler = resoltion_by_scheme.get(scheme) + if not resolution_handler: + raise ValueError(f"Not a supported/known scheme: {scheme}") + url, scheme = resolution_handler(url) + return url, scheme + + +def resolve_url_from_purl(url): + """ + Resolve a Package URL (PURL) to a valid URL. + Raises ValueError if the PURL cannot be resolved. + """ + url = resolve_purl(url) + if not url: + raise ValueError("Could not resolve PURL to a valid URL.") + scheme = get_url_scheme(url) + return url, scheme + + +def get_url_scheme(url): + """ + Return the scheme of the given URL. + """ + url_parts = urlparse(url) + scheme = url_parts.scheme + return scheme + + def fetch(url): """ Return a `Response` object built from fetching the content at the `url` URL string and store content at a temporary file. 
""" - url_parts = urlparse(url) - scheme = url_parts.scheme + scheme = get_url_scheme(url) - if scheme == "pkg": - url = resolve_purl(url) - url_parts = urlparse(url) - scheme = url_parts.scheme + if scheme in ["pkg"]: + url, scheme = get_resolved_url(url, scheme) temp = tempfile.NamedTemporaryFile(delete=False) location = temp.name @@ -128,7 +157,7 @@ def fetch(url): if scheme in fetchers: return fetchers.get(scheme)(url, location) - raise Exception("Not a supported/known scheme.") + raise Exception(f"Not a supported/known scheme: {scheme}.") def fetch_json_response(url): diff --git a/tests/test_fetch.py b/tests/test_fetch.py index 1dcf746..9d9e760 100644 --- a/tests/test_fetch.py +++ b/tests/test_fetch.py @@ -63,3 +63,24 @@ def test_fetch_with_scheme_not_present(): url = "abc://speedtest/1KB.zip" response = fetch(url=url) assert "Not a supported/known scheme." == e_info + + +@mock.patch("fetchcode.resolve_url_from_purl") +@mock.patch("fetchcode.fetch_http") +def test_fetch_purl(mock_fetch_http, mock_resolve): + mock_fetch_http.return_value = "mocked_purl_response" + mock_resolve.return_value = ("http://resolved.com/file.tar.gz", "http") + + response = fetch("pkg:pypi/sample@1.0.0") + + assert response == "mocked_purl_response" + mock_resolve.assert_called_once() + mock_fetch_http.assert_called_once() + + +@mock.patch("fetchcode.get_url_scheme") +def test_fetch_unsupported_scheme(mock_get_scheme): + mock_get_scheme.return_value = "s3" + + with pytest.raises(Exception, match="Not a supported/known scheme"): + fetch("s3://bucket/object") From 9682dacf8e474b382ae91b7d074748f77e88b6d9 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Thu, 7 Aug 2025 14:41:53 +0530 Subject: [PATCH 5/6] Address review comments Signed-off-by: Tushar Goel --- README.rst | 35 ++++++++++++ src/fetchcode/__init__.py | 4 +- src/fetchcode/composer.py | 1 - tests/test_fetch.py | 115 +++++++++++++++++++++++++++++++++++--- 4 files changed, 144 insertions(+), 11 deletions(-) diff --git a/README.rst 
b/README.rst index a493ebd..ef17f90 100644 --- a/README.rst +++ b/README.rst @@ -59,6 +59,41 @@ Fetch some package metadata and get a ``fetchcode.packagedcode_models.Package`` >>> list(package.info('pkg:rubygems/files')) [Package(type='rubygems', namespace=None, name='files', version=None)] +Fetch a purl and get a ``fetchcode.fetch.Response`` object back:: + + >>> from fetchcode import fetch + >>> f = fetch('pkg:swift/github.com/Alamofire/Alamofire@5.4.3') + >>> f.location + '/tmp/tmp_cm02xsg' + >>> f.content_type + 'application/zip' + >>> f.url + 'https://github.com/Alamofire/Alamofire/archive/5.4.3.zip' + +Ecosystems supported for fetching a purl from fetchcode: + + - alpm + - apk + - bitbucket + - cargo + - composer + - conda + - cpan + - cran + - deb + - gem + - generic + - github + - golang + - hackage + - hex + - luarocks + - maven + - npm + - nuget + - pub + - pypi + - swift License -------- diff --git a/src/fetchcode/__init__.py b/src/fetchcode/__init__.py index e1b9fee..d4403c5 100644 --- a/src/fetchcode/__init__.py +++ b/src/fetchcode/__init__.py @@ -134,9 +134,7 @@ def get_url_scheme(url): """ Return the scheme of the given URL. """ - url_parts = urlparse(url) - scheme = url_parts.scheme - return scheme + return urlparse(url).scheme def fetch(url): diff --git a/src/fetchcode/composer.py b/src/fetchcode/composer.py index 32b73f0..3188d00 100644 --- a/src/fetchcode/composer.py +++ b/src/fetchcode/composer.py @@ -26,7 +26,6 @@ class Composer: @classmethod def get_download_url(cls, purl): - """ Return the download URL for a Composer PURL. """ diff --git a/tests/test_fetch.py b/tests/test_fetch.py index 9d9e760..c4adccb 100644 --- a/tests/test_fetch.py +++ b/tests/test_fetch.py @@ -19,6 +19,8 @@ import pytest from fetchcode import fetch +from fetchcode import resolve_purl +from fetchcode import resolve_url_from_purl @mock.patch("fetchcode.requests.get") @@ -65,22 +67,121 @@ def test_fetch_with_scheme_not_present(): assert "Not a supported/known scheme." 
== e_info -@mock.patch("fetchcode.resolve_url_from_purl") +@mock.patch("fetchcode._http_exists") @mock.patch("fetchcode.fetch_http") -def test_fetch_purl(mock_fetch_http, mock_resolve): +@mock.patch("fetchcode.pypi.fetch_json_response") +def test_fetch_purl_with_fetchcode(mock_fetch_json_response, mock_fetch_http, mock_http_exists): mock_fetch_http.return_value = "mocked_purl_response" - mock_resolve.return_value = ("http://resolved.com/file.tar.gz", "http") + mock_http_exists.return_value = True + mock_fetch_json_response.return_value = { + "urls": [{"url": "https://example.com/sample-1.0.0.zip"}] + } response = fetch("pkg:pypi/sample@1.0.0") assert response == "mocked_purl_response" - mock_resolve.assert_called_once() + mock_http_exists.assert_called_once() + mock_fetch_http.assert_called_once() + + +@mock.patch("fetchcode._http_exists") +@mock.patch("fetchcode.fetch_http") +def test_fetch_purl_with_purl2url(mock_fetch_http, mock_http_exists): + mock_fetch_http.return_value = "mocked_purl_response" + mock_http_exists.return_value = True + + response = fetch("pkg:alpm/sample@1.0.0") + + assert response == "mocked_purl_response" + mock_http_exists.assert_called_once() mock_fetch_http.assert_called_once() -@mock.patch("fetchcode.get_url_scheme") -def test_fetch_unsupported_scheme(mock_get_scheme): - mock_get_scheme.return_value = "s3" +@mock.patch("fetchcode.pypi.fetch_json_response") +def test_fetch_invalid_purl(mock_fetch_json_response): + mock_fetch_json_response.return_value = {} + with pytest.raises(Exception, match="No download URL found for invalid-package version 1.0.0"): + fetch("pkg:pypi/invalid-package@1.0.0") + + +@mock.patch("fetchcode.pypi.fetch_json_response") +def test_fetch_invalid_purl(mock_fetch_json_response): + mock_fetch_json_response.return_value = {} + + with pytest.raises(Exception, match="No download URL found for invalid-package version 1.0.0"): + fetch("pkg:pypi/invalid-package@1.0.0") + + +def test_fetch_unsupported_scheme(): with 
pytest.raises(Exception, match="Not a supported/known scheme"): fetch("s3://bucket/object") + + +def test_resolve_url_from_purl_invalid(): + with pytest.raises(ValueError, match="Could not resolve PURL to a valid URL."): + fetch("pkg:invalid/invalid-package@1.0.0") + + +@mock.patch("fetchcode._http_exists") +def test_resolve_url_from_purl_using_purl2url(mock_http_exists): + mock_http_exists.return_value = True + + url, _ = resolve_url_from_purl("pkg:swift/github.com/Alamofire/Alamofire@5.4.3") + assert url == "https://github.com/Alamofire/Alamofire/archive/5.4.3.zip" + mock_http_exists.assert_called_once_with( + "https://github.com/Alamofire/Alamofire/archive/5.4.3.zip" + ) + + +@mock.patch("fetchcode._http_exists") +@mock.patch("fetchcode.pypi.fetch_json_response") +def test_resolve_url_from_purl_using_fetchcode(mock_fetch_json_response, mock_http_exists): + mock_http_exists.return_value = True + mock_fetch_json_response.return_value = { + "urls": [{"url": "https://example.com/sample-1.0.0.zip"}] + } + + url, _ = resolve_url_from_purl("pkg:pypi/example@1.0.0") + assert url == "https://example.com/sample-1.0.0.zip" + mock_http_exists.assert_called_once_with("https://example.com/sample-1.0.0.zip") + + +def test_resolve_purl_invalid(): + assert resolve_purl("pkg:invalid/invalid-package@1.0.0") is None + + +def test_resolve_purl_using_purl2url(): + url = resolve_purl("pkg:pub/http@0.13.3") + assert url == "https://pub.dev/api/archives/http-0.13.3.tar.gz" + + +@mock.patch("fetchcode._http_exists") +def test_resolve_purl_using_purl2url_url_does_not_exists(mock_http_exists): + mock_http_exists.return_value = False + url = resolve_purl("pkg:pub/http@0.13.3") + assert url is None + + +@mock.patch("fetchcode._http_exists") +@mock.patch("fetchcode.pypi.fetch_json_response") +def test_resolve_purl_using_fetchcode(mock_fetch_json_response, mock_http_exists): + mock_fetch_json_response.return_value = { + "urls": [{"url": "https://example.com/sample-1.0.0.zip"}] + } + 
mock_http_exists.return_value = True + url = resolve_purl("pkg:pypi/example@1.0.0") + assert url == "https://example.com/sample-1.0.0.zip" + + +@mock.patch("fetchcode._http_exists") +@mock.patch("fetchcode.pypi.fetch_json_response") +def test_resolve_purl_using_fetchcode_url_does_not_exists( + mock_fetch_json_response, mock_http_exists +): + mock_fetch_json_response.return_value = { + "urls": [{"url": "https://example.com/sample-1.0.0.zip"}] + } + mock_http_exists.return_value = False + url = resolve_purl("pkg:pypi/example@1.0.0") + assert url is None From 56156b0e47ca46c2c59acd111da223fae962f204 Mon Sep 17 00:00:00 2001 From: Tushar Goel Date: Thu, 7 Aug 2025 14:44:52 +0530 Subject: [PATCH 6/6] Fix README indentation Signed-off-by: Tushar Goel --- README.rst | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/README.rst b/README.rst index ef17f90..cd2139e 100644 --- a/README.rst +++ b/README.rst @@ -72,28 +72,28 @@ Fetch a purl and get a ``fetchcode.fetch.Response`` object back:: Ecosystems supported for fetching a purl from fetchcode: - - alpm - - apk - - bitbucket - - cargo - - composer - - conda - - cpan - - cran - - deb - - gem - - generic - - github - - golang - - hackage - - hex - - luarocks - - maven - - npm - - nuget - - pub - - pypi - - swift +- alpm +- apk +- bitbucket +- cargo +- composer +- conda +- cpan +- cran +- deb +- gem +- generic +- github +- golang +- hackage +- hex +- luarocks +- maven +- npm +- nuget +- pub +- pypi +- swift License --------