diff --git a/.github/workflows/daily-update.yml b/.github/workflows/daily-update.yml index 015a332..f2858d2 100644 --- a/.github/workflows/daily-update.yml +++ b/.github/workflows/daily-update.yml @@ -57,6 +57,6 @@ jobs: run: | git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" - git add cloud_providers_v2.json README.md + git add cloud_providers_v3.json README.md git commit -m "chore: daily signature update $(date -u +%Y-%m-%d)" git push origin HEAD:stable diff --git a/Cargo.lock b/Cargo.lock index c7bc914..8742642 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -231,7 +231,7 @@ checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" [[package]] name = "cloudcheck" -version = "9.3.0" +version = "10.0.0" dependencies = [ "axum", "clap", diff --git a/Cargo.toml b/Cargo.toml index 3e404ef..3ccf219 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cloudcheck" -version = "9.3.0" +version = "10.0.0" edition = "2024" description = "CloudCheck is a simple Rust tool to check whether an IP address or hostname belongs to a cloud provider." license = "GPL-3.0" diff --git a/README.md b/README.md index 5221b0b..e555588 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ CloudCheck is a simple Rust tool to check whether an IP address or hostname belo ## Cloud Provider Signatures -The latest cloud provider signatures are available in [`cloud_providers_v2.json`](https://github.com/blacklanternsecurity/cloudcheck/blob/master/cloud_providers_v2.json), which is updated daily via [CI/CD](.github/workflows/daily-update.yml). Domains associated with each cloud provider are fetched dynamically from the [v2fly community repository](https://github.com/v2fly/domain-list-community), and CIDRs are fetched from [ASNDB](https://asndb.api.bbot.io/). +The latest cloud provider signatures are available in [`cloud_providers_v3.json`](https://github.com/blacklanternsecurity/cloudcheck/blob/master/cloud_providers_v3.json), which is updated daily via [CI/CD](.github/workflows/daily-update.yml). Domains associated with each cloud provider are fetched dynamically from the [v2fly community repository](https://github.com/v2fly/domain-list-community), and CIDRs are fetched from [ASNDB](https://asndb.api.bbot.io/). Used by [BBOT](https://github.com/blacklanternsecurity/bbot) and [BBOT Server](https://github.com/blacklanternsecurity/bbot-server). diff --git a/cloudcheck/helpers.py b/cloudcheck/helpers.py index 1b0aaee..7b645fc 100644 --- a/cloudcheck/helpers.py +++ b/cloudcheck/helpers.py @@ -1,9 +1,12 @@ import ipaddress import os +import sys import httpx from pathlib import Path from typing import List, Set, Union +_warned_missing_api_key = False + def defrag_cidrs( cidrs: List[Union[ipaddress.IPv4Network, ipaddress.IPv6Network]], @@ -202,12 +205,22 @@ def strings_to_cidrs( def request(url, include_api_key=False, browser_headers=False, timeout=60, **kwargs): + global _warned_missing_api_key headers = kwargs.get("headers", {}) if browser_headers: headers.update(browser_base_headers) bbot_io_api_key = os.getenv("BBOT_IO_API_KEY") - if include_api_key and bbot_io_api_key: - headers["Authorization"] = f"Bearer {bbot_io_api_key}" + if include_api_key: + if bbot_io_api_key: + headers["Authorization"] = f"Bearer {bbot_io_api_key}" + elif not _warned_missing_api_key: + _warned_missing_api_key = True + print( + "WARNING: BBOT_IO_API_KEY env var is not set; asndb requests will be " + "unauthenticated and may be rate-limited. Export BBOT_IO_API_KEY before " + "running the update.", + file=sys.stderr, + ) kwargs["headers"] = headers kwargs["timeout"] = timeout kwargs.setdefault("follow_redirects", True) diff --git a/cloudcheck/providers/amazon.py b/cloudcheck/providers/amazon.py index bca538f..50cf782 100644 --- a/cloudcheck/providers/amazon.py +++ b/cloudcheck/providers/amazon.py @@ -25,10 +25,21 @@ class Amazon(BaseProvider): ] tags: List[str] = ["cloud"] _bucket_name_regex = r"[a-z0-9_][a-z0-9-\.]{1,61}[a-z0-9]" + _region_regex = r"[a-z]{2}-[a-z]+-\d+" regexes: Dict[str, List[str]] = { - "STORAGE_BUCKET_NAME": [_bucket_name_regex], + "STORAGE_BUCKET_NAME": [r"(?P" + _bucket_name_regex + r")"], "STORAGE_BUCKET_HOSTNAME": [ - r"(" + _bucket_name_regex + r")\.(s3-?(?:[a-z0-9-]*\.){1,2}amazonaws\.com)" + r"(?P" + _bucket_name_regex + r")\.s3\.amazonaws\.com", + r"(?P" + + _bucket_name_regex + + r")\.s3-(?P" + + _region_regex + + r")\.amazonaws\.com", + r"(?P" + + _bucket_name_regex + + r")\.s3\.(?P" + + _region_regex + + r")\.amazonaws\.com", ], } diff --git a/cloudcheck/providers/base.py b/cloudcheck/providers/base.py index c72aa15..377d541 100644 --- a/cloudcheck/providers/base.py +++ b/cloudcheck/providers/base.py @@ -67,7 +67,6 @@ def __init__(self, **data): self._cache_dir = Path.home() / ".cache" / "cloudcheck" self._repo_url = "https://github.com/v2fly/domain-list-community.git" self._asndb_url = os.getenv("ASNDB_URL", "https://asndb.api.bbot.io/v1") - self._bbot_io_api_key = os.getenv("BBOT_IO_API_KEY") def update(self): print(f"Updating {self.name}") @@ -184,6 +183,7 @@ def _fetch_org_id(self, org_id: str): print(f"Fetching {url}") res = self.request(url, include_api_key=True) print(f"{url} -> {res}: {res.text}") + res.raise_for_status() j = res.json() return [a["asn"] for a in j.get("asns", [])], [] except Exception as e: @@ -242,6 +242,7 @@ def fetch_asn( try: res = self.request(url, include_api_key=True) print(f"{url} -> {res.text}") + res.raise_for_status() j = res.json() cidrs = j.get("subnets", []) except Exception as e: diff --git a/cloudcheck/providers/cloudflare.py b/cloudcheck/providers/cloudflare.py index c14bcab..f0bf29b 100644 --- a/cloudcheck/providers/cloudflare.py +++ b/cloudcheck/providers/cloudflare.py @@ -23,10 +23,10 @@ class Cloudflare(BaseProvider): ] _bucket_name_regex = r"[a-z0-9_][a-z0-9-\.]{1,61}[a-z0-9]" regexes: Dict[str, List[str]] = { - "STORAGE_BUCKET_NAME": [_bucket_name_regex], + "STORAGE_BUCKET_NAME": [r"(?P" + _bucket_name_regex + r")"], "STORAGE_BUCKET_HOSTNAME": [ - r"(" + _bucket_name_regex + r")\.(r2\.dev)", - r"(" + _bucket_name_regex + r")\.(r2\.cloudflarestorage\.com)", + r"(?P" + _bucket_name_regex + r")\.r2\.dev", + r"(?P" + _bucket_name_regex + r")\.r2\.cloudflarestorage\.com", ], } diff --git a/cloudcheck/providers/digitalocean.py b/cloudcheck/providers/digitalocean.py index 4fabbdb..aa44867 100644 --- a/cloudcheck/providers/digitalocean.py +++ b/cloudcheck/providers/digitalocean.py @@ -13,10 +13,15 @@ class DigitalOcean(BaseProvider): "DO-13-ARIN", ] _bucket_name_regex = r"[a-z0-9][a-z0-9-]{2,62}" + _region_regex = r"[a-z]{3}\d" regexes: Dict[str, List[str]] = { - "STORAGE_BUCKET_NAME": [_bucket_name_regex], + "STORAGE_BUCKET_NAME": [r"(?P" + _bucket_name_regex + r")"], "STORAGE_BUCKET_HOSTNAME": [ - r"(" + _bucket_name_regex + r")\.([a-z]{3}[\d]{1}\.digitaloceanspaces\.com)" + r"(?P" + + _bucket_name_regex + + r")\.(?P" + + _region_regex + + r")\.digitaloceanspaces\.com" ], } diff --git a/cloudcheck/providers/gocache.py b/cloudcheck/providers/gocache.py index dfb48ec..7b08d91 100644 --- a/cloudcheck/providers/gocache.py +++ b/cloudcheck/providers/gocache.py @@ -5,7 +5,9 @@ class Gocache(BaseProvider): tags: List[str] = ["cdn"] short_description: str = "GoCache" - long_description: str = "A Brazilian content delivery network provider offering CDN services." + long_description: str = ( + "A Brazilian content delivery network provider offering CDN services." + ) _ips_url = "https://gocache.com.br/ips" diff --git a/cloudcheck/providers/google.py b/cloudcheck/providers/google.py index 884a226..6ca0d59 100644 --- a/cloudcheck/providers/google.py +++ b/cloudcheck/providers/google.py @@ -43,10 +43,13 @@ class Google(BaseProvider): _bucket_name_regex = r"[a-z0-9][a-z0-9-_\.]{1,61}[a-z0-9]" _firebase_bucket_name_regex = r"[a-z0-9][a-z0-9-\.]{1,61}[a-z0-9]" regexes: Dict[str, List[str]] = { - "STORAGE_BUCKET_NAME": [_bucket_name_regex, _firebase_bucket_name_regex], + "STORAGE_BUCKET_NAME": [ + r"(?P" + _bucket_name_regex + r")", + r"(?P" + _firebase_bucket_name_regex + r")", + ], "STORAGE_BUCKET_HOSTNAME": [ - r"(" + _firebase_bucket_name_regex + r")\.(firebaseio\.com)", - r"(" + _bucket_name_regex + r")\.(storage\.googleapis\.com)", + r"(?P" + _firebase_bucket_name_regex + r")\.firebaseio\.com", + r"(?P" + _bucket_name_regex + r")\.storage\.googleapis\.com", ], } diff --git a/cloudcheck/providers/hetzner.py b/cloudcheck/providers/hetzner.py index 45c4a98..48aed68 100644 --- a/cloudcheck/providers/hetzner.py +++ b/cloudcheck/providers/hetzner.py @@ -12,9 +12,14 @@ class Hetzner(BaseProvider): "ORG-HOA1-RIPE", ] _bucket_name_regex = r"[a-z0-9][a-z0-9-_\.]{1,61}[a-z0-9]" + _region_regex = r"[a-z]{3}\d" regexes: Dict[str, List[str]] = { - "STORAGE_BUCKET_NAME": [_bucket_name_regex], + "STORAGE_BUCKET_NAME": [r"(?P" + _bucket_name_regex + r")"], "STORAGE_BUCKET_HOSTNAME": [ - r"(" + _bucket_name_regex + r")\.(your-objectstorage\.com)" + r"(?P" + + _bucket_name_regex + + r")\.(?P" + + _region_regex + + r")\.your-objectstorage\.com" ], } diff --git a/cloudcheck/providers/microsoft.py b/cloudcheck/providers/microsoft.py index f77a77f..a1a83ec 100644 --- a/cloudcheck/providers/microsoft.py +++ b/cloudcheck/providers/microsoft.py @@ -21,9 +21,9 @@ class Microsoft(BaseProvider): ] _bucket_name_regex = r"[a-z0-9][a-z0-9-_\.]{1,61}[a-z0-9]" regexes: Dict[str, List[str]] = { - "STORAGE_BUCKET_NAME": [_bucket_name_regex], + "STORAGE_BUCKET_NAME": [r"(?P" + _bucket_name_regex + r")"], "STORAGE_BUCKET_HOSTNAME": [ - r"(" + _bucket_name_regex + r")\.(blob\.core\.windows\.net)" + r"(?P" + _bucket_name_regex + r")\.blob\.core\.windows\.net" ], } diff --git a/cloudcheck_update/__init__.py b/cloudcheck_update/__init__.py index fb75991..9da2ff1 100644 --- a/cloudcheck_update/__init__.py +++ b/cloudcheck_update/__init__.py @@ -12,7 +12,7 @@ project_root = Path(__file__).parent.parent -json_path = project_root / "cloud_providers_v2.json" +json_path = project_root / "cloud_providers_v3.json" def _update_provider(provider_class): diff --git a/pyproject.toml b/pyproject.toml index f32c999..75733b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "cloudcheck" -version = "9.3.0" +version = "10.0.0" description = "Detailed database of cloud providers. Instantly look up a domain or IP address" readme = "README.md" requires-python = ">=3.9" diff --git a/scripts/update_readme_table.py b/scripts/update_readme_table.py index f6bd853..49d4c76 100755 --- a/scripts/update_readme_table.py +++ b/scripts/update_readme_table.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Update README.md with a provider table generated from cloud_providers_v2.json""" +"""Update README.md with a provider table generated from cloud_providers_v3.json""" import json import re @@ -7,7 +7,7 @@ def load_providers(json_path: Path): - """Load providers from cloud_providers_v2.json""" + """Load providers from cloud_providers_v3.json""" with open(json_path, "r") as f: return json.load(f) @@ -83,7 +83,7 @@ def update_readme(readme_path: Path, table: str): def main(): project_root = Path(__file__).parent.parent - json_path = project_root / "cloud_providers_v2.json" + json_path = project_root / "cloud_providers_v3.json" readme_path = project_root / "README.md" if not json_path.exists(): diff --git a/src/lib.rs b/src/lib.rs index b6b9444..5a10839 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,7 @@ use tokio::sync::{Mutex, RwLock}; #[cfg(feature = "py")] mod python; -const CLOUDCHECK_SIGNATURE_URL: &str = "https://raw.githubusercontent.com/blacklanternsecurity/cloudcheck/refs/heads/stable/cloud_providers_v2.json"; +const CLOUDCHECK_SIGNATURE_URL: &str = "https://raw.githubusercontent.com/blacklanternsecurity/cloudcheck/refs/heads/stable/cloud_providers_v3.json"; #[derive(Debug, Clone, Serialize, Deserialize, utoipa::ToSchema)] pub struct CloudProvider { @@ -86,7 +86,7 @@ impl CloudCheck { let mut path = PathBuf::from(home); path.push(".cache"); path.push("cloudcheck"); - path.push("cloud_providers_v2.json"); + path.push("cloud_providers_v3.json"); Ok(path) } diff --git a/test_cloudcheck.py b/test_cloudcheck.py index 28f970e..aa4030d 100644 --- a/test_cloudcheck.py +++ b/test_cloudcheck.py @@ -1,3 +1,5 @@ +import re + import pytest from cloudcheck import CloudCheck, CloudCheckError @@ -25,10 +27,13 @@ async def test_lookup_with_invalid_url(): signature_url="https://invalid.example.com/nonexistent.json", max_retries=2, retry_delay_seconds=0, - force_refresh=True + force_refresh=True, ) - - with pytest.raises(CloudCheckError, match=r"Failed to fetch cloud provider data from https://invalid\.example\.com/nonexistent\.json after 3 attempts"): + + with pytest.raises( + CloudCheckError, + match=r"Failed to fetch cloud provider data from https://invalid\.example\.com/nonexistent\.json after 3 attempts", + ): await cloudcheck.lookup("8.8.8.8") @@ -36,3 +41,70 @@ def test_import_provider(): from cloudcheck.providers import Amazon assert Amazon.regexes + + +@pytest.mark.parametrize( + "provider_name,hostname,expected", + [ + ("Amazon", "mybucket.s3.amazonaws.com", {"name": "mybucket"}), + ( + "Amazon", + "mybucket.s3-us-west-2.amazonaws.com", + {"name": "mybucket", "region": "us-west-2"}, + ), + ( + "Amazon", + "mybucket.s3.eu-central-1.amazonaws.com", + {"name": "mybucket", "region": "eu-central-1"}, + ), + ( + "DigitalOcean", + "mybucket.nyc3.digitaloceanspaces.com", + {"name": "mybucket", "region": "nyc3"}, + ), + ( + "Hetzner", + "mybucket.fsn1.your-objectstorage.com", + {"name": "mybucket", "region": "fsn1"}, + ), + ("Google", "mybucket.storage.googleapis.com", {"name": "mybucket"}), + ("Google", "mybucket.firebaseio.com", {"name": "mybucket"}), + ("Microsoft", "mybucket.blob.core.windows.net", {"name": "mybucket"}), + ("Cloudflare", "mybucket.r2.dev", {"name": "mybucket"}), + ( + "Cloudflare", + "mybucket.r2.cloudflarestorage.com", + {"name": "mybucket"}, + ), + ], +) +def test_storage_bucket_hostname_named_groups(provider_name, hostname, expected): + import cloudcheck.providers as providers + + provider_cls = getattr(providers, provider_name) + patterns = provider_cls.regexes["STORAGE_BUCKET_HOSTNAME"] + for pattern in patterns: + match = re.fullmatch(pattern, hostname) + if match: + groups = {k: v for k, v in match.groupdict().items() if v is not None} + assert groups == expected, ( + f"{provider_name}: {hostname} matched {pattern} but groups={groups}" + ) + return + pytest.fail( + f"{provider_name}: no STORAGE_BUCKET_HOSTNAME pattern matched {hostname}" + ) + + +def test_all_storage_bucket_regexes_compile(): + from cloudcheck.providers import load_provider_classes + + for provider_cls in load_provider_classes().values(): + regexes = provider_cls.model_fields["regexes"].default or {} + for category, patterns in regexes.items(): + for pattern in patterns: + compiled = re.compile(pattern) + assert "name" in compiled.groupindex, ( + f"{provider_cls.__name__} {category} pattern {pattern!r} " + f"is missing the 'name' named group" + ) diff --git a/uv.lock b/uv.lock index beccc16..77e7f54 100644 --- a/uv.lock +++ b/uv.lock @@ -49,7 +49,7 @@ wheels = [ [[package]] name = "cloudcheck" -version = "9.3.0" +version = "10.0.0" source = { editable = "." } [package.dev-dependencies]