diff --git a/Makefile b/Makefile index bc8eeaea..0e416510 100644 --- a/Makefile +++ b/Makefile @@ -56,27 +56,22 @@ envfile_testing: envfile @echo SCANCODEIO_DB_USER=\"postgres\" >> ${ENV_FILE} @echo SCANCODEIO_DB_PASSWORD=\"postgres\" >> ${ENV_FILE} -isort: - @echo "-> Apply isort changes to ensure proper imports ordering" - ${VENV}/bin/isort . - -black: - @echo "-> Apply black code formatter" - ${VENV}/bin/black . - doc8: @echo "-> Run doc8 validation" @${ACTIVATE} doc8 --max-line-length 100 --ignore-path docs/_build/ --quiet docs/ -valid: isort black +valid: + @echo "-> Run Ruff format" + @${ACTIVATE} ruff format --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/ + @echo "-> Run Ruff linter" + @${ACTIVATE} ruff check --fix --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/ check: - @echo "-> Run pycodestyle (PEP8) validation" - @${ACTIVATE} pycodestyle --max-line-length=100 --exclude=venv,lib,thirdparty,docs,migrations,settings.py . - @echo "-> Run isort imports ordering validation" - @${ACTIVATE} isort --check-only . - @echo "-> Run black validation" - @${ACTIVATE} black --check ${BLACK_ARGS} + @echo "-> Run Ruff linter validation (pycodestyle, bandit, isort, and more)" + @${ACTIVATE} ruff check --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/ + @echo "-> Run Ruff format validation" + @${ACTIVATE} ruff format --check --exclude etc/scripts/ --exclude purldb-toolkit/ --exclude purl2vcs/ + @$(MAKE) doc8 clean: @echo "-> Clean the Python env" diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3a108dbd..9e157f2e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -40,3 +40,11 @@ jobs: python_versions: ['3.10'] test_suites: all: make check_docs + + - template: etc/ci/azure-posix.yml + parameters: + job_name: ci_code_style + image_name: ubuntu-22.04 + python_versions: ['3.10'] + test_suites: + all: make check diff --git a/clearcode/cdutils.py b/clearcode/cdutils.py index 0469c4a3..5645cc41 100644 --- a/clearcode/cdutils.py +++ b/clearcode/cdutils.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # @@ -18,23 +17,21 @@ # limitations under the License. import base64 +import os +import time from hashlib import md5 from itertools import zip_longest -import os from os import path -import subprocess -import time -from urllib.parse import urlsplit -from urllib.parse import urlunsplit from urllib.parse import parse_qs from urllib.parse import quote_plus from urllib.parse import unquote_plus +from urllib.parse import urlsplit +from urllib.parse import urlunsplit import attr import click -from packageurl import PackageURL import requests - +from packageurl import PackageURL """ ClearlyDefined utlities. 
@@ -46,67 +43,65 @@ PACKAGE_TYPES_BY_CD_TYPE = { - 'crate': 'cargo', - 'deb': 'deb', - 'debsrc': 'deb', + "crate": "cargo", + "deb": "deb", + "debsrc": "deb", # Currently used only for maven packages - 'sourcearchive': 'maven', - 'maven': 'maven', - 'composer': 'composer', + "sourcearchive": "maven", + "maven": "maven", + "composer": "composer", # Currently used only for Github repo/packages - 'git': 'github', - 'pod': 'pod', - 'nuget': 'nuget', - 'pypi': 'pypi', - 'gem': 'gem', - 'npm': 'npm', - 'go': 'golang', + "git": "github", + "pod": "pod", + "nuget": "nuget", + "pypi": "pypi", + "gem": "gem", + "npm": "npm", + "go": "golang", } PACKAGE_TYPES_BY_PURL_TYPE = { - 'cargo': 'crate', - 'deb': 'deb', - 'maven': 'maven', - 'composer': 'composer', - 'github': 'git', - 'pod': 'pod', - 'nuget': 'nuget', - 'pypi': 'pypi', - 'gem': 'gem', - 'npm': 'npm', + "cargo": "crate", + "deb": "deb", + "maven": "maven", + "composer": "composer", + "github": "git", + "pod": "pod", + "nuget": "nuget", + "pypi": "pypi", + "gem": "gem", + "npm": "npm", } PROVIDERS_BY_PURL_TYPE = { - 'cargo': 'cratesio', - 'deb': 'debian', - 'maven': 'mavencentral', - 'composer': 'packagist', + "cargo": "cratesio", + "deb": "debian", + "maven": "mavencentral", + "composer": "packagist", # Currently used only for Github repo/packages - 'git': 'github', - 'github': 'github', - 'pod': 'cocoapods', - 'nuget': 'nuget', - 'pypi': 'pypi', - 'gem': 'rubygem', - 'npm': 'npmjs', + "git": "github", + "github": "github", + "pod": "cocoapods", + "nuget": "nuget", + "pypi": "pypi", + "gem": "rubygem", + "npm": "npmjs", } QUALIFIERS_BY_CD_TYPE = { - 'sourcearchive': {'classifier': 'sources'}, - 'debsrc': {'arch': 'source'} + "sourcearchive": {"classifier": "sources"}, + "debsrc": {"arch": "source"}, } @attr.s(slots=True) -class Coordinate(object): - """ - ClearlyDefined coordinates are used to identify any tracked component. 
- """ +class Coordinate: + """ClearlyDefined coordinates are used to identify any tracked component.""" - base_api_url = 'https://dev-api.clearlydefined.io' + base_api_url = "https://dev-api.clearlydefined.io" type = attr.ib() provider = attr.ib() @@ -115,15 +110,15 @@ class Coordinate(object): revision = attr.ib() def __attrs_post_init__(self, *args, **kwargs): - if self.provider == 'debian': - self.namespace = 'debian' + if self.provider == "debian": + self.namespace = "debian" if not self.namespace: - self.namespace = '-' + self.namespace = "-" @classmethod def from_dict(cls, coords): - if 'namespace' not in coords: - coords['namespace'] = '-' + if "namespace" not in coords: + coords["namespace"] = "-" return cls(**coords) def to_dict(self): @@ -169,20 +164,20 @@ def from_path(cls, pth, root=None): >>> assert expected == test """ - pth = pth.strip('/') + pth = pth.strip("/") if root and root in pth: - root = root.strip('/') + root = root.strip("/") _, _, pth = pth.partition(root) - segments = pth.strip('/').split('/') - if len(segments) >= 6 and segments[4] == 'revision': + segments = pth.strip("/").split("/") + if len(segments) >= 6 and segments[4] == "revision": # AZ blob style # /maven/mavencentral/io.dropwizard/dropwizard/revision/2.0.0-rc13.json # /maven/mavencentral/io.dropwizard/dropwizard/revision/2.0.0-rc13/tool/scancode/3.2.2.json start = segments[:4] version = segments[5] - if version.endswith('.json'): - version, _, _ = version.rpartition('.json') + if version.endswith(".json"): + version, _, _ = version.rpartition(".json") segments = start + [version] else: # plain API paths do not have a /revision/ segment @@ -190,47 +185,45 @@ def from_path(cls, pth, root=None): return cls(*segments) def to_api_path(self): - return '{type}/{provider}/{namespace}/{name}/{revision}'.format(**self.to_dict()) + return "{type}/{provider}/{namespace}/{name}/{revision}".format( + **self.to_dict() + ) def to_def_blob_path(self): - return '{type}/{provider}/{namespace}/{name}/revision/{revision}.json'.format(**self.to_dict()) + return "{type}/{provider}/{namespace}/{name}/revision/{revision}.json".format( + **self.to_dict() + ) def to_harvest_blob_path(self, tool, tool_version): - return '{type}/{provider}/{namespace}/{name}/revision/{revision}/tool/{tool}/{tool_version}.json'.format( - tool=tool, tool_version=tool_version, - **self.to_dict()) + return "{type}/{provider}/{namespace}/{name}/revision/{revision}/tool/{tool}/{tool_version}.json".format( + tool=tool, tool_version=tool_version, **self.to_dict() + ) def get_definition_api_url(self, base_api_url=None): - """ - Return a URL to fetch the full definition. - """ - return '{base_url}/definitions/{type}/{provider}/{namespace}/{name}/{revision}'.format( + """Return a URL to fetch the full definition.""" + return "{base_url}/definitions/{type}/{provider}/{namespace}/{name}/{revision}".format( base_url=base_api_url or self.base_api_url, path=self.to_api_path(), - **self.to_dict()) + **self.to_dict(), + ) def get_harvests_api_url(self, base_api_url=None): - """ - Return a URL to fetch all harvests at once. 
- """ - return '{base_url}/harvest/{type}/{provider}/{namespace}/{name}/{revision}?form=raw'.format( + """Return a URL to fetch all harvests at once.""" + return "{base_url}/harvest/{type}/{provider}/{namespace}/{name}/{revision}?form=raw".format( base_url=base_api_url or self.base_api_url, path=self.to_api_path(), - **self.to_dict()) + **self.to_dict(), + ) def to_def_query_api_url(self, include_revision=False, base_api_url=None): - """ - Return a CD API URL for query definitions. - """ - qs = 'type={type}&provider={provider}&name{name}' + """Return a CD API URL for query definitions.""" + qs = "type={type}&provider={provider}&name{name}" if include_revision: - qs += '&revision={revision}' - if self.namespace and self.namespace != '-': - qs += '&namespace={namespace}' - qs = qs.format( - base_url=base_api_url or self.base_api_url, - **self.to_dict()) - return '{base_url}/definitions?{qs}'.format(**locals()) + qs += "&revision={revision}" + if self.namespace and self.namespace != "-": + qs += "&namespace={namespace}" + qs = qs.format(base_url=base_api_url or self.base_api_url, **self.to_dict()) + return "{base_url}/definitions?{qs}".format(**locals()) def to_purl(self): """ @@ -250,15 +243,18 @@ def to_purl(self): """ converted_package_type = PACKAGE_TYPES_BY_CD_TYPE[self.type] - namespace = '' - if self.namespace != '-': + namespace = "" + if self.namespace != "-": namespace = self.namespace - if self.provider == 'debian': - namespace = 'debian' + if self.provider == "debian": + namespace = "debian" qualifiers = {} - if self.type in ('debsrc', 'sourcearchive',): + if self.type in ( + "debsrc", + "sourcearchive", + ): qualifiers = QUALIFIERS_BY_CD_TYPE[self.type] return PackageURL( @@ -293,14 +289,16 @@ def from_purl(cls, purl): package_type = p.type if package_type not in PACKAGE_TYPES_BY_PURL_TYPE: - raise Exception('Package type is not supported by ClearlyDefined: {}'.format(package_type)) + raise Exception( + f"Package type is not supported by ClearlyDefined: {package_type}" + ) # Handle the source types of Maven and Debian packages - if package_type == 'maven' and p.qualifiers.get('classifier', '') == 'sources': - package_type = 'sourcearchive' - provider = 'mavencentral' - elif package_type == 'deb' and p.qualifiers.get('arch', '') == 'source': - package_type = 'debsrc' - provider = 'debian' + if package_type == "maven" and p.qualifiers.get("classifier", "") == "sources": + package_type = "sourcearchive" + provider = "mavencentral" + elif package_type == "deb" and p.qualifiers.get("arch", "") == "source": + package_type = "debsrc" + provider = "debian" else: package_type = PACKAGE_TYPES_BY_PURL_TYPE[package_type] # TODO: Have way to set other providers? @@ -320,19 +318,21 @@ def get_coordinates(data_dir): Yield tuple of (path, Coordinate) from definition directories from `data_dir` at full depth. """ - data_dir = data_dir.strip('/') + data_dir = data_dir.strip("/") for dirpath, dirnames, _filenames in os.walk(data_dir, followlinks=False): for d in dirnames: pth = path.join(dirpath, d) _, _, cdpth = pth.partition(data_dir) - segments = cdpth.strip('/').split('/') + segments = cdpth.strip("/").split("/") # skip paths that have not the full depth required (e.g. 
5 segments) if not len(segments) == 5: continue yield pth, Coordinate.from_path(cdpth) -def _get_response_content(url, retries=2, wait=2, session=requests, verbose=False, _retries=set()): +def _get_response_content( + url, retries=2, wait=2, session=requests, verbose=False, _retries=set() +): """ Return a tuple of (etag, md5, content bytes) with the content as bytes or as decoded text if `as_text` is True) of the response of a GET HTTP request at `url`. @@ -340,7 +340,7 @@ def _get_response_content(url, retries=2, wait=2, session=requests, verbose=Fals `wait` seconds. """ if verbose: - click.echo(' --> Fetching: {url}'.format(**locals())) + click.echo(" --> Fetching: {url}".format(**locals())) response = session.get(url, timeout=600) status_code = response.status_code @@ -350,25 +350,39 @@ def _get_response_content(url, retries=2, wait=2, session=requests, verbose=Fals # to restart from an earlier continuation if url in _retries: _retries.remove(url) - print(' SUCCESS after Failure to fetch:', url) - etag = response.headers.get('etag') + print(" SUCCESS after Failure to fetch:", url) + etag = response.headers.get("etag") content = response.content checksum = md5(content).hexdigest() return etag, checksum, response.content - error_code = requests.codes.get(status_code) or '' + error_code = requests.codes.get(status_code) or "" if status_code >= 500 and retries: # timeout/522 or other server error: let's wait a bit and retry for "retries" number of retries retries -= 1 - print(' Failure to fetch:', url, 'with', status_code, error_code, 'retrying after waiting:', wait, 'seconds.') + print( + " Failure to fetch:", + url, + "with", + status_code, + error_code, + "retrying after waiting:", + wait, + "seconds.", + ) _retries.add(url) time.sleep(wait) return _get_response_content( - url=url, retries=retries, wait=wait, session=session, verbose=verbose) + url=url, retries=retries, wait=wait, session=session, verbose=verbose + ) # all other errors - raise Exception('Failed HTTP request for {url} : error: {status_code} : {error_code}'.format(**locals())) + raise Exception( + "Failed HTTP request for {url} : error: {status_code} : {error_code}".format( + **locals() + ) + ) def get_response_content(url, retries=2, wait=4, session=requests, verbose=False): @@ -378,25 +392,31 @@ def get_response_content(url, retries=2, wait=4, session=requests, verbose=False """ try: return _get_response_content( - url=url, retries=retries, wait=wait, - session=session, verbose=verbose) + url=url, retries=retries, wait=wait, session=session, verbose=verbose + ) except Exception as e: if retries: - print(' Failure to fetch:', url, 'with error:', e, 'and retrying after waiting:', wait, 'seconds.') + print( + " Failure to fetch:", + url, + "with error:", + e, + "and retrying after waiting:", + wait, + "seconds.", + ) # we sleep progressively more after each failure and up to wait seconds time.sleep(int(wait / (retries or 1))) retries -= 1 return get_response_content( - url=url, retries=retries, wait=wait, - session=session, verbose=verbose) + url=url, retries=retries, wait=wait, session=session, verbose=verbose + ) else: raise def split_url(url): - """ - Given a URL, return a tuple of URL elements where `query` is a mapping. 
- """ + """Given a URL, return a tuple of URL elements where `query` is a mapping.""" scheme, netloc, path, query, fragment = urlsplit(url) query = parse_qs(query) return scheme, netloc, path, query, fragment @@ -408,25 +428,24 @@ def join_qs(keys_values, do_not_quote=()): Quote values unless the name is in in the `do_not_quote` set. """ keys_values = { - k: (v[0] if v and isinstance(v, list) else v) for k, v in keys_values.items()} - return '&'.join('='.join([k, v if k in do_not_quote else quote_plus(v)]) - for k, v in keys_values.items()) + k: (v[0] if v and isinstance(v, list) else v) for k, v in keys_values.items() + } + return "&".join( + "=".join([k, v if k in do_not_quote else quote_plus(v)]) + for k, v in keys_values.items() + ) def append_path_to_url(url, extra_path): - """ - Return a new `url` with `extra_path` appended to its path. - """ + """Return a new `url` with `extra_path` appended to its path.""" scheme, netloc, path, query, fragment = split_url(url) - path = path.strip('/') + '/' + extra_path.strip('/') + path = path.strip("/") + "/" + extra_path.strip("/") segments = scheme, netloc, path, join_qs(query), fragment return urlunsplit(segments) def update_url(url, qs_mapping, do_not_quote=()): - """ - Return a new `url` with its query string updated from a mapping of key/value pairs. - """ + """Return a new `url` with its query string updated from a mapping of key/value pairs.""" scheme, netloc, path, query, fragment = split_url(url) query.update(qs_mapping) segments = scheme, netloc, path, join_qs(query, do_not_quote=do_not_quote), fragment @@ -434,10 +453,8 @@ def update_url(url, qs_mapping, do_not_quote=()): def build_cdapi_continuation_url(api_url, continuation_token): - """ - Return a new `api_url` with a CD API `continuation_token`. - """ - return update_url(api_url, {'continuationToken': continuation_token}) + """Return a new `api_url` with a CD API `continuation_token`.""" + return update_url(api_url, {"continuationToken": continuation_token}) def build_cdapi_continuation_url_from_coordinates(api_url, coordinates): @@ -458,15 +475,15 @@ def split_cdapi_url(url): # get a continuation-free base URL. This assumes that the continuationToken # is always the last query string param if it is present. scheme, netloc, url, query, fragment = split_url(url) - token = query.pop('continuationToken', None) + token = query.pop("continuationToken", None) if token: token = token[0] - if '%' in token: + if "%" in token: token = unquote_plus(token) segments = scheme, netloc, url, join_qs(query), fragment unparsed = urlunsplit(segments) if TRACE: - print('split_cdapi_url:', 'unparsed:', unparsed, 'token:', token) + print("split_cdapi_url:", "unparsed:", unparsed, "token:", token) return unparsed, token @@ -483,19 +500,17 @@ def get_coord_from_cdapi_continuation_url(api_url): def get_coord_from_cdapi_continuation(continuation): - """ - Given an encoded continuation token, return a string of CD coordinates. 
- """ + """Given an encoded continuation token, return a string of CD coordinates.""" if TRACE: - print('get_coord_from_cdapi_continuation: continuation:', continuation) - continuation = continuation.replace(' ', '+') + print("get_coord_from_cdapi_continuation: continuation:", continuation) + continuation = continuation.replace(" ", "+") - if '%' in continuation: + if "%" in continuation: continuation = unquote_plus(continuation) decoded = base64.b64decode(continuation) if not isinstance(decoded, str): - decoded = decoded.decode('utf-8') + decoded = decoded.decode("utf-8") return decoded @@ -506,10 +521,10 @@ def get_cdapi_continuation_token(coord): """ if isinstance(coord, dict): coord = coord2str(coord) - coord = coord.replace(' ', '+') - encoded = coord.encode('utf-8') + coord = coord.replace(" ", "+") + encoded = coord.encode("utf-8") - return base64.b64encode(encoded).decode('utf-8') + return base64.b64encode(encoded).decode("utf-8") def str2coord(s): @@ -521,17 +536,23 @@ def str2coord(s): URN: "urn:gem:rubygems:-:mocha:revision:1.7.0:tool:scancode:3.1.0" plain: /gem/rubygems/foo/mocha/1.7.0" """ - #TODO: Add doctest - is_urn = s.startswith('urn') - is_url = s.startswith('cd:') - splitter = ':' if is_urn else '/' + # TODO: Add doctest + is_urn = s.startswith("urn") + is_url = s.startswith("cd:") + splitter = ":" if is_urn else "/" segments = s.strip(splitter).split(splitter) if is_urn or is_url: segments = segments[1:] # ignore extra segments for now beyond the 5 fisrt (such as the PR of a curation) segments = segments[:5] - fields = ('type', 'provider', 'namespace', 'name', 'revision',) + fields = ( + "type", + "provider", + "namespace", + "name", + "revision", + ) return dict(zip_longest(fields, segments)) @@ -547,18 +568,17 @@ def coord2str(coord): "name": "license-expression", "revision": "70277cdfc186466667cb58ec9f9c7281e68a221b" """ - assert coord, 'Empty or missing coordinate mapping: {}'.format(coord) - rev = coord.get('revision') + assert coord, f"Empty or missing coordinate mapping: {coord}" + rev = coord.get("revision") kwargs = dict( - t=coord['type'], - p=coord['provider'], - ns=coord.get('namespace') or '-', - n=coord['name'], + t=coord["type"], + p=coord["provider"], + ns=coord.get("namespace") or "-", + n=coord["name"], r=rev, ) if rev: - template = '{t}/{p}/{ns}/{n}/{r}' + template = "{t}/{p}/{ns}/{n}/{r}" else: - template = '{t}/{p}/{ns}/{n}' + template = "{t}/{p}/{ns}/{n}" return template.format(**kwargs) - diff --git a/clearcode/load.py b/clearcode/load.py index a2889d63..c90a4b48 100644 --- a/clearcode/load.py +++ b/clearcode/load.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # @@ -16,16 +15,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import multiprocessing
 import os
-from pathlib import Path
 import sys
 
 from django.db.utils import IntegrityError
 
 import click
 
-
 """
 Load ClearlyDefined definitions and harvests from the filesystem
 
@@ -52,30 +48,30 @@ def walk_and_load_from_filesystem(input_dir, cd_root_dir):
     CDitem.path = npm/npmjs/@actions/github/revision/2.1.1.json.gz
     CDitem.content = 'the file: 2.1.1.json.gz in bytes'
     """
-    # for now, we count dirs too
     file_counter = 1
     for root, dirs, files in os.walk(input_dir):
         for filename in files:
             # output some progress
-            print('  ', end='\r')
-            print("Processing file #{}".format(file_counter), end='\r')
-            file_counter +=1
+            print("  ", end="\r")
+            print(f"Processing file #{file_counter}", end="\r")
+            file_counter += 1
 
             # TODO: check if the location is actually a CD data item.
             full_gzip_path = os.path.join(root, filename)
-            full_json_path = full_gzip_path.rstrip('.gz')
+            full_json_path = full_gzip_path.removesuffix(".gz")  # not rstrip(): that strips characters, not a suffix
 
             # normalize the `path` value by removing the arbitrary parent directory
             cditem_rel_path = os.path.relpath(full_json_path, cd_root_dir)
-            with open(full_gzip_path, mode='rb') as f:
+            with open(full_gzip_path, mode="rb") as f:
                 content = f.read()
 
             from clearcode import models
+
             # Save to DB
             try:
-                cditem = models.CDitem.objects.create(path=cditem_rel_path, content=content)
+                models.CDitem.objects.create(path=cditem_rel_path, content=content)
             except IntegrityError:
                 # skip if we already have it in the DB
                 continue
@@ -87,41 +83,38 @@ def load(input_dir=None, cd_root_dir=None, *arg, **kwargs):
     creating CDItem objects and loading them into a PostgreSQL database.
     """
     if not input_dir:
-        sys.exit('Please specify an input directory using the `--input-dir` option.')
+        sys.exit("Please specify an input directory using the `--input-dir` option.")
    if not cd_root_dir:
-        sys.exit('Please specify the cd-root-directory using the --cd-root-dir option.')
+        sys.exit("Please specify the cd-root directory using the `--cd-root-dir` option.")
 
     # get proper DB setup
     walk_and_load_from_filesystem(input_dir, cd_root_dir)
-    print('  ', end='\r')
+    print("  ", end="\r")
     print("Loading complete")
 
 
 @click.command()
-
-@click.option('--input-dir',
-    type=click.Path(), metavar='DIR',
-    help='Load content from this input directory that contains a tree of gzip-compressed JSON CD files')
-
-@click.option('--cd-root-dir',
-    type=click.Path(), metavar='DIR',
-    help='specify root directory that contains a tree of gzip-compressed JSON CD files')
-
-@click.help_option('-h', '--help')
-
+@click.option(
+    "--input-dir",
+    type=click.Path(),
+    metavar="DIR",
+    help="Load content from this input directory that contains a tree of gzip-compressed JSON CD files",
+)
+@click.option(
+    "--cd-root-dir",
+    type=click.Path(),
+    metavar="DIR",
+    help="Specify root directory that contains a tree of gzip-compressed JSON CD files",
+)
+@click.help_option("-h", "--help")
 def cli(input_dir=None, cd_root_dir=None, *arg, **kwargs):
     """
     Handle ClearlyDefined gzipped JSON scans by walking a clearsync directory
     structure, creating CDItem objects and loading them into a PostgreSQL
    database.
""" - load( - input_dir=input_dir, - cd_root_dir=cd_root_dir, - *arg, - **kwargs - ) + load(input_dir=input_dir, cd_root_dir=cd_root_dir, *arg, **kwargs) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/clearcode/management/commands/clearload.py b/clearcode/management/commands/clearload.py index c56c2f48..476b58bb 100644 --- a/clearcode/management/commands/clearload.py +++ b/clearcode/management/commands/clearload.py @@ -19,23 +19,22 @@ class Command(VerboseCommand): def add_arguments(self, parser): parser.add_argument( - '--input-dir', - dest='input_dir', + "--input-dir", + dest="input_dir", default=None, type=str, - help='Load content from this input directory that contains a tree of gzip-compressed JSON CD files') + help="Load content from this input directory that contains a tree of gzip-compressed JSON CD files", + ) parser.add_argument( - '--cd-root-dir', - dest='cd_root_dir', + "--cd-root-dir", + dest="cd_root_dir", default=None, type=str, - help='Specify root directory that contains a tree of gzip-compressed JSON CD files') + help="Specify root directory that contains a tree of gzip-compressed JSON CD files", + ) def handle(self, *args, **options): - input_dir = options.get('input_dir') - cd_root_dir = options.get('cd_root_dir') + input_dir = options.get("input_dir") + cd_root_dir = options.get("cd_root_dir") - load( - input_dir=input_dir, - cd_root_dir=cd_root_dir - ) + load(input_dir=input_dir, cd_root_dir=cd_root_dir) diff --git a/clearcode/management/commands/clearsync.py b/clearcode/management/commands/clearsync.py index ab7cf94a..4b8f74a6 100644 --- a/clearcode/management/commands/clearsync.py +++ b/clearcode/management/commands/clearsync.py @@ -20,76 +20,86 @@ class Command(VerboseCommand): def add_arguments(self, parser): parser.add_argument( - '--output-dir', - dest='output_dir', + "--output-dir", + dest="output_dir", default=None, type=str, - help='Save fetched content as compressed gzipped files to this output directory.') + help="Save fetched content as compressed gzipped files to this output directory.", + ) parser.add_argument( - '--save-to-db', - dest='save_to_db', - action='store_true', - help='Save fetched content as compressed gzipped blobs in the configured database.') + "--save-to-db", + dest="save_to_db", + action="store_true", + help="Save fetched content as compressed gzipped blobs in the configured database.", + ) parser.add_argument( - '--unsorted', - dest='unsorted', - action='store_true', - help='Fetch data without any sorting. The default is to fetch data sorting by latest updated first.') + "--unsorted", + dest="unsorted", + action="store_true", + help="Fetch data without any sorting. The default is to fetch data sorting by latest updated first.", + ) parser.add_argument( - '--base-api-url', - dest='base_api_url', - default='https://api.clearlydefined.io', - help='ClearlyDefined base API URL.') + "--base-api-url", + dest="base_api_url", + default="https://api.clearlydefined.io", + help="ClearlyDefined base API URL.", + ) parser.add_argument( - '--wait', - dest='wait', + "--wait", + dest="wait", default=60, type=int, - help='Set the number of seconds to wait for new or updated definitions ' - 'between two loops.') + help="Set the number of seconds to wait for new or updated definitions " + "between two loops.", + ) parser.add_argument( - '-n', - '--processes', - dest='processes', + "-n", + "--processes", + dest="processes", default=1, type=int, - help='Set the number of parallel processes to use. 
' - 'Disable parallel processing if 0.') + help="Set the number of parallel processes to use. " + "Disable parallel processing if 0.", + ) parser.add_argument( - '--max-def', - dest='max_def', + "--max-def", + dest="max_def", default=0, type=int, - help='Set the maximum number of definitions to fetch.') + help="Set the maximum number of definitions to fetch.", + ) parser.add_argument( - '--only-definitions', - dest='only_definitions', - action='store_true', - help='Only fetch definitions and no other data item.') + "--only-definitions", + dest="only_definitions", + action="store_true", + help="Only fetch definitions and no other data item.", + ) parser.add_argument( - '--log-file', - dest='log_file', + "--log-file", + dest="log_file", default=None, type=str, - help='Path to a file where to log fetched paths, one per line. ' - 'Log entries will be appended to this file if it exists.') + help="Path to a file where to log fetched paths, one per line. " + "Log entries will be appended to this file if it exists.", + ) parser.add_argument( - '--verbose', - dest='verbose', - action='store_true', - help='Display more verbose progress messages.') + "--verbose", + dest="verbose", + action="store_true", + help="Display more verbose progress messages.", + ) def handle(self, *args, **options): - output_dir = options.get('output_dir') - save_to_db = options.get('save_to_db') - base_api_url = options.get('base_api_url') - wait = options.get('wait') - processes = options.get('processes') - unsorted = options.get('unsorted') - log_file = options.get('log_file') - max_def = options.get('max_def') - only_definitions = options.get('only_definitions') - verbose = options.get('verbose') + output_dir = options.get("output_dir") + save_to_db = options.get("save_to_db") + base_api_url = options.get("base_api_url") + wait = options.get("wait") + processes = options.get("processes") + unsorted = options.get("unsorted") + log_file = options.get("log_file") + max_def = options.get("max_def") + only_definitions = options.get("only_definitions") + verbose = options.get("verbose") sync( output_dir=output_dir, @@ -101,5 +111,5 @@ def handle(self, *args, **options): log_file=log_file, max_def=max_def, only_definitions=only_definitions, - verbose=verbose + verbose=verbose, ) diff --git a/clearcode/management/commands/store_scans.py b/clearcode/management/commands/store_scans.py index 47212f1d..d102346e 100644 --- a/clearcode/management/commands/store_scans.py +++ b/clearcode/management/commands/store_scans.py @@ -12,13 +12,16 @@ class Command(VerboseCommand): - help = 'Store scancode scans in git repositories' + help = "Store scancode scans in git repositories" def add_arguments(self, parser): - parser.add_argument('work_dir', type=str) - parser.add_argument('--github_org', type=str, default="") - parser.add_argument('--count', type=int, default=0) + parser.add_argument("work_dir", type=str) + parser.add_argument("--github_org", type=str, default="") + parser.add_argument("--count", type=int, default=0) def handle(self, *args, **options): store_scancode_scans_from_cd_items( - work_dir=options['work_dir'], github_org=options['github_org'], count=options['count']) + work_dir=options["work_dir"], + github_org=options["github_org"], + count=options["count"], + ) diff --git a/clearcode/models.py b/clearcode/models.py index 0b96f2ee..871a4549 100644 --- a/clearcode/models.py +++ b/clearcode/models.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. 
# @@ -25,9 +24,8 @@ class VirtualFileStore: - """ - Convenience wrapper to access CDitems as if they would be concrete files. - """ + """Convenience wrapper to access CDitems as if they would be concrete files.""" + @classmethod def walk(self, prefix=None, since=None): """ @@ -48,17 +46,17 @@ class CDitemQuerySet(models.QuerySet): def known_package_types(self): # These are the Package types that can be stored in the PackageDB KNOWN_PACKAGE_TYPES = [ - 'composer', - 'crate', - 'deb', - 'debsrc', - 'gem', - 'git', - 'maven', - 'npm', - 'nuget', - 'pypi', - 'sourcearchive', + "composer", + "crate", + "deb", + "debsrc", + "gem", + "git", + "maven", + "npm", + "nuget", + "pypi", + "sourcearchive", ] q_objs = models.Q() for package_type in KNOWN_PACKAGE_TYPES: @@ -66,10 +64,10 @@ def known_package_types(self): return self.filter(q_objs) def definitions(self): - return self.exclude(path__contains='/tool/') + return self.exclude(path__contains="/tool/") def scancode_harvests(self): - return self.filter(path__contains='tool/scancode') + return self.filter(path__contains="tool/scancode") def mappable(self): return self.filter(last_map_date__isnull=True, map_error__isnull=True) @@ -81,9 +79,7 @@ def mappable_scancode_harvests(self): return self.mappable().scancode_harvests().known_package_types() def modified_after(self, date): - """ - Limit the QuerySet to CDitems that were modified after a given `date`. - """ + """Limit the QuerySet to CDitems that were modified after a given `date`.""" return self.filter(last_modified_date__gt=date) @@ -93,8 +89,11 @@ class CDitem(models.Model): stored in ClearlyDefined blob storage and the value is a GZipped compressed JSON file content, stored as a binary bytes blob. """ - path = models.CharField(primary_key=True, max_length=2048, - help_text='Path to the original file in the ClearlyDefined file storage.' + + path = models.CharField( + primary_key=True, + max_length=2048, + help_text="Path to the original file in the ClearlyDefined file storage.", ) uuid = models.UUIDField( @@ -103,12 +102,10 @@ class CDitem(models.Model): editable=False, ) - content = models.BinaryField( - help_text='Actual gzipped JSON content.' - ) + content = models.BinaryField(help_text="Actual gzipped JSON content.") last_modified_date = models.DateTimeField( - help_text='Date and time that this record was last modified.', + help_text="Date and time that this record was last modified.", auto_now=True, # Automatically set to now on object save() ) @@ -116,24 +113,22 @@ class CDitem(models.Model): null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of the last mapping. ' - 'Used to track mapping status.', + help_text="Timestamp set to the date of the last mapping. " + "Used to track mapping status.", ) map_error = models.TextField( null=True, blank=True, - help_text='Mapping errors messages. When present this means the mapping failed.', + help_text="Mapping errors messages. When present this means the mapping failed.", ) objects = CDitemQuerySet.as_manager() @property def data(self): - """ - Return the data content deserialized from the content field. 
- """ + """Return the data content deserialized from the content field.""" uncompressed_content = gzip.decompress(self.content) if not uncompressed_content: - uncompressed_content = '{}' + uncompressed_content = "{}" return json.loads(uncompressed_content) diff --git a/clearcode/store_scans.py b/clearcode/store_scans.py index fed1e4f4..f2dbef2a 100644 --- a/clearcode/store_scans.py +++ b/clearcode/store_scans.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # @@ -17,31 +16,35 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import os from collections import defaultdict -from clearcode.models import CDitem -from clearcode.cdutils import Coordinate -from clearcode.cdutils import str2coord -from django.db.models import Q from hashlib import sha512 -import json -import requests -from packageurl import PackageURL from pathlib import Path + +from django.db.models import Q + +import requests from git import Repo -import os +from packageurl import PackageURL + +from clearcode.cdutils import Coordinate +from clearcode.cdutils import str2coord +from clearcode.models import CDitem + """ -The input is a bunch of scans from ClearlyDefined and -the output is a bunch of git repositories with commited and -pushed scans such that we balance the scans roughly evenly accross +The input is a bunch of scans from ClearlyDefined and +the output is a bunch of git repositories with commited and +pushed scans such that we balance the scans roughly evenly accross different repositories. -The primary reason for multiple repositories is size of a single -repo. There is a size limit of 5 GB at GitHub and it's difficult +The primary reason for multiple repositories is size of a single +repo. There is a size limit of 5 GB at GitHub and it's difficult to work with repositories with million files. -Therefore the approach is to use hashing as a way to name git -repositories and directories. We compute hash on the purl of the scanned -package and use the first few layers of this hash for the repo and +Therefore the approach is to use hashing as a way to name git +repositories and directories. We compute hash on the purl of the scanned +package and use the first few layers of this hash for the repo and directory names. Initial processing steps are: @@ -51,31 +54,32 @@ - Then we store the scan using the purl hash and purl as path. - Finally commit and push! : ) -Because it's not practical to process many repos at once, we organize the -processing one repo a time. For this, we iterate over a bunch of records get or compute +Because it's not practical to process many repos at once, we organize the +processing one repo a time. For this, we iterate over a bunch of records get or compute the purl hash and process the records that share the same hash. -We are using a short hash that is three characters long using hexadecimal encoding. -Therefore we can have 16*16*16 = 4096 repositories where each repo would contain about +We are using a short hash that is three characters long using hexadecimal encoding. +Therefore we can have 16*16*16 = 4096 repositories where each repo would contain about 25k scan files, if we were to store 100 million scans (which is a high mark). -For reference one scan should use less than a 100k on average when compressed -with gzip or git based on looking at 15 million scans. 
Each repo should be roughly +For reference one scan should use less than a 100k on average when compressed +with gzip or git based on looking at 15 million scans. Each repo should be roughly couple hundred mega bytes big, based on 15 million scans. """ # Create hex values of integers and ignore the 0x prefix repo_names = [hex(hash)[2:].zfill(3) for hash in range(4096)] + def store_scancode_scans_from_cd_items(work_dir, github_org="", count=0): """ - Iterate over CDItem objects with scancode scans. + Iterate over CDItem objects with scancode scans. Save and commit them in git repositories in work dir. Process a maximum of count items and process all items if count is 0 """ - cd_items = CDitem.objects.filter(~Q(content=b''), path__contains="tool/scancode") + cd_items = CDitem.objects.filter(~Q(content=b""), path__contains="tool/scancode") if count: - cd_items = cd_items[:count] + cd_items = cd_items[:count] for purl_hash, cd_items in get_cd_item_by_purl_hash(cd_items=cd_items).items(): commit_count = 0 for cd_item in cd_items: @@ -88,20 +92,24 @@ def store_scancode_scans_from_cd_items(work_dir, github_org="", count=0): scancode_scan = data.get("content") if not scancode_scan: continue - repo = get_or_init_repo(repo_name=purl_hash, work_dir=work_dir, repo_namespace=github_org, user_name=github_org, pull=False) + repo = get_or_init_repo( + repo_name=purl_hash, + work_dir=work_dir, + repo_namespace=github_org, + user_name=github_org, + pull=False, + ) purl = coordinate.to_purl() if add_scancode_scan(scancode_scan=scancode_scan, purl=purl, repo=repo): commit_count += 1 if commit_count % 10 == 0: print(".", end="") - origin = repo.remote(name='origin') + origin = repo.remote(name="origin") origin.push() def get_cd_item_by_purl_hash(cd_items): - """ - Return a mapping of {purl_hash: [CDItem,....]} - """ + """Return a mapping of {purl_hash: [CDItem,....]}""" cd_item_by_purl_hash = defaultdict(list) for cd_item in cd_items: data = cd_item.data @@ -124,7 +132,7 @@ def add_scancode_scan(repo, purl, scancode_scan): purl_data_dir = get_or_create_dir_for_purl(purl=purl, repo=repo) scancode_scan_path = purl_data_dir / "scancode-toolkit-scan.json" with open(scancode_scan_path, "w") as f: - json.dump(scancode_scan,f,indent=2) + json.dump(scancode_scan, f, indent=2) if repo.is_dirty(): repo.index.add([scancode_scan_path]) @@ -138,13 +146,14 @@ def is_valid_coordinate(coordinate): def get_or_create_dir_for_purl(purl, repo): """ - Return a path to a directory for this purl, + Return a path to a directory for this purl, in this git repo. """ purl_dir = repo.working_dir / get_purl_path(purl) purl_dir.mkdir(parents=True, exist_ok=True) return purl_dir + def get_purl_path(purl): purl_path = Path(purl.type) if purl.namespace: @@ -152,10 +161,8 @@ def get_purl_path(purl): return purl_path / purl.name / purl.version -def get_purl_hash(purl: PackageURL, length: int=3) -> str: - """ - Return a short lower cased hash of a purl. - """ +def get_purl_hash(purl: PackageURL, length: int = 3) -> str: + """Return a short lower cased hash of a purl.""" # This function takes a PackageURL object and an optional length parameter. # It returns a short hash of the purl. The length of the hash is determined by the length parameter. # The default length is 3. The function first converts the purl to bytes and then computes the sha512 hash of the purl. 
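The docstring and comments above describe the purl-to-repo sharding, but the hashing lines of `get_purl_hash()` fall between hunks and are not shown. As a minimal sketch, assuming the sha512 digest is computed over the canonical purl string as the comments state (the package below is made up for illustration):

```python
from hashlib import sha512

from packageurl import PackageURL

# Same scheme as the module constant: 4096 three-hex-char repo names.
repo_names = [hex(h)[2:].zfill(3) for h in range(4096)]

# A hypothetical package; any purl works the same way.
purl = PackageURL(type="npm", name="lodash", version="4.17.21")

# Per the comments in get_purl_hash(): sha512 over the purl, then keep
# the first `length` (default 3) hex characters, lowercased.
digest = sha512(str(purl).encode("utf-8")).hexdigest()
purl_hash = digest[:3]

# The short hash selects one of the 4096 git repos for this package.
assert purl_hash in repo_names
```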
@@ -166,10 +173,16 @@ def get_purl_hash(purl: PackageURL, length: int=3) -> str: return short_hash.lower() -def get_or_init_repo(repo_name: str, work_dir: Path, repo_namespace: str= "", user_name: str = "", pull=False): +def get_or_init_repo( + repo_name: str, + work_dir: Path, + repo_namespace: str = "", + user_name: str = "", + pull=False, +): """ - Return a repo object for repo name and namespace - and store it in the work dir. Clone if it does not + Return a repo object for repo name and namespace + and store it in the work dir. Clone if it does not exist optionally take the latest pull if it does exist. """ # TODO: Manage org repo name @@ -186,7 +199,9 @@ def get_or_init_repo(repo_name: str, work_dir: Path, repo_namespace: str= "", us return repo -def get_scan_download_url(namespace:str, purl:str, scan_file_name: str = "scancode-toolkit-scan.json"): +def get_scan_download_url( + namespace: str, purl: str, scan_file_name: str = "scancode-toolkit-scan.json" +): purl_hash = get_purl_hash(purl=purl) purl_path = get_purl_path(purl) return f"https://raw.githubusercontent.com/{namespace}/{purl_hash}/main/{purl_path}/{scan_file_name}" @@ -194,15 +209,15 @@ def get_scan_download_url(namespace:str, purl:str, scan_file_name: str = "scanco def create_github_repo(repo_name, token=os.getenv("GH_TOKEN")): headers = { - 'Authorization': f'token {token}', - 'Accept': 'application/vnd.github.v3+json' + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", } data = { - 'name': repo_name, + "name": repo_name, } - url = 'https://api.github.com/user/repos' + url = "https://api.github.com/user/repos" response = requests.post(url, headers=headers, json=data) @@ -218,18 +233,18 @@ def get_github_repos(user_name, token=os.getenv("GH_TOKEN")): Yield full repo names for a user or org name, use the optional ``token`` if provided. Full repo name is in the form user or org name / repo name """ - headers = { - 'Accept': 'application/vnd.github.v3+json' - } + headers = {"Accept": "application/vnd.github.v3+json"} if token: - headers['Authorization'] = f'token {token}' + headers["Authorization"] = f"token {token}" - url = f'https://api.github.com/users/{user_name}/repos' + url = f"https://api.github.com/users/{user_name}/repos" response = requests.get(url, headers=headers) # TODO: We need have a way to handle failures from GH API if not response.status_code == 200: - raise Exception(f"HTTP {response.status_code}: Failed to get repos for {user_name}") + raise Exception( + f"HTTP {response.status_code}: Failed to get repos for {user_name}" + ) data = response.json() for repo_data in data: diff --git a/clearcode/sync.py b/clearcode/sync.py index 6b39d526..9d3dd34f 100644 --- a/clearcode/sync.py +++ b/clearcode/sync.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # @@ -17,21 +16,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from datetime import datetime import gzip import json -from multiprocessing import pool import os -from os import path import time +from datetime import datetime +from multiprocessing import pool +from os import path -import click from django.utils import timezone + +import click import requests from clearcode import cdutils - """ Fetch the latest definitions and harvests from ClearlyDefined @@ -67,18 +66,18 @@ known_types = ( # fake empty type None, - 'npm', - 'git', - 'pypi', - 'composer', - 'maven', - 'gem', - 'nuget', - 'sourcearchive', - 'deb', - 'debsrc', - 'crate', - 'pod', + "npm", + "git", + "pypi", + "composer", + "maven", + "gem", + "nuget", + "sourcearchive", + "deb", + "debsrc", + "crate", + "pod", ) @@ -87,8 +86,14 @@ def fetch_and_save_latest_definitions( - base_api_url, cache, output_dir=None, save_to_db=False, - by_latest=True, retries=2, verbose=True): + base_api_url, + cache, + output_dir=None, + save_to_db=False, + by_latest=True, + retries=2, + verbose=True, +): """ Fetch ClearlyDefined definitions and paginate through. Save these as blobs to data_dir. @@ -97,26 +102,32 @@ def fetch_and_save_latest_definitions( Otherwise, the order is not specified. NOTE: these do not contain file details (but the harvest do) """ - assert output_dir or save_to_db, 'You must select one of the --output-dir or --save-to-db options.' + assert ( + output_dir or save_to_db + ), "You must select one of the --output-dir or --save-to-db options." - definitions_url = cdutils.append_path_to_url(base_api_url, extra_path='definitions') + definitions_url = cdutils.append_path_to_url(base_api_url, extra_path="definitions") if by_latest: - definitions_url = cdutils.update_url(definitions_url, qs_mapping=dict(sort='releaseDate', sortDesc='true')) + definitions_url = cdutils.update_url( + definitions_url, qs_mapping=dict(sort="releaseDate", sortDesc="true") + ) - for content in fetch_definitions(api_url=definitions_url, cache=cache, retries=retries, verbose=TRACE): + for content in fetch_definitions( + api_url=definitions_url, cache=cache, retries=retries, verbose=TRACE + ): # content is a batch of 100 definitions - definitions = content and content.get('data') + definitions = content and content.get("data") if not definitions: if verbose: - print(' No more data for: {}'.format(definitions_url)) + print(f" No more data for: {definitions_url}") break if verbose: - first = cdutils.coord2str(definitions[0]['coordinates']) - last = cdutils.coord2str(definitions[-1]['coordinates']) - print('Fetched definitions from :', first, 'to:', last, flush=True) + first = cdutils.coord2str(definitions[0]["coordinates"]) + last = cdutils.coord2str(definitions[-1]["coordinates"]) + print("Fetched definitions from :", first, "to:", last, flush=True) else: - print('.', end='', flush=True) + print(".", end="", flush=True) savers = [] if save_to_db: @@ -126,11 +137,14 @@ def fetch_and_save_latest_definitions( # we received a batch of definitions: let's save each as a Gzipped JSON for definition in definitions: - coordinate = cdutils.Coordinate.from_dict(definition['coordinates']) + coordinate = cdutils.Coordinate.from_dict(definition["coordinates"]) for saver in savers: blob_path, _size = save_def( - coordinate=coordinate, content=definition, output_dir=output_dir, - saver=saver) + coordinate=coordinate, + content=definition, + output_dir=output_dir, + saver=saver, + ) yield coordinate, blob_path @@ -147,7 +161,7 @@ def fetch_definitions(api_url, cache, retries=1, verbose=True): The structure of the REST payload is a list : 
{"data": [{}, ...], "continuationToken": ""} """ - assert '/definitions' in api_url + assert "/definitions" in api_url content = None errors_count = 0 max_errors = 5 @@ -159,7 +173,9 @@ def fetch_definitions(api_url, cache, retries=1, verbose=True): content = json.loads(content) except requests.exceptions.ConnectionError as ex: - print('!!!!!!!!!!!!!!!!!! -> Request failed, retrying:', api_url, 'with:', ex) + print( + "!!!!!!!!!!!!!!!!!! -> Request failed, retrying:", api_url, "with:", ex + ) errors_count += 1 if errors_count <= max_errors: # wait and retry, sleeping more each time we egt some error @@ -168,14 +184,14 @@ def fetch_definitions(api_url, cache, retries=1, verbose=True): else: raise - continuation_token = '' + continuation_token = "" if content: yield content - continuation_token = content.get('continuationToken', '') + continuation_token = content.get("continuationToken", "") if not continuation_token: if verbose: - print(' No more data for: {}'.format(api_url)) + print(f" No more data for: {api_url}") break api_url = cdutils.build_cdapi_continuation_url(api_url, continuation_token) @@ -187,9 +203,9 @@ def compress(content): `content` is eiher a string or a JSON-serializable data structure. """ if isinstance(content, str): - content = content.encode('utf-8') + content = content.encode("utf-8") else: - content = json.dumps(content , separators=(',', ':')).encode('utf-8') + content = json.dumps(content, separators=(",", ":")).encode("utf-8") return gzip.compress(content, compresslevel=9) @@ -198,11 +214,11 @@ def file_saver(content, blob_path, output_dir, **kwargs): Save `content` bytes (or dict or string) as gzip compressed bytes to `file_path`. Return the length of the written payload or 0 if it existed and was not updated. """ - file_path = path.join(output_dir, blob_path + '.gz') + file_path = path.join(output_dir, blob_path + ".gz") compressed = compress(content) if path.exists(file_path): - with open(file_path , 'rb') as ef: + with open(file_path, "rb") as ef: existing = ef.read() if existing == compressed: return 0 @@ -210,9 +226,9 @@ def file_saver(content, blob_path, output_dir, **kwargs): parent_dir = path.dirname(file_path) os.makedirs(parent_dir, exist_ok=True) - with open(file_path , 'wb') as oi: + with open(file_path, "wb") as oi: if TRACE: - print('Saving:', blob_path) + print("Saving:", blob_path) oi.write(compressed) return len(compressed) @@ -233,12 +249,12 @@ def db_saver(content, blob_path, **kwargs): cditem.content = compressed cditem.save() if TRACE: - print('Updating content for:', blob_path) + print("Updating content for:", blob_path) else: return 0 else: if TRACE: - print('Adding content for:', blob_path) + print("Adding content for:", blob_path) return len(compressed) @@ -254,8 +270,7 @@ def save_def(coordinate, content, output_dir, saver=file_saver): return blob_path, saver(content=content, output_dir=output_dir, blob_path=blob_path) -def save_harvest( - coordinate, tool, tool_version, content, output_dir, saver=file_saver): +def save_harvest(coordinate, tool, tool_version, content, output_dir, saver=file_saver): """ Save the scan `content` bytes (or dict or string) for `tool` `tool_version` of `coordinate` object to `output_dir` using blob paths conventions. 
@@ -267,19 +282,28 @@ def save_harvest( def fetch_and_save_harvests( - coordinate, cache, output_dir=None, save_to_db=False, retries=2, - session=session, verbose=True): + coordinate, + cache, + output_dir=None, + save_to_db=False, + retries=2, + session=session, + verbose=True, +): """ Fetch all the harvests for `coordinate` Coordinate object and save them in `outputdir` using blob-style paths, one file for each harvest/scan. (Note: Return a tuple of (etag, md5, url) for usage as a callback) """ - assert output_dir or save_to_db, 'You must select one of the --output-dir or --save-to-db options.' + assert ( + output_dir or save_to_db + ), "You must select one of the --output-dir or --save-to-db options." url = coordinate.get_harvests_api_url() etag, checksum, content = cache.get_content( - url, retries=retries, session=session, with_cache_keys=True) + url, retries=retries, session=session, with_cache_keys=True + ) if content: savers = [] @@ -289,9 +313,9 @@ def fetch_and_save_harvests( savers.append(file_saver) if verbose: - print(' Fetched harvest for:', coordinate.to_api_path(), flush=True) + print(" Fetched harvest for:", coordinate.to_api_path(), flush=True) else: - print('.', end='', flush=True) + print(".", end="", flush=True) for tool, versions in json.loads(content).items(): for tool_version, harvest in versions.items(): @@ -302,15 +326,14 @@ def fetch_and_save_harvests( tool_version=tool_version, content=harvest, output_dir=output_dir, - saver=saver) + saver=saver, + ) return etag, checksum, url -class Cache(object): - """ - A caching object for etags and checksums to avoid refetching things. - """ +class Cache: + """A caching object for etags and checksums to avoid refetching things.""" def __init__(self, max_size=100 * 1000): self.etags_cache = {} @@ -324,16 +347,14 @@ def is_unchanged_remotely(self, url, session=session): """ try: response = session.head(url) - remote_etag = response.headers.get('etag') + remote_etag = response.headers.get("etag") if remote_etag and self.etags_cache.get(url) == remote_etag: return True - except: + except Exception: return False def is_fetched(self, checksum, url): - """ - Return True if the content checksum exists for url, using MD5 checksum. - """ + """Return True if the content checksum exists for url, using MD5 checksum.""" return url and checksum and self.checksums_cache.get(checksum) == url def add(self, etag, checksum, url): @@ -346,9 +367,7 @@ def add_args(self, args): self.add(*args) def trim(self): - """ - Trim the cache to its max size. 
- """ + """Trim the cache to its max size.""" def _resize(cache): extra_items = len(cache) - self.max_size @@ -368,7 +387,8 @@ def get_content(self, url, retries=1, session=session, with_cache_keys=False): return etag, checksum, content = cdutils.get_response_content( - url, retries=retries, session=session) + url, retries=retries, session=session + ) if not content: return @@ -384,26 +404,36 @@ def get_content(self, url, retries=1, session=session, with_cache_keys=False): return content def copy(self): - """ - Return a deep copy of self - """ + """Return a deep copy of self""" cache = Cache(self.max_size) cache.checksums_cache = dict(self.checksums_cache) cache.etags_cache = dict(self.etags_cache) return cache -def sync(output_dir=None, save_to_db=False, - base_api_url='https://api.clearlydefined.io', - wait=60, processes=1, unsorted=False, - log_file=None, max_def=0, only_definitions=False, session=session, - verbose=False, *arg, **kwargs): +def sync( + output_dir=None, + save_to_db=False, + base_api_url="https://api.clearlydefined.io", + wait=60, + processes=1, + unsorted=False, + log_file=None, + max_def=0, + only_definitions=False, + session=session, + verbose=False, + *arg, + **kwargs, +): """ Fetch the latest definitions and harvests from ClearlyDefined and save these as gzipped JSON either as as files in output-dir or in a PostgreSQL database. Loop forever after waiting some seconds between each cycles. """ - assert output_dir or save_to_db, 'You must select at least one of the --output-dir or --save-to-db options.' + assert ( + output_dir or save_to_db + ), "You must select at least one of the --output-dir or --save-to-db options." fetch_harvests = not only_definitions @@ -421,7 +451,7 @@ def sync(output_dir=None, save_to_db=False, log_file_fn = None if log_file: - log_file_fn = open(log_file, 'a') + log_file_fn = open(log_file, "a") try: if fetch_harvests: @@ -441,7 +471,9 @@ def sync(output_dir=None, save_to_db=False, if def_type: # get latest with a "type" query - def_api_url = cdutils.update_url(base_api_url, qs_mapping=dict(type=def_type)) + def_api_url = cdutils.update_url( + base_api_url, qs_mapping=dict(type=def_type) + ) else: # do nothing if we have no type def_api_url = base_api_url @@ -452,16 +484,17 @@ def sync(output_dir=None, save_to_db=False, save_to_db=save_to_db, cache=cache, by_latest=not unsorted, - verbose=verbose) + verbose=verbose, + ) for coordinate, file_path in definitions: - cycle_defs_count += 1 if log_file: - log_file_fn.write(file_path.partition('.gz')[0] + '\n') + log_file_fn.write(file_path.partition(".gz")[0] + "\n") - if TRACE: print(' Saved def for:', coordinate) + if TRACE: + print(" Saved def for:", coordinate) if fetch_harvests: kwds = dict( @@ -472,17 +505,19 @@ def sync(output_dir=None, save_to_db=False, # subprocess, the data is best not shared to avoid # any sync issue cache=cache.copy(), - verbose=verbose) + verbose=verbose, + ) harvest_fetchers.apply_async( - fetch_and_save_harvests, - kwds=kwds, - callback=cache.add_args) + fetch_and_save_harvests, kwds=kwds, callback=cache.add_args + ) if max_def and max_def <= cycle_defs_count: break - if max_def and (max_def <= cycle_defs_count or max_def <= total_defs_count): + if max_def and ( + max_def <= cycle_defs_count or max_def <= total_defs_count + ): break total_defs_count += cycle_defs_count @@ -490,24 +525,42 @@ def sync(output_dir=None, save_to_db=False, total_duration += cycle_duration if not sleeping: - print('Saved', cycle_defs_count, 'defs and harvests,', - 'in:', 
int(cycle_duration), 'sec.') - - print('TOTAL cycles:', cycles, - 'with:', total_defs_count, 'defs and combined harvests,', - 'in:', int(total_duration), 'sec.') - - print('Cycle completed at:', datetime.utcnow().isoformat(), - 'Sleeping for', wait, 'seconds...') + print( + "Saved", + cycle_defs_count, + "defs and harvests,", + "in:", + int(cycle_duration), + "sec.", + ) + + print( + "TOTAL cycles:", + cycles, + "with:", + total_defs_count, + "defs and combined harvests,", + "in:", + int(total_duration), + "sec.", + ) + + print( + "Cycle completed at:", + datetime.utcnow().isoformat(), + "Sleeping for", + wait, + "seconds...", + ) else: - print('.', end='') + print(".", end="") sleeping = True time.sleep(wait) cache.trim() except KeyboardInterrupt: - click.secho('\nAborted with Ctrl+C!', fg='red', err=True) + click.secho("\nAborted with Ctrl+C!", fg="red", err=True) return finally: @@ -518,66 +571,97 @@ def sync(output_dir=None, save_to_db=False, harvest_fetchers.close() harvest_fetchers.terminate() - print('TOTAL cycles:', cycles, - 'with:', total_defs_count, 'defs and combined harvests,', - 'in:', int(total_duration), 'sec.') + print( + "TOTAL cycles:", + cycles, + "with:", + total_defs_count, + "defs and combined harvests,", + "in:", + int(total_duration), + "sec.", + ) @click.command() - -@click.option('--output-dir', - type=click.Path(), metavar='DIR', - help='Save fetched content as compressed gzipped files to this output directory.') - -@click.option('--save-to-db', +@click.option( + "--output-dir", + type=click.Path(), + metavar="DIR", + help="Save fetched content as compressed gzipped files to this output directory.", +) +@click.option( + "--save-to-db", is_flag=True, - help='Save fetched content as compressed gzipped blobs in the configured database.') - -@click.option('--unsorted', + help="Save fetched content as compressed gzipped blobs in the configured database.", +) +@click.option( + "--unsorted", is_flag=True, - help='Fetch data without any sorting. The default is to fetch data sorting by latest updated first.') - -@click.option('--base-api-url', + help="Fetch data without any sorting. The default is to fetch data sorting by latest updated first.", +) +@click.option( + "--base-api-url", type=str, - default='https://api.clearlydefined.io', show_default=True, - help='ClearlyDefined base API URL.') - -@click.option('--wait', - type=int, metavar='INT', - default=60, show_default=True, - help='Set the number of seconds to wait for new or updated definitions ' - 'between two loops.') - -@click.option('-n', '--processes', - type=int, metavar='INT', - default=1, show_default=True, - help='Set the number of parallel processes to use. ' - 'Disable parallel processing if 0.') - -@click.option('--max-def', - type=int, metavar='INT', + default="https://api.clearlydefined.io", + show_default=True, + help="ClearlyDefined base API URL.", +) +@click.option( + "--wait", + type=int, + metavar="INT", + default=60, + show_default=True, + help="Set the number of seconds to wait for new or updated definitions " + "between two loops.", +) +@click.option( + "-n", + "--processes", + type=int, + metavar="INT", + default=1, + show_default=True, + help="Set the number of parallel processes to use. 
" + "Disable parallel processing if 0.", +) +@click.option( + "--max-def", + type=int, + metavar="INT", default=0, - help='Set the maximum number of definitions to fetch.') - -@click.option('--only-definitions', - is_flag=True, - help='Only fetch definitions and no other data item.') - -@click.option('--log-file', - type=click.Path(), default=None, - help='Path to a file where to log fetched paths, one per line. ' - 'Log entries will be appended to this file if it exists.') - -@click.option('--verbose', + help="Set the maximum number of definitions to fetch.", +) +@click.option( + "--only-definitions", is_flag=True, - help='Display more verbose progress messages.') - -@click.help_option('-h', '--help') -def cli(output_dir=None, save_to_db=False, - base_api_url='https://api.clearlydefined.io', - wait=60, processes=1, unsorted=False, - log_file=None, max_def=0, only_definitions=False, session=session, - verbose=False, *arg, **kwargs): + help="Only fetch definitions and no other data item.", +) +@click.option( + "--log-file", + type=click.Path(), + default=None, + help="Path to a file where to log fetched paths, one per line. " + "Log entries will be appended to this file if it exists.", +) +@click.option("--verbose", is_flag=True, help="Display more verbose progress messages.") +@click.help_option("-h", "--help") +def cli( + output_dir=None, + save_to_db=False, + base_api_url="https://api.clearlydefined.io", + wait=60, + processes=1, + unsorted=False, + log_file=None, + max_def=0, + only_definitions=False, + session=session, + verbose=False, + *arg, + **kwargs, +): """ Fetch the latest definitions and harvests from ClearlyDefined and save these as gzipped JSON either as as files in output-dir or in a PostgreSQL @@ -600,5 +684,5 @@ def cli(output_dir=None, save_to_db=False, ) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/clearcode/tests/test_models.py b/clearcode/tests/test_models.py index 541ac8bf..33fa4a27 100644 --- a/clearcode/tests/test_models.py +++ b/clearcode/tests/test_models.py @@ -10,15 +10,14 @@ class CDitemManagerModifiedAfterTestCase(TestCase): - def setUp(self): - self.cditem0 = CDitem.objects.create(path='npm/name/version') + self.cditem0 = CDitem.objects.create(path="npm/name/version") def test_modified_after_1_day_old(self): test_date = datetime.datetime.now() - datetime.timedelta(days=1) self.assertIsNotNone(CDitem.objects.modified_after(test_date)) self.assertEqual(1, len(CDitem.objects.modified_after(test_date))) - + def test_modified_after_1_week_old(self): test_date = datetime.datetime.now() - datetime.timedelta(days=7) self.assertIsNotNone(CDitem.objects.modified_after(test_date)) @@ -28,7 +27,7 @@ def test_modified_after_1_day_new(self): test_date = datetime.datetime.now() + datetime.timedelta(days=1) self.assertIsNotNone(CDitem.objects.modified_after(test_date)) self.assertEqual(0, len(CDitem.objects.modified_after(test_date))) - + def test_modified_after_1_week_new(self): test_date = datetime.datetime.now() + datetime.timedelta(days=7) self.assertIsNotNone(CDitem.objects.modified_after(test_date)) @@ -38,66 +37,91 @@ def test_modified_after_1_week_new(self): class CDitemManagerTestCase(TestCase): def test_known_package_types(self): # This path starts with npm, which is known - cditem_1 = CDitem.objects.create(path='npm/name/version') + cditem_1 = CDitem.objects.create(path="npm/name/version") # asdf is not a proper type - cditem_2 = CDitem.objects.create(path='asdf/name/version') + CDitem.objects.create(path="asdf/name/version") cditems 
= list(CDitem.objects.known_package_types()) self.assertEqual(1, len(cditems)) cditem = cditems[0] self.assertEqual(cditem_1, cditem) def test_definitions(self): - expected_definition = CDitem.objects.create(path='composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json') + expected_definition = CDitem.objects.create( + path="composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json" + ) # harvest should not be in cditems - harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') + CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" + ) cditems = list(CDitem.objects.definitions()) self.assertEqual(1, len(cditems)) definition = cditems[0] self.assertEqual(expected_definition, definition) def test_scancode_harvests(self): - expected_harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') + expected_harvest = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" + ) # unexpected_harvest should not be in cditems - unexpected_harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/licensee/9.13.0.json') + CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/licensee/9.13.0.json" + ) harvests = list(CDitem.objects.scancode_harvests()) self.assertEqual(1, len(harvests)) harvest = harvests[0] self.assertEqual(expected_harvest, harvest) def test_mappable(self): - definition_1 = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json') - definition_2 = CDitem.objects.create( - path='sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json', + definition_1 = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" + ) + # This should not be mappable + CDitem.objects.create( + path="sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json", last_map_date=timezone.now(), - map_error='error' + map_error="error", + ) + harvest = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" ) - harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') mappables = list(CDitem.objects.mappable()) self.assertEqual(2, len(mappables)) self.assertIn(definition_1, mappables) self.assertIn(harvest, mappables) def test_mappable_definitions(self): - definition_1 = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json') - definition_2 = CDitem.objects.create( - path='sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json', + definition_1 = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" + ) + # This should not be mappable + CDitem.objects.create( + path="sourcearchive/mavencentral/io.quarkus/quarkus-jsonb/revision/0.26.1.json", last_map_date=timezone.now(), - map_error='error' + map_error="error", + ) + # This should not be mappable + CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" ) - harvest = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') mappables = list(CDitem.objects.mappable_definitions()) self.assertEqual(1, len(mappables)) definition = 
mappables[0] self.assertEqual(definition_1, definition) def test_mappable_scancode_harvests(self): - harvest_1 = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json') - harvest_2 = CDitem.objects.create( - path='sourcearchive/mavencentral/io.cucumber/cucumber-core/revision/5.0.0-RC1/tool/scancode/3.2.2.json', + harvest_1 = CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6/tool/scancode/3.2.2.json" + ) + # This should not be mappable + CDitem.objects.create( + path="sourcearchive/mavencentral/io.cucumber/cucumber-core/revision/5.0.0-RC1/tool/scancode/3.2.2.json", last_map_date=timezone.now(), - map_error='error' + map_error="error", + ) + # This should not be mappable + CDitem.objects.create( + path="sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json" ) - definition_1 = CDitem.objects.create(path='sourcearchive/mavencentral/io.nats/jnats/revision/2.6.6.json') mappables = list(CDitem.objects.mappable_scancode_harvests()) self.assertEqual(1, len(mappables)) harvest = mappables[0] diff --git a/clearcode/tests/test_sync.py b/clearcode/tests/test_sync.py index 83bb7ce8..97a409b3 100644 --- a/clearcode/tests/test_sync.py +++ b/clearcode/tests/test_sync.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # @@ -21,7 +20,6 @@ import json from django.test import TestCase -from django.utils import timezone from clearcode.models import CDitem from clearcode.sync import db_saver @@ -29,12 +27,12 @@ class SyncDbsaverTestCase(TestCase): def setUp(self): - self.test_path = 'composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json' - self.test_content = {'test': 'content'} + self.test_path = "composer/packagist/yoast/wordpress-seo/revision/9.5-RC3.json" + self.test_content = {"test": "content"} self.cditem0 = CDitem.objects.create( - path=self.test_path, - content=gzip.compress(json.dumps(self.test_content).encode('utf-8')), + path=self.test_path, + content=gzip.compress(json.dumps(self.test_content).encode("utf-8")), ) def test_db_saver_identical_path(self): @@ -42,5 +40,5 @@ def test_db_saver_identical_path(self): self.assertEqual(1, len(CDitem.objects.all())) def test_db_saver_different_path(self): - db_saver(content=self.test_content, blob_path='new/blob/path.json') + db_saver(content=self.test_content, blob_path="new/blob/path.json") self.assertEqual(2, len(CDitem.objects.all())) diff --git a/clearindex/harvest.py b/clearindex/harvest.py index 2cf4ef42..31105093 100644 --- a/clearindex/harvest.py +++ b/clearindex/harvest.py @@ -13,12 +13,10 @@ from django.db import transaction from django.utils import timezone -from packagedb.models import Package -from packagedb.models import Resource - from minecode.model_utils import merge_packages from minecode.utils import stringify_null_purl_fields - +from packagedb.models import Package +from packagedb.models import Resource logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -31,13 +29,13 @@ def get_resource_license_expressions(file_data): with a newline separating each or None if there are no license_expression statements in the scan data. 
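For example (a hypothetical input, assuming the usual ScanCode file-level data): scan data carrying the expressions ["mit", "mit", "apache-2.0"] would yield the two distinct expressions joined by a newline; ordering is not guaranteed, since deduplication goes through a set.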
""" - license_expressions = file_data.get('license_expressions', []) or [] + license_expressions = file_data.get("license_expressions", []) or [] if license_expressions == []: return expressions = set(list(expression for expression in license_expressions)) - return '\n'.join(expressions) + return "\n".join(expressions) def get_resource_copyright_statements(file_data): @@ -45,62 +43,58 @@ def get_resource_copyright_statements(file_data): Return a string that contains all the copyright statements (deduped), with a newline separating each or None if there are no copyright statements in the scan data. """ - copyrights = file_data.get('copyrights', []) or [] + copyrights = file_data.get("copyrights", []) or [] if copyrights == []: return - statements = set(list(copyright.get('value') for copyright in copyrights)) + statements = set(list(copyright.get("value") for copyright in copyrights)) - return '\n'.join(statements) + return "\n".join(statements) -def create_from_harvest(package_scan={}, files_data=[], cditem_path=''): - """ - Return a Package object, created or updated via a ScanCode-Toolkit "package" scan. - """ +def create_from_harvest(package_scan={}, files_data=[], cditem_path=""): + """Return a Package object, created or updated via a ScanCode-Toolkit "package" scan.""" fields = ( - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'primary_language', - 'description', - 'keywords', - 'homepage_url', - 'download_url', - 'size', - 'sha1', - 'md5', - 'sha256', - 'sha512', - 'bug_tracking_url', - 'code_view_url', - 'vcs_url', - 'copyright', - 'license_expression', - 'declared_license', - 'notice_text', - 'source_packages', + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "primary_language", + "description", + "keywords", + "homepage_url", + "download_url", + "size", + "sha1", + "md5", + "sha256", + "sha512", + "bug_tracking_url", + "code_view_url", + "vcs_url", + "copyright", + "license_expression", + "declared_license", + "notice_text", + "source_packages", ) - package_data = {field_name: package_scan.get( - field_name) for field_name in fields} + package_data = {field_name: package_scan.get(field_name) for field_name in fields} stringify_null_purl_fields(package_data) - pkg_type = package_data.get('type') - namespace = package_data.get('namespace') - name = package_data.get('name') - version = package_data.get('version') - qualifiers = package_data.get('qualifiers') - subpath = package_data.get('subpath') + pkg_type = package_data.get("type") + namespace = package_data.get("namespace") + name = package_data.get("name") + version = package_data.get("version") + qualifiers = package_data.get("qualifiers") + subpath = package_data.get("subpath") - download_url = package_data.get('download_url') + download_url = package_data.get("download_url") if not download_url: - logger.error( - 'Null `download_url` value for `package_data`: {}'.format(package_data)) + logger.error(f"Null `download_url` value for `package_data`: {package_data}") return # This ugly block is needed until https://github.com/nexB/packagedb/issues/14 @@ -113,57 +107,51 @@ def create_from_harvest(package_scan={}, files_data=[], cditem_path=''): version=version, qualifiers=qualifiers, subpath=subpath, - download_url=download_url + download_url=download_url, ) # Merge package records if it already exists merge_packages( - existing_package=package, - new_package_data=package_data, - replace=False + existing_package=package, new_package_data=package_data, replace=False ) - 
package.append_to_history( - 'Updated package from CDitem harvest: {}'.format(cditem_path)) + package.append_to_history(f"Updated package from CDitem harvest: {cditem_path}") - logger.info( - 'Merged package data from scancode harvest: {}'.format(package)) + logger.info(f"Merged package data from scancode harvest: {package}") except Package.DoesNotExist: try: package = Package.objects.get(download_url=download_url) # Merge package records if it already exists merge_packages( - existing_package=package, - new_package_data=package_data, - replace=False + existing_package=package, new_package_data=package_data, replace=False ) package.append_to_history( - 'Updated package from CDitem harvest: {}'.format(cditem_path)) + f"Updated package from CDitem harvest: {cditem_path}" + ) - logger.info( - 'Merged package data from scancode harvest: {}'.format(package)) + logger.info(f"Merged package data from scancode harvest: {package}") except Package.DoesNotExist: package = Package.objects.create(**package_data) package.append_to_history( - 'Created package from CDitem harvest: {}'.format(cditem_path)) + f"Created package from CDitem harvest: {cditem_path}" + ) - logger.info( - 'Created package from scancode harvest: {}'.format(package)) + logger.info(f"Created package from scancode harvest: {package}") # Now, add resources to the Package. for f in files_data: - path = f.get('path') - is_file = f.get('type', '') == 'file' + path = f.get("path") + is_file = f.get("type", "") == "file" copyright = get_resource_copyright_statements(f) license_expression = get_resource_license_expressions(f) file_data = dict( package=package, path=path, - size=f.get('size'), - sha1=f.get('sha1'), - md5=f.get('md5'), - sha256=f.get('sha256'), - git_sha1=f.get('git_sha1'), + size=f.get("size"), + sha1=f.get("sha1"), + md5=f.get("md5"), + sha256=f.get("sha256"), + git_sha1=f.get("git_sha1"), is_file=is_file, copyright=copyright, license_expression=license_expression, @@ -187,28 +175,26 @@ def map_scancode_harvest(cditem): try: harvest_data = cditem.data except ValueError: - err_msg = 'CDitemError: empty content field for CDitem: {}'.format( - cditem.path) + err_msg = f"CDitemError: empty content field for CDitem: {cditem.path}" logger.error(err_msg) cditem.map_error = err_msg cditem.save() return 0 - content = harvest_data.get('content', {}) or {} - files_data = content.get('files', []) or [] - summary = content.get('summary', {}) or {} - packages = summary.get('packages', []) or [] + content = harvest_data.get("content", {}) or {} + files_data = content.get("files", []) or [] + summary = content.get("summary", {}) or {} + packages = summary.get("packages", []) or [] for package_scan in packages: # Check if there is a valid download url. Missing download_url values are # considered map_errors, as a Package object cannot have a `Null` # download_url value. 
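# As the code below shows, a definition with no download_url is recorded as a map_error on the CDitem rather than being mapped to a Package.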
- download_url = package_scan.get('download_url') + download_url = package_scan.get("download_url") if not download_url: - purl = package_scan.get('purl') - err_msg = 'CDitemError: empty download_url for package: {}'.format( - purl) + purl = package_scan.get("purl") + err_msg = f"CDitemError: empty download_url for package: {purl}" logger.error(err_msg) cditem.map_error = err_msg diff --git a/clearindex/management/commands/run_clearindex.py b/clearindex/management/commands/run_clearindex.py index 9af70702..1f6b341f 100644 --- a/clearindex/management/commands/run_clearindex.py +++ b/clearindex/management/commands/run_clearindex.py @@ -27,13 +27,12 @@ from clearcode.models import CDitem from clearindex import harvest -from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand +from minecode.management.commands import get_error_message from minecode.model_utils import merge_packages from minecode.utils import stringify_null_purl_fields from packagedb.models import Package - TRACE = False logger = logging.getLogger(__name__) @@ -48,9 +47,7 @@ def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. - """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -62,42 +59,43 @@ def stop_handler(*args, **kwargs): PACKAGE_TYPES_BY_CD_TYPE = { - 'crate': 'cargo', - 'deb': 'deb', - 'debsrc': 'deb', + "crate": "cargo", + "deb": "deb", + "debsrc": "deb", # Currently used only for maven packages - 'sourcearchive': 'maven', - 'maven': 'maven', - 'composer': 'composer', + "sourcearchive": "maven", + "maven": "maven", + "composer": "composer", # Currently used only for Github repo/packages - 'git': 'github', - 'pod': 'pod', - 'nuget': 'nuget', - 'pypi': 'pypi', - 'gem': 'gem', + "git": "github", + "pod": "pod", + "nuget": "nuget", + "pypi": "pypi", + "gem": "gem", } # TODO: Update with more Package types when scancode-toolkit is updated PACKAGE_TYPES_WITH_GET_URLS = { - 'maven': maven.get_urls, - 'npm': npm.get_urls, - 'pypi': pypi.get_pypi_urls, - 'gem': rubygems.get_urls, - 'nuget': nuget.get_urls, + "maven": maven.get_urls, + "npm": npm.get_urls, + "pypi": pypi.get_pypi_urls, + "gem": rubygems.get_urls, + "nuget": nuget.get_urls, } class Command(VerboseCommand): - help = 'Run a mapping worker.' + help = "Run a mapping worker." def add_arguments(self, parser): parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', + "--exit-on-empty", + dest="exit_on_empty", default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') + action="store_true", + help="Do not loop forever. 
Exit when the queue is empty.", + ) def handle(self, *args, **options): """ @@ -107,32 +105,34 @@ def handle(self, *args, **options): global MUST_STOP logger.setLevel(self.get_verbosity(**options)) - exit_on_empty = options.get('exit_on_empty') + exit_on_empty = options.get("exit_on_empty") sleeping = False created_packages_count = 0 - logger.info('Running ClearIndex') + logger.info("Running ClearIndex") while True: if MUST_STOP: - logger.info('Graceful exit of the map loop.') + logger.info("Graceful exit of the map loop.") break mappable_definitions = CDitem.objects.mappable_definitions()[ - :MAP_BATCH_SIZE] + :MAP_BATCH_SIZE + ] mappable_scancode_harvests = CDitem.objects.mappable_scancode_harvests()[ - :MAP_BATCH_SIZE] + :MAP_BATCH_SIZE + ] try: if not mappable_definitions and not mappable_scancode_harvests: if exit_on_empty: - logger.info('No mappable CDitem, exiting...') + logger.info("No mappable CDitem, exiting...") break # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No mappable CDitem, sleeping...') + logger.info("No mappable CDitem, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -155,7 +155,7 @@ def handle(self, *args, **options): logger.error(e) break - msg = '{}: {} Packages processed.' + msg = "{}: {} Packages processed." msg = msg.format(timezone.now(), created_packages_count) logger.info(msg) @@ -177,8 +177,7 @@ def map_definition(cditem): cditem.save() return package except Exception as e: - msg = 'Error: Failed to map while processing CDitem: {}\n'.format( - repr(cditem.path)) + msg = f"Error: Failed to map while processing CDitem: {repr(cditem.path)}\n" msg += get_error_message(e) logger.error(msg) cditem.map_error = msg @@ -186,55 +185,56 @@ def map_definition(cditem): def get_coords_des_and_lic_from_def(definition): - return definition.get('coordinates', {}), definition.get('described', {}), definition.get('licensed', {}) + return ( + definition.get("coordinates", {}), + definition.get("described", {}), + definition.get("licensed", {}), + ) # CD_TYPES_WITH_SOURCE = ('debsrc', 'npm', 'sourcearchive',) def get_or_create_package_from_cditem_definition(cditem): - """ - Create a Package from a CDitem definition or return a Package if it already exists - """ + """Create a Package from a CDitem definition or return a Package if it already exists""" definition = cditem.data if not definition: - raise Exception('No data available for this definition') - coordinates, described, licensed = get_coords_des_and_lic_from_def( - definition) + raise Exception("No data available for this definition") + coordinates, described, licensed = get_coords_des_and_lic_from_def(definition) - download_url = described.get('urls', {}).get('download', '') + download_url = described.get("urls", {}).get("download", "") if not download_url: # We use our data to create a Package in order to form the download_url, since we do not have the download_url for the Package # We need to have a unique download URL for every Package download_url = create_download_url_from_coords(coordinates) if not download_url: - raise Exception('No download URL is available for this definition') + raise Exception("No download URL is available for this definition") - if download_url.startswith('http://central.maven.org'): - split_download_url = download_url.rsplit('http://central.maven.org') + if download_url.startswith("http://central.maven.org"): + split_download_url = download_url.rsplit("http://central.maven.org") if len(split_download_url) == 2: - download_url = 
'https://repo1.maven.org' + split_download_url[1] + download_url = "https://repo1.maven.org" + split_download_url[1] stringify_null_purl_fields(coordinates) - namespace = coordinates.get('namespace') - namespace = namespace if namespace != '-' else '' - name = coordinates.get('name') - version = coordinates.get('revision') - package_type = coordinates.get('type') - converted_package_type = PACKAGE_TYPES_BY_CD_TYPE.get( - package_type) or package_type + namespace = coordinates.get("namespace") + namespace = namespace if namespace != "-" else "" + name = coordinates.get("name") + version = coordinates.get("revision") + package_type = coordinates.get("type") + converted_package_type = PACKAGE_TYPES_BY_CD_TYPE.get(package_type) or package_type # TODO: Source packages need to be updated for clearlydefined, link source packages to binary packages - hashes = described.get('hashes', {}) - sha1 = hashes.get('sha1') - sha256 = hashes.get('sha256') - homepage_url = described.get('projectWebsite') - release_date = described.get('releaseDate') - declared_license = licensed.get('declared') + hashes = described.get("hashes", {}) + sha1 = hashes.get("sha1") + sha256 = hashes.get("sha256") + homepage_url = described.get("projectWebsite") + release_date = described.get("releaseDate") + declared_license = licensed.get("declared") normalized_license_expression = licensing.get_normalized_expression( - declared_license) + declared_license + ) copyrights = get_parties_from_licensed(licensed) - copyrights = '\n'.join(copyrights) + copyrights = "\n".join(copyrights) definition_mining_level = 0 existing_package = None @@ -258,12 +258,13 @@ def get_or_create_package_from_cditem_definition(cditem): declared_license=declared_license, license_expression=normalized_license_expression, copyright=copyrights, - mining_level=definition_mining_level + mining_level=definition_mining_level, ) # log history if package was created if created: package.append_to_history( - 'Created package from CDitem definition: {}'.format(cditem.path)) + f"Created package from CDitem definition: {cditem.path}" + ) else: # TODO: This is temporary until we fold clearindex into minecode mapping @@ -286,37 +287,35 @@ def get_or_create_package_from_cditem_definition(cditem): merge_packages( existing_package=existing_package, new_package_data=new_package_data, - replace=True + replace=True, ) package = existing_package package.append_to_history( - 'Updated package from CDitem definition: {}'.format(cditem.path)) + f"Updated package from CDitem definition: {cditem.path}" + ) return package def is_scancode_scan(harvest): - return harvest.get('_metadata', {}).get('type', '') == 'scancode' + return harvest.get("_metadata", {}).get("type", "") == "scancode" def create_download_url_from_coords(coord): - """ - Return a download URL for a supported Package from Coordinates `coord` - """ - ptype = coord.get('type') - namespace = coord.get('namespace') - name = coord.get('name') - version = coord.get('revision') + """Return a download URL for a supported Package from Coordinates `coord`""" + ptype = coord.get("type") + namespace = coord.get("namespace") + name = coord.get("name") + version = coord.get("revision") package_type = PACKAGE_TYPES_BY_CD_TYPE.get(ptype) if not package_type: - raise Exception( - 'Unsupported ClearlyDefined package type: {}'.format(ptype)) + raise Exception(f"Unsupported ClearlyDefined package type: {ptype}") get_urls = PACKAGE_TYPES_WITH_GET_URLS.get(package_type) if get_urls: urls = get_urls(namespace=namespace, name=name, 
version=version) - return urls['repository_download_url'] + return urls["repository_download_url"] def str2coord(s): @@ -330,21 +329,31 @@ def str2coord(s): plain: /gem/rubygems/foo/mocha/1.7.0 """ from itertools import zip_longest - is_urn = s.startswith('urn') - is_url = s.startswith('cd:') - splitter = ':' if is_urn else '/' + + is_urn = s.startswith("urn") + is_url = s.startswith("cd:") + splitter = ":" if is_urn else "/" segments = s.strip(splitter).split(splitter) if is_urn or is_url: segments = segments[1:] # ignore extra segments for now beyond the first 5 (such as the PR of a curation) segments = segments[:5] - fields = ('type', 'provider', 'namespace', 'name', 'revision',) + fields = ( + "type", + "provider", + "namespace", + "name", + "revision", + ) return dict(zip_longest(fields, segments)) def get_parties_from_licensed(licensed): - """ - Return a list of Copyright statements from `licensed`, if available - """ - return licensed.get('facets', {}).get('core', {}).get('attribution', {}).get('parties', []) + """Return a list of Copyright statements from `licensed`, if available""" + return ( + licensed.get("facets", {}) + .get("core", {}) + .get("attribution", {}) + .get("parties", []) + ) diff --git a/clearindex/utils.py b/clearindex/utils.py index dcf19ba4..7c57d8d8 100644 --- a/clearindex/utils.py +++ b/clearindex/utils.py @@ -7,19 +7,15 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from unittest import TestCase -import logging import ntpath import os import posixpath -import traceback +from unittest import TestCase -from django.core.management.base import BaseCommand from django.test import TestCase as DjangoTestCase from minecode.utils_test import JsonBasedTesting - """ The conventions used for the tests are: - for tests that require files these are stored in the testfiles directory @@ -30,7 +26,7 @@ class BaseTestCase(TestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") @classmethod def get_test_loc(cls, path): @@ -44,13 +40,11 @@ def get_test_loc(cls, path): class ClearIndexTestCase(JsonBasedTesting, BaseTestCase, DjangoTestCase): - databases = '__all__' + databases = "__all__" def to_os_native_path(path): - """ - Normalize a path to use the native OS path separator.
- """ + """Normalize a path to use the native OS path separator.""" path = path.replace(posixpath.sep, os.path.sep) path = path.replace(ntpath.sep, os.path.sep) path = path.rstrip(os.path.sep) diff --git a/etc/scripts/clearcode-api-backup.py b/etc/scripts/clearcode-api-backup.py index d11c4d34..88171041 100644 --- a/etc/scripts/clearcode-api-backup.py +++ b/etc/scripts/clearcode-api-backup.py @@ -151,10 +151,6 @@ def run_api_backup(api_root_url, extra_payload=None): objects = get_all_objects_from_endpoint(endpoint_url, extra_payload=extra_payload) print('{} {} collected.'.format(len(objects), endpoint_name)) - collect_extra_conditions = [ - extra_payload.get('last_modified_date'), - ] - results[endpoint_name] += objects timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') diff --git a/etc/scripts/utils_thirdparty.py b/etc/scripts/utils_thirdparty.py index addf8e5e..7f222abc 100644 --- a/etc/scripts/utils_thirdparty.py +++ b/etc/scripts/utils_thirdparty.py @@ -845,7 +845,7 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): if TRACE: print(f"Fetched license from remote: {lic_url}") - except: + except Exception: try: # try licensedb second lic_url = f"{LICENSEDB_API_URL}/{filename}" @@ -858,7 +858,7 @@ def fetch_license_files(self, dest_dir=THIRDPARTY_DIR, use_cached_index=False): if TRACE: print(f"Fetched license from licensedb: {lic_url}") - except: + except Exception: msg = f'No text for license {filename} in expression "{self.license_expression}" from {self}' print(msg) errors.append(msg) @@ -1290,7 +1290,7 @@ def is_pure(self): def is_pure_wheel(filename): try: return Wheel.from_filename(filename).is_pure() - except: + except Exception: return False diff --git a/manage_matchcode.py b/manage_matchcode.py index 872f8398..bfaed621 100755 --- a/manage_matchcode.py +++ b/manage_matchcode.py @@ -11,10 +11,8 @@ import os import sys - -if __name__ == '__main__': +if __name__ == "__main__": from django.core.management import execute_from_command_line - os.environ.setdefault('DJANGO_SETTINGS_MODULE', - 'matchcode_project.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "matchcode_project.settings") execute_from_command_line(sys.argv) diff --git a/manage_purldb.py b/manage_purldb.py index 2dbe57a9..62029b84 100755 --- a/manage_purldb.py +++ b/manage_purldb.py @@ -11,9 +11,8 @@ import os import sys - -if __name__ == '__main__': +if __name__ == "__main__": from django.core.management import execute_from_command_line - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'purldb_project.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_project.settings") execute_from_command_line(sys.argv) diff --git a/matchcode/api.py b/matchcode/api.py index 98762547..1edebbc4 100644 --- a/matchcode/api.py +++ b/matchcode/api.py @@ -9,8 +9,13 @@ from django.db.models import Q from django.forms import widgets from django.forms.fields import MultipleChoiceField + from django_filters.filters import MultipleChoiceFilter from django_filters.rest_framework import FilterSet +from matchcode_toolkit.fingerprinting import create_halohash_chunks +from matchcode_toolkit.fingerprinting import hexstring_to_binarray +from matchcode_toolkit.fingerprinting import split_fingerprint +from matchcode_toolkit.halohash import byte_hamming_distance from rest_framework.decorators import action from rest_framework.response import Response from rest_framework.serializers import CharField @@ -21,49 +26,35 @@ from rest_framework.serializers import Serializer from 
rest_framework.viewsets import ReadOnlyModelViewSet -from matchcode_toolkit.fingerprinting import create_halohash_chunks -from matchcode_toolkit.fingerprinting import hexstring_to_binarray -from matchcode_toolkit.fingerprinting import split_fingerprint -from matchcode_toolkit.halohash import byte_hamming_distance -from matchcode.models import ExactFileIndex -from matchcode.models import ExactPackageArchiveIndex from matchcode.models import ApproximateDirectoryContentIndex from matchcode.models import ApproximateDirectoryStructureIndex +from matchcode.models import ExactFileIndex +from matchcode.models import ExactPackageArchiveIndex class BaseFileIndexSerializer(ModelSerializer): - sha1 = CharField(source='fingerprint') + sha1 = CharField(source="fingerprint") package = HyperlinkedRelatedField( - view_name='api:package-detail', - lookup_field='uuid', - read_only=True + view_name="api:package-detail", lookup_field="uuid", read_only=True ) class ExactFileIndexSerializer(BaseFileIndexSerializer): class Meta: model = ExactFileIndex - fields = ( - 'sha1', - 'package' - ) + fields = ("sha1", "package") class ExactPackageArchiveIndexSerializer(BaseFileIndexSerializer): class Meta: model = ExactPackageArchiveIndex - fields = ( - 'sha1', - 'package' - ) + fields = ("sha1", "package") class BaseDirectoryIndexSerializer(ModelSerializer): fingerprint = ReadOnlyField() package = HyperlinkedRelatedField( - view_name='api:package-detail', - lookup_field='uuid', - read_only=True + view_name="api:package-detail", lookup_field="uuid", read_only=True ) @@ -71,8 +62,8 @@ class ApproximateDirectoryContentIndexSerializer(BaseDirectoryIndexSerializer): class Meta: model = ApproximateDirectoryContentIndex fields = ( - 'fingerprint', - 'package', + "fingerprint", + "package", ) @@ -80,8 +71,8 @@ class ApproximateDirectoryStructureIndexSerializer(BaseDirectoryIndexSerializer) class Meta: model = ApproximateDirectoryStructureIndex fields = ( - 'fingerprint', - 'package', + "fingerprint", + "package", ) @@ -89,9 +80,7 @@ class BaseDirectoryIndexMatchSerializer(Serializer): fingerprint = CharField() matched_fingerprint = CharField() package = HyperlinkedRelatedField( - view_name='api:package-detail', - lookup_field='uuid', - read_only=True + view_name="api:package-detail", lookup_field="uuid", read_only=True ) similarity_score = FloatField() @@ -104,22 +93,19 @@ class CharMultipleWidget(widgets.TextInput): def value_from_datadict(self, data, files, name): value = widgets.SelectMultiple().value_from_datadict(data, files, name) - if not value or value == ['']: - return '' + if not value or value == [""]: + return "" return value def format_value(self, value): - """ - Return a value as it should appear when rendered in a template. - """ - return ', '.join(value) + """Return a value as it should appear when rendered in a template.""" + return ", ".join(value) class MultipleCharField(MultipleChoiceField): - """ - Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`. - """ + """Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`.""" + widget = CharMultipleWidget def valid_value(self, value): @@ -127,9 +113,8 @@ def valid_value(self, value): class MultipleCharFilter(MultipleChoiceFilter): - """ - Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax. 
- """ + """Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax.""" + field_class = MultipleCharField @@ -145,7 +130,7 @@ def filter(self, qs, value): predicate = self.get_filter_predicate(value) old_field_name = next(iter(predicate)) - new_field_name = f'{old_field_name}__in' + new_field_name = f"{old_field_name}__in" predicate[new_field_name] = predicate[old_field_name] predicate.pop(old_field_name) @@ -198,9 +183,9 @@ def filter(self, qs, value): chunk1=chunk1, chunk2=chunk2, chunk3=chunk3, - chunk4=chunk4 + chunk4=chunk4, ), - Q.OR + Q.OR, ) return qs.filter(q) @@ -213,17 +198,13 @@ class BaseFileIndexFilterSet(FilterSet): class ExactFileIndexFilterSet(BaseFileIndexFilterSet): class Meta: model = ExactFileIndex - fields = ( - 'sha1', - ) + fields = ("sha1",) class ExactPackageArchiveFilterSet(BaseFileIndexFilterSet): class Meta: model = ExactPackageArchiveIndex - fields = ( - 'sha1', - ) + fields = ("sha1",) class BaseDirectoryIndexFilterSet(FilterSet): @@ -233,21 +214,17 @@ class BaseDirectoryIndexFilterSet(FilterSet): class ApproximateDirectoryContentFilterSet(BaseDirectoryIndexFilterSet): class Meta: model = ApproximateDirectoryContentIndex - fields = ( - 'fingerprint', - ) + fields = ("fingerprint",) class ApproximateDirectoryStructureFilterSet(BaseDirectoryIndexFilterSet): class Meta: model = ApproximateDirectoryStructureIndex - fields = ( - 'fingerprint', - ) + fields = ("fingerprint",) class BaseFileIndexViewSet(ReadOnlyModelViewSet): - lookup_field = 'sha1' + lookup_field = "sha1" class ExactFileIndexViewSet(BaseFileIndexViewSet): @@ -263,11 +240,11 @@ class ExactPackageArchiveIndexViewSet(BaseFileIndexViewSet): class BaseDirectoryIndexViewSet(ReadOnlyModelViewSet): - lookup_field = 'fingerprint' + lookup_field = "fingerprint" @action(detail=False) def match(self, request): - fingerprints = request.query_params.getlist('fingerprint') + fingerprints = request.query_params.getlist("fingerprint") if not fingerprints: return Response() @@ -285,17 +262,15 @@ def match(self, request): similarity_score = (128 - hd) / 128 results.append( { - 'fingerprint': fingerprint, - 'matched_fingerprint': fp, - 'package': match.package, - 'similarity_score': similarity_score, + "fingerprint": fingerprint, + "matched_fingerprint": fp, + "package": match.package, + "similarity_score": similarity_score, } ) serialized_match_results = BaseDirectoryIndexMatchSerializer( - results, - context={'request': request}, - many=True + results, context={"request": request}, many=True ) return Response(serialized_match_results.data) diff --git a/matchcode/match.py b/matchcode/match.py index 46936bc2..c4a15768 100644 --- a/matchcode/match.py +++ b/matchcode/match.py @@ -10,9 +10,10 @@ from functools import reduce from operator import or_ +from django.db.models import Q + import attr from commoncode.resource import VirtualCodebase -from django.db.models import Q from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode.models import ApproximateDirectoryContentIndex @@ -52,10 +53,9 @@ def do_match(codebase, match_type): The total number of matches found is returned. 
""" - matcher = get_matchers().get(match_type) if not matcher: - raise Exception('Unknown match type: {}'.format(match_type)) + raise Exception(f"Unknown match type: {match_type}") match_count = matcher(codebase) return match_count @@ -63,12 +63,8 @@ def do_match(codebase, match_type): def run_do_match_from_scan(scan_file_location, match_type): vc = VirtualCodebase( location=scan_file_location, - codebase_attributes=dict( - matches=attr.ib(default=attr.Factory(list)) - ), - resource_attributes=dict( - matched_to=attr.ib(default=attr.Factory(list)) - ) + codebase_attributes=dict(matches=attr.ib(default=attr.Factory(list))), + resource_attributes=dict(matched_to=attr.ib(default=attr.Factory(list))), ) vc = compute_codebase_directory_fingerprints(vc) do_match(vc, match_type) @@ -83,9 +79,11 @@ def package_archive_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if (resource.is_dir - or not resource.is_archive - or resource.extra_data.get('matched', False)): + if ( + resource.is_dir + or not resource.is_archive + or resource.extra_data.get("matched", False) + ): continue archive_matches, match_type = get_archive_match(resource) @@ -107,7 +105,7 @@ def approximate_directory_content_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if resource.is_file or resource.extra_data.get('matched', False): + if resource.is_file or resource.extra_data.get("matched", False): continue directory_matches, match_type = get_directory_content_match(resource) @@ -115,8 +113,7 @@ def approximate_directory_content_match(codebase): continue match_count += directory_matches.count() - tag_matched_resources(resource, codebase, - directory_matches, match_type) + tag_matched_resources(resource, codebase, directory_matches, match_type) return match_count @@ -128,7 +125,7 @@ def approximate_directory_structure_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if resource.is_file or resource.extra_data.get('matched', False): + if resource.is_file or resource.extra_data.get("matched", False): continue directory_matches, match_type = get_directory_structure_match(resource) @@ -136,8 +133,7 @@ def approximate_directory_structure_match(codebase): continue match_count += directory_matches.count() - tag_matched_resources(resource, codebase, - directory_matches, match_type) + tag_matched_resources(resource, codebase, directory_matches, match_type) return match_count @@ -149,7 +145,7 @@ def individual_file_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if resource.is_dir or resource.extra_data.get('matched', False): + if resource.is_dir or resource.extra_data.get("matched", False): continue file_matches, match_type = get_file_match(resource) @@ -169,7 +165,7 @@ def approximate_file_match(codebase): """ match_count = 0 for resource in codebase.walk(topdown=True): - if resource.is_dir or resource.extra_data.get('matched', False): + if resource.is_dir or resource.extra_data.get("matched", False): continue file_matches, match_type = get_approximate_file_match(resource) if not file_matches: @@ -181,71 +177,56 @@ def approximate_file_match(codebase): def get_directory_content_match(resource): - """ - Match a directory to a Package using its contents - """ - directory_content_fingerprint = resource.extra_data.get( - 'directory_content', '') + """Match a directory to a Package using its contents""" + directory_content_fingerprint = resource.extra_data.get("directory_content", "") matches = 
ApproximateDirectoryContentIndex.objects.none() - match_type = '' + match_type = "" if directory_content_fingerprint: directory_matches = ApproximateDirectoryContentIndex.match( - directory_content_fingerprint, - resource + directory_content_fingerprint, resource ) matches |= directory_matches - match_type = 'approximate-content' + match_type = "approximate-content" return matches, match_type # TODO: rename match_directory_structure def get_directory_structure_match(resource): - """ - Match a directory to a Package using its structure - """ - directory_structure_fingerprint = resource.extra_data.get( - 'directory_structure', '') + """Match a directory to a Package using its structure""" + directory_structure_fingerprint = resource.extra_data.get("directory_structure", "") matches = ApproximateDirectoryStructureIndex.objects.none() - match_type = '' + match_type = "" if directory_structure_fingerprint: directory_matches = ApproximateDirectoryStructureIndex.match( - directory_structure_fingerprint, - resource + directory_structure_fingerprint, resource ) matches |= directory_matches - match_type = 'approximate-structure' + match_type = "approximate-structure" return matches, match_type def get_archive_match(resource): - """ - Match an Archive resource to a Package - """ + """Match an Archive resource to a Package""" file_matches = ExactPackageArchiveIndex.match(resource.sha1) - return file_matches, 'exact-archive' + return file_matches, "exact-archive" def get_file_match(resource): - """ - Match an individual file back to the Package it is from - """ + """Match an individual file back to the Package it is from""" file_matches = ExactFileIndex.match(resource.sha1) - return file_matches, 'exact-file' + return file_matches, "exact-file" def get_approximate_file_match(resource): - """ - Approximately match an individual file back to the Package it is from - """ - if hasattr(resource, 'halo1'): + """Approximately match an individual file back to the Package it is from""" + if hasattr(resource, "halo1"): resource_content_fingerprint = resource.halo1 else: - resource_content_fingerprint = resource.extra_data.get('halo1', '') + resource_content_fingerprint = resource.extra_data.get("halo1", "") file_matches = ApproximateResourceContentIndex.match( - resource_content_fingerprint, - resource + resource_content_fingerprint, resource ) - return file_matches, 'approximate-file' + return file_matches, "approximate-file" def tag_matched_resource(resource, codebase, purl): @@ -255,7 +236,7 @@ def tag_matched_resource(resource, codebase, purl): """ if purl not in resource.matched_to: resource.matched_to.append(purl) - resource.extra_data['matched'] = True + resource.extra_data["matched"] = True resource.save(codebase) @@ -268,7 +249,7 @@ def tag_matched_resources(resource, codebase, matches, match_type): for match in matches: # Prep matched package data and append to `codebase` matched_package_info = match.package.to_dict() - matched_package_info['match_type'] = match_type + matched_package_info["match_type"] = match_type codebase.attributes.matches.append(matched_package_info) purl = match.package.package_url @@ -280,18 +261,17 @@ def tag_matched_resources(resource, codebase, matches, match_type): # by or), then querying the matched packages resources to see if any of # those suffixes match a package child resource path for child in resource.walk(codebase): - query = reduce(or_, (Q(path=suffix) - for suffix in path_suffixes(child.path)), Q()) + query = reduce( + or_, (Q(path=suffix) for suffix in 
path_suffixes(child.path)), Q() + ) matched_child_resources = match.package.resources.filter(query) if len(matched_child_resources) > 0: tag_matched_resource(child, codebase, purl) def path_suffixes(path): - """ - Yield all the suffixes of `path`, starting from the longest (e.g. more segments). - """ - segments = path.strip('/').split('/') + """Yield all the suffixes of `path`, starting from the longest (e.g. more segments).""" + segments = path.strip("/").split("/") suffixes = (segments[i:] for i in range(len(segments))) for suffix in suffixes: - yield '/'.join(suffix) + yield "/".join(suffix) diff --git a/matchcode/models.py b/matchcode/models.py index 83aa59fa..bca5d9db 100644 --- a/matchcode/models.py +++ b/matchcode/models.py @@ -17,6 +17,7 @@ from django.db import models from django.forms.models import model_to_dict from django.utils.translation import gettext_lazy as _ + from matchcode_toolkit.fingerprinting import create_halohash_chunks from matchcode_toolkit.fingerprinting import hexstring_to_binarray from matchcode_toolkit.fingerprinting import split_fingerprint @@ -38,7 +39,7 @@ def logger_debug(*args): - return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) + return logger.debug(" ".join(isinstance(a, str) and a or repr(a) for a in args)) ############################################################################### @@ -48,14 +49,14 @@ class BaseFileIndex(models.Model): sha1 = models.BinaryField( max_length=20, db_index=True, - help_text='Binary form of a SHA1 checksum in lowercase hex for a file', + help_text="Binary form of a SHA1 checksum in lowercase hex for a file", null=False, blank=False, ) package = models.ForeignKey( Package, - help_text='The Package that this file is from', + help_text="The Package that this file is from", null=False, on_delete=models.CASCADE, ) @@ -67,22 +68,14 @@ class Meta: def index(cls, sha1, package): try: sha1_bin = hexstring_to_binarray(sha1) - bfi, created = cls.objects.get_or_create( - package=package, - sha1=sha1_bin - ) + bfi, created = cls.objects.get_or_create(package=package, sha1=sha1_bin) if created: logger.info( - '{} - Inserted {} for Package {}:\t{}'.format( - datetime.utcnow().isoformat(), - bfi.__class__.__name__, - package.download_url, - sha1 - ) + f"{datetime.utcnow().isoformat()} - Inserted {bfi.__class__.__name__} for Package {package.download_url}:\t{sha1}" ) return bfi, created except Exception as e: - msg = f'Error creating FileIndex:\n' + msg = "Error creating FileIndex:\n" msg += get_error_message(e) package.index_error = msg package.save() @@ -90,11 +83,9 @@ def index(cls, sha1, package): @classmethod def match(cls, sha1): - """ - Return a list of matched Packages that contains a file with a SHA1 value of `sha1` - """ + """Return a list of matched Packages that contain a file with a SHA1 value of `sha1`""" if TRACE: - logger_debug(cls.__name__, 'match:', 'sha1:', sha1) + logger_debug(cls.__name__, "match:", "sha1:", sha1) if not sha1: return cls.objects.none() @@ -105,11 +96,11 @@ def match(cls, sha1): for match in matches: package = match.package dct = model_to_dict(package) - logger_debug(cls.__name__, 'match:', 'matched_file:', dct) + logger_debug(cls.__name__, "match:", "matched_file:", dct) return matches def fingerprint(self): - return binascii.hexlify(self.sha1).decode('utf-8') + return binascii.hexlify(self.sha1).decode("utf-8") class ExactPackageArchiveIndex(BaseFileIndex): @@ -134,63 +125,62 @@ def bah128_ranges(indexed_elements_count, range_ratio=0.05): """ return (
int(indexed_elements_count * (1 - range_ratio)), - int(indexed_elements_count * (1 + range_ratio)) + int(indexed_elements_count * (1 + range_ratio)), ) class ApproximateMatchingHashMixin(models.Model): indexed_elements_count = models.IntegerField( - help_text='Number of elements that went into the fingerprint', + help_text="Number of elements that went into the fingerprint", ) chunk1 = models.BinaryField( max_length=4, db_index=True, - help_text='Binary form of the first 8 (0-7) hex digits of the fingerprint', + help_text="Binary form of the first 8 (0-7) hex digits of the fingerprint", null=False, - blank=False + blank=False, ) chunk2 = models.BinaryField( max_length=4, db_index=True, - help_text='Binary form of the second 8 (8-15) hex digits of the fingerprint', + help_text="Binary form of the second 8 (8-15) hex digits of the fingerprint", null=False, - blank=False + blank=False, ) chunk3 = models.BinaryField( max_length=4, db_index=True, - help_text='Binary form of the third 8 (16-23) hex digits of the fingerprint', + help_text="Binary form of the third 8 (16-23) hex digits of the fingerprint", null=False, - blank=False + blank=False, ) chunk4 = models.BinaryField( max_length=4, db_index=True, - help_text='Binary form of the fourth 8 (24-32) hex digits of the fingerprint', + help_text="Binary form of the fourth 8 (24-31) hex digits of the fingerprint", null=False, - blank=False + blank=False, ) package = models.ForeignKey( Package, - help_text='The Package that this resource is a part of', + help_text="The Package that this resource is a part of", null=False, on_delete=models.CASCADE, ) path = models.CharField( max_length=2000, - help_text=_('The full path value of this resource'), + help_text=_("The full path value of this resource"), ) class Meta: abstract = True - unique_together = ['chunk1', 'chunk2', - 'chunk3', 'chunk4', 'package', 'path'] + unique_together = ["chunk1", "chunk2", "chunk3", "chunk4", "package", "path"] def __str__(self): return self.fingerprint() @@ -207,8 +197,7 @@ def index(cls, fingerprint, resource_path, package): """ try: indexed_elements_count, fp = split_fingerprint(fingerprint) - fp_chunk1, fp_chunk2, fp_chunk3, fp_chunk4 = create_halohash_chunks( - fp) + fp_chunk1, fp_chunk2, fp_chunk3, fp_chunk4 = create_halohash_chunks(fp) bdi, created = cls.objects.get_or_create( indexed_elements_count=indexed_elements_count, chunk1=fp_chunk1, @@ -220,16 +209,11 @@ def index(cls, fingerprint, resource_path, package): ) if created: logger.info( - '{} - Inserted {} for Package {}:\t{}'.format( - datetime.utcnow().isoformat(), - bdi.__class__.__name__, - package.download_url, - fingerprint - ) + f"{datetime.utcnow().isoformat()} - Inserted {bdi.__class__.__name__} for Package {package.download_url}:\t{fingerprint}" ) return bdi, created except Exception as e: - msg = f'Error creating ApproximateMatchingHashMixin:\n' + msg = "Error creating ApproximateMatchingHashMixin:\n" msg += get_error_message(e) package.index_error = msg package.save() @@ -237,17 +221,15 @@ def index(cls, fingerprint, resource_path, package): @classmethod def match(cls, fingerprint, resource=None, exact_match=False): - """ - Return a list of matched Packages - """ + """Return a list of matched Packages""" if TRACE: logger_debug( cls.__name__, - 'match:', - 'fingerprint:', + "match:", + "fingerprint:", fingerprint, - 'resource:', - resource + "resource:", + resource, ) if not fingerprint: @@ -270,28 +252,16 @@ def match(cls, fingerprint, resource=None, exact_match=False): # Step 1: find fingerprints
with matching chunks range = bah128_ranges(indexed_elements_count) matches = cls.objects.filter( - models.Q( - indexed_elements_count__range=range, - chunk1=chunk1 - ) | - models.Q( - indexed_elements_count__range=range, - chunk2=chunk2 - ) | - models.Q( - indexed_elements_count__range=range, - chunk3=chunk3 - ) | - models.Q( - indexed_elements_count__range=range, - chunk4=chunk4 - ) + models.Q(indexed_elements_count__range=range, chunk1=chunk1) + | models.Q(indexed_elements_count__range=range, chunk2=chunk2) + | models.Q(indexed_elements_count__range=range, chunk3=chunk3) + | models.Q(indexed_elements_count__range=range, chunk4=chunk4) ) if TRACE: for match in matches: dct = model_to_dict(match) - logger_debug(cls.__name__, 'match:', 'matched_package:', dct) + logger_debug(cls.__name__, "match:", "matched_package:", dct) # Step 2: calculate Hamming distance of all matches @@ -309,8 +279,7 @@ def match(cls, fingerprint, resource=None, exact_match=False): # TODO: try other thresholds if this is too restrictive if hd < 8: # Save match to `matches_by_hamming_distance` by adding the matched object to the queryset - matches_by_hamming_distance[hd] |= cls.objects.filter( - pk=match.pk) + matches_by_hamming_distance[hd] |= cls.objects.filter(pk=match.pk) if TRACE: logger_debug(list(matches_by_hamming_distance.items())) @@ -319,9 +288,7 @@ def match(cls, fingerprint, resource=None, exact_match=False): # TODO: consider limiting matches for brevity hamming_distances_and_matches = [] for hamming_distance, matches in sorted(matches_by_hamming_distance.items()): - hamming_distances_and_matches.append( - (hamming_distance, matches) - ) + hamming_distances_and_matches.append((hamming_distance, matches)) if TRACE: for hamming_distance, matches in hamming_distances_and_matches: @@ -329,11 +296,11 @@ def match(cls, fingerprint, resource=None, exact_match=False): dct = model_to_dict(match) logger_debug( cls.__name__, - 'match:', - 'step_3_hamming_distance:', + "match:", + "step_3_hamming_distance:", hamming_distance, - 'step_3_matched_package:', - dct + "step_3_matched_package:", + dct, ) # Step 4: use file heuristics to rank matches from step 3 @@ -355,35 +322,34 @@ def match(cls, fingerprint, resource=None, exact_match=False): if TRACE: logger_debug( cls.__name__, - 'match:', - 'step_4_matched_resource:', - matched_resource + "match:", + "step_4_matched_resource:", + matched_resource, ) # Compute size and name difference if matched_resource.is_file: - size_difference = abs( - resource_size - matched_resource.size) + size_difference = abs(resource_size - matched_resource.size) else: # TODO: index number of files in a directory so we can use # that for size comparison. For now, we are going to # disregard size as a factor. 
size_difference = 0 name_sequence_matcher = SequenceMatcher( - a=resource.name, b=matched_resource.name) + a=resource.name, b=matched_resource.name + ) name_difference = 1 - name_sequence_matcher.ratio() - rank_attributes = ( - hamming_distance, size_difference, name_difference) + rank_attributes = (hamming_distance, size_difference, name_difference) matches_by_rank_attributes[rank_attributes].append(match) if TRACE: logger_debug( cls.__name__, - 'match:', - 'step_4_size_difference:', + "match:", + "step_4_size_difference:", size_difference, - 'step_4_name_difference:', - name_difference + "step_4_name_difference:", + name_difference, ) # Order these from low to high (low being low difference/very similar), first by hamming distance, then by size difference, and finally by name difference. @@ -393,15 +359,9 @@ def match(cls, fingerprint, resource=None, exact_match=False): if TRACE: dct = model_to_dict(match) - logger_debug( - cls.__name__, - 'match:', - 'step_4_best_match:', - dct - ) + logger_debug(cls.__name__, "match:", "step_4_best_match:", dct) - matches = cls.objects.filter( - pk__in=[match.pk for match in ranked_matches]) + matches = cls.objects.filter(pk__in=[match.pk for match in ranked_matches]) return matches def get_chunks(self): @@ -412,11 +372,12 @@ def match(cls, fingerprint, resource=None, exact_match=False): return chunk1, chunk2, chunk3, chunk4 def fingerprint(self): - indexed_element_count_as_hex_bytes = b'%08x' % self.indexed_elements_count + indexed_element_count_as_hex_bytes = b"%08x" % self.indexed_elements_count chunk1, chunk2, chunk3, chunk4 = self.get_chunks() - fingerprint = indexed_element_count_as_hex_bytes + \ - chunk1 + chunk2 + chunk3 + chunk4 - return fingerprint.decode('utf-8') + fingerprint = ( + indexed_element_count_as_hex_bytes + chunk1 + chunk2 + chunk3 + chunk4 + ) + return fingerprint.decode("utf-8") class ApproximateDirectoryStructureIndex(ApproximateMatchingHashMixin): diff --git a/matchcode/tests/__init__.py b/matchcode/tests/__init__.py index d4b312b7..cf38b66f 100644 --- a/matchcode/tests/__init__.py +++ b/matchcode/tests/__init__.py @@ -9,5 +9,4 @@ import os - FIXTURES_REGEN = os.environ.get("MATCHCODE_TEST_FIXTURES_REGEN", False) diff --git a/matchcode/tests/test_match.py b/matchcode/tests/test_match.py index fa9df079..a09fab4d 100644 --- a/matchcode/tests/test_match.py +++ b/matchcode/tests/test_match.py @@ -28,449 +28,455 @@ class MatchPackagesTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def setUp(self): # Execute the superclass' setUp method before creating our own # DB objects - super(MatchPackagesTestCase, self).setUp() + super().setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='abbot-0.12.3.jar', - sha1='51d28a27d919ce8690a40f4f335b9d591ceb16e9', - md5='38206e62a54b0489fb6baa4db5a06093', + filename="abbot-0.12.3.jar", + sha1="51d28a27d919ce8690a40f4f335b9d591ceb16e9", + md5="38206e62a54b0489fb6baa4db5a06093", size=689791, - name='abbot', - version='0.12.3', - download_url='http://repo1.maven.org/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar', - type='maven', + name="abbot", + version="0.12.3", + download_url="http://repo1.maven.org/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar", + type="maven", ) self.test_package1_metadata = self.test_package1.to_dict() self.test_package2, _ = Package.objects.get_or_create( - filename='dojoz-0.4.1-1.jar', - sha1='ae9d68fd6a29906606c2d9407d1cc0749ef84588', -
md5='508361a1c6273a4c2b8e4945618b509f', + filename="dojoz-0.4.1-1.jar", + sha1="ae9d68fd6a29906606c2d9407d1cc0749ef84588", + md5="508361a1c6273a4c2b8e4945618b509f", size=876720, - name='dojoz', - version='0.4.1-1', - download_url='https://repo1.maven.org/maven2/org/zkoss/zkforge/dojoz/0.4.1-1/dojoz-0.4.1-1.jar', - type='maven', + name="dojoz", + version="0.4.1-1", + download_url="https://repo1.maven.org/maven2/org/zkoss/zkforge/dojoz/0.4.1-1/dojoz-0.4.1-1.jar", + type="maven", ) self.test_package2_metadata = self.test_package2.to_dict() self.test_package3, _ = Package.objects.get_or_create( - filename='acegi-security-0.51.jar', - sha1='ede156692b33872f5ee9465b7a06d6b2bc9e5e7f', + filename="acegi-security-0.51.jar", + sha1="ede156692b33872f5ee9465b7a06d6b2bc9e5e7f", size=176954, - name='acegi-security', - version='0.51', - download_url='https://repo1.maven.org/maven2/acegisecurity/acegi-security/0.51/acegi-security-0.51.jar', - type='maven' + name="acegi-security", + version="0.51", + download_url="https://repo1.maven.org/maven2/acegisecurity/acegi-security/0.51/acegi-security-0.51.jar", + type="maven", ) self.test_package3_metadata = self.test_package3.to_dict() self.test_package4, _ = Package.objects.get_or_create( - filename='test.tar.gz', - sha1='deadbeef', + filename="test.tar.gz", + sha1="deadbeef", size=42589, - name='test', - version='0.01', - download_url='https://test.com/test.tar.gz', - type='maven' + name="test", + version="0.01", + download_url="https://test.com/test.tar.gz", + type="maven", ) self.test_package4_metadata = self.test_package4.to_dict() # Populate ExactPackageArchiveIndexFingerprint table index_packages_sha1() - load_resources_from_scan(self.get_test_loc( - 'models/match-test.json'), self.test_package4) + load_resources_from_scan( + self.get_test_loc("models/match-test.json"), self.test_package4 + ) index_package_directories(self.test_package4) index_package_files_sha1( - self.test_package4, self.get_test_loc('models/match-test.json')) + self.test_package4, self.get_test_loc("models/match-test.json") + ) # Add approximate file resource self.test_package5, _ = Package.objects.get_or_create( - filename='inflate.tar.gz', - sha1='deadfeed', - type='generic', - name='inflate', - version='1.0.0', - download_url='inflate.com/inflate.tar.gz', + filename="inflate.tar.gz", + sha1="deadfeed", + type="generic", + name="inflate", + version="1.0.0", + download_url="inflate.com/inflate.tar.gz", ) self.test_resource5, _ = Resource.objects.get_or_create( - path='inflate.c', - size=55466, - package=self.test_package5 + path="inflate.c", size=55466, package=self.test_package5 ) - self.test_resource5_fingerprint = '000018fba23a49e4cd40718d1297be719e6564a4' + self.test_resource5_fingerprint = "000018fba23a49e4cd40718d1297be719e6564a4" ApproximateResourceContentIndex.index( self.test_resource5_fingerprint, self.test_resource5.path, - self.test_package5 + self.test_package5, ) def test_do_match_package_archive_match(self): - input_file = self.get_test_loc('models/match-test.json') + input_file = self.get_test_loc("models/match-test.json") vc = run_do_match_from_scan(input_file, EXACT_PACKAGE_ARCHIVE_MATCH) - expected = self.get_test_loc( - 'models/match-test-exact-package-results.json') + expected = self.get_test_loc("models/match-test-exact-package-results.json") self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_approximate_directory_structure_match(self): - input_file = self.get_test_loc('models/match-test.json') - vc = run_do_match_from_scan( - input_file, 
APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("models/match-test.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'models/match-test-approximate-directory-structure-results.json') + "models/match-test-approximate-directory-structure-results.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_approximate_directory_content_match(self): - input_file = self.get_test_loc('models/match-test.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("models/match-test.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'models/match-test-approximate-directory-content-results.json') + "models/match-test-approximate-directory-content-results.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_package_file_match(self): - input_file = self.get_test_loc('models/match-test.json') + input_file = self.get_test_loc("models/match-test.json") vc = run_do_match_from_scan(input_file, EXACT_FILE_MATCH) - expected = self.get_test_loc( - 'models/match-test-exact-file-results.json') + expected = self.get_test_loc("models/match-test-exact-file-results.json") self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_approximate_package_file_match(self): input_file = self.get_test_loc( - 'match/approximate-file-matching/approximate-match-test.json') + "match/approximate-file-matching/approximate-match-test.json" + ) vc = run_do_match_from_scan(input_file, APPROXIMATE_FILE_MATCH) expected = self.get_test_loc( - 'match/approximate-file-matching/approximate-match-test-results.json') + "match/approximate-file-matching/approximate-match-test-results.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) class MatchNestedPackagesTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def setUp(self): # Execute the superclass' setUp method before creating our own # DB objects - super(MatchNestedPackagesTestCase, self).setUp() + super().setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='plugin-request-2.4.1.tgz', - sha1='7295749caddd3c52be472eef6623a7b441ed17d6', + filename="plugin-request-2.4.1.tgz", + sha1="7295749caddd3c52be472eef6623a7b441ed17d6", size=7269, - name='plugin-request', - version='2.4.1', - download_url='https://registry.npmjs.org/@umijs/plugin-request/-/plugin-request-2.4.1.tgz', - type='npm', + name="plugin-request", + version="2.4.1", + download_url="https://registry.npmjs.org/@umijs/plugin-request/-/plugin-request-2.4.1.tgz", + type="npm", + ) + load_resources_from_scan( + self.get_test_loc("match/nested/plugin-request-2.4.1-ip.json"), + self.test_package1, ) - load_resources_from_scan(self.get_test_loc( - 'match/nested/plugin-request-2.4.1-ip.json'), self.test_package1) index_package_directories(self.test_package1) self.test_package2, _ = Package.objects.get_or_create( - filename='underscore-1.10.9.tgz', - sha1='ba7a9cfc15873e67821611503a34a7c26bf7264f', + filename="underscore-1.10.9.tgz", + sha1="ba7a9cfc15873e67821611503a34a7c26bf7264f", size=26569, - name='underscore', - version='1.10.9', - download_url='https://registry.npmjs.org/@types/underscore/-/underscore-1.10.9.tgz', - type='npm', + name="underscore", + version="1.10.9", + 
download_url="https://registry.npmjs.org/@types/underscore/-/underscore-1.10.9.tgz", + type="npm", + ) + load_resources_from_scan( + self.get_test_loc("match/nested/underscore-1.10.9-ip.json"), + self.test_package2, ) - load_resources_from_scan(self.get_test_loc( - 'match/nested/underscore-1.10.9-ip.json'), self.test_package2) index_package_directories(self.test_package2) def test_do_match_approximate_directory_structure_match(self): - input_file = self.get_test_loc('match/nested/nested.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/nested/nested.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/nested/nested-directory-structure-match-expected.json') + "match/nested/nested-directory-structure-match-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_do_match_approximate_directory_content_match(self): - input_file = self.get_test_loc('match/nested/nested.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/nested/nested.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/nested/nested-directory-content-match-expected.json') + "match/nested/nested-directory-content-match-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) class MatchUtilityFunctionsTestCase(MatchcodeTestCase): def test_path_suffixes(self): - suffixes = list(path_suffixes('/foo/bar/baz/qux')) - expected = ['foo/bar/baz/qux', 'bar/baz/qux', 'baz/qux', 'qux'] + suffixes = list(path_suffixes("/foo/bar/baz/qux")) + expected = ["foo/bar/baz/qux", "bar/baz/qux", "baz/qux", "qux"] self.assertEqual(expected, suffixes) class DirectoryMatchingTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def setUp(self): - super(DirectoryMatchingTestCase, self).setUp() + super().setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='abbrev-1.0.3.tgz', - sha1='aa049c967f999222aa42e14434f0c562ef468241', - name='abbrev', - version='1.0.3', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.3.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i.json'), self.test_package1) + filename="abbrev-1.0.3.tgz", + sha1="aa049c967f999222aa42e14434f0c562ef468241", + name="abbrev", + version="1.0.3", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.3.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.3-i.json"), + self.test_package1, + ) index_package_directories(self.test_package1) self.test_package2, _ = Package.objects.get_or_create( - filename='abbrev-1.0.4.tgz', - sha1='bd55ae5e413ba1722ee4caba1f6ea10414a59ecd', - name='abbrev', - version='1.0.4', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.4.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i.json'), self.test_package2) + filename="abbrev-1.0.4.tgz", + sha1="bd55ae5e413ba1722ee4caba1f6ea10414a59ecd", + name="abbrev", + version="1.0.4", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.4.tgz", + ) + load_resources_from_scan( + 
self.get_test_loc("match/directory-matching/abbrev-1.0.4-i.json"), + self.test_package2, + ) index_package_directories(self.test_package2) self.test_package3, _ = Package.objects.get_or_create( - filename='abbrev-1.0.5.tgz', - sha1='5d8257bd9ebe435e698b2fa431afde4fe7b10b03', - name='abbrev', - version='1.0.5', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.5.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i.json'), self.test_package3) + filename="abbrev-1.0.5.tgz", + sha1="5d8257bd9ebe435e698b2fa431afde4fe7b10b03", + name="abbrev", + version="1.0.5", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.5.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.5-i.json"), + self.test_package3, + ) index_package_directories(self.test_package3) self.test_package4, _ = Package.objects.get_or_create( - filename='abbrev-1.0.6.tgz', - sha1='b6d632b859b3fa2d6f7e4b195472461b9e32dc30', - name='abbrev', - version='1.0.6', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.6.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i.json'), self.test_package4) + filename="abbrev-1.0.6.tgz", + sha1="b6d632b859b3fa2d6f7e4b195472461b9e32dc30", + name="abbrev", + version="1.0.6", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.6.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.6-i.json"), + self.test_package4, + ) index_package_directories(self.test_package4) self.test_package5, _ = Package.objects.get_or_create( - filename='abbrev-1.0.7.tgz', - sha1='5b6035b2ee9d4fb5cf859f08a9be81b208491843', - name='abbrev', - version='1.0.7', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.7.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i.json'), self.test_package5) + filename="abbrev-1.0.7.tgz", + sha1="5b6035b2ee9d4fb5cf859f08a9be81b208491843", + name="abbrev", + version="1.0.7", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.7.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.7-i.json"), + self.test_package5, + ) index_package_directories(self.test_package5) self.test_package6, _ = Package.objects.get_or_create( - filename='abbrev-1.0.9.tgz', - sha1='91b4792588a7738c25f35dd6f63752a2f8776135', - name='abbrev', - version='1.0.9', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.0.9.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i.json'), self.test_package6) + filename="abbrev-1.0.9.tgz", + sha1="91b4792588a7738c25f35dd6f63752a2f8776135", + name="abbrev", + version="1.0.9", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.0.9.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.0.9-i.json"), + self.test_package6, + ) index_package_directories(self.test_package6) self.test_package7, _ = Package.objects.get_or_create( - filename='abbrev-1.1.0.tgz', - sha1='d0554c2256636e2f56e7c2e5ad183f859428d81f', - name='abbrev', - version='1.1.0', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.1.0.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i.json'), self.test_package7) + 
filename="abbrev-1.1.0.tgz", + sha1="d0554c2256636e2f56e7c2e5ad183f859428d81f", + name="abbrev", + version="1.1.0", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.1.0.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.1.0-i.json"), + self.test_package7, + ) index_package_directories(self.test_package7) self.test_package8, _ = Package.objects.get_or_create( - filename='abbrev-1.1.1.tgz', - sha1='f8f2c887ad10bf67f634f005b6987fed3179aac8', - name='abbrev', - version='1.1.1', - type='npm', - download_url='https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz', - ) - load_resources_from_scan(self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i.json'), self.test_package8) + filename="abbrev-1.1.1.tgz", + sha1="f8f2c887ad10bf67f634f005b6987fed3179aac8", + name="abbrev", + version="1.1.1", + type="npm", + download_url="https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", + ) + load_resources_from_scan( + self.get_test_loc("match/directory-matching/abbrev-1.1.1-i.json"), + self.test_package8, + ) index_package_directories(self.test_package8) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_3(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.3-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i-expected.json') + "match/directory-matching/abbrev-1.0.3-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_4(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.4-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i-expected.json') + "match/directory-matching/abbrev-1.0.4-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_5(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.5-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i-expected.json') + "match/directory-matching/abbrev-1.0.5-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_6(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.6-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i-expected.json') + "match/directory-matching/abbrev-1.0.6-i-expected.json" + ) self.check_codebase(vc, expected, 
regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_7(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.7-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i-expected.json') + "match/directory-matching/abbrev-1.0.7-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_0_9(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.9-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i-expected.json') + "match/directory-matching/abbrev-1.0.9-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_1_0(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.1.0-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i-expected.json') + "match/directory-matching/abbrev-1.1.0-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_abbrev_1_1_1(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.1.1-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i-expected.json') + "match/directory-matching/abbrev-1.1.1-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryStructureIndex_get_stdin_3_0_2(self): input_file = self.get_test_loc( - 'match/directory-matching/get-stdin-3.0.2-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) + "match/directory-matching/get-stdin-3.0.2-i.json" + ) + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_STRUCTURE_MATCH) expected = self.get_test_loc( - 'match/directory-matching/get-stdin-3.0.2-i-expected.json') + "match/directory-matching/get-stdin-3.0.2-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_3(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.3-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.3-i-expected.json') + 
"match/directory-matching/abbrev-1.0.3-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_4(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.4-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.4-i-expected.json') + "match/directory-matching/abbrev-1.0.4-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_5(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.5-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.5-i-expected.json') + "match/directory-matching/abbrev-1.0.5-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_6(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.6-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.6-i-expected.json') + "match/directory-matching/abbrev-1.0.6-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_7(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.7-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.7-i-expected.json') + "match/directory-matching/abbrev-1.0.7-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_0_9(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.0.9-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.0.9-i-expected.json') + "match/directory-matching/abbrev-1.0.9-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_1_0(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.0-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.1.0-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 
'match/directory-matching/abbrev-1.1.0-i-expected.json') + "match/directory-matching/abbrev-1.1.0-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_abbrev_1_1_1(self): - input_file = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + input_file = self.get_test_loc("match/directory-matching/abbrev-1.1.1-i.json") + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/abbrev-1.1.1-i-expected.json') + "match/directory-matching/abbrev-1.1.1-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) def test_match_ApproximateDirectoryContentIndex_get_stdin_3_0_2(self): input_file = self.get_test_loc( - 'match/directory-matching/get-stdin-3.0.2-i.json') - vc = run_do_match_from_scan( - input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) + "match/directory-matching/get-stdin-3.0.2-i.json" + ) + vc = run_do_match_from_scan(input_file, APPROXIMATE_DIRECTORY_CONTENT_MATCH) expected = self.get_test_loc( - 'match/directory-matching/get-stdin-3.0.2-i-expected.json') + "match/directory-matching/get-stdin-3.0.2-i-expected.json" + ) self.check_codebase(vc, expected, regen=FIXTURES_REGEN) diff --git a/matchcode/tests/test_models.py b/matchcode/tests/test_models.py index d3a84c6e..d0eef2e9 100644 --- a/matchcode/tests/test_models.py +++ b/matchcode/tests/test_models.py @@ -7,7 +7,6 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -import binascii import os import attr @@ -38,55 +37,55 @@ class BaseModelTest(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") maxDiff = None def setUp(self): - super(BaseModelTest, self).setUp() + super().setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='abbot-0.12.3.jar', - sha1='51d28a27d919ce8690a40f4f335b9d591ceb16e9', - md5='38206e62a54b0489fb6baa4db5a06093', + filename="abbot-0.12.3.jar", + sha1="51d28a27d919ce8690a40f4f335b9d591ceb16e9", + md5="38206e62a54b0489fb6baa4db5a06093", size=689791, - name='abbot', - version='0.12.3', - download_url='http://repo1.maven.org/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar', - type='maven', + name="abbot", + version="0.12.3", + download_url="http://repo1.maven.org/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar", + type="maven", ) self.test_package1_metadata = self.test_package1.to_dict() self.test_package2, _ = Package.objects.get_or_create( - filename='dojoz-0.4.1-1.jar', - sha1='ae9d68fd6a29906606c2d9407d1cc0749ef84588', - md5='508361a1c6273a4c2b8e4945618b509f', + filename="dojoz-0.4.1-1.jar", + sha1="ae9d68fd6a29906606c2d9407d1cc0749ef84588", + md5="508361a1c6273a4c2b8e4945618b509f", size=876720, - name='dojoz', - version='0.4.1-1', - download_url='https://repo1.maven.org/maven2/org/zkoss/zkforge/dojoz/0.4.1-1/dojoz-0.4.1-1.jar', - type='maven', + name="dojoz", + version="0.4.1-1", + download_url="https://repo1.maven.org/maven2/org/zkoss/zkforge/dojoz/0.4.1-1/dojoz-0.4.1-1.jar", + type="maven", ) self.test_package2_metadata = self.test_package2.to_dict() self.test_package3, _ = Package.objects.get_or_create( - filename='acegi-security-0.51.jar', - sha1='ede156692b33872f5ee9465b7a06d6b2bc9e5e7f', + filename="acegi-security-0.51.jar", + sha1="ede156692b33872f5ee9465b7a06d6b2bc9e5e7f", size=176954, - 
name='acegi-security', - version='0.51', - download_url='https://repo1.maven.org/maven2/acegisecurity/acegi-security/0.51/acegi-security-0.51.jar', - type='maven' + name="acegi-security", + version="0.51", + download_url="https://repo1.maven.org/maven2/acegisecurity/acegi-security/0.51/acegi-security-0.51.jar", + type="maven", ) self.test_package3_metadata = self.test_package3.to_dict() self.test_package4, _ = Package.objects.get_or_create( - filename='test.tar.gz', - sha1='deadbeef', + filename="test.tar.gz", + sha1="deadbeef", size=42589, - name='test', - version='0.01', - download_url='https://test.com/test.tar.gz', - type='maven' + name="test", + version="0.01", + download_url="https://test.com/test.tar.gz", + type="maven", ) self.test_package4_metadata = self.test_package4.to_dict() @@ -94,38 +93,38 @@ def setUp(self): index_packages_sha1() # Populate ExactFileIndexFingerprint table - load_resources_from_scan(self.get_test_loc( - 'models/match-test.json'), self.test_package4) + load_resources_from_scan( + self.get_test_loc("models/match-test.json"), self.test_package4 + ) index_package_directories(self.test_package4) index_package_files_sha1( - self.test_package4, self.get_test_loc('models/match-test.json')) + self.test_package4, self.get_test_loc("models/match-test.json") + ) class ExactPackageArchiveIndexModelTestCase(BaseModelTest): def test_ExactPackageArchiveIndex_index(self): # Test index - sha1 = 'b6bbe0b067469d719708ca38de5c237cb526c3d2' - epai, created = ExactPackageArchiveIndex.index( - sha1, self.test_package1) + sha1 = "b6bbe0b067469d719708ca38de5c237cb526c3d2" + epai, created = ExactPackageArchiveIndex.index(sha1, self.test_package1) self.assertTrue(created) self.assertEqual(sha1, epai.fingerprint()) # Test index of existing sha1 - epai, created = ExactPackageArchiveIndex.index( - sha1, self.test_package1) + epai, created = ExactPackageArchiveIndex.index(sha1, self.test_package1) self.assertFalse(created) self.assertEqual(sha1, epai.fingerprint()) # Test index of invalid sha1 - ExactPackageArchiveIndex.index('not a sha1', self.test_package1) + ExactPackageArchiveIndex.index("not a sha1", self.test_package1) self.assertTrue( - "Error('Non-hexadecimal digit found')" - in self.test_package1.index_error + "Error('Non-hexadecimal digit found')" in self.test_package1.index_error ) def test_ExactPackageArchiveIndex_single_sha1_single_match(self): result = ExactPackageArchiveIndex.match( - '51d28a27d919ce8690a40f4f335b9d591ceb16e9') + "51d28a27d919ce8690a40f4f335b9d591ceb16e9" + ) result = [r.package.to_dict() for r in result] expected = [self.test_package1_metadata] self.assertEqual(expected, result) @@ -134,7 +133,7 @@ def test_ExactPackageArchiveIndex_single_sha1_single_match(self): class ExactFileIndexModelTestCase(BaseModelTest): def test_ExactFileIndex_index(self): # Test index - sha1 = 'b6bbe0b067469d719708ca38de5c237cb526c3d2' + sha1 = "b6bbe0b067469d719708ca38de5c237cb526c3d2" efi, created = ExactFileIndex.index(sha1, self.test_package1) self.assertTrue(created) self.assertEqual(sha1, efi.fingerprint()) @@ -145,22 +144,17 @@ def test_ExactFileIndex_index(self): self.assertEqual(sha1, efi.fingerprint()) # Test index of invalid sha1 - ExactFileIndex.index('not a sha1', self.test_package1) + ExactFileIndex.index("not a sha1", self.test_package1) self.assertTrue( - "Error('Non-hexadecimal digit found')" - in self.test_package1.index_error + "Error('Non-hexadecimal digit found')" in self.test_package1.index_error ) def test_ExactFileIndex_match(self): - scan_location = 
self.get_test_loc('models/match-test.json') + scan_location = self.get_test_loc("models/match-test.json") codebase = VirtualCodebase( location=scan_location, - codebase_attributes=dict( - matches=attr.ib(default=attr.Factory(list)) - ), - resource_attributes=dict( - matched_to=attr.ib(default=attr.Factory(list)) - ) + codebase_attributes=dict(matches=attr.ib(default=attr.Factory(list))), + resource_attributes=dict(matched_to=attr.ib(default=attr.Factory(list))), ) # populate codebase with match results @@ -168,77 +162,76 @@ def test_ExactFileIndex_match(self): matches = ExactFileIndex.match(resource.sha1) for match in matches: p = match.package.to_dict() - p['match_type'] = 'exact' + p["match_type"] = "exact" codebase.attributes.matches.append(p) - resource.matched_to.append(p['purl']) + resource.matched_to.append(p["purl"]) resource.save(codebase) expected = self.get_test_loc( - 'models/exact-file-matching-standalone-test-results.json') + "models/exact-file-matching-standalone-test-results.json" + ) self.check_codebase(codebase, expected, regen=FIXTURES_REGEN) class ApproximateDirectoryMatchingIndexModelTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): super(MatchcodeTestCase, self).setUp() self.test_package1, _ = Package.objects.get_or_create( - filename='async-0.2.10.tgz', - sha1='b6bbe0b0674b9d719708ca38de8c237cb526c3d1', - md5='fd313a0e8cc2343569719e80cd7a67ac', + filename="async-0.2.10.tgz", + sha1="b6bbe0b0674b9d719708ca38de8c237cb526c3d1", + md5="fd313a0e8cc2343569719e80cd7a67ac", size=15772, - name='async', - version='0.2.10', - download_url='https://registry.npmjs.org/async/-/async-0.2.10.tgz', - type='npm', + name="async", + version="0.2.10", + download_url="https://registry.npmjs.org/async/-/async-0.2.10.tgz", + type="npm", ) self.test_package1_metadata = self.test_package1.to_dict() - load_resources_from_scan(self.get_test_loc( - 'models/directory-matching/async-0.2.10.tgz-i.json'), self.test_package1) + load_resources_from_scan( + self.get_test_loc("models/directory-matching/async-0.2.10.tgz-i.json"), + self.test_package1, + ) index_package_directories(self.test_package1) self.test_package2, _ = Package.objects.get_or_create( - filename='async-0.2.9.tgz', - sha1='df63060fbf3d33286a76aaf6d55a2986d9ff8619', - md5='895ac62ba7c61086cffdd50ab03c0447', + filename="async-0.2.9.tgz", + sha1="df63060fbf3d33286a76aaf6d55a2986d9ff8619", + md5="895ac62ba7c61086cffdd50ab03c0447", size=15672, - name='async', - version='0.2.9', - download_url='https://registry.npmjs.org/async/-/async-0.2.9.tgz', - type='npm', + name="async", + version="0.2.9", + download_url="https://registry.npmjs.org/async/-/async-0.2.9.tgz", + type="npm", ) self.test_package2_metadata = self.test_package2.to_dict() - load_resources_from_scan(self.get_test_loc( - 'models/directory-matching/async-0.2.9-i.json'), self.test_package2) + load_resources_from_scan( + self.get_test_loc("models/directory-matching/async-0.2.9-i.json"), + self.test_package2, + ) index_package_directories(self.test_package2) def test_ApproximateDirectoryStructureIndex_index(self): # Test index - fingerprint = '000018fad23a49e4cd40718d1297be719e6564a4' - resource_path = 'foo/bar' + fingerprint = "000018fad23a49e4cd40718d1297be719e6564a4" + resource_path = "foo/bar" adsi, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package1 + fingerprint, resource_path, self.test_package1 ) 
self.assertTrue(created) self.assertEqual(fingerprint, adsi.fingerprint()) # Test index of existing fingerprint adsi, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package1 + fingerprint, resource_path, self.test_package1 ) self.assertFalse(created) self.assertEqual(fingerprint, adsi.fingerprint()) # Test index of invalid fingerprint ApproximateResourceContentIndex.index( - 'not a fingerprint', - resource_path, - self.test_package1 + "not a fingerprint", resource_path, self.test_package1 ) self.assertTrue( "ValueError: invalid literal for int() with base 16: 'not a fi'" @@ -247,11 +240,11 @@ def test_ApproximateDirectoryStructureIndex_index(self): def test_ApproximateDirectoryStructureIndex_match_subdir(self): scan_location = self.get_test_loc( - 'models/directory-matching/async-0.2.9-i.json') + "models/directory-matching/async-0.2.9-i.json" + ) vc = VirtualCodebase( location=scan_location, - resource_attributes=dict( - packages=attr.ib(default=attr.Factory(list))) + resource_attributes=dict(packages=attr.ib(default=attr.Factory(list))), ) codebase = compute_codebase_directory_fingerprints(vc) @@ -259,47 +252,41 @@ def test_ApproximateDirectoryStructureIndex_match_subdir(self): for resource in codebase.walk(topdown=True): if resource.is_file: continue - fp = resource.extra_data.get('directory_structure', '') + fp = resource.extra_data.get("directory_structure", "") matches = ApproximateDirectoryStructureIndex.match( - fingerprint=fp, - resource=resource + fingerprint=fp, resource=resource ) for match in matches: p = match.package.to_dict() - p['match_type'] = 'approximate-directory-structure' + p["match_type"] = "approximate-directory-structure" resource.packages.append(p) resource.save(codebase) expected = self.get_test_loc( - 'models/directory-matching/async-0.2.9-i-expected-structure.json') + "models/directory-matching/async-0.2.9-i-expected-structure.json" + ) self.check_codebase(codebase, expected, regen=FIXTURES_REGEN) def test_ApproximateDirectoryContentIndex_index(self): # Test index - fingerprint = '000018fad23a49e4cd40718d1297be719e6564a4' - resource_path = 'foo/bar' + fingerprint = "000018fad23a49e4cd40718d1297be719e6564a4" + resource_path = "foo/bar" adci, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package1 + fingerprint, resource_path, self.test_package1 ) self.assertTrue(created) self.assertEqual(fingerprint, adci.fingerprint()) # Test index of existing fingerprint adci, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package1 + fingerprint, resource_path, self.test_package1 ) self.assertFalse(created) self.assertEqual(fingerprint, adci.fingerprint()) # Test index of invalid fingerprint ApproximateResourceContentIndex.index( - 'not a fingerprint', - resource_path, - self.test_package1 + "not a fingerprint", resource_path, self.test_package1 ) self.assertTrue( "ValueError: invalid literal for int() with base 16: 'not a fi'" @@ -308,11 +295,11 @@ def test_ApproximateDirectoryContentIndex_index(self): def test_ApproximateDirectoryContentIndex_match_subdir(self): scan_location = self.get_test_loc( - 'models/directory-matching/async-0.2.9-i.json') + "models/directory-matching/async-0.2.9-i.json" + ) vc = VirtualCodebase( location=scan_location, - resource_attributes=dict( - packages=attr.ib(default=attr.Factory(list))) + resource_attributes=dict(packages=attr.ib(default=attr.Factory(list))), ) codebase = 
compute_codebase_directory_fingerprints(vc) @@ -320,101 +307,91 @@ def test_ApproximateDirectoryContentIndex_match_subdir(self): for resource in codebase.walk(topdown=True): if resource.is_file: continue - fp = resource.extra_data.get('directory_content', '') + fp = resource.extra_data.get("directory_content", "") matches = ApproximateDirectoryContentIndex.match( - fingerprint=fp, - resource=resource + fingerprint=fp, resource=resource ) for match in matches: p = match.package.to_dict() - p['match_type'] = 'approximate-directory-content' + p["match_type"] = "approximate-directory-content" resource.packages.append(p) resource.save(codebase) expected = self.get_test_loc( - 'models/directory-matching/async-0.2.9-i-expected-content.json') + "models/directory-matching/async-0.2.9-i-expected-content.json" + ) self.check_codebase(codebase, expected, regen=FIXTURES_REGEN) class ApproximateResourceMatchingIndexModelTestCase(MatchcodeTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): super(MatchcodeTestCase, self).setUp() # Add approximate file resource self.test_package, _ = Package.objects.get_or_create( - filename='inflate.tar.gz', - sha1='deadfeed', - type='generic', - name='inflate', - version='1.0.0', - download_url='inflate.com/inflate.tar.gz', + filename="inflate.tar.gz", + sha1="deadfeed", + type="generic", + name="inflate", + version="1.0.0", + download_url="inflate.com/inflate.tar.gz", ) self.test_resource, _ = Resource.objects.get_or_create( - path='inflate.c', - name='inflate.c', - size=55466, - package=self.test_package + path="inflate.c", name="inflate.c", size=55466, package=self.test_package ) - self.test_resource_fingerprint = '000018fba23a49e4cd40718d1297be719e6564a4' + self.test_resource_fingerprint = "000018fba23a49e4cd40718d1297be719e6564a4" ApproximateResourceContentIndex.index( - self.test_resource_fingerprint, - self.test_resource.path, - self.test_package + self.test_resource_fingerprint, self.test_resource.path, self.test_package ) # Add approximate file resource self.test_package1, _ = Package.objects.get_or_create( - filename='deep-equal-1.0.1.tgz', - sha1='f5d260292b660e084eff4cdbc9f08ad3247448b5', - type='npm', - name='deep-equal', - version='1.0.1', - download_url='https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz', + filename="deep-equal-1.0.1.tgz", + sha1="f5d260292b660e084eff4cdbc9f08ad3247448b5", + type="npm", + name="deep-equal", + version="1.0.1", + download_url="https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz", ) self.test_resource1, _ = Resource.objects.get_or_create( - path='package/index.js', - name='index', - extension='js', - package=self.test_package1 + path="package/index.js", + name="index", + extension="js", + package=self.test_package1, ) test_resource1_loc = self.get_test_loc( - 'match/approximate-file-matching/index.js') + "match/approximate-file-matching/index.js" + ) fingerprints = get_file_fingerprint_hashes(test_resource1_loc) - self.test_resource1_fingerprint = fingerprints['halo1'] + self.test_resource1_fingerprint = fingerprints["halo1"] ApproximateResourceContentIndex.index( self.test_resource1_fingerprint, self.test_resource1.path, - self.test_package1 + self.test_package1, ) def test_ApproximateResourceContentIndex_index(self): # Test index - fingerprint = '000018fba23a39e4cd40718d1297be719e6564a4' - resource_path = 'foo/bar' + fingerprint = "000018fba23a39e4cd40718d1297be719e6564a4" + resource_path = 
"foo/bar" adci, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package + fingerprint, resource_path, self.test_package ) self.assertTrue(created) self.assertEqual(fingerprint, adci.fingerprint()) # Test index of existing fingerprint adci, created = ApproximateResourceContentIndex.index( - fingerprint, - resource_path, - self.test_package + fingerprint, resource_path, self.test_package ) self.assertFalse(created) self.assertEqual(fingerprint, adci.fingerprint()) # Test index of invalid fingerprint ApproximateResourceContentIndex.index( - 'not a fingerprint', - resource_path, - self.test_package + "not a fingerprint", resource_path, self.test_package ) self.assertTrue( "ValueError: invalid literal for int() with base 16: 'not a fi'" @@ -423,11 +400,11 @@ def test_ApproximateResourceContentIndex_index(self): def test_ApproximateResourceContentIndex_match(self): scan_location = self.get_test_loc( - 'match/approximate-file-matching/approximate-match-test.json') + "match/approximate-file-matching/approximate-match-test.json" + ) codebase = VirtualCodebase( location=scan_location, - resource_attributes=dict( - packages=attr.ib(default=attr.Factory(list))) + resource_attributes=dict(packages=attr.ib(default=attr.Factory(list))), ) # populate codebase with match results @@ -435,40 +412,41 @@ def test_ApproximateResourceContentIndex_match(self): if not (fp := resource.halo1): continue matches = ApproximateResourceContentIndex.match( - fingerprint=fp, - resource=resource + fingerprint=fp, resource=resource ) for match in matches: p = match.package.to_dict() - p['match_type'] = 'approximate-resource-content' + p["match_type"] = "approximate-resource-content" resource.packages.append(p) resource.save(codebase) expected = self.get_test_loc( - 'match/approximate-file-matching/approximate-match-model-test-results.json') + "match/approximate-file-matching/approximate-match-model-test-results.json" + ) self.check_codebase(codebase, expected, regen=FIXTURES_REGEN) def test_ApproximateResourceContentIndex_match_deep_equals(self): test_file_loc = self.get_test_loc( - 'match/approximate-file-matching/index-modified.js') + "match/approximate-file-matching/index-modified.js" + ) fingerprints = get_file_fingerprint_hashes(test_file_loc) - fp = fingerprints['halo1'] + fp = fingerprints["halo1"] matches = ApproximateResourceContentIndex.match(fp) results = [match.package.to_dict() for match in matches] expected_results_loc = self.get_test_loc( - 'match/approximate-file-matching/index-modified.js-expected.json') - self.check_expected_results( - results, expected_results_loc, regen=FIXTURES_REGEN) + "match/approximate-file-matching/index-modified.js-expected.json" + ) + self.check_expected_results(results, expected_results_loc, regen=FIXTURES_REGEN) class MatchcodeModelUtilsTestCase(MatchcodeTestCase): def test_create_halohash_chunks(self): - fingerprint = '49280e141724c001e1080128621a4210' + fingerprint = "49280e141724c001e1080128621a4210" chunk1, chunk2, chunk3, chunk4 = create_halohash_chunks(fingerprint) - expected_chunk1 = hexstring_to_binarray('49280e14') - expected_chunk2 = hexstring_to_binarray('1724c001') - expected_chunk3 = hexstring_to_binarray('e1080128') - expected_chunk4 = hexstring_to_binarray('621a4210') + expected_chunk1 = hexstring_to_binarray("49280e14") + expected_chunk2 = hexstring_to_binarray("1724c001") + expected_chunk3 = hexstring_to_binarray("e1080128") + expected_chunk4 = hexstring_to_binarray("621a4210") self.assertEqual(expected_chunk1, chunk1) 
self.assertEqual(expected_chunk2, chunk2) self.assertEqual(expected_chunk3, chunk3) diff --git a/matchcode/utils.py b/matchcode/utils.py index 7168847a..cdbdfa68 100644 --- a/matchcode/utils.py +++ b/matchcode/utils.py @@ -7,28 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from collections import OrderedDict -from unittest import TestCase - -import codecs import json import ntpath import os import posixpath +from unittest import TestCase from django.test import TestCase as DjangoTestCase from commoncode.resource import VirtualCodebase from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints from matchcode_toolkit.fingerprinting import hexstring_to_binarray -from rest_framework.utils.serializer_helpers import ReturnDict -from rest_framework.utils.serializer_helpers import ReturnList from scancode.cli_test_utils import purl_with_fake_uuid from matchcode.tests import FIXTURES_REGEN from minecode.utils_test import JsonBasedTestingMixin - ############## TEST UTILITIES ############## """ The conventions used for the tests are: @@ -40,7 +34,7 @@ class BaseTestCase(TestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") @classmethod def get_test_loc(cls, path): @@ -53,9 +47,14 @@ def get_test_loc(cls, path): return location -class CodebaseTester(object): - def check_codebase(self, codebase, expected_codebase_json_loc, - regen=FIXTURES_REGEN, remove_file_date=True): +class CodebaseTester: + def check_codebase( + self, + codebase, + expected_codebase_json_loc, + regen=FIXTURES_REGEN, + remove_file_date=True, + ): """ Check the Resources of the `codebase` Codebase objects are the same as the data in the `expected_codebase_json_loc` JSON file location, @@ -70,41 +69,39 @@ def check_codebase(self, codebase, expected_codebase_json_loc, def serializer(r): rd = r.to_dict(with_info=True) if remove_file_date: - rd.pop('file_date', None) + rd.pop("file_date", None) - for package_data in rd.get('packages', []): + for package_data in rd.get("packages", []): # Normalize package_uid - package_uid = package_data.get('package_uid') + package_uid = package_data.get("package_uid") if package_uid: - package_data['package_uid'] = purl_with_fake_uuid( - package_uid) + package_data["package_uid"] = purl_with_fake_uuid(package_uid) return rd results = list(map(serializer, codebase.walk(topdown=True))) if regen: - with open(expected_codebase_json_loc, 'w') as reg: - json.dump(dict(files=results), reg, - indent=2, separators=(',', ': ')) + with open(expected_codebase_json_loc, "w") as reg: + json.dump(dict(files=results), reg, indent=2, separators=(",", ": ")) expected_vc = VirtualCodebase(location=expected_codebase_json_loc) expected = list(map(serializer, expected_vc.walk(topdown=True))) # NOTE we redump the JSON as a string for a more efficient display of the # failures comparison/diff - expected = json.dumps(expected, indent=2, separators=(',', ': ')) - results = json.dumps(results, indent=2, separators=(',', ': ')) + expected = json.dumps(expected, indent=2, separators=(",", ": ")) + results = json.dumps(results, indent=2, separators=(",", ": ")) self.assertEqual(expected, results) -class MatchcodeTestCase(CodebaseTester, JsonBasedTestingMixin, BaseTestCase, DjangoTestCase): - databases = '__all__' +class MatchcodeTestCase( + CodebaseTester, JsonBasedTestingMixin, BaseTestCase, DjangoTestCase +): + databases = "__all__" def to_os_native_path(path): - """ - Normalize 
a path to use the native OS path separator. - """ + """Normalize a path to use the native OS path separator.""" path = path.replace(posixpath.sep, os.path.sep) path = path.replace(ntpath.sep, os.path.sep) path = path.rstrip(os.path.sep) @@ -113,6 +110,7 @@ def to_os_native_path(path): def load_resources_from_scan(scan_location, package): from packagedb.models import Resource + vc = VirtualCodebase( location=scan_location, ) @@ -123,35 +121,27 @@ def load_resources_from_scan(scan_location, package): size=resource.size, sha1=resource.sha1, md5=resource.md5, - is_file=resource.type == 'file' + is_file=resource.type == "file", ) def index_packages_sha1(): - """ - Reindex all the packages for exact sha1 matching. - """ + """Reindex all the packages for exact sha1 matching.""" from matchcode.models import ExactPackageArchiveIndex from packagedb.models import Package for package in Package.objects.filter(sha1__isnull=False): sha1_in_bin = hexstring_to_binarray(package.sha1) - _ = ExactPackageArchiveIndex.objects.create( - package=package, - sha1=sha1_in_bin - ) + _ = ExactPackageArchiveIndex.objects.create(package=package, sha1=sha1_in_bin) def index_package_files_sha1(package, scan_location): - """ - Index for SHA1 the package files found in the JSON scan at scan_location - """ + """Index for SHA1 the package files found in the JSON scan at scan_location""" from matchcode.models import ExactFileIndex resource_attributes = dict() vc = VirtualCodebase( - location=scan_location, - resource_attributes=resource_attributes + location=scan_location, resource_attributes=resource_attributes ) for resource in vc.walk(topdown=True): @@ -166,12 +156,10 @@ def index_package_files_sha1(package, scan_location): def _create_virtual_codebase_from_package_resources(package): - """ - Return a VirtualCodebase from the resources of `package` - """ + """Return a VirtualCodebase from the resources of `package`""" # Create something that looks like a scancode scan so we can import it into # a VirtualCodebase - package_resources = package.resources.order_by('path') + package_resources = package.resources.order_by("path") if not package_resources: return @@ -179,28 +167,28 @@ def _create_virtual_codebase_from_package_resources(package): for resource in package_resources: files.append( { - 'path': resource.path, - 'size': resource.size, - 'sha1': resource.sha1, - 'md5': resource.md5, - 'type': resource.type, + "path": resource.path, + "size": resource.size, + "sha1": resource.sha1, + "md5": resource.md5, + "type": resource.type, } ) make_new_root = False - sample_file_path = files[0].get('path', '') - root_dir = sample_file_path.split('/')[0] + sample_file_path = files[0].get("path", "") + root_dir = sample_file_path.split("/")[0] for f in files: - file_path = f.get('path', '') + file_path = f.get("path", "") if not file_path.startswith(root_dir): make_new_root = True break if make_new_root: - new_root = '{}-{}'.format(package.name, package.version) + new_root = f"{package.name}-{package.version}" for f in files: - new_path = os.path.join(new_root, f.get('path', '')) - f['path'] = new_path + new_path = os.path.join(new_root, f.get("path", "")) + f["path"] = new_path # Create VirtualCodebase mock_scan = dict(files=files) @@ -226,11 +214,11 @@ def index_resource_fingerprints(codebase, package): indexed_adsi = 0 indexed_arci = 0 for resource in codebase.walk(topdown=False): - directory_content_fingerprint = resource.extra_data.get( - 'directory_content', '') + directory_content_fingerprint = 
resource.extra_data.get("directory_content", "") directory_structure_fingerprint = resource.extra_data.get( - 'directory_structure', '') - resource_content_fingerprint = resource.extra_data.get('halo1', '') + "directory_structure", "" + ) + resource_content_fingerprint = resource.extra_data.get("halo1", "") if directory_content_fingerprint: _, adci_created = ApproximateDirectoryContentIndex.index( diff --git a/matchcode_pipeline/api.py b/matchcode_pipeline/api.py index 0945bd0e..40c892b5 100644 --- a/matchcode_pipeline/api.py +++ b/matchcode_pipeline/api.py @@ -13,7 +13,6 @@ from rest_framework import serializers from rest_framework import viewsets from rest_framework.decorators import action - from scanpipe.api import ExcludeFromListViewMixin from scanpipe.api.serializers import InputSourceSerializer from scanpipe.api.serializers import SerializerExcludeFieldsMixin @@ -80,8 +79,8 @@ class MatchingSerializer(ExcludeFromListViewMixin, serializers.ModelSerializer): class Meta: model = Project fields = ( - 'url', - 'uuid', + "url", + "uuid", "upload_file", "input_urls", "webhook_url", @@ -108,9 +107,9 @@ class Meta: "codebase_relations_summary", ] extra_kwargs = { - 'url': { - 'view_name': 'matching-detail', - 'lookup_field': 'pk', + "url": { + "view_name": "matching-detail", + "lookup_field": "pk", }, } @@ -143,20 +142,17 @@ def validate_input_urls(self, value): """Add support for providing multiple URLs in a single string.""" return [url for entry in value for url in entry.split()] - def create(self, validated_data, matching_pipeline_name='matching'): - """ - Create a new `project` with `upload_file`, using the `matching` pipeline - """ + def create(self, validated_data, matching_pipeline_name="matching"): + """Create a new `project` with `upload_file`, using the `matching` pipeline""" execute_now = True - validated_data['name'] = uuid4() + validated_data["name"] = uuid4() upload_file = validated_data.pop("upload_file", None) input_urls = validated_data.pop("input_urls", []) webhook_url = validated_data.pop("webhook_url", None) downloads, errors = fetch_urls(input_urls) if errors: - raise serializers.ValidationError( - "Could not fetch: " + "\n".join(errors)) + raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) project = super().create(validated_data) @@ -190,8 +186,8 @@ class D2DSerializer(ExcludeFromListViewMixin, serializers.ModelSerializer): class Meta: model = Project fields = ( - 'url', - 'uuid', + "url", + "uuid", "input_urls", "created_date", "input_sources", @@ -218,9 +214,9 @@ class Meta: "codebase_resources_discrepancies", ] extra_kwargs = { - 'url': { - 'view_name': 'd2d-detail', - 'lookup_field': 'pk', + "url": { + "view_name": "d2d-detail", + "lookup_field": "pk", }, } @@ -255,18 +251,15 @@ def get_codebase_relations_summary(self, project): queryset = project.codebaserelations.all() return count_group_by(queryset, "map_type") - def create(self, validated_data, matching_pipeline_name='d2d'): - """ - Create a new `project` with `input_urls`, using the `d2d` pipeline - """ + def create(self, validated_data, matching_pipeline_name="d2d"): + """Create a new `project` with `input_urls`, using the `d2d` pipeline""" execute_now = True - validated_data['name'] = uuid4() + validated_data["name"] = uuid4() input_urls = validated_data.pop("input_urls", []) errors = check_urls_availability(input_urls) if errors: - raise serializers.ValidationError( - "Could not fetch: " + "\n".join(errors)) + raise serializers.ValidationError("Could not fetch: " + "\n".join(errors)) 
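# With all input URLs confirmed reachable, create the Project; its input sources and the 'd2d' pipeline with its selected groups are attached just below.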
project = super().create(validated_data) @@ -287,8 +280,11 @@ def create(self, validated_data, matching_pipeline_name='d2d'): for url in urls: project.add_input_source(download_url=url) - project.add_pipeline(matching_pipeline_name, selected_groups=[ - "Java", "Javascript", "Elf", "Go"], execute_now=execute_now) + project.add_pipeline( + matching_pipeline_name, + selected_groups=["Java", "Javascript", "Elf", "Go"], + execute_now=execute_now, + ) return project @@ -329,6 +325,7 @@ class MatchingViewSet( - List of mapping containing details about the runs created for this match request. """ + queryset = Project.objects.all() serializer_class = MatchingSerializer filterset_class = ProjectFilterSet @@ -384,6 +381,7 @@ class D2DViewSet( - List of mapping containing details about the runs created for this match request. """ + queryset = Project.objects.all() serializer_class = D2DSerializer filterset_class = ProjectFilterSet diff --git a/matchcode_pipeline/pipelines/matching.py b/matchcode_pipeline/pipelines/matching.py index 2803c657..a30d87ce 100644 --- a/matchcode_pipeline/pipelines/matching.py +++ b/matchcode_pipeline/pipelines/matching.py @@ -22,9 +22,10 @@ from scanpipe.pipelines.load_inventory import LoadInventory from scanpipe.pipelines.scan_codebase import ScanCodebase -from matchcode_pipeline.pipes import matching from scanpipe.pipes import matchcode +from matchcode_pipeline.pipes import matching + class Matching(ScanCodebase, LoadInventory): """ diff --git a/matchcode_pipeline/pipes/matching.py b/matchcode_pipeline/pipes/matching.py index a70dcbad..82a2196b 100644 --- a/matchcode_pipeline/pipes/matching.py +++ b/matchcode_pipeline/pipes/matching.py @@ -113,7 +113,7 @@ def match_purldb_package( """ match_count = 0 sha1_list = list(resources_by_sha1.keys()) - results = Package.objects.using('packagedb').filter(sha1__in=sha1_list) + results = Package.objects.using("packagedb").filter(sha1__in=sha1_list) # Process matched Package data for package in results: package_data = package.to_dict() @@ -147,7 +147,7 @@ def match_purldb_resource( package_data_by_purldb_urls = package_data_by_purldb_urls or {} match_count = 0 sha1_list = list(resources_by_sha1.keys()) - results = Resource.objects.using('packagedb').filter(sha1__in=sha1_list) + results = Resource.objects.using("packagedb").filter(sha1__in=sha1_list) # Process match results for resource in results: # Get package data @@ -170,13 +170,15 @@ def match_purldb_resource_approximately(project, resource): """Match by approximation a single resource in the PurlDB.""" fingerprint = resource.extra_data.get("halo1", "") results = ApproximateResourceContentIndex.match( - fingerprint=fingerprint, - resource=resource + fingerprint=fingerprint, resource=resource ) for result in results: package_data = result.package.to_dict() return create_package_from_purldb_data( - project, [resource], package_data, flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE + project, + [resource], + package_data, + flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE, ) @@ -184,9 +186,7 @@ def match_purldb_directory(project, resource, exact_match=False): """Match a single directory resource in the PurlDB.""" fingerprint = resource.extra_data.get("directory_content", "") results = ApproximateDirectoryContentIndex.match( - fingerprint=fingerprint, - resource=resource, - exact_match=exact_match + fingerprint=fingerprint, resource=resource, exact_match=exact_match ) for result in results: package_data = result.package.to_dict() @@ -236,14 +236,9 @@ def match_purldb_resources( if logger: if 
resource_count > 0: - logger( - f"Matching {resource_count:,d} resources in PurlDB, " - "using SHA1" - ) + logger(f"Matching {resource_count:,d} resources in PurlDB, " "using SHA1") else: - logger( - f"Skipping resource matching as there are {resource_count:,d}" - ) + logger(f"Skipping resource matching as there are {resource_count:,d}") _match_purldb_resources( project=project, @@ -324,11 +319,9 @@ def match_purldb_resources_approximately(project, logger=None): resource, ) - matched_count = ( - project.codebaseresources - .filter(status=flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE) - .count() - ) + matched_count = project.codebaseresources.filter( + status=flag.APPROXIMATE_MATCHED_TO_PURLDB_RESOURCE + ).count() logger( f"{matched_count:,d} resource{pluralize(matched_count, 's')} " f"approximately matched in PurlDB" @@ -362,11 +355,7 @@ def match_purldb_directories(project, exact_directory_match=False, logger=None): for directory in progress.iter(directory_iterator): directory.refresh_from_db() if directory.status != flag.MATCHED_TO_PURLDB_DIRECTORY: - match_purldb_directory( - project, - directory, - exact_directory_match - ) + match_purldb_directory(project, directory, exact_directory_match) matched_count = ( project.codebaseresources.directories() @@ -381,9 +370,8 @@ def match_purldb_directories(project, exact_directory_match=False, logger=None): def match_purldb_resources_post_process(project, logger=None): """Choose the best package for PurlDB matched resources.""" - extract_directories = ( - project.codebaseresources.directories() - .filter(path__regex=r"^.*-extract$") + extract_directories = project.codebaseresources.directories().filter( + path__regex=r"^.*-extract$" ) resources = project.codebaseresources.files().filter( @@ -403,16 +391,12 @@ def match_purldb_resources_post_process(project, logger=None): map_count = 0 for directory in progress.iter(resource_iterator): - map_count += _match_purldb_resources_post_process( - directory.path, resources - ) + map_count += _match_purldb_resources_post_process(directory.path, resources) logger(f"{map_count:,d} resource processed") -def _match_purldb_resources_post_process( - directory_path, codebase_resources -): +def _match_purldb_resources_post_process(directory_path, codebase_resources): # Exclude the content of nested archive. 
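# match_purldb_resources_post_process() above keys on the "*-extract"
# directories produced by archive extraction. A standalone look at what
# the path__regex=r"^.*-extract$" filter selects (paths illustrative):
import re

pattern = re.compile(r"^.*-extract$")
for path in (
    "package.jar-extract",          # matches: an extraction directory
    "package.jar-extract/a.class",  # no match: a file inside one
    "src/nested.zip-extract",       # matches: a nested extraction directory
):
    print(path, bool(pattern.match(path)))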
interesting_codebase_resources = ( codebase_resources.filter(path__startswith=directory_path) diff --git a/matchcode_pipeline/tests/pipes/test_matching.py b/matchcode_pipeline/tests/pipes/test_matching.py index eb70c119..2360a1e0 100644 --- a/matchcode_pipeline/tests/pipes/test_matching.py +++ b/matchcode_pipeline/tests/pipes/test_matching.py @@ -3,6 +3,7 @@ from pathlib import Path from django.test import TestCase + from scanpipe import pipes from scanpipe.models import Project from scanpipe.pipes import flag @@ -30,7 +31,7 @@ def setUp(self): namespace=package_data1["namespace"], name=package_data1["name"], version=package_data1["version"], - sha1="abcdef" + sha1="abcdef", ) self.directory_content_fingerprint1 = ApproximateDirectoryContentIndex.index( fingerprint="00000003238f6ed2c218090d4da80b3b42160e69", @@ -42,9 +43,7 @@ def setUp(self): package=self.package1, ) self.resource1 = Resource.objects.create( - path="inflate.c", - size=55466, - package=self.package1 + path="inflate.c", size=55466, package=self.package1 ) self.resource_content_fingerprint1 = ApproximateResourceContentIndex.index( fingerprint="000018fba23a49e4cd40718d1297be719e6564a4", @@ -67,7 +66,9 @@ def test_matchcode_pipeline_pipes_matching_get_project_resources_qs(self): make_resource_file(self.project1, "directory100/bar.txt") resources = [package_resource, directory_resource] - resources_qs = matching.get_project_resources_qs(self.project1, resources=resources) + resources_qs = matching.get_project_resources_qs( + self.project1, resources=resources + ) expected_paths = [ "package.jar", "package.jar-extract/", @@ -113,7 +114,9 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_directories(self): to_1 = make_resource_directory( self.project1, "package.jar-extract", - extra_data={"directory_content": "00000003238f6ed2c218090d4da80b3b42160e69"}, + extra_data={ + "directory_content": "00000003238f6ed2c218090d4da80b3b42160e69" + }, ) to_2 = make_resource_file(self.project1, "package.jar-extract/a.class") to_3 = make_resource_file(self.project1, "package.jar-extract/b.class") @@ -124,9 +127,7 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_directories(self): logger=buffer.write, ) - expected = ( - "Matching 1 directory against PurlDB" "1 directory matched in PurlDB" - ) + expected = "Matching 1 directory against PurlDB" "1 directory matched in PurlDB" self.assertEqual(expected, buffer.getvalue()) package = self.project1.discoveredpackages.get() @@ -137,13 +138,13 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_directories(self): self.assertEqual("matched-to-purldb-directory", resource.status) self.assertEqual(package, resource.discovered_packages.get()) - - def test_matchcode_pipeline_pipes_matching_match_purldb_resources_post_process(self): + def test_matchcode_pipeline_pipes_matching_match_purldb_resources_post_process( + self, + ): to_map = self.data_location / "d2d-javascript" / "to" / "main.js.map" to_mini = self.data_location / "d2d-javascript" / "to" / "main.js" to_dir = ( - self.project1.codebase_path - / "project.tar.zst/modules/apps/adaptive-media/" + self.project1.codebase_path / "project.tar.zst/modules/apps/adaptive-media/" "adaptive-media-web-extract/src/main/resources/META-INF/resources/" "adaptive_media/js" ) @@ -202,7 +203,9 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_resources_post_process(s self.assertEqual(2, package1_resource_count) self.assertEqual(0, package2_resource_count) - def 
test_matchcode_pipeline_pipes_matching_match_purldb_resource_approximately(self): + def test_matchcode_pipeline_pipes_matching_match_purldb_resource_approximately( + self, + ): resource = make_resource_file( self.project1, "inflate.c", @@ -217,7 +220,8 @@ def test_matchcode_pipeline_pipes_matching_match_purldb_resource_approximately(s ) expected = ( - "Approximate matching 1 resource against PurlDB" "1 resource approximately matched in PurlDB" + "Approximate matching 1 resource against PurlDB" + "1 resource approximately matched in PurlDB" ) self.assertEqual(expected, buffer.getvalue()) diff --git a/matchcode_pipeline/tests/test_api.py b/matchcode_pipeline/tests/test_api.py index 8783cdd6..d2bdbe9e 100644 --- a/matchcode_pipeline/tests/test_api.py +++ b/matchcode_pipeline/tests/test_api.py @@ -13,8 +13,9 @@ from django.contrib.auth.models import User from django.test import TransactionTestCase from django.urls import reverse -from rest_framework.test import APIClient +from rest_framework import status +from rest_framework.test import APIClient from scanpipe.models import CodebaseRelation from scanpipe.models import CodebaseResource from scanpipe.models import DiscoveredDependency @@ -22,21 +23,19 @@ from scanpipe.models import Run from scanpipe.tests import dependency_data1 from scanpipe.tests import package_data1 -from rest_framework import status class MatchCodePipelineAPITest(TransactionTestCase): - databases = {'default', 'packagedb'} - data_location = Path(__file__).parent / 'data' + databases = {"default", "packagedb"} + data_location = Path(__file__).parent / "data" def setUp(self): - self.project1 = Project.objects.create(name='Analysis') + self.project1 = Project.objects.create(name="Analysis") self.resource1 = CodebaseResource.objects.create( project=self.project1, - path='daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO', + path="daglib-0.3.2.tar.gz-extract/daglib-0.3.2/PKG-INFO", ) - self.discovered_package1 = self.resource1.create_and_add_package( - package_data1) + self.discovered_package1 = self.resource1.create_and_add_package(package_data1) self.discovered_dependency1 = DiscoveredDependency.create_from_data( self.project1, dependency_data1 ) @@ -44,16 +43,14 @@ def setUp(self): project=self.project1, from_resource=self.resource1, to_resource=self.resource1, - map_type='java_to_class', + map_type="java_to_class", ) - self.matching_list_url = reverse('matching-list') - self.project1_detail_url = reverse( - 'matching-detail', args=[self.project1.uuid]) + self.matching_list_url = reverse("matching-list") + self.project1_detail_url = reverse("matching-detail", args=[self.project1.uuid]) - self.user = User.objects.create_user( - 'username', 'e@mail.com', 'secret') - self.auth = f'Token {self.user.auth_token.key}' + self.user = User.objects.create_user("username", "e@mail.com", "secret") + self.auth = f"Token {self.user.auth_token.key}" self.csrf_client = APIClient(enforce_csrf_checks=True) self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) @@ -62,52 +59,50 @@ def test_matchcode_pipeline_api_matching_list(self): response = self.csrf_client.get(self.matching_list_url) self.assertContains(response, self.project1_detail_url) - self.assertEqual(1, response.data['count']) - self.assertNotContains(response, 'input_root') - self.assertNotContains(response, 'extra_data') - self.assertNotContains(response, 'message_count') - self.assertNotContains(response, 'resource_count') - self.assertNotContains(response, 'package_count') - self.assertNotContains(response, 'dependency_count') 
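# The expected value in the test above relies on implicit string-literal
# concatenation: the two log messages are compared as one unbroken string,
# because the logger target here is buffer.write, which adds no separator.
expected = (
    "Approximate matching 1 resource against PurlDB"
    "1 resource approximately matched in PurlDB"
)
print(expected)
# Approximate matching 1 resource against PurlDB1 resource approximately matched in PurlDB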
+ self.assertEqual(1, response.data["count"]) + self.assertNotContains(response, "input_root") + self.assertNotContains(response, "extra_data") + self.assertNotContains(response, "message_count") + self.assertNotContains(response, "resource_count") + self.assertNotContains(response, "package_count") + self.assertNotContains(response, "dependency_count") def test_matchcode_pipeline_api_matching_detail(self): response = self.csrf_client.get(self.project1_detail_url) - self.assertIn(self.project1_detail_url, response.data['url']) - self.assertEqual(str(self.project1.uuid), response.data['uuid']) - self.assertEqual([], response.data['input_sources']) - self.assertEqual([], response.data['runs']) - self.assertEqual(1, response.data['resource_count']) - self.assertEqual(1, response.data['package_count']) - self.assertEqual(1, response.data['dependency_count']) - self.assertEqual(1, response.data['relation_count']) - - expected = {'': 1} - self.assertEqual(expected, response.data['codebase_resources_summary']) + self.assertIn(self.project1_detail_url, response.data["url"]) + self.assertEqual(str(self.project1.uuid), response.data["uuid"]) + self.assertEqual([], response.data["input_sources"]) + self.assertEqual([], response.data["runs"]) + self.assertEqual(1, response.data["resource_count"]) + self.assertEqual(1, response.data["package_count"]) + self.assertEqual(1, response.data["dependency_count"]) + self.assertEqual(1, response.data["relation_count"]) + + expected = {"": 1} + self.assertEqual(expected, response.data["codebase_resources_summary"]) expected = { - 'total': 1, - 'with_missing_resources': 0, - 'with_modified_resources': 0, + "total": 1, + "with_missing_resources": 0, + "with_modified_resources": 0, } - self.assertEqual( - expected, response.data['discovered_packages_summary']) + self.assertEqual(expected, response.data["discovered_packages_summary"]) expected = { - 'total': 1, - 'is_runtime': 1, - 'is_optional': 0, - 'is_resolved': 0, + "total": 1, + "is_runtime": 1, + "is_optional": 0, + "is_resolved": 0, } - self.assertEqual( - expected, response.data['discovered_dependencies_summary']) + self.assertEqual(expected, response.data["discovered_dependencies_summary"]) - expected = {'java_to_class': 1} - self.assertEqual(expected, response.data['codebase_relations_summary']) + expected = {"java_to_class": 1} + self.assertEqual(expected, response.data["codebase_relations_summary"]) - input1 = self.project1.add_input_source( - filename='file1', is_uploaded=True) + input1 = self.project1.add_input_source(filename="file1", is_uploaded=True) input2 = self.project1.add_input_source( - filename='file2', download_url='https://download.url') + filename="file2", download_url="https://download.url" + ) self.project1.save() response = self.csrf_client.get(self.project1_detail_url) expected = [ @@ -128,89 +123,82 @@ def test_matchcode_pipeline_api_matching_detail(self): "uuid": str(input2.uuid), }, ] - self.assertEqual(expected, response.data['input_sources']) + self.assertEqual(expected, response.data["input_sources"]) - @mock.patch('scanpipe.models.Run.execute_task_async') + @mock.patch("scanpipe.models.Run.execute_task_async") def test_matching_pipeline_api_matching_create(self, mock_execute_pipeline_task): # load upload_file contents - test_out_loc = self.data_location / 'test-out.json' + test_out_loc = self.data_location / "test-out.json" with open(test_out_loc) as f: data = { - 'upload_file': f, + "upload_file": f, } # Send match request response = 
self.csrf_client.post(self.matching_list_url, data) self.assertEqual(status.HTTP_201_CREATED, response.status_code) - self.assertEqual(1, len(response.data['runs'])) - self.assertEqual('matching', response.data['runs'][0]['pipeline_name']) + self.assertEqual(1, len(response.data["runs"])) + self.assertEqual("matching", response.data["runs"][0]["pipeline_name"]) mock_execute_pipeline_task.assert_called_once() - created_matching_project_detail_url = response.data['url'] - matching_project_uuid = response.data['uuid'] - results_url = reverse('matching-results', args=[matching_project_uuid]) - # Check that the file was uploaded + created_matching_project_detail_url = response.data["url"] response = self.csrf_client.get(created_matching_project_detail_url) - self.assertEqual( - 'test-out.json', response.data['input_sources'][0]['filename']) + self.assertEqual("test-out.json", response.data["input_sources"][0]["filename"]) - @mock.patch('scanpipe.models.Run.execute_task_async') - def test_matching_pipeline_api_matching_create_multiple_input_urls(self, mock_execute_pipeline_task): + @mock.patch("scanpipe.models.Run.execute_task_async") + def test_matching_pipeline_api_matching_create_multiple_input_urls( + self, mock_execute_pipeline_task + ): # load input_urls data = { - 'input_urls': 'https://registry.npmjs.org/asdf/-/asdf-1.2.2.tgz\r\nhttps://registry.npmjs.org/asdf/-/asdf-1.2.1.tgz', + "input_urls": "https://registry.npmjs.org/asdf/-/asdf-1.2.2.tgz\r\nhttps://registry.npmjs.org/asdf/-/asdf-1.2.1.tgz", } # Send match request response = self.csrf_client.post(self.matching_list_url, data) self.assertEqual(status.HTTP_201_CREATED, response.status_code) - self.assertEqual(1, len(response.data['runs'])) - self.assertEqual('matching', response.data['runs'][0]['pipeline_name']) + self.assertEqual(1, len(response.data["runs"])) + self.assertEqual("matching", response.data["runs"][0]["pipeline_name"]) mock_execute_pipeline_task.assert_called_once() - created_matching_project_detail_url = response.data['url'] - matching_project_uuid = response.data['uuid'] - results_url = reverse('matching-results', args=[matching_project_uuid]) - # Check that the file was uploaded + created_matching_project_detail_url = response.data["url"] response = self.csrf_client.get(created_matching_project_detail_url) - input_sources = response.data['input_sources'] + input_sources = response.data["input_sources"] self.assertEqual(2, len(input_sources)) - self.assertEqual('asdf-1.2.2.tgz', input_sources[0]['filename']) - self.assertEqual('asdf-1.2.1.tgz', input_sources[1]['filename']) + self.assertEqual("asdf-1.2.2.tgz", input_sources[0]["filename"]) + self.assertEqual("asdf-1.2.1.tgz", input_sources[1]["filename"]) def test_matchcode_pipeline_api_run_detail(self): - run1 = self.project1.add_pipeline('matching') - url = reverse('run-detail', args=[run1.uuid]) - project1_detail_url = reverse('run-detail', args=[self.project1.uuid]) + run1 = self.project1.add_pipeline("matching") + url = reverse("run-detail", args=[run1.uuid]) + project1_detail_url = reverse("run-detail", args=[self.project1.uuid]) response = self.csrf_client.get(url) - self.assertEqual(str(run1.uuid), response.data['uuid']) - self.assertIn(project1_detail_url, response.data['project']) - self.assertEqual('matching', response.data['pipeline_name']) - self.assertEqual('', response.data['description']) - self.assertEqual('', response.data['scancodeio_version']) - self.assertIsNone(response.data['task_id']) - self.assertIsNone(response.data['task_start_date']) - 
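# The multiple-URL request in this test goes through validate_input_urls()
# (earlier in this diff), which flattens whitespace-separated entries; the
# same expression run standalone on the test's input:
value = [
    "https://registry.npmjs.org/asdf/-/asdf-1.2.2.tgz\r\nhttps://registry.npmjs.org/asdf/-/asdf-1.2.1.tgz"
]
urls = [url for entry in value for url in entry.split()]
print(urls)
# ['https://registry.npmjs.org/asdf/-/asdf-1.2.2.tgz',
#  'https://registry.npmjs.org/asdf/-/asdf-1.2.1.tgz']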
self.assertIsNone(response.data['task_end_date']) - self.assertEqual('', response.data['task_output']) - self.assertIsNone(response.data['execution_time']) - self.assertEqual(Run.Status.NOT_STARTED, response.data['status']) + self.assertEqual(str(run1.uuid), response.data["uuid"]) + self.assertIn(project1_detail_url, response.data["project"]) + self.assertEqual("matching", response.data["pipeline_name"]) + self.assertEqual("", response.data["description"]) + self.assertEqual("", response.data["scancodeio_version"]) + self.assertIsNone(response.data["task_id"]) + self.assertIsNone(response.data["task_start_date"]) + self.assertIsNone(response.data["task_end_date"]) + self.assertEqual("", response.data["task_output"]) + self.assertIsNone(response.data["execution_time"]) + self.assertEqual(Run.Status.NOT_STARTED, response.data["status"]) class D2DPipelineAPITest(TransactionTestCase): - databases = {'default', 'packagedb'} - data_location = Path(__file__).parent / 'data' + databases = {"default", "packagedb"} + data_location = Path(__file__).parent / "data" def setUp(self): - self.project1 = Project.objects.create(name='Analysis') - self.d2d_list_url = reverse('d2d-list') - self.project1_detail_url = reverse( - 'd2d-detail', args=[self.project1.uuid]) - - self.user = User.objects.create_user( - 'username', 'a@mail.com', 'secret') - self.auth = f'Token {self.user.auth_token.key}' + self.project1 = Project.objects.create(name="Analysis") + self.d2d_list_url = reverse("d2d-list") + self.project1_detail_url = reverse("d2d-detail", args=[self.project1.uuid]) + + self.user = User.objects.create_user("username", "a@mail.com", "secret") + self.auth = f"Token {self.user.auth_token.key}" self.csrf_client = APIClient(enforce_csrf_checks=True) self.csrf_client.credentials(HTTP_AUTHORIZATION=self.auth) @@ -218,44 +206,48 @@ def test_d2d_pipeline_api_d2d_list(self): response = self.csrf_client.get(self.d2d_list_url) self.assertContains(response, self.project1_detail_url) - self.assertEqual(1, response.data['count']) - self.assertNotContains(response, 'input_root') - self.assertNotContains(response, 'extra_data') - self.assertNotContains(response, 'message_count') - self.assertNotContains(response, 'resource_count') - self.assertNotContains(response, 'package_count') - self.assertNotContains(response, 'dependency_count') - - @mock.patch('scanpipe.models.Run.execute_task_async') + self.assertEqual(1, response.data["count"]) + self.assertNotContains(response, "input_root") + self.assertNotContains(response, "extra_data") + self.assertNotContains(response, "message_count") + self.assertNotContains(response, "resource_count") + self.assertNotContains(response, "package_count") + self.assertNotContains(response, "dependency_count") + + @mock.patch("scanpipe.models.Run.execute_task_async") def test_d2d_pipeline_api_d2d_create(self, mock_execute_pipeline_task): # load upload_file contents data = { - 'input_urls': ['https://github.com/nexB/scancode.io/raw/main/scanpipe/tests/data/d2d-elfs/from-data.zip#from', - 'https://github.com/nexB/scancode.io/raw/main/scanpipe/tests/data/d2d-elfs/to-data.zip#to'], + "input_urls": [ + "https://github.com/nexB/scancode.io/raw/main/scanpipe/tests/data/d2d-elfs/from-data.zip#from", + "https://github.com/nexB/scancode.io/raw/main/scanpipe/tests/data/d2d-elfs/to-data.zip#to", + ], } # Send match request response = self.csrf_client.post(self.d2d_list_url, data) self.assertEqual(status.HTTP_201_CREATED, response.status_code) - self.assertEqual(1, len(response.data['runs'])) + 
self.assertEqual(1, len(response.data["runs"])) mock_execute_pipeline_task.assert_called_once() - response = self.csrf_client.get(response.data['url']) - self.assertIn('codebase_resources_discrepancies', response.data) + response = self.csrf_client.get(response.data["url"]) + self.assertIn("codebase_resources_discrepancies", response.data) def test_d2d_pipeline_api_run_detail(self): - run1 = self.project1.add_pipeline('d2d') - url = reverse('run-detail', args=[run1.uuid]) - project1_detail_url = reverse('run-detail', args=[self.project1.uuid]) + run1 = self.project1.add_pipeline("d2d") + url = reverse("run-detail", args=[run1.uuid]) + project1_detail_url = reverse("run-detail", args=[self.project1.uuid]) response = self.csrf_client.get(url) - self.assertEqual(str(run1.uuid), response.data['uuid']) - self.assertIn(project1_detail_url, response.data['project']) - self.assertEqual('d2d', response.data['pipeline_name']) + self.assertEqual(str(run1.uuid), response.data["uuid"]) + self.assertIn(project1_detail_url, response.data["project"]) + self.assertEqual("d2d", response.data["pipeline_name"]) self.assertEqual( - 'Establish relationships between two code trees: deployment and development.', response.data['description']) - self.assertEqual('', response.data['scancodeio_version']) - self.assertIsNone(response.data['task_id']) - self.assertIsNone(response.data['task_start_date']) - self.assertIsNone(response.data['task_end_date']) - self.assertEqual('', response.data['task_output']) - self.assertIsNone(response.data['execution_time']) - self.assertEqual(Run.Status.NOT_STARTED, response.data['status']) + "Establish relationships between two code trees: deployment and development.", + response.data["description"], + ) + self.assertEqual("", response.data["scancodeio_version"]) + self.assertIsNone(response.data["task_id"]) + self.assertIsNone(response.data["task_start_date"]) + self.assertIsNone(response.data["task_end_date"]) + self.assertEqual("", response.data["task_output"]) + self.assertIsNone(response.data["execution_time"]) + self.assertEqual(Run.Status.NOT_STARTED, response.data["status"]) diff --git a/matchcode_project/dbrouter.py b/matchcode_project/dbrouter.py index 582f77d7..ac739a3b 100644 --- a/matchcode_project/dbrouter.py +++ b/matchcode_project/dbrouter.py @@ -8,23 +8,23 @@ # -class PackageDBRouter(object): +class PackageDBRouter: app_labels = [ - 'clearcode', - 'clearindex', - 'minecode', - 'matchcode', - 'packagedb', + "clearcode", + "clearindex", + "minecode", + "matchcode", + "packagedb", ] def db_for_read(self, model, **hints): if model._meta.app_label in self.app_labels: - return 'packagedb' + return "packagedb" return None def db_for_write(self, model, **hints): if model._meta.app_label in self.app_labels: - return 'packagedb' + return "packagedb" return None def allow_relation(self, obj1, obj2, **hints): @@ -37,23 +37,23 @@ def allow_relation(self, obj1, obj2, **hints): def allow_migrate(self, db, app_label, model_name=None, **hints): if app_label in self.app_labels: - return db == 'packagedb' + return db == "packagedb" return None -class ScancodeIORouter(object): +class ScancodeIORouter: app_labels = [ - 'scanpipe', + "scanpipe", ] def db_for_read(self, model, **hints): if model._meta.app_label in self.app_labels: - return 'default' + return "default" return None def db_for_write(self, model, **hints): if model._meta.app_label in self.app_labels: - return 'default' + return "default" return None def allow_relation(self, obj1, obj2, **hints): @@ -66,5 +66,5 @@ def 
allow_relation(self, obj1, obj2, **hints): def allow_migrate(self, db, app_label, model_name=None, **hints): if app_label in self.app_labels: - return db == 'default' + return db == "default" return None diff --git a/matchcode_project/settings.py b/matchcode_project/settings.py index 3e4b2b19..b9bf11cb 100644 --- a/matchcode_project/settings.py +++ b/matchcode_project/settings.py @@ -10,7 +10,6 @@ from pathlib import Path import environ - from scancodeio.settings import * PROJECT_DIR = environ.Path(__file__) - 1 @@ -44,40 +43,40 @@ ) INSTALLED_APPS += [ - 'clearcode', - 'clearindex', - 'matchcode', - 'minecode', - 'packagedb', + "clearcode", + "clearindex", + "matchcode", + "minecode", + "packagedb", ] # Database DATABASES = { - 'default': { - 'ENGINE': env.str('SCANCODEIO_DB_ENGINE', 'django.db.backends.postgresql'), - 'HOST': env.str('SCANCODEIO_DB_HOST', 'localhost'), - 'NAME': env.str('SCANCODEIO_DB_NAME', 'matchcodeio'), - 'USER': env.str('SCANCODEIO_DB_USER', 'matchcodeio'), - 'PASSWORD': env.str('SCANCODEIO_DB_PASSWORD', 'matchcodeio'), - 'PORT': env.str('SCANCODEIO_DB_PORT', '5432'), - 'ATOMIC_REQUESTS': True, + "default": { + "ENGINE": env.str("SCANCODEIO_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("SCANCODEIO_DB_HOST", "localhost"), + "NAME": env.str("SCANCODEIO_DB_NAME", "matchcodeio"), + "USER": env.str("SCANCODEIO_DB_USER", "matchcodeio"), + "PASSWORD": env.str("SCANCODEIO_DB_PASSWORD", "matchcodeio"), + "PORT": env.str("SCANCODEIO_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, + }, + "packagedb": { + "ENGINE": env.str("PACKAGEDB_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("PACKAGEDB_DB_HOST", "localhost"), + "NAME": env.str("PACKAGEDB_DB_NAME", "packagedb"), + "USER": env.str("PACKAGEDB_DB_USER", "packagedb"), + "PASSWORD": env.str("PACKAGEDB_DB_PASSWORD", "packagedb"), + "PORT": env.str("PACKAGEDB_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, }, - 'packagedb': { - 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), - 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), - 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), - 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), - 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), - 'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), - 'ATOMIC_REQUESTS': True, - } } DATABASE_ROUTERS = [ - 'matchcode_project.dbrouter.PackageDBRouter', - 'matchcode_project.dbrouter.ScancodeIORouter', + "matchcode_project.dbrouter.PackageDBRouter", + "matchcode_project.dbrouter.ScancodeIORouter", ] -ROOT_URLCONF = 'matchcode_project.urls' +ROOT_URLCONF = "matchcode_project.urls" diff --git a/matchcode_project/urls.py b/matchcode_project/urls.py index 94812be2..72ccd50a 100644 --- a/matchcode_project/urls.py +++ b/matchcode_project/urls.py @@ -10,20 +10,20 @@ from django.urls import include from django.urls import path from django.views.generic import RedirectView + from rest_framework import routers from matchcode_pipeline.api import D2DViewSet from matchcode_pipeline.api import MatchingViewSet from matchcode_pipeline.api import RunViewSet - api_router = routers.DefaultRouter() -api_router.register('matching', MatchingViewSet, basename='matching') -api_router.register('d2d', D2DViewSet, basename='d2d') -api_router.register('runs', RunViewSet) +api_router.register("matching", MatchingViewSet, basename="matching") +api_router.register("d2d", D2DViewSet, basename="d2d") +api_router.register("runs", RunViewSet) urlpatterns = [ - path('api/', include(api_router.urls)), - path('', 
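# The two DATABASES entries above are driven entirely by environment
# variables; a matching .env fragment (values illustrative) would be:
#
#   SCANCODEIO_DB_NAME=matchcodeio
#   SCANCODEIO_DB_USER=matchcodeio
#   SCANCODEIO_DB_PASSWORD=matchcodeio
#   PACKAGEDB_DB_NAME=packagedb
#   PACKAGEDB_DB_USER=packagedb
#   PACKAGEDB_DB_PASSWORD=packagedb
#
# With both routers installed, ORM access is dispatched per app label,
# which is why earlier hunks can also state the connection explicitly:
#   Package.objects.using("packagedb")   # packagedb-side model
#   Project.objects.all()                # scanpipe model -> "default"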
include('scanpipe.urls')), - path('', RedirectView.as_view(url='api/')), + path("api/", include(api_router.urls)), + path("", include("scanpipe.urls")), + path("", RedirectView.as_view(url="api/")), ] diff --git a/minecode/__init__.py b/minecode/__init__.py index f31542f8..83ca57de 100644 --- a/minecode/__init__.py +++ b/minecode/__init__.py @@ -12,14 +12,13 @@ from minecode import route - -default_app_config = 'minecode.apps.MinecodeConfig' +default_app_config = "minecode.apps.MinecodeConfig" sys_platform = str(sys.platform).lower() -ON_WINDOWS = 'win32' in sys_platform -ON_MAC = 'darwin' in sys_platform -ON_LINUX = 'linux' in sys_platform +ON_WINDOWS = "win32" in sys_platform +ON_MAC = "darwin" in sys_platform +ON_LINUX = "linux" in sys_platform # global instances of our routers visit_router = route.Router() diff --git a/minecode/api.py b/minecode/api.py index 13146acb..c47a38cb 100644 --- a/minecode/api.py +++ b/minecode/api.py @@ -18,16 +18,21 @@ from django.views.decorators.csrf import csrf_exempt from packageurl import PackageURL -from rest_framework import serializers, status, viewsets -from rest_framework.decorators import action, api_view +from rest_framework import serializers +from rest_framework import status +from rest_framework import viewsets +from rest_framework.decorators import action +from rest_framework.decorators import api_view from rest_framework.permissions import IsAdminUser from rest_framework.response import Response # UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import visitors # NOQA +# But importing the collectors module triggers routes registration +from minecode import collectors # NOQA from minecode import priority_router -from minecode.models import PriorityResourceURI, ResourceURI, ScannableURI +from minecode.models import PriorityResourceURI +from minecode.models import ResourceURI +from minecode.models import ScannableURI from minecode.permissions import IsScanQueueWorkerAPIUser from minecode.utils import get_temp_file from minecode.utils import get_webhook_url @@ -45,10 +50,9 @@ class ResourceURIViewSet(viewsets.ModelViewSet): class PriorityResourceURISerializer(serializers.ModelSerializer): - class Meta: model = PriorityResourceURI - fields = '__all__' + fields = "__all__" class PriorityResourceURIViewSet(viewsets.ModelViewSet): @@ -60,25 +64,19 @@ class PriorityResourceURIViewSet(viewsets.ModelViewSet): # TODO: hide debug endpoints under `admin` @action(detail=False, methods=["post"]) def index_package(self, request, *args, **kwargs): - """ - Request the indexing and scanning of Package, given a valid Package URL `purl`. 
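# For reference, the URL names these router registrations expose to
# reverse(), as exercised by the tests in this diff:
#   reverse("matching-list")               -> /api/matching/
#   reverse("matching-detail", args=[pk])  -> /api/matching/<pk>/
#   reverse("d2d-detail", args=[pk])       -> /api/d2d/<pk>/
#   reverse("run-detail", args=[pk])       -> /api/runs/<pk>/
# The "" path falls through to scanpipe.urls, with the api/ redirect last.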
- """ - purl = request.data.get('purl') + """Request the indexing and scanning of Package, given a valid Package URL `purl`.""" + purl = request.data.get("purl") # validate purl try: package_url = PackageURL.from_string(purl) except ValueError as e: - message = { - 'status': f'purl validation error: {e}' - } + message = {"status": f"purl validation error: {e}"} return Response(message, status=status.HTTP_400_BAD_REQUEST) # see if its routeable if not priority_router.is_routable(purl): - message = { - 'status': f'Package type `{package_url.type}` is unsupported' - } + message = {"status": f"Package type `{package_url.type}` is unsupported"} return Response(message, status=status.HTTP_400_BAD_REQUEST) # add to queue @@ -86,11 +84,11 @@ def index_package(self, request, *args, **kwargs): if priority_resource_uri: message = { - 'status': f'Package index request for {purl} has been successful.' + "status": f"Package index request for {purl} has been successful." } else: message = { - 'status': f'Package {purl} has already been requested for indexing.' + "status": f"Package {purl} has already been requested for indexing." } # TODO: revisiting a package should be handled on another level, dependent on data we store return Response(message) @@ -99,44 +97,42 @@ def index_package(self, request, *args, **kwargs): class ScannableURISerializer(serializers.ModelSerializer): class Meta: model = ScannableURI - fields = '__all__' + fields = "__all__" class ScannableURIViewSet(viewsets.ModelViewSet): queryset = ScannableURI.objects.all() serializer_class = ScannableURISerializer permission_classes = [IsScanQueueWorkerAPIUser | IsAdminUser] - lookup_field = 'uuid' + lookup_field = "uuid" - @action(detail=False, methods=['get']) + @action(detail=False, methods=["get"]) def get_next_download_url(self, request, *args, **kwargs): - """ - Return download url for next Package on scan queue - """ + """Return download url for next Package on scan queue""" with transaction.atomic(): scannable_uri = ScannableURI.objects.get_next_scannable() if scannable_uri: user = self.request.user - webhook_url = get_webhook_url('index_package_scan', user.id) + webhook_url = get_webhook_url("index_package_scan", user.id) response = { - 'scannable_uri_uuid': scannable_uri.uuid, - 'download_url': scannable_uri.uri, - 'pipelines': scannable_uri.pipelines, - 'webhook_url': webhook_url, + "scannable_uri_uuid": scannable_uri.uuid, + "download_url": scannable_uri.uri, + "pipelines": scannable_uri.pipelines, + "webhook_url": webhook_url, } scannable_uri.scan_status = ScannableURI.SCAN_SUBMITTED scannable_uri.scan_date = timezone.now() scannable_uri.save() else: response = { - 'scannable_uri_uuid': '', - 'download_url': '', - 'pipelines': [], - 'webhook_url': '', + "scannable_uri_uuid": "", + "download_url": "", + "pipelines": [], + "webhook_url": "", } return Response(response) - @action(detail=True, methods=['post']) + @action(detail=True, methods=["post"]) def update_status(self, request, *args, **kwargs): """ Update the status of a ScannableURI with `scan_status` @@ -144,17 +140,16 @@ def update_status(self, request, *args, **kwargs): If `scan_status` is 'failed', then a `scan_log` string is expected and should contain the error messages for that scan. 
""" - scan_status = request.data.get('scan_status') + scan_status = request.data.get("scan_status") if not scan_status: - response = { - 'error': 'missing scan_status' - } + response = {"error": "missing scan_status"} return Response(response, status=status.HTTP_400_BAD_REQUEST) scannable_uri = self.get_object() scannable_uri_uuid = scannable_uri.uuid scannable_uri_status = ScannableURI.SCAN_STATUSES_BY_CODE.get( - scannable_uri.scan_status) + scannable_uri.scan_status + ) if scannable_uri.scan_status in [ ScannableURI.SCAN_INDEXED, @@ -163,36 +158,34 @@ def update_status(self, request, *args, **kwargs): ScannableURI.SCAN_INDEX_FAILED, ]: response = { - 'error': f'cannot update status for scannable_uri {scannable_uri_uuid}: ' - f'scannable_uri has finished with status "{scannable_uri_status}"' + "error": f"cannot update status for scannable_uri {scannable_uri_uuid}: " + f'scannable_uri has finished with status "{scannable_uri_status}"' } return Response(response, status=status.HTTP_400_BAD_REQUEST) if scan_status == scannable_uri_status: response = { - 'error': f'cannot update status for scannable_uri {scannable_uri_uuid}: ' - f'scannable_uri status is already "{scannable_uri_status}"' + "error": f"cannot update status for scannable_uri {scannable_uri_uuid}: " + f'scannable_uri status is already "{scannable_uri_status}"' } return Response(response, status=status.HTTP_400_BAD_REQUEST) - if scan_status == 'failed': - scan_log = request.data.get('scan_log') + if scan_status == "failed": + scan_log = request.data.get("scan_log") scannable_uri.scan_error = scan_log scannable_uri.scan_status = ScannableURI.SCAN_FAILED scannable_uri.wip_date = None scannable_uri.save() response = { - 'status': f'updated scannable_uri {scannable_uri_uuid} scan_status to {scan_status}' + "status": f"updated scannable_uri {scannable_uri_uuid} scan_status to {scan_status}" } return Response(response) - response = { - 'error': f'invalid scan_status: {scan_status}' - } + response = {"error": f"invalid scan_status: {scan_status}"} return Response(response, status=status.HTTP_400_BAD_REQUEST) -@api_view(['POST']) +@api_view(["POST"]) @csrf_exempt def index_package_scan(request, key): """ @@ -208,38 +201,32 @@ def index_package_scan(request, key): user_id = signing.loads(key) User = get_user_model() - user = get_object_or_404(User, id=user_id) + get_object_or_404(User, id=user_id) - results = json_data.get('results') - summary = json_data.get('summary') - project_data = json_data.get('project') - extra_data = project_data.get('extra_data') - scannable_uri_uuid = extra_data.get('scannable_uri_uuid') + results = json_data.get("results") + summary = json_data.get("summary") + project_data = json_data.get("project") + extra_data = project_data.get("extra_data") + scannable_uri_uuid = extra_data.get("scannable_uri_uuid") # Save results to temporary files - scan_results_location = get_temp_file( - file_name='scan_results', - extension='.json' - ) - scan_summary_location = get_temp_file( - file_name='scan_summary', - extension='.json' - ) + scan_results_location = get_temp_file(file_name="scan_results", extension=".json") + scan_summary_location = get_temp_file(file_name="scan_summary", extension=".json") - with open(scan_results_location, 'w') as f: + with open(scan_results_location, "w") as f: json.dump(results, f) - with open(scan_summary_location, 'w') as f: + with open(scan_summary_location, "w") as f: json.dump(summary, f) scannable_uri = get_object_or_404(ScannableURI, uuid=scannable_uri_uuid) 
scannable_uri.process_scan_results( scan_results_location=scan_results_location, scan_summary_location=scan_summary_location, - project_extra_data=extra_data + project_extra_data=extra_data, ) msg = { - 'status': f'scan results for scannable_uri {scannable_uri.uuid} ' - 'have been queued for indexing' + "status": f"scan results for scannable_uri {scannable_uri.uuid} " + "have been queued for indexing" } return Response(msg) diff --git a/minecode/apps.py b/minecode/apps.py index 148452e7..aa8b5641 100644 --- a/minecode/apps.py +++ b/minecode/apps.py @@ -13,5 +13,5 @@ class MinecodeConfig(AppConfig): - name = 'minecode' - verbose_name = _('Minecode') + name = "minecode" + verbose_name = _("Minecode") diff --git a/minecode/collectors/__init__.py b/minecode/collectors/__init__.py index e1521118..a916ea5b 100644 --- a/minecode/collectors/__init__.py +++ b/minecode/collectors/__init__.py @@ -6,3 +6,13 @@ # See https://github.com/aboutcode-org/purldb for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # + +import pkgutil + +""" +Minimal way to recursively import all submodules dynamically. If this module is +imported, all submodules will be imported: this triggers the actual registration +of miners. This should stay as the last import in this init module. +""" +for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + "."): + __import__(name) diff --git a/minecode/visitors/conan.py b/minecode/collectors/conan.py similarity index 93% rename from minecode/visitors/conan.py rename to minecode/collectors/conan.py index 35c1af94..9b9728d0 100644 --- a/minecode/visitors/conan.py +++ b/minecode/collectors/conan.py @@ -29,9 +29,7 @@ def get_yaml_response(url): - """ - Fetch YAML content from the url and return it as a dictionary. - """ + """Fetch YAML content from the url and return it as a dictionary.""" try: response = requests.get(url) response.raise_for_status() @@ -62,8 +60,7 @@ def get_conan_recipe(name, version): folder = recipe_location.get("folder") if not folder: - logger.error( - f"No folder found for version {version} of package {name}") + logger.error(f"No folder found for version {version} of package {name}") return None, None conanfile_py_url = f"{base_index_url}/{name}/{folder}/conanfile.py" @@ -85,9 +82,7 @@ def get_conan_recipe(name, version): def get_download_info(conandata, version): - """ - Return download_url and SHA256 hash from `conandata.yml`. 
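# Minimal shape of the ScanCode.io webhook payload that
# index_package_scan() above consumes (all values are placeholders):
payload = {
    "results": {},  # full scan results, dumped to a temp file for indexing
    "summary": {},  # scan summary, dumped to a second temp file
    "project": {
        "extra_data": {
            # ties the results back to the originating queue entry
            "scannable_uri_uuid": "00000000-0000-0000-0000-000000000000",
        },
    },
}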
- """ + """Return download_url and SHA256 hash from `conandata.yml`.""" sources = conandata.get("sources", {}) pkg_data = sources.get(version, {}) @@ -153,9 +148,9 @@ def process_request(purl_str, **kwargs): from minecode.model_utils import DEFAULT_PIPELINES package_url = PackageURL.from_string(purl_str) - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) if not package_url.version: return diff --git a/minecode/collectors/debian.py b/minecode/collectors/debian.py new file mode 100644 index 00000000..c7bf3d42 --- /dev/null +++ b/minecode/collectors/debian.py @@ -0,0 +1,532 @@ +import logging + +import attr +import requests +from debian_inspector.version import Version as DebVersion +from packagedcode import models as scan_models +from packagedcode.debian import DebianDscFileHandler +from packagedcode.debian_copyright import StandaloneDebianCopyrightFileHandler +from packageurl import PackageURL + +from minecode import priority_router +from minecode.utils import fetch_and_write_file_from_url +from minecode.utils import get_package_sha1 +from packagedb.models import PackageContentType +from packagedb.models import PackageRelation +from packagedb.models import make_relationship + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +DEBIAN_BASE_URL = "https://deb.debian.org/debian/pool/main/" +DEBIAN_METADATA_URL = "https://metadata.ftp-master.debian.org/changelogs/main/" + +UBUNTU_BASE_URL = "http://archive.ubuntu.com/ubuntu/pool/main/" +UBUNTU_METADATA_URL = "http://changelogs.ubuntu.com/changelogs/pool/main/" + + +@priority_router.route("pkg:deb/.*") +def process_request(purl_str, **kwargs): + """ + Process `priority_resource_uri` containing a maven Package URL (PURL) as a + URI. + + This involves obtaining Package information for the PURL from debian and + using it to create a new PackageDB entry. The binary package is then added to the + scan queue afterwards. We also get the Package information for the + accompanying source package and add it to the PackageDB and scan queue, if + available. + + Return an error string for errors that occur, or empty string if there is no error. + """ + from minecode.model_utils import DEFAULT_PIPELINES + + source_purl = kwargs.get("source_purl", None) + addon_pipelines = kwargs.get("addon_pipelines", []) + pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) + priority = kwargs.get("priority", 0) + + try: + package_url = PackageURL.from_string(purl_str) + source_package_url = None + if source_purl: + source_package_url = PackageURL.from_string(source_purl) + + except ValueError as e: + error = f"error occured when parsing purl: {purl_str} source_purl: {source_purl} : {e}" + return error + + has_version = bool(package_url.version) + if has_version: + error = map_debian_metadata_binary_and_source( + package_url=package_url, + source_package_url=source_package_url, + pipelines=pipelines, + priority=priority, + ) + + return error + + +def map_debian_package(debian_package, package_content, pipelines, priority=0): + """ + Add a debian `package_url` to the PackageDB. + + Return an error string if errors have occured in the process. 
+ """ + from minecode.model_utils import add_package_to_scan_queue + from minecode.model_utils import merge_or_create_package + + db_package = None + error = "" + + purl = debian_package.package_url + if package_content == PackageContentType.BINARY: + download_url = debian_package.binary_archive_url + elif package_content == PackageContentType.SOURCE_ARCHIVE: + download_url = debian_package.source_archive_url + + response = requests.get(download_url) + if not response.ok: + msg = f"Package metadata does not exist on debian: {download_url}" + error += msg + "\n" + logger.error(msg) + return db_package, error + + purl_package = scan_models.PackageData( + type=purl.type, + namespace=purl.namespace, + name=purl.name, + version=purl.version, + qualifiers=purl.qualifiers, + ) + + package, error_metadata = get_debian_package_metadata(debian_package) + if not package: + error += error_metadata + return db_package, error + + package_copyright, error_copyright = get_debian_package_copyright(debian_package) + package.update_purl_fields(package_data=purl_package, replace=True) + if package_copyright: + update_license_copyright_fields( + package_from=package_copyright, + package_to=package, + replace=True, + ) + else: + error += error_metadata + + # This will be used to download and scan the package + package.download_url = download_url + + # Set package_content value + package.extra_data["package_content"] = package_content + + # If sha1 exists for an archive, we know we can create the package + # Use purl info as base and create packages for binary and source package + sha1 = get_package_sha1(package=package, field="download_url") + if sha1: + package.sha1 = sha1 + db_package, _, _, _ = merge_or_create_package(package, visit_level=50) + else: + msg = f"Failed to retrieve package archive: {purl.to_string()} from url: {download_url}" + error += msg + "\n" + logger.error(msg) + + # Submit package for scanning + if db_package: + add_package_to_scan_queue(db_package, pipelines, priority) + + return db_package, error + + +def get_debian_package_metadata(debian_package): + """ + Given a DebianPackage object with package url and source package url + information, get the .dsc package metadata url, fetch the .dsc file, + parse and return the PackageData object containing the package metadata + for that Debian package. + + If there are errors, return None and a string containing the error + information. + """ + error = "" + + metadata_url = debian_package.package_metadata_url + temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) + if not temp_metadata_file: + msg = f"Package metadata does not exist on debian: {metadata_url}" + error += msg + "\n" + logger.error(msg) + return None, error + + packages = DebianDscFileHandler.parse(location=temp_metadata_file) + package = list(packages).pop() + + package.qualifiers = debian_package.package_url.qualifiers + + return package, error + + +def get_debian_package_copyright(debian_package): + """ + Given a DebianPackage object with package url and source package url + information, get the debian copyright file url, fetch and run license + detection, and return the PackageData object containing the package + metadata for that Debian package. + + If there are errors, return None and a string containing the error + information. 
+ """ + error = "" + + metadata_url = debian_package.package_copyright_url + temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) + if not temp_metadata_file: + msg = f"Package metadata does not exist on debian: {metadata_url}" + error += msg + "\n" + logger.error(msg) + return None, error + + packages = StandaloneDebianCopyrightFileHandler.parse(location=temp_metadata_file) + package = list(packages).pop() + + package.qualifiers = debian_package.package_url.qualifiers + + return package, error + + +def update_license_copyright_fields(package_from, package_to, replace=True): + fields_to_update = [ + "copyright", + "holder", + "declared_license_expression", + "declared_license_expression_spdx", + "license_detections", + "other_license_expression", + "other_license_expression_spdx", + "other_license_detections", + "extracted_license_statement", + ] + + for field in fields_to_update: + value = getattr(package_from, field) + if value and replace: + setattr(package_to, field, value) + + +def map_debian_metadata_binary_and_source( + package_url, source_package_url, pipelines, priority=0 +): + """ + Get metadata for the binary and source release of the Debian package + `package_url` and save it to the PackageDB. + + Return an error string for errors that occur, or empty string if there is no error. + """ + error = "" + + if "repository_url" in package_url.qualifiers: + base_url = package_url.qualifiers["repository_url"] + elif package_url.namespace == "ubuntu": + base_url = UBUNTU_BASE_URL + else: + base_url = DEBIAN_BASE_URL + + if "api_data_url" in package_url.qualifiers: + metadata_base_url = package_url.qualifiers["api_data_url"] + elif package_url.namespace == "ubuntu": + metadata_base_url = UBUNTU_METADATA_URL + else: + metadata_base_url = DEBIAN_METADATA_URL + + package_urls = dict( + package_url=package_url, + source_package_url=source_package_url, + archive_base_url=base_url, + metadata_base_url=metadata_base_url, + ) + debian_package, emsg = DebianPackage.from_purls(package_urls) + if emsg: + return emsg + + binary_package, emsg = map_debian_package( + debian_package, + PackageContentType.BINARY, + pipelines, + priority, + ) + if emsg: + error += emsg + + package_url.qualifiers["classifier"] = "sources" + source_package, emsg = map_debian_package( + debian_package, + PackageContentType.SOURCE_ARCHIVE, + pipelines, + priority, + ) + if emsg: + error += emsg + + if binary_package and source_package: + make_relationship( + from_package=binary_package, + to_package=source_package, + relationship=PackageRelation.Relationship.SOURCE_PACKAGE, + ) + + return error + + +@attr.s +class DebianPackage: + """ + Contains the package url and source package url for a debian package + necessary to get source, binary, metadata and copyright urls for it. + """ + + archive_base_url = attr.ib(type=str) + metadata_base_url = attr.ib(type=str) + package_url = attr.ib(type=str) + source_package_url = attr.ib(type=str) + metadata_directory_url = attr.ib(type=str, default=None) + archive_directory_url = attr.ib(type=str, default=None) + + @classmethod + def from_purls(cls, package_urls): + """Set the directory URLs for metadata and package archives.""" + debian_package = cls(**package_urls) + error = debian_package.set_debian_directories() + return debian_package, error + + @property + def package_archive_version(self): + """ + Get the useful part of the debian package version used in + source, binary, metadata and copyright URLs optionally. 
+ """ + debvers = DebVersion.from_string(self.package_url.version) + if debvers.revision != "0": + purl_version = f"{debvers.upstream}-{debvers.revision}" + else: + purl_version = debvers.upstream + return purl_version + + @property + def binary_archive_url(self): + """Get the .deb debian binary archive url for this debian package.""" + purl_version = self.package_archive_version + arch = self.package_url.qualifiers.get("arch") + if arch: + archive_name = f"{self.package_url.name}_{purl_version}_{arch}.deb" + else: + archive_name = f"{self.package_url.name}_{purl_version}.deb" + binary_package_url = self.archive_directory_url + f"{archive_name}" + return binary_package_url + + @property + def source_archive_url(self): + """Get the debian source tarball archive url for this debian package.""" + debian_source_archive_formats = [ + ".tar.xz", + ".tar.gz", + ".orig.tar.xz", + ".orig.tar.gz", + ".orig.tar.bz2", + ] + + source_version = self.package_archive_version + if not self.source_package_url: + source_package_name = self.package_url.name + else: + source_package_name = self.source_package_url.name + if self.source_package_url.version: + source_version = self.source_package_url.version + + for archive_format in debian_source_archive_formats: + if ".orig" in archive_format: + base_version_source = source_version.split("-")[0] + archive_name = ( + f"{source_package_name}_{base_version_source}" + archive_format + ) + else: + archive_name = ( + f"{source_package_name}_{source_version}" + archive_format + ) + source_package_url = self.archive_directory_url + archive_name + response = requests.get(source_package_url) + if response.ok: + break + + return source_package_url + + @property + def package_metadata_url(self): + """Get the .dsc metadata file url for this debian package.""" + metadata_version = self.package_archive_version + if not self.source_package_url: + metadata_package_name = self.package_url.name + else: + metadata_package_name = self.source_package_url.name + if self.source_package_url.version: + metadata_version = self.source_package_url.version + + base_version_metadata = metadata_version.split("+")[0] + metadata_dsc_package_url = ( + self.archive_directory_url + + f"{metadata_package_name}_{base_version_metadata}.dsc" + ) + response = requests.get(metadata_dsc_package_url) + if not response.ok: + metadata_dsc_package_url = ( + self.archive_directory_url + + f"{metadata_package_name}_{metadata_version}.dsc" + ) + + return metadata_dsc_package_url + + @property + def package_copyright_url(self): + """ + Get the debian copyright file url containing license and copyright + declarations for this debian package. 
+ """ + # Copyright files for ubuntu are named just `copyright` and placed under a name-version folder + # instead of having the name-version in the copyright file itself + copyright_file_string = "_copyright" + if self.package_url.namespace == "ubuntu": + copyright_file_string = "/copyright" + + metadata_version = self.package_archive_version + if not self.source_package_url: + metadata_package_name = self.package_url.name + else: + metadata_package_name = self.source_package_url.name + if self.source_package_url.version: + metadata_version = self.source_package_url.version + + copyright_package_url = ( + self.metadata_directory_url + + f"{metadata_package_name}_{metadata_version}{copyright_file_string}" + ) + response = requests.get(copyright_package_url) + if not response.ok: + base_version_metadata = metadata_version.split("+")[0] + copyright_package_url = ( + self.metadata_directory_url + + f"{metadata_package_name}_{base_version_metadata}{copyright_file_string}" + ) + + return copyright_package_url + + def set_debian_directories(self): + """ + Compute and set base urls for metadata and archives, to get + source/binary + """ + error = "" + + archive_base_url = self.archive_base_url + metadata_base_url = self.metadata_base_url + + index_folder = None + if self.package_url.name.startswith("lib"): + name_wout_lib = self.package_url.name.replace("lib", "") + index_folder = "lib" + name_wout_lib[0] + else: + index_folder = self.package_url.name[0] + + msg = "No directory exists for package at: " + + package_directory = f"{archive_base_url}{index_folder}/{self.package_url.name}/" + metadata_directory = ( + f"{metadata_base_url}{index_folder}/{self.package_url.name}/" + ) + + response = requests.get(package_directory) + if not response.ok: + if not self.source_package_url: + error = msg + str(package_directory) + return error + + if self.source_package_url.name.startswith("lib"): + name_wout_lib = self.source_package_url.name.replace("lib", "") + index_folder = "lib" + name_wout_lib[0] + else: + index_folder = self.source_package_url.name[0] + + package_directory = ( + f"{archive_base_url}{index_folder}/{self.source_package_url.name}/" + ) + metadata_directory = ( + f"{metadata_base_url}{index_folder}/{self.source_package_url.name}/" + ) + + response = requests.get(package_directory) + if not response.ok: + error = msg + str(package_directory) + return error + + self.archive_directory_url = package_directory + self.metadata_directory_url = metadata_directory + + +# FIXME: We are not returning download URLs. Returned information is incorrect + + +def get_dependencies(data): + """Return a list of DependentPackage extracted from a Debian `data` mapping.""" + scopes = { + "Build-Depends": dict(is_runtime=False, is_optional=True), + "Depends": dict(is_runtime=True, is_optional=False), + "Pre-Depends": dict(is_runtime=True, is_optional=False), + # 'Provides': dict(is_runtime=True, is_optional=False), + # 'Recommends': dict(is_runtime=True, is_optional=True), + # 'Suggests': dict(is_runtime=True, is_optional=True), + } + dep_pkgs = [] + for scope, flags in scopes.items(): + depends = data.get(scope) + if not depends: + continue + + dependencies = None # debutils.comma_separated(depends) + if not dependencies: + continue + # break each dep in package names and version constraints + # FIXME:!!! 
+ for name in dependencies: + purl = PackageURL(type="deb", namespace="debian", name=name) + dep = scan_models.DependentPackage( + purl=purl.to_string(), score=scope, **flags + ) + dep_pkgs.append(dep) + + return dep_pkgs + + +def get_vcs_repo(description): + """Return a tuple of (vcs_tool, vcs_repo) or (None, None) if no vcs_repo is found.""" + repos = [] + for vcs_tool, vcs_repo in description.items(): + vcs_tool = vcs_tool.lower() + if not vcs_tool.startswith("vcs-") or vcs_tool.startswith("vcs-browser"): + continue + _, _, vcs_tool = vcs_tool.partition("-") + repos.append((vcs_tool, vcs_repo)) + + if len(repos) > 1: + raise TypeError(f"Debian description with more than one Vcs repos: {repos}") + + if repos: + vcs_tool, vcs_repo = repos[0] + else: + vcs_tool = None + vcs_repo = None + + return vcs_tool, vcs_repo diff --git a/minecode/visitors/generic.py b/minecode/collectors/generic.py similarity index 91% rename from minecode/visitors/generic.py rename to minecode/collectors/generic.py index 85aa4abd..ab20197a 100644 --- a/minecode/visitors/generic.py +++ b/minecode/collectors/generic.py @@ -36,7 +36,7 @@ def map_generic_package(package_url, pipelines, priority=0): from minecode.model_utils import add_package_to_scan_queue from minecode.model_utils import merge_or_create_package - download_url = package_url.qualifiers.get('download_url') + download_url = package_url.qualifiers.get("download_url") package = PackageData( type=package_url.type, namespace=package_url.namespace, @@ -69,19 +69,19 @@ def process_request(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) try: package_url = PackageURL.from_string(purl_str) except ValueError as e: - error = f'error occured when parsing {purl_str}: {e}' + error = f"error occured when parsing {purl_str}: {e}" return error - download_url = package_url.qualifiers.get('download_url') + download_url = package_url.qualifiers.get("download_url") if not download_url: - error = f'package_url {purl_str} does not contain a download_url qualifier' + error = f"package_url {purl_str} does not contain a download_url qualifier" return error error_msg = map_generic_package(package_url, pipelines, priority) @@ -183,9 +183,9 @@ def process_request_fetchcode_generic(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) try: package_url = PackageURL.from_string(purl_str) @@ -193,8 +193,7 @@ def process_request_fetchcode_generic(purl_str, **kwargs): error = f"error occurred when parsing {purl_str}: {e}" return error - error_msg = map_fetchcode_supported_package( - package_url, pipelines, priority) + error_msg = map_fetchcode_supported_package(package_url, pipelines, priority) if error_msg: return error_msg diff --git a/minecode/collectors/github.py b/minecode/collectors/github.py new file mode 100644 index 00000000..fbf6337e --- /dev/null +++ b/minecode/collectors/github.py @@ -0,0 +1,42 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. 
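# get_vcs_repo() above keys on lowercase "vcs-*" control fields and skips
# "vcs-browser"; its selection loop run standalone (URLs illustrative):
description = {
    "Vcs-Browser": "https://salsa.debian.org/debian/zlib",  # skipped
    "Vcs-Git": "https://salsa.debian.org/debian/zlib.git",
}
repos = []
for vcs_tool, vcs_repo in description.items():
    vcs_tool = vcs_tool.lower()
    if not vcs_tool.startswith("vcs-") or vcs_tool.startswith("vcs-browser"):
        continue
    _, _, vcs_tool = vcs_tool.partition("-")
    repos.append((vcs_tool, vcs_repo))
print(repos)  # [('git', 'https://salsa.debian.org/debian/zlib.git')]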
+# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from packageurl import PackageURL + +from minecode import priority_router +from minecode.collectors.generic import map_fetchcode_supported_package + + +# Indexing GitHub PURLs requires a GitHub API token. +# Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`. +@priority_router.route("pkg:github/.*") +def process_request_dir_listed(purl_str, **kwargs): + """ + Process `priority_resource_uri` containing a GitHub Package URL (PURL). + + This involves obtaining Package information for the PURL using + https://github.com/aboutcode-org/fetchcode and using it to create a new + PackageDB entry. The package is then added to the scan queue afterwards. + """ + from minecode.model_utils import DEFAULT_PIPELINES + + addon_pipelines = kwargs.get("addon_pipelines", []) + pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) + priority = kwargs.get("priority", 0) + + try: + package_url = PackageURL.from_string(purl_str) + except ValueError as e: + error = f"error occurred when parsing {purl_str}: {e}" + return error + + error_msg = map_fetchcode_supported_package(package_url, pipelines, priority) + + if error_msg: + return error_msg diff --git a/minecode/visitors/gnu.py b/minecode/collectors/gnu.py similarity index 82% rename from minecode/visitors/gnu.py rename to minecode/collectors/gnu.py index 8dbe5860..861d6231 100644 --- a/minecode/visitors/gnu.py +++ b/minecode/collectors/gnu.py @@ -13,7 +13,7 @@ from packageurl import PackageURL from minecode import priority_router -from minecode.visitors.generic import map_fetchcode_supported_package +from minecode.collectors.generic import map_fetchcode_supported_package logger = logging.getLogger(__name__) handler = logging.StreamHandler() @@ -33,16 +33,15 @@ def process_request(purl_str, **kwargs): """ from minecode.model_utils import DEFAULT_PIPELINES - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) + priority = kwargs.get("priority", 0) package_url = PackageURL.from_string(purl_str) if not package_url.version: return - error_msg = map_fetchcode_supported_package( - package_url, pipelines, priority) + error_msg = map_fetchcode_supported_package(package_url, pipelines, priority) if error_msg: return error_msg diff --git a/minecode/collectors/maven.py b/minecode/collectors/maven.py index aab631fb..7aa4bd35 100644 --- a/minecode/collectors/maven.py +++ b/minecode/collectors/maven.py @@ -1,11 +1,52 @@ -from dateutil.parser import parse as dateutil_parse -from minecode.visitors.maven import get_artifacts, is_worthy_artifact, build_url_and_filename +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import hashlib +import logging +import re +from urllib.parse import urlparse + +import requests +from packagedcode.maven import _parse +from packagedcode.maven import get_maven_pom from packagedcode.maven import get_urls -from minecode.utils import fetch_http, get_temp_file from packagedcode.models import PackageData +from packageurl import PackageURL + +from minecode import priority_router +from minecode.miners.maven import build_url_and_filename +from minecode.miners.maven import get_artifacts +from minecode.miners.maven import is_worthy_artifact +from minecode.utils import fetch_http +from minecode.utils import get_temp_file +from minecode.utils import validate_sha1 +from packagedb.models import PackageContentType +from packagedb.models import PackageRelation +from packagedb.models import make_relationship + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +TRACE = False +TRACE_DEEP = False +if TRACE: + import sys -MAVEN_INDEX_URL = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' + logging.basicConfig(stream=sys.stdout) + logger.setLevel(logging.DEBUG) + + +MAVEN_BASE_URL = "https://repo1.maven.org/maven2" +MAVEN_INDEX_URL = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz" +) class MavenNexusCollector: @@ -22,15 +63,13 @@ def fetch_index(self, uri=MAVEN_INDEX_URL, timeout=10): `timeout` is a default timeout. """ content = fetch_http(uri, timeout=timeout) - temp_file = get_temp_file('NonPersistentHttpVisitor') - with open(temp_file, 'wb') as tmp: + temp_file = get_temp_file("NonPersistentHttpVisitor") + with open(temp_file, "wb") as tmp: tmp.write(content) return temp_file def get_packages(self, content=None): - """ - Yield Package objects from maven index - """ + """Yield Package objects from maven index""" if content: index_location = content else: @@ -49,12 +88,12 @@ def get_packages(self, content=None): continue qualifiers = {} - if extension and extension != 'jar': - qualifiers['type'] = extension + if extension and extension != "jar": + qualifiers["type"] = extension classifier = artifact.classifier if classifier: - qualifiers['classifier'] = classifier + qualifiers["classifier"] = classifier # FIXME: also use the Artifact.src_exist flags too? @@ -63,7 +102,8 @@ def get_packages(self, content=None): # instead togther with the filename... especially we could use # different REPOs. jar_download_url, _ = build_url_and_filename( - group_id, artifact_id, version, extension, classifier) + group_id, artifact_id, version, extension, classifier + ) # FIXME: should this be set in the yielded URI too last_mod = artifact.last_modified @@ -75,12 +115,12 @@ def get_packages(self, content=None): qualifiers=qualifiers or None, ) - repository_homepage_url = urls['repository_homepage_url'] - repository_download_url = urls['repository_download_url'] - api_data_url = urls['api_data_url'] + repository_homepage_url = urls["repository_homepage_url"] + repository_download_url = urls["repository_download_url"] + api_data_url = urls["api_data_url"] yield PackageData( - type='maven', + type="maven", namespace=group_id, name=artifact_id, version=version, @@ -93,3 +133,669 @@ def get_packages(self, content=None): repository_download_url=repository_download_url, api_data_url=api_data_url, ) + + +def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_URL): + """ + Return the contents of the POM file of the package described by the purl + field arguments in a string. 
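# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal usage sketch for get_pom_text() as defined here; the coordinates
# are sample values, and network access plus the artifact's presence on Maven
# Central are assumed.
pom_text = get_pom_text(
    namespace="org.apache.commons",
    name="commons-lang3",
    version="3.12.0",
)
# pom_text is the raw POM XML as a string when the POM is found, and None
# when the request fails or the purl fields do not resolve to a URL.
# ---- [End editor's note] ----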
+    """
+    # Create URLs using purl fields
+    if qualifiers and not isinstance(qualifiers, dict):
+        return
+    urls = get_urls(
+        namespace=namespace,
+        name=name,
+        version=version,
+        qualifiers=qualifiers,
+        base_url=base_url,
+    )
+    if not urls:
+        return
+    # Get and parse POM info
+    pom_url = urls["api_data_url"]
+    # TODO: manage different types of errors (404, etc.)
+    response = requests.get(pom_url)
+    if not response:
+        return
+    return response.text
+
+
+def fetch_parent(pom_text, base_url=MAVEN_BASE_URL):
+    """Return the parent pom text of `pom_text`, or None if `pom_text` has no parent."""
+    if not pom_text:
+        return
+    pom = get_maven_pom(text=pom_text)
+    if (
+        pom.parent
+        and pom.parent.group_id
+        and pom.parent.artifact_id
+        and pom.parent.version.version
+    ):
+        parent_namespace = pom.parent.group_id
+        parent_name = pom.parent.artifact_id
+        parent_version = str(pom.parent.version.version)
+        parent_pom_text = get_pom_text(
+            namespace=parent_namespace,
+            name=parent_name,
+            version=parent_version,
+            qualifiers={},
+            base_url=base_url,
+        )
+        return parent_pom_text
+
+
+def get_ancestry(pom_text, base_url=MAVEN_BASE_URL):
+    """
+    Return a sequence of POM texts for the ancestors of `pom_text`, ordered
+    from oldest ancestor to newest. The sequence is empty if there is no
+    parent POM.
+    """
+    ancestors = []
+    has_parent = True
+    while has_parent:
+        parent_pom_text = fetch_parent(pom_text=pom_text, base_url=base_url)
+        if not parent_pom_text:
+            has_parent = False
+        else:
+            ancestors.append(parent_pom_text)
+            pom_text = parent_pom_text
+    return reversed(ancestors)
+
+
+def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL):
+    """
+    Merge package details of a package with its ancestor pom
+    and return the merged package.
+    """
+    if not package:
+        return
+    pom_text = get_pom_text(
+        name=package.name,
+        namespace=package.namespace,
+        version=package.version,
+        qualifiers=package.qualifiers,
+        base_url=base_url,
+    )
+    merged_package = merge_ancestors(
+        ancestor_pom_texts=get_ancestry(pom_text),
+        package=package,
+    )
+    return merged_package
+
+
+def merge_parent(package, parent_package):
+    """Merge `parent_package` data into `package` and return `package`."""
+    mergeable_fields = (
+        "declared_license_expression",
+        "homepage_url",
+        "parties",
+    )
+    for field in mergeable_fields:
+        # If `field` is empty on the package we're looking at, populate
+        # those fields with values from the parent package.
+        if not getattr(package, field):
+            value = getattr(parent_package, field)
+            setattr(package, field, value)
+
+            msg = f"Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}"
+            history = package.extra_data.get("history")
+            if history:
+                package.extra_data["history"].append(msg)
+            else:
+                package.extra_data["history"] = [msg]
+
+    return package
+
+
+def merge_ancestors(ancestor_pom_texts, package):
+    """
+    Merge metadata from `ancestor_pom_texts` into `package`.
+
+    The order of POM content in `ancestor_pom_texts` is expected to be in the
+    order of oldest ancestor to newest.
+    """
+    for ancestor_pom_text in ancestor_pom_texts:
+        ancestor_package = _parse(
+            datasource_id="maven_pom",
+            package_type="maven",
+            primary_language="Java",
+            text=ancestor_pom_text,
+        )
+        package = merge_parent(package, ancestor_package)
+    return package
+
+
+def map_maven_package(
+    package_url, package_content, pipelines, priority=0, reindex_metadata=False
+):
+    """
+    Add a maven `package_url` to the PackageDB.
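# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal sketch of the parent-POM merging flow defined above; the
# coordinates are sample values and network access is assumed.
pom_text = get_pom_text(
    namespace="org.apache.commons",
    name="commons-lang3",
    version="3.12.0",
)
ancestor_pom_texts = get_ancestry(pom_text)  # ordered oldest ancestor first
# merge_ancestors() would then back-fill declared_license_expression,
# homepage_url and parties left empty by the child POM:
#   package = merge_ancestors(ancestor_pom_texts, package=package)
# ---- [End editor's note] ----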
+
+    Return an error string if errors have occurred in the process.
+
+    If ``reindex_metadata`` is True, only reindex the metadata and DO NOT rescan the full package.
+    """
+    from minecode.model_utils import add_package_to_scan_queue
+    from minecode.model_utils import merge_or_create_package
+
+    db_package = None
+    error = ""
+
+    if "repository_url" in package_url.qualifiers:
+        base_url = package_url.qualifiers["repository_url"]
+    else:
+        base_url = MAVEN_BASE_URL
+
+    pom_text = get_pom_text(
+        namespace=package_url.namespace,
+        name=package_url.name,
+        version=package_url.version,
+        qualifiers=package_url.qualifiers,
+        base_url=base_url,
+    )
+    if not pom_text:
+        msg = f"Package does not exist on maven: {package_url}"
+        error += msg + "\n"
+        logger.error(msg)
+        return db_package, error
+
+    package = _parse(
+        "maven_pom",
+        "maven",
+        "Java",
+        text=pom_text,
+        base_url=base_url,
+    )
+    ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url)
+    package = merge_ancestors(ancestor_pom_texts=ancestor_pom_texts, package=package)
+
+    urls = get_urls(
+        namespace=package_url.namespace,
+        name=package_url.name,
+        version=package_url.version,
+        qualifiers=package_url.qualifiers,
+        base_url=base_url,
+    )
+    # In the case of looking up a maven package with a qualifier of
+    # `classifier=sources`, the purl of the package created from the pom does
+    # not have the qualifiers, so we need to set them. Additionally, the download
+    # url is not properly generated since it would be missing the sources bit
+    # from the filename.
+    package.qualifiers = package_url.qualifiers
+    package.download_url = urls["repository_download_url"]
+    package.repository_download_url = urls["repository_download_url"]
+
+    # Set package_content value
+    package.extra_data["package_content"] = package_content
+
+    # If sha1 exists for a jar, we know we can create the package
+    # Use pom info as base and create packages for binary and source package
+
+    # Check to see if binary is available
+    sha1 = get_package_sha1(package)
+    if sha1:
+        package.sha1 = sha1
+        override = reindex_metadata
+        db_package, _, _, _ = merge_or_create_package(
+            package, visit_level=50, override=override
+        )
+    else:
+        msg = f"Failed to retrieve JAR: {package_url}"
+        error += msg + "\n"
+        logger.error(msg)
+
+    if not reindex_metadata:
+        # Submit package for scanning
+        if db_package:
+            add_package_to_scan_queue(
+                package=db_package, pipelines=pipelines, priority=priority
+            )
+
+    return db_package, error
+
+
+def map_maven_binary_and_source(
+    package_url, pipelines, priority=0, reindex_metadata=False
+):
+    """
+    Get metadata for the binary and source release of the Maven package
+    `package_url` and save it to the PackageDB.
+
+    Return an error string for errors that occur, or empty string if there is no error.
+ """ + error = "" + package, emsg = map_maven_package( + package_url=package_url, + package_content=PackageContentType.BINARY, + pipelines=pipelines, + priority=priority, + reindex_metadata=reindex_metadata, + ) + if emsg: + error += emsg + + source_package_url = package_url + source_package_url.qualifiers["classifier"] = "sources" + source_package, emsg = map_maven_package( + package_url=source_package_url, + package_content=PackageContentType.SOURCE_ARCHIVE, + pipelines=pipelines, + priority=priority, + reindex_metadata=reindex_metadata, + ) + if emsg: + error += emsg + + if not reindex_metadata and package and source_package: + make_relationship( + from_package=source_package, + to_package=package, + relationship=PackageRelation.Relationship.SOURCE_PACKAGE, + ) + + return error + + +def map_maven_packages(package_url, pipelines): + """ + Given a valid `package_url` with no version, get metadata for the binary and + source release for each version of the Maven package `package_url` and save + it to the PackageDB. + + Return an error string for errors that occur, or empty string if there is no error. + """ + error = "" + namespace = package_url.namespace + name = package_url.name + # Find all versions of this package + query_params = f"g:{namespace}+AND+a:{name}" + url = f"https://search.maven.org/solrsearch/select?q={query_params}&core=gav" + response = requests.get(url) + if response: + package_listings = response.json().get("response", {}).get("docs", []) + for listing in package_listings: + purl = PackageURL( + type="maven", + namespace=listing.get("g"), + name=listing.get("a"), + version=listing.get("v"), + ) + emsg = map_maven_binary_and_source(purl, pipelines) + if emsg: + error += emsg + return error + + +def get_package_sha1(package): + """ + Return the sha1 value for `package` by checking if the sha1 file exists for + `package` on maven and returning the contents if it does. + If the sha1 is invalid, we download the package's JAR and calculate the sha1 + from that. + """ + download_url = package.repository_download_url + sha1_download_url = f"{download_url}.sha1" + response = requests.get(sha1_download_url) + if response.ok: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + if not sha1: + # Download JAR and calculate sha1 if we cannot get it from the repo + response = requests.get(download_url) + if response: + sha1_hash = hashlib.new("sha1", response.content) + sha1 = sha1_hash.hexdigest() + return sha1 + + +@priority_router.route("pkg:maven/.*") +def process_request(purl_str, **kwargs): + """ + Process `priority_resource_uri` containing a maven Package URL (PURL) as a + URI. + + This involves obtaining Package information for the PURL from maven and + using it to create a new PackageDB entry. The package is then added to the + scan queue afterwards. We also get the Package information for the + accompanying source package and add it to the PackageDB and scan queue, if + available. + + Return an error string for errors that occur, or empty string if there is no error. 
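# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal sketch of the ".sha1" sidecar convention that get_package_sha1()
# above relies on: Maven repos publish an "<artifact>.sha1" file whose first
# whitespace-separated token is the checksum. The checksum and filename below
# are made-up sample data.
sha1_sidecar_text = "0b1c2d3e4f5a6b7c8d9e0f1a2b3c4d5e6f7a8b9c  commons-lang3-3.12.0.jar"
sha1 = sha1_sidecar_text.strip().split()[0]
assert len(sha1) == 40
# ---- [End editor's note] ----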
+    """
+    from minecode.model_utils import DEFAULT_PIPELINES
+
+    addon_pipelines = kwargs.get("addon_pipelines", [])
+    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
+    priority = kwargs.get("priority", 0)
+
+    try:
+        package_url = PackageURL.from_string(purl_str)
+    except ValueError as e:
+        error = f"error occurred when parsing {purl_str}: {e}"
+        return error
+
+    has_version = bool(package_url.version)
+    if has_version:
+        reindex_metadata = kwargs.get("reindex_metadata", False)
+        error = map_maven_binary_and_source(
+            package_url,
+            pipelines,
+            reindex_metadata=reindex_metadata,
+            priority=priority,
+        )
+    else:
+        error = map_maven_packages(package_url, pipelines)
+
+    return error
+
+
+collect_links = re.compile(r'href="([^"]+)"').findall
+collect_links_and_artifact_timestamps = re.compile(
+    r'<a href="([^"]+)".*</a>\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)'
+).findall
+
+
+def check_if_file_name_is_linked_on_page(file_name, links, **kwargs):
+    """Return True if any link in `links` ends with `file_name`."""
+    return any(link.endswith(file_name) for link in links)
+
+
+def check_if_page_has_pom_files(links, **kwargs):
+    """Return True if any entry in `links` ends with .pom."""
+    return any(link.endswith(".pom") for link in links)
+
+
+def check_if_page_has_directories(links, **kwargs):
+    """Return True if any entry, excluding "../", ends with /."""
+    return any(link.endswith("/") for link in links if link != "../")
+
+
+def check_if_package_version_page(links, **kwargs):
+    """Return True if `links` contains pom files and has no directories."""
+    return check_if_page_has_pom_files(
+        links=links
+    ) and not check_if_page_has_directories(links=links)
+
+
+def check_if_package_page(links, **kwargs):
+    """
+    Return True if `links` is from a package page: it links to
+    "maven-metadata.xml" but contains no pom files.
+    """
+    return check_if_file_name_is_linked_on_page(
+        file_name="maven-metadata.xml", links=links
+    ) and not check_if_page_has_pom_files(links=links)
+
+
+def check_if_maven_root(links, **kwargs):
+    """
+    Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven
+    repo contains "archetype-catalog.xml".
+    """
+    return check_if_file_name_is_linked_on_page(
+        file_name="archetype-catalog.xml", links=links
+    )
+
+
+def check_on_page(url, checker):
+    """
+    Return the result of applying the `checker` function to the links
+    collected from `url`, or False if `url` cannot be fetched.
+    """
+    response = requests.get(url)
+    if response:
+        links = collect_links(response.text)
+        return checker(links=links)
+    return False
+
+
+def is_maven_root(url):
+    """Return True if `url` is the root of a Maven repo, False otherwise."""
+    return check_on_page(url, check_if_maven_root)
+
+
+def is_package_page(url):
+    """Return True if `url` is a package page on a Maven repo, False otherwise."""
+    return check_on_page(url, check_if_package_page)
+
+
+def is_package_version_page(url):
+    """Return True if `url` is a package version page on a Maven repo, False otherwise."""
+    return check_on_page(url, check_if_package_version_page)
+
+
+def url_parts(url):
+    """Return a 3-tuple of (scheme, netloc, path segments) for `url`."""
+    parsed_url = urlparse(url)
+    scheme = parsed_url.scheme
+    netloc = parsed_url.netloc
+    path_segments = [p for p in parsed_url.path.split("/") if p]
+    return scheme, netloc, path_segments
+
+
+def create_url(scheme, netloc, path_segments):
+    """Return a URL string assembled from `scheme`, `netloc`, and `path_segments`."""
+    url_template = f"{scheme}://{netloc}"
+    path = "/".join(path_segments)
+    return f"{url_template}/{path}"
+
+
+def get_maven_root(url):
+    """
+    Given `url`, a URL to a namespace, package, or artifact in a Maven repo,
+    return the URL to the root of that repo. If a Maven root cannot be
+    determined, return None.
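# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal, self-contained sketch of what the collect_links() regex above
# extracts from a Maven directory-listing page; the HTML snippet is made up.
import re

collect = re.compile(r'href="([^"]+)"').findall
listing = '<a href="1.0.0/">1.0.0/</a> <a href="maven-metadata.xml">maven-metadata.xml</a>'
assert collect(listing) == ["1.0.0/", "maven-metadata.xml"]
# ---- [End editor's note] ----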
+
+    >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+    'https://repo1.maven.org/maven2'
+    """
+    scheme, netloc, path_segments = url_parts(url)
+    for i in range(len(path_segments)):
+        segments = path_segments[: i + 1]
+        url_segment = create_url(scheme, netloc, segments)
+        if is_maven_root(url_segment):
+            return url_segment
+    return None
+
+
+def determine_namespace_name_version_from_url(url, root_url=None):
+    """
+    Return a 3-tuple containing strings of a Package namespace, name, and
+    version, determined from `url`, where `url` points to a namespace, package,
+    specific package version, or artifact on a Maven repo.
+
+    Raise an Exception if a Maven root cannot be determined from `url`.
+
+    >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/')
+    ('net.shibboleth', 'parent', '7.11.0')
+    """
+    if not root_url:
+        root_url = get_maven_root(url)
+        if not root_url:
+            raise Exception(f"Error: not a Maven repository: {url}")
+
+    _, remaining_path_segments = url.split(root_url)
+    remaining_path_segments = remaining_path_segments.split("/")
+    remaining_path_segments = [p for p in remaining_path_segments if p]
+
+    namespace_segments = []
+    package_name = ""
+    package_version = ""
+    for i in range(len(remaining_path_segments)):
+        segment = remaining_path_segments[i]
+        segments = remaining_path_segments[: i + 1]
+        path = "/".join(segments)
+        url_segment = f"{root_url}/{path}"
+        if is_package_page(url_segment):
+            package_name = segment
+        elif is_package_version_page(url_segment):
+            package_version = segment
+        else:
+            namespace_segments.append(segment)
+    namespace = ".".join(namespace_segments)
+    return namespace, package_name, package_version
+
+
+def add_to_import_queue(url, root_url):
+    """Create an ImportableURI for the Maven repo package page at `url`."""
+    from minecode.models import ImportableURI
+
+    data = None
+    response = requests.get(url)
+    if response:
+        data = response.text
+    namespace, name, _ = determine_namespace_name_version_from_url(url, root_url)
+    purl = PackageURL(
+        type="maven",
+        namespace=namespace,
+        name=name,
+    )
+    importable_uri = ImportableURI.objects.insert(url, data, purl)
+    if importable_uri:
+        logger.info(f"Inserted {url} into ImportableURI queue")
+
+
+def filter_only_directories(timestamps_by_links):
+    """
+    Given a mapping of `timestamps_by_links`, return a new mapping that keeps
+    only the directory entries (links ending with `/`), excluding the parent
+    link "../".
+    """
+    timestamps_by_links_filtered = {}
+    for link, timestamp in timestamps_by_links.items():
+        if link != "../" and link.endswith("/"):
+            timestamps_by_links_filtered[link] = timestamp
+    return timestamps_by_links_filtered
+
+
+valid_artifact_extensions = [
+    "ejb3",
+    "ear",
+    "aar",
+    "apk",
+    "gem",
+    "jar",
+    "nar",
+    # 'pom',
+    "so",
+    "swc",
+    "tar",
+    "tar.gz",
+    "war",
+    "xar",
+    "zip",
+]
+
+
+def filter_for_artifacts(timestamps_by_links):
+    """
+    Given a mapping of `timestamps_by_links`, where the links are the filenames
+    of Maven artifacts, return a mapping of filenames whose extension is in
+    `valid_artifact_extensions` and their timestamps.
+    """
+    timestamps_by_links_filtered = {}
+    for link, timestamp in timestamps_by_links.items():
+        for ext in valid_artifact_extensions:
+            if link.endswith(ext):
+                timestamps_by_links_filtered[link] = timestamp
+    return timestamps_by_links_filtered
+
+
+def collect_links_from_text(text, filter):
+    """
+    Return a mapping of link locations and their timestamps, given HTML `text`
+    content, that is filtered using `filter`.
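# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal sketch for filter_only_directories() above: the parent link and
# plain files are dropped, directories are kept. Sample data only.
timestamps_by_links = {
    "../": "",
    "1.0.0/": "2020-01-01 10:00",
    "maven-metadata.xml": "2020-01-02 09:30",
}
assert filter_only_directories(timestamps_by_links) == {"1.0.0/": "2020-01-01 10:00"}
# ---- [End editor's note] ----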
+    """
+    links_and_timestamps = collect_links_and_artifact_timestamps(text)
+    timestamps_by_links = {}
+    for link, timestamp in links_and_timestamps:
+        if timestamp == "-":
+            timestamp = ""
+        timestamps_by_links[link] = timestamp
+
+    timestamps_by_links = filter(timestamps_by_links=timestamps_by_links)
+    return timestamps_by_links
+
+
+def create_absolute_urls_for_links(text, url, filter):
+    """
+    Given the `text` contents from `url`, return a mapping of absolute link
+    URLs to their timestamps, filtered by `filter`.
+    """
+    timestamps_by_absolute_links = {}
+    url = url.rstrip("/")
+    timestamps_by_links = collect_links_from_text(text, filter)
+    for link, timestamp in timestamps_by_links.items():
+        if not link.startswith(url):
+            link = f"{url}/{link}"
+        timestamps_by_absolute_links[link] = timestamp
+    return timestamps_by_absolute_links
+
+
+def get_directory_links(url):
+    """Return a mapping of absolute directory URLs to timestamps for the hyperlinks at `url`."""
+    timestamps_by_directory_links = {}
+    response = requests.get(url)
+    if response:
+        timestamps_by_directory_links = create_absolute_urls_for_links(
+            response.text, url=url, filter=filter_only_directories
+        )
+    return timestamps_by_directory_links
+
+
+def get_artifact_links(url):
+    """Return a mapping of absolute artifact URLs to timestamps for the hyperlinks at `url`."""
+    timestamps_by_artifact_links = {}
+    response = requests.get(url)
+    if response:
+        timestamps_by_artifact_links = create_absolute_urls_for_links(
+            response.text, url=url, filter=filter_for_artifacts
+        )
+    return timestamps_by_artifact_links
+
+
+def crawl_to_package(url, root_url):
+    """
+    Given a Maven repo `url`, add it to the import queue if it is a package
+    page; otherwise recurse depth-first into its subdirectories.
+    """
+    if is_package_page(url):
+        add_to_import_queue(url, root_url)
+        return
+
+    for link in get_directory_links(url):
+        crawl_to_package(link, root_url)
+
+
+def crawl_maven_repo_from_root(root_url):
+    """
+    Given the `root_url` of a Maven repo root, traverse the repo depth-first
+    and add package pages to the import queue.
+ """ + crawl_to_package(root_url, root_url) + + +def get_artifact_sha1(artifact_url): + """Return the SHA1 value of the Maven artifact located at `artifact_url`.""" + sha1 = None + artifact_sha1_url = f"{artifact_url}.sha1" + response = requests.get(artifact_sha1_url) + if response: + sha1_contents = response.text.strip().split() + sha1 = sha1_contents[0] + sha1 = validate_sha1(sha1) + return sha1 + + +def get_classifier_from_artifact_url( + artifact_url, package_version_page_url, package_name, package_version +): + """ + Return the classifier from a Maven artifact URL `artifact_url`, otherwise + return None if a classifier cannot be determined from `artifact_url` + """ + classifier = None + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 + package_version_page_url = package_version_page_url.rstrip("/") + # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 + leading_url_portion = f"{package_version_page_url}/{package_name}-{package_version}" + # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' + # ['', '-onejar.jar'] + _, remaining_url_portion = artifact_url.split(leading_url_portion) + # ['-onejar', 'jar'] + remaining_url_portions = remaining_url_portion.split(".") + if remaining_url_portions and remaining_url_portions[0]: + # '-onejar' + classifier = remaining_url_portions[0] + if classifier.startswith("-"): + # 'onejar' + classifier = classifier[1:] + return classifier diff --git a/minecode/collectors/npm.py b/minecode/collectors/npm.py new file mode 100644 index 00000000..50593744 --- /dev/null +++ b/minecode/collectors/npm.py @@ -0,0 +1,107 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import logging + +import requests +from packagedcode.npm import NpmPackageJsonHandler +from packagedcode.npm import npm_api_url +from packageurl import PackageURL + +from minecode import priority_router +from packagedb.models import PackageContentType + +""" +Collect NPM packages from npm registries. +""" + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +def get_package_json(namespace, name, version): + """ + Return the contents of the package.json file of the package described by the purl + field arguments in a string. + """ + # Create URLs using purl fields + url = npm_api_url( + namespace=namespace, + name=name, + version=version, + ) + + try: + response = requests.get(url) + response.raise_for_status() + return response.json() + except requests.exceptions.HTTPError as err: + logger.error(f"HTTP error occurred: {err}") + + +def map_npm_package(package_url, pipelines, priority=0): + """ + Add a npm `package_url` to the PackageDB. 
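# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal sketch for get_package_json() above; the package name and version
# are sample values and registry availability is assumed.
package_json = get_package_json(namespace=None, name="lodash", version="4.17.21")
# package_json is a parsed mapping (e.g. package_json["name"] == "lodash"),
# or None when the registry responds with an HTTP error.
# ---- [End editor's note] ----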
+ + Return an error string if any errors are encountered during the process + """ + from minecode.model_utils import add_package_to_scan_queue + from minecode.model_utils import merge_or_create_package + + package_json = get_package_json( + namespace=package_url.namespace, + name=package_url.name, + version=package_url.version, + ) + + if not package_json: + error = f"Package does not exist on npmjs: {package_url}" + logger.error(error) + return error + + package = NpmPackageJsonHandler._parse(json_data=package_json) + package.extra_data["package_content"] = PackageContentType.SOURCE_ARCHIVE + + db_package, _, _, error = merge_or_create_package(package, visit_level=0) + + # Submit package for scanning + if db_package: + add_package_to_scan_queue( + package=db_package, pipelines=pipelines, priority=priority + ) + + return error + + +@priority_router.route("pkg:npm/.*") +def process_request(purl_str, **kwargs): + """ + Process `priority_resource_uri` containing a npm Package URL (PURL) as a + URI. + + This involves obtaining Package information for the PURL from npm and + using it to create a new PackageDB entry. The package is then added to the + scan queue afterwards. + """ + from minecode.model_utils import DEFAULT_PIPELINES + + addon_pipelines = kwargs.get("addon_pipelines", []) + pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) + priority = kwargs.get("priority", 0) + + package_url = PackageURL.from_string(purl_str) + if not package_url.version: + return + + error_msg = map_npm_package(package_url, pipelines, priority) + + if error_msg: + return error_msg diff --git a/minecode/collectors/openssl.py b/minecode/collectors/openssl.py new file mode 100644 index 00000000..a74cb421 --- /dev/null +++ b/minecode/collectors/openssl.py @@ -0,0 +1,43 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from packageurl import PackageURL + +from minecode import priority_router +from minecode.collectors.generic import map_fetchcode_supported_package + + +# Indexing OpenSSL PURLs requires a GitHub API token. +# Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`. +@priority_router.route("pkg:openssl/openssl@.*") +def process_request_dir_listed(purl_str, **kwargs): + """ + Process `priority_resource_uri` containing a OpenSSL Package URL (PURL) + supported by fetchcode. + + This involves obtaining Package information for the PURL using + https://github.com/nexB/fetchcode and using it to create a new + PackageDB entry. The package is then added to the scan queue afterwards. 
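# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal sketch of the kwargs contract shared by these priority_router
# handlers (the real handlers import DEFAULT_PIPELINES lazily, inside the
# function body). The pipeline name is a hypothetical example.
from minecode.model_utils import DEFAULT_PIPELINES

kwargs = {"addon_pipelines": ["inspect_elf_binaries"], "priority": 100}
pipelines = DEFAULT_PIPELINES + tuple(kwargs.get("addon_pipelines", []))
priority = kwargs.get("priority", 0)
# ---- [End editor's note] ----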
+ """ + from minecode.model_utils import DEFAULT_PIPELINES + + addon_pipelines = kwargs.get("addon_pipelines", []) + pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) + priority = kwargs.get("priority", 0) + + try: + package_url = PackageURL.from_string(purl_str) + except ValueError as e: + error = f"error occurred when parsing {purl_str}: {e}" + return error + + error_msg = map_fetchcode_supported_package(package_url, pipelines, priority) + + if error_msg: + return error_msg diff --git a/minecode/command.py b/minecode/command.py index b46f29ae..7e22aee1 100644 --- a/minecode/command.py +++ b/minecode/command.py @@ -15,13 +15,12 @@ from minecode import ON_WINDOWS - logger = logging.getLogger(__name__) # FIXME: use commoncode instead -class Command(object): +class Command: """Simple wrapper around a subprocess.""" def __init__(self, command, env=None, cwd=None): @@ -31,15 +30,17 @@ def __init__(self, command, env=None, cwd=None): self.start() def start(self): - self.proc = subprocess.Popen(self.command, - shell=True, - cwd=self.cwd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - env=self.env, - universal_newlines=True, - close_fds=not ON_WINDOWS) + self.proc = subprocess.Popen( + self.command, + shell=True, + cwd=self.cwd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self.env, + universal_newlines=True, + close_fds=not ON_WINDOWS, + ) self.returncode = self.proc.returncode def execute(self): @@ -49,15 +50,16 @@ def stop(self): if not self.proc: return - close_pipe(getattr(self.proc, 'stdin', None)) - close_pipe(getattr(self.proc, 'stderr', None)) - close_pipe(getattr(self.proc, 'stdout', None)) + close_pipe(getattr(self.proc, "stdin", None)) + close_pipe(getattr(self.proc, "stderr", None)) + close_pipe(getattr(self.proc, "stdout", None)) # Ensure process death in all cases, otherwise proc.wait seems to hang # in some cases def kill(sig, fun): if self.proc and self.proc.poll() is None: self.proc.kill() + signal.signal(signal.SIGALRM, kill) # @UndefinedVariable signal.alarm(5) # @UndefinedVariable @@ -71,5 +73,5 @@ def close_pipe(pipe): return try: pipe.close() - except IOError: + except OSError: pass diff --git a/minecode/debutils.py b/minecode/debutils.py index 5c33f531..04e28bdc 100644 --- a/minecode/debutils.py +++ b/minecode/debutils.py @@ -15,35 +15,29 @@ def parse_email(text): """ if not text: return None, None - name, _, email = text.partition('<') - email = email.strip('>') + name, _, email = text.partition("<") + email = email.strip(">") name = name.strip() email = email.strip() return name or None, email or None def comma_separated(text): - """ - Return a list of strings from a comma-separated text. - """ + """Return a list of strings from a comma-separated text.""" if not text: return [] - return [t.strip() for t in text.split(',') if t and t.strip()] + return [t.strip() for t in text.split(",") if t and t.strip()] def fold(value): - """ - Return a folded `value` string. - """ + """Return a folded `value` string.""" if not value: return value - return ''.join(value.split()) + return "".join(value.split()) def line_separated(value): - """ - Return a list of values from a `value` string using line delimiters. 
- """ + """Return a list of values from a `value` string using line delimiters.""" if not value: return [] return [v.strip() for v in value.splitlines(False) if v] diff --git a/minecode/filter.py b/minecode/filter.py index 38d89c49..018c6bc9 100644 --- a/minecode/filter.py +++ b/minecode/filter.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. @@ -22,58 +21,59 @@ def sf_net(input_file, output): adding new columns and trying to sf_net the data """ download_url_template = ( - 'http://master.dl.sourceforge.net/project' - '/%(project_id)s%(filename)s' + "http://master.dl.sourceforge.net/project" "/%(project_id)s%(filename)s" ) - existing_headers = ('external_id,name,version,license,owners,' - 'homepage_url,keywords,description'.split(',') - ) + existing_headers = ( + "external_id,name,version,license,owners," + "homepage_url,keywords,description".split(",") + ) - new_headers = ('computed_version,release_date_ts,file_download_url,' - 'reviewed,curated_name,excluded_reason,curated_owner,' - 'owner_type'.split(',') - ) + new_headers = ( + "computed_version,release_date_ts,file_download_url," + "reviewed,curated_name,excluded_reason,curated_owner," + "owner_type".split(",") + ) - with open(output, 'w') as fo: + with open(output, "w") as fo: writer = csv.writer(fo, quoting=csv.QUOTE_ALL) - with open(input_file, 'r') as fi: + with open(input_file) as fi: reader = csv.reader(fi) - for i, l in enumerate(reader): + for i, row in enumerate(reader): if i == 0: # add headers on first row - l.extend(new_headers) - if not l: + row.extend(new_headers) + if not row: continue - project_id = l[0] - name = l[1] - version_column = l[2] - sep = ': released on ' + project_id = row[0] + name = row[1] + version_column = row[2] + sep = ": released on " if sep not in version_column: # write as is if we do not have a file release date # separator - writer.writerow(l) + writer.writerow(row) continue filename, release_date_ts = version_column.split(sep, 1) found_version = version.version_hint(filename) - l.append(found_version or '') - l.append(release_date_ts or '') - l.append(download_url_template % locals()) - l.append('') # reviewed - l.append('') # curated name - excluded_reason = '' - if '.' in project_id: - excluded_reason = 'mirror or special project' + row.append(found_version or "") + row.append(release_date_ts or "") + row.append(download_url_template % locals()) + row.append("") # reviewed + row.append("") # curated name + excluded_reason = "" + if "." 
in project_id: + excluded_reason = "mirror or special project" elif not found_version: - excluded_reason = 'no version' + excluded_reason = "no version" elif not good_name(name): - excluded_reason = 'special chars in name' + excluded_reason = "special chars in name" elif not good_filename(project_id, filename, name): - excluded_reason = 'multi component possible' - l.append(excluded_reason) - l.append('') # curated_owner - l.append('') # owner_type - writer.writerow(l) + excluded_reason = "multi component possible" + row.append(excluded_reason) + row.append("") # curated_owner + row.append("") # owner_type + writer.writerow(row) def good_name(s): @@ -85,9 +85,11 @@ def good_name(s): -- there is a punctuation sign string.punctuation -- there is non-ascii letters string.letters + string.digit """ - return (s - and all(c not in string.punctuation for c in s) - and all(c in string.ascii_lowercase for c in s.lower())) + return ( + s + and all(c not in string.punctuation for c in s) + and all(c in string.ascii_lowercase for c in s.lower()) + ) def good_filename(pid, fn, name): diff --git a/minecode/indexing.py b/minecode/indexing.py index ba6a1e36..c94d14e4 100644 --- a/minecode/indexing.py +++ b/minecode/indexing.py @@ -31,7 +31,9 @@ def index_package_files(package, scan_data, reindex=False): deleted and recreated from `scan_data`. """ if reindex: - logger.info(f'Deleting fingerprints and Resources related to {package.package_url}') + logger.info( + f"Deleting fingerprints and Resources related to {package.package_url}" + ) package.approximatedirectorycontentindex_set.all().delete() package.approximatedirectorystructureindex_set.all().delete() package.approximateresourcecontentindex_set.all().delete() @@ -40,21 +42,24 @@ def index_package_files(package, scan_data, reindex=False): scan_index_errors = [] try: - logger.info(f'Indexing Resources and fingerprints related to {package.package_url} from scan data') - for resource in scan_data.get('files', []): + logger.info( + f"Indexing Resources and fingerprints related to {package.package_url} from scan data" + ) + for resource in scan_data.get("files", []): r, _, _ = update_or_create_resource(package, resource) path = r.path sha1 = r.sha1 if sha1: - _, _ = ExactFileIndex.index( - sha1=sha1, - package=package - ) + _, _ = ExactFileIndex.index(sha1=sha1, package=package) - resource_extra_data = resource.get('extra_data', {}) - directory_content_fingerprint = resource_extra_data.get('directory_content', '') - directory_structure_fingerprint = resource_extra_data.get('directory_structure', '') - halo1 = resource_extra_data.get('halo1', '') + resource_extra_data = resource.get("extra_data", {}) + directory_content_fingerprint = resource_extra_data.get( + "directory_content", "" + ) + directory_structure_fingerprint = resource_extra_data.get( + "directory_structure", "" + ) + halo1 = resource_extra_data.get("halo1", "") if directory_content_fingerprint: _, _ = ApproximateDirectoryContentIndex.index( @@ -85,50 +90,58 @@ def index_package_files(package, scan_data, reindex=False): return scan_index_errors -def index_package(scannable_uri, package, scan_data, summary_data, project_extra_data, reindex=False): +def index_package( + scannable_uri, package, scan_data, summary_data, project_extra_data, reindex=False +): scan_index_errors = [] try: indexing_errors = index_package_files(package, scan_data, reindex=reindex) scan_index_errors.extend(indexing_errors) - declared_license_expression = summary_data.get('declared_license_expression') - 
other_license_expressions = summary_data.get('other_license_expressions', []) - other_license_expressions = [l['value'] for l in other_license_expressions if l['value']] + declared_license_expression = summary_data.get("declared_license_expression") + other_license_expressions = summary_data.get("other_license_expressions", []) + other_license_expressions = [ + license_expression["value"] + for license_expression in other_license_expressions + if license_expression["value"] + ] other_license_expression = combine_expressions(other_license_expressions) - copyright = '' - declared_holder = summary_data.get('declared_holder') + copyright = "" + declared_holder = summary_data.get("declared_holder") if declared_holder: - copyright = f'Copyright (c) {declared_holder}' + copyright = f"Copyright (c) {declared_holder}" checksums_and_size_by_field = { k: v for k, v in project_extra_data.items() - if k in [ - 'md5','sha1', 'size', 'sha256', 'sha512', 'filename' - ] + if k in ["md5", "sha1", "size", "sha256", "sha512", "filename"] } values_by_updateable_fields = { - 'summary': summary_data, - 'declared_license_expression': declared_license_expression, - 'other_license_expression': other_license_expression, - 'copyright': copyright, - **checksums_and_size_by_field + "summary": summary_data, + "declared_license_expression": declared_license_expression, + "other_license_expression": other_license_expression, + "copyright": copyright, + **checksums_and_size_by_field, } # do not override fields with empty values - values_by_updateable_fields = {k: v for k, v in values_by_updateable_fields.items() if v} - - _, updated_fields = package.update_fields(save=True, **values_by_updateable_fields) - updated_fields = ', '.join(updated_fields) - message = f'Updated fields for Package {package.purl}: {updated_fields}' + values_by_updateable_fields = { + k: v for k, v in values_by_updateable_fields.items() if v + } + + _, updated_fields = package.update_fields( + save=True, **values_by_updateable_fields + ) + updated_fields = ", ".join(updated_fields) + message = f"Updated fields for Package {package.purl}: {updated_fields}" logger.info(message) scannable_uri.scan_status = ScannableURI.SCAN_INDEXED scannable_uri.save() except Exception: traceback_message = traceback.format_exc() - error_message = traceback_message + '\n' + error_message = traceback_message + "\n" # TODO: We should rerun the specific indexers that have failed if scan_index_errors: - error_message += '\n'.join(scan_index_errors) + error_message += "\n".join(scan_index_errors) logger.error(error_message) scannable_uri.index_error = error_message scannable_uri.scan_status = ScannableURI.SCAN_INDEX_FAILED diff --git a/minecode/ls.py b/minecode/ls.py index 3d636b8b..14a9ad71 100644 --- a/minecode/ls.py +++ b/minecode/ls.py @@ -8,21 +8,21 @@ # -from datetime import datetime -from functools import total_ordering import logging import posixpath import stat +from datetime import datetime +from functools import total_ordering -from ftputil.stat import UnixParser from ftputil.error import ParserError - +from ftputil.stat import UnixParser TRACE = False logger = logging.getLogger(__name__) if TRACE: import sys + logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) logger.setLevel(logging.DEBUG) @@ -32,20 +32,19 @@ """ # TODO: use constants for entry types -DIR = 'd' -FILE = 'f' -LINK = 'l' -SPECIAL = 's' +DIR = "d" +FILE = "f" +LINK = "l" +SPECIAL = "s" # FIXME: do we really need link and special file support? 
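# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal sketch of the Entry class defined just below, using the type
# constants above; the path, size and date are made-up sample values.
e = Entry(path="pool/main/a/acme/acme_1.0.tar.gz", type=FILE, size=1024, date="2020-01-01")
assert e.to_dict() == {
    "path": "pool/main/a/acme/acme_1.0.tar.gz",
    "type": "f",
    "size": 1024,
    "date": "2020-01-01",
    "target": None,
}
# ---- [End editor's note] ----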
@total_ordering -class Entry(object): - """ - Represent a file, directory or link entry in a directory listing. - """ - __slots__ = 'path', 'type', 'size', 'date', 'target' +class Entry: + """Represent a file, directory or link entry in a directory listing.""" + + __slots__ = "path", "type", "size", "date", "target" def __init__(self, path=None, type=None, size=None, date=None, target=None): # NOQA self.path = path @@ -54,35 +53,37 @@ def __init__(self, path=None, type=None, size=None, date=None, target=None): # self.date = date self.target = target if TRACE: - logger.debug('Entry(): ' + repr(self)) + logger.debug("Entry(): " + repr(self)) def __repr__(self): - base = 'Entry(path=%(path)r, type=%(type)r, size=%(size)r, date=%(date)r' - link_target = ')' + base = "Entry(path=%(path)r, type=%(type)r, size=%(size)r, date=%(date)r" + link_target = ")" if self.type == LINK: - link_target = ', target=%(target)r)' + link_target = ", target=%(target)r)" return (base + link_target) % self.to_dict() def __eq__(self, other): return isinstance(other, Entry) and self.to_dict() == other.to_dict() def __lt__(self, other): - return isinstance(other, Entry) and tuple(self.to_dict().items()) < tuple(other.to_dict().items()) + return isinstance(other, Entry) and tuple(self.to_dict().items()) < tuple( + other.to_dict().items() + ) def __hash__(self): return hash(tuple(self.to_dict().items())) def to_dict(self): return { - 'path': self.path, - 'type': self.type, - 'size': self.size, - 'date': self.date, - 'target': self.target, + "path": self.path, + "type": self.type, + "size": self.size, + "date": self.date, + "target": self.target, } @classmethod - def from_stat(self, stat_result, base_dir='', use_utc_time=True): + def from_stat(self, stat_result, base_dir="", use_utc_time=True): """ Return a new Entry built from a stat-like tuple and a base directory. @@ -119,7 +120,7 @@ def from_stat(self, stat_result, base_dir='', use_utc_time=True): res_type = SPECIAL # rejoin path with base-dir if any - if base_dir and base_dir != '.': + if base_dir and base_dir != ".": base_dir = clean_path(base_dir) path = posixpath.join(base_dir, path) @@ -127,11 +128,9 @@ def from_stat(self, stat_result, base_dir='', use_utc_time=True): def clean_path(path): - """ - Return a path cleaned from leading and trailing slashes and leading ./. - """ - path = path.strip().strip('/') - if path.startswith('./'): + """Return a path cleaned from leading and trailing slashes and leading ./.""" + path = path.strip().strip("/") + if path.startswith("./"): path = path[2:] return path.strip() @@ -141,8 +140,8 @@ def remove_inode(line): Return the line with leading inode number and size in block (which are numbers separated by spaces) are removed. 
""" - _, _, line = line.strip().partition(' ') - _, _, line = line.strip().partition(' ') + _, _, line = line.strip().partition(" ") + _, _, line = line.strip().partition(" ") return line.strip() @@ -161,7 +160,7 @@ def parse_directory_listing(dir_listing, from_find=False): parser = UnixParser() # default in case this would not be a recursive listing: we always need a base dir - base_dir = '' + base_dir = "" for ln, line in enumerate(lines, 1): line = line.strip() if parser.ignores_line(line): @@ -174,26 +173,25 @@ def parse_directory_listing(dir_listing, from_find=False): try: file_stat = parser.parse_line(line) if TRACE: - logger.debug( - 'parse_directory_listing:file_stat: ' + repr(file_stat)) + logger.debug("parse_directory_listing:file_stat: " + repr(file_stat)) dt = datetime.utcfromtimestamp(file_stat.st_mtime) dt = datetime.isoformat(dt) - logger.debug( - 'parse_directory_listing:file_stat:date: ' + repr(dt)) + logger.debug("parse_directory_listing:file_stat:date: " + repr(dt)) - except ParserError as pe: + except ParserError: # this is likely a directory line from an ls -LR listing. Strip # trailing colon and keep track of the base directory - if not line.endswith(':'): + if not line.endswith(":"): raise Exception( - 'Unknown directory listing line format: #%(ln)d: %(line)r' % locals()) - base_dir = line.strip(':') + "Unknown directory listing line format: #%(ln)d: %(line)r" + % locals() + ) + base_dir = line.strip(":") continue - if file_stat._st_name in ('.', '..'): + if file_stat._st_name in (".", ".."): continue - entry = Entry.from_stat( - file_stat, base_dir=base_dir, use_utc_time=False) + entry = Entry.from_stat(file_stat, base_dir=base_dir, use_utc_time=False) if entry: yield entry diff --git a/minecode/management/commands/__init__.py b/minecode/management/commands/__init__.py index 27a8178c..a629f92f 100644 --- a/minecode/management/commands/__init__.py +++ b/minecode/management/commands/__init__.py @@ -9,8 +9,8 @@ import logging -from os import getenv import traceback +from os import getenv from django.conf import settings from django.core.management.base import BaseCommand @@ -25,7 +25,7 @@ class VerboseCommand(BaseCommand): """ def get_verbosity(self, **options): - verbosity = int(options.get('verbosity', 1)) + verbosity = int(options.get("verbosity", 1)) levels = {1: logging.INFO, 2: logging.ERROR, 3: logging.DEBUG} return levels.get(verbosity, logging.CRITICAL) @@ -42,17 +42,13 @@ def stop_handler(cls, *args, **kwargs): def get_error_message(e): - """ - Return an error message with a traceback given an exception. - """ + """Return an error message with a traceback given an exception.""" tb = traceback.format_exc() - msg = e.__class__.__name__ + ' ' + repr(e) - msg += '\n' + tb + msg = e.__class__.__name__ + " " + repr(e) + msg += "\n" + tb return msg def get_settings(var_name): - """ - Return the settings value from the environment or Django settings. 
- """ - return getenv(var_name) or getattr(settings, var_name, None) or '' + """Return the settings value from the environment or Django settings.""" + return getenv(var_name) or getattr(settings, var_name, None) or "" diff --git a/minecode/management/commands/check_licenses.py b/minecode/management/commands/check_licenses.py index 5716c626..fbab6dcc 100644 --- a/minecode/management/commands/check_licenses.py +++ b/minecode/management/commands/check_licenses.py @@ -9,18 +9,17 @@ import codecs -from functools import reduce import json import logging import operator import os import sys +from functools import reduce from django.db.models import Q -from packagedb.models import Package - from minecode.management.commands import VerboseCommand +from packagedb.models import Package """ Utility command to find license oddities. @@ -35,19 +34,20 @@ class Command(VerboseCommand): - help = ('Find packages with an ambiguous declared license.') + help = "Find packages with an ambiguous declared license." def add_arguments(self, parser): parser.add_argument( - '-o', '--output', type=str, - help='Define the output file name') + "-o", "--output", type=str, help="Define the output file name" + ) parser.add_argument( - '--types', - dest='types', - default='maven', - action='store', - help='Package types to check, comma-separated [maven]') + "--types", + dest="types", + default="maven", + action="store", + help="Package types to check, comma-separated [maven]", + ) def handle(self, *args, **options): """ @@ -57,51 +57,57 @@ def handle(self, *args, **options): """ logger.setLevel(self.get_verbosity(**options)) - output_filename = options.get('output') + output_filename = options.get("output") - types = options.get('types') - types = [t.strip() for t in types.split(',') if t.strip()] + types = options.get("types") + types = [t.strip() for t in types.split(",") if t.strip()] packages_with_ambiguous_licenses = find_ambiguous_packages(types=types) file_location = os.path.abspath(output_filename) found_counter = dump( - packages=packages_with_ambiguous_licenses, json_location=file_location) + packages=packages_with_ambiguous_licenses, json_location=file_location + ) visited_counter = Package.objects.filter(type__in=types).count() - self.stdout.write('Visited {} packages'.format(visited_counter)) - self.stdout.write('Found {} possible packages'.format(found_counter)) + self.stdout.write(f"Visited {visited_counter} packages") + self.stdout.write(f"Found {found_counter} possible packages") if found_counter > 0: - self.stdout.write( - 'Found packages dumped to: {}'.format(file_location)) + self.stdout.write(f"Found packages dumped to: {file_location}") -def find_ambiguous_packages(types=('maven',), keywords=('unknown', 'proprietary', 'commercial',)): +def find_ambiguous_packages( + types=("maven",), + keywords=( + "unknown", + "proprietary", + "commercial", + ), +): """ Search the package DB and yield the package that declared_license and license_expression contain "unknown", "proprietary" and "commercial" words. 
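# ---- [Editor's note: illustrative sketch, not part of the patch] ----
# A minimal sketch of how the keyword filters built below combine into one
# ORed Django queryset filter.
import operator
from functools import reduce

from django.db.models import Q

keywords = ("unknown", "proprietary", "commercial")
filters = [Q(declared_license_expression__icontains=w) for w in keywords]
filters.extend(Q(other_license_expression__icontains=w) for w in keywords)
license_filter = reduce(operator.or_, filters)
# Package.objects.filter(license_filter) then matches a Package when either
# license expression field contains any of the keywords.
# ---- [End editor's note] ----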
""" # filter to detect declared_license field filter_expression = [ - Q(declared_license_expression__icontains=word) for word in keywords] + Q(declared_license_expression__icontains=word) for word in keywords + ] # filter to detect license_expression field, add or relationship between these two fields filter_expression.extend( - [Q(other_license_expression__icontains=word) for word in keywords]) + [Q(other_license_expression__icontains=word) for word in keywords] + ) license_filter = reduce(operator.or_, filter_expression) - for package in Package.objects.filter(type__in=types).filter(license_filter): - yield package + yield from Package.objects.filter(type__in=types).filter(license_filter) def dump(packages, json_location): - """ - Dump the packages as json format at the passing json_location and return the count of the packages. - """ + """Dump the packages as json format at the passing json_location and return the count of the packages.""" if not packages: return 0 packages = [p.to_dict() for p in packages] if packages: - with codecs.open(json_location, mode='wb', encoding='utf-8') as expect: - json.dump(packages, expect, indent=2, separators=(',', ': ')) + with codecs.open(json_location, mode="wb", encoding="utf-8") as expect: + json.dump(packages, expect, indent=2, separators=(",", ": ")) return len(packages) diff --git a/minecode/management/commands/check_uri.py b/minecode/management/commands/check_uri.py index f3a67355..263ab3ef 100644 --- a/minecode/management/commands/check_uri.py +++ b/minecode/management/commands/check_uri.py @@ -14,11 +14,11 @@ from django.core.management.base import BaseCommand +from minecode import map_router + # NOTE: mappers and visitors are Unused Import here: But importing the mappers # module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA -from minecode import map_router +from minecode import miners # NOQA from minecode import visit_router from minecode.models import ResourceURI from minecode.route import NoRouteAvailable @@ -31,53 +31,54 @@ class Command(BaseCommand): - help = 'Print diagnostic information on a given URI prefix.' + help = "Print diagnostic information on a given URI prefix." def add_arguments(self, parser): parser.add_argument( - '--uri-prefix', - dest='uri_prefix', - action='store', - help='URI prefix to check.') + "--uri-prefix", + dest="uri_prefix", + action="store", + help="URI prefix to check.", + ) parser.add_argument( - '--limit', - dest='limit', + "--limit", + dest="limit", default=10, - action='store', - help='Maximum number of records to return.') + action="store", + help="Maximum number of records to return.", + ) parser.add_argument( - '--show-data', - dest='show_data', + "--show-data", + dest="show_data", default=False, - action='store_true', - help='URI prefix to check.') + action="store_true", + help="URI prefix to check.", + ) def handle(self, *args, **options): - """ - Check uris and print diagnostic information as JSON. 
-        """
-        uri_prefix = options.get('uri_prefix')
-        limit = options.get('limit', 10)
-        show_data = options.get('show_data')
+        """Check URIs and print diagnostic information as JSON."""
+        uri_prefix = options.get("uri_prefix")
+        limit = options.get("limit", 10)
+        show_data = options.get("show_data")
 
         # get the last 10 uris
-        uris = ResourceURI.objects.filter(
-            uri__startswith=uri_prefix).order_by("-id")[:limit]
+        uris = ResourceURI.objects.filter(uri__startswith=uri_prefix).order_by("-id")[
+            :limit
+        ]
 
         # TODO: report whether the uri can be resolved by the visit and/or map router
         for uri in uris:
-
             try:
                 # FIXME: resolve() returns an actual Visitor object, using module names for now
                 visit_route_resolve = repr(visit_router.resolve(uri.uri))
             except NoRouteAvailable:
-                visit_route_resolve = 'No Route Availible'
+                visit_route_resolve = "No Route Available"
 
             try:
                 # FIXME: resolve() returns an actual Mapper object, using module names for now
                 map_route_resolve = repr(map_router.resolve(uri.uri))
             except NoRouteAvailable:
-                map_route_resolve = 'No Route Availible'
+                map_route_resolve = "No Route Available"
 
             if uri.last_visit_date:
                 last_visit_date = uri.last_visit_date.isoformat()
@@ -94,24 +95,26 @@ def handle(self, *args, **options):
             else:
                 wip_date = None
 
-            uri_info = dict([
-                ('id', uri.id),
-                ('uri', uri.uri),
-                ('source_uri', uri.source_uri),
-                ('priority', uri.priority),
-                ('mining_level', uri.mining_level),
-                ('visit_route', visit_route_resolve),
-                ('map_route', map_route_resolve),
-                ('is_visitable', uri.is_visitable),
-                ('is_mappable', uri.is_mappable),
-                ('last_visit_date', last_visit_date),
-                ('last_map_date', last_map_date),
-                ('wip_date', wip_date),
-                ('visit_error', uri.visit_error),
-                ('map_error', uri.map_error),
-            ])
+            uri_info = dict(
+                [
+                    ("id", uri.id),
+                    ("uri", uri.uri),
+                    ("source_uri", uri.source_uri),
+                    ("priority", uri.priority),
+                    ("mining_level", uri.mining_level),
+                    ("visit_route", visit_route_resolve),
+                    ("map_route", map_route_resolve),
+                    ("is_visitable", uri.is_visitable),
+                    ("is_mappable", uri.is_mappable),
+                    ("last_visit_date", last_visit_date),
+                    ("last_map_date", last_map_date),
+                    ("wip_date", wip_date),
+                    ("visit_error", uri.visit_error),
+                    ("map_error", uri.map_error),
+                ]
+            )
 
             if show_data:
-                uri_info.update({'data': uri.data})
+                uri_info.update({"data": uri.data})
 
             print(json.dumps(uri_info, indent=2))
diff --git a/minecode/management/commands/create-scan-queue-worker-user.py b/minecode/management/commands/create-scan-queue-worker-user.py
index 7708f963..7111c539 100644
--- a/minecode/management/commands/create-scan-queue-worker-user.py
+++ b/minecode/management/commands/create-scan-queue-worker-user.py
@@ -8,24 +8,24 @@
 #
 
 from django.contrib.auth.models import Group
+
 from minecode.management.user_creation import CreateUserCommand
 
 
 class Command(CreateUserCommand):
-    help = 'Create a user and generate an API key for a scan queue worker'
+    help = "Create a user and generate an API key for a scan queue worker"
 
     def handle(self, *args, **options):
-        username = options['username']
-        interactive = options['interactive']
-        verbosity = options['verbosity']
+        username = options["username"]
+        interactive = options["interactive"]
+        verbosity = options["verbosity"]
 
         user = self.create_user(
-            username=username,
-            interactive=interactive,
-            verbosity=verbosity
+            username=username, interactive=interactive, verbosity=verbosity
        )
 
         # Add user to `scan_queue_workers` group
         scan_queue_workers_group, _ = Group.objects.get_or_create(
-            name='scan_queue_workers')
+            name="scan_queue_workers"
+        )
        scan_queue_workers_group.user_set.add(user)
-        msg = f'User {username} added to `scan_queue_workers` group'
+        msg = f"User {username} added to `scan_queue_workers` group"
        self.stdout.write(msg, self.style.SUCCESS)
diff --git a/minecode/management/commands/create-user.py b/minecode/management/commands/create-user.py
index 47087f27..9b618ba1 100644
--- a/minecode/management/commands/create-user.py
+++ b/minecode/management/commands/create-user.py
@@ -11,14 +11,12 @@


class Command(CreateUserCommand):
-    help = 'Create a user and generate an API key for a scan queue worker'
+    help = "Create a user and generate an API key"

    def handle(self, *args, **options):
-        username = options['username']
-        interactive = options['interactive']
-        verbosity = options['verbosity']
+        username = options["username"]
+        interactive = options["interactive"]
+        verbosity = options["verbosity"]

        self.create_user(
-            username=username,
-            interactive=interactive,
-            verbosity=verbosity
+            username=username, interactive=interactive, verbosity=verbosity
        )
diff --git a/minecode/management/commands/dump_purls.py b/minecode/management/commands/dump_purls.py
index a81d8aba..62494819 100644
--- a/minecode/management/commands/dump_purls.py
+++ b/minecode/management/commands/dump_purls.py
@@ -14,20 +14,20 @@

def dump_purls(package_type, output):
-    """
-    Dump packagedb purls for ``package_type`` as JSON lines in the ``output`` files
-    """
-    compact_separators = (u',', u':',)
+    """Dump packagedb purls for ``package_type`` as JSON lines in the ``output`` files."""
+    compact_separators = (
+        ",",
+        ":",
+    )
    out = None
    for i, package in enumerate(Package.objects.filter(type=package_type).all()):
        if not output:
            out = open(f"{output}-{i}.json", "w")
-        purl = dict(purl=package.package_url,
-                    download_url=package.download_url)
+        purl = dict(purl=package.package_url, download_url=package.download_url)
        if not i % 500:
            print(f"#{i} purl: {package.package_url}")
        out.write(json.dumps(purl, separators=compact_separators))
-        out.write('\n')
+        out.write("\n")
        if not i % 1000000:
            out.close()
            out = None
diff --git a/minecode/management/commands/get_maven_release_dates.py b/minecode/management/commands/get_maven_release_dates.py
index 5dc1e928..e3ab2f35 100644
--- a/minecode/management/commands/get_maven_release_dates.py
+++ b/minecode/management/commands/get_maven_release_dates.py
@@ -7,19 +7,18 @@
# See https://aboutcode.org for more information about nexB OSS projects.
# -from dateutil.parser import parse as dateutil_parse -from os.path import dirname import logging import sys +from os.path import dirname import requests +from dateutil.parser import parse as dateutil_parse +from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import filter_for_artifacts from minecode.management.commands import VerboseCommand -from minecode.visitors.maven import collect_links_from_text -from minecode.visitors.maven import filter_for_artifacts from packagedb.models import Package - logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) @@ -30,51 +29,56 @@ class Command(VerboseCommand): - help = 'Get and set release_date for Maven Packages' + help = "Get and set release_date for Maven Packages" def handle(self, *args, **options): queryset = Package.objects.filter( - type='maven', + type="maven", release_date=None, - download_url__startswith='https://repo1.maven.org/maven2' + download_url__startswith="https://repo1.maven.org/maven2", ) object_count = queryset.count() chunk_size = 2000 iterator = queryset.iterator(chunk_size=chunk_size) unsaved_objects = [] - logger.info(f'Updating release_date for {object_count} packages') + logger.info(f"Updating release_date for {object_count} packages") for index, package in enumerate(iterator, start=1): download_url = package.download_url package_url = package.package_url logger.info( - f'Updating release_date for package {package_url} ({download_url})') + f"Updating release_date for package {package_url} ({download_url})" + ) package_version_page_url = dirname(download_url) - filename = download_url.rsplit('/')[-1] + filename = download_url.rsplit("/")[-1] response = requests.get(package_version_page_url) if response: timestamps_by_links = collect_links_from_text( - response.text, filter=filter_for_artifacts) + response.text, filter=filter_for_artifacts + ) timestamp = timestamps_by_links.get(filename) if not timestamp: logger.info( - f'\tCould not get release_date for package {package_url} ({download_url})') + f"\tCould not get release_date for package {package_url} ({download_url})" + ) continue timestamp = dateutil_parse(timestamp) package.release_date = timestamp unsaved_objects.append(package) logger.info( - f'\t{package_url} ({download_url}) release_date has been updated to {timestamp}') + f"\t{package_url} ({download_url}) release_date has been updated to {timestamp}" + ) else: logger.info( - f'\t{package_url} not updated: error encountered when visiting {package_version_page_url}') + f"\t{package_url} not updated: error encountered when visiting {package_version_page_url}" + ) if not (index % chunk_size) and unsaved_objects: - logger.info(f'{index:,} / {object_count:,} Packages processed') + logger.info(f"{index:,} / {object_count:,} Packages processed") - logger.info('Updating Package objects...') + logger.info("Updating Package objects...") updated_packages_count = Package.objects.bulk_update( objs=unsaved_objects, - fields=['release_date'], + fields=["release_date"], batch_size=1000, ) - logger.info(f'Updated {updated_packages_count} Package objects') + logger.info(f"Updated {updated_packages_count} Package objects") diff --git a/minecode/management/commands/get_status.py b/minecode/management/commands/get_status.py index 144d2850..c92358be 100644 --- a/minecode/management/commands/get_status.py +++ b/minecode/management/commands/get_status.py @@ -23,29 +23,43 @@ class Command(BaseCommand): - help = 'Print status information for the 
minecode system.' + help = "Print status information for the minecode system." def handle(self, *args, **options): - counts = dict([ - ('total_packages', Package.objects.count()), - ('total_uri', ResourceURI.objects.count()), - ('unique_uri', ResourceURI.objects.distinct().count()), - - ('visitables', ResourceURI.objects.get_visitables().count()), - ('visited', ResourceURI.objects.visited().count()), - ('successfully_visited', ResourceURI.objects.successfully_visited().count()), - ('unsuccessfully_visited', - ResourceURI.objects.unsuccessfully_visited().count()), - ('never_visited', ResourceURI.objects.never_visited().count()), - ('visit_in_progress', ResourceURI.objects.filter( - wip_date__isnull=False, last_visit_date__isnull=True).count()), - - ('mappables', ResourceURI.objects.get_mappables().count()), - ('mapped', ResourceURI.objects.mapped().count()), - ('successfully_mapped', ResourceURI.objects.successfully_mapped().count()), - ('unsuccessfully_mapped', - ResourceURI.objects.unsuccessfully_mapped().count()), - ('never_mapped', ResourceURI.objects.never_mapped().count()), - ]) + counts = dict( + [ + ("total_packages", Package.objects.count()), + ("total_uri", ResourceURI.objects.count()), + ("unique_uri", ResourceURI.objects.distinct().count()), + ("visitables", ResourceURI.objects.get_visitables().count()), + ("visited", ResourceURI.objects.visited().count()), + ( + "successfully_visited", + ResourceURI.objects.successfully_visited().count(), + ), + ( + "unsuccessfully_visited", + ResourceURI.objects.unsuccessfully_visited().count(), + ), + ("never_visited", ResourceURI.objects.never_visited().count()), + ( + "visit_in_progress", + ResourceURI.objects.filter( + wip_date__isnull=False, last_visit_date__isnull=True + ).count(), + ), + ("mappables", ResourceURI.objects.get_mappables().count()), + ("mapped", ResourceURI.objects.mapped().count()), + ( + "successfully_mapped", + ResourceURI.objects.successfully_mapped().count(), + ), + ( + "unsuccessfully_mapped", + ResourceURI.objects.unsuccessfully_mapped().count(), + ), + ("never_mapped", ResourceURI.objects.never_mapped().count()), + ] + ) print(json.dumps(counts, indent=2)) diff --git a/minecode/management/commands/import_queue.py b/minecode/management/commands/import_queue.py index 52c0c355..0be7a9f2 100644 --- a/minecode/management/commands/import_queue.py +++ b/minecode/management/commands/import_queue.py @@ -7,30 +7,29 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from dateutil.parser import parse as dateutil_parse import logging import signal import sys import time -import requests - from django.db import transaction from django.utils import timezone + +import requests +from dateutil.parser import parse as dateutil_parse +from packagedcode.models import PackageData from packageurl import PackageURL -from minecode.management.commands import get_error_message +from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import determine_namespace_name_version_from_url +from minecode.collectors.maven import filter_only_directories +from minecode.collectors.maven import get_artifact_links +from minecode.collectors.maven import get_artifact_sha1 +from minecode.collectors.maven import get_classifier_from_artifact_url from minecode.management.commands import VerboseCommand -from minecode.models import ImportableURI -from minecode.visitors.maven import get_artifact_links -from minecode.visitors.maven import get_classifier_from_artifact_url -from minecode.visitors.maven import collect_links_from_text -from minecode.visitors.maven import filter_only_directories -from minecode.visitors.maven import get_artifact_sha1 +from minecode.management.commands import get_error_message from minecode.model_utils import merge_or_create_package -from packagedcode.models import PackageData -from minecode.visitors.maven import determine_namespace_name_version_from_url - +from minecode.models import ImportableURI logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -47,9 +46,7 @@ def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. - """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -58,7 +55,7 @@ def stop_handler(*args, **kwargs): class Command(VerboseCommand): - help = 'Run a Package request queue.' + help = "Run a Package request queue." def handle(self, *args, **options): """ @@ -66,7 +63,6 @@ def handle(self, *args, **options): processing. Loops forever and sleeps a short while if there are no PriorityResourceURI left to process. 
""" - global MUST_STOP sleeping = False @@ -74,7 +70,7 @@ def handle(self, *args, **options): while True: if MUST_STOP: - logger.info('Graceful exit of the request queue.') + logger.info("Graceful exit of the request queue.") break with transaction.atomic(): @@ -84,7 +80,7 @@ def handle(self, *args, **options): # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No more processable request, sleeping...') + logger.info("No more processable request, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -92,12 +88,13 @@ def handle(self, *args, **options): sleeping = False # process request - logger.info('Processing {}'.format(importable_uri)) + logger.info(f"Processing {importable_uri}") try: errors = process_request(importable_uri) except Exception as e: - errors = 'Error: Failed to process ImportableURI: {}\n'.format( - repr(importable_uri)) + errors = ( + f"Error: Failed to process ImportableURI: {repr(importable_uri)}\n" + ) errors += get_error_message(e) finally: if errors: @@ -113,7 +110,7 @@ def handle(self, *args, **options): def process_request(importable_uri): uri = importable_uri.uri - uri = uri.rstrip('/') + uri = uri.rstrip("/") data = importable_uri.data if not data: # collect data again if we don't have it @@ -130,22 +127,24 @@ def process_request(importable_uri): namespace, name, _ = determine_namespace_name_version_from_url(uri) timestamps_by_directory_links = collect_links_from_text( - data, filter_only_directories) + data, filter_only_directories + ) # Go into each version directory for directory_link in timestamps_by_directory_links.keys(): - version = directory_link.rstrip('/') - version_page_url = f'{uri}/{version}' + version = directory_link.rstrip("/") + version_page_url = f"{uri}/{version}" timestamps_by_artifact_links = get_artifact_links(version_page_url) for artifact_link, timestamp in timestamps_by_artifact_links.items(): sha1 = get_artifact_sha1(artifact_link) classifier = get_classifier_from_artifact_url( - artifact_link, version_page_url, name, version) + artifact_link, version_page_url, name, version + ) qualifiers = None if classifier: - qualifiers = f'classifier={classifier}' + qualifiers = f"classifier={classifier}" release_date = dateutil_parse(timestamp) package_data = PackageData( - type='maven', + type="maven", namespace=namespace, name=name, version=version, @@ -155,14 +154,13 @@ def process_request(importable_uri): release_date=release_date, ) package, created, merged, map_error = merge_or_create_package( - scanned_package=package_data, - visit_level=50 + scanned_package=package_data, visit_level=50 ) if created: - logger.info(f'Created package {package}') + logger.info(f"Created package {package}") if merged: - logger.info(f'Updated package {package}') + logger.info(f"Updated package {package}") if map_error: - logger.error(f'Error encountered: {map_error}') + logger.error(f"Error encountered: {map_error}") importable_uri.processing_error = map_error importable_uri.save() diff --git a/minecode/management/commands/increase_scannableuri_priority.py b/minecode/management/commands/increase_scannableuri_priority.py index 5724ec56..e6c237d7 100644 --- a/minecode/management/commands/increase_scannableuri_priority.py +++ b/minecode/management/commands/increase_scannableuri_priority.py @@ -2,16 +2,13 @@ # Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
#
-from __future__ import absolute_import
-from __future__ import unicode_literals

import logging
import sys

-from minecode.models import ScannableURI
-from minecode.management.commands import get_error_message
from minecode.management.commands import VerboseCommand
-
+from minecode.management.commands import get_error_message
+from minecode.models import ScannableURI

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
@@ -21,16 +18,21 @@

class Command(VerboseCommand):
    logger = logger

-    help = 'Increase the priority of the Package to be scanned'
+    help = "Increase the priority of the Package to be scanned"

    def add_arguments(self, parser):
-        parser.add_argument('--pattern', '-p', action='store', dest='pattern',
-                            help='Only increase the priority of URIs matching this regex pattern.')
+        parser.add_argument(
+            "--pattern",
+            "-p",
+            action="store",
+            dest="pattern",
+            help="Only increase the priority of URIs matching this regex pattern.",
+        )

    def handle(self, *args, **options):
        logger.setLevel(self.get_verbosity(**options))

-        pattern = options.get('pattern')
+        pattern = options.get("pattern")

        for scannable_uri in ScannableURI.objects.filter(uri__iregex=pattern):
            uri = scannable_uri.uri
@@ -38,8 +40,8 @@ def handle(self, *args, **options):
                # Priority is arbitrarily set to 100 to immediately increase its processing priority
                scannable_uri.priority = 100
                scannable_uri.save()
-                logger.info('Increased priority of: '.format(uri))
+                logger.info(f"Increased priority of: {uri}")
            except Exception as e:
-                msg = 'Error setting priority for: '.format(uri)
+                msg = f"Error setting priority for: {uri}"
                msg += get_error_message(e)
                logger.error(msg)
diff --git a/minecode/management/commands/load_priority_queue.py b/minecode/management/commands/load_priority_queue.py
index 8e4fbfde..8af14d3b 100644
--- a/minecode/management/commands/load_priority_queue.py
+++ b/minecode/management/commands/load_priority_queue.py
@@ -11,12 +11,11 @@
import sys

import requests
-
from commoncode.resource import VirtualCodebase
+
from minecode.management.commands import VerboseCommand
from minecode.models import PriorityResourceURI

-
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
logger.setLevel(logging.INFO)
@@ -27,33 +26,35 @@


class Command(VerboseCommand):
-    help = 'Run a Package request queue.'
+    help = "Run a Package request queue."
def add_arguments(self, parser): parser.add_argument("--input", type=str) def handle(self, *args, **options): - input = options.get('input') + input = options.get("input") if input: vc = VirtualCodebase(location=input) for resource in vc.walk(): if not resource.sha1: continue - maven_api_search_url = f'https://search.maven.org/solrsearch/select?q=1:{resource.sha1}' + maven_api_search_url = ( + f"https://search.maven.org/solrsearch/select?q=1:{resource.sha1}" + ) response = requests.get(maven_api_search_url) if not response.ok: - logger.error( - f"API query failed for: {maven_api_search_url}") + logger.error(f"API query failed for: {maven_api_search_url}") continue contents = response.json() - resp = contents.get('response', {}) - if resp.get('numFound', 0) > 0: - for matched_package in resp.get('docs', []): - namespace = matched_package.get('g', '') - name = matched_package.get('a', '') - version = matched_package.get('v', '') + resp = contents.get("response", {}) + if resp.get("numFound", 0) > 0: + for matched_package in resp.get("docs", []): + namespace = matched_package.get("g", "") + name = matched_package.get("a", "") + version = matched_package.get("v", "") if namespace and name and version: - purl = f'pkg:maven/{namespace}/{name}@{version}' + purl = f"pkg:maven/{namespace}/{name}@{version}" PriorityResourceURI.objects.create( - uri=purl, package_url=purl, sha1=resource.sha1) - logger.info(f'Added {purl} to priority queue') + uri=purl, package_url=purl, sha1=resource.sha1 + ) + logger.info(f"Added {purl} to priority queue") diff --git a/minecode/management/commands/make_scannableuris.py b/minecode/management/commands/make_scannableuris.py index 992b9c76..4384c287 100644 --- a/minecode/management/commands/make_scannableuris.py +++ b/minecode/management/commands/make_scannableuris.py @@ -2,18 +2,15 @@ # Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. # -from __future__ import absolute_import -from __future__ import unicode_literals import logging import sys from django.core.management.base import BaseCommand -from packagedb.models import Package from minecode.management.commands import get_error_message from minecode.models import ScannableURI - +from packagedb.models import Package logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -23,19 +20,18 @@ class Command(BaseCommand): logger = logger - help = 'Create ScannableURIs from Packages' + help = "Create ScannableURIs from Packages" def handle(self, *args, **options): for package in Package.objects.all(): package_uri = package.download_url try: _, created = ScannableURI.objects.get_or_create( - uri=package_uri, - package=package + uri=package_uri, package=package ) if created: - self.stdout.write('ScannableURI created for: {}'.format(package_uri)) + self.stdout.write(f"ScannableURI created for: {package_uri}") except Exception as e: - msg = 'Error creating ScannableURI for: {}'.format(package_uri) + msg = f"Error creating ScannableURI for: {package_uri}" msg += get_error_message(e) logger.error(msg) diff --git a/minecode/management/commands/manage_scans.py b/minecode/management/commands/manage_scans.py index a361f227..77ed3246 100644 --- a/minecode/management/commands/manage_scans.py +++ b/minecode/management/commands/manage_scans.py @@ -2,19 +2,16 @@ # Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
# -import time import logging import signal import sys +import time from django.db import transaction from django.utils import timezone - -from minecode.models import ScannableURI - from minecode.management.commands import VerboseCommand - +from minecode.models import ScannableURI logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) @@ -26,39 +23,40 @@ class ScanningCommand(VerboseCommand): - """ - Base command class for processing ScannableURIs. - """ + """Base command class for processing ScannableURIs.""" + # subclasses must override logger = None def add_arguments(self, parser): parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', + "--exit-on-empty", + dest="exit_on_empty", default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') + action="store_true", + help="Do not loop forever. Exit when the queue is empty.", + ) parser.add_argument( - '--max-uris', - dest='max_uris', + "--max-uris", + dest="max_uris", default=0, - action='store', - help='Limit the number of Scannable URIs processed to a maximum number. ' - '0 means no limit. Used only for testing.') + action="store", + help="Limit the number of Scannable URIs processed to a maximum number. " + "0 means no limit. Used only for testing.", + ) def handle(self, *args, **options): - exit_on_empty = options.get('exit_on_empty') - max_uris = options.get('max_uris', 0) + exit_on_empty = options.get("exit_on_empty") + max_uris = options.get("max_uris", 0) uris_counter = self.process_scans( exit_on_empty=exit_on_empty, max_uris=max_uris, # Pass options to allow subclasses to add their own options - options=options + options=options, ) - self.stdout.write('Processed {} ScannableURI.'.format(uris_counter)) + self.stdout.write(f"Processed {uris_counter} ScannableURI.") @classmethod def process_scans(cls, exit_on_empty=False, max_uris=0, **kwargs): @@ -77,29 +75,35 @@ def process_scans(cls, exit_on_empty=False, max_uris=0, **kwargs): time.sleep(10) if cls.MUST_STOP: - cls.logger.info('Graceful exit of the scan processing loop.') + cls.logger.info("Graceful exit of the scan processing loop.") break if max_uris and uris_counter >= max_uris: - cls.logger.info('max_uris requested reached: exiting scan processing loop.') + cls.logger.info( + "max_uris requested reached: exiting scan processing loop." + ) break scannable_uri = cls.get_next_uri() if not scannable_uri: if exit_on_empty: - cls.logger.info('exit-on-empty requested: No more scannable URIs, exiting...') + cls.logger.info( + "exit-on-empty requested: No more scannable URIs, exiting..." + ) break # Only log a single message when we go to sleep if not sleeping: sleeping = True - cls.logger.info('No more scannable URIs, sleeping for at least {} seconds...'.format(SLEEP_WHEN_EMPTY)) + cls.logger.info( + f"No more scannable URIs, sleeping for at least {SLEEP_WHEN_EMPTY} seconds..." + ) time.sleep(SLEEP_WHEN_EMPTY) continue - cls.logger.info('Processing scannable URI: {}'.format(scannable_uri)) + cls.logger.info(f"Processing scannable URI: {scannable_uri}") cls.process_scan(scannable_uri, **kwargs) uris_counter += 1 @@ -129,11 +133,12 @@ def process_scan(scannable_uri, **kwargs): class Command(ScanningCommand): - logger = logger - help = ('Check scancode.io requested scans for status then fetch and process ' - 'completed scans for indexing and updates.') + help = ( + "Check scancode.io requested scans for status then fetch and process " + "completed scans for indexing and updates." 
+ ) def handle(self, *args, **options): logger.setLevel(self.get_verbosity(**options)) @@ -146,15 +151,24 @@ def get_next_uri(self): return scannable_uri @classmethod - def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_save_loc='', **kwargs): + def process_scan( + cls, + scannable_uri, + get_scan_info_save_loc="", + get_scan_data_save_loc="", + **kwargs, + ): """ Manage a ScannableURI based on its status. - For submitted but not completed scans, check the timestamp of when the scan was submitted, if it has been past some time, then we set the scan as timed out - For timed out scans, we set that as failed and then create a new one? """ - logger.info('Checking scan for URI: {}'.format(scannable_uri)) + logger.info(f"Checking scan for URI: {scannable_uri}") - if scannable_uri.scan_status in (ScannableURI.SCAN_SUBMITTED, ScannableURI.SCAN_IN_PROGRESS): + if scannable_uri.scan_status in ( + ScannableURI.SCAN_SUBMITTED, + ScannableURI.SCAN_IN_PROGRESS, + ): scan_duration = timezone.now() - scannable_uri.scan_date scan_duration_hours = scan_duration.seconds / (60 * 60) @@ -162,7 +176,7 @@ def process_scan(cls, scannable_uri, get_scan_info_save_loc='', get_scan_data_sa scannable_uri.scan_status = ScannableURI.SCAN_TIMEOUT scannable_uri.wip_date = None scannable_uri.save() - logger.info('Scan for URI has timed out: {}'.format(scannable_uri)) + logger.info(f"Scan for URI has timed out: {scannable_uri}") # support graceful death when used as a service diff --git a/minecode/management/commands/maven_crawler.py b/minecode/management/commands/maven_crawler.py index 647a94ae..df6da9cf 100644 --- a/minecode/management/commands/maven_crawler.py +++ b/minecode/management/commands/maven_crawler.py @@ -10,10 +10,9 @@ import logging import sys -from minecode.visitors.maven import crawl_maven_repo_from_root +from minecode.collectors.maven import crawl_maven_repo_from_root from minecode.management.commands import VerboseCommand - logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) @@ -24,8 +23,8 @@ class Command(VerboseCommand): - help = 'Run a Package request queue.' + help = "Run a Package request queue." def handle(self, *args, **options): - maven_root_url = 'https://repo.maven.apache.org/maven2' + maven_root_url = "https://repo.maven.apache.org/maven2" crawl_maven_repo_from_root(root_url=maven_root_url) diff --git a/minecode/management/commands/priority_queue.py b/minecode/management/commands/priority_queue.py index 94fc4edd..510febbb 100644 --- a/minecode/management/commands/priority_queue.py +++ b/minecode/management/commands/priority_queue.py @@ -16,16 +16,14 @@ from django.utils import timezone # UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import visitors # NOQA +# But importing the collectors module triggers routes registration +from minecode import collectors # NOQA from minecode import priority_router -from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand +from minecode.management.commands import get_error_message from minecode.models import PriorityResourceURI -from minecode.models import ScannableURI from minecode.route import NoRouteAvailable - logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) @@ -41,9 +39,7 @@ def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. 
- """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -52,7 +48,7 @@ def stop_handler(*args, **kwargs): class Command(VerboseCommand): - help = 'Run a Package request queue.' + help = "Run a Package request queue." def handle(self, *args, **options): """ @@ -60,7 +56,6 @@ def handle(self, *args, **options): processing. Loops forever and sleeps a short while if there are no PriorityResourceURI left to process. """ - global MUST_STOP sleeping = False @@ -68,7 +63,7 @@ def handle(self, *args, **options): while True: if MUST_STOP: - logger.info('Graceful exit of the request queue.') + logger.info("Graceful exit of the request queue.") break with transaction.atomic(): @@ -78,7 +73,7 @@ def handle(self, *args, **options): # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No more processable request, sleeping...') + logger.info("No more processable request, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -86,12 +81,11 @@ def handle(self, *args, **options): sleeping = False # process request - logger.info('Processing {}'.format(priority_resource_uri)) + logger.info(f"Processing {priority_resource_uri}") try: errors = process_request(priority_resource_uri) except Exception as e: - errors = 'Error: Failed to process PriorityResourceURI: {}\n'.format( - repr(priority_resource_uri)) + errors = f"Error: Failed to process PriorityResourceURI: {repr(priority_resource_uri)}\n" errors += get_error_message(e) finally: if errors: @@ -113,24 +107,20 @@ def process_request(priority_resource_uri, _priority_router=priority_router): try: if TRACE: - logger.debug('visit_uri: uri: {}'.format(purl_to_visit)) + logger.debug(f"visit_uri: uri: {purl_to_visit}") kwargs = dict() if source_purl: - kwargs['source_purl'] = source_purl + kwargs["source_purl"] = source_purl if addon_pipelines: - kwargs['addon_pipelines'] = addon_pipelines + kwargs["addon_pipelines"] = addon_pipelines if priority: - kwargs['priority'] = priority + kwargs["priority"] = priority errors = _priority_router.process(purl_to_visit, **kwargs) - if TRACE: - new_uris_to_visit = list(new_uris_to_visit or []) - logger.debug( - 'visit_uri: new_uris_to_visit: {}'.format(new_uris_to_visit)) return errors except NoRouteAvailable: - error = f'No route available for {purl_to_visit}' + error = f"No route available for {purl_to_visit}" logger.error(error) # TODO: For now, when a route is not yet supported, we keep a value for # the wip_date value so the instance is not back in the queue. It will diff --git a/minecode/management/commands/remap.py b/minecode/management/commands/remap.py index 303655b1..68cbc403 100644 --- a/minecode/management/commands/remap.py +++ b/minecode/management/commands/remap.py @@ -22,23 +22,30 @@ class Command(BaseCommand): - help = 'Mark ResourceURIs for remapping to packages.' + help = "Mark ResourceURIs for remapping to packages." 
def handle(self, *args, **options): - q1 = Q(uri__startswith='https://repo1') - q2 = Q(uri__startswith='maven-index://') - q3 = Q(uri__startswith='https://replicate') - q4 = Q(uri__startswith='https://registry') + q1 = Q(uri__startswith="https://repo1") + q2 = Q(uri__startswith="maven-index://") + q3 = Q(uri__startswith="https://replicate") + q4 = Q(uri__startswith="https://registry") for uri in ResourceURI.objects.successfully_mapped().filter(q1 | q2 | q3 | q4): uri.last_map_date = None uri.wip_date = None uri.save() - ResourceURI.objects.successfully_mapped().filter( - uri__contains='maven').update(last_map_date=None) - ResourceURI.objects.successfully_mapped().filter( - uri__contains='npm').update(last_map_date=None) - - ResourceURI.objects.successfully_mapped().exclude(uri__startswith='http://repo1').exclude(uri__startswith='maven-index://').exclude( - uri__startswith='https://replicate').exclude(uri__startswith='https://registry.npmjs.org').update(is_mappable=False) + ResourceURI.objects.successfully_mapped().filter(uri__contains="maven").update( + last_map_date=None + ) + ResourceURI.objects.successfully_mapped().filter(uri__contains="npm").update( + last_map_date=None + ) + + ResourceURI.objects.successfully_mapped().exclude( + uri__startswith="http://repo1" + ).exclude(uri__startswith="maven-index://").exclude( + uri__startswith="https://replicate" + ).exclude(uri__startswith="https://registry.npmjs.org").update( + is_mappable=False + ) diff --git a/minecode/management/commands/run_map.py b/minecode/management/commands/run_map.py index 0ae56574..819b25e8 100644 --- a/minecode/management/commands/run_map.py +++ b/minecode/management/commands/run_map.py @@ -16,19 +16,17 @@ from django.db import transaction from django.utils import timezone -# UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA - from minecode import map_router -from minecode.models import ResourceURI -from minecode.management.commands import get_error_message + +# UnusedImport here! +# But importing the miners module triggers routes registration +from minecode import miners # NOQA from minecode.management.commands import VerboseCommand +from minecode.management.commands import get_error_message from minecode.model_utils import merge_or_create_package +from minecode.models import ResourceURI from minecode.models import ScannableURI - TRACE = True logger = logging.getLogger(__name__) @@ -43,9 +41,7 @@ def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. - """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -57,15 +53,16 @@ def stop_handler(*args, **kwargs): class Command(VerboseCommand): - help = 'Run a mapping worker.' + help = "Run a mapping worker." def add_arguments(self, parser): parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', + "--exit-on-empty", + dest="exit_on_empty", default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') + action="store_true", + help="Do not loop forever. 
Exit when the queue is empty.", + ) def handle(self, *args, **options): """ @@ -75,26 +72,26 @@ def handle(self, *args, **options): global MUST_STOP logger.setLevel(self.get_verbosity(**options)) - exit_on_empty = options.get('exit_on_empty') + exit_on_empty = options.get("exit_on_empty") sleeping = False while True: if MUST_STOP: - logger.info('Graceful exit of the map loop.') + logger.info("Graceful exit of the map loop.") break mappables = ResourceURI.objects.get_mappables()[:MAP_BATCH_SIZE] if not mappables: if exit_on_empty: - logger.info('No mappable resource, exiting...') + logger.info("No mappable resource, exiting...") break # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No mappable resource, sleeping...') + logger.info("No mappable resource, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -102,7 +99,7 @@ def handle(self, *args, **options): sleeping = False for resource_uri in mappables: - logger.info('Mapping {}'.format(resource_uri)) + logger.info(f"Mapping {resource_uri}") map_uri(resource_uri) @@ -114,16 +111,18 @@ def map_uri(resource_uri, _map_router=map_router): # FIXME: returning a string or sequence is UGLY try: mapped_scanned_packages = _map_router.process( - resource_uri.uri, resource_uri=resource_uri) + resource_uri.uri, resource_uri=resource_uri + ) - logger.debug('map_uri: Package URI: {}'.format(resource_uri.uri)) + logger.debug(f"map_uri: Package URI: {resource_uri.uri}") # consume generators mapped_scanned_packages = mapped_scanned_packages and list( - mapped_scanned_packages) + mapped_scanned_packages + ) if not mapped_scanned_packages: - msg = 'No visited scanned packages returned.' + msg = "No visited scanned packages returned." logger.error(msg) resource_uri.last_map_date = timezone.now() resource_uri.map_error = msg @@ -131,8 +130,9 @@ def map_uri(resource_uri, _map_router=map_router): return except Exception as e: - msg = 'Error: Failed to map while processing ResourceURI: {}\n'.format( - repr(resource_uri)) + msg = ( + f"Error: Failed to map while processing ResourceURI: {repr(resource_uri)}\n" + ) msg += get_error_message(e) logger.error(msg) # we had an error, so mapped_scanned_packages is an error string @@ -144,7 +144,7 @@ def map_uri(resource_uri, _map_router=map_router): # if we reached this place, we have mapped_scanned_packages that contains # packages in ScanCode models format that these are ready to save to the DB - map_error = '' + map_error = "" try: with transaction.atomic(): @@ -155,7 +155,8 @@ def map_uri(resource_uri, _map_router=map_router): for scanned_package in mapped_scanned_packages: visit_level = resource_uri.mining_level package, package_created, _, m_err = merge_or_create_package( - scanned_package, visit_level) + scanned_package, visit_level + ) map_error += m_err if package_created: # Add this Package to the scan queue @@ -165,14 +166,13 @@ def map_uri(resource_uri, _map_router=map_router): package=package, ) if scannable_uri_created: - logger.debug( - ' + Inserted ScannableURI\t: {}'.format(package_uri)) + logger.debug(f" + Inserted ScannableURI\t: {package_uri}") except Exception as e: - msg = 'Error: Failed to map while processing ResourceURI: {}\n'.format( - repr(resource_uri)) - msg += 'While processing scanned_package: {}\n'.format( - repr(scanned_package)) + msg = ( + f"Error: Failed to map while processing ResourceURI: {repr(resource_uri)}\n" + ) + msg += f"While processing scanned_package: {repr(scanned_package)}\n" msg += get_error_message(e) logger.error(msg) # 
this is enough to save the error to the ResourceURI which is done at last diff --git a/minecode/management/commands/run_visit.py b/minecode/management/commands/run_visit.py index 0bae2cca..1f923d2f 100644 --- a/minecode/management/commands/run_visit.py +++ b/minecode/management/commands/run_visit.py @@ -8,11 +8,11 @@ # -from collections import Counter import logging import signal import sys import time +from collections import Counter # FIXME: why use Django cache for this? any benefits and side effects? from django.core.cache import cache as visit_delay_by_hostname @@ -23,18 +23,14 @@ import reppy.cache # UnusedImport here! -# But importing the mappers and visitors module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA +# But importing the miners module triggers routes registration +from minecode import miners # NOQA from minecode import visit_router - -from minecode.management.commands import get_error_message from minecode.management.commands import VerboseCommand - +from minecode.management.commands import get_error_message from minecode.models import ResourceURI from minecode.route import NoRouteAvailable - logger = logging.getLogger(__name__) logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.INFO) @@ -54,15 +50,13 @@ # FIXME: we should rotate UA strings or setup our own UA # this one is for FF Windows 7 agent 32 on win7 64 as of July 2016 -USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0' +USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0" MUST_STOP = False def stop_handler(*args, **kwargs): - """ - Signal handler to set global variable to True. - """ + """Signal handler to set global variable to True.""" global MUST_STOP MUST_STOP = True @@ -71,7 +65,7 @@ def stop_handler(*args, **kwargs): class Command(VerboseCommand): - help = 'Run a visiting worker loop.' + help = "Run a visiting worker loop." # Note: we use the GLOBAL visit_router by default here. # Test subclasses can override this class-level attribute for testing. @@ -79,41 +73,46 @@ class Command(VerboseCommand): def add_arguments(self, parser): parser.add_argument( - '--exit-on-empty', - dest='exit_on_empty', + "--exit-on-empty", + dest="exit_on_empty", default=False, - action='store_true', - help='Do not loop forever. Exit when the queue is empty.') + action="store_true", + help="Do not loop forever. Exit when the queue is empty.", + ) parser.add_argument( - '--max-uris', - dest='max_uris', + "--max-uris", + dest="max_uris", default=0, - action='store', - help='Limit the number of URIs yielded from a visit to a maximum ' - 'number. 0 means no limit. Used only for testing.') + action="store", + help="Limit the number of URIs yielded from a visit to a maximum " + "number. 0 means no limit. Used only for testing.", + ) parser.add_argument( - '--max-loops', - dest='max_loops', + "--max-loops", + dest="max_loops", default=0, - action='store', - help='Limit the number of visit loops to a maximum number. ' - '0 means no limit. Used only for testing.') + action="store", + help="Limit the number of visit loops to a maximum number. " + "0 means no limit. 
Used only for testing.", + ) parser.add_argument( - '--ignore-robots', - dest='ignore_robots', + "--ignore-robots", + dest="ignore_robots", default=False, - action='store_true', - help='Ignore robots.txt politeness.') + action="store_true", + help="Ignore robots.txt politeness.", + ) parser.add_argument( - '--ignore-throttle', - dest='ignore_throttle', + "--ignore-throttle", + dest="ignore_throttle", default=False, - action='store_true', - help='Ignore throttling politeness.') + action="store_true", + help="Ignore throttling politeness.", + ) def handle(self, *args, **options): """ @@ -122,12 +121,12 @@ def handle(self, *args, **options): no ResourceURI left to visit. """ logger.setLevel(self.get_verbosity(**options)) - exit_on_empty = options.get('exit_on_empty') - max_uris = options.get('max_uris', 0) + exit_on_empty = options.get("exit_on_empty") + max_uris = options.get("max_uris", 0) max_uris = int(max_uris) - max_loops = options.get('max_loops', 0) - ignore_robots = options.get('ignore_robots') - ignore_throttle = options.get('ignore_throttle') + max_loops = options.get("max_loops", 0) + ignore_robots = options.get("ignore_robots") + ignore_throttle = options.get("ignore_throttle") visited_counter, inserted_counter = visit_uris( ignore_robots=ignore_robots, @@ -137,13 +136,18 @@ def handle(self, *args, **options): max_uris=max_uris, ) - self.stdout.write('Visited {} URIs'.format(visited_counter)) - self.stdout.write('Inserted {} new URIs'.format(inserted_counter)) + self.stdout.write(f"Visited {visited_counter} URIs") + self.stdout.write(f"Inserted {inserted_counter} new URIs") -def visit_uris(ignore_robots=False, ignore_throttle=False, - exit_on_empty=False, max_loops=0, max_uris=0, - user_agent=USER_AGENT): +def visit_uris( + ignore_robots=False, + ignore_throttle=False, + exit_on_empty=False, + max_loops=0, + max_uris=0, + user_agent=USER_AGENT, +): """ Run an infinite visit loop. Return a tuple of (visited, inserted) counts. @@ -164,7 +168,7 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, while True: if MUST_STOP: - logger.info('Graceful exit of the visit loop.') + logger.info("Graceful exit of the visit loop.") break with transaction.atomic(): @@ -173,13 +177,14 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, if not resource_uri: if exit_on_empty: logger.info( - 'exit-on-empty requested: No more visitable resource, exiting...') + "exit-on-empty requested: No more visitable resource, exiting..." 
+ ) break # Only log a single message when we go to sleep if not sleeping: sleeping = True - logger.info('No more visitable resource, sleeping...') + logger.info("No more visitable resource, sleeping...") time.sleep(SLEEP_WHEN_EMPTY) continue @@ -187,7 +192,7 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, sleeping = False if not ignore_robots and robots.disallowed(resource_uri.uri, user_agent): - msg = 'Denied by robots.txt' + msg = "Denied by robots.txt" logger.error(msg) resource_uri.last_visit_date = timezone.now() resource_uri.wip_date = None @@ -198,8 +203,9 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, if not ignore_throttle: sleep_time = get_sleep_time(resource_uri) if sleep_time: - logger.debug('Respecting revisit delay: wait for {} for {}'.format( - sleep_time, resource_uri.uri)) + logger.debug( + f"Respecting revisit delay: wait for {sleep_time} for {resource_uri.uri}" + ) time.sleep(sleep_time) # Set new value in cache 'visit_delay_by_hostname' right before making the request # TODO: The cache logic should move closer to the requests calls @@ -207,27 +213,31 @@ def visit_uris(ignore_robots=False, ignore_throttle=False, visit_delay_by_hostname.set(uri_hostname, timezone.now()) # visit proper - logger.info('Visiting {}'.format(resource_uri)) + logger.info(f"Visiting {resource_uri}") visited_counter += 1 inserted_counter += visit_uri( - resource_uri=resource_uri, max_uris=max_uris, - uri_counter_by_visitor=uri_counter_by_visitor) + resource_uri=resource_uri, + max_uris=max_uris, + uri_counter_by_visitor=uri_counter_by_visitor, + ) if max_loops and int(visited_counter) > int(max_loops): - logger.info( - 'Stopping visits after max_loops: {} visit loops.'.format(max_loops)) + logger.info(f"Stopping visits after max_loops: {max_loops} visit loops.") break return visited_counter, inserted_counter -def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_router=visit_router): +def visit_uri( + resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_router=visit_router +): """ Call a visitor for a single ResourceURI. Process up to `max_uris` records. `_visit_router` is the Router to use for routing. Used for tests only. """ - from requests.exceptions import ConnectionError, Timeout + from requests.exceptions import ConnectionError + from requests.exceptions import Timeout if not resource_uri: return @@ -252,18 +262,18 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout return 0 if TRACE: - logger.debug('visit_uri: uri: {}'.format(uri_to_visit)) + logger.debug(f"visit_uri: uri: {uri_to_visit}") # TODO: Consider pass a full visitors.URI plain object rather than a plain string new_uris_to_visit, visited_data, visit_error = _visit_router.process( - uri_to_visit) + uri_to_visit + ) if TRACE: new_uris_to_visit = list(new_uris_to_visit or []) - logger.debug( - 'visit_uri: new_uris_to_visit: {}'.format(new_uris_to_visit)) + logger.debug(f"visit_uri: new_uris_to_visit: {new_uris_to_visit}") except NoRouteAvailable: - logger.error('No route available.') + logger.error("No route available.") # TODO: For now, when a route is not yet supported, we keep a value for # the wip_date value so the instance is not back in the queue. It will # not be selected by a worker again until the wip_date is manually @@ -273,8 +283,8 @@ def visit_uri(resource_uri, max_uris=0, uri_counter_by_visitor=None, _visit_rout return 0 except (ConnectionError, Timeout, Exception) as e: # FIXME: is catching all expections here correct? 
-        msg = 'Visit error for URI: {}'.format(uri_to_visit)
-        msg += '\n'.format(uri_to_visit)
+        msg = f"Visit error for URI: {uri_to_visit}"
+        msg += "\n"
        msg += get_error_message(e)
        visit_errors.append(msg)
        logger.error(msg)
@@ -282,9 +292,9 @@
    ########################################
    # Also log visit errors!!!1
    if visit_error:
-        msg = 'Visit error for URI: {}'.format(uri_to_visit)
-        msg += '\n'.format(uri_to_visit)
-        msg += get_error_message(e)
+        msg = f"Visit error for URI: {uri_to_visit}"
+        msg += "\n"
+        msg += visit_error
        visit_errors.append(msg)
        logger.error(msg)
@@ -304,63 +314,62 @@
            uri_str = smart_str(vuri.uri)
            visited_uri = vuri.to_dict()
-            last_modified_date = visited_uri.pop('date')
+            last_modified_date = visited_uri.pop("date")
            if last_modified_date:
-                visited_uri['last_modified_date'] = last_modified_date
+                visited_uri["last_modified_date"] = last_modified_date

            if vuri_count % 1000 == 0:
-                logger.debug(
-                    ' * Processed: {} visited URIs'.format(vuri_count))
+                logger.debug(f" * Processed: {vuri_count} visited URIs")

            try:
                # insert new if pre-visited
-                pre_visited = visited_uri.pop('visited')
+                pre_visited = visited_uri.pop("visited")
                if pre_visited:
                    # set last visit date for this pre-visited URI
-                    visited_uri['last_visit_date'] = timezone.now()
+                    visited_uri["last_visit_date"] = timezone.now()
                    new_uri = ResourceURI(**visited_uri)
                    new_uri.save()
-                    logger.debug(
-                        ' + Inserted pre-visited:\t{}'.format(uri_str))
+                    logger.debug(f" + Inserted pre-visited:\t{uri_str}")
                    inserted_count += 1
                    if max_uris:
                        uri_counter_by_visitor[visitor_key] += 1
                else:
                    # if not pre-visited only insert if not existing
-                    if not ResourceURI.objects.filter(uri=vuri.uri, last_visit_date=None).exists():
-                        visited_uri['last_visit_date'] = None
+                    if not ResourceURI.objects.filter(
+                        uri=vuri.uri, last_visit_date=None
+                    ).exists():
+                        visited_uri["last_visit_date"] = None
                        new_uri = ResourceURI(**visited_uri)
                        new_uri.save()
-                        logger.debug(' + Inserted new:\t{}'.format(uri_str))
+                        logger.debug(f" + Inserted new:\t{uri_str}")
                        inserted_count += 1
                        if max_uris:
                            uri_counter_by_visitor[visitor_key] += 1
                    else:
-                        logger.debug(' + NOT Inserted:\t{}'.format(uri_str))
+                        logger.debug(f" + NOT Inserted:\t{uri_str}")

            except Exception as e:
                # FIXME: is catching all expections here correct?
-                msg = 'ERROR while processing URI from a visit through: {}'.format(
-                    uri_str)
-                msg += '\n'
+                msg = f"ERROR while processing URI from a visit through: {uri_str}"
+                msg += "\n"
                msg += repr(visited_uri)
-                msg += '\n'
+                msg += "\n"
                msg += get_error_message(e)
                visit_errors.append(msg)
                logger.error(msg)
                if len(visit_errors) > 10:
                    logger.error(
-                        ' ! Breaking after processing over 10 vuris errors for: {}'.format(uri_str))
+                        f" ! Breaking after processing over 10 vuris errors for: {uri_str}"
+                    )
                    break

            if max_uris and int(uri_counter_by_visitor[visitor_key]) > int(max_uris):
-                logger.info(
-                    ' ! Breaking after processing max-uris: {} URIs.'.format(max_uris))
+                logger.info(f" ! Breaking after processing max-uris: {max_uris} URIs.")
                break

    except Exception as e:
-        msg = 'Visit error for URI: {}'.format(uri_to_visit)
-        msg += '\n'.format(uri_to_visit)
+        msg = f"Visit error for URI: {uri_to_visit}"
+        msg += "\n"
        msg += get_error_message(e)
        visit_errors.append(msg)
        logger.error(msg)
@@ -370,14 +379,14 @@
    resource_uri.last_visit_date = timezone.now()
    resource_uri.wip_date = None
    if visited_data:
-        logger.debug(' + Data collected.')
+        logger.debug(" + Data collected.")
        resource_uri.data = visited_data
    if visit_errors:
-        logger.debug(' ! Errors.')
-        resource_uri.visit_error = '\n'.join(visit_errors)[:5000]
+        logger.debug(" ! Errors.")
+        resource_uri.visit_error = "\n".join(visit_errors)[:5000]
    resource_uri.save()
-    logger.debug(' Inserted\t: {} new URI(s).'.format(inserted_count))
+    logger.debug(f" Inserted\t: {inserted_count} new URI(s).")

    return inserted_count
diff --git a/minecode/management/commands/seed.py b/minecode/management/commands/seed.py
index 9ee8fd9f..5b001b61 100644
--- a/minecode/management/commands/seed.py
+++ b/minecode/management/commands/seed.py
@@ -15,14 +15,11 @@

from django.db import transaction

# UnusedImport here!
-# But importing the mappers and visitors module triggers routes registration
-from minecode import mappers  # NOQA
-from minecode import visitors  # NOQA
-
+# But importing the miners module triggers routes registration
+from minecode import miners  # NOQA
from minecode import seed
-from minecode.models import ResourceURI
from minecode.management.commands import VerboseCommand
-
+from minecode.models import ResourceURI

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout)
@@ -30,12 +27,19 @@


class Command(VerboseCommand):
-    help = ('Insert ResourceURIs records from Seed '
-            'objects with a URI matching a pattern.')
+    help = (
+        "Insert ResourceURIs records from Seed "
+        "objects with a URI matching a pattern."
+    )

    def add_arguments(self, parser):
-        parser.add_argument('--pattern', '-p', action='store', dest='pattern',
-                            help='Only add seed URIs matching this regex pattern.')
+        parser.add_argument(
+            "--pattern",
+            "-p",
+            action="store",
+            dest="pattern",
+            help="Only add seed URIs matching this regex pattern.",
+        )

    def handle(self, *args, **options):
        """
@@ -44,14 +48,14 @@ def handle(self, *args, **options):
        """
        logger.setLevel(self.get_verbosity(**options))

-        pattern = options.get('pattern')
+        pattern = options.get("pattern")
        seeders = seed.get_active_seeders()
        counter = 0
        for uri in insert_seed_uris(pattern, seeders=seeders):
-            logger.info('Inserting new seed URI: {}'.format(uri))
+            logger.info(f"Inserting new seed URI: {uri}")
            counter += 1
-        self.stdout.write('Inserted {} seed URIs'.format(counter))
+        self.stdout.write(f"Inserted {counter} seed URIs")


SEED_PRIORITY = 100
@@ -66,15 +70,17 @@ def insert_seed_uris(pattern=None, priority=SEED_PRIORITY, seeders=()):
    for seeder in seeders:
        for uri in seeder.get_seeds():
            if pattern and not re.match(pattern, uri):
-                logger.info('Skipping seeding for: {}. Pattern {}'
-                            'not matched.'.format(uri, pattern))
+                logger.info(
+                    f"Skipping seeding for: {uri}. Pattern {pattern} not matched."
+ ) continue if ResourceURI.objects.filter(uri=uri).exists(): needs_revisit = ResourceURI.objects.needs_revisit( - uri=uri, hours=seeder.revisit_after) + uri=uri, hours=seeder.revisit_after + ) if not needs_revisit: - logger.info('Revisit not needed for: {}'.format(uri)) + logger.info(f"Revisit not needed for: {uri}") continue # FIXME: Currently, we update the existing a new ResourceURI @@ -83,8 +89,7 @@ def insert_seed_uris(pattern=None, priority=SEED_PRIORITY, seeders=()): # to store this datablob on the filesystem and have a single # ResourceURI per `uri` that points to one or more data blobs. seed_uri = ResourceURI.objects.update_or_create( - uri=uri, - priority=priority, - last_visit_date=None) + uri=uri, priority=priority, last_visit_date=None + ) assert seed_uri yield uri diff --git a/minecode/management/commands/update_maven_package_data.py b/minecode/management/commands/update_maven_package_data.py index c62a0be4..2ac6c0c7 100644 --- a/minecode/management/commands/update_maven_package_data.py +++ b/minecode/management/commands/update_maven_package_data.py @@ -6,15 +6,16 @@ # See https://github.com/aboutcode-org/purldb for support or download. # See https://aboutcode.org for more information about nexB OSS projects. # -from dateutil.parser import parse as dateutil_parse -from os.path import basename import logging import sys import traceback +from os.path import basename from django.db import transaction from django.db.utils import DataError from django.utils import timezone + +from dateutil.parser import parse as dateutil_parse from packageurl import normalize_qualifiers from minecode.collectors.maven import MavenNexusCollector @@ -34,10 +35,7 @@ def update_packages(packages, fields_to_update): try: with transaction.atomic(): - Package.objects.bulk_update( - objs=packages, - fields=fields_to_update - ) + Package.objects.bulk_update(objs=packages, fields=fields_to_update) updated_packages_count = len(packages) except DataError: updated_packages_count = 0 @@ -50,7 +48,7 @@ def update_packages(packages, fields_to_update): except DataError: service = basename(__file__) traceback_message = traceback.format_exc() - message = f'Error updating Package {package.package_uid}:\n\n{traceback_message}' + message = f"Error updating Package {package.package_uid}:\n\n{traceback_message}" ProcessingError.objects.create( service=service, date=timezone.now(), @@ -75,7 +73,9 @@ def create_packages(packages): except DataError: service = basename(__file__) traceback_message = traceback.format_exc() - message = f'Error creating Package {package.purl}:\n\n{traceback_message}' + message = ( + f"Error creating Package {package.purl}:\n\n{traceback_message}" + ) ProcessingError.objects.create( service=service, date=timezone.now(), @@ -105,13 +105,13 @@ def process_packages( updated = False if unsaved_existing_packages: fields_to_update = [ - 'download_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'release_date', - 'last_modified_date', - 'history', + "download_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "release_date", + "last_modified_date", + "history", ] upc = update_packages(unsaved_existing_packages, fields_to_update) updated_packages_count += upc @@ -121,44 +121,47 @@ def process_packages( if unsaved_existing_packages_lowercased: fields_to_update = [ - 'namespace', - 'name', - 'version', - 'qualifiers', - 'download_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'release_date', - 
'last_modified_date', - 'history', + "namespace", + "name", + "version", + "qualifiers", + "download_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "release_date", + "last_modified_date", + "history", ] - upc = update_packages( - unsaved_existing_packages_lowercased, fields_to_update) + upc = update_packages(unsaved_existing_packages_lowercased, fields_to_update) updated_packages_count += upc unsaved_existing_packages_lowercased = [] if upc > 0: updated = True if updated: - logger.info(f'Updated {updated_packages_count:,} Maven Packages') + logger.info(f"Updated {updated_packages_count:,} Maven Packages") if unsaved_new_packages: cpc = create_packages(unsaved_new_packages) created_packages_count += cpc unsaved_new_packages = [] if cpc > 0: - logger.info(f'Created {created_packages_count:,} Maven Packages') + logger.info(f"Created {created_packages_count:,} Maven Packages") if packages_to_delete: dpc = delete_packages(packages_to_delete) packages_to_delete = [] deleted_packages_count += dpc if dpc > 0: - logger.info( - f'Deleted {deleted_packages_count:,} Duplicate Maven Packages') + logger.info(f"Deleted {deleted_packages_count:,} Duplicate Maven Packages") - return unsaved_existing_packages, unsaved_existing_packages_lowercased, unsaved_new_packages, packages_to_delete + return ( + unsaved_existing_packages, + unsaved_existing_packages_lowercased, + unsaved_new_packages, + packages_to_delete, + ) def update_package_fields(package, maven_package, field_names): @@ -166,13 +169,13 @@ def update_package_fields(package, maven_package, field_names): for field in field_names: p_val = getattr(package, field) value = getattr(maven_package, field) - if field == 'qualifiers': + if field == "qualifiers": value = normalize_qualifiers(value, encode=True) - if field == 'release_date': + if field == "release_date": value = dateutil_parse(value) if p_val != value: setattr(package, field, value) - if field == 'release_date': + if field == "release_date": p_val = str(p_val) value = str(value) entry = dict( @@ -184,59 +187,59 @@ def update_package_fields(package, maven_package, field_names): if updated_fields: data = { - 'updated_fields': updated_fields, + "updated_fields": updated_fields, } package.append_to_history( - 'Package field values have been updated.', + "Package field values have been updated.", data=data, ) - logger.debug(f'Updated existing Package {package.package_uid}') + logger.debug(f"Updated existing Package {package.package_uid}") return package -def update_maven_packages(maven_package, fields_to_update, lowercased_purl_fields=False): +def update_maven_packages( + maven_package, fields_to_update, lowercased_purl_fields=False +): namespace = maven_package.namespace name = maven_package.name version = maven_package.version - normalized_qualifiers = normalize_qualifiers( - maven_package.qualifiers, encode=True) + normalized_qualifiers = normalize_qualifiers(maven_package.qualifiers, encode=True) if lowercased_purl_fields: namespace = namespace.lower() name = name.lower() version = version.lower() - normalize_qualifiers = normalize_qualifiers.lower() + normalized_qualifiers = normalized_qualifiers.lower() existing_packages = Package.objects.filter( - type='maven', + type="maven", namespace=namespace, name=name, version=version, - qualifiers=normalized_qualifiers or '' + qualifiers=normalized_qualifiers or "", ) - if existing_package.exists(): + if existing_packages.exists(): duplicate_packages = [] for existing_package in existing_packages: if 
existing_package.download_url != maven_package.download_url: logger.debug( - f'Deleted duplicate Package with incorrect download URL {existing_package.package_uid}') + f"Deleted duplicate Package with incorrect download URL {existing_package.package_uid}" + ) duplicate_packages.append(existing_package) duplicate_packages_pks = [p.pk for p in duplicate_packages] existing_package = Package.objects.exclude( pk__in=duplicate_packages_pks ).get_or_none( - type='maven', + type="maven", namespace=namespace, name=name, version=version, - qualifiers=normalized_qualifiers or '' + qualifiers=normalized_qualifiers or "", ) if existing_package: existing_package = update_package_fields( - existing_package, - maven_package, - fields_to_update + existing_package, maven_package, fields_to_update ) return existing_package, duplicate_packages else: @@ -244,17 +247,17 @@ def update_maven_packages(maven_package, fields_to_update, lowercased_purl_field class Command(VerboseCommand): - help = 'Update maven Package values' + help = "Update maven Package values" def add_arguments(self, parser): parser.add_argument( - '--create_package', + "--create_package", type=bool, - help='Create new Maven Packages if it does not exist in our database' + help="Create new Maven Packages if it does not exist in our database", ) def handle(self, *args, **options): - create_package = options.get('create_package', False) + create_package = options.get("create_package", False) updated_packages_count = 0 created_packages_count = 0 deleted_packages_count = 0 @@ -263,11 +266,11 @@ def handle(self, *args, **options): unsaved_existing_packages_lowercased = [] packages_to_delete = [] - logger.info('Updating or Adding new Packages from Maven Index') + logger.info("Updating or Adding new Packages from Maven Index") collector = MavenNexusCollector() for i, maven_package in enumerate(collector.get_packages()): if not i % 1000: - logger.info(f'Processed {i:,} Maven Artifacts') + logger.info(f"Processed {i:,} Maven Artifacts") if not i % 2000: ( unsaved_existing_packages, @@ -288,15 +291,14 @@ def handle(self, *args, **options): ) fields_to_update = [ - 'download_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'release_date', + "download_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "release_date", ] existing_package, duplicate_packages = update_maven_packages( - maven_package, - fields_to_update + maven_package, fields_to_update ) if existing_package: unsaved_existing_packages.append(existing_package) @@ -304,43 +306,40 @@ def handle(self, *args, **options): continue fields_to_update = [ - 'namespace', - 'name', - 'version', - 'qualifiers', - 'download_url', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'release_date', + "namespace", + "name", + "version", + "qualifiers", + "download_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "release_date", ] existing_package_lowercased, duplicate_packages = update_maven_packages( - maven_package, - fields_to_update, - lowercased_purl_fields=True + maven_package, fields_to_update, lowercased_purl_fields=True ) if existing_package_lowercased: - unsaved_existing_packages_lowercased.append( - existing_package_lowercased) + unsaved_existing_packages_lowercased.append(existing_package_lowercased) packages_to_delete.extend(duplicate_packages) continue if Package.objects.filter(download_url=maven_package.download_url).exists(): logger.debug( - f'Skipping creation of 
{maven_package.purl} - already exists') + f"Skipping creation of {maven_package.purl} - already exists" + ) continue if create_package: normalized_qualifiers = normalize_qualifiers( - maven_package.qualifiers, - encode=True + maven_package.qualifiers, encode=True ) new_package = Package( type=maven_package.type, namespace=maven_package.namespace, name=maven_package.name, version=maven_package.version, - qualifiers=normalized_qualifiers or '', + qualifiers=normalized_qualifiers or "", download_url=maven_package.download_url, size=maven_package.size, sha1=maven_package.sha1, @@ -351,7 +350,7 @@ def handle(self, *args, **options): ) new_package.created_date = timezone.now() unsaved_new_packages.append(new_package) - logger.debug(f'Created Package {maven_package.purl}') + logger.debug(f"Created Package {maven_package.purl}") ( unsaved_existing_packages, diff --git a/minecode/management/user_creation.py b/minecode/management/user_creation.py index ee8cba8a..6248514d 100644 --- a/minecode/management/user_creation.py +++ b/minecode/management/user_creation.py @@ -19,7 +19,7 @@ class CreateUserCommand(BaseCommand): - help = 'Create a user and generate an API key for authentication.' + help = "Create a user and generate an API key for authentication." requires_migrations_checks = True def __init__(self, *args, **kwargs): @@ -30,23 +30,20 @@ def __init__(self, *args, **kwargs): ) def add_arguments(self, parser): + parser.add_argument("username", help="Specifies the username for the user.") parser.add_argument( - 'username', help='Specifies the username for the user.') - parser.add_argument( - '--no-input', - action='store_false', - dest='interactive', - help='Do not prompt the user for input of any kind.', + "--no-input", + action="store_false", + dest="interactive", + help="Do not prompt the user for input of any kind.", ) def handle(self, *args, **options): - username = options['username'] - interactive = options['interactive'] - verbosity = options['verbosity'] + username = options["username"] + interactive = options["interactive"] + verbosity = options["verbosity"] self.create_user( - username=username, - interactive=interactive, - verbosity=verbosity + username=username, interactive=interactive, verbosity=verbosity ) def create_user(self, username, interactive, verbosity): @@ -58,12 +55,11 @@ def create_user(self, username, interactive, verbosity): if interactive: password = self.get_password_from_stdin(username) - user = self.UserModel._default_manager.create_user( - username, password=password) + user = self.UserModel._default_manager.create_user(username, password=password) token, _ = Token._default_manager.get_or_create(user=user) if verbosity >= 1: - msg = f'User {username} created with API key: {token.key}' + msg = f"User {username} created with API key: {token.key}" self.stdout.write(msg, self.style.SUCCESS) return user @@ -78,21 +74,21 @@ def get_password_from_stdin(self, username): password = None while password is None: password1 = getpass.getpass() - password2 = getpass.getpass('Password (again): ') + password2 = getpass.getpass("Password (again): ") if password1 != password2: self.stderr.write("Error: Your passwords didn't match.") continue - if password1.strip() == '': + if password1.strip() == "": self.stderr.write("Error: Blank passwords aren't allowed.") continue try: validate_password(password2, self.UserModel(**fake_user_data)) except exceptions.ValidationError as err: - self.stderr.write('\n'.join(err.messages)) + self.stderr.write("\n".join(err.messages)) response = input( - 
'Bypass password validation and create user anyway? [y/N]: ' + "Bypass password validation and create user anyway? [y/N]: " ) - if response.lower() != 'y': + if response.lower() != "y": continue password = password1 @@ -106,9 +102,9 @@ def _validate_username(self, username): except self.UserModel.DoesNotExist: pass else: - return 'Error: That username is already taken.' + return "Error: That username is already taken." try: self.username_field.clean(username, None) except exceptions.ValidationError as e: - return '; '.join(e.messages) + return "; ".join(e.messages) diff --git a/minecode/mappers/__init__.py b/minecode/mappers/__init__.py deleted file mode 100644 index f8ccc9fe..00000000 --- a/minecode/mappers/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -import pkgutil - - -class Mapper(object): - """ - Abstract base class for mappers. Subclasses must implement the - get_packages() method and use a routing decorator for the URIs they can - handle. - """ - - def __call__(self, uri, resource_uri): - # Note: we let exceptions bubble up and they will be caught and - # processed by the worker loop - return self.get_packages(uri, resource_uri) - - def get_packages(self, uri, resource_uri): - """ - This method must yield ScannedPackage objects (or return a list) built - from a resource_uri ResourceURI object. - """ - raise NotImplementedError - - -""" -Minimal way to recursively import all submodules dynamically. If this module is -imported, all submodules will be imported: this triggers the actual registration -of mappers. This should stay as the last import in this init module. -""" -for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + '.'): - __import__(name) diff --git a/minecode/mappers/apache.py b/minecode/mappers/apache.py deleted file mode 100644 index 25b68aec..00000000 --- a/minecode/mappers/apache.py +++ /dev/null @@ -1,268 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -from collections import OrderedDict -import json -import logging - -from packageurl import PackageURL - -from commoncode import fileutils -import packagedcode.models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date -from minecode.visitors.apache import CHECKSUM_EXTS - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - -# TODO: Declared license should be an Apache license - -# common licenses found in JSON -APACHE_LICENSE_URL = { - 'http://usefulinc.com/doap/licenses/asl20', - 'https://usefulinc.com/doap/licenses/asl20', - 'http://spdx.org/licenses/Apache-2.0', - 'https://spdx.org/licenses/Apache-2.0', - 'http://www.apache.org/licenses/LICENSE-2.0', - 'https://www.apache.org/licenses/LICENSE-2.0', - 'http://www.apache.org/licenses/LICENSE-2.0.txt', - 'https://www.apache.org/licenses/LICENSE-2.0.txt', - 'http://www.apache.org/licenses/', - 'http://forrest.apache.org/license.html', - 'https://svn.apache.org/repos/asf/tomee/tomee/trunk/LICENSE', -} - - -# FIXME: this is NOT specific to a download URL but to a project: disabled for now -# @map_router.route('https://projects.apache.org/json/foundation/projects.json') -class ApacheProjectJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages built from resource_uri record for a single - package version. - """ - metadata = json.loads(resource_uri.data, object_pairs_hook=OrderedDict) - return build_packages_from_projects(metadata, uri=uri) - - -def build_packages_from_projects(metadata, uri=None): - """ - Yield Package built from Apache a `metadata` mapping - which is a dictionary keyed by project name and values are project_metadata. - Yield as many Package as there are download URLs. 
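As an aside, a minimal standalone sketch of how such a projects.json mapping is walked; the sample data here is made up and only illustrates the short/long description join used below:

projects = {
    "httpd": {
        "shortdesc": "Apache HTTP Server",
        "description": "A secure, efficient and extensible HTTP server.",
        "homepage": "https://httpd.apache.org",
    },
}
for project_name, project_meta in projects.items():
    # Keep only non-empty descriptions, then join them into one string.
    descriptions = [
        d
        for d in (project_meta.get("shortdesc"), project_meta.get("description"))
        if d and d.strip()
    ]
    print(project_name, "\n".join(descriptions))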
- """ - for project_name, project_meta in metadata.items(): - short_desc = project_meta.get('shortdesc') - long_desc = project_meta.get('description') - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - common_data = dict( - datasource_id="apache_json", - type='apache', - name=project_name, - description=description, - homepage_url=project_meta.get('homepage'), - bug_tracking_url=project_meta.get('bug-database'), - primary_language=project_meta.get('programming-language'), - ) - - # FIXME: setting the download-page as the download_url is not right - if project_meta.get('download-page'): - download_url = project_meta.get('download-page') - common_data['download_url'] = download_url - for repo in project_meta.get('repository', []): - common_data['code_view_url'] = repo - # Package code_view_url only support one URL, so break when - # finding a code_view_url - break - - maintainers = project_meta.get('maintainer', []) - for maintainer in maintainers: - mailbox = maintainer.get('mbox', '').replace('mailto:', '') - name = maintainer.get('name') - party = scan_models.Party( - type=scan_models.party_person, name=name, role='maintainer', email=mailbox) - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - common_data['parties'].append(party.to_dict()) - - # license is just a URL in the json file, for example: - # http://usefulinc.com/doap/licenses/asl20 - license_url = project_meta.get('license') - common_data['extracted_license_statement'] = license_url - - if license_url in APACHE_LICENSE_URL: - common_data['declared_license_expression'] = 'apache-2.0' - common_data['declared_license_expression_spdx'] = 'Apache-2.0' - common_data['license_detections'] = [] - - keywords = [] - category = project_meta.get('category', '') - for kw in category.split(','): - kw = kw.strip() - if kw: - keywords.append(kw) - common_data['keywords'] = keywords - - common_data['primary_language'] = project_meta.get( - 'programming-language') - - # FIXME: these cannot be related to actual packages with a download URL - releases = project_meta.get('release') - if releases: - for release in releases: - rdata = dict(common_data) - rdata['version'] = release.get('revision') - if release.get('created') and len(release.get('created')) == 10: - rdata['release_date'] = parse_date(release.get('created')) - else: - logger.warn('Unexpected date format for release date: {}'.format( - release.get('created'))) - package = scan_models.Package.from_package_data( - package_data=rdata, - datafile_path=uri, - ) - yield package - else: - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - yield package - - -# FIXME: this is NOT specific to a download URL but to a project: disabled for now -# FIXME: this is casting too wide a net! -# @map_router.route('http?://[\w\-\.]+.incubator.apache.org/"') -class ApachePodlingsMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages built from resource_uri record for a single - package version. - """ - metadata = json.loads(resource_uri.data, object_pairs_hook=OrderedDict) - return build_packages_from_podlings(metadata, resource_uri.package_url) - - -def build_packages_from_podlings(metadata, purl): - """ - Yield Package built from Apache podlings metadata - which is a dictionary keyed by project name and values are project_metadata. - Yield as many Package as there are download URLs. 
-    """
-    name = metadata.get('name')
-    if name:
-        common_data = dict(
-            type='apache-podling',
-            name=name,
-            description=metadata.get('description'),
-            homepage_url=metadata.get('homepage'),
-        )
-        package = scan_models.Package(**common_data)
-        package.set_purl(purl)
-        yield package
-
-
-@map_router.route('http?s://(archive\.)?apache\.org/dist/.*')
-class ApacheDownloadMapper(Mapper):
-
-    def get_packages(self, uri, resource_uri):
-        """
-        Yield Packages built from a bare download URI or download checksum URI.
-        """
-        if uri.endswith(CHECKSUM_EXTS):
-            # 1. create a regular package from the URL stripped from its checksum extension
-            archive_uri, _, checksum_type = uri.rpartition('.')
-
-            pack = build_package_from_download(
-                archive_uri, resource_uri.package_url)
-            # 2. collect the checksum inside the file
-            # and attach it to the package
-            checksum_value = resource_uri.data.strip()
-            if checksum_value:
-                checksum_field_name = 'download_{checksum_type}'.format(
-                    **locals())
-                setattr(pack, checksum_field_name, checksum_value)
-                yield pack
-        else:
-            # a plain download URI
-            yield build_package_from_download(uri, resource_uri.package_url)
-
-
-def build_package_from_download(uri, purl=None):
-    """
-    Return a Package built from an Apache dist download archive URL.
-
-    The uri could be:
-    http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip
-    https://apache.org/dist/chemistry/opencmis/1.1.0/chemistry-opencmis-dist-1.1.0-server-webapps.zip
-    """
-    name, version = get_name_version(uri)
-    if purl:
-        purl = PackageURL.from_string(purl)
-        if not name:
-            name = purl.name
-    # FIXME: use purl data??
-    package = scan_models.Package(
-        type='apache',
-        namespace=purl.namespace,
-        name=name,
-        version=version,
-        download_url=uri,
-    )
-    package.set_purl(purl)
-    return package
-
-
-# FIXME: there should be only one such method and this one is rather weak
-def get_name_version(uri):
-    """
-    Return the name and version extracted from a path.
-    """
-    # base_url will end being 'https://archive.apache.org/dist' or 'https://apache.org/dist'
-    # path is the uri without base url, for example:
-    # /groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip
-    _, _, path = uri.partition('apache.org/dist/')
-    base_name = fileutils.file_base_name(path)
-    version = None
-    package_name = ''
-    name_segments = base_name.split('-')
-    for segment in name_segments:
-        try:
-            # Test whether every .-separated part of the segment is an integer.
-            # For example, in '1.2.3' all characters are digits or periods.
-            # If so, this segment is a version segment.
-            if version:
-                # Segments after the first version segment belong to the version too.
-                # For example: turbine-4.0-M1: after detecting 4.0,
-                # M1 should be included in the version too, so the final version is 4.0-M1
-                version = '-'.join([version, segment])
-                continue
-
-            is_all_int = all(n.isdigit() for n in segment.split('.'))
-            if is_all_int:
-                version = segment
-        except ValueError:
-            # Connect the package_name parts with - because we split on - earlier;
-            # until we meet the version, package_name keeps accumulating segments.
-            if not package_name:
-                package_name = segment
-            else:
-                package_name = ('-').join([package_name, segment])
-            continue
-    return package_name, version
diff --git a/minecode/mappers/bitbucket.py b/minecode/mappers/bitbucket.py
deleted file mode 100644
index 5764cb58..00000000
--- a/minecode/mappers/bitbucket.py
+++ /dev/null
@@ -1,139 +0,0 @@
-#
-# Copyright (c) nexB Inc. and others. All rights reserved.
-# purldb is a trademark of nexB Inc.
-# SPDX-License-Identifier: Apache-2.0
-# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
-# See https://github.com/aboutcode-org/purldb for support or download.
-# See https://aboutcode.org for more information about nexB OSS projects.
-#
-
-from collections import OrderedDict
-import json
-import logging
-
-from packagedcode import models as scan_models
-from packageurl import PackageURL
-
-from minecode import map_router
-from minecode.mappers import Mapper
-
-
-logger = logging.getLogger(__name__)
-handler = logging.StreamHandler()
-logger.addHandler(handler)
-logger.setLevel(logging.INFO)
-
-
-@map_router.route(
-    'https://api.bitbucket\.org/2\.0/repositories/.*/downloads/',
-)
-class BitbucketDownloadMapper(Mapper):
-    """
-    Build package from download urls if present.
-    """
-
-    def get_packages(self, uri, resource_uri):
-        """
-        Yield Package built from resource_uri record for a single package version.
-        """
-        downloads_data = json.loads(
-            resource_uri.data, object_pairs_hook=OrderedDict)
-        for download_data in downloads_data.get('values', []):
-            for package in build_bitbucket_download_packages(
-                    download_data, resource_uri.package_url):
-                yield package
-
-
-def build_bitbucket_download_packages(download_data, purl):
-    """
-    Yield scanned Packages for each download
-    https://api.bitbucket.org/2.0/repositories/pypa/setuptools/downloads/
-    """
-    purl = PackageURL.from_string(purl)
-    namespace = purl.namespace
-    name = purl.name
-
-    # FIXME: add these ?
-    filename = download_data.get('name')
-    download_counts = download_data.get('downloads', 0)
-
-    download_url = download_data.get('links', {}).get('self', {}).get('href')
-    size = download_data.get('size')
-
-    package = scan_models.Package(
-        type='bitbucket',
-        name=name,
-        namespace=namespace,
-        download_url=download_url,
-        size=size,
-    )
-    package.set_purl(purl)
-    yield package
-
-
-# @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*')
-class BitbucketIndexMapper(Mapper):
-    """
-    Build a Package for a repo.
-    """
-
-    def get_packages(self, uri, resource_uri):
-        repo = json.loads(resource_uri.data, object_pairs_hook=OrderedDict)
-        if not repo:
-            return
-        yield build_bitbucket_repo_package(repo, resource_uri.package_url)
-
-
-# FIXME: disabled as this is for a package template
-# @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*')
-class BitbucketRepoMapper(Mapper):
-    """
-    Build a Package for a repo.
-    """
-
-    def get_packages(self, uri, resource_uri):
-        repo = json.loads(resource_uri.data, object_pairs_hook=OrderedDict)
-        if not repo:
-            return
-        yield build_bitbucket_repo_package(repo, resource_uri.package_url)
-
-
-def build_bitbucket_repo_package(repo_data, purl):
-    """
-    Return a Package "template" from repository data.
-    Notes: this is not version-specific and has no download URL.
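For reference, a hedged sketch of how a purl string expands into the bitbucket scm URL built below; packageurl is already a dependency of this codebase, and str.format() simply ignores the unused keys from to_dict():

from packageurl import PackageURL

purl = PackageURL.from_string("pkg:bitbucket/pypa/setuptools")
# protocol is supplied separately; namespace and name come from the purl.
bb_url = "{protocol}+https://bitbucket.org/{namespace}/{name}".format(
    protocol="git", **purl.to_dict()
)
# bb_url == "git+https://bitbucket.org/pypa/setuptools"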
- """ - purl = PackageURL.from_string(purl) - scm_protocol = repo_data.get('scm') - if not scm_protocol: - scm_protocol = 'git' - bb_url = '{protocol}+https://bitbucket.org/{namespace}/{name}'.format( - protocol=scm_protocol, **purl.to_dict()) - - owner = repo_data.get('owner') - owner_party = scan_models.Party( - type=scan_models.party_person, - name=owner.get('username'), - role='owner', - url=owner.get('links', {}).get('html', {}).get('href', {}) - ) - - if repo_data.get('has_issues'): - bug_tracking_url = bb_url + '/issues' - else: - bug_tracking_url = None - - package = scan_models.Package( - type=purl.type, - namespace=purl.namespace, - name=purl.name, - homepage_url=repo_data.get('website') or bb_url, - code_view_url=bb_url + '/src', - bug_tracking_url=bug_tracking_url, - description=repo_data.get('description'), - vcs_url=bb_url, - primary_language=repo_data.get('language'), - parties=[owner_party], - ) - package.set_purl(purl) - return package diff --git a/minecode/mappers/bower.py b/minecode/mappers/bower.py deleted file mode 100644 index c27f6fc3..00000000 --- a/minecode/mappers/bower.py +++ /dev/null @@ -1,133 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode import models as scan_models -from packagedcode.models import DependentPackage - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://raw.githubusercontent.com/.*/master/bower.json', - 'https://lolg.it/.*/master/bower.json', - 'https://coding.net/.*/master/bower.json') -class BowerJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. 
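The @map_router.route registrations used throughout these modules rely on a pattern-to-class registry populated at import time. This is an illustrative sketch only, not the actual minecode.map_router implementation:

import re

class Router:
    def __init__(self):
        self.routes = {}

    def route(self, *patterns):
        # Decorator: register a class for one or more URI regex patterns.
        def register(cls):
            for pattern in patterns:
                self.routes[re.compile(pattern)] = cls
            return cls

        return register

    def resolve(self, uri):
        # Return the first registered class whose pattern matches the URI.
        for pattern, cls in self.routes.items():
            if pattern.match(uri):
                return cls

map_router = Router()

@map_router.route(r"https://registry\.example\.org/.*")
class ExampleMapper:
    pass

assert map_router.resolve("https://registry.example.org/foo") is ExampleMapper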
- """ - metadata = resource_uri.data - build_packages_from_jsonfile( - metadata, resource_uri.uri, resource_uri.package_url) - - -def build_packages_from_jsonfile(metadata, uri=None, purl=None): - """ - Yield Package built from Bower json content - """ - content = json.loads(metadata) - - licenses_content = content.get('licenses') - extracted_license_statement = set([]) - if licenses_content: - if isinstance(licenses_content, list): - for lic in licenses_content: - extracted_license_statement.add(lic) - else: - extracted_license_statement.add(licenses_content) - - keywords_content = content.get('keywords', []) - name = content.get('name') - - devdependencies = content.get('devDependencies') - dev_dependencies = [] - if devdependencies: - for key, value in devdependencies.items(): - dev_dependencies.append( - DependentPackage( - purl=key, extracted_requirement=value, scope='devdependency').to_dict() - ) - - dependencies = content.get('dependencies') - dependencies_build = [] - if dependencies: - for key, value in dependencies.items(): - dependencies_build.append( - DependentPackage( - purl=key, extracted_requirement=value, scope='runtime').to_dict() - ) - - if name: - vcs_tool, vcs_repo = get_vcs_repo(content) - if vcs_tool and vcs_repo: - # Form the vsc_url by - # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5 - vcs_repo = vcs_tool + '+' + vcs_repo - common_data = dict( - type='bower', - name=name, - description=content.get('description'), - version=content.get('version'), - vcs_url=vcs_repo, - keywords=keywords_content, - homepage_url=content.get('homepage'), - datasource_id='bower_json', - license_detections=[], - ) - - if extracted_license_statement: - common_data['extracted_license_statement'] = list( - extracted_license_statement) - - author_content = content.get('author') - if author_content: - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - common_data['parties'].append(scan_models.Party( - name=author_content, role='author',).to_dict()) - else: - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - author_content = content.get('authors', []) - for author in author_content: - author_split = author.split(':') - if len(author_split) > 1: - common_data['parties'].append(scan_models.Party( - name=author_split[1].strip(), role='author',).to_dict()) - - dependencies = [] - if dependencies_build: - dependencies.extend(dependencies_build) - if dev_dependencies: - dependencies.extend(dev_dependencies) - if len(dependencies) > 0: - common_data['dependencies'] = dependencies - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package - - -def get_vcs_repo(content): - """ - Return the repo type and url. - """ - repo = content.get('repository', {}) - if repo: - return repo.get('type'), repo.get('url') - return None, None diff --git a/minecode/mappers/cpan.py b/minecode/mappers/cpan.py deleted file mode 100644 index 4ea7fce4..00000000 --- a/minecode/mappers/cpan.py +++ /dev/null @@ -1,314 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -from collections import OrderedDict -import json - -import packagedcode.models as scan_models -import saneyaml -from packageurl import PackageURL - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date - - -@map_router.route('https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000') -class MetaCpanReleaseSearchMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield packages by parsing the json returned from release search request. - """ - metadata = resource_uri.data - build_packages_from_release_json( - metadata, resource_uri.uri, resource_uri.package_url) - - -def build_packages_from_release_json(metadata, uri=None): - """ - Yield packages built from the json from release search request. - metadata: json content with metadata - uri: the uri of the ResourceURI object - """ - content = json.loads(metadata) - hits = content.get('hits', {}) - inner_hits = hits.get('hits', []) - for hit in inner_hits: - release = hit.get('_source', {}) - if not release: - continue - name = release.get('name') - if not name: - continue - - extracted_license_statement = [ - l for l in release.get('license', []) if l and l.strip()] - - common_data = dict( - datasource_id="cpan_release_json", - type='cpan', - name=name, - description=release.get('abstract'), - version=release.get('version'), - download_url=release.get('download_url'), - extracted_license_statement=extracted_license_statement, - license_detections=[], - # the date format passing is like: - # "2014-04-20T21:30:13" - release_date=parse_date(release.get('date')), - ) - - # Get the homepage_url, declared_license and vcs_repository/vcs_tool under resources section. - # The resources section format is like this: - # "resources" : { - # "homepage" : "http://plackperl.org", - # "license" : [ - # "http://dev.perl.org/licenses/" - # ], - # "bugtracker" : { - # "web" : "https://github.com/plack/Plack/issues" - # }, - # "repository" : { - # "url" : "git://github.com/plack/Plack.git" - # } - # }, - resources = release.get('resources') or {} - - common_data['homepage_url'] = resources.get('homepage') - # Usually the license in root node contains the license name - # like perl_5. The license here under resources section is the - # url of license for example: http://dev.perl.org/licenses/ So - # it's useful to collect both information... - license_url = [l for l in resources.get( - 'license', []) if l and l.strip()] - if license_url: - common_data['extracted_license_statement'].extend(license_url) - - vcs_tool, vcs_repo = get_vcs_repo1(resources) - if vcs_tool and vcs_repo: - # Form the vsc_url by - # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5 - vcs_repo = vcs_tool + '+' + vcs_repo - common_data['vcs_url'] = vcs_repo - - bugtracker_section = resources.get('bugtracker', {}) - common_data['bug_tracking_url'] = bugtracker_section.get('web') - - if release.get('author'): - party = scan_models.Party( - type=scan_models.party_person, - name=release.get('author'), role='author') - common_data['parties'] = common_data.get('parties', []) - common_data['parties'].append(party.to_dict()) - - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package_url = PackageURL(type='cpan', name=release.get( - 'name'), version=release.get('version')) - package.set_purl(package_url.to_string()) - yield package - - -def get_vcs_repo1(content): - """ - Return the repo type and url. 
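A standalone sketch of the repository detection implemented just below; the sample resources dict is made up:

resources = {"repository": {"url": "git://github.com/plack/Plack.git"}}
repo_url = resources.get("repository", {}).get("url")
# Treat any URL mentioning .git as a git repository, as the code below does.
vcs_tool = "git" if repo_url and ".git" in repo_url else None
vcs_url = f"{vcs_tool}+{repo_url}" if vcs_tool else None
# vcs_url == "git+git://github.com/plack/Plack.git"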
-    """
-    repo_type = None
-    repo_url = None
-    repo = content.get('repository', {})
-    if repo:
-        url = repo.get('url')
-        if url:
-            repo_url = url
-            if '.git' in url:
-                repo_type = 'git'
-    return repo_type, repo_url
-
-
-@map_router.route('http://www.cpan.org/.*.meta')
-class CpanMetaFileMapper(Mapper):
-
-    def get_packages(self, uri, resource_uri):
-        """
-        Yield Package built from resource_uri record for a single
-        package version.
-        Yield as many Package as there are download URLs.
-        """
-        metadata = resource_uri.data
-        build_packages_from_metafile(
-            metadata, resource_uri.uri, resource_uri.package_url)
-
-
-def build_packages_from_metafile(metadata, uri=None, purl=None):
-    """
-    Yield Package built from a CPAN `metadata` content
-    metadata: json content with metadata
-    uri: the uri of the ResourceURI object
-    purl: String value of the package url of the ResourceURI object
-    """
-    # FIXME: it does not make sense to use a single function to deal with the two
-    # formats IMHO
-    if is_json(metadata):
-        content = json.loads(metadata, object_pairs_hook=OrderedDict)
-    else:
-        content = saneyaml.load(metadata)
-
-    licenses_content = content.get('license')
-    extracted_license_statement = []
-    if licenses_content:
-        if isinstance(licenses_content, (list,)):
-            for lic in licenses_content:
-                extracted_license_statement.append(lic)
-        else:
-            extracted_license_statement.append(licenses_content)
-
-    keywords_content = content.get('keywords', [])
-
-    download_url = uri.replace('.meta', '.tar.gz') if uri else None
-
-    name = content.get('name')
-    if name:
-        vcs_tool, vcs_repo = get_vcs_repo(content)
-        if vcs_tool and vcs_repo:
-            # Form the vcs_url by
-            # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5
-            vcs_repo = vcs_tool + '+' + vcs_repo
-        common_data = dict(
-            datasource_id="cpan_meta_json",
-            type='cpan',
-            name=name,
-            description=content.get('abstract', name),
-            version=content.get('version'),
-            download_url=download_url,
-            extracted_license_statement=extracted_license_statement,
-            vcs_url=vcs_repo,
-            keywords=keywords_content,
-        )
-
-        parties = common_data['parties'] = []
-
-        for author_content in content.get('author', []):
-            # The author format is like: Abigail
-            if '<' in author_content:
-                author_name, _, author_email = author_content.partition('<')
-                author_email = author_email.strip('>')
-            else:
-                author_name = author_content
-                author_email = ''
-
-            party = scan_models.Party(
-                role='author',
-                type=scan_models.party_person,
-                name=author_name.rstrip(),
-                email=author_email
-            )
-
-            parties.append(party.to_dict())
-
-        package = scan_models.PackageData.from_data(package_data=common_data)
-        package.set_purl(purl)
-        yield package
-
-
-def get_vcs_repo(content):
-    """
-    Return the repo type and url.
-    """
-    repo = content.get('resources', {}).get('repository')
-    if repo:
-        if isinstance(repo, dict):
-            repo = repo.get('url', '')
-        if repo.startswith('git:'):
-            return 'git', repo
-    return None, None
-
-
-def is_json(json_content):
-    try:
-        json.loads(json_content)
-    except ValueError:
-        return False
-    return True
-
-
-@map_router.route('http://www.cpan.org/.*.readme')
-class CpanReadmeFileMapper(Mapper):
-
-    def get_packages(self, uri, resource_uri):
-        """
-        Yield Package built from resource_uri record for a single
-        package version.
-        Yield as many Package as there are download URLs.
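A hedged sketch of the JSON-or-YAML sniffing that is_json() and build_packages_from_metafile() above implement; saneyaml is the YAML helper this repository already uses:

import json
import saneyaml

def load_cpan_meta(text):
    # META.json is JSON; META.yml is YAML. Try JSON first, then fall back.
    try:
        return json.loads(text)
    except ValueError:
        return saneyaml.load(text)

assert load_cpan_meta('{"name": "Plack"}')["name"] == "Plack"
assert load_cpan_meta("name: Plack")["name"] == "Plack"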
- """ - metadata = resource_uri.data - build_packages_from_metafile( - metadata, resource_uri.uri, resource_uri.package_url) - - -def build_packages_from_readmefile(metadata, uri=None, purl=None): - """ - Yield Package built from Cpan a `readme` content - metadata: json metadata content of readme file - uri: the uri of the ResourceURI object - purl: String value of the package url of the ResourceURI object - """ - content = json.loads(metadata) - name = content.get('NAME') - if name: - download_url = uri.replace('.meta', '.tar.gz') if uri else None - vcs_tool, vcs_repo = get_vcs_repo_fromstring(content) - if vcs_tool and vcs_repo: - # Form the vsc_url by - # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5 - vcs_repo = vcs_tool + '+' + vcs_repo - copyr = content.get('COPYRIGHT and LICENSE') - common_data = dict( - datasource_id="cpan_readme", - type='cpan', - name=name, - description=content.get('ABSTRACT', name), - download_url=download_url, - vcs_url=vcs_repo, - copyright=copyr, - version=content.get('VERSION') - ) - - authors = content.get('AUTHOR', []) - for author_content in authors: - author_split = author_content.split('<') - if len(author_split) > 1: - party = scan_models.Party(type=scan_models.party_person, name=author_split[0].rstrip( - ), role='author', email=author_split[1].replace('>', '')) - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - common_data['parties'].append(party) - - keywords_content = [] - if content.get('KEYWORDS'): - keywords_content = [content.get('KEYWORDS')] - common_data['keywords'] = keywords_content - - package = scan_models.PackageData.from_data(package_data=common_data) - package.set_purl(purl) - yield package - - -def get_vcs_repo_fromstring(content): - """ - Return the repo type and url. - """ - repo = content.get('DEVELOPMENT') - if repo and repo.index('<') < repo.index('>') and 'git:' in repo: - return 'git', repo[repo.index('<') + 1: repo.index('>')] - else: - return None, None diff --git a/minecode/mappers/cran.py b/minecode/mappers/cran.py deleted file mode 100644 index d63b98e9..00000000 --- a/minecode/mappers/cran.py +++ /dev/null @@ -1,163 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from bs4 import BeautifulSoup -import packagedcode.models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date - - -CRAN_URL = 'https://cloud.r-project.org/' -CRAN_WEB_URL = CRAN_URL + 'web/' - - -@map_router.route('https://cloud.r-project.org/web/packages/[\w\-\.]/index.html') -class CranMetaFileMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - metadata = resource_uri.data - build_packages_from_html( - metadata, resource_uri.uri, resource_uri.package_url) - - -def get_download_url(url): - return url.replace('../../../', CRAN_URL) - - -def get_dependencies(depends): - """ - Return a dictionary of dependencies keyed by dep_group. 
- """ - dep_pkgs = [] - if not depends: - return dep_pkgs - dependencies = comma_separated(depends) - if not dependencies: - return dep_pkgs - for name in dependencies: - dep_pkgs.append(scan_models.DependentPackage(purl=name).to_dict()) - return dep_pkgs - - -def comma_separated(text): - """ - Return a list of strings from a comma-separated text. - """ - if not text: - return [] - return [t.strip() for t in text.split(',') if t and t.strip()] - - -def build_packages_from_html(metadata, uri=None, purl=None): - """ - Yield Package built from Cpan a `metadata` content - metadata: json metadata content - uri: the uri of the ResourceURI object - purl: String value of the package url of the ResourceURI object - """ - # Parse the name from the url, for example: https://cloud.r-project.org/web/packages/ANN2/index.html - common_data = dict( - datasource_id="cran_metadata", - type='cran', - name=uri.rpartition('/')[0].rpartition('/')[-1] - ) - extracted_license_statement = [] - download_urls = [] - - soup = BeautifulSoup(metadata, 'lxml') - first_pblock = soup.find('p') - if first_pblock: - common_data['description'] = first_pblock.string - else: - h2_block = soup.find('h2') - if h2_block: - common_data['description'] = h2_block.string - - tables = soup.find_all('table') - for table in tables: - rows = table.find_all('tr') - for row in rows: - col_values = [] - cols = row.find_all('td') - for ele in cols: - if ele.find_all('a'): - col_values.append([a['href'].strip() - for a in ele.find_all('a')]) - col_values.append(ele.text.strip()) - if len(cols) >= 2: - key = col_values[0] - value = col_values[1] - if key == 'Version:': - common_data['version'] = value - elif key == 'URL:': - if type(value) == list and len(value) > 0: - homepages = [] - for home_page in value: - homepages.append(home_page) - common_data['homepage_url'] = '\n'.join(homepages) - else: - common_data['homepage_url'] = value - elif key == 'License:': - for license_url in value: - extracted_license_statement.append(license_url) - elif key == 'Author:': - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - party = scan_models.Party( - type=scan_models.party_person, name=value, role='author') - common_data['parties'].append(party.to_dict()) - elif key == 'Maintainer:': - maintainer_split = value.split('<') - if len(maintainer_split) > 1: - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - party = scan_models.Party(type=scan_models.party_person, name=maintainer_split[0].rstrip( - ), role='maintainer', email=maintainer_split[1].replace('>', '').replace(' at ', '@')) - common_data['parties'].append(party.to_dict()) - elif 'source' in key or 'binaries' in key: - if type(value) == list: - for url in value: - download_urls.append(get_download_url(url)) - elif key == 'Published:': - common_data['release_date'] = parse_date(value) - elif key == 'Imports:': - # use the text instead of a href since the text is more accurate - if len(col_values) == 3: - value = col_values[2] - common_data['dependencies'] = get_dependencies(value) - if extracted_license_statement: - common_data['extracted_license_statement'] = extracted_license_statement - common_data['license_detections'] = [] - - if download_urls: # for else statement will have else running always if there is no break statement - for download_url in download_urls: - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.download_url = download_url - 
package.set_purl(purl) - yield package - else: - # Yield a package without download_url - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/debian.py b/minecode/mappers/debian.py deleted file mode 100644 index a2f6e01a..00000000 --- a/minecode/mappers/debian.py +++ /dev/null @@ -1,441 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -from collections import defaultdict -import json -import logging - -import attr -from debian_inspector import debcon -from packagedcode import models as scan_models -from packageurl import PackageURL - -from minecode import ls -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url -from minecode import debutils - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -# FIXME: We are not returning download URLs. Returned information is incorrect - - -def get_dependencies(data): - """ - Return a list of DependentPackage extracted from a Debian `data` mapping. - """ - scopes = { - 'Build-Depends': dict(is_runtime=False, is_optional=True), - 'Depends': dict(is_runtime=True, is_optional=False), - 'Pre-Depends': dict(is_runtime=True, is_optional=False), - # 'Provides': dict(is_runtime=True, is_optional=False), - # 'Recommends': dict(is_runtime=True, is_optional=True), - # 'Suggests': dict(is_runtime=True, is_optional=True), - } - dep_pkgs = [] - for scope, flags in scopes.items(): - depends = data.get(scope) - if not depends: - continue - - dependencies = None # debutils.comma_separated(depends) - if not dependencies: - continue - # break each dep in package names and version constraints - # FIXME:!!! - for name in dependencies: - purl = PackageURL(type='deb', namespace='debian', name=name) - dep = scan_models.DependentPackage( - purl=purl.to_string(), score=scope, **flags) - dep_pkgs.append(dep) - - return dep_pkgs - - -def get_vcs_repo(description): - """ - Return a tuple of (vcs_tool, vcs_repo) or (None, None) if no vcs_repo is found. - """ - repos = [] - for vcs_tool, vcs_repo in description.items(): - vcs_tool = vcs_tool.lower() - if not vcs_tool.startswith('vcs-') or vcs_tool.startswith('vcs-browser'): - continue - _, _, vcs_tool = vcs_tool.partition('-') - repos.append((vcs_tool, vcs_repo)) - - if len(repos) > 1: - raise TypeError( - 'Debian description with more than one Vcs repos: %(repos)r' % locals()) - - if repos: - vcs_tool, vcs_repo = repos[0] - else: - vcs_tool = None - vcs_repo = None - - return vcs_tool, vcs_repo - - -@map_router.route('http://ftp.debian.org/debian/pool/.*\.dsc') -class DebianDescriptionMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield packages parsed from a dsc Debian control file mapping. - """ - return parse_description( - metadata=json.loads(resource_uri.data), - purl=resource_uri.package_url, - base_download_url=None) - - -def get_files(text): - """ - Yield tuples of (checksum, size, filename) collected from a files field - `text`. 
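A standalone sketch of the two-partition parsing that get_files() performs below, using a made-up Files field value:

files_field = "0d5c3e 1024 foo_1.0.orig.tar.gz\n9a1b2c 2048 foo_1.0-1.dsc"
for line in files_field.splitlines():
    # Collapse runs of whitespace, then split into checksum, size, filename.
    line = " ".join(line.split())
    checksum, _, rest = line.partition(" ")
    size, _, filename = rest.partition(" ")
    print(checksum, size, filename)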
-    """
-    if text:
-        for line in text.splitlines(False):
-            # we have three space-separated items, so we perform two partitions
-            line = ' '.join(line.split())
-            checksum, _, rest = line.partition(' ')
-            size, _, filename = rest.partition(' ')
-            yield checksum, size, filename
-
-
-def parse_description(metadata, purl=None, base_download_url=None):
-    """
-    Yield ScannedPackages parsed from a description `metadata` mapping
-    for a single package version.
-    Yield as many Package as there are download URLs.
-    Optionally use the `purl` Package URL string if provided.
-    """
-    # FIXME: this may not be correct: Source and Binary are package names
-    common_data = dict(
-        name=metadata['Source'],
-        version=metadata['Version'],
-        homepage_url=metadata.get('Homepage'),
-        code_view_url=metadata.get('Vcs-Browser'),
-        parties=[]
-    )
-
-    if metadata.get('Label'):
-        common_data['keywords'] = [metadata.get('Label')]
-
-    vcs_tool, vcs_repo = get_vcs_repo(metadata)
-    if vcs_tool and vcs_repo:
-        vcs_repo = form_vcs_url(vcs_tool, vcs_repo)
-        common_data['vcs_url'] = vcs_repo
-
-    dependencies = get_dependencies(metadata)
-    if dependencies:
-        common_data['dependencies'] = dependencies
-
-    # TODO: add "original maintainer" seen in Ubuntu
-    maintainer = metadata.get('Maintainer')
-    if maintainer:
-        name, email = debutils.parse_email(maintainer)
-        if name:
-            party = scan_models.Party(
-                name=name, role='maintainer', email=email)
-            common_data['parties'].append(party)
-
-    @attr.s()
-    class File(object):
-        name = attr.ib(default=None)
-        size = attr.ib(default=None)
-        md5 = attr.ib(default=None)
-        sha1 = attr.ib(default=None)
-        sha256 = attr.ib(default=None)
-
-    def collect_files(existing_files, field_value, checksum_name):
-        for checksum, size, name in get_files(field_value):
-            fl = existing_files[name]
-            if not fl.name:
-                fl.name = name
-                fl.size = size
-            setattr(fl, checksum_name, checksum)
-
-    # TODO: what do we do with files?
-    # FIXME: we should store them in the package record
-    files = defaultdict(File)
-    collect_files(existing_files=files, field_value=metadata.get(
-        'Files'), checksum_name='md5')
-    collect_files(existing_files=files, field_value=metadata.get(
-        'Checksums-Sha1'), checksum_name='sha1')
-    collect_files(existing_files=files, field_value=metadata.get(
-        'Checksums-Sha256'), checksum_name='sha256')
-
-    # FIXME: craft a download_url
-    download_url = None
-    if base_download_url:
-        download_url = None
-    common_data['download_url'] = download_url
-
-    package = scan_models.DebianPackage(**common_data)
-    package.set_purl(purl)
-    yield package
-
-
-@map_router.route('http://ftp.debian.org/debian/dists/.*Sources.gz')
-class DebianSourceFileMapper(Mapper):
-
-    def get_packages(self, uri, resource_uri):
-        """
-        Yield ScannedPackages built from resource_uri record for a single
-        package version.
-        Yield as many Package as there are download URLs.
-        """
-        metadata = resource_uri.data
-        return parse_packages(metadata, resource_uri.package_url)
-
-
-def build_source_file_packages(metadata, purl=None):
-    """
-    Yield packages from the passed source file metadata.
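The loop below is driven by debian_inspector's paragraph parsing. A minimal sketch with made-up control text, assuming get_paragraphs_data behaves as it is used throughout this module:

from debian_inspector import debcon

sources_text = "Package: foo\nVersion: 1.0\n\nPackage: bar\nVersion: 2.0\n"
for source in debcon.get_paragraphs_data(sources_text):
    # Each paragraph is a dict-like mapping of control fields.
    print(source.get("Package"), source.get("Version"))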
-    metadata: Debian Sources text content
-    purl: String value of the package url of the ResourceURI object
-    """
-    for source in debcon.get_paragraphs_data(metadata):
-        package_name = source.get('Package')
-
-        parties = []
-        maintainer_names = debutils.comma_separated(
-            source.get('Maintainer', ''))
-        if maintainer_names:
-            for maintainer in maintainer_names:
-                name, email = debutils.parse_email(maintainer)
-                if name:
-                    party = scan_models.Party(
-                        name=name, role='maintainer', email=email)
-                    parties.append(party)
-        contributor_names = debutils.comma_separated(
-            source.get('Uploaders', ''))
-        if contributor_names:
-            for contributor in contributor_names:
-                name, email = debutils.parse_email(contributor)
-                if name:
-                    party = scan_models.Party(
-                        name=name, role='contributor', email=email)
-                    parties.append(party)
-
-        dependencies = get_dependencies(source, ['Build-Depends'])
-
-        keywords = set()
-        keywords.update(debutils.comma_separated(source.get('Binary', '')))
-        if source.get('Section'):
-            keywords.add(source.get('Section'))
-
-        files = source.get('Files')
-        for f in files:
-            name = f.get('name')
-            package = dict(
-                name=package_name,
-                version=source.get('Version'),
-                dependencies=dependencies,
-                parties=parties,
-                code_view_url=source.get('Vcs-Browser'),
-                homepage_url=source.get('Homepage'),
-                keywords=list(keywords),
-            )
-
-            download_url = 'http://ftp.debian.org/debian/{path}/{name}'.format(
-                path=source.get('Directory'),
-                name=name)
-
-            package['download_url'] = download_url
-
-            vcs_tool, vcs_repo = get_vcs_repo(source)
-            if vcs_tool and vcs_repo:
-                vcs_repo = form_vcs_url(vcs_tool, vcs_repo)
-                package['vcs_url'] = vcs_repo
-
-            package['md5'] = f.get('md5sum')
-            # TODO: Why would we have more than a single SHA1 or SHA256
-            sha1s = source.get('Checksums-Sha1', [])
-            for sha1 in sha1s:
-                sha1value = sha1.get('sha1')
-                name = sha1.get('name')
-                if name and sha1value:
-                    package['sha1'] = sha1value
-            sha256s = source.get('Checksums-Sha256', [])
-            for sha256 in sha256s:
-                sha256value = sha256.get('sha256')
-                name = sha256.get('name')
-                if name and sha256value:
-                    package['sha256'] = sha256value
-            package = scan_models.DebianPackage(**package)
-            package.set_purl(purl)
-            yield package
-
-
-@map_router.route('http://ftp.debian.org/debian/dists/.*Packages.gz')
-class DebianPackageFileMapper(Mapper):
-
-    def get_packages(self, uri, resource_uri):
-        """
-        Yield Packages from a Debian Packages index.
-        """
-        metadata = resource_uri.data
-        return parse_packages(metadata, resource_uri.package_url)
-
-
-def get_programming_language(tags):
-    """
-    Return the programming language extracted from list of `tags` strings.
-    """
-    for tag in tags:
-        key, _, value = tag.partition('::')
-        if key == 'implemented-in':
-            return value
-
-
-def parse_packages(metadata, purl=None):
-    """
-    Yield packages from Debian package text data.
-    metadata: Debian data (e.g. a Packages file)
-    purl: String value of the package url of the ResourceURI object
-    """
-    for pack in debcon.get_paragraphs_data(metadata):
-        data = dict(
-            name=pack['Package'],
-            version=pack['Version'],
-            homepage_url=pack.get('Homepage'),
-            code_view_url=pack.get('Vcs-Browser'),
-            description=pack.get('Description'),
-            bug_tracking_url=pack.get('Bugs'),
-            parties=[],
-            md5=pack.get('MD5sum'),
-            sha1=pack.get('SHA1'),
-            sha256=pack.get('SHA256'),
-        )
-
-        filename = pack.get('Filename'),
-        if filename:
-            data['download_url'] = 'http://ftp.debian.org/debian/{}'.format(
-                filename)
-
-        maintainers = pack.get('Maintainer')
-        if maintainers:
-            name, email = debutils.parse_email(maintainers)
-            if name:
-                party = scan_models.Party(
-                    name=name, role='maintainer', email=email)
-                data['parties'].append(party)
-
-        dependencies = get_dependencies(pack)
-        if dependencies:
-            data['dependencies'] = dependencies
-
-        keywords = debutils.comma_separated(pack.get('Tag', ''))
-
-        section = pack.get('Section')
-        if section:
-            keywords.append(section)
-        data['keywords'] = keywords
-
-        data['primary_language'] = get_programming_language(keywords)
-
-        package = scan_models.DebianPackage(**data)
-        if purl:
-            package.set_purl(purl)
-        yield package
-
-
-#################################################################################
-# FIXME: this cannot work since we do not fetch these yet AND what are the zip jar and gz in this???
-#################################################################################
-
-
-@map_router.route('http://ftp.debian.org/debian/dists/.*\.zip',
-                  'http://ftp.debian.org/debian/dists/.*\.jar',
-                  'http://ftp.debian.org/debian/dists/.*\.gz')
-class DebianArchiveFileMapper(Mapper):
-
-    def get_packages(self, uri, resource_uri):
-        return build_packages_from_dist_archive(resource_uri.data, resource_uri.uri)
-
-
-def build_packages_from_dist_archive(metadata, uri):
-    """
-    Yield Package built from a Debian project URI and the associated ls content,
-    which is the result of running the `ls -LR` command at the Debian root folder.
-    Yield as many Package as there are download URLs.
-    """
-    debian_dist_length = len('http://ftp.debian.org/debian/dists')
-    # The parent folder URI related to uri file itself.
-    folder_uri = uri[debian_dist_length: uri.rindex('/')]
-    debian_dist_length = len('http://ftp.debian.org/debian/dists')
-    # project name by truncating the uri
-    name = uri[debian_dist_length:uri.index('/', debian_dist_length)]
-    folder_length = debian_dist_length + len(name) + 1
-    # version by analysing the uri
-    version = uri[folder_length:uri.index('/', folder_length)]
-    common_data = dict(
-        datasource_id="debian_archive_file",
-        name=name,
-        version=version,
-    )
-
-    # FIXME: this is NOT RIGHT
-    def get_resourceuri_by_uri(uri):
-        """
-        Return the Resource URI matching the passed uri string value.
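A corrected standalone sketch of the name/version slicing performed above, on a hypothetical dists URI:

uri = "http://ftp.debian.org/debian/dists/foo/1.0/foo-1.0.zip"
# Strip the fixed prefix, then take the first two path segments.
path = uri[len("http://ftp.debian.org/debian/dists/"):]
name, version, _filename = path.split("/", 2)
assert (name, version) == ("foo", "1.0")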
- """ - from minecode.models import ResourceURI - uris = ResourceURI.objects.filter(uri=uri) - if uris: - return uris[0] - - url_template = 'http://ftp.debian.org/debian/dists{name}' - download_urls = [] - for entry in ls.parse_directory_listing(metadata): - if entry.type != ls.FILE: - continue - path = entry.path - - if path.startswith(folder_uri): - path = path.lstrip('/') - url = url_template.format(name=path) - # FIXME: this is NOT RIGHT - if path.endswith('.md5') and url.replace('.md5', '') == uri: - if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).md5: - common_data['md5'] = get_resourceuri_by_uri(url).md5 - # FIXME: this is NOT RIGHT - if path.endswith('.sha') and url.replace('.sha', '') == uri: - if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).sha1: - common_data['sha1'] = get_resourceuri_by_uri(url).sha1 - - if path.endswith(('.jar', 'zip', 'gz')) and url != uri: - download_urls.append(url) - - if download_urls: - for download_url in download_urls: - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package['download_url'] = download_url - yield package - else: - # yield package without a download_url value - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - # FIXME: this is NOT RIGHT: purl is not defined - package.set_purl(package.purl) - yield package diff --git a/minecode/mappers/dockerhub.py b/minecode/mappers/dockerhub.py deleted file mode 100644 index 92b8697f..00000000 --- a/minecode/mappers/dockerhub.py +++ /dev/null @@ -1,56 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/') -class DockerHubLiraryJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - metadata = resource_uri.data - build_packages_from_jsonfile( - metadata, resource_uri.uri, resource_uri.package_url) - - -def build_packages_from_jsonfile(metadata, uri=None, purl=None): - """ - Yield Package built from Docker Hub json content. - metadata: json metadata content - uri: String value of uri of the ResourceURI object. 
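A standalone sketch of the mapping below, with a made-up Docker Hub record:

content = {
    "name": "nginx",
    "description": "Web server",
    "full_description": "nginx is an HTTP and reverse proxy server.",
}
# Join the short and full descriptions, skipping empty values.
descriptions = [
    d
    for d in (content.get("description"), content.get("full_description"))
    if d and d.strip()
]
package_data = {
    "type": "docker",
    "name": content["name"],
    "description": "\n".join(descriptions),
    "homepage_url": "https://hub.docker.com/_/{}".format(content["name"]),
}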
- purl: String value of the package url of the ResourceURI object - """ - content = json.loads(metadata) - dockhub_library_htmlpage_template = 'https://hub.docker.com/_/{project}' - name = content.get('name') - if name: - short_desc = content.get('description') - long_desc = content.get('full_description') - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - common_data = dict( - type='docker', - name=name, - description=description, - homepage_url=dockhub_library_htmlpage_template.format( - project=name), - ) - package = scan_models.Package(**common_data) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/eclipse.py b/minecode/mappers/eclipse.py deleted file mode 100644 index 9edd9615..00000000 --- a/minecode/mappers/eclipse.py +++ /dev/null @@ -1,178 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from bs4 import BeautifulSoup - -from packagedcode import models as scan_models -from minecode import map_router -from minecode.mappers import Mapper - -# FIXME: we should create packages from releases!!!! not from projects - - -@map_router.route('http://projects.eclipse.org/json/project/.*') -class EclipseJsonPackageMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - # FIXME: JSON deserialization should be handled eventually by the framework - metadata = json.loads(resource_uri.data) - return build_packages_with_json(metadata, resource_uri.package_url, uri) - - -def build_packages_with_json(metadata, purl=None, uri=None): - """ - Yield Package built from Eclipse a `metadata` mapping - The package can contain multiple projects, and each project can contain - meta data including title, description, homepage, bug tracking url etc. - metadata: json metadata content - purl: String value of the package url of the ResourceURI object - """ - - projects = metadata['projects'] - for project, project_metadata in projects.items(): - common_data = dict( - datasource_id="eclipse_metadata", - type='eclipse', - name=project, - ) - - descriptions = project_metadata.get('description') - if descriptions and len(descriptions) > 0: - common_data['description'] = descriptions[0].get('value') - else: - common_data['description'] = project_metadata['title'] - - homepage_urls = project_metadata.get('website_url') - if homepage_urls and len(homepage_urls) > 0: - common_data['homepage_url'] = homepage_urls[0].get('url') - - bug_tracking_urls = project_metadata.get('bugzilla') - if bug_tracking_urls and len(bug_tracking_urls) > 0: - common_data['bug_tracking_url'] = bug_tracking_urls[0].get( - 'query_url') - - if project_metadata.get('licenses'): - common_data['extracted_license_statement'] = [ - l.get('name') for l in project_metadata.get('licenses', [])] - common_data['license_detections'] = [] - - # FIXME: this is a download page and NOT a download URL!!!!! 
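As an aside, the list-valued fields in this record are reduced with a "first entry wins" rule; a hedged standalone sketch with made-up data:

project_metadata = {"website_url": [{"url": "https://example.org"}]}
homepage_urls = project_metadata.get("website_url")
# Take the first entry if the list is non-empty, else None.
homepage_url = homepage_urls[0].get("url") if homepage_urls else None
# homepage_url == "https://example.org"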
- for download_url in project_metadata.get('download_url', []): - durl = download_url.get('url') - if durl: - common_data['download_url'] = durl - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package - - -@map_router.route('https://projects.eclipse.org/projects/.*') -class EclipseHTMLProjectMapper(Mapper): - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - # FIXME: JSON deserialization should be handled eventually by the framework - return build_packages(resource_uri.data, resource_uri.package_url, uri) - - -def build_packages(html_text, purl=None, uri=None): - """ - Yield Package objects built from `html_text`and the `purl` package URL - string. - """ - page = BeautifulSoup(html_text, 'lxml') - common_data = dict( - datasource_id="eclipse_html", - type='eclipse', - ) - - extracted_license_statement = [] - for meta in page.find_all(name='meta'): - if 'name' in meta.attrs and 'dcterms.title' in meta.attrs.get('name'): - common_data['name'] = meta.attrs.get('content') - if 'name' in meta.attrs and 'dcterms.description' in meta.attrs.get('name'): - common_data['description'] = meta.attrs.get('content') - - for div in page.find_all(name='div'): - if 'class' not in div.attrs: - continue - if 'field-name-field-project-licenses' in div.attrs.get('class'): - # Visit div element whose class atttribute is field-name-field-project-licenses - for a in div.find_all(name='a'): - if 'href' not in a.attrs: - continue - license_name = str(a.contents[0]) - extracted_license_statement.append(license_name) - if extracted_license_statement: - common_data['extracted_license_statement'] = extracted_license_statement - common_data['license_detections'] = [] - - for a in page.find_all(name='a'): - if a.contents: - if str(a.contents[0]).strip() == 'Website': - common_data['homepage_url'] = a['href'] - - for a in page.find_all(name='a'): - if not a.contents: - continue - if str(a.contents[0]).strip() == 'Downloads': - download_data = dict(download_url=a['href'],) - download_data.update(common_data) - package = scan_models.Package.from_package_data( - package_data=download_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package - - for div in page.find_all(name='div'): - if 'class' not in div.attrs: - continue - if 'field-name-field-latest-releases' not in div.attrs.get('class'): - continue - # Visit div element whose class attribute is ield-name-field-latest-releases - tbody = div.find(name='tbody') - if not tbody: - continue - - for tr in tbody.find_all(name='tr'): - for td in tr.find_all(name='td'): - a = td.find(name='a') - if not a: - continue - - if 'href' not in a.attrs or 'class' in a.attrs: - continue - - version = a.contents[0] - href = a['href'] - download_data = dict( - version=version, - download_url=href, - ) - download_data.update(common_data) - package = scan_models.Package.from_package_data( - package_data=download_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/freebsd.py b/minecode/mappers/freebsd.py deleted file mode 100644 index fb0a760b..00000000 --- a/minecode/mappers/freebsd.py +++ /dev/null @@ -1,54 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. 
-# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -from io import StringIO -import os -import saneyaml - -from packagedcode.freebsd import CompactManifestHandler - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import get_temp_dir - - -@map_router.route('https://pkg.freebsd.org/.*packagesite.txz') -class FreeBSDIndexMapper(Mapper): - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - return build_packages(resource_uri.data, resource_uri.package_url) - - -def build_packages(metadata, purl=None): - """ - Yield the package by parsing the passing json content. - metadata: json metadata content - purl: String value of the package url of the ResourceURI object - """ - buf = StringIO(metadata) - # The passing metadata is not a well-formatted yaml or json, but each line is a yaml, so read by line and parse with FreeBSDPackage parser. - for each_line in buf: - if each_line and each_line.strip() in ('', '{', '}'): - continue - content = saneyaml.load(each_line) - if content and content.get('name'): - temp_dir = get_temp_dir('freebsd_index') - location = os.path.join(temp_dir, '+COMPACT_MANIFEST') - with open(location, 'w') as manifest: - manifest.write(each_line) - with open(location, encoding='utf-8') as loc: - yaml_data = saneyaml.load(loc) - package = CompactManifestHandler._parse(yaml_data=yaml_data) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/freedesktop.py b/minecode/mappers/freedesktop.py deleted file mode 100644 index 297b48c1..00000000 --- a/minecode/mappers/freedesktop.py +++ /dev/null @@ -1,67 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from bs4 import BeautifulSoup -from packageurl import PackageURL - -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url - - -@map_router.route('https://www.freedesktop.org/wiki/Software/.*') -class FreedesktopHTMLProjectMapper(Mapper): - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - return build_packages(resource_uri.data, uri, resource_uri.package_url) - - -def build_packages(html_text, uri, purl): - """ - Yield Package objects built from `html_text` from the `uri` and the `purl` - package URL string. 
- """ - - purl = PackageURL.from_string(purl) - package_data = dict( - type='freedesktop', - name=purl.name, - version=purl.version, - homepage_url=uri - ) - - page = BeautifulSoup(html_text, 'lxml') - if page.h1: - package_data['description'] = page.h1.string.strip() - - for a in page.find_all(name='a'): - link = a['href'] - if 'freedesktop.org' not in link: - continue - - if '/releases/' in link or '/dist/' in link: - package_data['download_url'] = link - - if 'https://bugs.freedesktop.org/buglist.cgi' in link: - package_data['bug_tracking_url'] = link - - if 'http://cgit.freedesktop.org/' in link and 'tree/' in link: - package_data['code_view_url'] = link - - for li in page.find_all(name='li'): - if li.text and li.text.startswith('git://'): - package_data['vcs_url'] = form_vcs_url('git', li.text) - - yield scan_models.Package(**package_data) diff --git a/minecode/mappers/github.py b/minecode/mappers/github.py deleted file mode 100644 index 6ef72f00..00000000 --- a/minecode/mappers/github.py +++ /dev/null @@ -1,142 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from collections import OrderedDict -import json -import logging - -import attr - -import packagedcode.models as scan_models -from packageurl import PackageURL - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url -from minecode.utils import parse_date - - -logger = logging.getLogger(__name__) - - -@map_router.route('https://api\.github\.com/repos/([^/]+)/([^/]+)') -class GithubMetaFileMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - visited_data = resource_uri.data - if not visited_data: - return - return build_github_packages(visited_data, resource_uri.uri, resource_uri.package_url) - - -def build_github_packages(visited_data, uri, purl=None): - """ - Yield Package built from Github API visited_data as a JSON string. - metadata: HTML metadata content - uri: String value of the uri from ResourceURI object - purl: String value of the package url of the ResourceURI object - """ - visited_data = json.loads(visited_data, object_pairs_hook=OrderedDict) - - full_name = visited_data['full_name'] - namespace, name = split_org_repo(full_name) - # FIXME: when could this ever happen?? - assert name == visited_data['name'], 'build_github_packages: Inconsistent name and org for URI: ' + uri - - description = visited_data['description'] - - vcs_url = visited_data.get('git_url'), - if vcs_url: - vcs_url = form_vcs_url('git', vcs_url) - package = scan_models.Package( - type='github', - namespace=namespace, - name=name, - description=description, - primary_language=visited_data.get('language'), - homepage_url=visited_data.get('html_url'), - vcs_url=vcs_url, - # this size does not make sense - size=visited_data.get('size'), - ) - - if visited_data.get('owner'): - package.parties = [ - scan_models.Party( - # FIXME: we can add the org or user URL and we can know if this - # is an org or a perrsone too. 
- type=scan_models.party_person, - name=visited_data.get('owner'), - role='owner') - ] - - package.set_purl(purl) - - downloads = visited_data.get('downloads') or [] - for download in downloads: - html_url = download.get('html_url') - if html_url: - # make a copy - package = attr.evolve(package) - package.download_url = html_url - package.size = download.get('size') - package.release_date = parse_date(download.get('created_at')) - yield package - - tags = visited_data.get('tags') or [] - for tag in tags: - package = attr.evolve(package) - package.version = tag.get('name') - package_url = PackageURL(type='github', name=package.name, - namespace=namespace, version=tag.get('name')).to_string() - package.sha1 = tag.get('sha1') - if tag.get('tarball_url'): - package.download_url = tag.get('tarball_url') - package.set_purl(package_url) - yield package - if tag.get('zipball_url'): - package.download_url = tag.get('zipball_url') - package.set_purl(package_url) - yield package - - branches_download_urls = visited_data.get('branches_download_urls') or [] - for branches_download_url in branches_download_urls: - package = attr.evolve(package) - package.download_url = branches_download_url - yield package - - -def split_org_repo(url_like): - """ - Given a URL-like string to a GitHub repo or a repo name as in org/name, - split and return the org and name. - - For example: - >>> split_org_repo('foo/bar') - ('foo', 'bar') - >>> split_org_repo('https://api.github.com/repos/foo/bar/') - ('foo', 'bar') - >>> split_org_repo('github.com/foo/bar/') - ('foo', 'bar') - >>> split_org_repo('git://github.com/foo/bar.git') - ('foo', 'bar') - """ - segments = [s.strip() for s in url_like.split('/') if s.strip()] - if not len(segments) >= 2: - raise ValueError('Not a GitHub-like URL: {}'.format(url_like)) - org = segments[-2] - name = segments[-1] - if name.endswith('.git'): - name, _, _ = name .rpartition('.git') - return org, name diff --git a/minecode/mappers/gitlab.py b/minecode/mappers/gitlab.py deleted file mode 100644 index 8a6d7c6c..00000000 --- a/minecode/mappers/gitlab.py +++ /dev/null @@ -1,57 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -import packagedcode.models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url -from minecode.utils import parse_date - - -@map_router.route('https://gitlab.com/.*') -class GitLabMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. 
- """ - metadata = resource_uri.data - build_packages_from_json(metadata, resource_uri.package_url) - - -def build_packages_from_json(metadata, purl=None): - """ - Yield Package built from gitlab json content - metadata: Json metadata content - purl: String value of the package url of the ResourceURI object - """ - content = json.loads(metadata) - - name = content.get('name') - if name: - common_data = dict( - type='gitlab', - name=name, - homepage_url=content.get('web_url'), - description=content.get('description'), - ) - repo_url = content.get('http_url_to_repo') - if repo_url: - repo_url = form_vcs_url('git', repo_url) - common_data['vcs_url'] = repo_url - common_data['code_view_url'] = repo_url - common_data['release_date'] = parse_date(content.get('created_at')) - package = scan_models.Package(**common_data) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/golang.py b/minecode/mappers/golang.py deleted file mode 100644 index cd750da1..00000000 --- a/minecode/mappers/golang.py +++ /dev/null @@ -1,49 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode import models as scan_models -from packageurl import PackageURL - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url - - -@map_router.route('pkg:golang/.*') -class GolangApiDocMapper(Mapper): - - def get_packages(self, uri, resource_uri): - package = json.loads(resource_uri.data) - yield build_golang_package(package, resource_uri.package_url) - - -def build_golang_package(package_data, purl): - """ - Return a single Golang package - """ - package_url = PackageURL.from_string(purl) - vcs_url = package_url.qualifiers.get('vcs_repository') - homepage_url = '/'.join(['https:/', - package_url.namespace, package_url.name]) - vcs_tool = 'git' if 'github.com' in package_url.namespace else None - if vcs_tool: - vcs_url = form_vcs_url(vcs_tool, vcs_url) - # TODO: collect stats and counter from package_data too - package = scan_models.Package( - name=package_url.name, - namespace=package_url.namespace, - type=package_url.type, - primary_language='Go', - description=package_data.get('synopsis'), - homepage_url=homepage_url, - vcs_url=vcs_url, - ) - return package diff --git a/minecode/mappers/googlecode.py b/minecode/mappers/googlecode.py deleted file mode 100644 index b06bc538..00000000 --- a/minecode/mappers/googlecode.py +++ /dev/null @@ -1,127 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -import json - -from django.core.validators import URLValidator -from django.core.exceptions import ValidationError - -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json') -class GoogleNewAPIV2ProjectJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages built from resource_uri record for a single - package version. - """ - # FIXME: JSON deserialization should be handled eventually by the - # framework - metadata = json.loads(resource_uri.data) - return build_packages_from_projectsjson_v2(metadata, resource_uri.package_url, uri) - - -def build_packages_from_projectsjson_v2(metadata, purl=None, uri=None): - """ - Yield Package built from Googlecode API json `metadata` mapping - which is a dictionary keyed by project name and values are metadatadata. - Yield as many Package as there are download URLs. - metadata: json metadata content from API call - purl: String value of the package url of the ResourceURI object - """ - short_desc = metadata.get('summary') - long_desc = metadata.get('description') - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - common_data = dict( - datasource_id='googlecode_api_json', - type='googlecode', - name=metadata.get('name'), - description=description - ) - - license_name = metadata.get('license') - if license_name: - common_data['extracted_license_statement'] = license_name - common_data['license_detections'] = [] - - keywords = [] - labels = metadata.get('labels') - for label in labels: - if label: - keywords.append(label.strip()) - common_data['keywords'] = keywords - - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package - - -@map_router.route('https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media') -class GoogleNewAPIV1ProjectJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Packages built from resource_uri record for a single - package version. - """ - # FIXME: JSON deserialization should be handled eventually by the - # framework - metadata = json.loads(resource_uri.data) - return build_packages_from_projectsjson_v1(metadata, resource_uri.package_url, uri) - - -def build_packages_from_projectsjson_v1(metadata, purl=None, uri=None): - """Yield Package from the project.json passed by the google code v1 API - metadata: json metadata content from API call - purl: String value of the package url of the ResourceURI object - """ - if metadata.get('name'): - common_data = dict( - datasource_id="googlecode_json", - type='googlecode', - name=metadata.get('name'), - description=metadata.get('description') - ) - - license_name = metadata.get('license') - if license_name: - common_data['extracted_license_statement'] = license_name - common_data['license_detections'] = [] - - keywords = [] - labels = metadata.get('labels') - for label in labels: - if label: - keywords.append(label.strip()) - common_data['keywords'] = keywords - - common_data['vcs_url'] = metadata.get('ancestorRepo') - common_data['namespace'] = metadata.get('domain') - - # createTime doesn't make sense since the timestamp value is incorrect - # and parsing it will give a wrong year out of range. 
- - # created_time = metadata.get('creationTime') - # if created_time: - # common_data['release_date'] = date.fromtimestamp(created_time) - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/gstreamer.py b/minecode/mappers/gstreamer.py deleted file mode 100644 index 8d953302..00000000 --- a/minecode/mappers/gstreamer.py +++ /dev/null @@ -1,51 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -from commoncode import fileutils -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+[.tar\.bz2\\.gz|\.tar\.xz]') -class GstreamerURLMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - """ - return build_package_from_url(resource_uri.uri, resource_uri.package_url) - - -def build_package_from_url(uri, purl=None): - """ - Return Package built from uri and package_url. - uri: String value of uri of the ResourceURI object. - purl: String value of the package url of the ResourceURI object - """ - file_name = fileutils.file_name(uri) - file_name_without_prefix = file_name - prefixes = ('.tar.bz2', '.tar.gz', '.tar.xz') - for prefix in prefixes: - file_name_without_prefix = file_name_without_prefix.replace(prefix, '') - if '-' in file_name_without_prefix: - project_name, _, version = file_name.rpartition('-') - common_data = dict( - type='gstreamer', - name=project_name, - version=version, - download_url=uri, - homepage_url='https://gstreamer.freedesktop.org' - ) - package = scan_models.Package(**common_data) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/haxe.py b/minecode/mappers/haxe.py deleted file mode 100644 index a8b6e594..00000000 --- a/minecode/mappers/haxe.py +++ /dev/null @@ -1,35 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode.haxe import HaxelibJsonHandler - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json') -class HaxePackageJsonMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from package json file. 
- """ - # FIXME: JSON deserialization should be handled eventually by the framework - metadata = json.loads(resource_uri.data) - return build_packages_with_json(metadata, resource_uri.package_url) - - -def build_packages_with_json(metadata, purl=None): - # yield package by getting package from the build_package parser in scancode - package = HaxelibJsonHandler._parse(json_data=metadata) - if package: - package.set_purl(purl) - yield package diff --git a/minecode/mappers/maven.py b/minecode/mappers/maven.py deleted file mode 100644 index a3ec9855..00000000 --- a/minecode/mappers/maven.py +++ /dev/null @@ -1,135 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json -import logging -import packageurl -from packageurl import PackageURL - -from commoncode.text import as_unicode -from packagedcode.models import PackageData -from packagedcode.maven import _parse - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date -from minecode.visitors.maven import Artifact - - -TRACE = False - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -if TRACE: - import sys - logging.basicConfig(stream=sys.stdout) - logger.setLevel(logging.DEBUG) - - -@map_router.route('maven-index://.*') -class MavenIndexArtifactMapper(Mapper): - """ - Process the minimal artifacts collected for a Maven Jar or POM in an - index visit. - """ - - def get_packages(self, uri, resource_uri): - yield get_mini_package(resource_uri.data, uri, resource_uri.package_url) - - -def get_mini_package(data, uri, purl): - """ - Return a MavenPomPackage built from the minimal artifact data available in a - nexus index, given a `data` JSON string, a `uri` string and a `purl` - PacxkageURL string. Return None if the package cannot be built. - """ - if not data: - return - - artdata = json.loads(data) - - # FIXME: this should a slot in Artifact - download_url = artdata.pop('download_url') - # FIXME: what if this is an ArtifactExtended?? - artifact = Artifact(**artdata) - - if purl: - if isinstance(purl, str): - purl = PackageURL.from_string(purl) - assert isinstance(purl, PackageURL) - - qualifiers = None - if purl and purl.qualifiers: - qualifiers = packageurl.normalize_qualifiers( - purl.qualifiers, encode=False) - if qualifiers: - assert isinstance(qualifiers, dict) - logger.debug('get_mini_package: qualifiers: {}'.format(qualifiers)) - - package = PackageData( - type='maven', - namespace=artifact.group_id, - name=artifact.artifact_id, - version=artifact.version, - qualifiers=qualifiers, - description=artifact.description, - download_url=download_url, - release_date=parse_date(artifact.last_modified), - size=artifact.size, - sha1=artifact.sha1 or None, - ) - logger.debug('get_mini_package: package.qualifiers: {}'.format( - package.qualifiers)) - logger.debug( - 'get_mini_package for uri: {}, package: {}'.format(uri, package)) - return package - - -# FIXME this should be valid for any POM -@map_router.route('https?://repo1.maven.org/maven2/.*\.pom') -class MavenPomMapper(Mapper): - """ - Map a proper full POM visited as XML. 
- """ - - def get_packages(self, uri, resource_uri): - - logger.debug('MavenPomMapper.get_packages: uri: {}, resource_uri: {}, purl:' - .format(uri, resource_uri.uri, resource_uri.package_url)) - package = get_package(resource_uri.data, resource_uri.package_url) - if package: - logger.debug('MavenPomMapper.get_packages: uri: {}, package: {}' - .format(uri, package)) - yield package - - -def get_package(text, package_url=None, - baseurl='https://repo1.maven.org/maven2'): - """ - Return a ScannedPackage built from a POM XML string `text`. - """ - text = as_unicode(text) - package = _parse( - datasource_id='maven_pom', - package_type='maven', - primary_language='Java', - text=text - ) - if package: - # FIXME: this should be part of the parse call - if package_url: - purl = PackageURL.from_string(package_url) - package.set_purl(purl) - # Build proper download_url given a POM: this must be the URL for - # the Jar which is the key to the PackageDB record - # FIXME the download is hardcoded to Maven Central? - # package.download_url = package.repository_download_url(baseurl=baseurl) - return package diff --git a/minecode/mappers/npm.py b/minecode/mappers/npm.py deleted file mode 100644 index 0e13d5d5..00000000 --- a/minecode/mappers/npm.py +++ /dev/null @@ -1,61 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -import json -import logging - - -from packagedcode.npm import NpmPackageJsonHandler - -from minecode import map_router -from minecode.mappers import Mapper - - -TRACE = False - -logger = logging.getLogger(__name__) - -if TRACE: - import sys - logging.basicConfig(stream=sys.stdout) - logger.setLevel(logging.DEBUG) - - -# FIXME: This route may not work when we have scoped Packages or URLs to a specific version -# or yarn URLs -@map_router.route('https://registry.npmjs.org/[^\/]+') -class NpmPackageMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield NpmPackage built from a resource_uri record that contains many - npm versions for a given npm name. - """ - if not resource_uri.data: - return - visited_data = json.loads(resource_uri.data) - return build_packages(visited_data) - - -# FIXME: Consider using PURL here -def build_packages(data): - """ - Yield NpmPackage built from data corresponding to a single package name - and many npm versions. - """ - versions = data.get('versions', {}) - - logger.debug('build_packages: versions: ' + repr(type(versions))) - for version, data in versions.items(): - logger.debug('build_packages: version: ' + repr(version)) - logger.debug('build_packages: data: ' + repr(data)) - package = NpmPackageJsonHandler._parse(json_data=data) - if package: - yield package diff --git a/minecode/mappers/nuget.py b/minecode/mappers/nuget.py deleted file mode 100644 index 7b4b2c0d..00000000 --- a/minecode/mappers/nuget.py +++ /dev/null @@ -1,182 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -import json - -from bs4 import BeautifulSoup - -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper - - -@map_router.route('https://api.nuget.org/v3/catalog.+\.json') -class NugetPackageMapper(Mapper): - """ - Return NugetPackage object by parsing the ResourceURI stored in db referenced by the - nuget API URIs. - """ - - def get_packages(self, uri, resource_uri): - if not resource_uri.data: - return - pkg_data = json.loads(resource_uri.data) - return build_packages_with_json(pkg_data, resource_uri.package_url) - - -def build_packages_with_json(metadata, purl=None): - """ - Yield package from the json metadata passed - metadata: json metadata content from API call - purl: String value of the package url of the ResourceURI object - """ - licenseUrl = metadata.get('licenseUrl') - copyr = metadata.get('copyright') - - authors = [] - names = metadata.get('authors') - if names: - for name in names.split(','): - authors.append(scan_models.Party(name=name.strip(), role='author')) - - keywords = metadata.get('tags', []) - - # TODO: the content has the SHA512, our model may extend to SHA512 - - if name: - short_desc = metadata.get('summary') - long_desc = metadata.get('description') - if long_desc == short_desc: - long_desc = None - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - package_mapping = dict( - type='nuget', - name=metadata['id'], - version=metadata['version'], - homepage_url=metadata.get('projectUrl'), - description=description, - extracted_license_statement=licenseUrl, - license_detections=[], - copyright=copyr, - parties=authors, - keywords=keywords, - ) - package = scan_models.PackageData.from_data( - package_data=package_mapping) - package.set_purl(purl) - yield package - - -@map_router.route('https://api.nuget.org/packages/.*\.nupkg') -class NugetNUPKGDownloadMapper(Mapper): - """ - Return NugetPackage object by parsing the download URL. - For example: https://api.nuget.org/packages/entityframework.4.3.1.nupkg - """ - - def get_packages(self, uri, resource_uri): - if not resource_uri.data: - return - pkg_data = json.loads(resource_uri.data) - return build_packages_with_nupkg_download_url(pkg_data, resource_uri.package_url, resource_uri.uri) - - -def build_packages_with_nupkg_download_url(metadata, purl, uri): - if purl: - package = scan_models.PackageData( - type='nuget', - name=purl.name, - download_url=uri - ) - package.set_purl(purl) - yield package - - -@map_router.route('https://www.nuget.org/packages/[\w\-\.]+', - 'https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+') -class NugetHTMLPackageMapper(Mapper): - """ - Return NugetPackage object by parsing the package HTML content. - For example: https://www.nuget.org/packages/log4net - """ - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri data. 
- """ - metadata = resource_uri.data - build_packages_from_html( - metadata, resource_uri.uri, resource_uri.package_url) - - -def build_packages_from_html(metadata, uri, purl=None): - """ - Yield Package built from Nuget a `metadata` content - metadata: json metadata content - uri: the uri of the ResourceURI object - purl: String value of the package url of the ResourceURI object - """ - download_url_format = 'https://www.nuget.org/api/v2/package/{name}/{version}' - soup = BeautifulSoup(metadata, 'lxml') - h1 = soup.find('h1') - if h1 and h1.contents: - license_value = None - name = str(h1.contents[0]).strip() - for a in soup.find_all('a'): - if a.get('data-track') and a.get('data-track') == 'outbound-license-url': - license_value = a.string - if license_value: - license_value = str(license_value).strip() - - copyright_value = None - h2s = soup.find_all('h2') - for h2 in h2s: - # Copyright will be after the copyright h2 node - # The exmaple is like this: - #
<h2>Copyright</h2> - # <p>Copyright 2004-2017 The Apache Software Foundation</p>
- if h2.string and h2.string == 'Copyright': - next_element = h2.find_next_sibling('p') - if next_element: - copyright_value = next_element.string - - description = None - for m in soup.find_all('meta'): - if m.get('property') and m.get('property') == 'og:description' and m.get('content'): - description = m.get('content') - - for tbody in soup.find_all('tbody'): - if tbody.get('class') and tbody.get('class')[0] == 'no-border': - for a in tbody.find_all('a'): - version = a.string - if not version or not version.strip(): - continue - version = version.strip() - download_url = download_url_format.format( - name=name, version=version) - package_mapping = dict( - datasource_id="nuget_metadata_json", - name=name, - type='nuget', - version=version, - homepage_url=uri, - description=description, - download_url=download_url, - extracted_license_statement=license_value, - license_detections=[], - copyright=copyright_value - ) - package = scan_models.Package.from_package_data( - package_data=package_mapping, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/openssl.py b/minecode/mappers/openssl.py deleted file mode 100644 index c082cd52..00000000 --- a/minecode/mappers/openssl.py +++ /dev/null @@ -1,69 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -from datetime import datetime -import logging - -from commoncode import fileutils -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -@map_router.route('https://ftp.openssl.org/.*') -class OpenSSLMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield ScannedPackage built from resource_uri record for a single package - version. Yield as many Package from the uri - """ - return build_packages(resource_uri, resource_uri.package_url) - - -def build_packages(resource_uri, purl=None): - """ - Yield Package from resource_uri metadata - resource_uri: ResourceURI object - purl: String value of the package url of the ResourceURI object - """ - uri = resource_uri.uri - file_name = fileutils.file_name(uri) - version = file_name.replace('.tar.gz', '').replace('openssl-', '').replace('.tar.gz', '').replace( - '.asc', '').replace('.md5', '').replace('.sha1', '').replace('.sha256', '') - common_data = dict( - datasource_id="openssl_metadeta", - type='generic', - name=file_name, - description='The OpenSSL Project is a collaborative effort to develop a robust, commercial-grade, fully featured, and Open Source toolkit implementing the Transport Layer Security (TLS) protocols (including SSLv3) as well as a full-strength general purpose cryptographic library.', - version=version, - size=resource_uri.size, - release_date=parse_date(resource_uri.last_modified_date), - extracted_license_statement='OpenSSL License', - license_detections=[], - homepage_url='https://www.openssl.org/', - download_url=uri, - copyright='Copyright (c) 1998-2018 The OpenSSL Project\nCopyright (c) 1995-1998 Eric A. Young, Tim J. 
Hudson\nAll rights reserved.', - vcs_url='git+https://github.com/openssl/openssl.git', - code_view_url='https://github.com/openssl/openssl', - bug_tracking_url='https://github.com/openssl/openssl/issues', - ) - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/openwrt.py b/minecode/mappers/openwrt.py deleted file mode 100644 index 7c1b9ec1..00000000 --- a/minecode/mappers/openwrt.py +++ /dev/null @@ -1,90 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json -import logging - -from packagedcode import models as scan_models - -from minecode import debutils -from minecode import map_router -from minecode.mappers import Mapper -from minecode.mappers.debian import get_dependencies - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -""" -OpenWRT IPK package data files are using the Deb822 format. -""" - - -@map_router.route('https://downloads.openwrt.org/.*\.ipk') -class OpenwrtIpkMetadataMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield ScannedPackage built from resource_uri record for a single package - version. Yield as many Package as there are download URLs. - """ - metadata = json.loads(resource_uri.data) - return build_packages(metadata, resource_uri.package_url, uri) - - -def build_packages(metadata, purl=None, uri=None): - """ - Yield ScannedPackage built from the passing metadata. - metadata: metadata mapping - purl: String value of the package url of the ResourceURI object - """ - common_data = dict( - type='openwrt', - datasource_id='openwrt_metadata', - name=metadata.get('Package'), - version=metadata.get('Version'), - description=metadata.get('Description'), - size=metadata.get('Installed-Size'), - ) - - dependencies = get_dependencies(metadata, ['Depends']) - if dependencies: - common_data['dependencies'] = dependencies - - maintainers = metadata.get('Maintainer') - if maintainers: - name, email = debutils.parse_email(maintainers) - if name: - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - party = scan_models.Party( - name=name, role='maintainer', email=email) - common_data['parties'].append(party) - - lic = metadata.get('License') - if lic: - common_data['declared_license'] = lic - - common_data['keywords'] = [] - section = metadata.get('Section') - if section: - common_data['keywords'].append(section) - architecture = metadata.get('Architecture') - if architecture: - common_data['keywords'].append(architecture) - package = scan_models.Package.from_package_data( - package_data=common_data, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/packagist.py b/minecode/mappers/packagist.py deleted file mode 100644 index dc05dd30..00000000 --- a/minecode/mappers/packagist.py +++ /dev/null @@ -1,98 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. 
-# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode import models as scan_models -from packagedcode.models import DependentPackage - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import form_vcs_url - - -@map_router.route('https://packagist.org/p/.*json') -class PackagistPackageMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield Package built from resource_uri record for a single - package version. - Yield as many Package as there are multiple versions. - """ - metadata = json.loads(resource_uri.data) - return build_packages_with_json(metadata, resource_uri.package_url, uri) - - -def build_packages_with_json(metadata, purl=None, uri=None): - """ - Yield Package built from Packist package json content. - metadata: json metadata content - purl: String value of the package url of the ResourceURI object - """ - - package = metadata.get('package') - if package: - primary_language = package.get('language') - for version_content in package.get('versions').values(): - common = dict( - datasource_id='php_composer_json', - type='composer', - name=version_content.get('name'), - description=version_content.get('description'), - primary_language=primary_language, - ) - common['version'] = version_content.get('version') - common['keywords'] = version_content.get('keywords') - common['homepage_url'] = version_content.get('homepage') - - source = version_content.get('source') - if source: - if source.get('type') == 'git' and source.get('url'): - common['vcs_url'] = form_vcs_url('git', source.get('url')) - else: - pass # Packagist only has the github repo - - dist = version_content.get('dist') - if dist: - common['download_url'] = dist.get('url') - common['sha1'] = dist.get('shasum') - - for author in version_content.get('authors', []): - parties = common.get('parties') - if not parties: - common['parties'] = [] - common['parties'].append( - scan_models.Party(name=author.get('name'), role='author', url=author.get( - 'homepage'), email=author.get('email')).to_dict() - ) - - extracted_license_statement = set([]) - for lic in version_content.get('license'): - extracted_license_statement.add(lic) - if extracted_license_statement: - common['extracted_license_statement'] = list( - extracted_license_statement) - common['license_detections'] = [] - - dependencies = [] - for name, version in version_content.get('require', {}).items(): - dependencies.append( - DependentPackage( - purl=name, extracted_requirement=version, scope='runtime').to_dict() - ) - if dependencies: - common['dependencies'] = dependencies - # FIXME: We should create a composer package - package = scan_models.Package.from_package_data( - package_data=common, - datafile_path=uri, - ) - package.set_purl(purl) - yield package diff --git a/minecode/mappers/pypi.py b/minecode/mappers/pypi.py deleted file mode 100644 index 69110414..00000000 --- a/minecode/mappers/pypi.py +++ /dev/null @@ -1,144 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - - -import json - -from packagedcode import models as scan_models - -from minecode import map_router -from minecode.mappers import Mapper -from minecode.utils import parse_date - - -@map_router.route('https://pypi.python.org/pypi/[^/]+/[^/]+/json') -class PypiPackageMapper(Mapper): - - def get_packages(self, uri, resource_uri): - """ - Yield ScannedPackages built from resource_uri record for a single - package version. - Yield as many Package as there are download URLs. - """ - # FIXME: JSON deserialization should be handled eventually by the framework - metadata = json.loads(resource_uri.data) - return build_packages(metadata, resource_uri.package_url) - - -def build_packages(metadata, purl=None): - """ - Yield ScannedPackage built from Pypi a `metadata` mapping - for a single package version. - Yield as many Package as there are download URLs. - - The metadata for a Pypi package has three main blocks: info, releases and - urls. Releases is redundant with urls and contains all download urls for - every releases. It is repeased for each version-specific json: we ignore it - and use only info and urls. - - purl: String value of the package url of the ResourceURI object - """ - info = metadata['info'] - # mapping of information that are common to all the downloads of a version - short_desc = info.get('summary') - long_desc = info.get('description') - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - common_data = dict( - name=info['name'], - version=info['version'], - description=description, - homepage_url=info.get('home_page'), - bug_tracking_url=info.get('bugtrack_url'), - ) - - author = info.get('author') - email = info.get('author_email') - if author or email: - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - common_data['parties'].append(scan_models.Party( - type=scan_models.party_person, name=author, role='author', email=email)) - - maintainer = info.get('maintainer') - email = info.get('maintainer_email') - if maintainer or email: - parties = common_data.get('parties') - if not parties: - common_data['parties'] = [] - common_data['parties'].append(scan_models.Party( - type=scan_models.party_person, name=maintainer, role='maintainer', email=email)) - - extracted_license_statement = [] - lic = info.get('license') - if lic and lic != 'UNKNOWN': - extracted_license_statement.append(lic) - - classifiers = info.get('classifiers') - if classifiers and not extracted_license_statement: - licenses = [ - lic for lic in classifiers if lic.lower().startswith('license')] - for lic in licenses: - extracted_license_statement.append(lic) - - common_data['extracted_license_statement'] = extracted_license_statement - - kw = info.get('keywords') - if kw: - common_data['keywords'] = [k.strip() - for k in kw.split(',') if k.strip()] - - # FIXME: we should either support "extra" data in a ScannedPackage or just ignore this kind of FIXME comments for now - - # FIXME: not supported in ScanCode Package: info.platform may provide some platform infor (possibly UNKNOWN) - # FIXME: not supported in ScanCode Package: info.docs_url - # FIXME: not supported in ScanCode Package: info.release_url "http://pypi.python.org/pypi/Django/1.10b1" - # FIXME: not supported in ScanCode Package: info.classifiers: this contains a lot of other info (platform, license, etc) - # FIXME: if the homepage is on Github we can infer the VCS - # FIXME: info.requires_dist contains a list of requirements/deps that should be mapped to 
dependencies? - # FIXME: info.requires_python may be useful and should be mapped to some platform? - # FIXME: Package Index Owner: seems to be only available on the web page - - # A download_url may be provided for off Pypi download: we yield a package if relevant - # FIXME: do not prioritize the download_url outside Pypi over actual exact Pypi donwload URL - download_url = info.get('download_url') - if download_url and download_url != 'UNKNOWN': - download_data = dict( - datasource_id='pypi_sdist_pkginfo', - type='pypi', - download_url=download_url, - ) - download_data.update(common_data) - package = scan_models.PackageData.from_data(download_data) - # TODO: Consider creating a DatafileHandler for PyPI API metadata - package.datasource_id = 'pypi_api_metadata' - package.set_purl(purl) - yield package - - # yield a package for each download URL - for download in metadata['urls']: - url = download.get('url') - if not url: - continue - - download_data = dict( - download_url=url, - size=download.get('size'), - release_date=parse_date(download.get('upload_time')), - datasource_id='pypi_sdist_pkginfo', - type='pypi', - ) - # TODO: Check for other checksums - download_data['md5'] = download.get('md5_digest') - download_data.update(common_data) - package = scan_models.PackageData.from_data(download_data) - package.datasource_id = 'pypi_api_metadata' - package.set_purl(purl) - yield package diff --git a/minecode/mappers/repomd.py b/minecode/mappers/repomd.py deleted file mode 100644 index 10b8bf66..00000000 --- a/minecode/mappers/repomd.py +++ /dev/null @@ -1,31 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json - -from packagedcode.models import PackageData - -from minecode import map_router - - -@map_router.route('.+/repomd.xml') -def map_repomd_data(uris, resource_uri): - """ - Returns a list of RpmPackage objects collected from visitors. - """ - if not resource_uri.data: - return - packages = [] - for pkg_data in json.loads(resource_uri.data): - # 'name' is required for every package - # FIXME: how could we obtain a package without a name??? - # FIXME: This cannot work unless we use **pkg_data - if pkg_data.get('name'): - packages.append(PackageData(pkg_data)) - return packages diff --git a/minecode/mappers/rubygems.py b/minecode/mappers/rubygems.py deleted file mode 100644 index 19f0e0c1..00000000 --- a/minecode/mappers/rubygems.py +++ /dev/null @@ -1,291 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - - -import json -import logging - -from packagedcode import models as scan_models -from packagedcode.models import DependentPackage -from packagedcode.models import PackageData - -from minecode import map_router -from minecode import saneyaml -from minecode.mappers import Mapper -from minecode.utils import parse_date - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -@map_router.route('https*://rubygems\.org/api/v1/versions/[\w\-\.]+.json') -class RubyGemsApiVersionsJsonMapper(Mapper): - """ - Mapper to build Rubygems Packages from JSON API data. - """ - - def get_packages(self, uri, resource_uri): - metadata = json.loads(resource_uri.data) - _, sep, namejson = uri.partition('versions/') - if not sep: - return - name, sep, _ = namejson.rpartition('.json') - if not sep: - return - return build_rubygem_packages_from_api_data(metadata, name) - - -def build_rubygem_packages_from_api_data(metadata, name, purl=None): - """ - Yield Package built from resource_uri record for a single - package version. - metadata: json metadata content - name: package name - purl: String value of the package url of the ResourceURI object - """ - for version_details in metadata: - short_desc = version_details.get('summary') - long_desc = version_details.get('description') - if long_desc == short_desc: - long_desc = None - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - package = dict( - type='gem', - name=name, - description=description, - version=version_details.get('number'), - ) - # FIXME: we are missing deps and more things such as download URL and more - - if version_details.get('sha'): - package['sha256'] = version_details.get('sha') - - package['release_date'] = parse_date( - version_details.get('created_at') or '') or None - - author = version_details.get('authors') - if author: - parties = package.get('parties') - if not parties: - package['parties'] = [] - party = scan_models.Party(name=author, role='author') - package['parties'].append(party) - - extracted_license_statement = [] - licenses = version_details.get('licenses') - if licenses: - for lic in licenses: - extracted_license_statement.append(lic) - if extracted_license_statement: - package['extracted_license_statement'] = extracted_license_statement - package = PackageData.from_data(package) - package.set_purl(purl) - yield package - - -@map_router.route('https?://rubygems.org/downloads/[\w\-\.]+.gem') -class RubyGemsPackageArchiveMetadataMapper(Mapper): - """ - Mapper to build on e Package from the metadata file found inside a gem. 
- """ - - def get_packages(self, uri, resource_uri): - metadata = resource_uri.data - return build_rubygem_packages_from_metadata(metadata, download_url=uri) - - -def build_rubygem_packages_from_metadata(metadata, download_url=None, purl=None): - """ - Yield Package built from a Gem `metadata` YAML content - metadata: json metadata content - download_url: url to download the package - purl: String value of the package url of the ResourceURI object - """ - content = saneyaml.load(metadata) - if not content: - return - - name = content.get('name') - short_desc = content.get('summary') - long_desc = content.get('description') - if long_desc == short_desc: - long_desc = None - descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] - description = '\n'.join(descriptions) - package = dict( - type='gem', - name=name, - description=description, - homepage_url=content.get('homepage'), - ) - if download_url: - package['download_url'] = download_url - - extracted_license_statement = [] - licenses = content.get('licenses') - if licenses: - for lic in licenses: - extracted_license_statement.append(lic) - if extracted_license_statement: - package['extracted_license_statement'] = extracted_license_statement - - authors = content.get('authors') - for author in authors: - parties = package.get('parties') - if not parties: - package['parties'] = [] - party = scan_models.Party(name=author, role='author') - package['parties'].append(party) - - # Release date in the form of `2010-02-01 00:00:00 -05:00` - release_date = content.get('date', '').split() - package['release_date'] = parse_date(release_date[0]) - - package['dependencies'] = get_dependencies_from_meta(content) or [] - - # This is a two level nenest item - version1 = content.get('version') or {} - version = version1.get('version') or None - package['version'] = version - package = PackageData.from_data(package) - package.set_purl(purl) - yield package - - -def get_dependencies_from_meta(content): - """ - Return a mapping of dependencies keyed by group based on the gem YAML - metadata data structure. - """ - dependencies = content.get('dependencies') or [] - if not dependencies: - return [] - - group = [] - for dependency in dependencies: - name = dependency.get('name') or None - if not name: - continue - - requirement = dependency.get('requirement') or {} - # FIXME when upating to the ScanCode package model - scope = dependency.get('type') - scope = scope and scope.lstrip(':') - - # note that as weird artifact of our saneyaml YAML parsing, we are - # getting both identical requirements and version_requirements mapping. - # We ignore version_requirements - # requirement is {'requirements': [ - # [u'>=', {'version': '0'}] - # ] - # } - requirements = requirement.get('requirements') or [] - version_constraint = [] - - # each requirement is [u'>=', {'version': '0'}] - for constraint, req_version in requirements: - req_version = req_version.get('version') or None - # >= 0 allows for any version: we ignore these type of contrainsts - # as this is the same as no constraints. 
We also ignore lack of - # constraints and versions - if ((constraint == '>=' and req_version == '0') - or not (constraint and req_version)): - continue - version_constraint.append(' '.join([constraint, req_version])) - version_constraint = ', '.join(version_constraint) or None - - group.append(DependentPackage( - purl=name, extracted_requirement=version_constraint, scope=scope)) - - return group - - -def get_dependencies_from_api(content): - """ - Return a mapping of dependencies keyed by group based on the RubyGems API - data structure. - """ - dependencies = content.get('dependencies') or [] - if not dependencies: - return {} - - group = [] - for dependency in dependencies: - name = dependency.get('name') or None - if not name: - continue - - requirement = dependency.get('requirement') or {} - scope = dependency.get('type') - scope = scope and scope.lstrip(':') - - # note that as weird artifact of our saneyaml YAML parsing, we are - # getting both identical requirements and version_requirements mapping. - # We ignore version_requirements - # requirement is {'requirements': [ - # [u'>=', {'version': '0'}] - # ] - # } - requirements = requirement.get('requirements') or [] - version_constraint = [] - # each requirement is [u'>=', {'version': '0'}] - for constraint, req_version in requirements: - req_version = req_version.get('version') or None - # >= 0 allows for any version: we ignore these type of contrainsts - # as this is the same as no constraints. We also ignore lack of - # constraints and versions - if ((constraint == '>=' and req_version == '0') - or not (constraint and req_version)): - continue - version_constraint.append(' '.join([constraint, req_version])) - version_constraint = ', '.join(version_constraint) or None - - group.append(DependentPackage( - purl=name, extracted_requirement=version_constraint, scope=scope)) - - return group - - -# Structure: {gem_spec: license.key} -LICENSES_MAPPING = { - 'None': None, - 'Apache 2.0': 'apache-2.0', - 'Apache License 2.0': 'apache-2.0', - 'Apache-2.0': 'apache-2.0', - 'Apache': 'apache-2.0', - 'GPL': 'gpl-2.0', - 'GPL-2': 'gpl-2.0', - 'GNU GPL v2': 'gpl-2.0', - 'GPLv2+': 'gpl-2.0-plus', - 'GPLv2': 'gpl-2.0', - 'GPLv3': 'gpl-3.0', - 'MIT': 'mit', - 'Ruby': 'ruby', - "same as ruby's": 'ruby', - 'Ruby 1.8': 'ruby', - 'Artistic 2.0': 'artistic-2.0', - 'Perl Artistic v2': 'artistic-2.0', - '2-clause BSDL': 'bsd-simplified', - 'BSD': 'bsd-new', - 'BSD-3': 'bsd-new', - 'ISC': 'isc', - 'SIL Open Font License': 'ofl-1.0', - 'New Relic': 'new-relic', - 'GPL2': 'gpl-2.0', - 'BSD-2-Clause': 'bsd-simplified', - 'BSD 2-Clause': 'bsd-simplified', - 'LGPL-3': 'lgpl-3.0', - 'LGPL-2.1+': 'lgpl-2.1-plus', - 'LGPLv2.1+': 'lgpl-2.1-plus', - 'LGPL': 'lgpl', - 'Unlicense': 'unlicense', -} diff --git a/minecode/mappers/sourceforge.py b/minecode/mappers/sourceforge.py deleted file mode 100644 index 887e6748..00000000 --- a/minecode/mappers/sourceforge.py +++ /dev/null @@ -1,101 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-#
-
-import json
-
-from packagedcode import models as scan_models
-
-from minecode import map_router
-from minecode.mappers import Mapper
-
-
-@map_router.route('https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json',
-                  'https?://sourceforge.net/rest/p/[a-z0-9.-]+')
-class SourceforgeProjectJsonAPIMapper(Mapper):
-
-    def get_packages(self, uri, resource_uri):
-        """
-        Yield Packages built from a resource_uri record for a single
-        package version.
-        Yield as many Packages as there are download URLs.
-        """
-        metadata = json.loads(resource_uri.data)
-        return build_packages_from_metafile(metadata, resource_uri.package_url, uri)
-
-
-def build_packages_from_metafile(metadata, purl=None, uri=None):
-    """
-    Yield Packages built from a `metadata` content mapping.
-    metadata: JSON metadata content
-    purl: string value of the package URL of the ResourceURI object
-    """
-    short_desc = metadata.get('summary')
-    long_desc = metadata.get('short_description')
-    descriptions = [d for d in (short_desc, long_desc) if d and d.strip()]
-    description = '\n'.join(descriptions)
-    name = metadata.get('shortname')
-    # The short name is preferable for the name: it is a unique
-    # abbreviation for the project.
-    if not name:
-        name = metadata.get('name')
-    if name:
-        common_data = dict(
-            datasource_id='sourceforge_metadata',
-            type='sourceforge',
-            name=metadata.get('shortname', metadata.get('name')),
-            description=description,
-            homepage_url=metadata.get(
-                'external_homepage', metadata.get('url')),
-            license_detections=[],
-        )
-
-        devs = metadata.get('developers') or []
-        for dev in devs:
-            parties = common_data.get('parties')
-            if not parties:
-                common_data['parties'] = []
-            if dev.get('name'):
-                common_data['parties'].append(
-                    scan_models.Party(name=dev.get(
-                        'name'), role='contributor', url=dev.get('url')).to_dict()
-                )
-
-        categories = metadata.get('categories', {})
-        languages = categories.get('language', [])
-        langs = []
-        for lang in languages:
-            lshort = lang.get('shortname')
-            if lshort:
-                langs.append(lshort)
-        langs = ', '.join(langs)
-        common_data['primary_language'] = langs or None
-
-        extracted_license_statement = []
-        licenses = categories.get('license') or []
-        for l in licenses:
-            license_name = l.get('fullname')
-            # The full name takes priority over the short name: a short name
-            # such as 'gpl' does not carry details such as the exact GPL
-            # version.
-            if l.get('shortname'):
-                extracted_license_statement.append(l.get('shortname'))
-            if license_name:
-                extracted_license_statement.append(license_name)
-        if extracted_license_statement:
-            common_data['extracted_license_statement'] = extracted_license_statement
-
-        keywords = []
-        topics = categories.get('topic', [])
-        for topic in topics:
-            keywords.append(topic.get('shortname'))
-        common_data['keywords'] = keywords or None
-        package = scan_models.Package.from_package_data(
-            package_data=common_data,
-            datafile_path=uri,
-        )
-        package.set_purl(purl)
-        yield package
diff --git a/minecode/mappings/gcode_keywords.py b/minecode/mappings/gcode_keywords.py
index 57fe1648..23e84dff 100644
--- a/minecode/mappings/gcode_keywords.py
+++ b/minecode/mappings/gcode_keywords.py
@@ -13,438 +13,439 @@
 See visitors.googlecode.get_project_labels_feed_as_list() for the origin
 of those values.
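For example, a hypothetical normalization pass over raw Google Code labels could use this mapping like so (the input labels are made up):

    labels = ['Boardgame', 'AJAX', 'not-a-known-label']
    keywords = [GCODE_KEYWORDS[label] for label in labels if label in GCODE_KEYWORDS]
    # keywords == ['Board Games', 'AJAX']; unknown labels are dropped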
""" + GCODE_KEYWORDS = { - 'AJAX': 'AJAX', - 'AOP': 'AOP', - 'API': 'API', - 'AWS': 'AWS', - 'Academic': 'Academic', - 'Accessibility': 'Accessibility', - 'Accounting': 'Accounting', - 'ActiveRecord': 'ActiveRecord', - 'Agent': 'Agent', - 'Agile': 'Agile', - 'Air': 'Air', - 'Album': 'Album', - 'Algorithm': 'Algorithms', - 'Analysis': 'Analysis', - 'Analytics': 'Analytics', - 'Android': 'Android', - 'Animation': 'Animation', - 'Annotation': 'Annotation', - 'Apache': 'Apache', - 'AppEngine': 'AppEngine', - 'Applet': 'Applet', - 'Application': 'Application', - 'Apps': 'Apps', - 'Arcade': 'Arcade', - 'Archive': 'Archive', - 'Arm': 'Arm', - 'Asterisk': 'Asterisk', - 'Astronomy': 'Astronomy', - 'Atom': 'Atom', - 'Automation': 'Automation', - 'BSD': 'BSD', - 'Backup': 'Backup', - 'Batch': 'Batch', - 'Bioinformatics': 'Bioinformatics', - 'Biology': 'Biology', - 'Bittorrent': 'BitTorrent', - 'Blender': 'Blender', - 'Blogger': 'Blogger', - 'Blogging': 'Blogging', - 'Bluetooth': 'Bluetooth', - 'Board': 'Board', - 'Boardgame': 'Board Games', - 'Book': 'Book', - 'Books': 'Books', - 'Boost': 'Boost', - 'Browser': 'Browsers', - 'Build': 'Build Tool', - 'Business': 'Business', - 'CGI': 'CGI', - 'CML': 'CML', - 'CMS': 'CMS Systems', - 'CRM': 'CRM', - 'CRUD': 'CRUD', - 'CS': 'CS', - 'CSE': 'CSE', - 'CSV': 'CSV', - 'CakePHP': 'CakePHP', - 'Calculator': 'Calculator', - 'Canvas': 'Canvas', - 'Chess': 'Chess', - 'Chinese': 'Chinese', - 'Chrome': 'Chrome', - 'ChromeOS': 'ChromeOS', - 'Client': 'Client', - 'Cluster': 'Cluster', - 'Cocoa': 'Cocoa', - 'CodeGeneration': 'Code Generation', - 'Codeigniter': 'Codeigniter', - 'Color': 'Color', - 'Communication': 'Communications', - 'Community': 'Community', - 'Component': 'Component', - 'Compression': 'Compression', - 'Computer': 'Computer', - 'Concurrency': 'Concurrency', - 'Console': 'Console', - 'Contacts': 'Contacts', - 'Content': 'Content', - 'ContentManagement': 'ContentManagement', - 'Control': 'Control', - 'Controller': 'Controller', - 'Convert': 'Convert', - 'Crawler': 'Crawler', - 'CrossPlatform': 'CrossPlatform', - 'Cryptography': 'Cryptography', - 'Cuda': 'Cuda', - 'Custom': 'Custom', - 'DAO': 'DAO', - 'DHTML': 'DHTML', - 'DNS': 'DNS', - 'DSL': 'DSL', - 'Dashboard': 'Dashboard', - 'Database': 'Database', - 'Debug': 'Debugger', - 'Delphi': 'Delphi', - 'Demo': 'Demo', - 'Design': 'Design', - 'Desktop': 'Desktop', - 'DevTool': 'DevTool', - 'Developer': 'Developer', - 'Dictionary': 'Dictionary', - 'Distributed': 'Distributed', - 'Django': 'Django', - 'Doc': 'Documentation', - 'Documentation': 'Documentation', - 'Dojo': 'Dojo', - 'DotNet': 'DotNet', - 'Downloader': 'Downloader', - 'Driver': 'Driver', - 'Drupal': 'Drupal', - 'Dynamic': 'Dynamic', - 'E-commerce': 'E-commerce', - 'EJB': 'EJB', - 'ERP': 'ERP', - 'Ebook': 'Ebook', - 'Eclipse': 'Eclipse', - 'Embedded': 'Embedded', - 'Emulator': 'Emulators', - 'Engineering': 'Engineering', - 'English': 'English', - 'Enterprise': 'Enterprise', - 'Events': 'Events', - 'Evolution': 'Evolution', - 'Extension': 'Extension', - 'FLV': 'FLV', - 'FUSE': 'FUSE', - 'Facebook': 'Facebook', - 'Filesystem': 'Filesystems', - 'Finance': 'Finance', - 'Firefox': 'Firefox', - 'Firewall': 'Firewalls', - 'Flash': 'Flash', - 'Football': 'Football', - 'Forms': 'Forms', - 'Framework': 'Framework', - 'FreeBSD': 'FreeBSD', - 'Functional': 'Functional', - 'GIS': 'GIS', - 'GPU': 'GPU', - 'GTD': 'GTD', - 'GTK': 'GTK', - 'GWT': 'GWT', - 'Gadget': 'Gadget', - 'Gallery': 'Gallery', - 'Game': 'Game', - 'Gdata': 'Gdata', - 'Generator': 'Generator', - 
'Gentoo': 'Gentoo', - 'Geo': 'Geo', - 'Gnome': 'Gnome', - 'Grails': 'Grails', - 'Grid': 'Grid', - 'Guice': 'Guice', - 'HTML': 'HTML/XHTML', - 'HTTP': 'HTTP', - 'Hadoop': 'Hadoop', - 'Hardware': 'Hardware', - 'Health': 'Health', - 'Hello': 'Hello', - 'Hibernate': 'Hibernate', - 'Home': 'Home', - 'Hosting': 'Hosting', - 'I18n': 'I18N (Internationalization)', - 'IDE': 'IDE', - 'IM': 'IM', - 'IOC': 'IOC', - 'IP': 'IP', - 'IRC': 'IRC', - 'Images': 'Images', - 'Installer': 'Installer', - 'Integration': 'Integration', - 'Interactive': 'Interactive', - 'Interface': 'Interface', - 'Internet': 'Internet', - 'Itunes': 'Itunes', - 'JBoss': 'JBoss', - 'JEE': 'JEE', - 'JME': 'JME', - 'JPA': 'JPA', - 'JSF': 'JSF', - 'JSON': 'JSON', - 'JSP': 'JSP', - 'Jabber': 'Jabber', - 'Japanese': 'Japanese', - 'JavaFX': 'JavaFX', - 'Jobeet': 'Jobeet', - 'Joomla': 'Joomla', - 'KDE': 'KDE', - 'KML': 'KML', - 'Kernel': 'Kernel', - 'Keyboard': 'Keyboard', - 'LDAP': 'LDAP', - 'LaTex': 'TeX/LaTeX', - 'Lab': 'Lab', - 'Layout': 'Layout', - 'Learn': 'Learn', - 'Library': 'Library', - 'Life': 'Life', - 'Light': 'Light', - 'Linq': 'Linq', - 'Linux': 'Linux', - 'List': 'List', - 'Live': 'Live', - 'Localization': 'Localization', - 'Location': 'Location', - 'Log': 'Logging', - 'Logger': 'Logger', - 'MFC': 'MFC', - 'MIDI': 'MIDI', - 'MMO': 'MMO', - 'MMORPG': 'MMORPG', - 'Mac': 'Mac', - 'Machinelearning': 'Machine Learning', - 'Mail': 'Mail', - 'Manage': 'Manage', - 'Mapping': 'Mapping', - 'Mashup': 'Mashup', - 'Mathematics': 'Mathematics', - 'Matlab': 'Matlab', - 'Maven': 'Maven', - 'Mediawiki': 'Mediawiki', - 'Medical': 'Medical', - 'Memory': 'Memory', - 'Menu': 'Menu', - 'Message': 'Message', - 'Messaging': 'Messaging', - 'Messenger': 'Messenger', - 'Microcontroller': 'Microcontroller', - 'Middleware': 'Middleware', - 'Mod': 'Mod', - 'Modeling': 'Modeling', - 'Module': 'Module', - 'Modules': 'Modules', - 'Monitoring': 'Monitoring', - 'Mono': 'Mono', - 'Multiplayer': 'Multiplayer', - 'Multitouch': 'Multitouch', - 'MySQL': 'MySQL', - 'NHibernate': 'NHibernate', - 'Navigation': 'Navigation', - 'Netbeans': 'Netbeans', - 'Networking': 'Networking', - 'News': 'News', - 'Nintendo': 'Nintendo', - 'Notes': 'Notes', - 'OAuth': 'OAuth', - 'OOP': 'OOP', - 'OWL': 'OWL', - 'Object': 'Object', - 'Ocaml': 'Ocaml', - 'Office': 'Office', - 'Ogre': 'Ogre', - 'Online': 'Online', - 'Ontology': 'Ontology', - 'OpenGL': 'OpenGL', - 'OpenID': 'OpenID', - 'OpenSocial': 'OpenSocial', - 'PDF': 'PDF', - 'PSP': 'PSP', - 'Package': 'Package', - 'Parsing': 'Parsing', - 'Password': 'Password', - 'Pattern': 'Pattern', - 'Performance': 'Performance', - 'Persistence': 'Persistence', - 'PhpBB': 'PhpBB', - 'Picasa': 'Picasa', - 'Platform': 'Platform', - 'Player': 'Player', - 'Podcast': 'Podcast', - 'Poker': 'Poker', - 'Portable': 'Portable', - 'Portal': 'Portal', - 'PostgreSQL': 'PostgreSQL', - 'Process': 'Process', - 'Projects': 'Projects', - 'Projeto': 'Projeto', - 'Protocol': 'Protocol', - 'Prototype': 'Prototype', - 'Proxy': 'Proxy', - 'Prueba': 'Prueba', - 'Query': 'Query', - 'RCP': 'RCP', - 'RDF': 'RDF', - 'REST': 'REST', - 'RIA': 'RIA', - 'RMI': 'RMI', - 'RPC': 'RPC', - 'RSS': 'RSS', - 'RTS': 'RTS', - 'Rails': 'Rails', - 'Random': 'Random', - 'Realtime': 'Realtime', - 'Report': 'Report', - 'Research': 'Research', - 'Robotics': 'Robotics', - 'RogueLike': 'RogueLike', - 'SDF': 'SDF', - 'SDK': 'SDK', - 'SDL': 'SDL', - 'SEO': 'SEO', - 'SIP': 'SIP', - 'SMS': 'SMS', - 'SMTP': 'SMTP', - 'SQL': 'SQL', - 'SQLServer': 'SQLServer', - 'SSH': 'SSH', - 'SWF': 'SWF', - 'SWT': 
'SWT', - 'Sandbox': 'Sandbox', - 'Schedule': 'Schedule', - 'Scheduler': 'Scheduler', - 'Scheduling': 'Scheduling', - 'Scrum': 'Scrum', - 'Seam': 'Seam', - 'SearchEngine': 'SearchEngine', - 'Semantic': 'Semantic', - 'SemanticWeb': 'SemanticWeb', - 'Server': 'Server', - 'Service': 'Service', - 'Services': 'Services', - 'Sharing': 'Sharing', - 'Shooter': 'Shooter', - 'Simple': 'Simple', - 'Simulator': 'Simulator', - 'Sistema': 'Sistema', - 'SlideShow': 'SlideShow', - 'Small': 'Small', - 'SocialNetworking': 'SocialNetworking', - 'Socket': 'Socket', - 'Sockets': 'Sockets', - 'Spider': 'Spider', - 'Spring': 'Spring', - 'Sqlite': 'Sqlite', - 'Statistics': 'Statistics', - 'Storage': 'Storage', - 'Stream': 'Stream', - 'Struts': 'Struts', - 'Student': 'Student', - 'Study': 'Study', - 'Subversion': 'Subversion', - 'Sudoku': 'Sudoku', - 'Svn': 'Svn', - 'Swing': 'Swing', - 'Symfony': 'Symfony', - 'Sync': 'Sync', - 'TCL': 'TCL', - 'TCP': 'TCP', - 'Table': 'Table', - 'Taggi': 'Taggi', - 'Tasks': 'Tasks', - 'Template': 'Templates', - 'Terminal': 'Terminal', - 'Theme': 'Theme', - 'Thesis': 'Thesis', - 'Time': 'Time', - 'Timer': 'Timer', - 'Tool': 'Tool', - 'Toolkit': 'Toolkit', - 'Tracking': 'Tracking', - 'Traffic': 'Traffic', - 'Training': 'Training', - 'Translate': 'Translate', - 'Translation': 'Translation', - 'Travel': 'Travel', - 'Tree': 'Tree', - 'Tutorial': 'Tutorial', - 'Twitter': 'Twitter', - 'UDP': 'UDP', - 'UI': 'UI', - 'UML': 'UML', - 'URL': 'URL', - 'Ubuntu': 'Ubuntu', - 'Unicode': 'Unicode', - 'UnitTesting': 'Unit Test', - 'Unittest': 'Unit Test', - 'University': 'University', - 'Unix': 'Unix', - 'Utility': 'Utility', - 'Vector': 'Vector', - 'Videogame': 'Videogame', - 'Viewer': 'Viewer', - 'Virtual': 'Virtual', - 'Visual': 'Visual', - 'VisualStudio': 'VisualStudio', - 'WPF': 'WPF', - 'Wave': 'Wave', - 'Web': 'Web', - 'Webcam': 'Webcam', - 'Webkit': 'Webkit', - 'Webservice': 'Web Service', - 'Webservices': 'Web Service', - 'Website': 'Website', - 'WiFi': 'WiFi', - 'Wicket': 'Wicket', - 'Widget': 'Widget', - 'Widgets': 'Widgets', - 'Wiki': 'Wiki', - 'Wikipedia': 'Wikipedia', - 'Windows': 'Windows', - 'WoW': 'WoW', - 'Word': 'Word', - 'Work': 'Work', - 'World': 'World', - 'XHTML': 'XHTML', - 'XMPP': 'XMPP', - 'XNA': 'XNA', - 'XSL': 'XSL', - 'XUL': 'XUL', - 'XWindow': 'XWindow', - 'YUI': 'YUI', - 'YouTube': 'YouTube', - 'Zend': 'Zend', - 'ZendFramework': 'ZendFramework', - 'addon': 'Addon', - 'extjs': 'extjs', - 'ffmpeg': 'ffmpeg', - 'iPhone': 'iPhone', - 'j2ee': 'j2ee', - 'j2me': 'j2me', - 'j2se': 'j2se', - 'jQuery': 'jQuery', - 'memcached': 'memcached', - 'mp3': 'MP3', - 'p2p': 'p2p', - 'plugin': 'plugin', - 'pygame': 'pygame', - 'pyqt': 'pyqt', - 'regex': 'Regex', - 's3': 's3', - 's60': 's60', - 'twisted': 'Twisted', - 'wxwidgets': 'wxwidgets', + "AJAX": "AJAX", + "AOP": "AOP", + "API": "API", + "AWS": "AWS", + "Academic": "Academic", + "Accessibility": "Accessibility", + "Accounting": "Accounting", + "ActiveRecord": "ActiveRecord", + "Agent": "Agent", + "Agile": "Agile", + "Air": "Air", + "Album": "Album", + "Algorithm": "Algorithms", + "Analysis": "Analysis", + "Analytics": "Analytics", + "Android": "Android", + "Animation": "Animation", + "Annotation": "Annotation", + "Apache": "Apache", + "AppEngine": "AppEngine", + "Applet": "Applet", + "Application": "Application", + "Apps": "Apps", + "Arcade": "Arcade", + "Archive": "Archive", + "Arm": "Arm", + "Asterisk": "Asterisk", + "Astronomy": "Astronomy", + "Atom": "Atom", + "Automation": "Automation", + "BSD": "BSD", + "Backup": "Backup", + 
"Batch": "Batch", + "Bioinformatics": "Bioinformatics", + "Biology": "Biology", + "Bittorrent": "BitTorrent", + "Blender": "Blender", + "Blogger": "Blogger", + "Blogging": "Blogging", + "Bluetooth": "Bluetooth", + "Board": "Board", + "Boardgame": "Board Games", + "Book": "Book", + "Books": "Books", + "Boost": "Boost", + "Browser": "Browsers", + "Build": "Build Tool", + "Business": "Business", + "CGI": "CGI", + "CML": "CML", + "CMS": "CMS Systems", + "CRM": "CRM", + "CRUD": "CRUD", + "CS": "CS", + "CSE": "CSE", + "CSV": "CSV", + "CakePHP": "CakePHP", + "Calculator": "Calculator", + "Canvas": "Canvas", + "Chess": "Chess", + "Chinese": "Chinese", + "Chrome": "Chrome", + "ChromeOS": "ChromeOS", + "Client": "Client", + "Cluster": "Cluster", + "Cocoa": "Cocoa", + "CodeGeneration": "Code Generation", + "Codeigniter": "Codeigniter", + "Color": "Color", + "Communication": "Communications", + "Community": "Community", + "Component": "Component", + "Compression": "Compression", + "Computer": "Computer", + "Concurrency": "Concurrency", + "Console": "Console", + "Contacts": "Contacts", + "Content": "Content", + "ContentManagement": "ContentManagement", + "Control": "Control", + "Controller": "Controller", + "Convert": "Convert", + "Crawler": "Crawler", + "CrossPlatform": "CrossPlatform", + "Cryptography": "Cryptography", + "Cuda": "Cuda", + "Custom": "Custom", + "DAO": "DAO", + "DHTML": "DHTML", + "DNS": "DNS", + "DSL": "DSL", + "Dashboard": "Dashboard", + "Database": "Database", + "Debug": "Debugger", + "Delphi": "Delphi", + "Demo": "Demo", + "Design": "Design", + "Desktop": "Desktop", + "DevTool": "DevTool", + "Developer": "Developer", + "Dictionary": "Dictionary", + "Distributed": "Distributed", + "Django": "Django", + "Doc": "Documentation", + "Documentation": "Documentation", + "Dojo": "Dojo", + "DotNet": "DotNet", + "Downloader": "Downloader", + "Driver": "Driver", + "Drupal": "Drupal", + "Dynamic": "Dynamic", + "E-commerce": "E-commerce", + "EJB": "EJB", + "ERP": "ERP", + "Ebook": "Ebook", + "Eclipse": "Eclipse", + "Embedded": "Embedded", + "Emulator": "Emulators", + "Engineering": "Engineering", + "English": "English", + "Enterprise": "Enterprise", + "Events": "Events", + "Evolution": "Evolution", + "Extension": "Extension", + "FLV": "FLV", + "FUSE": "FUSE", + "Facebook": "Facebook", + "Filesystem": "Filesystems", + "Finance": "Finance", + "Firefox": "Firefox", + "Firewall": "Firewalls", + "Flash": "Flash", + "Football": "Football", + "Forms": "Forms", + "Framework": "Framework", + "FreeBSD": "FreeBSD", + "Functional": "Functional", + "GIS": "GIS", + "GPU": "GPU", + "GTD": "GTD", + "GTK": "GTK", + "GWT": "GWT", + "Gadget": "Gadget", + "Gallery": "Gallery", + "Game": "Game", + "Gdata": "Gdata", + "Generator": "Generator", + "Gentoo": "Gentoo", + "Geo": "Geo", + "Gnome": "Gnome", + "Grails": "Grails", + "Grid": "Grid", + "Guice": "Guice", + "HTML": "HTML/XHTML", + "HTTP": "HTTP", + "Hadoop": "Hadoop", + "Hardware": "Hardware", + "Health": "Health", + "Hello": "Hello", + "Hibernate": "Hibernate", + "Home": "Home", + "Hosting": "Hosting", + "I18n": "I18N (Internationalization)", + "IDE": "IDE", + "IM": "IM", + "IOC": "IOC", + "IP": "IP", + "IRC": "IRC", + "Images": "Images", + "Installer": "Installer", + "Integration": "Integration", + "Interactive": "Interactive", + "Interface": "Interface", + "Internet": "Internet", + "Itunes": "Itunes", + "JBoss": "JBoss", + "JEE": "JEE", + "JME": "JME", + "JPA": "JPA", + "JSF": "JSF", + "JSON": "JSON", + "JSP": "JSP", + "Jabber": "Jabber", + "Japanese": 
"Japanese", + "JavaFX": "JavaFX", + "Jobeet": "Jobeet", + "Joomla": "Joomla", + "KDE": "KDE", + "KML": "KML", + "Kernel": "Kernel", + "Keyboard": "Keyboard", + "LDAP": "LDAP", + "LaTex": "TeX/LaTeX", + "Lab": "Lab", + "Layout": "Layout", + "Learn": "Learn", + "Library": "Library", + "Life": "Life", + "Light": "Light", + "Linq": "Linq", + "Linux": "Linux", + "List": "List", + "Live": "Live", + "Localization": "Localization", + "Location": "Location", + "Log": "Logging", + "Logger": "Logger", + "MFC": "MFC", + "MIDI": "MIDI", + "MMO": "MMO", + "MMORPG": "MMORPG", + "Mac": "Mac", + "Machinelearning": "Machine Learning", + "Mail": "Mail", + "Manage": "Manage", + "Mapping": "Mapping", + "Mashup": "Mashup", + "Mathematics": "Mathematics", + "Matlab": "Matlab", + "Maven": "Maven", + "Mediawiki": "Mediawiki", + "Medical": "Medical", + "Memory": "Memory", + "Menu": "Menu", + "Message": "Message", + "Messaging": "Messaging", + "Messenger": "Messenger", + "Microcontroller": "Microcontroller", + "Middleware": "Middleware", + "Mod": "Mod", + "Modeling": "Modeling", + "Module": "Module", + "Modules": "Modules", + "Monitoring": "Monitoring", + "Mono": "Mono", + "Multiplayer": "Multiplayer", + "Multitouch": "Multitouch", + "MySQL": "MySQL", + "NHibernate": "NHibernate", + "Navigation": "Navigation", + "Netbeans": "Netbeans", + "Networking": "Networking", + "News": "News", + "Nintendo": "Nintendo", + "Notes": "Notes", + "OAuth": "OAuth", + "OOP": "OOP", + "OWL": "OWL", + "Object": "Object", + "Ocaml": "Ocaml", + "Office": "Office", + "Ogre": "Ogre", + "Online": "Online", + "Ontology": "Ontology", + "OpenGL": "OpenGL", + "OpenID": "OpenID", + "OpenSocial": "OpenSocial", + "PDF": "PDF", + "PSP": "PSP", + "Package": "Package", + "Parsing": "Parsing", + "Password": "Password", + "Pattern": "Pattern", + "Performance": "Performance", + "Persistence": "Persistence", + "PhpBB": "PhpBB", + "Picasa": "Picasa", + "Platform": "Platform", + "Player": "Player", + "Podcast": "Podcast", + "Poker": "Poker", + "Portable": "Portable", + "Portal": "Portal", + "PostgreSQL": "PostgreSQL", + "Process": "Process", + "Projects": "Projects", + "Projeto": "Projeto", + "Protocol": "Protocol", + "Prototype": "Prototype", + "Proxy": "Proxy", + "Prueba": "Prueba", + "Query": "Query", + "RCP": "RCP", + "RDF": "RDF", + "REST": "REST", + "RIA": "RIA", + "RMI": "RMI", + "RPC": "RPC", + "RSS": "RSS", + "RTS": "RTS", + "Rails": "Rails", + "Random": "Random", + "Realtime": "Realtime", + "Report": "Report", + "Research": "Research", + "Robotics": "Robotics", + "RogueLike": "RogueLike", + "SDF": "SDF", + "SDK": "SDK", + "SDL": "SDL", + "SEO": "SEO", + "SIP": "SIP", + "SMS": "SMS", + "SMTP": "SMTP", + "SQL": "SQL", + "SQLServer": "SQLServer", + "SSH": "SSH", + "SWF": "SWF", + "SWT": "SWT", + "Sandbox": "Sandbox", + "Schedule": "Schedule", + "Scheduler": "Scheduler", + "Scheduling": "Scheduling", + "Scrum": "Scrum", + "Seam": "Seam", + "SearchEngine": "SearchEngine", + "Semantic": "Semantic", + "SemanticWeb": "SemanticWeb", + "Server": "Server", + "Service": "Service", + "Services": "Services", + "Sharing": "Sharing", + "Shooter": "Shooter", + "Simple": "Simple", + "Simulator": "Simulator", + "Sistema": "Sistema", + "SlideShow": "SlideShow", + "Small": "Small", + "SocialNetworking": "SocialNetworking", + "Socket": "Socket", + "Sockets": "Sockets", + "Spider": "Spider", + "Spring": "Spring", + "Sqlite": "Sqlite", + "Statistics": "Statistics", + "Storage": "Storage", + "Stream": "Stream", + "Struts": "Struts", + "Student": "Student", + "Study": 
"Study", + "Subversion": "Subversion", + "Sudoku": "Sudoku", + "Svn": "Svn", + "Swing": "Swing", + "Symfony": "Symfony", + "Sync": "Sync", + "TCL": "TCL", + "TCP": "TCP", + "Table": "Table", + "Taggi": "Taggi", + "Tasks": "Tasks", + "Template": "Templates", + "Terminal": "Terminal", + "Theme": "Theme", + "Thesis": "Thesis", + "Time": "Time", + "Timer": "Timer", + "Tool": "Tool", + "Toolkit": "Toolkit", + "Tracking": "Tracking", + "Traffic": "Traffic", + "Training": "Training", + "Translate": "Translate", + "Translation": "Translation", + "Travel": "Travel", + "Tree": "Tree", + "Tutorial": "Tutorial", + "Twitter": "Twitter", + "UDP": "UDP", + "UI": "UI", + "UML": "UML", + "URL": "URL", + "Ubuntu": "Ubuntu", + "Unicode": "Unicode", + "UnitTesting": "Unit Test", + "Unittest": "Unit Test", + "University": "University", + "Unix": "Unix", + "Utility": "Utility", + "Vector": "Vector", + "Videogame": "Videogame", + "Viewer": "Viewer", + "Virtual": "Virtual", + "Visual": "Visual", + "VisualStudio": "VisualStudio", + "WPF": "WPF", + "Wave": "Wave", + "Web": "Web", + "Webcam": "Webcam", + "Webkit": "Webkit", + "Webservice": "Web Service", + "Webservices": "Web Service", + "Website": "Website", + "WiFi": "WiFi", + "Wicket": "Wicket", + "Widget": "Widget", + "Widgets": "Widgets", + "Wiki": "Wiki", + "Wikipedia": "Wikipedia", + "Windows": "Windows", + "WoW": "WoW", + "Word": "Word", + "Work": "Work", + "World": "World", + "XHTML": "XHTML", + "XMPP": "XMPP", + "XNA": "XNA", + "XSL": "XSL", + "XUL": "XUL", + "XWindow": "XWindow", + "YUI": "YUI", + "YouTube": "YouTube", + "Zend": "Zend", + "ZendFramework": "ZendFramework", + "addon": "Addon", + "extjs": "extjs", + "ffmpeg": "ffmpeg", + "iPhone": "iPhone", + "j2ee": "j2ee", + "j2me": "j2me", + "j2se": "j2se", + "jQuery": "jQuery", + "memcached": "memcached", + "mp3": "MP3", + "p2p": "p2p", + "plugin": "plugin", + "pygame": "pygame", + "pyqt": "pyqt", + "regex": "Regex", + "s3": "s3", + "s60": "s60", + "twisted": "Twisted", + "wxwidgets": "wxwidgets", } diff --git a/minecode/mappings/gcode_licenses.py b/minecode/mappings/gcode_licenses.py index ca5f6483..6ae85c06 100644 --- a/minecode/mappings/gcode_licenses.py +++ b/minecode/mappings/gcode_licenses.py @@ -18,8 +18,7 @@ if not License.objects.filter(dataspace__name='nexB', name=name).exists()] """ - -''' +""" Code licenses @@ -31,32 +30,30 @@ -''' +""" -''' +""" Possible separate content license -''' +""" GCODE_LICENSES = { - 'Apache License 2.0': 'Apache License 2.0', - 'GNU GPL v2': 'GNU General Public License 2.0', + "Apache License 2.0": "Apache License 2.0", + "GNU GPL v2": "GNU General Public License 2.0", # FIXME: or GPL 1.0? - 'Artistic License/GPL': 'Artistic License 2.0', - 'New BSD License': 'BSD-Modified', - 'Eclipse Public License 1.0': 'Eclipse Public License 1.0', - 'GNU GPL v3': 'GNU General Public License 3.0', + "Artistic License/GPL": "Artistic License 2.0", + "New BSD License": "BSD-Modified", + "Eclipse Public License 1.0": "Eclipse Public License 1.0", + "GNU GPL v3": "GNU General Public License 3.0", # FIXME: v3.0 only?? 
- 'GNU Lesser GPL': 'GNU Lesser General Public License 3.0', - 'MIT License': 'MIT License', - 'Mozilla Public License 1.1': 'Mozilla Public License 1.1', - - 'Other Open Source': None, - 'See source code': None, - - 'Creative Commons 3.0 BY': 'Creative Commons Attribution License 3.0', - 'Creative Commons 3.0 BY-SA': 'Creative Commons Attribution Share Alike License 3.0', + "GNU Lesser GPL": "GNU Lesser General Public License 3.0", + "MIT License": "MIT License", + "Mozilla Public License 1.1": "Mozilla Public License 1.1", + "Other Open Source": None, + "See source code": None, + "Creative Commons 3.0 BY": "Creative Commons Attribution License 3.0", + "Creative Commons 3.0 BY-SA": "Creative Commons Attribution Share Alike License 3.0", } diff --git a/minecode/mappings/gcode_programming_languages.py b/minecode/mappings/gcode_programming_languages.py index e4ef8608..b50a1c92 100644 --- a/minecode/mappings/gcode_programming_languages.py +++ b/minecode/mappings/gcode_programming_languages.py @@ -8,47 +8,45 @@ # -""" -Structure: {'googlecode': 'dje'} -""" +"""Structure: {'googlecode': 'dje'}""" GCODE_PROGRAMMING_LANGUAGES = { - 'ASP': 'ASP', - 'ASP.net': 'ASP', - 'ActionScript': 'ActionScript', - 'Ada': 'Ada', - 'Arduino': 'Arduino', - 'Assembly': 'Assembly', - 'Bash': 'Bash', - 'BASIC': 'Visual Basic', - 'C': 'C', - 'CPlusPlus': 'C++', - 'CSS': 'CSS', - 'CSharp': 'C#', - 'DLanguage': 'D', - 'Erlang': 'Erlang', - 'Flex': 'Flex', - 'Forth': 'Forth', - 'Fortran': 'Fortran', - 'Go': 'Go', - 'Groovy': 'Groovy', - 'Java': 'Java', - 'JavaScript': 'JavaScript', - 'Lisp': 'Lisp', - 'Lua': 'Lua', - 'Mirah': 'Mirah', - 'ObjectivE-C': 'Objective-C', - 'PHP': 'PHP', - 'Pascal': 'Pascal', - 'Perl': 'Perl', - 'Processing': 'Processing', - 'Python': 'Python', - 'Ruby': 'Ruby', - 'SLanguage': 'S', - 'Scala': 'Scala', - 'Scheme': 'Scheme', - 'Shell': 'Shell', - 'Smalltalk': 'Smalltalk', - 'VisualBASIC': 'Visual Basic', - 'XSLT': 'XSLT', + "ASP": "ASP", + "ASP.net": "ASP", + "ActionScript": "ActionScript", + "Ada": "Ada", + "Arduino": "Arduino", + "Assembly": "Assembly", + "Bash": "Bash", + "BASIC": "Visual Basic", + "C": "C", + "CPlusPlus": "C++", + "CSS": "CSS", + "CSharp": "C#", + "DLanguage": "D", + "Erlang": "Erlang", + "Flex": "Flex", + "Forth": "Forth", + "Fortran": "Fortran", + "Go": "Go", + "Groovy": "Groovy", + "Java": "Java", + "JavaScript": "JavaScript", + "Lisp": "Lisp", + "Lua": "Lua", + "Mirah": "Mirah", + "ObjectivE-C": "Objective-C", + "PHP": "PHP", + "Pascal": "Pascal", + "Perl": "Perl", + "Processing": "Processing", + "Python": "Python", + "Ruby": "Ruby", + "SLanguage": "S", + "Scala": "Scala", + "Scheme": "Scheme", + "Shell": "Shell", + "Smalltalk": "Smalltalk", + "VisualBASIC": "Visual Basic", + "XSLT": "XSLT", } diff --git a/minecode/mappings/pypi_trove.py b/minecode/mappings/pypi_trove.py index 4c860c2d..4ad05e7c 100644 --- a/minecode/mappings/pypi_trove.py +++ b/minecode/mappings/pypi_trove.py @@ -12,75 +12,74 @@ See https://pypi.python.org/pypi?%3Aaction=list_classifiers """ - licenses = { - 'License :: Aladdin Free Public License (AFPL)': 'afpl-9.0', - 'License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication': 'cc0-1.0', - 'License :: DFSG approved': 'unknown', - 'License :: Eiffel Forum License (EFL)': 'efl-2.0', - 'License :: Free For Educational Use': 'proprietary', - 'License :: Free For Home Use': 'proprietary', - 'License :: Free for non-commercial use': 'proprietary', - 'License :: Freely Distributable': 'unknown', - 'License :: Free To Use But Restricted': 
'proprietary', - 'License :: Freeware': 'proprietary', - 'License :: Netscape Public License (NPL)': 'npl-1.1', - 'License :: Nokia Open Source License (NOKOS)': 'nokos-1.0a', + "License :: Aladdin Free Public License (AFPL)": "afpl-9.0", + "License :: CC0 1.0 Universal (CC0 1.0) Public Domain Dedication": "cc0-1.0", + "License :: DFSG approved": "unknown", + "License :: Eiffel Forum License (EFL)": "efl-2.0", + "License :: Free For Educational Use": "proprietary", + "License :: Free For Home Use": "proprietary", + "License :: Free for non-commercial use": "proprietary", + "License :: Freely Distributable": "unknown", + "License :: Free To Use But Restricted": "proprietary", + "License :: Freeware": "proprietary", + "License :: Netscape Public License (NPL)": "npl-1.1", + "License :: Nokia Open Source License (NOKOS)": "nokos-1.0a", # 'License :: OSI Approved': '', - 'License :: OSI Approved :: Academic Free License (AFL)': 'afl-3.0', - 'License :: OSI Approved :: Apache Software License': 'apache-2.0', - 'License :: OSI Approved :: Apple Public Source License': 'apsl-2.0', - 'License :: OSI Approved :: Artistic License': 'artistic-2.0', - 'License :: OSI Approved :: Attribution Assurance License': 'attribution', - 'License :: OSI Approved :: BSD License': 'bsd-new', - 'License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)': 'cecill-2.1', - 'License :: OSI Approved :: Common Public License': 'cpl-1.0', - 'License :: OSI Approved :: Eiffel Forum License': 'efl-2.0', - 'License :: OSI Approved :: European Union Public Licence 1.0 (EUPL 1.0)': 'eupl-1.0', - 'License :: OSI Approved :: European Union Public Licence 1.1 (EUPL 1.1)': 'eupl-1.1', - 'License :: OSI Approved :: GNU Affero General Public License v3': 'agpl-3.0', + "License :: OSI Approved :: Academic Free License (AFL)": "afl-3.0", + "License :: OSI Approved :: Apache Software License": "apache-2.0", + "License :: OSI Approved :: Apple Public Source License": "apsl-2.0", + "License :: OSI Approved :: Artistic License": "artistic-2.0", + "License :: OSI Approved :: Attribution Assurance License": "attribution", + "License :: OSI Approved :: BSD License": "bsd-new", + "License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)": "cecill-2.1", + "License :: OSI Approved :: Common Public License": "cpl-1.0", + "License :: OSI Approved :: Eiffel Forum License": "efl-2.0", + "License :: OSI Approved :: European Union Public Licence 1.0 (EUPL 1.0)": "eupl-1.0", + "License :: OSI Approved :: European Union Public Licence 1.1 (EUPL 1.1)": "eupl-1.1", + "License :: OSI Approved :: GNU Affero General Public License v3": "agpl-3.0", # FIXME: we do not have agpl-3.0+ - 'License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)': 'agpl-3.0', - 'License :: OSI Approved :: GNU Free Documentation License (FDL)': 'gfdl-1.3', - 'License :: OSI Approved :: GNU General Public License (GPL)': 'gpl', - 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)': 'gpl-2.0', - 'License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)': 'gpl-2.0-plus', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)': 'gpl-3.0', - 'License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)': 'gpl-3.0-plus', - 'License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)': 'lgpl-2.0', - 'License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)': 'lgpl-2.0-plus', - 'License :: OSI 
Approved :: GNU Lesser General Public License v3 (LGPLv3)': 'lgpl-3.0', - 'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)': 'lgpl-3.0-plus', - 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)': 'lgpl', - 'License :: OSI Approved :: IBM Public License': 'ibmpl-1.0', - 'License :: OSI Approved :: Intel Open Source License': 'intel-bsd-export-control', - 'License :: OSI Approved :: ISC License (ISCL)': 'isc', - 'License :: OSI Approved :: Jabber Open Source License': 'josl-1.0', - 'License :: OSI Approved :: MIT License': 'mit', + "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)": "agpl-3.0", + "License :: OSI Approved :: GNU Free Documentation License (FDL)": "gfdl-1.3", + "License :: OSI Approved :: GNU General Public License (GPL)": "gpl", + "License :: OSI Approved :: GNU General Public License v2 (GPLv2)": "gpl-2.0", + "License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)": "gpl-2.0-plus", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)": "gpl-3.0", + "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)": "gpl-3.0-plus", + "License :: OSI Approved :: GNU Lesser General Public License v2 (LGPLv2)": "lgpl-2.0", + "License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)": "lgpl-2.0-plus", + "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)": "lgpl-3.0", + "License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)": "lgpl-3.0-plus", + "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)": "lgpl", + "License :: OSI Approved :: IBM Public License": "ibmpl-1.0", + "License :: OSI Approved :: Intel Open Source License": "intel-bsd-export-control", + "License :: OSI Approved :: ISC License (ISCL)": "isc", + "License :: OSI Approved :: Jabber Open Source License": "josl-1.0", + "License :: OSI Approved :: MIT License": "mit", # FIXME: old and not in scancode: https://opensource.org/licenses/mitrepl # 'License :: OSI Approved :: MITRE Collaborative Virtual Workspace License (CVW)': '', - 'License :: OSI Approved :: Motosoto License': 'motosoto-0.9.1', - 'License :: OSI Approved :: Mozilla Public License 1.0 (MPL)': 'mpl-1.0', - 'License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)': 'mpl-1.1', - 'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)': 'mpl-2.0', - 'License :: OSI Approved :: Nethack General Public License': 'ngpl', - 'License :: OSI Approved :: Nokia Open Source License': 'nokos-1.0a', - 'License :: OSI Approved :: Open Group Test Suite License': 'opengroup', - 'License :: OSI Approved :: Python License (CNRI Python License)': 'cnri-python-1.6.1', - 'License :: OSI Approved :: Python Software Foundation License': 'python', - 'License :: OSI Approved :: Qt Public License (QPL)': 'qpl-1.0', - 'License :: OSI Approved :: Ricoh Source Code Public License': 'ricoh-1.0', - 'License :: OSI Approved :: Sleepycat License': 'sleepycat', - 'License :: OSI Approved :: Sun Industry Standards Source License (SISSL)': 'sun-sissl-1.2', - 'License :: OSI Approved :: Sun Public License': 'spl-1.0', - 'License :: OSI Approved :: University of Illinois/NCSA Open Source License': 'uoi-ncsa', - 'License :: OSI Approved :: Vovida Software License 1.0': 'vsl-1.0', - 'License :: OSI Approved :: W3C License': 'w3c', - 'License :: OSI Approved :: X.Net License': 'xnet', - 'License :: OSI Approved :: zlib/libpng License': 
'zlib', - 'License :: OSI Approved :: Zope Public License': 'zpl-2.1', - 'License :: Other/Proprietary License': 'proprietary', - 'License :: Public Domain': 'public-domain', + "License :: OSI Approved :: Motosoto License": "motosoto-0.9.1", + "License :: OSI Approved :: Mozilla Public License 1.0 (MPL)": "mpl-1.0", + "License :: OSI Approved :: Mozilla Public License 1.1 (MPL 1.1)": "mpl-1.1", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)": "mpl-2.0", + "License :: OSI Approved :: Nethack General Public License": "ngpl", + "License :: OSI Approved :: Nokia Open Source License": "nokos-1.0a", + "License :: OSI Approved :: Open Group Test Suite License": "opengroup", + "License :: OSI Approved :: Python License (CNRI Python License)": "cnri-python-1.6.1", + "License :: OSI Approved :: Python Software Foundation License": "python", + "License :: OSI Approved :: Qt Public License (QPL)": "qpl-1.0", + "License :: OSI Approved :: Ricoh Source Code Public License": "ricoh-1.0", + "License :: OSI Approved :: Sleepycat License": "sleepycat", + "License :: OSI Approved :: Sun Industry Standards Source License (SISSL)": "sun-sissl-1.2", + "License :: OSI Approved :: Sun Public License": "spl-1.0", + "License :: OSI Approved :: University of Illinois/NCSA Open Source License": "uoi-ncsa", + "License :: OSI Approved :: Vovida Software License 1.0": "vsl-1.0", + "License :: OSI Approved :: W3C License": "w3c", + "License :: OSI Approved :: X.Net License": "xnet", + "License :: OSI Approved :: zlib/libpng License": "zlib", + "License :: OSI Approved :: Zope Public License": "zpl-2.1", + "License :: Other/Proprietary License": "proprietary", + "License :: Public Domain": "public-domain", # not in scancode # 'License :: Repoze Public License': '', } diff --git a/minecode/mappings/sfnet_licenses.py b/minecode/mappings/sfnet_licenses.py index dd8810dd..77267caf 100644 --- a/minecode/mappings/sfnet_licenses.py +++ b/minecode/mappings/sfnet_licenses.py @@ -17,86 +17,85 @@ if not License.objects.filter(dataspace__name='nexB', name=name).exists()] """ - SFNET_LICENSES = { - 'Academic Free License (AFL)': 'Academic Free License 3.0', - 'Adaptive Public License': 'Adaptive Public License', - 'Affero GNU Public License ': 'GNU Affero General Public License 3.0', - 'Apache License V2.0': 'Apache License 2.0', - 'Apache Software License': 'Apache License 2.0', - 'Apple Public Source License': 'Apple Public Source License 2.0', - 'Artistic License': 'Artistic License 2.0', - 'Artistic License 2.0': 'Artistic License 2.0', - 'Attribution Assurance License': 'Attribution Assurance License', - 'BSD License': 'BSD-Modified', - 'Boost Software License (BSL1.0)': 'Boost Software License 1.0', - 'Common Development and Distribution License': 'Common Development and Distribution License 1.1', - 'Common Public Attribution License 1.0 (CPAL)': 'Common Public Attribution License 1.0', - 'Common Public License 1.0': 'Common Public License 1.0', - 'Computer Associates Trusted Open Source License 1.1': 'Computer Associates Trusted Open Source License 1.1', - 'Creative Commons Attribution License': 'Creative Commons Attribution License 3.0', - 'Creative Commons Attribution Non-Commercial License V2.0': 'Creative Commons Attribution Non-Commercial 2.0', - 'Creative Commons Attribution ShareAlike License V2.0': 'Creative Commons Attribution Share Alike License 2.0', - 'Creative Commons Attribution ShareAlike License V3.0': 'Creative Commons Attribution Share Alike License 3.0', - 'CUA Office Public License 
Version 1.0': 'CUA Office Public License 1.0', - 'Eclipse Public License': 'Eclipse Public License 1.0', - 'Educational Community License, Version 2.0': 'Educational Community License 2.0', - 'Eiffel Forum License V2.0': 'Eiffel Forum License 2.0', - 'Eiffel Forum License': 'Eiffel Forum License 2.0', - 'Entessa Public License': 'Entessa Public License v1.0', - 'EU DataGrid Software License': 'EU DataGrid Software License', - 'European Union Public License': 'European Union Public Licence 1.1', - 'Fair License': 'Fair License', - 'GNU General Public License version 2.0 (GPLv2)': 'GNU General Public License 2.0', - 'GNU General Public License version 3.0 (GPLv3)': 'GNU General Public License 3.0', - 'GNU General Public License with Classpath exception (Classpath::License)': 'GNU General Public License 2.0 with Classpath exception', - 'GNU Library or Lesser General Public License version 2.0 (LGPLv2)': 'GNU Library General Public License 2.0', - 'GNU Library or Lesser General Public License version 3.0 (LGPLv3)': 'GNU Lesser General Public License 3.0', - 'Historical Permission Notice and Disclaimer': 'Historical Permission Notice and Disclaimer', - 'IBM Public License': 'IBM Public License', - 'ISC License': 'ISC License (ISCL)', - 'Intel Open Source License': 'Intel Open Source License 1989', - 'Jabber Open Source License': 'Jabber Open Source License 1.0', - 'LaTeX Project Public License': 'LaTeX Project Public License v1.3a', - 'Lucent Public License Version 1.02': 'Lucent Public License 1.02', - 'MIT License': 'MIT License', - 'Microsoft Public License': 'Microsoft Public License', - 'Microsoft Reciprocal License': 'Microsoft Reciprocal License', - 'Mozilla Public License 1.0 (MPL)': 'Mozilla Public License 1.0', - 'Mozilla Public License 1.1 (MPL 1.1)': 'Mozilla Public License 1.1', - 'Mozilla Public License 2.0 (MPL 2.0)': 'Mozilla Public License 2.0', - 'NASA Open Source Agreement': 'NASA Open Source License v1.3', - 'Nethack General Public License': 'Nethack General Public License', - 'Nokia Open Source License': 'Nokia Open Source License 1.0a', - 'Non-Profit Open Software License 3.0 (Non-Profit OSL 3.0)': 'Non-Profit Open Software License 3.0', - 'NTP License': 'NTP License', - 'OCLC Research Public License 2.0': 'OCLC Research Public License 2.0', - 'OSI-Approved Open Source': None, - 'Open Font License 1.1 (OFL 1.1)': 'Open Font License 1.1', - 'Open Group Test Suite License': 'Open Group Test Suite License', - 'Open Software License 3.0 (OSL3.0)': 'Open Software License 3.0', - 'Other License': None, - 'PHP License': 'PHP License 3.01', - 'Public Domain': 'Public Domain', - 'Python License (CNRI Python License)': 'CNRI Open Source License Agreement for Python 1.6.1', - 'Python Software Foundation License': 'Python Software Foundation License v2', - 'Qt Public License (QPL)': 'Q Public License Version 1.0', - 'Reciprocal Public License 1.5 (RPL1.5)': 'Reciprocal Public License 1.5', - 'RealNetworks Public Source License V1.0': 'RealNetworks Public Source License v1.0', - 'Reciprocal Public License': 'Reciprocal Public License 1.5', - 'Ricoh Source Code Public License': 'Ricoh Source Code Public License v1.0', - 'Simple Public License 2.0': 'Simple Public License Version 2.0', - 'Sleepycat License': 'Sleepycat License (Berkeley Database License)', - 'Sun Industry Standards Source License (SISSL)': 'Sun Industry Standards Source License 1.2', - 'Sun Public License': 'Sun Public License 1.0', - 'Sybase Open Watcom Public License': 'Sybase Open Watcom Public License v1.0', - 
'University of Illinois/NCSA Open Source License': 'University of Illinois/NCSA Open Source License', - 'Vovida Software License 1.0': 'Vovida Software License v. 1.0', - 'W3C License': 'W3C Software Notice and License', - 'Zope Public License': 'Zope Public License 2.1', - 'wxWindows Library Licence': 'wxWindows Library Licence 3.1', - 'X.Net License': 'X.Net Inc. License', - 'zlib/libpng License': 'Libpng License', + "Academic Free License (AFL)": "Academic Free License 3.0", + "Adaptive Public License": "Adaptive Public License", + "Affero GNU Public License ": "GNU Affero General Public License 3.0", + "Apache License V2.0": "Apache License 2.0", + "Apache Software License": "Apache License 2.0", + "Apple Public Source License": "Apple Public Source License 2.0", + "Artistic License": "Artistic License 2.0", + "Artistic License 2.0": "Artistic License 2.0", + "Attribution Assurance License": "Attribution Assurance License", + "BSD License": "BSD-Modified", + "Boost Software License (BSL1.0)": "Boost Software License 1.0", + "Common Development and Distribution License": "Common Development and Distribution License 1.1", + "Common Public Attribution License 1.0 (CPAL)": "Common Public Attribution License 1.0", + "Common Public License 1.0": "Common Public License 1.0", + "Computer Associates Trusted Open Source License 1.1": "Computer Associates Trusted Open Source License 1.1", + "Creative Commons Attribution License": "Creative Commons Attribution License 3.0", + "Creative Commons Attribution Non-Commercial License V2.0": "Creative Commons Attribution Non-Commercial 2.0", + "Creative Commons Attribution ShareAlike License V2.0": "Creative Commons Attribution Share Alike License 2.0", + "Creative Commons Attribution ShareAlike License V3.0": "Creative Commons Attribution Share Alike License 3.0", + "CUA Office Public License Version 1.0": "CUA Office Public License 1.0", + "Eclipse Public License": "Eclipse Public License 1.0", + "Educational Community License, Version 2.0": "Educational Community License 2.0", + "Eiffel Forum License V2.0": "Eiffel Forum License 2.0", + "Eiffel Forum License": "Eiffel Forum License 2.0", + "Entessa Public License": "Entessa Public License v1.0", + "EU DataGrid Software License": "EU DataGrid Software License", + "European Union Public License": "European Union Public Licence 1.1", + "Fair License": "Fair License", + "GNU General Public License version 2.0 (GPLv2)": "GNU General Public License 2.0", + "GNU General Public License version 3.0 (GPLv3)": "GNU General Public License 3.0", + "GNU General Public License with Classpath exception (Classpath::License)": "GNU General Public License 2.0 with Classpath exception", + "GNU Library or Lesser General Public License version 2.0 (LGPLv2)": "GNU Library General Public License 2.0", + "GNU Library or Lesser General Public License version 3.0 (LGPLv3)": "GNU Lesser General Public License 3.0", + "Historical Permission Notice and Disclaimer": "Historical Permission Notice and Disclaimer", + "IBM Public License": "IBM Public License", + "ISC License": "ISC License (ISCL)", + "Intel Open Source License": "Intel Open Source License 1989", + "Jabber Open Source License": "Jabber Open Source License 1.0", + "LaTeX Project Public License": "LaTeX Project Public License v1.3a", + "Lucent Public License Version 1.02": "Lucent Public License 1.02", + "MIT License": "MIT License", + "Microsoft Public License": "Microsoft Public License", + "Microsoft Reciprocal License": "Microsoft Reciprocal License", + "Mozilla 
Public License 1.0 (MPL)": "Mozilla Public License 1.0", + "Mozilla Public License 1.1 (MPL 1.1)": "Mozilla Public License 1.1", + "Mozilla Public License 2.0 (MPL 2.0)": "Mozilla Public License 2.0", + "NASA Open Source Agreement": "NASA Open Source License v1.3", + "Nethack General Public License": "Nethack General Public License", + "Nokia Open Source License": "Nokia Open Source License 1.0a", + "Non-Profit Open Software License 3.0 (Non-Profit OSL 3.0)": "Non-Profit Open Software License 3.0", + "NTP License": "NTP License", + "OCLC Research Public License 2.0": "OCLC Research Public License 2.0", + "OSI-Approved Open Source": None, + "Open Font License 1.1 (OFL 1.1)": "Open Font License 1.1", + "Open Group Test Suite License": "Open Group Test Suite License", + "Open Software License 3.0 (OSL3.0)": "Open Software License 3.0", + "Other License": None, + "PHP License": "PHP License 3.01", + "Public Domain": "Public Domain", + "Python License (CNRI Python License)": "CNRI Open Source License Agreement for Python 1.6.1", + "Python Software Foundation License": "Python Software Foundation License v2", + "Qt Public License (QPL)": "Q Public License Version 1.0", + "Reciprocal Public License 1.5 (RPL1.5)": "Reciprocal Public License 1.5", + "RealNetworks Public Source License V1.0": "RealNetworks Public Source License v1.0", + "Reciprocal Public License": "Reciprocal Public License 1.5", + "Ricoh Source Code Public License": "Ricoh Source Code Public License v1.0", + "Simple Public License 2.0": "Simple Public License Version 2.0", + "Sleepycat License": "Sleepycat License (Berkeley Database License)", + "Sun Industry Standards Source License (SISSL)": "Sun Industry Standards Source License 1.2", + "Sun Public License": "Sun Public License 1.0", + "Sybase Open Watcom Public License": "Sybase Open Watcom Public License v1.0", + "University of Illinois/NCSA Open Source License": "University of Illinois/NCSA Open Source License", + "Vovida Software License 1.0": "Vovida Software License v. 1.0", + "W3C License": "W3C Software Notice and License", + "Zope Public License": "Zope Public License 2.1", + "wxWindows Library Licence": "wxWindows Library Licence 3.1", + "X.Net License": "X.Net Inc. 
License", + "zlib/libpng License": "Libpng License", } SFNET_NAMES = SFNET_LICENSES.keys() diff --git a/minecode/mappings/sfnet_programming_languages.py b/minecode/mappings/sfnet_programming_languages.py index 3a646004..ed036b02 100644 --- a/minecode/mappings/sfnet_programming_languages.py +++ b/minecode/mappings/sfnet_programming_languages.py @@ -8,102 +8,100 @@ # -""" -Structure: {'sf.net': 'dje'} -""" +"""Structure: {'sf.net': 'dje'}""" SFNET_PROGRAMMING_LANGUAGES = { - 'ALGOL 68': 'Algol', - 'APL': 'APL', - 'ASP': 'ASP', - 'ASP.NET': 'ASP', - 'AWK': 'Awk', - 'ActionScript': 'ActionScript', - 'Ada': 'Ada', - 'AppleScript': 'AppleScript', - 'AspectJ': 'AspectJ', - 'Assembly': 'Assembly', - 'AutoIt': 'AutoIt', - 'BASIC': 'Visual Basic', - 'BlitzMax': 'BlitzMax', - 'Boo': 'Boo', - 'C': 'C', - 'C#': 'C#', - 'C++': 'C++', - 'COBOL': 'COBOL', - 'Clarion': 'Clarion', - 'Cold Fusion': 'ColdFusion', - 'Common Lisp': 'Common Lisp', - 'Curl': 'Curl', - 'D': 'D', - 'Delphi/Kylix': 'Delphi/Object Pascal', - 'Dylan': 'Dylan', - 'Eiffel': 'Eiffel', - 'Emacs-Lisp': 'Emacs Lisp', - 'Erlang': 'Erlang', - 'Euler': 'Euler', - 'Euphoria': 'Euphoria', - 'Flex': 'Flex', - 'Forth': 'Forth', - 'Fortran': 'Fortran', - 'Free Pascal': 'Pascal', - 'GLSL (OpenGL Shading Language)': 'GLSL (OpenGL Shading Language)', - 'Groovy': 'Groovy', - 'Haskell': 'Haskell', - 'IDL': 'IDL', - 'JSP': 'Java', - 'Java': 'Java', - 'JavaScript': 'JavaScript', - 'Kaya': 'Kaya', - 'LPC': 'LPC', - 'LabVIEW': 'LabVIEW', - 'Lazarus': 'Pascal', - 'Lisp': 'Lisp', - 'Logo': 'Logo', - 'LotusScript': 'LotusScript', - 'Lua': 'Lua', - 'MATLAB': 'MATLAB', - 'MUMPS': 'MUMPS', - 'Mathematica': 'Mathematica', - 'Modula': 'Modula', - 'OCaml (Objective Caml)': 'OCaml', - 'Oberon': 'Oberon', - 'Object Pascal': 'Delphi/Object Pascal', - 'Objective C': 'Objective-C', - 'Objective-C 2.0': 'Objective-C', - 'Oz': 'Oz', - 'PHP': 'PHP', - 'PL/SQL': 'PL/SQL', - 'PROGRESS': 'Progress 4GL', - 'Pascal': 'Pascal', - 'Perl': 'Perl', - 'Pike': 'Pike', - 'Prolog': 'Prolog', - 'Python': 'Python', - 'REALbasic': 'REALBasic', - 'REBOL': 'REBOL', - 'Rexx': 'REXX', - 'Ruby': 'Ruby', - 'S/R': 'SR', - 'Scala': 'Scala', - 'Scheme': 'Scheme', - 'Scilab': 'Scilab', - 'Scriptol': 'Scriptol', - 'Simulink': 'Simulink', - 'Smalltalk': 'Smalltalk', - 'Standard ML': 'Standard ML', - 'Tcl': 'Tcl', - 'Transcript/Revolution': 'Revolution', - 'Unix Shell': 'Shell', - 'VBScript': 'VBScript', - 'VHDL/Verilog': 'Verilog', - 'Visual Basic': '(Visual) Basic', - 'Visual Basic .NET': 'Visual Basic .NET', - 'Visual Basic for Applications (VBA)': '(Visual) Basic', - 'Visual FoxPro': '(Visual) FoxPro', - 'XBase/Clipper': 'Clipper', - 'XBasic': 'XBasic', - 'XSL (XSLT/XPath/XSL-FO)': 'XSLT', - 'Yacc': 'yacc', - 'haXe': 'haXe', + "ALGOL 68": "Algol", + "APL": "APL", + "ASP": "ASP", + "ASP.NET": "ASP", + "AWK": "Awk", + "ActionScript": "ActionScript", + "Ada": "Ada", + "AppleScript": "AppleScript", + "AspectJ": "AspectJ", + "Assembly": "Assembly", + "AutoIt": "AutoIt", + "BASIC": "Visual Basic", + "BlitzMax": "BlitzMax", + "Boo": "Boo", + "C": "C", + "C#": "C#", + "C++": "C++", + "COBOL": "COBOL", + "Clarion": "Clarion", + "Cold Fusion": "ColdFusion", + "Common Lisp": "Common Lisp", + "Curl": "Curl", + "D": "D", + "Delphi/Kylix": "Delphi/Object Pascal", + "Dylan": "Dylan", + "Eiffel": "Eiffel", + "Emacs-Lisp": "Emacs Lisp", + "Erlang": "Erlang", + "Euler": "Euler", + "Euphoria": "Euphoria", + "Flex": "Flex", + "Forth": "Forth", + "Fortran": "Fortran", + "Free Pascal": "Pascal", + "GLSL (OpenGL Shading 
Language)": "GLSL (OpenGL Shading Language)", + "Groovy": "Groovy", + "Haskell": "Haskell", + "IDL": "IDL", + "JSP": "Java", + "Java": "Java", + "JavaScript": "JavaScript", + "Kaya": "Kaya", + "LPC": "LPC", + "LabVIEW": "LabVIEW", + "Lazarus": "Pascal", + "Lisp": "Lisp", + "Logo": "Logo", + "LotusScript": "LotusScript", + "Lua": "Lua", + "MATLAB": "MATLAB", + "MUMPS": "MUMPS", + "Mathematica": "Mathematica", + "Modula": "Modula", + "OCaml (Objective Caml)": "OCaml", + "Oberon": "Oberon", + "Object Pascal": "Delphi/Object Pascal", + "Objective C": "Objective-C", + "Objective-C 2.0": "Objective-C", + "Oz": "Oz", + "PHP": "PHP", + "PL/SQL": "PL/SQL", + "PROGRESS": "Progress 4GL", + "Pascal": "Pascal", + "Perl": "Perl", + "Pike": "Pike", + "Prolog": "Prolog", + "Python": "Python", + "REALbasic": "REALBasic", + "REBOL": "REBOL", + "Rexx": "REXX", + "Ruby": "Ruby", + "S/R": "SR", + "Scala": "Scala", + "Scheme": "Scheme", + "Scilab": "Scilab", + "Scriptol": "Scriptol", + "Simulink": "Simulink", + "Smalltalk": "Smalltalk", + "Standard ML": "Standard ML", + "Tcl": "Tcl", + "Transcript/Revolution": "Revolution", + "Unix Shell": "Shell", + "VBScript": "VBScript", + "VHDL/Verilog": "Verilog", + "Visual Basic": "(Visual) Basic", + "Visual Basic .NET": "Visual Basic .NET", + "Visual Basic for Applications (VBA)": "(Visual) Basic", + "Visual FoxPro": "(Visual) FoxPro", + "XBase/Clipper": "Clipper", + "XBasic": "XBasic", + "XSL (XSLT/XPath/XSL-FO)": "XSLT", + "Yacc": "yacc", + "haXe": "haXe", } diff --git a/minecode/visitors/__init__.py b/minecode/miners/__init__.py similarity index 76% rename from minecode/visitors/__init__.py rename to minecode/miners/__init__.py index 0120c82d..70314368 100644 --- a/minecode/visitors/__init__.py +++ b/minecode/miners/__init__.py @@ -7,13 +7,9 @@ # See https://aboutcode.org for more information about nexB OSS projects. # - -from functools import total_ordering -import gzip import json -import os import pkgutil -import tempfile +from functools import total_ordering from minecode.utils import fetch_http from minecode.utils import get_temp_file @@ -21,35 +17,48 @@ # FIXME: use attr or use a plain ResourceURI object insteaad @total_ordering -class URI(object): +class URI: """ Describe a URI to visit as returned by Visitors subclasses or visit functions. This mostly mirrors the ResourceURI models as a plain Python object. """ + __slots__ = ( - 'uri', - 'source_uri', - 'package_url', - 'file_name', - 'size', - 'date', - 'md5', - 'sha1', - 'sha256', - 'priority', - 'data', - 'visited', - 'mining_level', - 'visit_error' + "uri", + "source_uri", + "package_url", + "file_name", + "size", + "date", + "md5", + "sha1", + "sha256", + "priority", + "data", + "visited", + "mining_level", + "visit_error", ) - def __init__(self, - uri, source_uri=None, package_url=None, - file_name=None, size=None, date=None, md5=None, sha1=None, sha256=None, - priority=0, - data=None, visited=False, mining_level=0, visit_error=None, **kwargs - ): + def __init__( + self, + uri, + source_uri=None, + package_url=None, + file_name=None, + size=None, + date=None, + md5=None, + sha1=None, + sha256=None, + priority=0, + data=None, + visited=False, + mining_level=0, + visit_error=None, + **kwargs, + ): """ Construct a new URI. A URI represents an address and extra information about this address at some point in time. 
 `uri` is a mandatory URI
@@ -95,7 +104,7 @@ def to_dict(self, data_is_json=False):
         ordered_dict = dict()
         for k in self.__slots__:
             value = getattr(self, k)
-            if value and data_is_json and k == 'data':
+            if value and data_is_json and k == "data":
                 value = json.loads(value)
             ordered_dict[k] = value
         return ordered_dict
@@ -107,19 +116,21 @@ def __eq__(self, other):
         return isinstance(other, URI) and self.to_dict() == other.to_dict()
 
     def __lt__(self, other):
-        return (isinstance(other, URI)
-                and self.to_dict().items() < other.to_dict().items())
+        return (
+            isinstance(other, URI) and self.to_dict().items() < other.to_dict().items()
+        )
 
     def __repr__(self):
-        args = [key + '=%(' + key + ')r' for key in self.__slots__
-                if getattr(self, key, None)]
-        return ('URI(' + ', '.join(args) + ')') % self.to_dict()
+        args = [
+            key + "=%(" + key + ")r"
+            for key in self.__slots__
+            if getattr(self, key, None)
+        ]
+        return ("URI(" + ", ".join(args) + ")") % self.to_dict()
 
     @classmethod
     def from_db(cls, resource_uri):
-        """
-        Build a new URI from a ResourceURI model object.
-        """
+        """Build a new URI from a ResourceURI model object."""
        kwargs = {}
         for key in cls.__slots__:
             value = getattr(resource_uri, key, None)
@@ -129,11 +140,12 @@ def from_db(cls, resource_uri):
         return URI(**kwargs)
 
 
-class Visitor(object):
+class Visitor:
     """
     Abstract base class for visitors. Subclasses must implement the fetch() and
     get_uris() methods and use a routing decorator for the URIs they can handle.
     """
+
     save_data = True
 
     def __call__(self, uri):
@@ -154,9 +166,7 @@ def __call__(self, uri):
         return uris_to_visit, self.dumps(content_object), None
 
     def fetch(self, uri):
-        """
-        Fetch and return the content found at a remote URI.
-        """
+        """Fetch and return the content found at a remote URI."""
         raise NotImplementedError
 
     def get_uris(self, content):
@@ -217,17 +227,14 @@ def fetch(self, uri, timeout=10):
 
         `timeout` is a default timeout.
         """
-        content = super(NonPersistentHttpVisitor,
-                        self).fetch(uri, timeout=timeout)
-        temp_file = get_temp_file('NonPersistentHttpVisitor')
-        with open(temp_file, 'wb') as tmp:
+        content = super().fetch(uri, timeout=timeout)
+        temp_file = get_temp_file("NonPersistentHttpVisitor")
+        with open(temp_file, "wb") as tmp:
             tmp.write(content)
         return temp_file
 
     def dumps(self, content):
-        """
-        Return nothing. The content should not be saved.
-        """
+        """Return nothing. The content should not be saved."""
         return None
 
@@ -245,11 +252,30 @@ def loads(self, content):
         return json.loads(content)
 
 
+class Mapper:
+    """
+    Abstract base class for mappers. Subclasses must implement the
+    get_packages() method and use a routing decorator for the URIs they can
+    handle.
+    """
+
+    def __call__(self, uri, resource_uri):
+        # Note: we let exceptions bubble up; they will be caught and
+        # processed by the worker loop
+        return self.get_packages(uri, resource_uri)
+
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield ScannedPackage objects (or return a list) built from a
+        resource_uri ResourceURI object.
+        """
+        raise NotImplementedError
+
+
 """
 Minimal way to recursively import all submodules dynamically. If this module
 is imported, all submodules will be imported: this triggers the actual
 registration
-of visitors.
-This should stay as the last import in this init module.
+of miners. This should stay as the last import in this init module.
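As an illustration (assuming this repository's package layout), a single import is enough to pull in and register every miner module:

    # importing the package walks and imports minecode.miners.apache, etc.,
    # so their @visit_router.route / @map_router.route decorators run
    import minecode.miners  # noqa: F401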
""" -for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + '.'): +for _, name, _ in pkgutil.walk_packages(__path__, prefix=__name__ + "."): __import__(name) diff --git a/minecode/miners/apache.py b/minecode/miners/apache.py new file mode 100644 index 00000000..1c56ccca --- /dev/null +++ b/minecode/miners/apache.py @@ -0,0 +1,600 @@ +# +# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. +# + +import json +import logging +from itertools import chain + +import packagedcode.models as scan_models +from commoncode import fileutils +from packageurl import PackageURL + +from minecode import ls +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.utils import parse_date + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +""" +Collect data from Apache.org. +There are two primary sources of data: + +1. directory listings of the downloads distribution web site apache.org/dist + and archive.apache.org. These map well to packages but we get little or no + data beside a checksum and some name and painfully extracted version. + This data could also be fetched for the most recent ones (since 2012) from: + https://dist.apache.org/repos/dist/release/ which is an SVN repo + And svn ls -R https://dist.apache.org/repos/dist/release/ could be more + efficient and easier to parse incrementally? + +2. JSON data collated by the Foundation to provide project information. These + are for projects and do not map very well to a package or download (but + rather to several of thems at once) + +The JSON data comes from https://projects.apache.org/about.html and +is created with this code: +https://svn.apache.org/repos/asf/comdev/projects.apache.org/trunk/ . + +This JSON data is project-level except for releases-files.json .. but this is +just based on parsing the find-ls directory listing so bring nothing new. + +- http://home.apache.org/public/public_ldap_projects.json : seems to be the + origin for the projects.json and podlings.json data + +- These is a list of VCS repositories. Each key maps rather well to a package + name. But the key (some package name?) may not match a project: + https://projects.apache.org/json/foundation/repositories.json + This comes from http://git.apache.org/ + +- This more or less maps to top-level projects but does not relate to packages + https://projects.apache.org/json/foundation/committees.json + +- This list podling projects with only few details and does not map to packages + https://projects.apache.org/json/foundation/podlings.json + +- This should contain an entry for each project but does not. Yet each JSON + contains also the releases.json and repositories.json content for that project. 
+  https://projects.apache.org/json/projects/
+
+- This seems to be the origin of most project data:
+  https://svn.apache.org/repos/asf/comdev/projects.apache.org/trunk/data/projects.xml
+
+- Another source of the JSON may be:
+  https://whimsy.apache.org/public/
+"""
+
+
+class ApacheSeed(seed.Seeder):
+    def get_seeds(self):
+        # note: this is the same as below and does not list archived files
+        # https://archive.apache.org/dist/zzz/find-ls.gz
+        # to get these we need to rsync or use other techniques
+        yield "https://apache.org/dist/zzz/find-ls.gz"

+        # FIXME: we cannot relate this to a download package: disabled for now
+        # yield 'https://projects.apache.org/json/foundation/projects.json'
+        # yield 'https://projects.apache.org/json/foundation/podlings.json'
+
+
+CHECKSUM_EXTS = (
+    ".sha256",
+    ".sha512",
+    ".md5",
+    ".sha",
+    ".sha1",
+)
+
+# only keep downloads with certain archive, package and checksum extensions
+ARCHIVE_EXTS = (
+    # archives
+    ".jar",
+    ".zip",
+    ".tar.gz",
+    ".tgz",
+    ".tar.bz2",
+    ".war",
+    ".tar.xz",
+    ".tgz",
+    ".tar",
+    # packages
+    # '.deb', '.rpm', '.msi', '.exe',
+    ".whl",
+    ".gem",
+    ".nupkg",
+    # '.dmg',
+    # '.nbm',
+)
+
+IGNORED_PATH_CONTAINS = (
+    "META/",  # #
+    # doc
+    "/documentation/",
+    "/doc/",  # #
+    "-doc.",  # #
+    "-doc-",  # #
+    "/docs/",  # #
+    "-docs.",  # #
+    "-docs-",  # #
+    "javadoc",  # #
+    "fulldoc",  # #
+    "apidoc",  # #
+    "-manual.",
+    "-asdocs.",  # #
+    # eclipse p2/update sites are redundant
+    # redundant
+    "updatesite/",  # #
+    "eclipse-update-site",  # #
+    "update/eclipse",  # #
+    "sling/eclipse",  # #
+    "eclipse.site-",
+    # large multi-origin binary distributions
+    "-distro.",
+    "-bin-withdeps.",
+    "-bin-with-deps",
+    # these are larger distributions with third-parties
+    "apache-airavata-distribution",
+    "apache-airavata-server",
+    "apache-mahout-distribution",
+    "/syncope-standalone-",
+    "binaries/conda",
+    # obscure
+    "perl/contrib",
+    # index data
+    "zzz",
+    # doc
+    "ant/manual",
+)
+
+
+# TODO: ignore these globs too:
+
+# openoffice/*/binaries is very large
+# /*/apache-log4j-*-site.zip
+
+
+SOURCE_INDICATORS = (
+    "_src.",
+    "-src.",
+    "-source.",
+    "-sources.",
+    "-source-release",
+    "/source/",
+    "/sources/",
+    "/src/",
+    "_sources.",
+)
+
+
+BINARY_INDICATORS = ()
+
+
+@visit_router.route(r"https?://apache.org/dist/zzz/find\-ls\.gz")
+class ApacheDistIndexVisitor(NonPersistentHttpVisitor):
+    """
+    Collect URIs for all packages in the "find -ls" index available from Apache
+    dist sites.
+    """
+
+    def get_uris(self, content):
+        import gzip
+
+        with gzip.open(content, "rt") as f:
+            content = f.read()
+
+        url_template = "https://apache.org/dist/{path}"
+
+        archive_checksum_extensions = tuple(
+            chain.from_iterable(
+                [[ae + cke for ae in ARCHIVE_EXTS] for cke in CHECKSUM_EXTS]
+            )
+        )
+        kept_extensions = archive_checksum_extensions + ARCHIVE_EXTS
+
+        for entry in ls.parse_directory_listing(content, from_find=True):
+            # skip directories, links and special files
+            if entry.type != ls.FILE:
+                continue
+            path = entry.path
+
+            # ignore several downloads
+            if not path.endswith(kept_extensions) or any(
+                i in path for i in IGNORED_PATH_CONTAINS
+            ):
+                continue
+            # only checksums need a further visit; the archives themselves will
+            # only be scanned
+            is_visited = not path.endswith(CHECKSUM_EXTS)
+
+            yield URI(
+                visited=is_visited,
+                source_uri=self.uri,
+                uri=url_template.format(path=path),
+                package_url=build_purl(path),
+                size=entry.size,
+            )
+
+
+def build_purl(uri):
+    """
+    Return a PackageURL built from an Apache download URL or path.
+
+    URLs start with this prefix 'https://apache.org/dist/'
+    """
+    # FIXME: this is the essence of collecting names and versions for Apache and
+    # this needs to be super robust
+    segments = [p for p in uri.split("/") if p]
+    version = None
+    project_name = segments[0]
+    # The path typically contains the version, but where it appears is highly
+    # inconsistent:
+    # - bahir/bahir-spark/2.1.1/apache-bahir-2.1.1-src.zip
+    # - groovy/2.4.15/sources/apache-groovy-src-2.4.15.zip
+    # FIXME: this is not correct
+    if len(segments) > 1 and ("/distribution/" in uri or "/sources/" in uri):
+        version = segments[1]
+
+    package_url = PackageURL(
+        type="apache",
+        # TODO: namespace='',
+        name=project_name,
+        version=version,
+    )
+
+    return package_url
+
+
+@visit_router.route(
+    r"https?://(archive\.)?apache.org/dist/.*\.(md5|sha1?|sha256|sha512)",
+)
+class ApacheChecksumVisitor(HttpVisitor):
+    """Collect files that contain archive checksums."""
+
+    def dumps(self, content):
+        if content:
+            # the format can be md5sum-like this way:
+            # c7a2d3becea1d28b518528f8204b8d2a  apache-groovy-docs-2.4.6.zip
+            # with split on space to get the checksum value.
+            content = content.split()
+            if content:
+                content = content[0]
+            else:
+                content = ""
+        return content
+
+
+# FIXME: we cannot relate this to a download package: disabled for now
+# @visit_router.route('https://projects.apache.org/json/foundation/projects.json')
+class ApacheProjectsJsonVisitor(HttpJsonVisitor):
+    """
+    Collect URIs for all Apache projects.
+
+    The json format is like:
+    "abdera": {
+        "bug-database": "https://issues.apache.org/jira/browse/ABDERA",
+        "category": "xml",
+        "created": "2008-12-25",
+        "description": "The goal of the Apache Abdera project ....",
+        "doap": "http://svn.apache.org/repos/asf/abdera/java/trunk/doap_Abdera.rdf",
+        "download-page": "http://abdera.apache.org/#downloads",
+        "homepage": "http://abdera.apache.org",
+        "license": "http://usefulinc.com/doap/licenses/asl20",
+        "mailing-list": "http://abdera.apache.org/project.html#lists",
+        "name": "Apache Abdera",
+        "pmc": "abdera",
+        "programming-language": "Java",
+        "release": [
+            {
+                "created": "2008-04-11",
+                "name": "Apache Abdera 0.4",
+                "revision": "1.7.1"
+            }
+        ],
+        "repository": [
+            "http://svn.apache.org/repos/asf/abdera"
+        ],
+        "shortdesc": "An open source Atom implementation"
+    },
+    """
+
+    def get_uris(self, content):
+        url_template = "https://projects.apache.org/json/projects/{name}.json"
+        for project_name, project_meta in content.items():
+            package_url = PackageURL(type="apache", name=project_name)
+            yield URI(
+                uri=url_template.format(name=project_name),
+                package_url=package_url.to_string(),
+                date=project_meta.get("created"),
+            )
+
+
+# FIXME: we cannot relate this to a download package: disabled for now
+# @visit_router.route('https://projects.apache.org/json/projects/.*json')
+class ApacheSingleProjectJsonVisitor(HttpJsonVisitor):
+    """
+    Collect the json content of a single project json file. It does not
+    return any URI since the json contains only the project metadata: this
+    visitor just fetches the json to pass it on to the mapper.
+    """
+
+    pass
+
+
+# FIXME: what can we do with a homepage and name, packagedb-wise??
+# @visit_router.route('https://projects.apache.org/json/foundation/podlings.json')
+class ApachePodlingsJsonVisitor(HttpJsonVisitor):
+    """
+    Collect name and homepage for all podlings aka "incubator" projects.
+
+    The json format is like:
+    "airflow": {
+        "description": "Airflow is a workflow automation and scheduling ...",
+        "homepage": "http://airflow.incubator.apache.org/",
+        "name": "Apache Airflow (Incubating)",
+        "pmc": "incubator",
+        "podling": true,
+        "started": "2016-03"
+    },
+    """
+
+    def get_uris(self, content):
+        for project_name, project_meta in content.items():
+            if "homepage" not in project_meta:
+                continue
+
+            package_url = PackageURL(
+                type="apache", namespace="incubator", name=project_name
+            )
+
+            yield URI(
+                uri=project_meta.get("homepage"),
+                package_url=package_url.to_string(),
+                data=project_meta,
+                source_uri=self.uri,
+                visited=True,
+            )
+
+
+# common licenses found in JSON
+APACHE_LICENSE_URL = {
+    "http://usefulinc.com/doap/licenses/asl20",
+    "https://usefulinc.com/doap/licenses/asl20",
+    "http://spdx.org/licenses/Apache-2.0",
+    "https://spdx.org/licenses/Apache-2.0",
+    "http://www.apache.org/licenses/LICENSE-2.0",
+    "https://www.apache.org/licenses/LICENSE-2.0",
+    "http://www.apache.org/licenses/LICENSE-2.0.txt",
+    "https://www.apache.org/licenses/LICENSE-2.0.txt",
+    "http://www.apache.org/licenses/",
+    "http://forrest.apache.org/license.html",
+    "https://svn.apache.org/repos/asf/tomee/tomee/trunk/LICENSE",
+}
+
+
+# FIXME: this is NOT specific to a download URL but to a project: disabled for now
+# @map_router.route('https://projects.apache.org/json/foundation/projects.json')
+class ApacheProjectJsonMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from a resource_uri record for a single
+        package version.
+        """
+        metadata = json.loads(resource_uri.data)
+        return build_packages_from_projects(metadata, uri=uri)
+
+
+def build_packages_from_projects(metadata, uri=None):
+    """
+    Yield Packages built from an Apache `metadata` mapping, which is a
+    dictionary keyed by project name with project metadata as values.
+    Yield as many Packages as there are download URLs.
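+
+    An illustrative `metadata` mapping (same shape as the projects.json
+    sample shown above, abbreviated):
+
+        {"abdera": {
+            "name": "Apache Abdera",
+            "homepage": "http://abdera.apache.org",
+            "shortdesc": "An open source Atom implementation",
+            "programming-language": "Java",
+        }}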
+    """
+    for project_name, project_meta in metadata.items():
+        short_desc = project_meta.get("shortdesc")
+        long_desc = project_meta.get("description")
+        descriptions = [d for d in (short_desc, long_desc) if d and d.strip()]
+        description = "\n".join(descriptions)
+        common_data = dict(
+            datasource_id="apache_json",
+            type="apache",
+            name=project_name,
+            description=description,
+            homepage_url=project_meta.get("homepage"),
+            bug_tracking_url=project_meta.get("bug-database"),
+            primary_language=project_meta.get("programming-language"),
+        )
+
+        # FIXME: setting the download-page as the download_url is not right
+        if project_meta.get("download-page"):
+            download_url = project_meta.get("download-page")
+            common_data["download_url"] = download_url
+        for repo in project_meta.get("repository", []):
+            common_data["code_view_url"] = repo
+            # Package code_view_url only supports one URL, so break when
+            # finding a code_view_url
+            break
+
+        maintainers = project_meta.get("maintainer", [])
+        for maintainer in maintainers:
+            mailbox = maintainer.get("mbox", "").replace("mailto:", "")
+            name = maintainer.get("name")
+            party = scan_models.Party(
+                type=scan_models.party_person,
+                name=name,
+                role="maintainer",
+                email=mailbox,
+            )
+            parties = common_data.get("parties")
+            if not parties:
+                common_data["parties"] = []
+            common_data["parties"].append(party.to_dict())
+
+        # license is just a URL in the json file, for example:
+        # http://usefulinc.com/doap/licenses/asl20
+        license_url = project_meta.get("license")
+        common_data["extracted_license_statement"] = license_url
+
+        if license_url in APACHE_LICENSE_URL:
+            common_data["declared_license_expression"] = "apache-2.0"
+            common_data["declared_license_expression_spdx"] = "Apache-2.0"
+            common_data["license_detections"] = []
+
+        keywords = []
+        category = project_meta.get("category", "")
+        for kw in category.split(","):
+            kw = kw.strip()
+            if kw:
+                keywords.append(kw)
+        common_data["keywords"] = keywords
+
+        common_data["primary_language"] = project_meta.get("programming-language")
+
+        # FIXME: these cannot be related to actual packages with a download URL
+        releases = project_meta.get("release")
+        if releases:
+            for release in releases:
+                rdata = dict(common_data)
+                rdata["version"] = release.get("revision")
+                if release.get("created") and len(release.get("created")) == 10:
+                    rdata["release_date"] = parse_date(release.get("created"))
+                else:
+                    logger.warning(
+                        "Unexpected date format for release date: {}".format(
+                            release.get("created")
+                        )
+                    )
+                package = scan_models.Package.from_package_data(
+                    package_data=rdata,
+                    datafile_path=uri,
+                )
+                yield package
+        else:
+            package = scan_models.Package.from_package_data(
+                package_data=common_data,
+                datafile_path=uri,
+            )
+            yield package
+
+
+# FIXME: this is NOT specific to a download URL but to a project: disabled for now
+# FIXME: this is casting too wide a net!
+# @map_router.route('http?://[\w\-\.]+.incubator.apache.org/"')
+class ApachePodlingsMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from a resource_uri record for a single
+        package version.
+        """
+        metadata = json.loads(resource_uri.data)
+        return build_packages_from_podlings(metadata, resource_uri.package_url)
+
+
+def build_packages_from_podlings(metadata, purl):
+    """
+    Yield a Package built from a single Apache podling `metadata` mapping
+    with name, description and homepage fields, as collected by
+    ApachePodlingsJsonVisitor.
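+
+    An illustrative podling `metadata` mapping (same shape as the
+    podlings.json sample shown above):
+
+        {"name": "Apache Airflow (Incubating)",
+         "homepage": "http://airflow.incubator.apache.org/",
+         "description": "Airflow is a workflow automation and scheduling ..."}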
+    """
+    name = metadata.get("name")
+    if name:
+        common_data = dict(
+            type="apache-podling",
+            name=name,
+            description=metadata.get("description"),
+            homepage_url=metadata.get("homepage"),
+        )
+        package = scan_models.Package(**common_data)
+        package.set_purl(purl)
+        yield package
+
+
+@map_router.route(r"https?://(archive\.)?apache\.org/dist/.*")
+class ApacheDownloadMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """Yield Packages built from a bare download URI or download checksum URI."""
+        if uri.endswith(CHECKSUM_EXTS):
+            # 1. create a regular package from the URL stripped from its checksum extension
+            archive_uri, _, checksum_type = uri.rpartition(".")

+            pack = build_package_from_download(archive_uri, resource_uri.package_url)
+            # 2. collect the checksum inside the file
+            # and attach it to the package
+            checksum_value = resource_uri.data.strip()
+            if checksum_value:
+                checksum_field_name = "download_{checksum_type}".format(**locals())
+                setattr(pack, checksum_field_name, checksum_value)
+            yield pack
+        else:
+            # a plain download URI
+            yield build_package_from_download(uri, resource_uri.package_url)
+
+
+def build_package_from_download(uri, purl=None):
+    """
+    Return a Package built from an Apache dist download archive URL.
+
+    The uri could be:
+    http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip
+    https://apache.org/dist/chemistry/opencmis/1.1.0/chemistry-opencmis-dist-1.1.0-server-webapps.zip
+    """
+    name, version = get_name_version(uri)
+    if purl:
+        purl = PackageURL.from_string(purl)
+        if not name:
+            name = purl.name
+    # FIXME: use purl data??
+    package = scan_models.Package(
+        type="apache",
+        namespace=purl.namespace,
+        name=name,
+        version=version,
+        download_url=uri,
+    )
+    package.set_purl(purl)
+    return package
+
+
+# FIXME: there should be only one such method and this one is rather weak
+def get_name_version(uri):
+    """Return name and version extracted from a path."""
+    # base_url will end being 'https://archive.apache.org/dist' or 'https://apache.org/dist'
+    # path is the uri without base url, for example:
+    # /groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip
+    _, _, path = uri.partition("apache.org/dist/")
+    base_name = fileutils.file_base_name(path)
+    version = None
+    package_name = ""
+    name_segments = base_name.split("-")
+    for segment in name_segments:
+        if version:
+            # A segment after the first version segment belongs to the version
+            # too. For example in turbine-4.0-M1, after detecting 4.0, M1 is
+            # appended so the final version is 4.0-M1
+            version = "-".join([version, segment])
+            continue
+
+        # A segment is a version segment if all its dot-separated parts are
+        # integers. For example in '1.2.3' every char is a digit or a period.
+        is_all_int = all(n.isdigit() for n in segment.split("."))
+        if is_all_int:
+            version = segment
+        else:
+            # Re-join the package_name segments with - because we split on -
+            # earlier; once we meet a version segment, package_name is complete.
+            if not package_name:
+                package_name = segment
+            else:
+                package_name = "-".join([package_name, segment])
+    return package_name, version
diff --git a/minecode/miners/bitbucket.py b/minecode/miners/bitbucket.py
new file mode 100644
index 00000000..91b3b4fc
--- /dev/null
+++ b/minecode/miners/bitbucket.py
@@ -0,0 +1,304 @@
+#
+# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
+# + +import json +import logging + +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +""" +Mercurial support is retiring in mid 2020 and only git is +available after that. +https://bitbucket.org/blog/sunsetting-mercurial-support-in-bitbucket + + +TODO: collect actual packages.... +TODO: collect counts and more: +watchers count: + https://api.bitbucket.org/2.0/repositories/mikael/stellaris/watchers?fields=size +forks count: + https://api.bitbucket.org/2.0/repositories/mikael/stellaris/forks?fields=size +tags: + https://api.bitbucket.org/2.0/repositories/mchaput/whoosh/refs/tags +then the tag download is with: + https://bitbucket.org/pypa/setuptools/get/.zip + https://bitbucket.org/pypa/setuptools/get/20.1.1.tar.bz2 + +the latest commit to get a download link: + https://api.bitbucket.org/2.0/repositories/pypa/setuptools/commits + This gets the count of commits. + the link is then: https://bitbucket.org/pypa/setuptools/get/.tar.bz2 + +the downloads if any: +https://api.bitbucket.org/2.0/repositories/pypa/setuptools/downloads +each download has a count and a URL such as: +https://api.bitbucket.org/2.0/repositories/pypa/setuptools/downloads/setuptools-19.6b1.zip + this URL can also be built using the filename as: + https://bitbucket.org/pypa/setuptools/downloads/setuptools-19.6b1.zip + +Also there is no value to add repos that are empty and have no downloads. +Therefore we should better: +1. collect repo data as a "template" only record +2. effectively create package IFF there are commits and/or downloads. +2.1 if commits and no tags: make a single package using the latest commit +2.2 if tags: use these for packages +2.3 if downloads: use these packages + +NB: we can also get only certain fields: +https://api.bitbucket.org/2.0/repositories/pypa/setuptools?pagelen=1&fields=size,links,full_name +https://api.bitbucket.org/2.0/repositories/pypa/setuptools/watchers?pagelen=1&fields=size,values.links +""" + + +class BitbucketSeed(seed.Seeder): + def get_seeds(self): + yield "https://api.bitbucket.org/2.0/repositories?pagelen=400" + + +# TODO: review mapper +@visit_router.route( + r"https://api\.bitbucket\.org/2\.0/repositories\?pagelen=.*", +) +class BitbucketIndexVisitor(HttpJsonVisitor): + """ + Collect repository data through paginated API calls. + The index contains repo-level data for every repo. + """ + + def get_uris(self, content): + next_page = content.get("next") + if next_page: + yield URI(uri=next_page, source_uri=self.uri) + + +@visit_router.route(r"https://api\.bitbucket\.org/2\.0/repositories/[^\/]*/[^\/\?]*/?") +class BitbucketSingleRepoVisitor(HttpJsonVisitor): + """ + Collect data for a single repository. + Note: this is strictly equivalent to one item of the index paginated calls. 
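+
+    For example (illustrative):
+    https://api.bitbucket.org/2.0/repositories/pypa/setuptools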
+ """ + + def get_uris(self, content): + return get_repo_uris(content, source_uri=self.uri) + + +@visit_router.route( + r"https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*/(refs/tags|downloads).*" +) +class BitbucketDetailsVisitorPaginated(HttpJsonVisitor): + """Collect repository details for data that are paginated.""" + + def get_uris(self, content): + next_page = content.get("next") + if next_page: + purl = get_purl(self.uri) + yield URI(uri=next_page, source_uri=self.uri, package_url=purl) + + +@visit_router.route( + r"https://api\.bitbucket\.org/2\.0/repositories/[^\/]*/[^\/]*/(watchers|forks|commits).*" +) +class BitbucketDetailsVisitor(HttpJsonVisitor): + """Collect repository details for data that are not paginated.""" + + pass + + +def get_repo_ns_name(url_like): + """ + Return a namespace and name for a bitbucket repo given something that looks + like a bitbucket URL. + + For example: + >>> get_repo_ns_name('https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/tags?pagelen=2') + ('bastiand', 'mercurialeclipse') + >>> get_repo_ns_name('https://bitbucket.org/bastiand/mercurialeclipse/src') + ('bastiand', 'mercurialeclipse') + >>> get_repo_ns_name('/bastiand/mercurialeclipse/src') + ('bastiand', 'mercurialeclipse') + """ + if url_like.startswith("https://api.bitbucket.org"): + head, _, path = url_like.partition("2.0/repositories") + if head: + segments = [p for p in path.split("/") if p] + if len(segments) >= 2: + ns = segments[0] + name = segments[1] + return ns, name + + if url_like.startswith("https://bitbucket.org/"): + head, _, path = url_like.partition("bitbucket.org/") + if head: + segments = [p for p in path.split("/") if p] + if len(segments) >= 2: + ns = segments[0] + name = segments[1] + return ns, name + + segments = [p for p in url_like.strip("/").split("/") if p] + if len(segments) >= 2: + ns = segments[0] + name = segments[1] + return ns, name + + +def get_purl(url_like): + """Return a Package URL string created from a bitbucket url or url-like.""" + ns_name = get_repo_ns_name(url_like) + if not ns_name: + return + ns, name = ns_name + return PackageURL(type="bitbucket", namespace=ns, name=name).to_string() + + +def get_repo_uris(repo_data, source_uri): + """Yield URIs from a single repository `repo_data` data.""" + full_name = repo_data.get("full_name", "").strip() + package_url = get_purl(full_name) + links = repo_data.get("links", {}) + repo_uri = links.get("html", {}).get("href") + if not repo_uri: + repo_uri = f"https://bitbucket.org/{full_name}" + + # Yield URI for latest commits, tags and downloads as candidate packages. 
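+    # For example (illustrative), the commits URL for pypa/setuptools is:
+    # https://api.bitbucket.org/2.0/repositories/pypa/setuptools/commits?pagelen=1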
+    commits_url = links.get("commits", {}).get("href")
+    # we only care about the latest commit
+    commits_url += "?pagelen=1"
+    yield URI(uri=commits_url, package_url=package_url, source_uri=source_uri)
+
+    # for counts only: these should go to the package template
+    for link in ("forks", "watchers"):
+        url = links.get(link, {}).get("href")
+        if url:
+            # we get a single field and only one page
+            url += "?pagelen=1&fields=size"
+            yield URI(uri=url, package_url=package_url, source_uri=source_uri)
+
+    for link in ("refs/tags", "downloads"):
+        url = links.get(link, {}).get("href")
+        if url:
+            # paginated, we want them all
+            url += "?pagelen=100"
+            yield URI(uri=url, package_url=package_url, source_uri=source_uri)
+
+
+@map_router.route(
+    r"https://api.bitbucket\.org/2\.0/repositories/.*/downloads/",
+)
+class BitbucketDownloadMapper(Mapper):
+    """Build packages from download urls if present."""
+
+    def get_packages(self, uri, resource_uri):
+        """Yield Packages built from a resource_uri record for a single package version."""
+        downloads_data = json.loads(resource_uri.data)
+        for download_data in downloads_data.get("values", []):
+            yield from build_bitbucket_download_packages(
+                download_data, resource_uri.package_url
+            )
+
+
+def build_bitbucket_download_packages(download_data, purl):
+    """
+    Yield scanned Packages for each download
+    https://api.bitbucket.org/2.0/repositories/pypa/setuptools/downloads/
+    """
+    purl = PackageURL.from_string(purl)
+    namespace = purl.namespace
+    name = purl.name
+
+    # FIXME: add these ?
+    # filename = download_data.get("name")
+    # download_counts = download_data.get("downloads", 0)
+
+    download_url = download_data.get("links", {}).get("self", {}).get("href")
+    size = download_data.get("size")
+
+    package = scan_models.Package(
+        type="bitbucket",
+        name=name,
+        namespace=namespace,
+        download_url=download_url,
+        size=size,
+    )
+    package.set_purl(purl)
+    yield package
+
+
+# @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*')
+class BitbucketIndexMapper(Mapper):
+    """Build a Package for a repo."""
+
+    def get_packages(self, uri, resource_uri):
+        repo = json.loads(resource_uri.data)
+        if not repo:
+            return
+        yield build_bitbucket_repo_package(repo, resource_uri.package_url)
+
+
+# FIXME: disabled as this is for a package template
+# @map_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*')
+class BitbucketRepoMapper(Mapper):
+    """Build a Package for a repo."""
+
+    def get_packages(self, uri, resource_uri):
+        repo = json.loads(resource_uri.data)
+        if not repo:
+            return
+        yield build_bitbucket_repo_package(repo, resource_uri.package_url)
+
+
+def build_bitbucket_repo_package(repo_data, purl):
+    """
+    Return a Package "template" from repository data.
+    Note: this is not version-specific and has no download URL.
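+
+    For example (illustrative), with purl pkg:bitbucket/pypa/setuptools and a
+    "scm" of git, the vcs_url is git+https://bitbucket.org/pypa/setuptools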
+    """
+    purl = PackageURL.from_string(purl)
+    scm_protocol = repo_data.get("scm")
+    if not scm_protocol:
+        scm_protocol = "git"
+    bb_url = "{protocol}+https://bitbucket.org/{namespace}/{name}".format(
+        protocol=scm_protocol, **purl.to_dict()
+    )
+
+    owner = repo_data.get("owner")
+    owner_party = scan_models.Party(
+        type=scan_models.party_person,
+        name=owner.get("username"),
+        role="owner",
+        url=owner.get("links", {}).get("html", {}).get("href", {}),
+    )
+
+    if repo_data.get("has_issues"):
+        bug_tracking_url = bb_url + "/issues"
+    else:
+        bug_tracking_url = None
+
+    package = scan_models.Package(
+        type=purl.type,
+        namespace=purl.namespace,
+        name=purl.name,
+        homepage_url=repo_data.get("website") or bb_url,
+        code_view_url=bb_url + "/src",
+        bug_tracking_url=bug_tracking_url,
+        description=repo_data.get("description"),
+        vcs_url=bb_url,
+        primary_language=repo_data.get("language"),
+        parties=[owner_party],
+    )
+    package.set_purl(purl)
+    return package
diff --git a/minecode/miners/bower.py b/minecode/miners/bower.py
new file mode 100644
index 00000000..55aa3d81
--- /dev/null
+++ b/minecode/miners/bower.py
@@ -0,0 +1,218 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+
+from packagedcode import models as scan_models
+from packagedcode.models import DependentPackage
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import Mapper
+
+
+class BowerSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "https://registry.bower.io/packages"
+
+
+@visit_router.route("https://registry.bower.io/packages")
+class BowerTopJsonVisitor(HttpJsonVisitor):
+    """Collect URIs for all packages from the json returned."""
+
+    def get_uris(self, content):
+        """
+        Yield URIs given `content` from Bower.
+
+        The json content is a list with name and url, like the following format:
+        ...
+        {
+            "name": "bello",
+            "url": "https://github.com/QiaoBuTang/bello.git"
+        },
+        {
+            "name": "bello-gfw",
+            "url": "https://gitcafe.com/GilbertSun/bello.git"
+        },
+        ...
+        The url could be hosted on github, lolg.it, gitcafe (coding.net),
+        bitbucket, etc.
+        # FIXME: We should cover all urls beyond the above four categories.
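+
+        For example (illustrative), the entry {"name": "bello",
+        "url": "https://github.com/QiaoBuTang/bello.git"} yields the URI
+        https://raw.githubusercontent.com/QiaoBuTang/bello/master/bower.json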
+        """
+        github_base_url = (
+            "https://raw.githubusercontent.com/{owner}/{name}/master/bower.json"
+        )
+        lolg_base_url = "https://lolg.it/{owner}/{name}/raw/master/bower.json"
+        gitcafe_base_url = (
+            "https://coding.net/u/{owner}/p/{name}/git/raw/master/bower.json"
+        )
+        bitbucket_base_url = (
+            "https://bitbucket.org/{owner}/{name}/raw/master/bower.json"
+        )
+        base_url_map = {
+            "https://github.com/": github_base_url,
+            "https://lolg.it/": lolg_base_url,
+            "https://gitcafe.com/": gitcafe_base_url,
+            "https://bitbucket.org/": bitbucket_base_url,
+        }
+        for entry in content:
+            name = entry.get("name")
+            url = entry.get("url")
+            if name in url:
+                owner = None
+                package_url = PackageURL(type="bower", name=name).to_string()
+                for host_name, base_url in base_url_map.items():
+                    if url.startswith(host_name):
+                        owner = url[len(host_name) : url.index(name) - 1]
+                        yield URI(
+                            uri=base_url.format(owner=owner, name=name),
+                            package_url=package_url,
+                            source_uri=self.uri,
+                        )
+
+
+@visit_router.route(
+    "https://raw.githubusercontent.com/.*/master/bower.json",
+    "https://lolg.it/.*/master/bower.json",
+    "https://coding.net/.*/master/bower.json",
+    "https://bitbucket.org/.*/master/bower.json",
+)
+class BowerJsonVisitor(HttpJsonVisitor):
+    """Collect the content of the json itself by the visitor."""
+
+    pass
+
+
+@map_router.route(
+    "https://raw.githubusercontent.com/.*/master/bower.json",
+    "https://lolg.it/.*/master/bower.json",
+    "https://coding.net/.*/master/bower.json",
+)
+class BowerJsonMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from a resource_uri record for a single
+        package version.
+        Yield as many Packages as there are download URLs.
+        """
+        metadata = resource_uri.data
+        return build_packages_from_jsonfile(
+            metadata, resource_uri.uri, resource_uri.package_url
+        )
+
+
+def build_packages_from_jsonfile(metadata, uri=None, purl=None):
+    """Yield Packages built from Bower json content."""
+    content = json.loads(metadata)
+
+    licenses_content = content.get("licenses")
+    extracted_license_statement = set()
+    if licenses_content:
+        if isinstance(licenses_content, list):
+            for lic in licenses_content:
+                extracted_license_statement.add(lic)
+        else:
+            extracted_license_statement.add(licenses_content)
+
+    keywords_content = content.get("keywords", [])
+    name = content.get("name")
+
+    devdependencies = content.get("devDependencies")
+    dev_dependencies = []
+    if devdependencies:
+        for key, value in devdependencies.items():
+            dev_dependencies.append(
+                DependentPackage(
+                    purl=key, extracted_requirement=value, scope="devdependency"
+                ).to_dict()
+            )
+
+    dependencies = content.get("dependencies")
+    dependencies_build = []
+    if dependencies:
+        for key, value in dependencies.items():
+            dependencies_build.append(
+                DependentPackage(
+                    purl=key, extracted_requirement=value, scope="runtime"
+                ).to_dict()
+            )
+
+    if name:
+        vcs_tool, vcs_repo = get_vcs_repo(content)
+        if vcs_tool and vcs_repo:
+            # Form the vcs_url by
+            # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5
+            vcs_repo = vcs_tool + "+" + vcs_repo
+        common_data = dict(
+            type="bower",
+            name=name,
+            description=content.get("description"),
+            version=content.get("version"),
+            vcs_url=vcs_repo,
+            keywords=keywords_content,
+            homepage_url=content.get("homepage"),
+            datasource_id="bower_json",
+            license_detections=[],
+        )
+
+        if extracted_license_statement:
+            common_data["extracted_license_statement"] = list(
+                extracted_license_statement
+            )
+
+        author_content = content.get("author")
+        if author_content:
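+            # Note: bower.json "author" is treated here as a single display-name
+            # string, e.g. "Jane Doe" (illustrative), used as the party name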
+            parties = common_data.get("parties")
+            if not parties:
+                common_data["parties"] = []
+            common_data["parties"].append(
+                scan_models.Party(
+                    name=author_content,
+                    role="author",
+                ).to_dict()
+            )
+        else:
+            parties = common_data.get("parties")
+            if not parties:
+                common_data["parties"] = []
+            author_content = content.get("authors", [])
+            for author in author_content:
+                author_split = author.split(":")
+                if len(author_split) > 1:
+                    common_data["parties"].append(
+                        scan_models.Party(
+                            name=author_split[1].strip(),
+                            role="author",
+                        ).to_dict()
+                    )
+
+        dependencies = []
+        if dependencies_build:
+            dependencies.extend(dependencies_build)
+        if dev_dependencies:
+            dependencies.extend(dev_dependencies)
+        if len(dependencies) > 0:
+            common_data["dependencies"] = dependencies
+        package = scan_models.Package.from_package_data(
+            package_data=common_data,
+            datafile_path=uri,
+        )
+        package.set_purl(purl)
+        yield package
+
+
+def get_vcs_repo(content):
+    """Return the repo type and url."""
+    repo = content.get("repository", {})
+    if repo:
+        return repo.get("type"), repo.get("url")
+    return None, None
diff --git a/minecode/miners/cpan.py b/minecode/miners/cpan.py
new file mode 100644
index 00000000..d2eebcc1
--- /dev/null
+++ b/minecode/miners/cpan.py
@@ -0,0 +1,506 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+
+import packagedcode.models as scan_models
+import saneyaml
+from bs4 import BeautifulSoup
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
+from minecode.utils import parse_date
+
+
+class CpanSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "http://www.cpan.org/modules/01modules.index.html"
+        author_search_template = (
+            "https://fastapi.metacpan.org/author/_search?q=email:{char}*&size=5000"
+        )
+        for char in "abcdefghijklmnopqrstuvwxyz":
+            yield author_search_template.format(char=char)
+
+
+# The idea of the CPAN API visitor is based on
+# https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md
+#
+# From the doc: You can certainly scroll if you are fetching less than 5,000
+# items. You might want to do this if you are expecting a large data set, but
+# will still need to run many requests to get all of the required data.
+#
+# Since there are surely more than 5,000 results overall, we search twice:
+# first by author, then by release.
+#
+# First get all authors by searching emails from a-z, then get all releases of
+# each author. This keeps each returned result set small.

+# For example:

+# First run the author search: the following search URL returns all authors
+# whose email starts with 'a'; we loop over every letter from 'a' to 'z'.

+# https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000

+# With each author ID found in the returned json, we can then run the release
+# search URL as follows to get all the releases of that author.
+
+# https://fastapi.metacpan.org/release/_search?q=author:ABERNDT&size=5000
+
+
+@visit_router.route(
+    r"https://fastapi.metacpan.org/author/_search\?q=email:[a-z]\*&size=5000"
+)
+class MetaCpanAuthorURLVisitors(HttpJsonVisitor):
+    """
+    Run a search on the authors' emails, parse the returned json content, and
+    form the MetaCpanReleaseURLVisitors URLs by adding an AUTHOR condition.
+    For example, in https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000
+    a* stands for any email starting with 'a' (the same as 'A' since email is
+    case insensitive). The visitor covers all letters from a to z and yields
+    the release search URL for each author found.
+    """
+
+    def get_uris(self, content):
+        release_visitor_template = (
+            "https://fastapi.metacpan.org/release/_search?q=author:{id}&size=5000"
+        )
+        hits = content.get("hits", {})
+        inner_hits = hits.get("hits", [])
+        for hit in inner_hits:
+            _id = hit.get("_id")
+            if not _id:
+                continue
+            yield URI(uri=release_visitor_template.format(id=_id), source_uri=self.uri)
+
+
+@visit_router.route(
+    r"https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000"
+)
+class MetaCpanReleaseURLVisitors(HttpJsonVisitor):
+    """
+    Run the release search for the given AUTHOR ID: the visited json contains
+    the releases of that author. The class body is empty; it just fetches the
+    json content for use by the mapper.
+    """
+
+    pass
+
+
+@visit_router.route("http://www.cpan.org/modules/01modules.index.html")
+class CpanModulesVisitors(HttpVisitor):
+    """Return URIs by parsing the HTML of the cpan modules page."""
+
+    def get_uris(self, content):
+        """
+        Return the URIs of author pages; the returned URIs are the input of
+        CpanProjectHTMLVisitors
+        """
+        page = BeautifulSoup(content, "lxml")
+        url_template = "http://www.cpan.org/{path}"
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
+                continue

+            url = a["href"]
+            if not url:
+                continue

+            if url.startswith("../authors"):
+                if url.endswith((".zip", ".tar.gz")):
+                    # Skip tar.gz since it will be captured by the CpanProjectHTMLVisitors
+                    continue
+                else:
+                    url = url_template.format(path=url[3:])
+                    yield URI(uri=url, source_uri=self.uri)
+
+
+@visit_router.route("http://www.cpan.org/authors/.*/")
+class CpanProjectHTMLVisitors(HttpVisitor):
+    """
+    Visit the HTML page of a cpan project and return the Packages info, HTML
+    data and error.
+    """
+
+    def get_uris(self, content):
+        """
+        Return the URIs by looking for the tar.gz links in the html, and then
+        forming the URIs of the corresponding meta and readme files
+        """
+        page = BeautifulSoup(content, "lxml")
+        if self.uri.endswith("/"):
+            url_template = self.uri + "{path}"
+        else:
+            url_template = self.uri + "/{path}"
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
+                continue

+            url = a["href"]
+            if not url:
+                continue

+            if url.startswith(("/", "?")):
+                continue  # Avoid the directory and other non-file links
+            else:
+                name = url
+                name = (
+                    name.replace(".tar.gz", "")
+                    .replace(".readme", "")
+                    .replace(".meta", "")
+                )
+                partitions = name.rpartition("-")
+                name = partitions[0]
+                version = partitions[-1]
+                package_url = None
+                if name and version:
+                    package_url = PackageURL(
+                        type="cpan", name=name, version=version
+                    ).to_string()
+                url = url_template.format(path=url)
+                yield URI(uri=url, package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route("http://www.cpan.org/.*.meta")
+class CpanMetaVisitors(HttpVisitor):
+    """
+    Visit the meta file and return the metadata of the Package. The goal
+    of this visitor is to get the content rather than to return any valid
+    uris.
+    """
+
+    pass
+
+
+@visit_router.route("http://www.cpan.org/.*.readme")
+class CpanReadmeVisitors(HttpVisitor):
+    """Visit the readme file, translate it to json and dump it for mapper use."""
+
+    def dumps(self, content):
+        """Return the json by parsing the readme content"""
+        # Handle bytes properly in python3
+        if isinstance(content, bytes):
+            content = content.decode("utf-8")

+        lines = content.splitlines()
+        readme_dict = dict()
+        body = []
+        head = None
+        for line in lines:
+            if len(line) > 1 and line.isupper() and line[0] != " ":
+                if head:
+                    readme_dict[head] = "\n".join(body).lstrip("\n").rstrip("\n")
+                head = line
+                body = []
+            else:
+                body.append(line.strip())
+        # flush the last section too, otherwise it would be dropped
+        if head:
+            readme_dict[head] = "\n".join(body).lstrip("\n").rstrip("\n")
+        return json.dumps(readme_dict)
+
+
+@map_router.route(
+    r"https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000"
+)
+class MetaCpanReleaseSearchMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """Yield packages by parsing the json returned from the release search request."""
+        metadata = resource_uri.data
+        return build_packages_from_release_json(metadata, resource_uri.uri)
+
+
+def build_packages_from_release_json(metadata, uri=None):
+    """
+    Yield packages built from the json from the release search request.
+    metadata: json content with metadata
+    uri: the uri of the ResourceURI object
+    """
+    content = json.loads(metadata)
+    hits = content.get("hits", {})
+    inner_hits = hits.get("hits", [])
+    for hit in inner_hits:
+        release = hit.get("_source", {})
+        if not release:
+            continue
+        name = release.get("name")
+        if not name:
+            continue

+        extracted_license_statement = [
+            lic for lic in release.get("license", []) if lic and lic.strip()
+        ]

+        common_data = dict(
+            datasource_id="cpan_release_json",
+            type="cpan",
+            name=name,
+            description=release.get("abstract"),
+            version=release.get("version"),
+            download_url=release.get("download_url"),
+            extracted_license_statement=extracted_license_statement,
+            license_detections=[],
+            # the date format passed is like:
+            # "2014-04-20T21:30:13"
+            release_date=parse_date(release.get("date")),
+        )

+        # Get the homepage_url, license and vcs_repository/vcs_tool from the resources section.
+        # The resources section format is like this:
+        #  "resources" : {
+        #      "homepage" : "http://plackperl.org",
+        #      "license" : [
+        #         "http://dev.perl.org/licenses/"
+        #      ],
+        #      "bugtracker" : {
+        #         "web" : "https://github.com/plack/Plack/issues"
+        #      },
+        #      "repository" : {
+        #         "url" : "git://github.com/plack/Plack.git"
+        #      }
+        #  },
+        resources = release.get("resources") or {}

+        common_data["homepage_url"] = resources.get("homepage")
+        # Usually the license in the root node contains the license name,
+        # like perl_5. The license here under the resources section is the
+        # url of the license, for example: http://dev.perl.org/licenses/
+        # So it's useful to collect both.
+        license_url = [
+            lic for lic in resources.get("license", []) if lic and lic.strip()
+        ]
+        if license_url:
+            common_data["extracted_license_statement"].extend(license_url)

+        vcs_tool, vcs_repo = get_vcs_repo1(resources)
+        if vcs_tool and vcs_repo:
+            # Form the vcs_url by
+            # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5
+            vcs_repo = vcs_tool + "+" + vcs_repo
+            common_data["vcs_url"] = vcs_repo

+        bugtracker_section = resources.get("bugtracker", {})
+        common_data["bug_tracking_url"] = bugtracker_section.get("web")

+        if release.get("author"):
+            party = scan_models.Party(
+                type=scan_models.party_person, name=release.get("author"), role="author"
+            )
+            common_data["parties"] = common_data.get("parties", [])
+            common_data["parties"].append(party.to_dict())

+        package = scan_models.Package.from_package_data(
+            package_data=common_data,
+            datafile_path=uri,
+        )
+        package_url = PackageURL(
+            type="cpan", name=release.get("name"), version=release.get("version")
+        )
+        package.set_purl(package_url.to_string())
+        yield package
+
+
+def get_vcs_repo1(content):
+    """Return the repo type and url."""
+    repo_type = None
+    repo_url = None
+    repo = content.get("repository", {})
+    if repo:
+        url = repo.get("url")
+        if url:
+            repo_url = url
+            if ".git" in url:
+                repo_type = "git"
+    return repo_type, repo_url
+
+
+@map_router.route("http://www.cpan.org/.*.meta")
+class CpanMetaFileMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from a resource_uri record for a single
+        package version.
+        Yield as many Packages as there are download URLs.
+        """
+        metadata = resource_uri.data
+        return build_packages_from_metafile(
+            metadata, resource_uri.uri, resource_uri.package_url
+        )
+
+
+def build_packages_from_metafile(metadata, uri=None, purl=None):
+    """
+    Yield Packages built from a CPAN `metadata` content
+    metadata: json content with metadata
+    uri: the uri of the ResourceURI object
+    purl: String value of the package url of the ResourceURI object
+    """
+    # FIXME: it does not make sense to use a single function to deal with the
+    # two formats IMHO
+    if is_json(metadata):
+        content = json.loads(metadata)
+    else:
+        content = saneyaml.load(metadata)

+    licenses_content = content.get("license")
+    extracted_license_statement = []
+    if licenses_content:
+        if isinstance(licenses_content, list):
+            for lic in licenses_content:
+                extracted_license_statement.append(lic)
+        else:
+            extracted_license_statement.append(licenses_content)

+    keywords_content = content.get("keywords", [])

+    download_url = uri.replace(".meta", ".tar.gz") if uri else None

+    name = content.get("name")
+    if name:
+        vcs_tool, vcs_repo = get_vcs_repo(content)
+        if vcs_tool and vcs_repo:
+            # Form the vcs_url by
+            # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5
+            vcs_repo = vcs_tool + "+" + vcs_repo
+        common_data = dict(
+            datasource_id="cpan_meta_json",
+            type="cpan",
+            name=name,
+            description=content.get("abstract", name),
+            version=content.get("version"),
+            download_url=download_url,
+            extracted_license_statement=extracted_license_statement,
+            vcs_url=vcs_repo,
+            keywords=keywords_content,
+        )

+        parties = common_data["parties"] = []

+        for author_content in content.get("author", []):
+            # The author format is like: Abigail <email>
+            if "<" in author_content:
+                author_name, _, author_email = author_content.partition("<")
+                author_email = author_email.strip(">")
+            else:
+                author_name = author_content
+                author_email = ""

+            party = scan_models.Party(
+                role="author",
+                type=scan_models.party_person,
+                name=author_name.rstrip(),
+                email=author_email,
+            )

+            parties.append(party.to_dict())

+        package = scan_models.PackageData.from_data(package_data=common_data)
+        package.set_purl(purl)
+        yield package
+
+
+def get_vcs_repo(content):
+    """Return the repo type and url."""
+    repo = content.get("resources", {}).get("repository")
+    if repo:
+        if isinstance(repo, dict):
+            repo = repo.get("url", "")
+        if repo.startswith("git:"):
+            return "git", repo
+    return None, None
+
+
+def is_json(json_content):
+    try:
+        json.loads(json_content)
+    except ValueError:
+        return False
+    return True
+
+
+@map_router.route("http://www.cpan.org/.*.readme")
+class CpanReadmeFileMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from a resource_uri record for a single
+        package version.
+        Yield as many Packages as there are download URLs.
+        """
+        metadata = resource_uri.data
+        return build_packages_from_readmefile(
+            metadata, resource_uri.uri, resource_uri.package_url
+        )
+
+
+def build_packages_from_readmefile(metadata, uri=None, purl=None):
+    """
+    Yield Packages built from a CPAN `readme` content
+    metadata: json metadata content of readme file
+    uri: the uri of the ResourceURI object
+    purl: String value of the package url of the ResourceURI object
+    """
+    content = json.loads(metadata)
+    name = content.get("NAME")
+    if name:
+        download_url = uri.replace(".readme", ".tar.gz") if uri else None
+        vcs_tool, vcs_repo = get_vcs_repo_fromstring(content)
+        if vcs_tool and vcs_repo:
+            # Form the vcs_url by
+            # https://spdx.org/spdx-specification-21-web-version#h.49x2ik5
+            vcs_repo = vcs_tool + "+" + vcs_repo
+        copyr = content.get("COPYRIGHT and LICENSE")
+        common_data = dict(
+            datasource_id="cpan_readme",
+            type="cpan",
+            name=name,
+            description=content.get("ABSTRACT", name),
+            download_url=download_url,
+            vcs_url=vcs_repo,
+            copyright=copyr,
+            version=content.get("VERSION"),
+        )

+        authors = content.get("AUTHOR", [])
+        for author_content in authors:
+            author_split = author_content.split("<")
+            if len(author_split) > 1:
+                party = scan_models.Party(
+                    type=scan_models.party_person,
+                    name=author_split[0].rstrip(),
+                    role="author",
+                    email=author_split[1].replace(">", ""),
+                )
+                parties = common_data.get("parties")
+                if not parties:
+                    common_data["parties"] = []
+                common_data["parties"].append(party.to_dict())

+        keywords_content = []
+        if content.get("KEYWORDS"):
+            keywords_content = [content.get("KEYWORDS")]
+        common_data["keywords"] = keywords_content

+        package = scan_models.PackageData.from_data(package_data=common_data)
+        package.set_purl(purl)
+        yield package
+
+
+def get_vcs_repo_fromstring(content):
+    """Return the repo type and url."""
+    repo = content.get("DEVELOPMENT")
+    if repo and "git:" in repo and -1 < repo.find("<") < repo.find(">"):
+        return "git", repo[repo.find("<") + 1 : repo.find(">")]
+    else:
+        return None, None
diff --git a/minecode/miners/cran.py b/minecode/miners/cran.py
new file mode 100644
index 00000000..c524c36c
--- /dev/null
+++ b/minecode/miners/cran.py
@@ -0,0 +1,198 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import packagedcode.models as scan_models
+from bs4 import BeautifulSoup
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
+from minecode.utils import parse_date
+
+CRAN_URL = "https://cloud.r-project.org/"
+CRAN_WEB_URL = CRAN_URL + "web/"
+
+
+class CranSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "https://cloud.r-project.org/web/packages/available_packages_by_date.html"
+
+
+@visit_router.route(
+    "https://cloud.r-project.org/web/packages/available_packages_by_date.html"
+)
+class CranPackagesVisitors(HttpVisitor):
+    """Return URIs by parsing the HTML content of the page"""
+
+    def get_uris(self, content):
+        base_url = "https://cloud.r-project.org/web/packages/{package}/index.html"
+        a_blocks = BeautifulSoup(content, "lxml").find_all("a")
+        for a in a_blocks:
+            package = a.text
+            package_url = PackageURL(type="cran", name=package).to_string()
+            yield URI(
+                uri=base_url.format(package=package),
+                package_url=package_url,
+                source_uri=self.uri,
+            )
+
+
+@visit_router.route(r"https://cloud.r-project.org/web/packages/[\w\-\.]+/index.html")
+class CranSinglePackageVisitor(HttpVisitor):
+    """Return only the HTML content of the page; it will be parsed in the mapper"""
+
+    pass
+
+
+@map_router.route(r"https://cloud.r-project.org/web/packages/[\w\-\.]+/index.html")
+class CranMetaFileMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from a resource_uri record for a single
+        package version.
+        Yield as many Packages as there are download URLs.
+        """
+        metadata = resource_uri.data
+        return build_packages_from_html(
+            metadata, resource_uri.uri, resource_uri.package_url
+        )
+
+
+def get_download_url(url):
+    return url.replace("../../../", CRAN_URL)
+
+
+def get_dependencies(depends):
+    """Return a list of DependentPackage mappings from a comma-separated `depends` text."""
+    dep_pkgs = []
+    if not depends:
+        return dep_pkgs
+    dependencies = comma_separated(depends)
+    if not dependencies:
+        return dep_pkgs
+    for name in dependencies:
+        dep_pkgs.append(scan_models.DependentPackage(purl=name).to_dict())
+    return dep_pkgs
+
+
+def comma_separated(text):
+    """Return a list of strings from a comma-separated text."""
+    if not text:
+        return []
+    return [t.strip() for t in text.split(",") if t and t.strip()]
+
+
+def build_packages_from_html(metadata, uri=None, purl=None):
+    """
+    Yield Packages built from the HTML `metadata` content of a CRAN package page
+    metadata: HTML content of the package index page
+    uri: the uri of the ResourceURI object
+    purl: String value of the package url of the ResourceURI object
+    """
+    # Parse the name from the url, for example: https://cloud.r-project.org/web/packages/ANN2/index.html
+    common_data = dict(
+        datasource_id="cran_metadata",
+        type="cran",
+        name=uri.rpartition("/")[0].rpartition("/")[-1],
+    )
+    extracted_license_statement = []
+    download_urls = []
+
+    soup = BeautifulSoup(metadata, "lxml")
+    first_pblock = soup.find("p")
+    if first_pblock:
+        common_data["description"] = first_pblock.string
+    else:
+        h2_block = soup.find("h2")
+        if h2_block:
+            common_data["description"] = h2_block.string
+
+    tables = soup.find_all("table")
+    for table in tables:
+        rows = table.find_all("tr")
+        for row in rows:
+            col_values = []
+            cols = row.find_all("td")
+            for ele in cols:
+                if ele.find_all("a"):
+                    col_values.append([a["href"].strip() for a in ele.find_all("a")])
+                col_values.append(ele.text.strip())
+            if len(cols) >= 2:
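+                # For example (illustrative row): col_values[0] is a label such
+                # as "Version:" and col_values[1] its value, such as "2.1.1"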
+                key = col_values[0]
+                value = col_values[1]
+                if key == "Version:":
+                    common_data["version"] = value
+                elif key == "URL:":
+                    if isinstance(value, list) and len(value) > 0:
+                        homepages = []
+                        for home_page in value:
+                            homepages.append(home_page)
+                        common_data["homepage_url"] = "\n".join(homepages)
+                    else:
+                        common_data["homepage_url"] = value
+                elif key == "License:":
+                    for license_url in value:
+                        extracted_license_statement.append(license_url)
+                elif key == "Author:":
+                    parties = common_data.get("parties")
+                    if not parties:
+                        common_data["parties"] = []
+                    party = scan_models.Party(
+                        type=scan_models.party_person, name=value, role="author"
+                    )
+                    common_data["parties"].append(party.to_dict())
+                elif key == "Maintainer:":
+                    maintainer_split = value.split("<")
+                    if len(maintainer_split) > 1:
+                        parties = common_data.get("parties")
+                        if not parties:
+                            common_data["parties"] = []
+                        party = scan_models.Party(
+                            type=scan_models.party_person,
+                            name=maintainer_split[0].rstrip(),
+                            role="maintainer",
+                            email=maintainer_split[1]
+                            .replace(">", "")
+                            .replace(" at ", "@"),
+                        )
+                        common_data["parties"].append(party.to_dict())
+                elif "source" in key or "binaries" in key:
+                    if isinstance(value, list):
+                        for url in value:
+                            download_urls.append(get_download_url(url))
+                elif key == "Published:":
+                    common_data["release_date"] = parse_date(value)
+                elif key == "Imports:":
+                    # use the text instead of a href since the text is more accurate
+                    if len(col_values) == 3:
+                        value = col_values[2]
+                    common_data["dependencies"] = get_dependencies(value)
+    if extracted_license_statement:
+        common_data["extracted_license_statement"] = extracted_license_statement
+        common_data["license_detections"] = []
+
+    if download_urls:
+        for download_url in download_urls:
+            package = scan_models.Package.from_package_data(
+                package_data=common_data,
+                datafile_path=uri,
+            )
+            package.download_url = download_url
+            package.set_purl(purl)
+            yield package
+    else:
+        # Yield a package without download_url
+        package = scan_models.Package.from_package_data(
+            package_data=common_data,
+            datafile_path=uri,
+        )
+        package.set_purl(purl)
+        yield package
diff --git a/minecode/miners/debian.py b/minecode/miners/debian.py
new file mode 100644
index 00000000..13702168
--- /dev/null
+++ b/minecode/miners/debian.py
@@ -0,0 +1,655 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+# + +import gzip +import json +import logging +from collections import defaultdict + +import attr +import debian_inspector +from commoncode import fileutils +from debian_inspector import copyright as debcopy +from debian_inspector import debcon +from packagedcode import models as scan_models +from packageurl import PackageURL + +import minecode.collectors.debian as debian_collector +from minecode import debutils +from minecode import ls +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.utils import form_vcs_url + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + +""" +Collect Debian and Debian derivative packages (such as Ubuntu). +There are two approaches: +1. get the directory listings of all available packages (and files) +2. get and navigate through the tree of Debian control files +""" + + +DEBIAN_BASE_URL = "https://deb.debian.org/debian/pool/main/" +DEBIAN_METADATA_URL = "https://metadata.ftp-master.debian.org/changelogs/main/" + +UBUNTU_BASE_URL = "http://archive.ubuntu.com/ubuntu/pool/main/" +UBUNTU_METADATA_URL = "http://changelogs.ubuntu.com/changelogs/pool/main/" + +# Other URLs and sources to consider +# 'http://ftp.debian.org/debian/' +# rsync://archive.debian.org/debian-archive +# http://sources.debian.net/doc/api/ +# Packages.gz files: https://get.videolan.org/debian/i386/Packages.gz +# https://debian-handbook.info/browse/stable/sect.setup-apt-package-repository.html + + +class DebianSeed(seed.Seeder): + def get_seeds(self): + yield "http://ftp.debian.org/debian/ls-lR.gz" + yield "http://archive.ubuntu.com/ubuntu/ls-lR.gz" + + +def is_collectible(file_name): + """Return True if a `file_name` is collectible.""" + # 'Contents-*.gz' are mapping/indexes of installed files to the actual package that provides them. + # TODO: add tests! 
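+    # Illustrative examples of collectible file names (hypothetical):
+    #   Packages.gz, Release, Sources.gz, Contents-amd64.gz,
+    #   acl_2.2.52-2_amd64.deb, acl_2.2.52-2.dsc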
+
+    return file_name and (
+        file_name
+        in (
+            "Packages.gz",
+            "Release",
+            "Sources.gz",
+        )
+        or file_name.endswith(
+            (
+                ".deb",
+                ".dsc",
+            )
+        )
+        or (file_name.startswith("Contents-") and file_name.endswith(".gz"))
+    )
+
+
+def is_debian_url(uri):
+    return "debian.org" in uri
+
+
+def is_ubuntu_url(uri):
+    return "ubuntu" in uri
+
+
+@visit_router.route(
+    r"http://ftp.debian.org/.*/ls\-lR\.gz",
+    r"http://.*/ubuntu/ls\-lR\.gz",
+    # mirrors
+    r"http://ftp.[a-z][a-z].debian.org/.*/ls\-lR\.gz",
+)
+class DebianDirectoryIndexVisitor(NonPersistentHttpVisitor):
+    """Collect package URIs from Debian-like repos with an ls-lR directory listing."""
+
+    def get_uris(self, content):
+        with gzip.open(content, "rt") as f:
+            content = f.read()
+
+        url_template = self.uri.replace("ls-lR.gz", "{path}")
+
+        for entry in ls.parse_directory_listing(content):
+            if entry.type != ls.FILE:
+                continue
+
+            path = entry.path.lstrip("/")
+            file_name = fileutils.file_name(path)
+
+            if not is_collectible(file_name):
+                continue
+
+            if is_debian_url(self.uri):
+                namespace = "debian"
+            elif is_ubuntu_url(self.uri):
+                namespace = "ubuntu"
+            else:
+                logger.error(f"Unknown Debian URI namespace: {self.uri}")
+                continue
+
+            if file_name.endswith(
+                (".deb", ".udeb", ".tar.gz", ".tar.xz", ".tar.bz2", ".tar.lzma")
+            ):
+                name, version, arch = debian_inspector.package.get_nva(file_name)
+                package_url = PackageURL(
+                    type="deb",
+                    namespace=namespace,
+                    name=name,
+                    version=str(version),
+                    qualifiers=dict(arch=arch) if arch else None,
+                ).to_string()
+            else:
+                package_url = None
+
+            yield URI(
+                uri=url_template.format(path=path),
+                package_url=package_url,
+                file_name=file_name,
+                date=entry.date,
+                size=entry.size,
+                source_uri=self.uri,
+            )
+
+
+def parse_release(location):
+    """
+    Return the paragraph data mappings parsed from the Debian Release file at
+    `location`.
+
+    A Release file contains data like this:
+        Origin: Debian
+        Label: Debian
+        Suite: stable
+        Version: 8.3
+        Codename: jessie
+        Date: Sat, 23 Jan 2016 13:17:38 UTC
+        Architectures: amd64 arm64 armel armhf i386 mips mipsel powerpc ppc64el s390x
+        Components: main contrib non-free
+        Description: Debian 8.3 Released 23 January 2016
+        MD5Sum:
+         f08bebee4d8727f4320c0ed6984a01c9  1194884 contrib/Contents-amd64
+         c7f0b9213c9031cf89343a1bb8dbca3a    88565 contrib/Contents-amd64.gz
+         36d2e8055b0cc185c8c5b081b414f4ce  1021655 contrib/Contents-arm64
+         20bb294fefef1ab19e20ff0de7976ee2    72539 contrib/Contents-arm64.gz
+         d2e1f415e05f53742b7133dd10ccf3af  1035687 contrib/Contents-armel
+         5f24794a69552fbb10f303e33d35d380    73710 contrib/Contents-armel.gz
+         d70a5e2db762a9eb493607e16f8c423e  1028590 contrib/Contents-armhf
+
+    The MD5Sum key maps to a list rather than a single string value; each
+    element in the list is a dictionary keyed by:
+
+        md5sum
+        size
+        name
+    """
+    return debcon.get_paragraphs_data_from_file(location)
+
+
+def parse_copyright_only(location):
+    """Return a DebianCopyright from the Debian copyright file at `location`."""
+    return debcopy.DebianCopyright.from_file(location)
+
+
+def parse_copyright_allinfo(location):
+    """Return a DebianCopyright from the Debian copyright file at `location`."""
+    return debcopy.DebianCopyright.from_file(location)
+
+
+def parse_license(location):
+    """Return a list of License paragraphs from Debian copyright file at location."""
+    copyparas = debcopy.DebianCopyright.from_file(location)
+    return [
+        para
+        for para in copyparas.paragraphs
+        if isinstance(para, debian_inspector.copyright.CopyrightLicenseParagraph)
+    ]
+
+
+def collect_source_packages(location):
+    """
+    Yield one Paragraph object per package from a plain text 'Sources' file at
+    location.
+
+    Each paragraph is a mapping of source info like this:
+        'Package': 'album'
+        'Binary': 'album'
+        'Version': '4.12-3'
+        'Build-Depends': 'debhelper (>= 9)'
+        'Architecture': 'all'
+        'Format': '3.0 (quilt)'
+    """
+    return debcon.get_paragraphs_data_from_file(location)
+
+
+def parse_packages_index(location):
+    """
+    Yield one Paragraph object per package from a plain text 'Packages' file at
+    location.
+
+    A typical Debian Packages file looks like this:
+    http://ftp.debian.org/debian/dists/unstable/main/binary-mips/Packages.gz
+    """
+    return debcon.get_paragraphs_data_from_file(location)
+
+
+@visit_router.route("http://ftp.debian.org/debian/dists/.*/Sources.gz")
+class DebianSourcesVisitor(NonPersistentHttpVisitor):
+    """Collect package URIs from a Sources gz data file."""
+
+    def get_uris(self, content):
+        base_url = "http://ftp.debian.org/debian"
+        with gzip.open(content, "rb") as f:
+            text = f.read()
+        for source in debcon.get_paragraphs_data(text):
+            dir_info = source.get("Directory")
+            if not dir_info:
+                continue
+            package = source.get("Package")
+            version = source.get("Version")
+
+            package_url = None
+            if package and version:
+                package_url = PackageURL(
+                    type="deb", namespace="debian", name=package, version=version
+                ).to_string()
+
+            dir_info = dir_info.lstrip("/")
+            dir_url = base_url + f"/{dir_info}"
+            yield URI(uri=dir_url, package_url=package_url, source_uri=self.uri)
+
+
+# TODO add .xz support
+@visit_router.route("http://ftp.debian.org/debian/dists/.*Packages.gz")
+class DebianPackagesVisitor(NonPersistentHttpVisitor):
+    """Collect URIs to actual .deb Packages and the content itself from a Packages gz data file."""
+
+    def get_uris(self, content):
+        base_url = "http://ftp.debian.org/debian"
+        with gzip.open(content, "rb") as f:
+            text = f.read()
+
+        for paragraph in debcon.get_paragraphs_data(text):
+            file_info = paragraph.get("Filename")
+            if not file_info:
+                continue
+
+            # do not clobber the paragraph mapping when extracting fields
+            name = paragraph.get("Package")
+            version = paragraph.get("Version")
+
+            if name and version:
+                package_url = PackageURL(
+                    type="deb", namespace="debian", name=name, version=version
+                ).to_string()
+            else:
+                package_url = None
+
+            # FIXME: we do not keep the actual content... we should!
+            file_info = file_info.lstrip("/")
+            file_url = f"{base_url}/{file_info}"
+            yield URI(uri=file_url, package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route(r"http://ftp.debian.org/debian/pool/.*\.dsc")
+class DebianDescriptionVisitor(HttpVisitor):
+    """
+    Collect package data from a .dsc Package description file.
+    There is no URI to collect from a description file directly.
+    """
+
+    def dumps(self, content):
+        dsc = debcon.Debian822.from_string(content)
+        # FIXME: this does not make sense as this is a mapping-time thing
+        return json.dumps(dsc.to_dict())
+
+
+@visit_router.route("http://ftp.debian.org/debian/.*/Release")
+class DebianReleaseVisitor(HttpVisitor):
+    """Collect Release file content from a Release data file."""
+
+    pass
+
+
+@map_router.route(r"http://ftp.debian.org/debian/pool/.*\.dsc")
+class DebianDescriptionMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """Yield packages parsed from a dsc Debian control file mapping."""
+        return parse_description(
+            metadata=json.loads(resource_uri.data),
+            purl=resource_uri.package_url,
+            base_download_url=None,
+        )
+
+
+def get_files(text):
+    """
+    Yield tuples of (checksum, size, filename) collected from a files field
+    `text`.
+    """
+    if text:
+        for line in text.splitlines(False):
+            # we have three space-separated items, so we perform two partitions
+            line = " ".join(line.split())
+            checksum, _, rest = line.partition(" ")
+            size, _, filename = rest.partition(" ")
+            yield checksum, size, filename
+
+
+def parse_description(metadata, purl=None, base_download_url=None):
+    """
+    Yield ScannedPackage objects parsed from a description `metadata` mapping
+    for a single package version.
+    Yield as many Package as there are download URLs.
+    Optionally use the `purl` Package URL string if provided.
+    """
+    # FIXME: this may not be correct: Source and Binary are package names
+    common_data = dict(
+        name=metadata["Source"],
+        version=metadata["Version"],
+        homepage_url=metadata.get("Homepage"),
+        code_view_url=metadata.get("Vcs-Browser"),
+        parties=[],
+    )
+
+    if metadata.get("Label"):
+        common_data["keywords"] = [metadata.get("Label")]
+
+    vcs_tool, vcs_repo = debian_collector.get_vcs_repo(metadata)
+    if vcs_tool and vcs_repo:
+        vcs_repo = form_vcs_url(vcs_tool, vcs_repo)
+    common_data["vcs_url"] = vcs_repo
+
+    dependencies = debian_collector.get_dependencies(metadata)
+    if dependencies:
+        common_data["dependencies"] = dependencies
+
+    # TODO: add "original maintainer" seen in Ubuntu
+    maintainer = metadata.get("Maintainer")
+    if maintainer:
+        name, email = debutils.parse_email(maintainer)
+        if name:
+            party = scan_models.Party(name=name, role="maintainer", email=email)
+            common_data["parties"].append(party)
+
+    @attr.s()
+    class File:
+        name = attr.ib(default=None)
+        size = attr.ib(default=None)
+        md5 = attr.ib(default=None)
+        sha1 = attr.ib(default=None)
+        sha256 = attr.ib(default=None)
+
+    def collect_files(existing_files, field_value, checksum_name):
+        for checksum, size, name in get_files(field_value):
+            fl = existing_files[name]
+            if not fl.name:
+                fl.name = name
+                fl.size = size
+            setattr(fl, checksum_name, checksum)
+
+    # TODO: what do we do with files?
+    # FIXME: we should store them in the package record
+    files = defaultdict(File)
+    collect_files(
+        existing_files=files, field_value=metadata.get("Files"), checksum_name="md5"
+    )
+    collect_files(
+        existing_files=files,
+        field_value=metadata.get("Checksums-Sha1"),
+        checksum_name="sha1",
+    )
+    collect_files(
+        existing_files=files,
+        field_value=metadata.get("Checksums-Sha256"),
+        checksum_name="sha256",
+    )
+
+    # FIXME: craft a download_url from the base_download_url and the file names
+    download_url = None
+    common_data["download_url"] = download_url
+
+    package = scan_models.DebianPackage(**common_data)
+    package.set_purl(purl)
+    yield package
+
+
+@map_router.route("http://ftp.debian.org/debian/dists/.*Sources.gz")
+class DebianSourceFileMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield ScannedPackages built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        metadata = resource_uri.data
+        return parse_packages(metadata, resource_uri.package_url)
+
+
+def build_source_file_packages(metadata, purl=None):
+    """
+    Yield packages from the passed source file metadata.
+    metadata: json metadata content
+    purl: String value of the package url of the ResourceURI object
+    """
+    for source in debcon.get_paragraphs_data(metadata):
+        package_name = source.get("Package")
+
+        parties = []
+        maintainer_names = debutils.comma_separated(source.get("Maintainer", ""))
+        if maintainer_names:
+            for maintainer in maintainer_names:
+                name, email = debutils.parse_email(maintainer)
+                if name:
+                    party = scan_models.Party(name=name, role="maintainer", email=email)
+                    parties.append(party)
+        contributor_names = debutils.comma_separated(source.get("Uploaders", ""))
+        if contributor_names:
+            for contributor in contributor_names:
+                name, email = debutils.parse_email(contributor)
+                if name:
+                    party = scan_models.Party(
+                        name=name, role="contributor", email=email
+                    )
+                    parties.append(party)
+
+        dependencies = debian_collector.get_dependencies(source, ["Build-Depends"])
+
+        keywords = set()
+        keywords.update(debutils.comma_separated(source.get("Binary", "")))
+        if source.get("Section"):
+            keywords.add(source.get("Section"))
+
+        files = source.get("Files") or []
+        for f in files:
+            name = f.get("name")
+            package = dict(
+                name=package_name,
+                version=source.get("Version"),
+                dependencies=dependencies,
+                parties=parties,
+                code_view_url=source.get("Vcs-Browser"),
+                homepage_url=source.get("Homepage"),
+                keywords=list(keywords),
+            )
+
+            download_url = "http://ftp.debian.org/debian/{path}/{name}".format(
+                path=source.get("Directory"), name=name
+            )
+
+            package["download_url"] = download_url
+
+            vcs_tool, vcs_repo = debian_collector.get_vcs_repo(source)
+            if vcs_tool and vcs_repo:
+                vcs_repo = form_vcs_url(vcs_tool, vcs_repo)
+                package["vcs_url"] = vcs_repo
+
+            package["md5"] = f.get("md5sum")
+            # TODO: Why would we have more than a single SHA1 or SHA256
+            sha1s = source.get("Checksums-Sha1", [])
+            for sha1 in sha1s:
+                sha1value = sha1.get("sha1")
+                name = sha1.get("name")
+                if name and sha1value:
+                    package["sha1"] = sha1value
+            sha256s = source.get("Checksums-Sha256", [])
+            for sha256 in sha256s:
+                sha256value = sha256.get("sha256")
+                name = sha256.get("name")
+                if name and sha256value:
+                    package["sha256"] = sha256value
+            package = scan_models.DebianPackage(**package)
+            package.set_purl(purl)
+            yield package
+
+
+@map_router.route("http://ftp.debian.org/debian/dists/.*Packages.gz")
+class DebianPackageFileMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """Yield Packages from a Debian Packages index."""
+        metadata = resource_uri.data
+        return parse_packages(metadata, resource_uri.package_url)
+
+
+def get_programming_language(tags):
+    """Return the programming language extracted from list of `tags` strings."""
+    for tag in tags:
+        key, _, value = tag.partition("::")
+        if key == "implemented-in":
+            return value
+
+
+def parse_packages(metadata, purl=None):
+    """
+    Yield packages from Debian package text data.
+    metadata: Debian data (e.g. a Packages file)
+    purl: String value of the package url of the ResourceURI object
+    """
+    for pack in debcon.get_paragraphs_data(metadata):
+        data = dict(
+            name=pack["Package"],
+            version=pack["Version"],
+            homepage_url=pack.get("Homepage"),
+            code_view_url=pack.get("Vcs-Browser"),
+            description=pack.get("Description"),
+            bug_tracking_url=pack.get("Bugs"),
+            parties=[],
+            md5=pack.get("MD5sum"),
+            sha1=pack.get("SHA1"),
+            sha256=pack.get("SHA256"),
+        )
+
+        filename = pack.get("Filename")
+        if filename:
+            data["download_url"] = f"http://ftp.debian.org/debian/{filename}"
+
+        maintainers = pack.get("Maintainer")
+        if maintainers:
+            name, email = debutils.parse_email(maintainers)
+            if name:
+                party = scan_models.Party(name=name, role="maintainer", email=email)
+                data["parties"].append(party)
+
+        dependencies = debian_collector.get_dependencies(pack)
+        if dependencies:
+            data["dependencies"] = dependencies
+
+        keywords = debutils.comma_separated(pack.get("Tag", ""))
+
+        section = pack.get("Section")
+        if section:
+            keywords.append(section)
+        data["keywords"] = keywords
+
+        data["primary_language"] = get_programming_language(keywords)
+
+        package = scan_models.DebianPackage(**data)
+        if purl:
+            package.set_purl(purl)
+        yield package
+
+
+#################################################################################
+# FIXME: this cannot work since we do not fetch these yet AND what are the zip jar and gz in this???
+#################################################################################
+
+
+@map_router.route(
+    r"http://ftp.debian.org/debian/dists/.*\.zip",
+    r"http://ftp.debian.org/debian/dists/.*\.jar",
+    r"http://ftp.debian.org/debian/dists/.*\.gz",
+)
+class DebianArchiveFileMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        return build_packages_from_dist_archive(resource_uri.data, resource_uri.uri)
+
+
+def build_packages_from_dist_archive(metadata, uri):
+    """
+    Yield Package built from a Debian project URI and the associated ls
+    directory listing content, as produced by running an ls -lR command at the
+    Debian root folder.
+    Yield as many Package as there are download URLs.
+    """
+    debian_dist_length = len("http://ftp.debian.org/debian/dists")
+    # The parent folder URI relative to the uri file itself.
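+    # Note: the slicing below assumes URIs shaped like this hypothetical
+    # example: http://ftp.debian.org/debian/dists/<name>/<version>/<file>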
+    folder_uri = uri[debian_dist_length : uri.rindex("/")]
+    name_start = debian_dist_length + 1
+    # project name, by truncating the uri
+    name = uri[name_start : uri.index("/", name_start)]
+    version_start = name_start + len(name) + 1
+    # version, by analysing the uri
+    version = uri[version_start : uri.index("/", version_start)]
+    common_data = dict(
+        datasource_id="debian_archive_file",
+        name=name,
+        version=version,
+    )
+
+    # FIXME: this is NOT RIGHT
+    def get_resourceuri_by_uri(uri):
+        """Return the ResourceURI matching the passed uri string value."""
+        from minecode.models import ResourceURI
+
+        uris = ResourceURI.objects.filter(uri=uri)
+        if uris:
+            return uris[0]
+
+    url_template = "http://ftp.debian.org/debian/dists{name}"
+    download_urls = []
+    for entry in ls.parse_directory_listing(metadata):
+        if entry.type != ls.FILE:
+            continue
+        path = entry.path
+
+        if path.startswith(folder_uri):
+            path = path.lstrip("/")
+            url = url_template.format(name=path)
+            # FIXME: this is NOT RIGHT
+            if path.endswith(".md5") and url.replace(".md5", "") == uri:
+                if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).md5:
+                    common_data["md5"] = get_resourceuri_by_uri(url).md5
+            # FIXME: this is NOT RIGHT
+            if path.endswith(".sha") and url.replace(".sha", "") == uri:
+                if get_resourceuri_by_uri(url) and get_resourceuri_by_uri(url).sha1:
+                    common_data["sha1"] = get_resourceuri_by_uri(url).sha1
+
+            if path.endswith((".jar", "zip", "gz")) and url != uri:
+                download_urls.append(url)
+
+    if download_urls:
+        for download_url in download_urls:
+            package = scan_models.Package.from_package_data(
+                package_data=common_data,
+                datafile_path=uri,
+            )
+            package.download_url = download_url
+            yield package
+    else:
+        # yield package without a download_url value
+        package = scan_models.Package.from_package_data(
+            package_data=common_data,
+            datafile_path=uri,
+        )
+        # FIXME: this is NOT RIGHT: purl is not defined
+        package.set_purl(package.purl)
+        yield package
diff --git a/minecode/miners/dockerhub.py b/minecode/miners/dockerhub.py
new file mode 100644
index 00000000..3e9dd285
--- /dev/null
+++ b/minecode/miners/dockerhub.py
@@ -0,0 +1,205 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+import string
+
+from bs4 import BeautifulSoup
+from packagedcode import models as scan_models
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
+
+
+def get_search_conditions():
+    """
+    Return a list of two-character search conditions combining letter+letter,
+    letter+digit, and digit+digit pairs.
+    These conditions are passed to the Docker Hub query API: the API does not
+    support single-character queries, so we combine two characters per query.
+    For example: ['aa', 'ab', .....'a1', 'a2'.....'z9'...]
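+
+    A doctest sketch of the expected shape, derived from the construction
+    below:
+    >>> conds = get_search_conditions()
+    >>> len(conds)
+    1296
+    >>> conds[0], conds[-1]
+    ('aa', '99')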
+    """
+    char_list = []
+    for char in string.ascii_lowercase:
+        char_list.append(char)
+    int_list = []
+    for i in range(0, 10):
+        int_list.append(str(i))
+    char_list.extend(int_list)
+
+    conditions = []
+    for c in char_list:
+        for second_c in char_list:
+            conditions.append(c + second_c)
+    return conditions
+
+
+class DockerHubSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "https://hub.docker.com/explore/?page=1"
+        search_url_format = (
+            "https://index.docker.io/v1/search?q={condition}&n=100&page=1"
+        )
+        for condition in get_search_conditions():
+            # yield a combination of query conditions; the API requires at
+            # least two chars in a search condition.
+            yield search_url_format.format(condition=condition)
+
+
+@visit_router.route(r"https://hub.docker.com/explore/\?page=\d?")
+class DockHubExplorePageVisitor(HttpVisitor):
+    """
+    Visit a Docker Hub Explore HTML page and yield one URI per project found,
+    plus a URI for the next Explore page.
+    """
+
+    def get_uris(self, content):
+        dockhub_library_html_template = "https://hub.docker.com/{project}"
+        dockhub_library_restapi_template = (
+            "https://registry.hub.docker.com/v2/repositories/library/{project}"
+        )
+        dockhub_next_page_template = "https://hub.docker.com/explore/?page={page}"
+        page_legal = False
+        page = BeautifulSoup(content, "lxml")
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
+                continue
+            href = a["href"]
+            if href and href.startswith("/_/"):
+                page_legal = True
+                project_name = href[1:]
+                package_url = PackageURL(
+                    type="docker",
+                    name=project_name.replace("_/", "library/").rstrip("/"),
+                ).to_string()
+                yield URI(
+                    uri=dockhub_library_html_template.format(project=project_name),
+                    package_url=package_url,
+                    source_uri=self.uri,
+                )
+                yield URI(
+                    uri=dockhub_library_restapi_template.format(
+                        project=href.partition("/_/")[-1]
+                    ),
+                    package_url=package_url,
+                    source_uri=self.uri,
+                )
+        if page_legal:
+            current_page = int(self.uri.partition("=")[-1])
+            next_page = current_page + 1
+            yield URI(
+                uri=dockhub_next_page_template.format(page=next_page),
+                source_uri=self.uri,
+            )
+
+
+@visit_router.route(r"https://hub.docker.com/_/[\w\-\.]+/")
+class DockHubProjectHTMLVisitor(HttpVisitor):
+    def dumps(self, content):
+        """Return JSON data parsed from the HTML project page."""
+        metadata_dict = dict()
+        page = BeautifulSoup(content, "lxml")
+        for div in page.find_all(name="div"):
+            for span in div.find_all(name="span"):
+                if span.string == "Short Description":
+                    next_sibling = div.next_sibling
+                    if next_sibling:
+                        for sibling_span in next_sibling.find_all(name="span"):
+                            sibling_text = sibling_span.string
+                            metadata_dict["summary"] = sibling_text
+            for h1 in div.find_all(name="h1"):
+                if h1.string == "License":
+                    licenses_paras = []
+                    next_sibling = h1.next_sibling
+                    while next_sibling:
+                        if next_sibling.string:
+                            licenses_paras.append(next_sibling.string)
+                        next_sibling = next_sibling.next_sibling
+                    if licenses_paras:
+                        metadata_dict["license_text"] = "".join(licenses_paras)
+        return json.dumps(metadata_dict)
+
+
+@visit_router.route(
+    r"https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/"
+)
+class DockHubLibraryRESTJsonVisitor(HttpJsonVisitor):
+    """
+    Return URIs by parsing the JSON content from the Docker Hub library API.
+    Note that this class reuses the parent's behavior to return the JSON data.
+    """
+
+
+@visit_router.route(r"https://index.docker.io/v1/search\?q=\w\w&n=100&page=\d+")
+class DockHubGetAllProjectsFromSearchVisitor(HttpJsonVisitor):
+    def get_uris(self, content):
+        base_url = "https://hub.docker.com/v2/repositories/{name}"
+        num_page = content.get("num_pages")
+        current_page = content.get("page")
+        if num_page and current_page:
+            if int(current_page) < int(num_page):
+                next_page = int(current_page) + 1
+                yield URI(
+                    uri=(self.uri.rpartition("=")[0] + "=" + str(next_page)),
+                    source_uri=self.uri,
+                )
+        results = content.get("results") or []
+        for result in results:
+            name = result.get("name")
+            # TODO: This will be used when new Package definition is merged.
+            # star_count = result.get("star_count")
+            if name:
+                package_url = PackageURL(type="docker", name=name).to_string()
+                yield URI(
+                    uri=base_url.format(name=name),
+                    package_url=package_url,
+                    source_uri=self.uri,
+                )
+
+
+@map_router.route(r"https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/")
+class DockerHubLibraryJsonMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        metadata = resource_uri.data
+        return build_packages_from_jsonfile(
+            metadata, resource_uri.uri, resource_uri.package_url
+        )
+
+
+def build_packages_from_jsonfile(metadata, uri=None, purl=None):
+    """
+    Yield Package built from Docker Hub json content.
+    metadata: json metadata content
+    uri: String value of uri of the ResourceURI object.
+    purl: String value of the package url of the ResourceURI object
+    """
+    content = json.loads(metadata)
+    dockhub_library_htmlpage_template = "https://hub.docker.com/_/{project}"
+    name = content.get("name")
+    if name:
+        short_desc = content.get("description")
+        long_desc = content.get("full_description")
+        descriptions = [d for d in (short_desc, long_desc) if d and d.strip()]
+        description = "\n".join(descriptions)
+        common_data = dict(
+            type="docker",
+            name=name,
+            description=description,
+            homepage_url=dockhub_library_htmlpage_template.format(project=name),
+        )
+        package = scan_models.Package(**common_data)
+        package.set_purl(purl)
+        yield package
diff --git a/minecode/miners/eclipse.py b/minecode/miners/eclipse.py
new file mode 100644
index 00000000..4e30aceb
--- /dev/null
+++ b/minecode/miners/eclipse.py
@@ -0,0 +1,333 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+
+from bs4 import BeautifulSoup
+from commoncode import fileutils
+from packagedcode import models as scan_models
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
+
+
+class EclipseSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "http://projects.eclipse.org/json/projects/all"
+
+
+@visit_router.route("https://projects.eclipse.org/list-of-projects")
+class EclipseProjectVisitors(HttpVisitor):
+    """Visit the HTML page listing Eclipse projects and yield a URI for each project."""
+
+    def get_uris(self, content):
+        page = BeautifulSoup(content, "lxml")
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
+                continue
+            href = a["href"]
+            if href and href.startswith("https://projects.eclipse.org/projects/"):
+                # if the href starts with the Eclipse single-project prefix,
+                # generate a URI from the href content
+                project_name = href.replace(
+                    "https://projects.eclipse.org/projects/", ""
+                )
+                package_url = PackageURL(type="eclipse", name=project_name).to_string()
+                yield URI(uri=href, package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route("https://projects.eclipse.org/projects/.*")
+class EclipseSingleProjectVisitor(HttpVisitor):
+    """
+    Visit the HTML page of a single Eclipse project and collect the HTML page
+    itself as metadata. The project URIs are already collected by
+    EclipseProjectVisitors from https://projects.eclipse.org/list-of-projects,
+    so this visitor does not yield any new URIs; the goal is to return the
+    HTML page.
+
+    For example: https://projects.eclipse.org/projects/modeling.m2t.accele
+    """
+
+    pass
+
+
+@visit_router.route("http://git.eclipse.org/c")
+class EclipseGitVisitor(HttpVisitor):
+    """Visit the Eclipse Git HTML page and yield the URIs found in it."""
+
+    def get_uris(self, content):
+        page = BeautifulSoup(content, "lxml")
+        for td in page.find_all(name="td"):
+            if "class" not in td.attrs:
+                continue
+            if td.attrs.get("class") != ["sublevel-repo"]:
+                continue
+
+            for a in td.findChildren(name="a"):
+                href = a["href"]
+                name = a.contents[0]
+                package_url = PackageURL(type="eclipse", name=name).to_string()
+                yield URI(uri=href, package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route("http://www.eclipse.org/downloads/packages/all")
+class EclipsePackagesVisitor(HttpVisitor):
+    """Visit the Eclipse packages HTML page and return URIs parsed from HTML page."""
+
+    def fetch(self, uri, timeout=40):
+        """Fetch and return the content found at a remote uri with an extra timeout"""
+        return HttpVisitor.fetch(self, uri, timeout=timeout)
+
+    def get_uris(self, content):
+        page = BeautifulSoup(content, "lxml")
+        for span in page.find_all(name="span"):
+            if "class" not in span.attrs:
+                continue
+            if span.attrs.get("class") != ["field-content"]:
+                continue
+
+            a = span.find(name="a")
+            href = a["href"]
+            name = a.contents[0]
+            # Skip nodes whose content is an HTML tag rather than a string
+            if name and isinstance(name, str):
+                package_url = PackageURL(type="eclipse", name=name).to_string()
+                yield URI(uri=href, package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route("http://www.eclipse.org/downloads/packages/release/.*")
+class EclipseReleaseVisitor(HttpVisitor):
+    """Visit the Eclipse release HTML page and return expected Package URIs."""
+
+    def get_uris(self, content):
+        page = BeautifulSoup(content, "lxml")
+        suffix_list = [
+            "-win32.zip",
+            "-win64.exe",
+            "-win32-x86_64.zip",
+            "-linux-gtk-x86_64.tar.gz",
+            "-macosx-cocoa-x86_64.tar.gz",
+            "-linux-gtk.tar.gz",
+            "-x86_64.tar.gz",
+        ]
+        for div in page.find_all(name="div"):
+            for a in div.find_all(name="a"):
+                url = a.get("href")
+                if url and "download.php?file=" in url:
+                    file_name = fileutils.file_name(url)
+                    name = file_name
+                    for suffix in suffix_list:
+                        name = name.replace(suffix, "")
+                    package_url = PackageURL(type="eclipse", name=name).to_string()
+                    yield URI(
+                        uri=url,
+                        file_name=file_name,
+                        package_url=package_url,
+                        source_uri=self.uri,
+                    )
+
+
+@visit_router.route("http://projects.eclipse.org/json/projects/all")
+class EclipseProjectsJsonVisitor(HttpJsonVisitor):
+    """Visit the Eclipse json API and return expected project specified URIs."""
+
+    def fetch(self, uri, timeout=40):
+        """Fetch and return the content found at a remote uri with an extra timeout"""
+        return HttpJsonVisitor.fetch(self, uri, timeout=timeout)
+
+    def get_uris(self, content):
+        url_template = "http://projects.eclipse.org/json/project/{name}"
+        projects = content.get("projects", {})
+        for project in projects:
+            # TODO: are we sure there is not more data available in this JSON?
+            package_url = PackageURL(type="eclipse", name=project).to_string()
+            yield URI(
+                uri=url_template.format(name=project),
+                package_url=package_url,
+                source_uri=self.uri,
+            )
+
+
+@visit_router.route("http://projects.eclipse.org/json/project/.*")
+class EclipseSingleProjectJsonVisitor(HttpJsonVisitor):
+    """
+    Visit json of a single Eclipse project. This is to return the json
+    itself without any URIs, as the URI itself is returned by
+    EclipseProjectsJsonVisitor.
+    """
+
+    pass
+
+
+# FIXME: we should create packages from releases!!!! not from projects
+
+
+@map_router.route("http://projects.eclipse.org/json/project/.*")
+class EclipseJsonPackageMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        # FIXME: JSON deserialization should be handled eventually by the framework
+        metadata = json.loads(resource_uri.data)
+        return build_packages_with_json(metadata, resource_uri.package_url, uri)
+
+
+def build_packages_with_json(metadata, purl=None, uri=None):
+    """
+    Yield Package built from an Eclipse `metadata` mapping.
+    A package can contain multiple projects, and each project can contain
+    metadata including title, description, homepage, bug tracking URL, etc.
+    metadata: json metadata content
+    purl: String value of the package url of the ResourceURI object
+    """
+    projects = metadata["projects"]
+    for project, project_metadata in projects.items():
+        common_data = dict(
+            datasource_id="eclipse_metadata",
+            type="eclipse",
+            name=project,
+        )
+
+        descriptions = project_metadata.get("description")
+        if descriptions and len(descriptions) > 0:
+            common_data["description"] = descriptions[0].get("value")
+        else:
+            common_data["description"] = project_metadata["title"]
+
+        homepage_urls = project_metadata.get("website_url")
+        if homepage_urls and len(homepage_urls) > 0:
+            common_data["homepage_url"] = homepage_urls[0].get("url")
+
+        bug_tracking_urls = project_metadata.get("bugzilla")
+        if bug_tracking_urls and len(bug_tracking_urls) > 0:
+            common_data["bug_tracking_url"] = bug_tracking_urls[0].get("query_url")
+
+        if project_metadata.get("licenses"):
+            common_data["extracted_license_statement"] = [
+                lic.get("name") for lic in project_metadata.get("licenses", [])
+            ]
+            common_data["license_detections"] = []
+
+        # FIXME: this is a download page and NOT a download URL!!!!!
+        for download_url in project_metadata.get("download_url", []):
+            durl = download_url.get("url")
+            if durl:
+                common_data["download_url"] = durl
+                package = scan_models.Package.from_package_data(
+                    package_data=common_data,
+                    datafile_path=uri,
+                )
+                package.set_purl(purl)
+                yield package
+
+
+@map_router.route("https://projects.eclipse.org/projects/.*")
+class EclipseHTMLProjectMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        # FIXME: JSON deserialization should be handled eventually by the framework
+        return build_packages(resource_uri.data, resource_uri.package_url, uri)
+
+
+def build_packages(html_text, purl=None, uri=None):
+    """
+    Yield Package objects built from `html_text` and the `purl` package URL
+    string.
+ """ + page = BeautifulSoup(html_text, "lxml") + common_data = dict( + datasource_id="eclipse_html", + type="eclipse", + ) + + extracted_license_statement = [] + for meta in page.find_all(name="meta"): + if "name" in meta.attrs and "dcterms.title" in meta.attrs.get("name"): + common_data["name"] = meta.attrs.get("content") + if "name" in meta.attrs and "dcterms.description" in meta.attrs.get("name"): + common_data["description"] = meta.attrs.get("content") + + for div in page.find_all(name="div"): + if "class" not in div.attrs: + continue + if "field-name-field-project-licenses" in div.attrs.get("class"): + # Visit div element whose class atttribute is field-name-field-project-licenses + for a in div.find_all(name="a"): + if "href" not in a.attrs: + continue + license_name = str(a.contents[0]) + extracted_license_statement.append(license_name) + if extracted_license_statement: + common_data["extracted_license_statement"] = extracted_license_statement + common_data["license_detections"] = [] + + for a in page.find_all(name="a"): + if a.contents: + if str(a.contents[0]).strip() == "Website": + common_data["homepage_url"] = a["href"] + + for a in page.find_all(name="a"): + if not a.contents: + continue + if str(a.contents[0]).strip() == "Downloads": + download_data = dict( + download_url=a["href"], + ) + download_data.update(common_data) + package = scan_models.Package.from_package_data( + package_data=download_data, + datafile_path=uri, + ) + package.set_purl(purl) + yield package + + for div in page.find_all(name="div"): + if "class" not in div.attrs: + continue + if "field-name-field-latest-releases" not in div.attrs.get("class"): + continue + # Visit div element whose class attribute is ield-name-field-latest-releases + tbody = div.find(name="tbody") + if not tbody: + continue + + for tr in tbody.find_all(name="tr"): + for td in tr.find_all(name="td"): + a = td.find(name="a") + if not a: + continue + + if "href" not in a.attrs or "class" in a.attrs: + continue + + version = a.contents[0] + href = a["href"] + download_data = dict( + version=version, + download_url=href, + ) + download_data.update(common_data) + package = scan_models.Package.from_package_data( + package_data=download_data, + datafile_path=uri, + ) + package.set_purl(purl) + yield package diff --git a/minecode/mappers/fdroid.py b/minecode/miners/fdroid.py similarity index 56% rename from minecode/mappers/fdroid.py rename to minecode/miners/fdroid.py index e50e59ca..127f34f2 100644 --- a/minecode/mappers/fdroid.py +++ b/minecode/miners/fdroid.py @@ -11,12 +11,16 @@ import logging from packagedcode.models import PackageData +from packagedcode.models import Party +from packagedcode.models import party_person +from packageurl import PackageURL from minecode import map_router -from minecode.mappers import Mapper -from packageurl import PackageURL -from packagedcode.models import party_person -from packagedcode.models import Party +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor TRACE = False @@ -24,13 +28,82 @@ if TRACE: import sys + logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.DEBUG) -@map_router.route('pkg:fdroid/.+') -class FdroidPackageMapper(Mapper): +""" +Visitors for F-Droid package repositories. + +NOTE: the license of F-Droid package data needs to be clarified. 
+See https://gitlab.com/fdroid/fdroiddata/-/issues/2826 for details + +F-Droid packages come with a main JSON index and possible increment/diffs. +- https://f-droid.org/repo/index-v2.json + +- this is a legacy XMl index https://f-droid.org/repo/index.xml + +- This top level file lists index and diffs https://f-droid.org/repo/entry.json + +- This is a diff example: https://f-droid.org/repo/diff/1666980277000.json + +- Each apk is available from a URL using this form: + https://f-droid.org/repo/app.seeneva.reader_3.apk + https://f-droid.org/repo/{application_id}_{version_code}.apk + +The {application_id}_{version_code}.apk "file name" for each tarball and +apk file name is listed in the index. +""" + + +class FdroidSeed(seed.Seeder): + def get_seeds(self): + yield "https://f-droid.org/repo/index-v2.json" + + +def build_purl(package_id, version_code, filename): + """Return a PackageURL for an F-Droid package.""" + return PackageURL( + type="fdroid", + name=package_id, + version=version_code, + qualifiers=dict(filename=filename), + ) + + +@visit_router.route("https://f-droid.org/repo/index-v2.json") +class FdroidIndexVisitor(NonPersistentHttpVisitor): + """ + Collect package metadata URIs from the F-Droid index for each package. + We treat each apk and corresponding source tarball as a different package. + """ + + def get_uris(self, content): + """Yield a URI for each F-Droid package.""" + json_location = content + with open(json_location) as c: + content = json.loads(c.read()) + + packages = content["packages"] + + for package_id, package_data in packages.items(): + purl = PackageURL(type="fdroid", name=package_id).to_string() + yield URI( + uri=purl, + package_url=purl, + source_uri=self.uri, + data=json.dumps( + package_data, separators=(",", ":"), ensure_ascii=False + ), + # note: visited is True since there nothing more to visit + visited=True, + ) + + +@map_router.route("pkg:fdroid/.+") +class FdroidPackageMapper(Mapper): def get_packages(self, uri, resource_uri): """ Yield Package(s) built from the index data for all versions of an F-Droid @@ -50,20 +123,20 @@ def build_packages(purl, data): # we map categories to keyword # "categories": ["Time"], - keywords = metadata.get('categories', []) + keywords = metadata.get("categories", []) # "issueTracker": "https://github.com/jdmonin/anstop/issues", - bug_tracking_url = metadata.get('issueTracker') + bug_tracking_url = metadata.get("issueTracker") # "license": "GPL-2.0-only", # this is supposed to be an SPDX expression - extracted_license_statement = metadata.get('license') + extracted_license_statement = metadata.get("license") # "sourceCode": "https://github.com/jdmonin/anstop", - vcs_url = metadata.get('sourceCode') + vcs_url = metadata.get("sourceCode") # "webSite": "https://sourceforge.net/projects/androidspeedo", - homepage_url = metadata.get('webSite') + homepage_url = metadata.get("webSite") description = build_description(metadata, language="en-US") @@ -71,16 +144,18 @@ def build_packages(purl, data): # "authorEmail": "jigsaw-code@google.com", # "authorName": "Jigsaw", # "authorWebSite": "https://jigsaw.google.com/", - author_name = metadata.get('authorName') - author_email = metadata.get('authorEmail') - author_url = metadata.get('authorWebSite') + author_name = metadata.get("authorName") + author_email = metadata.get("authorEmail") + author_url = metadata.get("authorWebSite") if any([author_name, author_email, author_url]): - parties.append(Party( - type=party_person, - name=author_name, - role="author", - email=author_email, - 
url=author_url) + parties.append( + Party( + type=party_person, + name=author_name, + role="author", + email=author_email, + url=author_url, + ) ) # TODO: add these @@ -99,7 +174,7 @@ def build_packages(purl, data): extracted_license_statement=extracted_license_statement, vcs_url=vcs_url, homepage_url=homepage_url, - repository_homepage_url=f'https://f-droid.org/en/packages/{base_purl.name}', + repository_homepage_url=f"https://f-droid.org/en/packages/{base_purl.name}", description=description, parties=parties, ) @@ -109,22 +184,21 @@ def build_packages(purl, data): # "added": 1344556800000, # "file": { # "name": "/An.stop_10.apk", .... - versions = data['versions'] + versions = data["versions"] for _sha256_of_apk, version_data in versions.items(): # TODO: collect versionName - version_code = str(version_data['manifest']['versionCode']) - logger.debug( - f'build_packages: base_purl: {base_purl} version: {version_code}') - logger.debug(f'build_packages: data: {version_data}') + version_code = str(version_data["manifest"]["versionCode"]) + logger.debug(f"build_packages: base_purl: {base_purl} version: {version_code}") + logger.debug(f"build_packages: data: {version_data}") # TODO: add release_date from "added": 1655164800000, # these must exists since F-Droid builds from sources - src = version_data['src'] - src_filename = src['name'] - src_sha256 = src['sha256'] - src_size = src['size'] + src = version_data["src"] + src_filename = src["name"] + src_sha256 = src["sha256"] + src_size = src["size"] download_url = f'https://f-droid.org/repo/{src_filename.strip("/")}' package_mapping = dict( @@ -142,17 +216,18 @@ def build_packages(purl, data): type=src.type, name=src.name, version=src.version, - qualifiers=dict(download_url=download_url) + qualifiers=dict(download_url=download_url), ) # these must exists or there is no F-Droid package - file = version_data['file'] - filename = file['name'] - sha256 = file['sha256'] - size = file['size'] - download_url = f'https://f-droid.org/repo/{filename}' + file = version_data["file"] + filename = file["name"] + filename = filename.lstrip("/") + sha256 = file["sha256"] + size = file["size"] + download_url = f"https://f-droid.org/repo/{filename}" - package_mappping = dict( + package_mapping = dict( version=version_code, download_url=download_url, repository_download_url=download_url, @@ -164,7 +239,7 @@ def build_packages(purl, data): yield PackageData.from_data(package_mapping) -def build_description(metadata, language='en-US'): +def build_description(metadata, language="en-US"): r""" Return a description in ``language`` built from a package name, summary and description, one per line. 
@@ -196,20 +271,20 @@ def build_description(metadata, language='en-US'): >>> build_description(metadata) 'Anstop' """ - names = metadata.get('name') or {} + names = metadata.get("name") or {} name = names.get(language) - summaries = metadata.get('summary') or {} + summaries = metadata.get("summary") or {} summary = summaries.get(language) if name and summary and summary.startswith(name): name = None - descriptions = metadata.get('description') or {} + descriptions = metadata.get("description") or {} description = descriptions.get(language) if summary and description and description.startswith(summary): summary = None non_empty_parts = [p for p in [name, summary, description] if p] - return '\n'.join(non_empty_parts) + return "\n".join(non_empty_parts) diff --git a/minecode/visitors/fedora.py b/minecode/miners/fedora.py similarity index 99% rename from minecode/visitors/fedora.py rename to minecode/miners/fedora.py index ae93637b..9c20c179 100644 --- a/minecode/visitors/fedora.py +++ b/minecode/miners/fedora.py @@ -1,4 +1,3 @@ - # use this to find all /repodata directories: # https://archive.fedoraproject.org/pub/DIRECTORY_SIZES.txt diff --git a/minecode/miners/freebsd.py b/minecode/miners/freebsd.py new file mode 100644 index 00000000..c2c0fbde --- /dev/null +++ b/minecode/miners/freebsd.py @@ -0,0 +1,119 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import logging +import os +from io import StringIO + +import saneyaml +from bs4 import BeautifulSoup +from packagedcode.freebsd import CompactManifestHandler + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.utils import extract_file +from minecode.utils import get_temp_dir + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +class FreeBSDSeed(seed.Seeder): + def get_seeds(self): + yield "https://pkg.freebsd.org" + + +@visit_router.route("https://pkg.freebsd.org") +class FreeBSDBaseHTMLVisitors(HttpVisitor): + """Visit the freeBSD home link and yield uri for each FreeBSD repo""" + + def get_uris(self, content): + page = BeautifulSoup(content, "lxml") + base_url = "https://pkg.freebsd.org/{path}/" + for a in page.find_all(name="a"): + if "href" not in a.attrs: + continue + href = a["href"] + # the sub link useful is like: FreeBSD:13:aarch64 + if href and href.startswith("FreeBSD%3A"): + url = base_url.format(path=href) + yield URI(uri=url, source_uri=self.uri) + + +@visit_router.route("https://pkg.freebsd.org/.*/") +class FreeBSDSubHTMLVisitors(HttpVisitor): + """Visit the sub repo URL and yield all uris in the page and in its children page""" + + def get_uris(self, content): + page = BeautifulSoup(content, "lxml") + base_url = self.uri + "{path}" + for a in page.find_all(name="a"): + if "href" not in a.attrs or "title" not in a.attrs: + # parent link doesn't have title. 
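+                # (directory listings include a parent-directory link that has
+                # no title attribute; requiring a title filters it out)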
+                continue
+            href = a["href"]
+            url = base_url.format(path=href)
+            yield URI(uri=url, source_uri=self.uri)
+
+
+@visit_router.route("https://pkg.freebsd.org/.*packagesite.txz")
+class FreeBSDIndexVisitors(NonPersistentHttpVisitor):
+    """Extract a packagesite.txz index file and return the data of its packagesite.yaml file."""
+
+    def dumps(self, content):
+        """Extract the packagesite.yaml file, read it and return its content."""
+        extracted_location = extract_file(content)
+        manifest_file = os.path.join(extracted_location, "packagesite.yaml")
+        if os.path.exists(manifest_file):
+            with open(manifest_file) as file_handler:
+                return file_handler.read()
+        else:
+            logger.warning(
+                "The packagesite.yaml file does not exist in the index file: "
+                + content
+            )
+
+
+@map_router.route("https://pkg.freebsd.org/.*packagesite.txz")
+class FreeBSDIndexMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        return build_packages(resource_uri.data, resource_uri.package_url)
+
+
+def build_packages(metadata, purl=None):
+    """
+    Yield packages parsed from the passed json content.
+    metadata: json metadata content
+    purl: String value of the package url of the ResourceURI object
+    """
+    buf = StringIO(metadata)
+    # The passed metadata is not one well-formed yaml or json document; each
+    # line is its own yaml document, so read line by line and parse each line
+    # with the FreeBSD CompactManifestHandler parser.
+    for each_line in buf:
+        if each_line and each_line.strip() in ("", "{", "}"):
+            continue
+        content = saneyaml.load(each_line)
+        if content and content.get("name"):
+            temp_dir = get_temp_dir("freebsd_index")
+            location = os.path.join(temp_dir, "+COMPACT_MANIFEST")
+            with open(location, "w") as manifest:
+                manifest.write(each_line)
+            with open(location, encoding="utf-8") as loc:
+                yaml_data = saneyaml.load(loc)
+            package = CompactManifestHandler._parse(yaml_data=yaml_data)
+            package.set_purl(purl)
+            yield package
diff --git a/minecode/miners/freedesktop.py b/minecode/miners/freedesktop.py
new file mode 100644
index 00000000..5ca9802a
--- /dev/null
+++ b/minecode/miners/freedesktop.py
@@ -0,0 +1,102 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+# + +from bs4 import BeautifulSoup +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.utils import form_vcs_url + + +class FreedesktopSeed(seed.Seeder): + def get_seeds(self): + yield "https://www.freedesktop.org/wiki/Software" + + +@visit_router.route("https://www.freedesktop.org/wiki/Software") +class FreedesktopHTMLVisitor(HttpVisitor): + """Visit the Freedesktop Software HTML page and return URIs parsed from HTML page.""" + + def get_uris(self, content): + url_template = "https://www.freedesktop.org/wiki/Software/{name}" + page = BeautifulSoup(content, "lxml") + for div in page.find_all(name="div"): + for a in div.find_all(name="a"): + if "href" not in a.attrs: + continue + href = a["href"] + if href and href.startswith("./"): + project_name = href.replace("./", "").strip("/") + package_url = PackageURL( + type="freedesktop", name=project_name + ).to_string() + yield URI( + uri=url_template.format(name=project_name), + package_url=package_url, + source_uri=self.uri, + ) + + +@visit_router.route("https://www.freedesktop.org/wiki/Software/.*") +class FreedesktopProjectHTMLVisitor(HttpVisitor): + """Visit the Freedesktop Project HTML page.""" + + pass + + +@map_router.route("https://www.freedesktop.org/wiki/Software/.*") +class FreedesktopHTMLProjectMapper(Mapper): + def get_packages(self, uri, resource_uri): + """ + Yield Package built from resource_uri record for a single + package version. + Yield as many Package as there are download URLs. + """ + return build_packages(resource_uri.data, uri, resource_uri.package_url) + + +def build_packages(html_text, uri, purl): + """ + Yield Package objects built from `html_text` from the `uri` and the `purl` + package URL string. + """ + purl = PackageURL.from_string(purl) + package_data = dict( + type="freedesktop", name=purl.name, version=purl.version, homepage_url=uri + ) + + page = BeautifulSoup(html_text, "lxml") + if page.h1: + package_data["description"] = page.h1.string.strip() + + for a in page.find_all(name="a"): + link = a["href"] + if "freedesktop.org" not in link: + continue + + if "/releases/" in link or "/dist/" in link: + package_data["download_url"] = link + + if "https://bugs.freedesktop.org/buglist.cgi" in link: + package_data["bug_tracking_url"] = link + + if "http://cgit.freedesktop.org/" in link and "tree/" in link: + package_data["code_view_url"] = link + + for li in page.find_all(name="li"): + if li.text and li.text.startswith("git://"): + package_data["vcs_url"] = form_vcs_url("git", li.text) + + yield scan_models.Package(**package_data) diff --git a/minecode/miners/github.py b/minecode/miners/github.py new file mode 100644 index 00000000..2345e8aa --- /dev/null +++ b/minecode/miners/github.py @@ -0,0 +1,318 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import json +import logging +from datetime import date +from datetime import datetime + +import attr +import packagedcode.models as scan_models +from github.MainClass import Github +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper +from minecode.utils import form_vcs_url +from minecode.utils import parse_date + +logger = logging.getLogger(__name__) + +TRACE = False +if TRACE: + handler = logging.StreamHandler() + logger.addHandler(handler) + logger.setLevel(logging.INFO) + + +class GithubSeed(seed.Seeder): + def get_seeds(self): + yield "https://api.github.com/repositories?since=0" + + +@visit_router.route(r"https://api.github.com/repositories\?since=\d+") +class GithubReposVisitor(HttpJsonVisitor): + """ + Visitor to run repositories request to get all repositories by increasing since symbol 100 each loop time. + Refer to: https://developer.github.com/v3/repos/#list-all-public-repositories + https://api.github.com/repositories + """ + + def get_uris(self, content): + repo_request_base = "https://api.github.com/repositories?since=" + has_content = False + if content: + for entry in content: + has_content = True + url = entry.get("url") + # Take full_name instead of name here since we want to keep more info, especially when forming the package url + # "name": "grit", + # "full_name": "mojombo/grit", + name = entry.get("full_name") + if url: + package_url = None + if name: + package_url = PackageURL(type="github", name=name).to_string() + # Yield URI for GithubSingleRepoVisitor use + yield URI(uri=url, package_url=package_url, source_uri=self.uri) + if not has_content: + logger.info( + f"The content of the response is empty, the processing might be finished for URI: {self.uri}" + ) + else: + uri = self.uri + current_id = uri.replace("https://api.github.com/repositories?since=", "") + current_id = int(current_id) + # 100 is fixed since each page has 100 entries. Plus 100 means to go from next page. + new_id = current_id + 100 + new_url = repo_request_base + str(new_id) + yield URI(uri=new_url, source_uri=self.uri) + + +@visit_router.route(r"https://api.github.com/repos/[\w\-\.]+/[\w\-\.]+") +class GithubSingleRepoVisitor(HttpJsonVisitor): + """ + Visitor to get the json and add more content with GitHub API from one repo. + For example: https://api.github.com/repos/mojombo/grit + """ + + def fetch(self, uri, timeout=None): + """ + Having its own fetch function instead of inheriting from HttpJsonVisitor class is because: + The json itself has lots of URL info, the Github API can get content without acccessing the URLs inside the json explicitly. + The main idea is to fetch download_url... 
+        """
+        full_name = uri.replace("https://api.github.com/repos/", "")
+        g = Github()
+        repo = g.get_repo(full_name)
+
+        common_data = dict(
+            name=repo.name,
+            description=repo.description,
+            blobs_url=repo.blobs_url,
+            language=repo.language,
+            size=repo.size,
+            homepage=repo.homepage,
+            html_url=repo.html_url,
+            etag=repo.etag,
+            full_name=repo.full_name,
+            repo_id=repo.id,
+            ssh_url=repo.ssh_url,
+            source_url=repo.svn_url,
+            clone_url=repo.clone_url,
+            watchers_count=repo.watchers,
+            master_branch=repo.master_branch,
+            updated_at=json_serial_date_obj(repo.updated_at),
+            pushed_at=json_serial_date_obj(repo.pushed_at),
+        )
+
+        if repo.owner:
+            common_data["owner"] = repo.owner.name
+        if repo._issues_url:
+            common_data["issue_url"] = repo._issues_url.value
+
+        if repo._git_url:
+            common_data["git_url"] = repo._git_url.value
+
+        if repo.organization:
+            # store the organization name with the collected data
+            common_data["organization"] = repo.organization.name
+
+        downloads = []
+        if repo.get_downloads():
+            for download in list(repo.get_downloads()):
+                downloads.append(
+                    dict(
+                        name=download.name,
+                        url=download.url,
+                        size=download.size,
+                        s3_url=download.s3_url,
+                        created_at=json_serial_date_obj(download.created_at),
+                        download_count=download.download_count,
+                        description=download.description,
+                        redirect=download.redirect,
+                        signature=download.signature,
+                        html_url=download.html_url,
+                        bucket=download.bucket,
+                        acl=download.acl,
+                        accesskeyid=download.accesskeyid,
+                        expirationdate=json_serial_date_obj(download.expirationdate),
+                    )
+                )
+        common_data["downloads"] = downloads
+
+        tags = []
+        if repo.get_tags():
+            for tag in list(repo.get_tags()):
+                tag_info = dict(
+                    name=tag.name,
+                    tarball_url=tag.tarball_url,
+                    zipball_url=tag.zipball_url,
+                )
+                if tag.commit:
+                    tag_info["sha1"] = tag.commit.sha
+                tags.append(tag_info)
+        common_data["tags"] = tags
+
+        if not common_data.get("tags") and not common_data.get("downloads"):
+            # If there are no downloads and no tags, craft a download_url by
+            # appending archive/<branch>.zip to the HTML URL.
+            # For example, with the base html url
+            # https://github.com/collectiveidea/calendar_builder the final
+            # download_url is
+            # https://github.com/collectiveidea/calendar_builder/archive/master.zip
+            branches_download_urls = []
+            download_url_bases = "{html_url}/archive/{branch_name}.zip"
+            if repo.get_branches():
+                for branch in list(repo.get_branches()):
+                    branches_download_urls.append(
+                        download_url_bases.format(
+                            html_url=common_data.get("html_url"),
+                            branch_name=branch.name,
+                        )
+                    )
+            common_data["branches_download_urls"] = branches_download_urls
+
+        common_data["labels"] = []
+        if repo.get_labels():
+            for label in repo.get_labels():
+                common_data["labels"].append(label.name)
+
+        return json.dumps(common_data)
+
+
+def json_serial_date_obj(obj):
+    """JSON serializer for date objects."""
+    if obj and isinstance(obj, datetime | date):
+        return obj.isoformat()
+
+
+@map_router.route(r"https://api\.github\.com/repos/([^/]+)/([^/]+)")
+class GithubMetaFileMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        visited_data = resource_uri.data
+        if not visited_data:
+            return
+        return build_github_packages(
+            visited_data, resource_uri.uri, resource_uri.package_url
+        )
+
+
+def build_github_packages(visited_data, uri, purl=None):
+    """
+    Yield Package built from Github API visited_data as a JSON string.
+
+
+@map_router.route(r"https://api\.github\.com/repos/([^/]+)/([^/]+)")
+class GithubMetaFileMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Package as there are download URLs.
+        """
+        visited_data = resource_uri.data
+        if not visited_data:
+            return
+        return build_github_packages(
+            visited_data, resource_uri.uri, resource_uri.package_url
+        )
+
+
+def build_github_packages(visited_data, uri, purl=None):
+    """
+    Yield Package built from Github API visited_data as a JSON string.
+    visited_data: JSON string of the repo data collected from the Github API
+    uri: String value of the uri from ResourceURI object
+    purl: String value of the package url of the ResourceURI object
+    """
+    visited_data = json.loads(visited_data)
+
+    full_name = visited_data["full_name"]
+    namespace, name = split_org_repo(full_name)
+    # FIXME: when could this ever happen??
+    assert name == visited_data["name"], (
+        "build_github_packages: Inconsistent name and org for URI: " + uri
+    )
+
+    description = visited_data["description"]
+
+    vcs_url = visited_data.get("git_url")
+    if vcs_url:
+        vcs_url = form_vcs_url("git", vcs_url)
+    package = scan_models.Package(
+        type="github",
+        namespace=namespace,
+        name=name,
+        description=description,
+        primary_language=visited_data.get("language"),
+        homepage_url=visited_data.get("html_url"),
+        vcs_url=vcs_url,
+        # this size does not make sense
+        size=visited_data.get("size"),
+    )
+
+    if visited_data.get("owner"):
+        package.parties = [
+            scan_models.Party(
+                # FIXME: we can add the org or user URL and we can know if this
+                # is an org or a person too.
+                type=scan_models.party_person,
+                name=visited_data.get("owner"),
+                role="owner",
+            )
+        ]
+
+    package.set_purl(purl)
+
+    downloads = visited_data.get("downloads") or []
+    for download in downloads:
+        html_url = download.get("html_url")
+        if html_url:
+            # make a copy
+            package = attr.evolve(package)
+            package.download_url = html_url
+            package.size = download.get("size")
+            package.release_date = parse_date(download.get("created_at"))
+            yield package
+
+    tags = visited_data.get("tags") or []
+    for tag in tags:
+        package = attr.evolve(package)
+        package.version = tag.get("name")
+        package_url = PackageURL(
+            type="github",
+            name=package.name,
+            namespace=namespace,
+            version=tag.get("name"),
+        ).to_string()
+        package.sha1 = tag.get("sha1")
+        if tag.get("tarball_url"):
+            package.download_url = tag.get("tarball_url")
+            package.set_purl(package_url)
+            yield package
+        if tag.get("zipball_url"):
+            # make a copy so the tarball package yielded above is not mutated
+            package = attr.evolve(package)
+            package.download_url = tag.get("zipball_url")
+            package.set_purl(package_url)
+            yield package
+
+    branches_download_urls = visited_data.get("branches_download_urls") or []
+    for branches_download_url in branches_download_urls:
+        package = attr.evolve(package)
+        package.download_url = branches_download_url
+        yield package
+
+
+def split_org_repo(url_like):
+    """
+    Given a URL-like string to a GitHub repo or a repo name as in org/name,
+    split and return the org and name.
+
+    For example:
+    >>> split_org_repo('foo/bar')
+    ('foo', 'bar')
+    >>> split_org_repo('https://api.github.com/repos/foo/bar/')
+    ('foo', 'bar')
+    >>> split_org_repo('github.com/foo/bar/')
+    ('foo', 'bar')
+    >>> split_org_repo('git://github.com/foo/bar.git')
+    ('foo', 'bar')
+    """
+    segments = [s.strip() for s in url_like.split("/") if s.strip()]
+    if len(segments) < 2:
+        raise ValueError(f"Not a GitHub-like URL: {url_like}")
+    org = segments[-2]
+    name = segments[-1]
+    if name.endswith(".git"):
+        name, _, _ = name.rpartition(".git")
+    return org, name
diff --git a/minecode/miners/gitlab.py b/minecode/miners/gitlab.py
new file mode 100644
index 00000000..637ce681
--- /dev/null
+++ b/minecode/miners/gitlab.py
@@ -0,0 +1,140 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+
+import packagedcode.models as scan_models
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
+from minecode.utils import form_vcs_url
+from minecode.utils import get_http_response
+from minecode.utils import parse_date
+
+
+class GitlabSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "https://gitlab.com/api/v4/projects"
+
+
+@visit_router.route("https://gitlab.com/api/v4/projects")
+class GitlabAPIHeaderVisitor(HttpVisitor):
+    """
+    Fetch only the headers of the projects API, parse the page size and total
+    number of pages from them, and yield the page URLs for further visiting by
+    GitlabAPIVisitor.
+    """
+
+    def fetch(self, uri, timeout=10):
+        """Return only the headers of the response."""
+        return get_http_response(uri, timeout).headers
+
+    def get_uris(self, content):
+        new_page_template = "https://gitlab.com/api/v4/projects?page={next_page}&per_page={per_page}&statistics=true"
+
+        page_size = content.get("X-Per-Page")
+        total_pages = content.get("X-Total-Pages")
+        if page_size and total_pages:
+            total_pages = int(total_pages)
+            for i in range(total_pages):
+                # Use the loop to yield the uri of the next page to visit.
+                nextpage_url = new_page_template.format(
+                    next_page=i + 1, per_page=page_size
+                )
+                yield URI(uri=nextpage_url, source_uri=self.uri, visited=False)
+
+
+@visit_router.route(
+    r"https://gitlab.com/api/v4/projects\?page=\d+&per_page=\d+&statistics=true"
+)
+class GitlabAPIVisitor(HttpJsonVisitor):
+    """
+    Return URIs from the json content of one API page returned from the GitLab
+    API. This yields the "web_url" of each project in the current json page.
+    """
+
+    def get_uris(self, content):
+        """
+        Yield URIs from the json content. The content is a json list such as:
+        [
+            {
+                "id": 6377679,
+                ...
+                "web_url": "https://gitlab.com/prithajnath/cnn-keras",
+                ...
+            },
+            {
+                ..
+                "web_url": "https://gitlab.com/janpoboril/rules-bug",
+                ...
+            }
+            ...
+        ]
+        Each element in the list is a dictionary: we extract its web_url for
+        the visitor and pass the element along as data.
+        """
+        if not content:
+            # If the page is empty, just return
+            return
+        for element in content:
+            # The element is one project in the current page.
+            url = element.get("web_url")
+            if url:
+                project_name = url.rpartition("/")[-1]
+                package_url = PackageURL(type="gitlab", name=project_name).to_string()
+                yield URI(
+                    uri=url,
+                    package_url=package_url,
+                    data=element,
+                    source_uri=self.uri,
+                    visited=False,
+                )
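+
+
+# Illustration only: the header-driven pagination above, traced on sample
+# header values:
+#
+#     headers = {"X-Per-Page": "100", "X-Total-Pages": "3"}
+#     urls = [
+#         f"https://gitlab.com/api/v4/projects?page={i + 1}&per_page=100&statistics=true"
+#         for i in range(int(headers["X-Total-Pages"]))
+#     ]  # pages 1, 2 and 3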
+ """ + metadata = resource_uri.data + build_packages_from_json(metadata, resource_uri.package_url) + + +def build_packages_from_json(metadata, purl=None): + """ + Yield Package built from gitlab json content + metadata: Json metadata content + purl: String value of the package url of the ResourceURI object + """ + content = json.loads(metadata) + + name = content.get("name") + if name: + common_data = dict( + type="gitlab", + name=name, + homepage_url=content.get("web_url"), + description=content.get("description"), + ) + repo_url = content.get("http_url_to_repo") + if repo_url: + repo_url = form_vcs_url("git", repo_url) + common_data["vcs_url"] = repo_url + common_data["code_view_url"] = repo_url + common_data["release_date"] = parse_date(content.get("created_at")) + package = scan_models.Package(**common_data) + package.set_purl(purl) + yield package diff --git a/minecode/miners/golang.py b/minecode/miners/golang.py new file mode 100644 index 00000000..cee9eb7e --- /dev/null +++ b/minecode/miners/golang.py @@ -0,0 +1,241 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json + +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.utils import form_vcs_url + + +class GoLangSeed(seed.Seeder): + def get_seeds(self): + yield "https://api.godoc.org/packages" + + +@visit_router.route("https://api.godoc.org/packages") +class GodocIndexVisitor(NonPersistentHttpVisitor): + """Collect Golang URIs for packages available in the Go doc index.""" + + def get_uris(self, content): + """Return URIs to search the API further for a package""" + seen_paths = set() + for path, package in get_packages(content): + package_url, path = parse_package_path(path) + if path in seen_paths: + continue + seen_paths.add(path) + + # note the addition of a * at the end of the search string... + # without this the returned data are sparse + details_url = "https://api.godoc.org/search?q={path}*".format(**locals()) + host = get_well_known_host(path) + # If the path belongs github/bitbucket, yield a repo too + if host: + # keep github, bitbucket... 
diff --git a/minecode/miners/golang.py b/minecode/miners/golang.py
new file mode 100644
index 00000000..cee9eb7e
--- /dev/null
+++ b/minecode/miners/golang.py
@@ -0,0 +1,241 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+
+from packagedcode import models as scan_models
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import Mapper
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.utils import form_vcs_url
+
+
+class GoLangSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "https://api.godoc.org/packages"
+
+
+@visit_router.route("https://api.godoc.org/packages")
+class GodocIndexVisitor(NonPersistentHttpVisitor):
+    """Collect Golang URIs for packages available in the Go doc index."""
+
+    def get_uris(self, content):
+        """Return URIs to search the API further for a package"""
+        seen_paths = set()
+        for path, package in get_packages(content):
+            package_url, path = parse_package_path(path)
+            if path in seen_paths:
+                continue
+            seen_paths.add(path)
+
+            # note the addition of a * at the end of the search string...
+            # without this the returned data are sparse
+            details_url = "https://api.godoc.org/search?q={path}*".format(**locals())
+            host = get_well_known_host(path)
+            # If the path belongs to github/bitbucket/gitlab, yield a repo URI too
+            if host:
+                # keep github, bitbucket... as the repo type:
+                repo_type, _, _ = host.lower().partition(".")  # NOQA
+                repo_url = "https://{namespace}/{name}".format(**package_url.to_dict())
+                repo_purl = PackageURL(
+                    type=repo_type,
+                    namespace=package_url.namespace,
+                    name=package_url.name,
+                    qualifiers=dict(package_url=package_url.to_string()),
+                ).to_string()
+
+                yield URI(uri=repo_url, package_url=repo_purl, source_uri=self.uri)
+
+                yield URI(
+                    uri=details_url,
+                    package_url=package_url.to_string(),
+                    source_uri=self.uri,
+                )
+
+            else:
+                yield URI(uri=details_url, package_url=package_url, source_uri=self.uri)
+
+
+@visit_router.route(r"https://api\.godoc\.org/search\?q=.*")
+class GodocSearchVisitor(NonPersistentHttpVisitor):
+    """Collect URIs and data through the godoc search API."""
+
+    def get_uris(self, content):
+        seen_paths = set()
+        for path, package in get_packages(content):
+            package_url, path = parse_package_path(path)
+            if path in seen_paths:
+                continue
+            seen_paths.add(path)
+
+            purl = package_url.to_string()
+            yield URI(
+                # NOTE: here we use a previsited PURL as URI
+                uri=purl,
+                package_url=purl,
+                source_uri=self.uri,
+                # the data contains some popcounts and a description
+                data=package,
+                visited=True,
+            )
+
+
+def get_packages(packages_json_location):
+    """
+    Yield a path and mapping of Go package raw data from a JSON data location.
+    Each package in the JSON data looks like this:
+    {
+        "name": "aws",
+        "path": "github.com/aws/aws-sdk-go/aws",
+        "import_count": 13623,
+        "synopsis": "Package aws provides the core SDK's utilities and shared types.",
+        "stars": 4218,
+        "score": 0.99
+    },
+    """
+    with open(packages_json_location) as f:
+        data = json.load(f)
+        for package in data.get("results", []):
+            path = package["path"]
+            if path and not is_standard_import(path):
+                yield path, package
+
+
+def is_standard_import(path):
+    """Return True if a Go import path is for a standard library import"""
+    standard_packages = (
+        "archive",
+        "bufio",
+        "builtin",
+        "bytes",
+        "compress",
+        "container",
+        "context",
+        "crypto",
+        "database",
+        "debug",
+        "encoding",
+        "expvar",
+        "flag",
+        "fmt",
+        "go",
+        "hash",
+        "html",
+        "image",
+        "index",
+        "io",
+        "log",
+        "math",
+        "mime",
+        "net",
+        "os",
+        "path",
+        "plugin",
+        "reflect",
+        "regexp",
+        "runtime",
+        "sort",
+        "strconv",
+        "strings",
+        "sync",
+        "syscall",
+        "testing",
+        "text",
+        "time",
+        "unsafe",
+        "golang.org/x/benchmarks",
+        "golang.org/x/blog",
+        "golang.org/x/build",
+        "golang.org/x/crypto",
+        "golang.org/x/debug",
+        "golang.org/x/image",
+        "golang.org/x/mobile",
+        "golang.org/x/net",
+        "golang.org/x/perf",
+        "golang.org/x/review",
+        "golang.org/x/sync",
+        "golang.org/x/sys",
+        "golang.org/x/text",
+        "golang.org/x/time",
+        "golang.org/x/tools",
+        "golang.org/x/tour",
+        "golang.org/x/exp",
+    )
+
+    return path.startswith(standard_packages)
+
+
+repo_hosters = "bitbucket.org/", "github.com/", "gitlab.com/"
+
+
+def get_well_known_host(path):
+    """
+    Return the first label of the hoster (e.g. "github") if this path is from
+    a well known hoster, or None otherwise.
+    """
+    if path.startswith(repo_hosters):
+        host, _, _ = path.partition(".")
+        return host
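+
+
+# Illustration only:
+#
+#     >>> get_well_known_host("github.com/aws/aws-sdk-go/aws")
+#     'github'
+#     >>> get_well_known_host("golang.org/x/tools") is None
+#     True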
"/".join(segments) + + package_url = PackageURL( + type="golang", namespace=namespace, name=name, qualifiers=qualifiers + ) + + return package_url, path + + +@map_router.route("pkg:golang/.*") +class GolangApiDocMapper(Mapper): + def get_packages(self, uri, resource_uri): + package = json.loads(resource_uri.data) + yield build_golang_package(package, resource_uri.package_url) + + +def build_golang_package(package_data, purl): + """Return a single Golang package""" + package_url = PackageURL.from_string(purl) + vcs_url = package_url.qualifiers.get("vcs_repository") + homepage_url = "/".join(["https:/", package_url.namespace, package_url.name]) + vcs_tool = "git" if "github.com" in package_url.namespace else None + if vcs_tool: + vcs_url = form_vcs_url(vcs_tool, vcs_url) + # TODO: collect stats and counter from package_data too + package = scan_models.Package( + name=package_url.name, + namespace=package_url.namespace, + type=package_url.type, + primary_language="Go", + description=package_data.get("synopsis"), + homepage_url=homepage_url, + vcs_url=vcs_url, + ) + return package diff --git a/minecode/miners/googlecode.py b/minecode/miners/googlecode.py new file mode 100644 index 00000000..b041a42b --- /dev/null +++ b/minecode/miners/googlecode.py @@ -0,0 +1,297 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +import os +from datetime import datetime + +from bs4 import BeautifulSoup +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.utils import extract_file + + +class GooglecodeSeed(seed.Seeder): + def get_seeds(self): + yield "https://code.google.com/archive/search?q=domain:code.google.com" + yield "https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip" + + +@visit_router.route( + "https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip" +) +class GooglecodeArchiveVisitor(NonPersistentHttpVisitor): + """Fetch the googlecode archive file and extract it, and read the text file and get the URLs""" + + def get_uris(self, content): + """ + Return URIs by extracting and parsing the text file. + + Please refer to: https://github.com/pombredanne/swh-fetcher-googlecode + + For example, with Google + Cloud Storage URL gs://google-code-archive/v2/code.google/hg4j/project.json, + you can get the file's contents by URL-escaping the string and adding it to + googleapis.com. e.g. 
diff --git a/minecode/miners/googlecode.py b/minecode/miners/googlecode.py
new file mode 100644
index 00000000..b041a42b
--- /dev/null
+++ b/minecode/miners/googlecode.py
@@ -0,0 +1,297 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+import os
+from datetime import datetime
+
+from bs4 import BeautifulSoup
+from packagedcode import models as scan_models
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.utils import extract_file
+
+
+class GooglecodeSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "https://code.google.com/archive/search?q=domain:code.google.com"
+        yield "https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip"
+
+
+@visit_router.route(
+    "https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip"
+)
+class GooglecodeArchiveVisitor(NonPersistentHttpVisitor):
+    """
+    Fetch and extract the googlecode archive file, then read the text file it
+    contains to collect the project URLs.
+    """
+
+    def get_uris(self, content):
+        """
+        Return URIs by extracting and parsing the text file.
+
+        Please refer to: https://github.com/pombredanne/swh-fetcher-googlecode
+
+        For example, with Google
+        Cloud Storage URL gs://google-code-archive/v2/code.google/hg4j/project.json,
+        you can get the file's contents by URL-escaping the string and adding it to
+        googleapis.com. e.g.
+        https://www.googleapis.com/storage/v1/
+            b/google-code-archive/o/v2%2Fcode.google.com%2Fhg4j%2Fproject.json?alt=media
+        """
+        extracted_location = extract_file(content)
+        text_file = os.path.join(extracted_location, "google-code-archive.txt")
+        url_base = "https://www.googleapis.com/storage/v1/b/{project_info}?alt=media"
+        if os.path.exists(text_file):
+            with open(text_file) as project_file:
+                for project_line in project_file:
+                    if not project_line:
+                        continue
+                    project_line = project_line.strip()
+                    if project_line.startswith(
+                        "gs://google-code-archive/v2"
+                    ) and project_line.endswith("/project.json"):
+                        project_line = project_line.replace(
+                            "gs://google-code-archive/v2", ""
+                        )
+                        package_name = project_line.replace("/project.json", "")
+                        package_url = PackageURL(
+                            type="googlecode", name=package_name.strip("/")
+                        ).to_string()
+                        project_line = (
+                            "google-code-archive/o/v2"
+                            + project_line.replace("/", "%2F")
+                        )
+                        url = url_base.format(project_info=project_line)
+                        yield URI(uri=url, package_url=package_url, source_uri=self.uri)
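+
+
+# Illustration only: the gs:// to googleapis.com rewrite above, traced on the
+# docstring example:
+#
+#     line = "gs://google-code-archive/v2/code.google.com/hg4j/project.json"
+#     line = line.replace("gs://google-code-archive/v2", "")
+#     line = "google-code-archive/o/v2" + line.replace("/", "%2F")
+#     url = f"https://www.googleapis.com/storage/v1/b/{line}?alt=media"
+#     # -> .../b/google-code-archive/o/v2%2Fcode.google.com%2Fhg4j%2Fproject.json?alt=media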
+
+
+@visit_router.route(
+    r"https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media"
+)
+class GoogleAPIProjectJsonVisitor(HttpJsonVisitor):
+    """Fetch the json of the API URL; the content is stored for the mapper's use."""
+
+    pass
+
+
+@visit_router.route(
+    r"https://code.google.com/archive/search\?q=domain:code.google.com",
+    r"https://code.google.com/archive/search\?q=domain:code.google.com&page=[0-9]*",
+)
+class GoogleProjectPagesVisitor(HttpVisitor):
+    """
+    Parse the google project list pages, yield the project json URL for each
+    project on the current page, and yield the next page URL.
+    """
+
+    def get_uris(self, content):
+        """Return URIs for the paginated project lists"""
+        page = BeautifulSoup(content, "lxml")
+        projectjson_url_template = "https://storage.googleapis.com/google-code-archive/v2/code.google.com/{project}/project.json"
+        for a in page.find_all("a"):
+            url = a["href"]
+            if url and "https://code.google.com/archive/p/" in url:
+                project_name = url.replace("https://code.google.com/archive/p/", "")
+                project_api_url = projectjson_url_template.format(project=project_name)
+                package_url = PackageURL(
+                    type="googlecode", name=project_name.strip("/")
+                ).to_string()
+                yield URI(
+                    uri=project_api_url, package_url=package_url, source_uri=self.uri
+                )
+            if a.text.startswith("Next"):
+                yield URI(uri=url, source_uri=self.uri)
+
+
+@visit_router.route(
+    "https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json"
+)
+class GoogleProjectJsonVisitor(HttpJsonVisitor):
+    """Collect the project json for mapper use and also yield the download page json url."""
+
+    def get_uris(self, content):
+        """Return the download json URL"""
+        yield URI(uri=self.uri.replace("project.json", "downloads-page-1.json"))
+
+
+@visit_router.route(
+    "https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/downloads-page-[0-9]*.json"
+)
+class GoogleDownloadsPageJsonVisitor(HttpJsonVisitor):
+    """Collect download URIs and the next page related to the current download page."""
+
+    def get_uris(self, content):
+        """
+        Yield the next download page URI, based on the current and total page
+        numbers, and yield the download urls found in the json, for example:
+        https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hg4j/hg4j_1.2m2.jar
+        """
+        url = self.uri
+        page_num = content.get("pageNumber")
+        total_pages = content.get("totalPages")
+        name_template = "downloads-page-{page}.json"
+        filename = name_template.format(page=str(page_num))
+        new_filename = name_template.format(page=str(page_num + 1))
+
+        assert filename in url
+        if page_num < total_pages:
+            new_page_url = url.replace(filename, new_filename)
+            yield URI(
+                uri=new_page_url,
+                source_uri=self.uri,
+            )
+
+        download_url_template = url.replace(filename, "") + "{file_name}"
+        for download in content.get("downloads", []):
+            file_name = download.get("filename")
+            package_url = PackageURL(type="googlecode", name=file_name).to_string()
+            if "_" in file_name and "." in file_name:
+                partitions = file_name.partition("_")
+                package_name = partitions[0]
+                version = partitions[-1].rpartition(".")[0]
+                package_url = PackageURL(
+                    type="googlecode", name=package_name, version=version
+                ).to_string()
+            download_url = download_url_template.format(file_name=file_name)
+            last_modified_date = None
+            release_date = download.get("releaseDate")
+            if release_date:
+                last_modified_date = datetime.fromtimestamp(release_date)
+            yield URI(
+                uri=download_url,
+                package_url=package_url,
+                file_name=file_name,
+                source_uri=self.uri,
+                date=last_modified_date,
+                size=download.get("fileSize"),
+                sha1=download.get("sha1Checksum"),
+            )
+
+
+@map_router.route(
+    "https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json"
+)
+class GoogleNewAPIV2ProjectJsonMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Packages built from resource_uri record for a single
+        package version.
+        """
+        # FIXME: JSON deserialization should be handled eventually by the
+        # framework
+        metadata = json.loads(resource_uri.data)
+        return build_packages_from_projectsjson_v2(
+            metadata, resource_uri.package_url, uri
+        )
+
+
+def build_packages_from_projectsjson_v2(metadata, purl=None, uri=None):
+    """
+    Yield Package built from Googlecode API json `metadata` mapping
+    which is a dictionary keyed by project name and whose values are metadata.
+    Yield as many Package as there are download URLs.
+    metadata: json metadata content from API call
+    purl: String value of the package url of the ResourceURI object
+    """
+    short_desc = metadata.get("summary")
+    long_desc = metadata.get("description")
+    descriptions = [d for d in (short_desc, long_desc) if d and d.strip()]
+    description = "\n".join(descriptions)
+    common_data = dict(
+        datasource_id="googlecode_api_json",
+        type="googlecode",
+        name=metadata.get("name"),
+        description=description,
+    )
+
+    license_name = metadata.get("license")
+    if license_name:
+        common_data["extracted_license_statement"] = license_name
+        common_data["license_detections"] = []
+
+    keywords = []
+    labels = metadata.get("labels") or []
+    for label in labels:
+        if label:
+            keywords.append(label.strip())
+    common_data["keywords"] = keywords
+
+    package = scan_models.Package.from_package_data(
+        package_data=common_data,
+        datafile_path=uri,
+    )
+    package.set_purl(purl)
+    yield package
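+
+
+# Illustration only, with a hypothetical project.json payload; the license and
+# labels values are invented:
+#
+#     metadata = {"name": "hg4j", "summary": "Pure Java API for Mercurial",
+#                 "license": "GNU GPL v2", "labels": ["java", "dvcs"]}
+#     for package in build_packages_from_projectsjson_v2(metadata):
+#         print(package.purl)  # pkg:googlecode/hg4j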
+ """ + # FIXME: JSON deserialization should be handled eventually by the + # framework + metadata = json.loads(resource_uri.data) + return build_packages_from_projectsjson_v1( + metadata, resource_uri.package_url, uri + ) + + +def build_packages_from_projectsjson_v1(metadata, purl=None, uri=None): + """ + Yield Package from the project.json passed by the google code v1 API + metadata: json metadata content from API call + purl: String value of the package url of the ResourceURI object + """ + if metadata.get("name"): + common_data = dict( + datasource_id="googlecode_json", + type="googlecode", + name=metadata.get("name"), + description=metadata.get("description"), + ) + + license_name = metadata.get("license") + if license_name: + common_data["extracted_license_statement"] = license_name + common_data["license_detections"] = [] + + keywords = [] + labels = metadata.get("labels") + for label in labels: + if label: + keywords.append(label.strip()) + common_data["keywords"] = keywords + + common_data["vcs_url"] = metadata.get("ancestorRepo") + common_data["namespace"] = metadata.get("domain") + + # createTime doesn't make sense since the timestamp value is incorrect + # and parsing it will give a wrong year out of range. + + # created_time = metadata.get('creationTime') + # if created_time: + # common_data['release_date'] = date.fromtimestamp(created_time) + package = scan_models.Package.from_package_data( + package_data=common_data, + datafile_path=uri, + ) + package.set_purl(purl) + yield package diff --git a/minecode/miners/gstreamer.py b/minecode/miners/gstreamer.py new file mode 100644 index 00000000..f49a5876 --- /dev/null +++ b/minecode/miners/gstreamer.py @@ -0,0 +1,113 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +from bs4 import BeautifulSoup +from commoncode import fileutils +from commoncode.fileutils import file_base_name +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpVisitor +from minecode.miners import Mapper + + +class GstreamerSeed(seed.Seeder): + is_active = False + + def get_seeds(self): + yield "https://gstreamer.freedesktop.org/src/" + + +@visit_router.route(r"https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*") +class GstreamerHTMLVisitor(HttpVisitor): + """ + Visit the HTML page of gstreamer. Yield the uri which can be used for the next visitor use or the uri stands for the file resource. + The regex is to match: + https://gstreamer.freedesktop.org/src/ + https://gstreamer.freedesktop.org/src/gst-openmax/pre/ + """ + + def get_uris(self, content): + page = BeautifulSoup(content, "lxml") + url_template = self.uri + "{sub_path}" + for a in page.find_all(name="a"): + if "href" not in a.attrs: + continue + href = a["href"] + if href: + # For parent folder link or other unrelated links, ignore + if href.startswith("/") or href.startswith("?"): + continue + if href.endswith("/"): + # If the path is folder, yield it for the next visitor use. 
+                    yield URI(
+                        uri=url_template.format(sub_path=href), source_uri=self.uri
+                    )
+                else:
+                    # If it's a file resource, form the package_url and yield the URI with package url info
+                    # For example: gst-openmax-0.10.0.4.tar.bz2
+                    file_name = href
+                    # strip the archive extension so the version does not include it
+                    file_name_without_suffix = file_name
+                    for extension in (".tar.bz2", ".tar.gz", ".tar.xz"):
+                        file_name_without_suffix = file_name_without_suffix.replace(
+                            extension, ""
+                        )
+                    if "-" in file_name_without_suffix:
+                        project_name, _, version = file_name_without_suffix.rpartition(
+                            "-"
+                        )
+                    else:
+                        project_name = file_name_without_suffix
+                        version = None
+                    package_url = PackageURL(
+                        type="gstreamer", name=project_name, version=version
+                    ).to_string()
+                    yield URI(
+                        uri=url_template.format(sub_path=href),
+                        package_url=package_url,
+                        file_name=file_name,
+                        source_uri=self.uri,
+                    )
+
+
+@map_router.route(
+    r"https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+(\.tar\.bz2|\.tar\.gz|\.tar\.xz)"
+)
+class GstreamerURLMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        """
+        return build_package_from_url(resource_uri.uri, resource_uri.package_url)
+
+
+def build_package_from_url(uri, purl=None):
+    """
+    Yield Package built from uri and package_url.
+    uri: String value of uri of the ResourceURI object.
+    purl: String value of the package url of the ResourceURI object
+    """
+    file_name = fileutils.file_name(uri)
+    file_name_without_suffix = file_name
+    extensions = (".tar.bz2", ".tar.gz", ".tar.xz")
+    for extension in extensions:
+        file_name_without_suffix = file_name_without_suffix.replace(extension, "")
+    if "-" in file_name_without_suffix:
+        # rpartition the suffix-less name so the version does not include the extension
+        project_name, _, version = file_name_without_suffix.rpartition("-")
+        common_data = dict(
+            type="gstreamer",
+            name=project_name,
+            version=version,
+            download_url=uri,
+            homepage_url="https://gstreamer.freedesktop.org",
+        )
+        package = scan_models.Package(**common_data)
+        package.set_purl(purl)
+        yield package
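+
+
+# Illustration only: the file name parsing above, traced:
+#
+#     file_name = "gst-openmax-0.10.0.4.tar.bz2"
+#     base = file_name.replace(".tar.bz2", "")   # 'gst-openmax-0.10.0.4'
+#     name, _, version = base.rpartition("-")    # name='gst-openmax', version='0.10.0.4'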
diff --git a/minecode/miners/haxe.py b/minecode/miners/haxe.py
new file mode 100644
index 00000000..cd6e7f1e
--- /dev/null
+++ b/minecode/miners/haxe.py
@@ -0,0 +1,117 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+
+from bs4 import BeautifulSoup
+from packagedcode.haxe import HaxelibJsonHandler
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpJsonVisitor
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
+
+
+class HaxeSeed(seed.Seeder):
+    is_active = False
+
+    def get_seeds(self):
+        yield "https://lib.haxe.org/all"
+
+
+@visit_router.route("https://lib.haxe.org/all")
+class HaxeProjectsVisitor(HttpVisitor):
+    """Visit the Haxe all projects page and yield the uri of each project."""
+
+    def get_uris(self, content):
+        """
+        Parse the HTML to get each project name, and format the url with this
+        project name into a versions URL.
+        For example: https://lib.haxe.org/p/openfl/versions/
+        """
+        version_url_template = "https://lib.haxe.org{project_href}versions"
+        page = BeautifulSoup(content, "lxml")
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
+                continue
+            href = a["href"]
+            if href and href.startswith("/p/"):
+                project_name = href.replace("/p/", "").rstrip("/")
+                package_url = PackageURL(type="haxe", name=project_name).to_string()
+                yield URI(
+                    uri=version_url_template.format(project_href=href),
+                    package_url=package_url,
+                    source_uri=self.uri,
+                )
+
+
+@visit_router.route(r"https://lib.haxe.org/p/[\w\-\.]+/versions")
+class HaxeVersionsVisitor(HttpVisitor):
+    """
+    Visit the versions page of a project and yield the uri of each version.
+    For example: https://lib.haxe.org/p/openfl/versions
+    """
+
+    def get_uris(self, content):
+        """
+        Yield the haxelib package.json URL for each version, for example:
+        https://lib.haxe.org/p/openfl/8.6.4/raw-files/openfl/package.json
+        """
+        version_url_tempalte = "https://lib.haxe.org/p/{project}/{version}/raw-files/{project}/package.json"
+        page = BeautifulSoup(content, "lxml")
+        for a in page.find_all(name="a"):
+            if "href" not in a.attrs:
+                continue
+            href = a["href"]
+            if href and href.startswith("/p/") and href.endswith("/"):
+                # Parse the href if it contains the version info:
+ project_version = href.replace("/p/", "").rstrip("/") + project_version = project_version.split("/") + if len(project_version) == 2: + # if there is only one slash between project and version, openfl/8.6.3 + project = project_version[0] + version = project_version[1] + package_url = PackageURL( + type="haxe", name=project, version=version + ).to_string() + yield URI( + uri=version_url_tempalte.format( + project=project, version=version + ), + package_url=package_url, + source_uri=self.uri, + ) + + +@visit_router.route( + r"https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json" +) +class HaxePackageJsonVisitor(HttpJsonVisitor): + """Empty Visitor to get the package json content only.""" + + pass + + +@map_router.route( + r"https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json" +) +class HaxePackageJsonMapper(Mapper): + def get_packages(self, uri, resource_uri): + """Yield Package built from package json file.""" + # FIXME: JSON deserialization should be handled eventually by the framework + metadata = json.loads(resource_uri.data) + return build_packages_with_json(metadata, resource_uri.package_url) + + +def build_packages_with_json(metadata, purl=None): + # yield package by getting package from the build_package parser in scancode + package = HaxelibJsonHandler._parse(json_data=metadata) + if package: + package.set_purl(purl) + yield package diff --git a/minecode/visitors/java_stream.py b/minecode/miners/java_stream.py similarity index 82% rename from minecode/visitors/java_stream.py rename to minecode/miners/java_stream.py index 94a766b2..63e32287 100644 --- a/minecode/visitors/java_stream.py +++ b/minecode/miners/java_stream.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # The MIT License (MIT) # # Copyright (c) 2014 Gustav Arngården @@ -23,14 +21,12 @@ # SOFTWARE. -""" -Reading from Java DataInputStream format. -""" +"""Reading from Java DataInputStream format.""" import struct -class DataInputStream(object): +class DataInputStream: def __init__(self, stream): self.stream = stream @@ -42,14 +38,14 @@ def read(self, n=1): return data def read_byte(self): - return struct.unpack('b', self.read(1))[0] + return struct.unpack("b", self.read(1))[0] def read_long(self): - return struct.unpack('>q', self.read(8))[0] + return struct.unpack(">q", self.read(8))[0] def read_utf(self): - utf_length = struct.unpack('>H', self.read(2))[0] + utf_length = struct.unpack(">H", self.read(2))[0] return self.read(utf_length) def read_int(self): - return struct.unpack('>i', self.read(4))[0] + return struct.unpack(">i", self.read(4))[0] diff --git a/minecode/visitors/java_stream.py.ABOUT b/minecode/miners/java_stream.py.ABOUT similarity index 100% rename from minecode/visitors/java_stream.py.ABOUT rename to minecode/miners/java_stream.py.ABOUT diff --git a/minecode/visitors/java_stream.py.LICENSE b/minecode/miners/java_stream.py.LICENSE similarity index 100% rename from minecode/visitors/java_stream.py.LICENSE rename to minecode/miners/java_stream.py.LICENSE diff --git a/minecode/miners/maven.py b/minecode/miners/maven.py new file mode 100644 index 00000000..6b21b4d7 --- /dev/null +++ b/minecode/miners/maven.py @@ -0,0 +1,1073 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. 
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import gzip
+import io
+import json
+import logging
+import os
+from collections import namedtuple
+
+import arrow
+import javaproperties
+import packageurl
+from bs4 import BeautifulSoup
+from commoncode.text import as_unicode
+from dateutil import tz
+from jawa.util.utf import decode_modified_utf8
+from packagedcode.maven import _parse
+from packagedcode.maven import build_filename
+from packagedcode.maven import build_url
+from packagedcode.models import PackageData
+from packageurl import PackageURL
+
+from minecode import map_router
+from minecode import seed
+from minecode import visit_router
+from minecode.miners import URI
+from minecode.miners import HttpVisitor
+from minecode.miners import Mapper
+from minecode.miners import NonPersistentHttpVisitor
+from minecode.miners import java_stream
+from minecode.utils import parse_date
+
+"""
+This module handles Maven repositories such as Central and other
+nexus-based Maven repositories. This is dubbed the maven2 repository
+format, and it supports the v4 POM format.
+
+Old Maven1 format repositories are not supported (e.g. with jars,
+sources, poms directories and POM format v2/v3).
+"""
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+TRACE = False
+TRACE_DEEP = False
+
+if TRACE:
+    import sys
+
+    logging.basicConfig(stream=sys.stdout)
+    logger.setLevel(logging.DEBUG)
+
+
+MAVEN_BASE_URL = "https://repo1.maven.org/maven2"
+
+
+class GzipFileWithTrailing(gzip.GzipFile):
+    """
+    A subclass of gzip.GzipFile supporting files with trailing garbage. Ignore
+    the garbage.
+    """
+
+    # TODO: what is first_file??
+    first_file = True
+    gzip_magic = b"\037\213"
+    has_trailing_garbage = False
+
+    def _read_gzip_header(self):
+        # read the first two bytes
+        magic = self.fileobj.read(2)
+        # rewind two bytes back
+        self.fileobj.seek(-2, os.SEEK_CUR)
+        is_not_gzip = magic != self.gzip_magic
+        if is_not_gzip and not self.first_file:
+            self.first_file = False
+            self.has_trailing_garbage = True
+            raise EOFError("Trailing garbage found")
+
+        self.first_file = False
+        gzip.GzipFile._read_gzip_header(self)
+
+
+class MavenSeed(seed.Seeder):
+    def get_seeds(self):
+        yield "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz"
+        yield "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties"
+        # yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz'
+        # yield 'http://jcenter.bintray.com/'
+        # yield 'https://repo2.maven.org/maven2/.index/nexus-maven-repository-index.gz'
+        # other repos: http://stackoverflow.com/a/161846/302521
+        # 1. google has a mirror https://www.infoq.com/news/2015/11/maven-central-at-google
+        #    https://maven-central.storage.googleapis.com/repos/central/data/.index/nexus-maven-repository-index.properties
+        # 2. apache has a possible mirror at http://repo.maven.apache.org/maven2/.index/nexus-maven-repository-index.properties
+        # 3. ibiblio has an out of date mirror that has no directory listing and was last updated on 20161121171437
+        # clojars is not a mirror, but its own repo: https://clojars.org/repo/.index/
+        # other mirrors https://www.google.com/search?q=allinurl%3A%20.index%2Fnexus-maven-repository-index.properties&pws=0&gl=us&gws_rd=cr
+        # there are also npm mirrors: https://maven-eu.nuxeo.org/nexus/#view-repositories;npmjs~browsestorage
+
+
+@visit_router.route(
+    r"http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties"
+)
+@visit_router.route(
+    r"https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties"
+)
+class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor):
+    """Fetch the properties file, parse it and create the URI for each incremental index"""
+
+    def get_uris(self, content):
+        """
+        Parse a NEXUS index properties file and yield incremental index URIs.
+        This file is a Java properties file with rows like this:
+            nexus.index.incremental-15=526
+            nexus.index.incremental-14=527
+
+        Each value points to a fragment incremental index that has the same
+        format as the full index.
+        """
+        base_url = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz"
+        with open(content) as config_file:
+            properties = javaproperties.load(config_file) or {}
+
+        for key, increment_index in properties.items():
+            if key.startswith("nexus.index.incremental"):
+                yield URI(
+                    uri=base_url.format(index=increment_index),
+                    source_uri=self.uri,
+                )
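+
+
+# Illustration only: the properties parsing above, traced on a two-row sample
+# (javaproperties is the same library the visitor uses):
+#
+#     props = javaproperties.loads(
+#         "nexus.index.incremental-15=526\nnexus.index.incremental-14=527\n"
+#     )
+#     urls = [
+#         f"https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{v}.gz"
+#         for k, v in props.items()
+#         if k.startswith("nexus.index.incremental")
+#     ]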
+
+
+@visit_router.route(
+    "https?://.*/nexus-maven-repository-index.gz",
+    # increments
+    r"https?://.*/nexus-maven-repository-index\.\d+\.gz",
+)
+class MavenNexusIndexVisitor(NonPersistentHttpVisitor):
+    """
+    Download and process a Nexus Maven index file.
+    WARNING: Processing is rather long: a full index is ~600MB.
+    """
+
+    def get_uris(self, content):
+        """
+        Yield a combo of pre-visited URIs with a special maven-index://
+        scheme together with other regular fetchable URIs for POMs and
+        JARs found in a Maven index.
+
+        For NonPersistentHttpVisitor content is the path to the temp Gzipped
+        index file, not the actual file content.
+        """
+        index_location = content
+
+        artifacts = get_artifacts(index_location, worthyness=is_worthy_artifact)
+
+        for artifact in artifacts:
+            # we cannot do much without these
+            group_id = artifact.group_id
+            artifact_id = artifact.artifact_id
+            version = artifact.version
+            extension = artifact.extension
+
+            if not (group_id and artifact_id and version and extension):
+                continue
+
+            qualifiers = {}
+            if extension and extension != "jar":
+                qualifiers["type"] = extension
+
+            classifier = artifact.classifier
+            if classifier:
+                qualifiers["classifier"] = classifier
+
+            package_url = PackageURL(
+                type="maven",
+                namespace=group_id,
+                name=artifact_id,
+                version=version,
+                qualifiers=qualifiers or None,
+            )
+
+            # FIXME: also use the Artifact.src_exist flags too?
+
+            # build a URL: This is the real JAR download URL
+            # FIXME: this should be set at the time of creating Artifacts
+            # instead, together with the filename... especially since we could
+            # use different REPOs.
+            jar_download_url, file_name = build_url_and_filename(
+                group_id, artifact_id, version, extension, classifier
+            )
+
+            # FIXME: should this be set in the yielded URI too
+            last_mod = artifact.last_modified
+
+            # We yield a pre-visited URI for each JAR
+            mock_maven_index_uri = build_url(
+                group_id,
+                artifact_id,
+                version,
+                file_name,
+                base_url="maven-index://repo1.maven.org",
+            )
+
+            artifact_data = artifact.to_dict()
+            artifact_data["download_url"] = jar_download_url
+            artifact_as_json = json.dumps(artifact_data, separators=(",", ":"))
+
+            yield URI(
+                # this is the Maven index URI
+                source_uri=self.uri,
+                # FIXME: remove these mock URIs after migration
+                uri=mock_maven_index_uri,
+                package_url=package_url.to_string(),
+                visited=True,
+                mining_level=0,
+                file_name=file_name,
+                size=artifact.size,
+                sha1=artifact.sha1,
+                date=last_mod,
+                data=artifact_as_json,
+            )
+
+            package_url = PackageURL(
+                type="maven",
+                namespace=group_id,
+                name=artifact_id,
+                version=version,
+            )
+
+            # also yield a POM for this. There are no artifacts for the POM of
+            # a Jar in the repo, only for Parent POMs; therefore we create a
+            # download with the pom extension.
+            pom_download_url, pom_file_name = build_url_and_filename(
+                group_id, artifact_id, version, extension="pom", classifier=""
+            )
+            yield URI(
+                # this is the Maven index URI
+                source_uri=self.uri,
+                uri=pom_download_url,
+                # use the same PURL as the main jar
+                package_url=package_url.to_string(),
+                visited=False,
+                mining_level=20,
+                file_name=pom_file_name,
+                size=0,
+                date=last_mod,
+            )
+
+
+@visit_router.route(r"https?://jcenter\.bintray\.com/(.+/)*")
+class MavenHTMLPageVisitor(HttpVisitor):
+    """
+    Parse the HTML page and yield all necessary uris from the page and its sub pages.
+    Note that the route expression regex uses . to match any character except a
+    newline because of cases such as http://jcenter.bintray.com/'com/virtualightning'/,
+    which is covered in the tests too.
+    """
+
+    def get_uris(self, content):
+        page = BeautifulSoup(content, "lxml")
+        for pre in page.find_all(name="pre"):
+            for a in pre.find_all(name="a"):
+                url = a.get("href")
+                if not url:
+                    continue
+                # Remove the : symbol since it's a special char for the bintray repo.
+                if url.startswith(":"):
+                    url = url[1:]
+                filename = None  # default is folder, the filename is None.
+                if not url.endswith("/"):
+                    # a file
+                    filename = url
+                yield URI(
+                    uri=self.uri + url,
+                    visited=False,
+                    file_name=filename,
+                    source_uri=self.uri,
+                )
+
+
+@visit_router.route(r"https?://.*/maven-metadata\.xml")
+class MavenMetaDataVisitor(HttpVisitor):
+    """Parse the maven-metadata.xml file and yield uris of jars and poms."""
+
+    def get_uris(self, content):
+        # FIXME this may not be correct. The only thing we can infer from the maven
+        # metadata is what the groupid/artifactid and available versions are.
+        # The actual download files likely need to be obtained from directory listing
+        # or inferred from parsing the POM???
+
+        base_url = self.uri.partition("maven-metadata.xml")[0] + "{version}/"
+        pom_url = base_url + "{artifactId}-{version}.pom"
+
+        # FIXME: this may not exist and or with another extension?? and this should be PREVISITED
+        jar_url = base_url + "{artifactId}-{version}.jar"
+        # FIXME: sources may not exist?? and this should be PREVISITED
+        source_url = base_url + "{artifactId}-{version}-sources.jar"
+
+        # FIXME: why use BeautifulSoup for valid XML???
+        page = BeautifulSoup(content, "lxml-xml")
+
+        group_id = page.find(name="groupId")
+        artifact_id = page.find(name="artifactId")
+        if not (group_id and artifact_id):
+            return
+
+        group_id = group_id.string
+        artifact_id = artifact_id.string
+
+        for version in page.find_all("version"):
+            version = version.string
+
+            # FIXME: we may not get the proper extensions and classifiers and miss the qualifiers
+            package_url = PackageURL(
+                type="maven", namespace=group_id, name=artifact_id, version=version
+            ).to_string()
+
+            # the JAR proper as previsited
+            yield URI(
+                source_uri=self.uri,
+                uri=jar_url.format(version=version, artifactId=artifact_id),
+                package_url=package_url,
+                visited=True,
+            )
+
+            # the source as previsited
+            yield URI(
+                source_uri=self.uri,
+                uri=source_url.format(version=version, artifactId=artifact_id),
+                package_url=package_url,
+                visited=True,
+            )
+
+            # the POM needs to be visited
+            yield URI(
+                source_uri=self.uri,
+                uri=pom_url.format(version=version, artifactId=artifact_id),
+                package_url=package_url,
+                visited=False,
+            )
+
+
+# TODO: consider switching to HTTPS
+def build_url_and_filename(
+    group_id,
+    artifact_id,
+    version,
+    extension,
+    classifier,
+    base_repo_url="https://repo1.maven.org/maven2",
+):
+    """
+    Return a tuple of (url, filename) for the download URL of a Maven
+    artifact built from its coordinates.
+    """
+    file_name = build_filename(artifact_id, version, extension, classifier)
+    url = build_url(group_id, artifact_id, version, file_name, base_repo_url)
+    return url, file_name
+
+
+# TODO: consider switching to HTTPS
+def build_maven_xml_url(
+    group_id, artifact_id, base_repo_url="https://repo1.maven.org/maven2"
+):
+    """
+    Return the maven-metadata.xml URL for a Maven artifact built from its
+    coordinates.
+    """
+    group_id = group_id.replace(".", "/")
+    path = "{group_id}/{artifact_id}".format(**locals())
+    return "{base_repo_url}/{path}/maven-metadata.xml".format(**locals())
+
+
+@visit_router.route(r"https?://repo1.maven.org/maven2/.*\.pom")
+class MavenPOMVisitor(HttpVisitor):
+    """
+    Visit a POM. The POM XML is stored as data and there is nothing
+    special to do for this visitor.
+    """
+
+    pass
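+
+
+# Illustration only: assuming the standard maven2 repository layout implemented
+# by packagedcode.maven's build_filename/build_url, the helper above gives:
+#
+#     url, file_name = build_url_and_filename(
+#         "org.apache.commons", "commons-lang3", "3.12.0", "jar", None
+#     )
+#     # file_name: commons-lang3-3.12.0.jar
+#     # url: https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.jar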
+
+
+def is_worthy_artifact(artifact):
+    """
+    We only care for certain artifacts that are worthy of indexing.
+
+    Maven has some intricate interrelated values for these fields:
+    type, extension, packaging, classifier, language
+    See http://maven.apache.org/ref/3.2.5/maven-core/artifact-handlers.html
+
+    These are the defaults:
+
+    type          extension  packaging    classifier  language
+    --------------------------------------------------------------
+    pom           = type     = type                   none
+    jar           = type     = type                   java
+    maven-plugin  jar        = type                   java
+    ejb           jar        ejb = type               java
+    ejb3          = type     ejb3 = type              java
+    war           = type     = type                   java
+    ear           = type     = type                   java
+    rar           = type     = type                   java
+    par           = type     = type                   java
+    java-source   jar        = type       sources     java
+    javadoc       jar        = type       javadoc     java
+    ejb-client    jar        ejb          client      java
+    test-jar      jar        jar          tests       java
+    """
+    if artifact.version == "archetypes":
+        # we skip these entirely, they have a different shape
+        return
+
+    worthy_ext_pack = set(
+        [
+            # packaging, classifier, extension
+            ("jar", "sources", "jar"),
+            ("jar", None, "jar"),
+            ("bundle", None, "jar"),
+            ("war", None, "war"),
+            ("zip", "source-release", "zip"),
+            ("maven-plugin", None, "jar"),
+            ("aar", None, "aar"),
+            ("jar", "sources-commercial", "jar"),
+            ("zip", "src", "zip"),
+            ("tar.gz", "src", "tar.gz"),
+            ("jar", None, "zip"),
+            ("zip", "project-src", "zip"),
+            ("jar", "src", "jar"),
+        ]
+    )
+
+    return (
+        artifact.packaging,
+        artifact.classifier,
+        artifact.extension,
+    ) in worthy_ext_pack
+
+
+def is_source(classifier):
+    """Return True if the `classifier` indicates a source artifact."""
+    return classifier and ("source" in classifier or "src" in classifier)
+
+
+########################################################################
+# DOCUMENTATION OF THE FIELDS aka. Records:
+#
+# Constants and information for field names can be found in
+# https://github.com/apache/maven-indexer/tree/ecddb3c18ee1ee1357a01bffa7f9cb5252f21209
+# in these classes:
+# - org.apache.maven.index.ArtifactInfoRecord
+# - org.apache.maven.index.ArtifactInfo
+# - org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator
+# See also org.apache.maven.index.reader
+#
+# Note: these are the field names found in the Maven central index in
+# July 2016:
+# i u 1 m n d del
+# allGroups allGroupsList rootGroups rootGroupsList
+# IDXINFO DESCRIPTOR
+#
+# Bundle-Description Bundle-DocURL Bundle-License Bundle-Name Bundle-
+# SymbolicName Bundle-Version Export-Package Export-Service Import-
+# Package Require-Bundle
+
+
+ENTRY_FIELDS = {
+    "u": "Artifact UINFO: Unique groupId, artifactId, version, classifier, extension (or packaging), using | separator",
+    "i": "Artifact INFO: data using | separator",
+    "1": "Artifact SHA1 checksum, hex encoded as in sha1sum",
+    "m": "Artifact record last modified, a long as a string representing a Java time for the entry record",
+    "n": "Artifact name",
+    "d": "Artifact description",
+}
+
+# we IGNORE these fields for now. They can be included optionally.
+ENTRY_FIELDS_OTHER = {
+    # rarely present, mostly in repos other than central
+    "c": "Artifact Classes (tokenized on newlines only) a list of LF-separated paths, without .class extension",
+    "sha256": "sha256 of artifact? part of OSGI?",
+    # OSGI stuffs, not always there but could be useful metadata
+    "Bundle-SymbolicName": "Bundle-SymbolicName (indexed, stored)",
+    "Bundle-Version": "Bundle-Version (indexed, stored)",
+    "Bundle-Description": "Bundle-Description (indexed, stored)",
+    "Bundle-Name": "Bundle-Name (indexed, stored)",
+    "Bundle-License": "Bundle-License (indexed, stored)",
+    "Bundle-DocURL": "Bundle-DocURL (indexed, stored)",
+    "Require-Bundle": "Require-Bundle (indexed, stored)",
+}
+
+# we ignore these fields entirely for now.
+ENTRY_FIELDS_IGNORED = {
+    "IDXINFO": "",
+    "DESCRIPTOR": "",
+    "allGroups": "",
+    "allGroupsList": "",
+    "rootGroups": "",
+    "rootGroupsList": "",
+    # FIXME: we should deal with these
+    "del": "Deleted marker, will contain UINFO if document is deleted from index",
+    "Export-Package": "Export-Package (indexed, stored)",
+    "Export-Service": "Export-Service (indexed, stored)",
+    "Import-Package": "Import-Package (indexed, stored)",
+    # maven-plugin stuffs
+    "px": "MavenPlugin prefix (as keyword, stored)",
+    "gx": "MavenPlugin goals (as keyword, stored)",
+}
+
+
+def get_artifacts(
+    location,
+    fields=frozenset(ENTRY_FIELDS),
+    worthyness=is_worthy_artifact,
+    include_all=False,
+):
+    """
+    Yield artifact mappings from a Gzipped Maven nexus index data file
+    at location.
+    """
+    for entry in get_entries(location, fields):
+        artifact = build_artifact(entry, include_all)
+        # At this stage we know enough to decide whether this data is worthy of
+        # being an artifact; for now we care only about a few things: POMs and
+        # binary Jars.
+        if artifact and worthyness(artifact):
+            yield artifact
+
+
+_artifact_base_fields = (
+    "group_id",
+    "artifact_id",
+    "version",
+    "packaging",
+    "classifier",
+    "extension",
+    "last_modified",
+    "size",
+    "sha1",
+    "name",
+    "description",
+    "src_exist",
+    "jdoc_exist",
+    "sig_exist",
+)
+
+_artifact_extended_fields = (
+    "sha256",
+    "osgi",
+    "classes",
+)
+
+# FIXME: named tuples are suboptimal here for a simple dictionary
+
+
+def to_dict(self):
+    return self._asdict()
+
+
+Artifact = namedtuple("Artifact", _artifact_base_fields)
+Artifact.to_dict = to_dict
+
+ArtifactExtended = namedtuple(
+    "ArtifactExtended", _artifact_base_fields + _artifact_extended_fields
+)
+ArtifactExtended.to_dict = to_dict
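+
+
+# Illustration only: typical use of get_artifacts() on a local index file (the
+# path is hypothetical):
+#
+#     for artifact in get_artifacts("nexus-maven-repository-index.543.gz"):
+#         print(artifact.group_id, artifact.artifact_id, artifact.version)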
+ """ + SEP = "|" + NA = "NA" + NULL = "null" + + # UINFO + # See org.apache.maven.index.reader.RecordExpander.expandUinfo + # See org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator.updateArtifactInfo + uinfo = entry.get("u") + if not uinfo: + # not much we can do without this + return + + uinfo = uinfo.split(SEP) + gid = uinfo[0] + aid = uinfo[1] + version = uinfo[2] + + classifier = uinfo[3] + if classifier == NA: + classifier = None + + extension = None + if len(uinfo) > 4: + extension = uinfo[4] + + # INFO + # See org.apache.maven.index.reader.RecordExpander.expandAddedArtifact + # See org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator.updateArtifactInfo + + packaging = None + size = 0 + # record last modified is at entry.get('m') and we ignore this + last_modified = None + src_exist = False + jdoc_exist = False + sig_exist = False + + info = entry.get("i") + if info: + info = info.split(SEP) + + packaging = info[0] + if packaging in (NA, NULL): + packaging = None + + # this is the artifact last modified + # create a date/time stamp string from a long as a string + lm = info[1] + if lm and lm.isdigit() and lm != "0": + last_modified = java_time_ts(int(lm)) + + size = info[2] + size = int(size) if size and size.isdigit() else None + + # for *Exists fields of INFO: see org.apache.maven.index.ArtifactAvailability + # not present locally: '0': False, + # present locally: '1': True, ==> the only one we care for + # not available: '2': False, + PRESENT = "1" + src_exist = info[3] == PRESENT + jdoc_exist = info[4] == PRESENT + + if len(info) > 6: + extension = info[6] + else: + # FIXME: is this likely incorrect see worthyness check + if classifier or packaging in ("pom", "war", "ear"): + extension = packaging + else: + extension = "jar" + sig_exist = info[5] == PRESENT + + # other MISC fields + sha1 = entry.get("1") + name = entry.get("n") + description = entry.get("d") + + if not include_all: + artifact = Artifact( + group_id=gid, + artifact_id=aid, + version=version, + packaging=packaging, + classifier=classifier, + extension=extension, + last_modified=last_modified, + size=size, + sha1=sha1, + name=name, + description=description, + src_exist=src_exist, + jdoc_exist=jdoc_exist, + sig_exist=sig_exist, + ) + + else: + # TODO: should this be part of the base set? + sha256 = entry.get("sha256") + + # OSGI: Rarely there. Note that we ignore 'Export-', 'Import-', on + # purpose: these are big and messey for now + osgi = dict() + for key, value in entry.items(): + if key.startswith("Bundle-") and value: + # TODO: could also include 'Require-Bundle' + osgi[key] = value.strip() + + # Classes: Rarely there, but eventually useful in the future + # Can be quite big too + classes = entry.get("c", "").splitlines(False) + + artifact = ArtifactExtended( + group_id=gid, + artifact_id=aid, + version=version, + packaging=packaging, + classifier=classifier, + extension=extension, + last_modified=last_modified, + size=size, + sha1=sha1, + name=name, + description=description, + src_exist=src_exist, + jdoc_exist=jdoc_exist, + sig_exist=sig_exist, + sha256=sha256, + osgi=osgi, + classes=classes, + ) + + return artifact + + +def get_entries(location, fields=frozenset(ENTRY_FIELDS)): + """ + Yield Maven index entry mappings from a Gzipped Maven nexus index + data file at `location`. Only includes `fields` names. 
+ """ + buffer_size = 128 * 1024 * 1024 + if TRACE_DEEP: + entry = None + entries_count = 0 + keys = set() + keys_update = keys.update + + with GzipFileWithTrailing(location, "rb") as compressed: + # using io.BufferedReader for increased perfs + with io.BufferedReader(compressed, buffer_size=buffer_size) as nexus_index: + jstream = java_stream.DataInputStream(nexus_index) + + # FIXME: we do nothing with these two + # NOTE: this reads 1+8=9 bytes of the stream + _index_version, _last_modified = decode_index_header(jstream) + while True: + try: + entry = decode_entry(jstream, fields) + if TRACE_DEEP: + if entry: + keys_update(entry) + entries_count += 1 + + if entry: + yield entry + + except EOFError: + if TRACE_DEEP: + print( + f"Index version: {_index_version} last_modified: {_last_modified}" + ) + print(f"Processed {entries_count} docs. Last entry: {entry}") + print("Unique keys:") + for k in sorted(keys): + print(k) + break + + +def decode_index_header(jstream): + """ + Return the index header from a `jstream` Java-like stream as a tuple + of (index_version, last_updated_date) where index_version is an int + and last_updated_date is a an UTC ISO timestamp string or an empty + string. + """ + # this.chunkName = chunkName.trim(); + # this.dataInputStream = new DataInputStream( new GZIPInputStream( inputStream, 2 * 1024 ) ); + # this.version = ( (int) dataInputStream.readByte() ) & 0xff; + # this.timestamp = new Date( dataInputStream.readLong() ); + + supported_format_version = 1 + # one byte + index_version = int(jstream.read_byte()) + assert supported_format_version == index_version + # eight byte + timestamp = jstream.read_long() + last_modified = timestamp != -1 and java_time_ts(timestamp) or "" + return int(index_version), last_modified + + +def decode_entry(jstream, fields=()): + """ + Read and return one entry mapping of name -> values from a Maven + index `jstream` Java-like stream. Note that the stream is not a + standard Java stream for UTF data. + + Only includes `fields` names. + + An entry starts with an integer which is the number of fields for + this entry. + + Then we have this data layout for each field: + + - field storage type: one byte flag which is then compared to + constants. These are flags for Lucene indexing: INDEXED, STORED, + TOKENIZED, ANALYZED it ends up being two booleans: indexed and + stored and we do not care for these. + + - field name: a Java UTF-8 string (using a len on 2 bytes, then the + name proper). Constants for field names are in ArtifactInfoRecord + and ArtifactInfo. The entry for these is available in ENTRY_FIELDS + for reference. 
+ + - field value: a Java UTF-8-encoded string using the Maven Index special encoding + - one int which is the length of the UTF string in bytes + - the utf-8 string proper using Java conventions + """ + read = jstream.read + read_int = jstream.read_int + read_byte = jstream.read_byte + read_utf = jstream.read_utf + + has_fields = bool(fields) + entry = {} + # this read 4 bytes + field_count = read_int() + for _ in range(field_count): + # Flags for lucene: INDEXED, STORED, TOKENIZED, ANALYZED: ignored + # this is a mask and one off: + # field_indexed = 1 + # field_tokenized = 2 + # field_stored = 4 + # this reads 1 byte: total 5 + _indexing_type = read_byte() + + # all field names are ASCII chars, so even though this is UTF-8 + # encoded, this is ascii Constants for field names are in + # ArtifactInfoRecord and ArtifactInfo + # FIXME: we should discard things we do not care for in terms of fields right away + + # Read a regular "Java Modified UTF-8" as unicode. + # this read 2 bytes which are the len then the len. total 7 + len + name = decode_modified_utf8(read_utf()) + + # Read a Maven Nexus index special "Java Modified UTF-8" as + # unicode: Regular Java write/readUTF is a string length on 2 + # bytes followed by a UTF-encoded stream of bytes of that + # length. The Nexus Maven index use a full int rather than a 2 + # bytes int bypassing the 65K char limit for length of the + # standard Java readUTF. + # this read 4 bytes which is a len + value_length = read_int() + # this read bytes len + value = decode_modified_utf8(read(value_length)) + + # why do we skip some fields + if has_fields: + if name in fields: + entry[name] = value + else: + entry[name] = value + + return entry + + +def java_time_ts(tm): + """ + Convert a Java time long (as milliseconds since epoch) to an UTC ISO + timestamp. + """ + tzinfo = tz.tzutc() + ar = arrow.get(tm / 1000).replace(tzinfo=tzinfo).to("utc") + return ar.isoformat() + + +################################################################################ +# These are CLI/shell test and stat utilities +################################################################################ + + +def _spit_json(location, target): + with open(target, "w") as t: + t.write("[\n") + for i, artifact in enumerate(get_artifacts(location)): + if i % 1000 == 0: + print("number or artifacts:", i) + t.write(json.dumps(artifact.to_dict(), separators=(",", ":"))) + t.write(",\n") + + t.write("]\n") + + print("total number or artifacts:", i) + + +def _artifact_stats(location): + """ + Print artifacts stats from a Gzipped Maven nexus index data file + at location. 
+ """ + from collections import Counter + + pom_packs = Counter() + pom_classifs = Counter() + pom_extensions = Counter() + combos = Counter() + + pom_worthy = 0 + + for i, artifact in enumerate(get_artifacts(location)): + combos[(artifact.packaging, artifact.classifier, artifact.extension)] += 1 + + if artifact.packaging: + pom_packs[artifact.packaging] += 1 + + if artifact.classifier: + pom_classifs[artifact.classifier] += 1 + + if artifact.extension: + pom_extensions[artifact.extension] += 1 + + if is_worthy_artifact(artifact): + pom_worthy += 1 + + if i % 10000 == 0: + print("number or artifacts:", i) + + print() + print("Total number of artifacts:", i) + print("Total number of worthy artifacts:", pom_worthy) + + print("Top packaging:") + for n, c in pom_packs.most_common(): + print(n, ":", c) + + print("Top classifiers:") + for n, c in pom_classifs.most_common(): + print(n, ":", c) + + print("Top extensions:") + for n, c in pom_extensions.most_common(): + print(n, ":", c) + + print("Top Combos: packaging, classifier, extension") + for n, c in combos.most_common(): + print(n, ":", c) + + """ + Latest stats on 2017-08-07: +Total number or artifacts: 5844648 +Total number of POMs: 302603 +Total number of worthy POMs: 300879 +Total number of JARs: 5158191 +Total number of POMs with names: 278521 with description: 151034 +Total number of JARs with names: 4762013 with description: 3144938 +Total number of Other with names: 360646 with description: 228119 +Unique POM packagings: [None, u'${packaging.type}', u'${packagingType}', + u'0-alpha-1-20050407.154541-1.pom', u'aar', u'apk', u'application-assembly', + u'bundle', u'feature', u'gem', u'hk2-jar', u'it-packaging', u'izpack-jar', + u'jar', u'jboss-sar', u'maven-archetype', u'maven-plugin', u'mule-extension', + u'mule-plugin', u'nar', u'nbm-application', u'pom', u'so', u'swc', u'tar', + u'tar.gz', u'war', u'xar', u'zip'] +Unique POM classifiers: [None, u'1', u'DEAD', u'M6a', u'bsd', u'changelog', +u'dtddoc', u'it', u'java', u'javadoc', u'jdbc3', u'pom'] + """ + + +def _entries_stats(location): + """ + Print entries stats from a Gzipped Maven nexus index data file + at location. + """ + from collections import Counter + + field_names = Counter() + field_names_update = field_names.update + + field_sets = Counter() + field_sets_update = field_sets.update + + for i, entry in enumerate(get_entries(location, ())): + keys = tuple(entry.keys()) + field_names_update(keys) + field_sets_update([keys]) + if i % 10000 == 0: + print() + print("number of entries:", i) + print("field names stats:", field_names) + + print() + print("Total number of entries:", i) + print() + print("All field names:", field_names.most_common()) + print() + print("All field name sets:", field_sets.most_common()) + print() + + +@map_router.route("maven-index://.*") +class MavenIndexArtifactMapper(Mapper): + """ + Process the minimal artifacts collected for a Maven Jar or POM in an + index visit. + """ + + def get_packages(self, uri, resource_uri): + yield get_mini_package(resource_uri.data, uri, resource_uri.package_url) + + +def get_mini_package(data, uri, purl): + """ + Return a MavenPomPackage built from the minimal artifact data available in a + nexus index, given a `data` JSON string, a `uri` string and a `purl` + PacxkageURL string. Return None if the package cannot be built. + """ + if not data: + return + + artdata = json.loads(data) + + # FIXME: this should a slot in Artifact + download_url = artdata.pop("download_url") + # FIXME: what if this is an ArtifactExtended?? 
+ artifact = Artifact(**artdata) + + if purl: + if isinstance(purl, str): + purl = PackageURL.from_string(purl) + assert isinstance(purl, PackageURL) + + qualifiers = None + if purl and purl.qualifiers: + qualifiers = packageurl.normalize_qualifiers(purl.qualifiers, encode=False) + if qualifiers: + assert isinstance(qualifiers, dict) + logger.debug(f"get_mini_package: qualifiers: {qualifiers}") + + package = PackageData( + type="maven", + namespace=artifact.group_id, + name=artifact.artifact_id, + version=artifact.version, + qualifiers=qualifiers, + description=artifact.description, + download_url=download_url, + release_date=parse_date(artifact.last_modified), + size=artifact.size, + sha1=artifact.sha1 or None, + ) + logger.debug(f"get_mini_package: package.qualifiers: {package.qualifiers}") + logger.debug(f"get_mini_package for uri: {uri}, package: {package}") + return package + + +# FIXME this should be valid for any POM +@map_router.route(r"https?://repo1.maven.org/maven2/.*\.pom") +class MavenPomMapper(Mapper): + """Map a proper full POM visited as XML.""" + + def get_packages(self, uri, resource_uri): + logger.debug( + f"MavenPomMapper.get_packages: uri: {uri}, resource_uri: {resource_uri.uri}, purl:" + ) + package = get_package(resource_uri.data, resource_uri.package_url) + if package: + logger.debug(f"MavenPomMapper.get_packages: uri: {uri}, package: {package}") + yield package + + +def get_package(text, package_url=None, baseurl="https://repo1.maven.org/maven2"): + """Return a ScannedPackage built from a POM XML string `text`.""" + text = as_unicode(text) + package = _parse( + datasource_id="maven_pom", + package_type="maven", + primary_language="Java", + text=text, + ) + if package: + # FIXME: this should be part of the parse call + if package_url: + purl = PackageURL.from_string(package_url) + package.set_purl(purl) + # Build proper download_url given a POM: this must be the URL for + # the Jar which is the key to the PackageDB record + # FIXME the download is hardcoded to Maven Central? + # package.download_url = package.repository_download_url(baseurl=baseurl) + return package diff --git a/minecode/miners/npm.py b/minecode/miners/npm.py new file mode 100644 index 00000000..3f6fec24 --- /dev/null +++ b/minecode/miners/npm.py @@ -0,0 +1,136 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import json +import logging + +from packagedcode.npm import NpmPackageJsonHandler +from packagedcode.npm import npm_api_url +from packagedcode.npm import split_scoped_package_name +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +class NpmSeed(seed.Seeder): + def get_seeds(self): + yield "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=0" + + +@visit_router.route( + r"https://replicate.npmjs.com/registry/_changes\?include_docs=true&limit=\d+&since=\d+" +) +class NpmRegistryVisitor(NonPersistentHttpVisitor): + """ + Yield one URI for the next batch of changes to re-visit. Yield one URI for + each NPM package (that contains all the versions for this package) as + previsited for mapping. + """ + + def get_uris(self, content): + """ + Yield a URI for the next index sequence to visit and one URI for each + package fetched in a batch. + """ + next_visitable_index_url_template = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since={last_seq}" + + json_location = content + with open(json_location) as c: + content = json.loads(c.read()) + + try: + last_seq = content["last_seq"] + except KeyError: + # provide a more meaningful message in case the JSON is incorrect + raise Exception('NpmRegistryVisitor: Missing "last_seq" field: Aborting.') + + # Always yield an index URI, even if there is no results to avoid stopping the index visits + yield URI( + uri=next_visitable_index_url_template.format(last_seq=last_seq), + source_uri=self.uri, + ) + + try: + results = content["results"] + except KeyError: + # provide a more meaningful message in case the JSON is incorrect + raise Exception('NpmRegistryVisitor: Missing "results" field: Aborting.') + + for result in results: + doc = result.get("doc") + # verify if this record is a package record (as opposed to + # some couchdb design document that we would ignore) + is_package_record = "versions" in doc and "name" in doc + if not is_package_record: + continue + + # remove the readme field from the data: this is big and mostly + # useless for now + doc.pop("readme", None) + + name = doc.get("name") + + namespace, name = split_scoped_package_name(name) + package_api_url = npm_api_url(namespace, name) + + package_url = PackageURL( + type="npm", namespace=namespace, name=name + ).to_string() + + # here: this is ready for mapping + yield URI( + uri=package_api_url, + package_url=package_url, + source_uri=self.uri, + data=json.dumps(doc, separators=(",", ":"), ensure_ascii=False), + # note: visited is True since there nothing more to visit + visited=True, + ) + + +# FIXME: This route may not work when we have scoped Packages or URLs to a specific version +# or yarn URLs +@map_router.route(r"https://registry.npmjs.org/[^\/]+") +class NpmPackageMapper(Mapper): + def get_packages(self, uri, resource_uri): + """ + Yield NpmPackage built from a resource_uri record that contains many + npm versions for a given npm name. 
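+
+        For example (assumed data), a record for the name "lodash" holds a
+        "versions" mapping such as {"4.17.20": {...}, "4.17.21": {...}} and
+        one NpmPackage is yielded for each of these versions.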
+ """ + if not resource_uri.data: + return + visited_data = json.loads(resource_uri.data) + return build_packages(visited_data) + + +# FIXME: Consider using PURL here +def build_packages(data): + """ + Yield NpmPackage built from data corresponding to a single package name + and many npm versions. + """ + versions = data.get("versions", {}) + + logger.debug("build_packages: versions: " + repr(type(versions))) + for version, data in versions.items(): + logger.debug("build_packages: version: " + repr(version)) + logger.debug("build_packages: data: " + repr(data)) + package = NpmPackageJsonHandler._parse(json_data=data) + if package: + yield package diff --git a/minecode/miners/nuget.py b/minecode/miners/nuget.py new file mode 100644 index 00000000..5761328c --- /dev/null +++ b/minecode/miners/nuget.py @@ -0,0 +1,340 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json + +from bs4 import BeautifulSoup +from commoncode import fileutils +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import Mapper + + +class NugetSeed(seed.Seeder): + def get_seeds(self): + yield "https://api-v2v3search-0.nuget.org/query" + yield "https://www.nuget.org/packages?page=1" + + +@visit_router.route("https://api-v2v3search-0.nuget.org/query") +class NugetQueryVisitor(HttpJsonVisitor): + """ + 'https://api-v2v3search-0.nuget.org/query' is a query URL which has metadata for + Nuget packages and we can query for all the packages by using the pagination + technique. For example 'https://api-v2v3search-0.nuget.org/query?skip=40' will + skip the first 40 packages in the order and returns JSON data for the packages + from 40-60. + 'https://api-v2v3search-0.nuget.org/query' could be the latest version, as the + url 'https://api-v3search-0.nuget.org/query' is not accessible now. + """ + + def get_uris(self, content): + """ + Return all the URLs for query results through pagination. + Starts with number '0', increment count by '20'. + The total count is found by 'totalHits'. 
+ """ + pkgs_count = content.get("totalHits", 0) + count = 0 + url_template = "https://api-v2v3search-0.nuget.org/query?skip={count}" + while count < pkgs_count: + url = url_template.format(count=str(count)) + yield URI(uri=url, source_uri=self.uri) + count = count + 20 + + +@visit_router.route(r"https://api-v2v3search-0.nuget.org/query\?skip=\d+") +class PackagesPageVisitor(HttpJsonVisitor): + """Visit the nuget API resources and return all the package URLs available at the passing`uri`.""" + + def get_uris(self, content): + metadata = content["data"] + for packages in metadata: + for version in packages["versions"]: + pkg_ver = version["version"] + pkg_url = version["@id"] + version_template = "{pkg_version}.0.json" + version_name = version_template.format(pkg_version=pkg_ver) + name = pkg_url.replace( + "https://api.nuget.org/v3/registration1/", "" + ).partition("/")[0] + package_url = PackageURL( + type="nuget", name=name, version=pkg_ver + ).to_string() + if version_name in pkg_url: + # sometimes an extra '0' is appended to the version in the URL + # FIXME: this is weird: there must be good reason why this is done??? + pkg_url = pkg_url.replace(version_name, pkg_ver + ".json") + yield URI(uri=pkg_url, package_url=package_url, source_uri=self.uri) + + # Add another case to have registration0 or registration1 in the url, yield the alternative url. + if pkg_url.find("/registration0/") > 0: + pkg_url = pkg_url.replace("/registration0/", "/registration1/") + yield URI(uri=pkg_url, source_uri=self.uri) + + elif pkg_url.find("/registration1/") > 0: + pkg_url = pkg_url.replace("/registration1/", "/registration0/") + yield URI(uri=pkg_url, source_uri=self.uri) + + +@visit_router.route("https://api.nuget.org/.+.json") +class NugetAPIJsonVisitor(HttpJsonVisitor): + """ + Visit packageContent of nuget API json and return a + download URL for the NugetPackage object + + This could cover three cases: + 1. packageContent is not empty. + https://api.nuget.org/v3/registration1/entityframework/4.3.1.json + Visiting above link will return the npkg file: https://api.nuget.org/packages/entityframework.4.3.1.nupkg + and return the json resource for next DownloadVisitor: https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json + + 2. catalogEntry is not empty + https://api.nuget.org/v3/registration1/entityframework/4.3.1.json + Visiting above link will return the npkg file: https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json + + 3. No key matched + The second loop will return the url https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json + by visiting this url it won't create any new uris, the key is to store the json file itself through visitor and used in mapper. 
+ """ + + def get_uris(self, content): + download_url = content.get("packageContent") + if download_url: + filename = fileutils.file_name(download_url) + withou_prefix = filename.replace(".nupkg", "") + filename_splits = withou_prefix.partition(".") + name = filename_splits[0] + version = None + if len(filename_splits) > 1: + version = filename_splits[-1] + package_url = PackageURL(type="nuget", name=name, version=version) + yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) + + catalog_entry_url = content.get("catalogEntry") + if catalog_entry_url: + yield URI(uri=catalog_entry_url, source_uri=self.uri) + + +@visit_router.route(r"https://www.nuget.org/packages\?page=\d+") +class NugetHTMLPageVisitor(HttpVisitor): + """Visitor to yield the URI of the each package page.""" + + def get_uris(self, content): + url_format = "https://www.nuget.org/packages/{name}" + soup = BeautifulSoup(content, "lxml") + has_package = False + for a in soup.find_all("a"): + if a.get("class") and "package-title" in a.get("class"): + has_package = True + href = a.get("href") + if not href: + continue + # href format is like: "/packages/NUnit/" + name = href.strip("/").partition("/")[-1] + if name: + yield URI(uri=url_format.format(name=name), source_uri=self.uri) + if has_package: + page_id = self.uri.replace( + "https://www.nuget.org/packages?page=", "" + ).strip("/") + next_pageid = int(page_id) + 1 + nextpage_url_format = "https://www.nuget.org/packages?page={id}" + yield URI( + uri=nextpage_url_format.format(id=next_pageid), source_uri=self.uri + ) + + +@visit_router.route( + r"https://www.nuget.org/packages/[\w\-\.]+", + r"https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+", +) +class NugetHTMLPackageVisitor(HttpVisitor): + """ + Visitor to fetch the package HTML content + Example: https://www.nuget.org/packages/log4net + or https://www.nuget.org/packages/log4net/2.0.7 + """ + + pass + + +@map_router.route(r"https://api.nuget.org/v3/catalog.+\.json") +class NugetPackageMapper(Mapper): + """ + Return NugetPackage object by parsing the ResourceURI stored in db referenced by the + nuget API URIs. 
+ """ + + def get_packages(self, uri, resource_uri): + if not resource_uri.data: + return + pkg_data = json.loads(resource_uri.data) + return build_packages_with_json(pkg_data, resource_uri.package_url) + + +def build_packages_with_json(metadata, purl=None): + """ + Yield package from the json metadata passed + metadata: json metadata content from API call + purl: String value of the package url of the ResourceURI object + """ + licenseUrl = metadata.get("licenseUrl") + copyr = metadata.get("copyright") + + authors = [] + names = metadata.get("authors") + if names: + for name in names.split(","): + authors.append(scan_models.Party(name=name.strip(), role="author")) + + keywords = metadata.get("tags", []) + + # TODO: the content has the SHA512, our model may extend to SHA512 + + if name: + short_desc = metadata.get("summary") + long_desc = metadata.get("description") + if long_desc == short_desc: + long_desc = None + descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] + description = "\n".join(descriptions) + package_mapping = dict( + type="nuget", + name=metadata["id"], + version=metadata["version"], + homepage_url=metadata.get("projectUrl"), + description=description, + extracted_license_statement=licenseUrl, + license_detections=[], + copyright=copyr, + parties=authors, + keywords=keywords, + ) + package = scan_models.PackageData.from_data(package_data=package_mapping) + package.set_purl(purl) + yield package + + +@map_router.route(r"https://api.nuget.org/packages/.*\.nupkg") +class NugetNUPKGDownloadMapper(Mapper): + """ + Return NugetPackage object by parsing the download URL. + For example: https://api.nuget.org/packages/entityframework.4.3.1.nupkg + """ + + def get_packages(self, uri, resource_uri): + if not resource_uri.data: + return + pkg_data = json.loads(resource_uri.data) + return build_packages_with_nupkg_download_url( + pkg_data, resource_uri.package_url, resource_uri.uri + ) + + +def build_packages_with_nupkg_download_url(metadata, purl, uri): + if purl: + package = scan_models.PackageData( + type="nuget", name=purl.name, download_url=uri + ) + package.set_purl(purl) + yield package + + +@map_router.route( + r"https://www.nuget.org/packages/[\w\-\.]+", + r"https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+", +) +class NugetHTMLPackageMapper(Mapper): + """ + Return NugetPackage object by parsing the package HTML content. + For example: https://www.nuget.org/packages/log4net + """ + + def get_packages(self, uri, resource_uri): + """Yield Package built from resource_uri data.""" + metadata = resource_uri.data + build_packages_from_html(metadata, resource_uri.uri, resource_uri.package_url) + + +def build_packages_from_html(metadata, uri, purl=None): + """ + Yield Package built from Nuget a `metadata` content + metadata: json metadata content + uri: the uri of the ResourceURI object + purl: String value of the package url of the ResourceURI object + """ + download_url_format = "https://www.nuget.org/api/v2/package/{name}/{version}" + soup = BeautifulSoup(metadata, "lxml") + h1 = soup.find("h1") + if h1 and h1.contents: + license_value = None + name = str(h1.contents[0]).strip() + for a in soup.find_all("a"): + if a.get("data-track") and a.get("data-track") == "outbound-license-url": + license_value = a.string + if license_value: + license_value = str(license_value).strip() + + copyright_value = None + h2s = soup.find_all("h2") + for h2 in h2s: + # Copyright will be after the copyright h2 node + # The exmaple is like this: + #
+            # <h2>Copyright</h2>
+            # <p>Copyright 2004-2017 The Apache Software Foundation</p>
+ if h2.string and h2.string == "Copyright": + next_element = h2.find_next_sibling("p") + if next_element: + copyright_value = next_element.string + + description = None + for m in soup.find_all("meta"): + if ( + m.get("property") + and m.get("property") == "og:description" + and m.get("content") + ): + description = m.get("content") + + for tbody in soup.find_all("tbody"): + if tbody.get("class") and tbody.get("class")[0] == "no-border": + for a in tbody.find_all("a"): + version = a.string + if not version or not version.strip(): + continue + version = version.strip() + download_url = download_url_format.format( + name=name, version=version + ) + package_mapping = dict( + datasource_id="nuget_metadata_json", + name=name, + type="nuget", + version=version, + homepage_url=uri, + description=description, + download_url=download_url, + extracted_license_statement=license_value, + license_detections=[], + copyright=copyright_value, + ) + package = scan_models.Package.from_package_data( + package_data=package_mapping, + datafile_path=uri, + ) + package.set_purl(purl) + yield package diff --git a/minecode/miners/openssl.py b/minecode/miners/openssl.py new file mode 100644 index 00000000..6140dd58 --- /dev/null +++ b/minecode/miners/openssl.py @@ -0,0 +1,168 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import logging +from datetime import datetime + +from bs4 import BeautifulSoup +from commoncode import fileutils +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.utils import is_int +from minecode.utils import parse_date + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +class OpenSSLSeed(seed.Seeder): + def get_seeds(self): + yield "https://ftp.openssl.org/" + + +@visit_router.route("https://ftp.openssl.org/", "https://ftp.openssl.org/.*/") +class OpenSSLVisitor(HttpVisitor): + """Collect package metadata URIs from the open SSL HTML site.""" + + def get_uris(self, content): + """Return URIs objects and the corresponding size, file date info.""" + page = BeautifulSoup(content, "lxml") + for a in page.find_all(name="a"): + if "href" not in a.attrs: + continue + href = a["href"] + if not href: + continue + if href.startswith("?") or href.startswith("/"): + # if href is not valid resource, ignore, for example, it's a + # link to parent link etc. + continue + url = self.uri + href + next_sibling = a.parent.findNext("td") + + date = None + if next_sibling and next_sibling.contents: + date = next_sibling.contents[0].strip() + # The passing date format is like: 2014-11-19 17:48 + date = datetime.strptime(date, "%Y-%m-%d %H:%M") + + if next_sibling: + next_next = next_sibling.findNext("td") + if next_next and next_next.contents: + size = next_next.contents[0].strip() + if size and is_int(size): + # By default, if the unit is not shown, it means k. 
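+                            # For example (assumed listing values): "532"
+                            # becomes "544768" bytes, "5.1M" becomes
+                            # "5347737", and a bare "-" (a directory) ends
+                            # up as None below.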
+                            size = str(int(size) * 1024)
+                        if size.endswith(("M", "m")):
+                            # If the size is in megabytes, the value may be
+                            # a float instead of an int, like 5.1M
+                            size = str(
+                                int(
+                                    float(size.replace("M", "").replace("m", ""))
+                                    * 1024
+                                    * 1024
+                                )
+                            )
+                        elif size.endswith(("G", "g")):
+                            # if the size is in gigabytes
+                            size = str(
+                                int(
+                                    float(size.replace("G", "").replace("g", ""))
+                                    * 1024
+                                    * 1024
+                                    * 1024
+                                )
+                            )
+                        if size == "-":
+                            # if it's a folder, ignore the size
+                            size = None
+                file_name = None
+                if not url.endswith("/"):
+                    file_name = fileutils.file_name(url)
+                if file_name:
+                    # If it's a file, yield the url with its package info
+                    # for the mapper
+                    package_url = None
+                    version = None
+                    if "tar.gz" in file_name:
+                        version = file_name.replace("openssl-", "").partition(".tar.gz")[0]
+                        package_url = PackageURL(
+                            type="generic", name="openssl", version=version
+                        ).to_string()
+                    yield URI(
+                        uri=url,
+                        source_uri=self.uri,
+                        package_url=package_url,
+                        date=date,
+                        file_name=file_name,
+                        size=size,
+                    )
+                else:
+                    yield URI(uri=url, source_uri=self.uri, date=date, size=size)
+
+
+@map_router.route("https://ftp.openssl.org/.*")
+class OpenSSLMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield ScannedPackage built from the resource_uri record for a single
+        package version.
+        """
+        return build_packages(resource_uri, resource_uri.package_url)
+
+
+def build_packages(resource_uri, purl=None):
+    """
+    Yield a Package built from resource_uri metadata
+    resource_uri: ResourceURI object
+    purl: String value of the package url of the ResourceURI object
+    """
+    uri = resource_uri.uri
+    file_name = fileutils.file_name(uri)
+    version = (
+        file_name.replace(".tar.gz", "")
+        .replace("openssl-", "")
+        .replace(".asc", "")
+        .replace(".md5", "")
+        .replace(".sha1", "")
+        .replace(".sha256", "")
+    )
+    common_data = dict(
+        datasource_id="openssl_metadata",
+        type="generic",
+        name=file_name,
+        description="The OpenSSL Project is a collaborative effort to develop a robust, commercial-grade, fully featured, and Open Source toolkit implementing the Transport Layer Security (TLS) protocols (including SSLv3) as well as a full-strength general purpose cryptographic library.",
+        version=version,
+        size=resource_uri.size,
+        release_date=parse_date(resource_uri.last_modified_date),
+        extracted_license_statement="OpenSSL License",
+        license_detections=[],
+        homepage_url="https://www.openssl.org/",
+        download_url=uri,
+        copyright="Copyright (c) 1998-2018 The OpenSSL Project\nCopyright (c) 1995-1998 Eric A. Young, Tim J. Hudson\nAll rights reserved.",
+        vcs_url="git+https://github.com/openssl/openssl.git",
+        code_view_url="https://github.com/openssl/openssl",
+        bug_tracking_url="https://github.com/openssl/openssl/issues",
+    )
+    package = scan_models.Package.from_package_data(
+        package_data=common_data,
+        datafile_path=uri,
+    )
+    package.set_purl(purl)
+    yield package
diff --git a/minecode/miners/openwrt.py b/minecode/miners/openwrt.py
new file mode 100644
index 00000000..97d89214
--- /dev/null
+++ b/minecode/miners/openwrt.py
@@ -0,0 +1,175 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# purldb is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/purldb for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+# + +import gzip +import json +import logging +import os + +from bs4 import BeautifulSoup +from debian_inspector import debcon +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import debutils +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.collectors.debian import get_dependencies +from minecode.miners import URI +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.utils import extract_file + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +class OpenWrtSeed(seed.Seeder): + def get_seeds(self): + yield "https://downloads.openwrt.org/chaos_calmer/15.05/" + + +@visit_router.route("https://downloads.openwrt.org/.*/") +class OpenWrtDownloadPagesVisitor(HttpVisitor): + """Visit the OpwnWRT download HTML page and return URIs parsed from HTML page.""" + + def get_uris(self, content): + page = BeautifulSoup(content, "lxml") + for td in page.find_all(name="td"): + a = td.find(name="a") + if not a: + continue + href = a["href"] + if href == "../": # Ignore the parent url + continue + + # Add the uri for next loop if it ends with "/", which means it'a + # folder resource uri + if href.endswith("/"): + package_url = PackageURL( + type="openwrt", name=href.replace("/", "") + ).to_string() + yield URI( + uri=self.uri + href, package_url=package_url, source_uri=self.uri + ) + elif href.endswith(("Packages", "Packages.gz", ".ipk")): + yield URI(uri=self.uri + href, source_uri=self.uri) + + +@visit_router.route(r"https://downloads.openwrt.org/.*/Packages\.gz") +class OpenWrtPackageIndexVisitor(NonPersistentHttpVisitor): + """Visit the OpwnWRT Packages.gz Index file and collect uris.""" + + def get_uris(self, content): + with gzip.open(content, "rb") as f: + content = f.read() + + for package in debcon.get_paragraphs_data(content): + file_info = package.get("Filename") + if not file_info: + continue + version = package.get("Version") + md5sum = package.get("MD5Sum") + sha256sum = package.get("SHA256sum") + package_name = package.get("Package") + package_url = None + if package_name and version: + package_url = PackageURL( + type="openwrt", name=package_name, version=version + ).to_string() + file_info = file_info.lstrip("/") + dir_url = self.uri.replace("Packages.gz", "") + file_info + yield URI( + uri=dir_url, + package_url=package_url, + data=json.dumps(str(package)), + source_uri=self.uri, + md5=md5sum, + sha256=sha256sum, + ) + + +@visit_router.route(r"https://downloads.openwrt.org/.*\.ipk") +class OpenWrtIpkPackageArchiveVisitor(NonPersistentHttpVisitor): + """Visit the OpwnWRT Packages.gz and collect uris.""" + + def dumps(self, content): + """ + Extract an ipk package archive and its control.targ.gz. Parse the + control file and return a JSON string from these data. 
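+
+        For example (assumed control content), a control file holding
+        "Package: busybox" and "Version: 1.23.2-1" is returned as a JSON
+        mapping of those fields.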
+ """ + extracted_location = extract_file(content) + control_targz = os.path.join(extracted_location, "control.tar.gz") + control_extracted_folder = extract_file(control_targz) + control_location = os.path.join(control_extracted_folder, "control") + parsed = debcon.Debian822.from_file(control_location) + return json.dumps(parsed) + + +@map_router.route(r"https://downloads.openwrt.org/.*\.ipk") +class OpenwrtIpkMetadataMapper(Mapper): + def get_packages(self, uri, resource_uri): + """ + Yield ScannedPackage built from resource_uri record for a single package + version. Yield as many Package as there are download URLs. + """ + metadata = json.loads(resource_uri.data) + return build_packages(metadata, resource_uri.package_url, uri) + + +def build_packages(metadata, purl=None, uri=None): + """ + Yield ScannedPackage built from the passing metadata. + metadata: metadata mapping + purl: String value of the package url of the ResourceURI object + """ + common_data = dict( + type="openwrt", + datasource_id="openwrt_metadata", + name=metadata.get("Package"), + version=metadata.get("Version"), + description=metadata.get("Description"), + size=metadata.get("Installed-Size"), + ) + + dependencies = get_dependencies(metadata, ["Depends"]) + if dependencies: + common_data["dependencies"] = dependencies + + maintainers = metadata.get("Maintainer") + if maintainers: + name, email = debutils.parse_email(maintainers) + if name: + parties = common_data.get("parties") + if not parties: + common_data["parties"] = [] + party = scan_models.Party(name=name, role="maintainer", email=email) + common_data["parties"].append(party) + + lic = metadata.get("License") + if lic: + common_data["declared_license"] = lic + + common_data["keywords"] = [] + section = metadata.get("Section") + if section: + common_data["keywords"].append(section) + architecture = metadata.get("Architecture") + if architecture: + common_data["keywords"].append(architecture) + package = scan_models.Package.from_package_data( + package_data=common_data, + datafile_path=uri, + ) + package.set_purl(purl) + yield package diff --git a/minecode/miners/packagist.py b/minecode/miners/packagist.py new file mode 100644 index 00000000..b5e8e3b4 --- /dev/null +++ b/minecode/miners/packagist.py @@ -0,0 +1,152 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json + +from packagedcode import models as scan_models +from packagedcode.models import DependentPackage +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper +from minecode.utils import form_vcs_url + +""" +Collect packagist packages + +The packagist repo API is at: https://packagist.org/apidoc +""" + + +class PackagistSeed(seed.Seeder): + def get_seeds(self): + yield "https://packagist.org/packages/list.json" + + +@visit_router.route("https://packagist.org/packages/list.json") +class PackagistListVisitor(HttpJsonVisitor): + """ + Collect list json resource and yield URIs for searching with package url. 
+
+    The yielded URI format is like: https://packagist.org/p/[vendor]/[package].json
+    """
+
+    def get_uris(self, content):
+        search_url_template = "https://packagist.org/p/{vendor}/{package}.json"
+        packages_entries = content.get("packageNames", {})
+        for package in packages_entries:
+            # FIXME: what does it mean to have no / in the URL?
+            if "/" not in package:
+                continue
+            vp = package.split("/")
+            vendor = vp[0]
+            package = vp[1]
+            package_url = PackageURL(type="composer", name=package).to_string()
+            yield URI(
+                uri=search_url_template.format(vendor=vendor, package=package),
+                package_url=package_url,
+                source_uri=self.uri,
+            )
+
+
+@visit_router.route("https://packagist.org/p/.*json")
+class PackageVisitor(HttpJsonVisitor):
+    """Collect JSON for a package."""
+
+    # FIXME: what about having a download URL to fetch the real package???
+    pass
+
+
+@map_router.route("https://packagist.org/p/.*json")
+class PackagistPackageMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield Package built from resource_uri record for a single
+        package version.
+        Yield as many Packages as there are versions.
+        """
+        metadata = json.loads(resource_uri.data)
+        return build_packages_with_json(metadata, resource_uri.package_url, uri)
+
+
+def build_packages_with_json(metadata, purl=None, uri=None):
+    """
+    Yield Packages built from Packagist package JSON content.
+    metadata: json metadata content
+    purl: String value of the package url of the ResourceURI object
+    """
+    package = metadata.get("package")
+    if package:
+        primary_language = package.get("language")
+        for version_content in package.get("versions", {}).values():
+            common = dict(
+                datasource_id="php_composer_json",
+                type="composer",
+                name=version_content.get("name"),
+                description=version_content.get("description"),
+                primary_language=primary_language,
+            )
+            common["version"] = version_content.get("version")
+            common["keywords"] = version_content.get("keywords")
+            common["homepage_url"] = version_content.get("homepage")
+
+            source = version_content.get("source")
+            if source:
+                if source.get("type") == "git" and source.get("url"):
+                    common["vcs_url"] = form_vcs_url("git", source.get("url"))
+                else:
+                    pass  # Packagist only has the github repo
+
+            dist = version_content.get("dist")
+            if dist:
+                common["download_url"] = dist.get("url")
+                common["sha1"] = dist.get("shasum")
+
+            for author in version_content.get("authors", []):
+                parties = common.get("parties")
+                if not parties:
+                    common["parties"] = []
+                common["parties"].append(
+                    scan_models.Party(
+                        name=author.get("name"),
+                        role="author",
+                        url=author.get("homepage"),
+                        email=author.get("email"),
+                    ).to_dict()
+                )
+
+            extracted_license_statement = set()
+            for lic in version_content.get("license") or []:
+                extracted_license_statement.add(lic)
+            if extracted_license_statement:
+                common["extracted_license_statement"] = list(
+                    extracted_license_statement
+                )
+                common["license_detections"] = []
+
+            dependencies = []
+            for name, version in version_content.get("require", {}).items():
+                dependencies.append(
+                    DependentPackage(
+                        purl=name, extracted_requirement=version, scope="runtime"
+                    ).to_dict()
+                )
+            if dependencies:
+                common["dependencies"] = dependencies
+            # FIXME: We should create a composer package
+            package = scan_models.Package.from_package_data(
+                package_data=common,
+                datafile_path=uri,
+            )
+            package.set_purl(purl)
+            yield package
diff --git a/minecode/miners/pypi.py b/minecode/miners/pypi.py
new file mode 100644
index 00000000..9669ff19
--- /dev/null
+++ 
b/minecode/miners/pypi.py @@ -0,0 +1,275 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import codecs +import json +import xmlrpc + +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper +from minecode.miners import Visitor +from minecode.utils import get_temp_file +from minecode.utils import parse_date + +""" +Visitors for Pypi and Pypi-like Python package repositories. + +We have this hierarchy in Pypi: + index (xmlrpc) -> packages (json) -> package releases (json) -> download urls + +Pypi serves a main index via XMLRPC that contains a list of package names. +For each package, a JSON contains details including the list of all releases. +For each release, a JSON contains details for the released version and all the +downloads available for this release. We create Packages at this level as well +as one download URI for each effective download. + +Some information about every release and download is replicated in every JSON +payload and is ignored for simplicity (which is not super efficient). +""" + + +class PypiSeed(seed.Seeder): + def get_seeds(self): + yield "https://pypi.python.org/pypi/" + + +@visit_router.route("https://pypi.python.org/pypi/") +class PypiIndexVisitor(Visitor): + """Collect package metadata URIs from the top level pypi index for each package.""" + + def fetch(self, uri, timeout=None): + """Specialized fetching using XML RPCs.""" + packages = xmlrpc.client.ServerProxy(uri).list_packages() + content = list(packages) + + temp_file = get_temp_file("PypiIndexVisitor") + with codecs.open(temp_file, mode="wb", encoding="utf-8") as expect: + json.dump(content, expect, indent=2, separators=(",", ":")) + return temp_file + + def dumps(self, content): + """Return None as the content is huge json and should not be dumped.""" + return None + + def get_uris(self, content): + with codecs.open(content, mode="rb", encoding="utf-8") as contentfile: + packages_list = json.load(contentfile) + + url_template = "https://pypi.python.org/pypi/{name}/json" + for name in packages_list: + package_url = PackageURL(type="pypi", name=name).to_string() + yield URI( + uri=url_template.format(name=name), + package_url=package_url, + source_uri=self.uri, + ) + + +@visit_router.route("https://pypi.python.org/pypi/[^/]+/json") +class PypiPackageVisitor(HttpJsonVisitor): + """ + Collect package metadata URIs for all release of a single Pypi package. 
+    The url will contain only the package name, for example: https://pypi.org/pypi/vmock/json
+    By parsing the content, the goal is to form the json with version/release: https://pypi.org/pypi/vmock/0.1/json
+    """
+
+    def get_uris(self, content):
+        url_template = "https://pypi.python.org/pypi/{name}/{release}/json"
+        info = content.get("info", {})
+        name = info.get("name")
+        if name:
+            for release in content["releases"]:
+                package_url = PackageURL(
+                    type="pypi", name=name, version=release
+                ).to_string()
+                yield URI(
+                    uri=url_template.format(name=name, release=release),
+                    package_url=package_url,
+                    source_uri=self.uri,
+                )
+
+
+@visit_router.route("https://pypi.python.org/pypi/[^/]+/[^/]+/json")
+class PypiPackageReleaseVisitor(HttpJsonVisitor):
+    """
+    Collect package download URIs for all package archives of one Pypi package
+    release. The example is: https://pypi.org/pypi/vmock/0.1/json
+    """
+
+    def get_uris(self, content):
+        # TODO: this is likely best ignored entirely???
+        # A download_url may be provided for an off-Pypi-download
+        info = content.get("info", {})
+        name = info.get("name")
+        version = info.get("version")
+        download_url = info.get("download_url")
+        if download_url and download_url != "UNKNOWN":
+            package_url = PackageURL(
+                type="pypi", name=name, version=version
+            ).to_string()
+            yield URI(uri=download_url, package_url=package_url, source_uri=self.uri)
+
+        # Common on-Pypi-download URLs are in the urls block
+        for download in content.get("urls", []):
+            url = download.get("url")
+            if not url:
+                continue
+            package_url = PackageURL(
+                type="pypi", name=name, version=version
+            ).to_string()
+            yield URI(
+                url,
+                package_url=package_url,
+                file_name=download.get("filename"),
+                size=download.get("size"),
+                date=download.get("upload_time"),
+                md5=download.get("md5_digest"),
+                source_uri=self.uri,
+            )
+
+
+@map_router.route("https://pypi.python.org/pypi/[^/]+/[^/]+/json")
+class PypiPackageMapper(Mapper):
+    def get_packages(self, uri, resource_uri):
+        """
+        Yield ScannedPackages built from resource_uri record for a single
+        package version.
+        Yield as many Packages as there are download URLs.
+        """
+        # FIXME: JSON deserialization should be handled eventually by the framework
+        metadata = json.loads(resource_uri.data)
+        return build_packages(metadata, resource_uri.package_url)
+
+
+def build_packages(metadata, purl=None):
+    """
+    Yield ScannedPackage built from a Pypi `metadata` mapping
+    for a single package version.
+    Yield as many Packages as there are download URLs.
+
+    The metadata for a Pypi package has three main blocks: info, releases and
+    urls. Releases is redundant with urls and contains all download urls for
+    every release. It is repeated for each version-specific json: we ignore it
+    and use only info and urls. 
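+
+    For illustration, an assumed and trimmed payload looks like:
+
+        {"info": {"name": "vmock", "version": "0.1", "summary": "..."},
+         "urls": [{"url": "https://.../vmock-0.1.tar.gz", "size": 1234,
+                   "upload_time": "2010-01-01T00:00:00",
+                   "md5_digest": "..."}]}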
+ + purl: String value of the package url of the ResourceURI object + """ + info = metadata["info"] + # mapping of information that are common to all the downloads of a version + short_desc = info.get("summary") + long_desc = info.get("description") + descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] + description = "\n".join(descriptions) + common_data = dict( + name=info["name"], + version=info["version"], + description=description, + homepage_url=info.get("home_page"), + bug_tracking_url=info.get("bugtrack_url"), + ) + + author = info.get("author") + email = info.get("author_email") + if author or email: + parties = common_data.get("parties") + if not parties: + common_data["parties"] = [] + common_data["parties"].append( + scan_models.Party( + type=scan_models.party_person, name=author, role="author", email=email + ) + ) + + maintainer = info.get("maintainer") + email = info.get("maintainer_email") + if maintainer or email: + parties = common_data.get("parties") + if not parties: + common_data["parties"] = [] + common_data["parties"].append( + scan_models.Party( + type=scan_models.party_person, + name=maintainer, + role="maintainer", + email=email, + ) + ) + + extracted_license_statement = [] + lic = info.get("license") + if lic and lic != "UNKNOWN": + extracted_license_statement.append(lic) + + classifiers = info.get("classifiers") + if classifiers and not extracted_license_statement: + licenses = [lic for lic in classifiers if lic.lower().startswith("license")] + for lic in licenses: + extracted_license_statement.append(lic) + + common_data["extracted_license_statement"] = extracted_license_statement + + kw = info.get("keywords") + if kw: + common_data["keywords"] = [k.strip() for k in kw.split(",") if k.strip()] + + # FIXME: we should either support "extra" data in a ScannedPackage or just ignore this kind of FIXME comments for now + + # FIXME: not supported in ScanCode Package: info.platform may provide some platform infor (possibly UNKNOWN) + # FIXME: not supported in ScanCode Package: info.docs_url + # FIXME: not supported in ScanCode Package: info.release_url "http://pypi.python.org/pypi/Django/1.10b1" + # FIXME: not supported in ScanCode Package: info.classifiers: this contains a lot of other info (platform, license, etc) + # FIXME: if the homepage is on Github we can infer the VCS + # FIXME: info.requires_dist contains a list of requirements/deps that should be mapped to dependencies? + # FIXME: info.requires_python may be useful and should be mapped to some platform? 
+ # FIXME: Package Index Owner: seems to be only available on the web page + + # A download_url may be provided for off Pypi download: we yield a package if relevant + # FIXME: do not prioritize the download_url outside Pypi over actual exact Pypi donwload URL + download_url = info.get("download_url") + if download_url and download_url != "UNKNOWN": + download_data = dict( + datasource_id="pypi_sdist_pkginfo", + type="pypi", + download_url=download_url, + ) + download_data.update(common_data) + package = scan_models.PackageData.from_data(download_data) + # TODO: Consider creating a DatafileHandler for PyPI API metadata + package.datasource_id = "pypi_api_metadata" + package.set_purl(purl) + yield package + + # yield a package for each download URL + for download in metadata["urls"]: + url = download.get("url") + if not url: + continue + + download_data = dict( + download_url=url, + size=download.get("size"), + release_date=parse_date(download.get("upload_time")), + datasource_id="pypi_sdist_pkginfo", + type="pypi", + ) + # TODO: Check for other checksums + download_data["md5"] = download.get("md5_digest") + download_data.update(common_data) + package = scan_models.PackageData.from_data(download_data) + package.datasource_id = "pypi_api_metadata" + package.set_purl(purl) + yield package diff --git a/minecode/visitors/repodata.py b/minecode/miners/repodata.py similarity index 55% rename from minecode/visitors/repodata.py rename to minecode/miners/repodata.py index 87053e1d..4584222d 100644 --- a/minecode/visitors/repodata.py +++ b/minecode/miners/repodata.py @@ -2,9 +2,6 @@ # Copyright (c) nexB Inc. and others. All rights reserved. # -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals import logging import posixpath @@ -19,9 +16,7 @@ def remove_list_repetitions(input_list): - """ - Removes the repeated items in a list and returns a list with unique values - """ + """Remove the repeated items in a list and return a list with unique values""" output = [] for item in input_list: if item not in output: @@ -37,15 +32,18 @@ def combine_dicts_using_pkgid(all_dicts): """ all_package_info = [] for package_info in all_dicts: - if package_info['pkgid']: - all_package_info.append(combine_list_of_dicts( - [a for a in all_dicts if a['pkgid'] == package_info['pkgid']])) + if package_info["pkgid"]: + all_package_info.append( + combine_list_of_dicts( + [a for a in all_dicts if a["pkgid"] == package_info["pkgid"]] + ) + ) return remove_list_repetitions(all_package_info) def combine_list_of_dicts(input_dicts): """ - Combines a list of dictionaries and returns a single dictionary with all the + Combine a list of dictionaries and return a single dictionary with all the keys and values from all the dictionaries in the list. """ all_dict_items = [] @@ -63,19 +61,17 @@ def convert_tuples_to_dict(input, attr_name=None): infos = {} if input: if not attr_name: - attr_name = '' + attr_name = "" else: - attr_name = '_' + attr_name + attr_name = "_" + attr_name for attrib, value in input: infos[attrib + attr_name] = value return infos def is_absolute(url): - """ - Return 'True' if the URL is absolute. 
- """ - schemes = ('http://', 'ftp://', 'https://') + """Return 'True' if the URL is absolute.""" + schemes = ("http://", "ftp://", "https://") return url.startswith(schemes) @@ -86,8 +82,8 @@ def build_rpm_download_url(base_url, href): """ if is_absolute(href): return href - if href.startswith('/'): - href = href.lstrip('/') + if href.startswith("/"): + href = href.lstrip("/") return posixpath.join(base_url, href) @@ -115,15 +111,17 @@ def get_url_for_tag(location, data_type): """ repomd = etree.parse(location).getroot() - for data_tag in repomd.findall('{http://linux.duke.edu/metadata/repo}data'): + for data_tag in repomd.findall("{http://linux.duke.edu/metadata/repo}data"): for attrib, value in data_tag.items(): - if attrib == 'type' and value == data_type: + if attrib == "type" and value == data_type: download_location = data_tag.find( - '{http://linux.duke.edu/metadata/repo}location') + "{http://linux.duke.edu/metadata/repo}location" + ) relative_url_info = convert_tuples_to_dict( - download_location.items(), 'location') + download_location.items(), "location" + ) if relative_url_info: - return relative_url_info['href_location'] + return relative_url_info["href_location"] def get_value_from_tuple_pairs(tuples, key): @@ -134,7 +132,7 @@ def get_value_from_tuple_pairs(tuples, key): def filelistsxml_parser(location): """ - Parses filelists.xml file and yields the data needed to generate RPM objects. + Parse filelists.xml file and yield the data needed to generate RPM objects. @@ -153,31 +151,33 @@ def filelistsxml_parser(location): """ infos = [] filelistsxml = etree.parse(location).getroot() - for package in filelistsxml.findall('{http://linux.duke.edu/metadata/filelists}package'): - version = package.find( - '{http://linux.duke.edu/metadata/filelists}version') + for package in filelistsxml.findall( + "{http://linux.duke.edu/metadata/filelists}package" + ): + version = package.find("{http://linux.duke.edu/metadata/filelists}version") package_info = dict(package.items() + version.items()) directory_listing = package.findall( - '{http://linux.duke.edu/metadata/filelists}file') + "{http://linux.duke.edu/metadata/filelists}file" + ) directories = [] files = [] for name in directory_listing: items = name.items() if items: - file_type = get_value_from_tuple_pairs(items, 'type') - if file_type == 'dir': - directories.append({'name': name.text}) + file_type = get_value_from_tuple_pairs(items, "type") + if file_type == "dir": + directories.append({"name": name.text}) else: - files.append({'name': name.text}) - package_info['directories'] = directories - package_info['files'] = files + files.append({"name": name.text}) + package_info["directories"] = directories + package_info["files"] = files infos.append(package_info) return infos def primaryxml_parser(location): """ - Parses primary.xml file and yields the data needed to generate RPM objects. + Parse primary.xml file and yield the data needed to generate RPM objects. 
36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5 @@ -196,67 +196,54 @@ def primaryxml_parser(location): """ pkgs_infos = [] primaryxml = etree.parse(location).getroot() - for package in primaryxml.findall('{http://linux.duke.edu/metadata/common}package'): + for package in primaryxml.findall("{http://linux.duke.edu/metadata/common}package"): package_info = dict(package.items()) tags_infos = [] - description = package.find( - '{http://linux.duke.edu/metadata/common}description') - summary = package.find( - '{http://linux.duke.edu/metadata/common}summary') - packager = package.find( - '{http://linux.duke.edu/metadata/common}packager') - url = package.find('{http://linux.duke.edu/metadata/common}url') - size = package.find('{http://linux.duke.edu/metadata/common}size') - time = package.find('{http://linux.duke.edu/metadata/common}time') + description = package.find("{http://linux.duke.edu/metadata/common}description") + summary = package.find("{http://linux.duke.edu/metadata/common}summary") + packager = package.find("{http://linux.duke.edu/metadata/common}packager") + url = package.find("{http://linux.duke.edu/metadata/common}url") + size = package.find("{http://linux.duke.edu/metadata/common}size") + time = package.find("{http://linux.duke.edu/metadata/common}time") download_location = package.find( - '{http://linux.duke.edu/metadata/common}location') - checksum = package.find( - '{http://linux.duke.edu/metadata/common}checksum') - - rpm_format = package.find( - '{http://linux.duke.edu/metadata/common}format') - buildhost = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}buildhost') - rpm_group = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}group') + "{http://linux.duke.edu/metadata/common}location" + ) + checksum = package.find("{http://linux.duke.edu/metadata/common}checksum") + + rpm_format = package.find("{http://linux.duke.edu/metadata/common}format") + buildhost = rpm_format.find("{http://linux.duke.edu/metadata/rpm}buildhost") + rpm_group = rpm_format.find("{http://linux.duke.edu/metadata/rpm}group") header_range = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}header-range') - rpm_license = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}license') - rpm_vendor = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}vendor') - source_rpm = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}sourcerpm') - - package_info['description'] = get_tag_text(description) - package_info['summary'] = get_tag_text(summary) - package_info['url'] = get_tag_text(url) - package_info['checksum'] = get_tag_text(checksum) - package_info['pkgid'] = get_tag_text(checksum) - package_info['buildhost'] = get_tag_text(buildhost) - package_info['group'] = get_tag_text(rpm_group) - package_info['license'] = get_tag_text(rpm_license) - package_info['sourcerpm'] = get_tag_text(source_rpm) - tags_infos.append(convert_tuples_to_dict(packager.items(), 'packager')) - tags_infos.append(convert_tuples_to_dict(size.items(), 'size')) - tags_infos.append(convert_tuples_to_dict(time.items(), 'time')) + "{http://linux.duke.edu/metadata/rpm}header-range" + ) + rpm_license = rpm_format.find("{http://linux.duke.edu/metadata/rpm}license") + rpm_vendor = rpm_format.find("{http://linux.duke.edu/metadata/rpm}vendor") + source_rpm = rpm_format.find("{http://linux.duke.edu/metadata/rpm}sourcerpm") + + package_info["description"] = get_tag_text(description) + package_info["summary"] = get_tag_text(summary) + package_info["url"] = get_tag_text(url) + package_info["checksum"] = 
get_tag_text(checksum) + package_info["pkgid"] = get_tag_text(checksum) + package_info["buildhost"] = get_tag_text(buildhost) + package_info["group"] = get_tag_text(rpm_group) + package_info["license"] = get_tag_text(rpm_license) + package_info["sourcerpm"] = get_tag_text(source_rpm) + tags_infos.append(convert_tuples_to_dict(packager.items(), "packager")) + tags_infos.append(convert_tuples_to_dict(size.items(), "size")) + tags_infos.append(convert_tuples_to_dict(time.items(), "time")) tags_infos.append(convert_tuples_to_dict(download_location.items())) - tags_infos.append( - convert_tuples_to_dict(header_range.items(), 'header_range')) - tags_infos.append(convert_tuples_to_dict(rpm_vendor.items(), 'vendor')) - - requires = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}requires') - provides = rpm_format.find( - '{http://linux.duke.edu/metadata/rpm}provides') + tags_infos.append(convert_tuples_to_dict(header_range.items(), "header_range")) + tags_infos.append(convert_tuples_to_dict(rpm_vendor.items(), "vendor")) + + requires = rpm_format.find("{http://linux.duke.edu/metadata/rpm}requires") + provides = rpm_format.find("{http://linux.duke.edu/metadata/rpm}provides") if requires is not None: - required_rpms = [ - convert_tuples_to_dict(rpm.items()) for rpm in requires] - package_info['required_rpms'] = required_rpms + required_rpms = [convert_tuples_to_dict(rpm.items()) for rpm in requires] + package_info["required_rpms"] = required_rpms if provides is not None: - provided_rpms = [ - convert_tuples_to_dict(rpm.items()) for rpm in provides] - package_info['provided_rpms'] = provided_rpms + provided_rpms = [convert_tuples_to_dict(rpm.items()) for rpm in provides] + package_info["provided_rpms"] = provided_rpms package_info = combine_list_of_dicts([package_info] + tags_infos) pkgs_infos.append(package_info) @@ -265,7 +252,7 @@ def primaryxml_parser(location): def otherxml_parser(location): """ - Parses other.xml file and yields the data needed to generate RPM objects. + Parse other.xml file and yield the data needed to generate RPM objects. 
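The convert_tuples_to_dict() helper used above is not part of this patch; judging from calls such as convert_tuples_to_dict(download_location.items(), "location") producing an "href_location" key, it appears to suffix each attribute name with the given tag name. A plausible reconstruction, for reading purposes only, not the actual helper:

def convert_tuples_to_dict(tuples, name=None):
    # Inferred behavior: ("href", "repodata/primary.xml.gz") with
    # name="location" becomes {"href_location": "repodata/primary.xml.gz"};
    # without a name, the attribute names are kept as-is.
    result = {}
    for key, value in tuples or []:
        result[f"{key}_{name}" if name else key] = value
    return result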
@@ -280,28 +267,27 @@ def otherxml_parser(location): """ otherxml = etree.parse(location).getroot() infos = [] - for package in otherxml.findall('{http://linux.duke.edu/metadata/other}package'): - version = package.find('{http://linux.duke.edu/metadata/other}version') + for package in otherxml.findall("{http://linux.duke.edu/metadata/other}package"): + version = package.find("{http://linux.duke.edu/metadata/other}version") package_info = dict(package.items() + version.items()) - changelogs = package.findall( - '{http://linux.duke.edu/metadata/other}changelog') - package_info['changelogs'] = [] + changelogs = package.findall("{http://linux.duke.edu/metadata/other}changelog") + package_info["changelogs"] = [] for changelog in changelogs: if changelog.items(): change_info = convert_tuples_to_dict(changelog.items()) - change_info['changelog'] = changelog.text - package_info['changelogs'].append(change_info) + change_info["changelog"] = changelog.text + package_info["changelogs"].append(change_info) else: - package_info['changelogs'].append( - {'changelog': changelog.text}) + package_info["changelogs"].append({"changelog": changelog.text}) infos.append(package_info) return infos def get_pkg_infos(filelists_xml, primary_xml, other_xml): - primaryxml_dicts = primaryxml_parser(primary_xml) otherxml_dicts = otherxml_parser(other_xml) filelistsxml_dicts = filelistsxml_parser(filelists_xml) - return combine_dicts_using_pkgid(primaryxml_dicts + otherxml_dicts + filelistsxml_dicts) + return combine_dicts_using_pkgid( + primaryxml_dicts + otherxml_dicts + filelistsxml_dicts + ) diff --git a/minecode/visitors/repodata_rpms.py b/minecode/miners/repodata_rpms.py similarity index 50% rename from minecode/visitors/repodata_rpms.py rename to minecode/miners/repodata_rpms.py index 882285a3..f0ea9f32 100644 --- a/minecode/visitors/repodata_rpms.py +++ b/minecode/miners/repodata_rpms.py @@ -2,14 +2,11 @@ # Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. # -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals -from minecode import seed from minecode import rsync +from minecode import seed from minecode import visit_router -from minecode.visitors import URI +from minecode.miners import URI """ Collect YUM repositories index (aka. repodata) from CentOS, Fedora, openSUSE and @@ -17,23 +14,22 @@ """ rsync_urls = ( - 'rsync://mirrors.kernel.org/centos/', - 'rsync://yum.postgresql.org', - 'rsync://www.fedora.is/fedora/', - 'rsync://rsync.opensuse.org/', + "rsync://mirrors.kernel.org/centos/", + "rsync://yum.postgresql.org", + "rsync://www.fedora.is/fedora/", + "rsync://rsync.opensuse.org/", ) class RPMRepoDataSeed(seed.Seeder): - def get_seeds(self): - yield 'rsync://mirrors.kernel.org/centos/' - yield 'rsync://yum.postgresql.org' - yield 'rsync://www.fedora.is/fedora/' - yield 'rsync://rsync.opensuse.org/' + yield "rsync://mirrors.kernel.org/centos/" + yield "rsync://yum.postgresql.org" + yield "rsync://www.fedora.is/fedora/" + yield "rsync://rsync.opensuse.org/" -def collect_rsync_urls(directory_listing, base_url, file_names=('repomd.xml',)): +def collect_rsync_urls(directory_listing, base_url, file_names=("repomd.xml",)): """ Given an rsync URI that may contain files with path ending with any of the 'path_ends' tuple yield URIs using the 'base_url' as the base. 
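One detail worth noting when reading the hunk below: str.endswith() accepts a tuple of suffixes, which is why file_names defaults to the one-element tuple ("repomd.xml",) rather than a plain string. An illustrative check, with a made-up rsync entry path:

entry_path = "centos/7/os/x86_64/repodata/repomd.xml"  # hypothetical entry
file_names = ("repomd.xml",)
assert entry_path.endswith(file_names)  # the tuple form matches any listed suffix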
@@ -42,13 +38,21 @@ def collect_rsync_urls(directory_listing, base_url, file_names=('repomd.xml',)): for entry in rsync.directory_entries(directory_listing): # FIXME: why this assert? - assert not entry['path'].startswith('/') - if entry['path'].endswith(file_names): - entry = base_url + entry['path'] + assert not entry["path"].startswith("/") + if entry["path"].endswith(file_names): + entry = base_url + entry["path"] yield URI(uri=entry) @visit_router.route(*rsync_urls) -def collect_repomd_urls(uri, file_names=('repomd.xml',)): +def collect_repomd_urls(uri, file_names=("repomd.xml",)): directory_listing = rsync.fetch_directory(uri) - return collect_rsync_urls(directory_listing, base_url=uri.replace('rsync://', 'http://'), file_names=file_names), None, None + return ( + collect_rsync_urls( + directory_listing, + base_url=uri.replace("rsync://", "http://"), + file_names=file_names, + ), + None, + None, + ) diff --git a/minecode/miners/repomd.py b/minecode/miners/repomd.py new file mode 100644 index 00000000..a39217aa --- /dev/null +++ b/minecode/miners/repomd.py @@ -0,0 +1,126 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +import logging +import os + +from commoncode import fileutils +from packagedcode.models import PackageData +from packagedcode.rpm import EVR + +from minecode import map_router +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import repodata +from minecode.utils import extract_file +from minecode.utils import fetch_http +from minecode.utils import get_temp_file + +logger = logging.getLogger(__name__) + + +""" +Analyzes the "repomd.xml" of a given repository from the URL given as input +and generates a list of RPM objects +""" + + +def download(uri): + """ + Fetch the file at uri, saving it to a temp file and return the path to + this temp file. + """ + name = fileutils.file_name(uri) + file_ext = fileutils.file_extension(name) + name = name.replace(file_ext, "") + + content = fetch_http(uri) + temp_file = get_temp_file( + file_name="minecode-fetched-file-" + name, extension=file_ext + ) + with open(temp_file, "wb") as tmp: + tmp.write(content) + file_name = tmp.name + return file_name + + +def generate_rpm_objects(package_infos, base_url): + """Yield Packages from an iterable of RPM infos given a base_url.""" + # FIXME: what does package_infos mean? where does it come from? + for infos in package_infos: + package_data = dict( + # FIXME: need to add id back? this id is some hash that is local to the repo. + # id=infos.get('pkgid'), + type="rpm", + name=infos.get("name"), + version=EVR( + epoch=infos.get("epoch"), + version=infos.get("ver"), + release=infos.get("rel"), + ).to_string(), + description=infos.get("description"), + homepage_url=infos.get("url"), + download_url=repodata.build_rpm_download_url(base_url, infos.get("href")), + extracted_license_statement=infos.get("license", ""), + ) + package = PackageData.from_data(package_data) + if infos.get("source_rpm"): + src_rpm = PackageData(name=infos.get("source_rpm")) + package.related_packages = [src_rpm] + yield package + + +# TODO: refactor, this does not make sense, each is a different URI? 
+# FIXME: the doc and semantics are cryptic too + + +def fetch_repomd_subfile(base_url, repomd_xml, subfile): + """ + Download and extract a subfile ('filelists.xml.gz', 'primary.xml.gz', + 'other.xml.gz') of any repodata and return the subfile location. + """ + url = base_url + repodata.get_url_for_tag(repomd_xml, subfile) + target_location = extract_file(download(url)) + return os.path.join(target_location, os.listdir(target_location)[0]) + + +@visit_router.route(".+/repomd.xml") +def collect_rpm_packages_from_repomd(uri): + """Collect RPM data from yum repository repomd.xml.""" + base_url = fileutils.parent_directory(fileutils.parent_directory(uri)) + repomd_xml = download(uri) + + filelists_xml = fetch_repomd_subfile(base_url, repomd_xml, "filelists") + primary_xml = fetch_repomd_subfile(base_url, repomd_xml, "primary") + other_xml = fetch_repomd_subfile(base_url, repomd_xml, "other") + + pkg_infos = repodata.get_pkg_infos(filelists_xml, primary_xml, other_xml) + + rpms = list(generate_rpm_objects(pkg_infos, base_url)) + uris = [] + for rpm in rpms: + if rpm.download_url: + uris.append(URI(uri=rpm.download_url)) + return uris, json.dumps([r.to_dict() for r in rpms]), None + + +@map_router.route(".+/repomd.xml") +def map_repomd_data(uris, resource_uri): + """Return a list of PackageData objects built from visited repomd data.""" + if not resource_uri.data: + return + packages = [] + for pkg_data in json.loads(resource_uri.data): + # 'name' is required for every package + # FIXME: how could we obtain a package without a name??? + # FIXME: This cannot work unless we use **pkg_data + if pkg_data.get("name"): + packages.append(PackageData(pkg_data)) + return packages diff --git a/minecode/miners/rubygems.py b/minecode/miners/rubygems.py new file mode 100644 index 00000000..fb6ab2e8 --- /dev/null +++ b/minecode/miners/rubygems.py @@ -0,0 +1,418 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + + +import gzip +import json +import logging +import os + +import saneyaml +from packagedcode import models as scan_models +from packagedcode.models import DependentPackage +from packagedcode.models import PackageData +from packageurl import PackageURL +from rubymarshal import reader +from rubymarshal.classes import UsrMarshal + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor +from minecode.utils import extract_file +from minecode.utils import parse_date + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +# FIXME: we are missing several API calls: +# http://guides.rubygems.org/rubygems-org-api/ + + +class RubyGemsSeed(seed.Seeder): + def get_seeds(self): + # We keep only specs.4.8.gz and exclude latest_spec.4.8.gz, + # since specs.4.8.gz covers all uris in latest spec. 
+ yield "http://rubygems.org/specs.4.8.gz" + + +class GemVersion(UsrMarshal): + def version(self): + return self.values["version"] + + +@visit_router.route(r"https?://rubygems\.org/specs\.4\.8\.gz") +class RubyGemsIndexVisitor(NonPersistentHttpVisitor): + """Collect REST APIs URIs from RubyGems index file.""" + + def get_uris(self, content): + with gzip.open(content, "rb") as idx: + index = idx.read() + + # TODO: use a purl!!! + for name, version, platform in reader.loads(index): + json_url = "https://rubygems.org/api/v1/versions/{name}.json".format( + **locals() + ) + + package_url = PackageURL(type="gem", name=name).to_string() + yield URI(uri=json_url, package_url=package_url, source_uri=self.uri) + + # note: this list only has ever a single value + version = version.values[0] + if isinstance(version, bytes): + version = version.decode("utf-8") + + download_url = "https://rubygems.org/downloads/{name}-{version}" + + if isinstance(platform, bytes): + platform = platform.decode("utf-8") + if platform != "ruby": + download_url += "-{platform}" + + download_url += ".gem" + download_url = download_url.format(**locals()) + package_url = PackageURL(type="gem", name=name, version=version).to_string() + yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) + + +@visit_router.route(r"https?://rubygems\.org/api/v1/versions/[\w\-\.]+.json") +class RubyGemsApiManyVersionsVisitor(HttpJsonVisitor): + """ + Collect the json content of each version. + Yield the uri of each gem based on name, platform and version. + The data of the uri is the JSON subset for a single version. + """ + + def get_uris(self, content): + """Yield URI of the gems url and data.""" + # FIXME: return actual data too!!! + for version_details in content: + # get the gems name by parsing from the uri + name = self.uri[ + self.uri.index("/versions/") + len("/versions/") : -len(".json") + ] + version = version_details.get("number") + gem_name = f"{name}-{version}" + package_url = PackageURL(type="gem", name=name, version=version).to_string() + download_url = f"https://rubygems.org/downloads/{gem_name}.gem" + yield URI( + uri=download_url, + source_uri=self.uri, + package_url=package_url, + data=json.dumps(version_details), + ) + + +# TODO: add API dependencies +# https://rubygems.org/api/v1/dependencies.json?gems=file_validators +# Also use Use the V2 API at http://guides.rubygems.org/rubygems-org-api-v2/ +# GET - /api/v2/rubygems/[GEM NAME]/versions/[VERSION NUMBER].(json|yaml) + + +@visit_router.route(r"https?://rubygems.org/downloads/[\w\-\.]+.gem") +class RubyGemsPackageArchiveMetadataVisitor(NonPersistentHttpVisitor): + """Fetch a Rubygems gem archive, extract it and return its metadata file content.""" + + def dumps(self, content): + return get_gem_metadata(content) + + +def get_gem_metadata(location): + """ + Return the metadata file content as a string extracted from the gem archive + at `location`. + """ + # Extract the compressed file first. 
+ extracted_location = extract_file(location) + metadata_gz = os.path.join(extracted_location, "metadata.gz") + # Extract the embedded metadata gz file + extract_parent_location = extract_file(metadata_gz) + # Get the first file in the extracted folder, which is the metadata file location + meta_extracted_file = os.path.join( + extract_parent_location, os.listdir(extract_parent_location)[0] + ) + with open(meta_extracted_file) as meta_file: + return meta_file.read() + + +@map_router.route(r"https*://rubygems\.org/api/v1/versions/[\w\-\.]+.json") +class RubyGemsApiVersionsJsonMapper(Mapper): + """Mapper to build Rubygems Packages from JSON API data.""" + + def get_packages(self, uri, resource_uri): + metadata = json.loads(resource_uri.data) + _, sep, namejson = uri.partition("versions/") + if not sep: + return + name, sep, _ = namejson.rpartition(".json") + if not sep: + return + return build_rubygem_packages_from_api_data(metadata, name) + + +def build_rubygem_packages_from_api_data(metadata, name, purl=None): + """ + Yield Package built from resource_uri record for a single + package version. + metadata: json metadata content + name: package name + purl: String value of the package url of the ResourceURI object + """ + for version_details in metadata: + short_desc = version_details.get("summary") + long_desc = version_details.get("description") + if long_desc == short_desc: + long_desc = None + descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] + description = "\n".join(descriptions) + package = dict( + type="gem", + name=name, + description=description, + version=version_details.get("number"), + ) + # FIXME: we are missing deps and other data such as the download URL + + if version_details.get("sha"): + package["sha256"] = version_details.get("sha") + + package["release_date"] = ( + parse_date(version_details.get("created_at") or "") or None + ) + + author = version_details.get("authors") + if author: + parties = package.get("parties") + if not parties: + package["parties"] = [] + party = scan_models.Party(name=author, role="author") + package["parties"].append(party) + + extracted_license_statement = [] + licenses = version_details.get("licenses") + if licenses: + for lic in licenses: + extracted_license_statement.append(lic) + if extracted_license_statement: + package["extracted_license_statement"] = extracted_license_statement + package = PackageData.from_data(package) + package.set_purl(purl) + yield package + + +@map_router.route(r"https?://rubygems.org/downloads/[\w\-\.]+.gem") +class RubyGemsPackageArchiveMetadataMapper(Mapper): + """Mapper to build one Package from the metadata file found inside a gem.""" + + def get_packages(self, uri, resource_uri): + metadata = resource_uri.data + return build_rubygem_packages_from_metadata(metadata, download_url=uri) + + +def build_rubygem_packages_from_metadata(metadata, download_url=None, purl=None): + """ + Yield Package built from a Gem `metadata` YAML content + metadata: YAML metadata content + download_url: url to download the package + purl: String value of the package url of the ResourceURI object + """ + content = saneyaml.load(metadata) + if not content: + return + + name = content.get("name") + short_desc = content.get("summary") + long_desc = content.get("description") + if long_desc == short_desc: + long_desc = None + descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] + description = "\n".join(descriptions) + package = dict( + type="gem", + name=name, + description=description, + 
homepage_url=content.get("homepage"), + ) + if download_url: + package["download_url"] = download_url + + extracted_license_statement = [] + licenses = content.get("licenses") + if licenses: + for lic in licenses: + extracted_license_statement.append(lic) + if extracted_license_statement: + package["extracted_license_statement"] = extracted_license_statement + + authors = content.get("authors") or [] + for author in authors: + parties = package.get("parties") + if not parties: + package["parties"] = [] + party = scan_models.Party(name=author, role="author") + package["parties"].append(party) + + # Release date in the form of `2010-02-01 00:00:00 -05:00` + release_date = content.get("date", "").split() + package["release_date"] = parse_date(release_date[0]) + + package["dependencies"] = get_dependencies_from_meta(content) or [] + + # This is a two-level nested item + version1 = content.get("version") or {} + version = version1.get("version") or None + package["version"] = version + package = PackageData.from_data(package) + package.set_purl(purl) + yield package + + +def get_dependencies_from_meta(content): + """ + Return a list of DependentPackage objects based on the gem YAML + metadata data structure. + """ + dependencies = content.get("dependencies") or [] + if not dependencies: + return [] + + group = [] + for dependency in dependencies: + name = dependency.get("name") or None + if not name: + continue + + requirement = dependency.get("requirement") or {} + # FIXME when updating to the ScanCode package model + scope = dependency.get("type") + scope = scope and scope.lstrip(":") + + # note that as a weird artifact of our saneyaml YAML parsing, we are + # getting both identical requirements and version_requirements mapping. + # We ignore version_requirements + # requirement is {'requirements': [ + # [u'>=', {'version': '0'}] + # ] + # } + requirements = requirement.get("requirements") or [] + version_constraint = [] + + # each requirement is [u'>=', {'version': '0'}] + for constraint, req_version in requirements: + req_version = req_version.get("version") or None + # >= 0 allows for any version: we ignore these types of constraints + # as this is the same as no constraints. We also ignore lack of + # constraints and versions + if (constraint == ">=" and req_version == "0") or not ( + constraint and req_version + ): + continue + version_constraint.append(" ".join([constraint, req_version])) + version_constraint = ", ".join(version_constraint) or None + + group.append( + DependentPackage( + purl=name, extracted_requirement=version_constraint, scope=scope + ) + ) + + return group + + +def get_dependencies_from_api(content): + """ + Return a list of DependentPackage objects based on the RubyGems API + data structure. + """ + dependencies = content.get("dependencies") or [] + if not dependencies: + return [] + + group = [] + for dependency in dependencies: + name = dependency.get("name") or None + if not name: + continue + + requirement = dependency.get("requirement") or {} + scope = dependency.get("type") + scope = scope and scope.lstrip(":") + + # note that as a weird artifact of our saneyaml YAML parsing, we are + # getting both identical requirements and version_requirements mapping. 
+ # We ignore version_requirements + # requirement is {'requirements': [ + # [u'>=', {'version': '0'}] + # ] + # } + requirements = requirement.get("requirements") or [] + version_constraint = [] + # each requirement is [u'>=', {'version': '0'}] + for constraint, req_version in requirements: + req_version = req_version.get("version") or None + # >= 0 allows for any version: we ignore these types of constraints + # as this is the same as no constraints. We also ignore lack of + # constraints and versions + if (constraint == ">=" and req_version == "0") or not ( + constraint and req_version + ): + continue + version_constraint.append(" ".join([constraint, req_version])) + version_constraint = ", ".join(version_constraint) or None + + group.append( + DependentPackage( + purl=name, extracted_requirement=version_constraint, scope=scope + ) + ) + + return group + + +# Structure: {gem_spec: license.key} +LICENSES_MAPPING = { + "None": None, + "Apache 2.0": "apache-2.0", + "Apache License 2.0": "apache-2.0", + "Apache-2.0": "apache-2.0", + "Apache": "apache-2.0", + "GPL": "gpl-2.0", + "GPL-2": "gpl-2.0", + "GNU GPL v2": "gpl-2.0", + "GPLv2+": "gpl-2.0-plus", + "GPLv2": "gpl-2.0", + "GPLv3": "gpl-3.0", + "MIT": "mit", + "Ruby": "ruby", + "same as ruby's": "ruby", + "Ruby 1.8": "ruby", + "Artistic 2.0": "artistic-2.0", + "Perl Artistic v2": "artistic-2.0", + "2-clause BSDL": "bsd-simplified", + "BSD": "bsd-new", + "BSD-3": "bsd-new", + "ISC": "isc", + "SIL Open Font License": "ofl-1.0", + "New Relic": "new-relic", + "GPL2": "gpl-2.0", + "BSD-2-Clause": "bsd-simplified", + "BSD 2-Clause": "bsd-simplified", + "LGPL-3": "lgpl-3.0", + "LGPL-2.1+": "lgpl-2.1-plus", + "LGPLv2.1+": "lgpl-2.1-plus", + "LGPL": "lgpl", + "Unlicense": "unlicense", +} diff --git a/minecode/miners/sourceforge.py b/minecode/miners/sourceforge.py new file mode 100644 index 00000000..dd62397d --- /dev/null +++ b/minecode/miners/sourceforge.py @@ -0,0 +1,180 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +import logging +import re + +from bs4 import BeautifulSoup +from packagedcode import models as scan_models +from packageurl import PackageURL + +from minecode import map_router +from minecode import seed +from minecode import visit_router +from minecode.miners import URI +from minecode.miners import HttpJsonVisitor +from minecode.miners import HttpVisitor +from minecode.miners import Mapper +from minecode.miners import NonPersistentHttpVisitor + +logger = logging.getLogger(__name__) +handler = logging.StreamHandler() +logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +class SourceforgeSeed(seed.Seeder): + def get_seeds(self): + yield "https://sourceforge.net/sitemap.xml" + + +@visit_router.route("https?://sourceforge.net/sitemap.xml") +class SourceforgeSitemapIndexVisitor(NonPersistentHttpVisitor): + """ + Collect sub-sitemaps from the main sitemap. Return one URI for each sub- + sitemap, for example: https://sourceforge.net/sitemap-167.xml + + Note that the class inherits from NonPersistentHttpVisitor instead of HttpVisitor, + as the XML file itself can be over 100 MB, so NonPersistentHttpVisitor is more + appropriate. 
+ """ + + def get_uris(self, content): + """Collect all the sitemaps URIs from master sitemap.""" + locs = BeautifulSoup(open(content), "lxml").find_all("loc") + # Content passing from NonPersistentHttpVisitor is a temp file path + # instead of file content, so opening to get a file handler is + # necessary. + for loc in locs: + yield URI(uri=loc.text, source_uri=self.uri) + + +@visit_router.route(r"https?://sourceforge.net/sitemap-\d+.xml") +class SourceforgeSitemapPageVisitor(HttpVisitor): + def get_uris(self, content): + """Collect all the projects URIs from a sub-sitemaps.""" + sitemap_locs = BeautifulSoup(content, "lxml").find_all("loc") + regex = re.compile(r"^https?://sourceforge.net/projects/[a-z0-9.-]+/?$") + for loc in sitemap_locs: + if loc.text and re.match(regex, loc.text): + project_json_baseurl = ( + "https://sourceforge.net/api/project/name/{}/json" + ) + project_name = loc.text.partition("https://sourceforge.net/projects/")[ + -1 + ].strip("/") + project_json_url = project_json_baseurl.format(project_name) + package_url = PackageURL( + type="sourceforge", name=project_name + ).to_string() + # The priority in the xml has different view with the priority in visitor, so skip it. + yield URI( + uri=project_json_url, package_url=package_url, source_uri=self.uri + ) + + +@visit_router.route( + "https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json", + "https?://sourceforge.net/rest/p/[a-z0-9.-]+", +) +class SourceforgeProjectJsonVisitor(HttpJsonVisitor): + """ + Collect Sourceforge project data through the JSON API. + The implementation is empty since it will inherit the implementation from HttpJsonVisitor and it returns json data for mapper. + """ + + pass + + +@map_router.route( + "https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json", + "https?://sourceforge.net/rest/p/[a-z0-9.-]+", +) +class SourceforgeProjectJsonAPIMapper(Mapper): + def get_packages(self, uri, resource_uri): + """ + Yield Package built from resource_uri record for a single + package version. + Yield as many Package as there are download URLs. 
+ """ + metadata = json.loads(resource_uri.data) + return build_packages_from_metafile(metadata, resource_uri.package_url, uri) + + +def build_packages_from_metafile(metadata, purl=None, uri=None): + """ + Yield Package built from package a `metadata` content + metadata: json metadata content + purl: String value of the package url of the ResourceURI object + """ + short_desc = metadata.get("summary") + long_desc = metadata.get("short_description") + descriptions = [d for d in (short_desc, long_desc) if d and d.strip()] + description = "\n".join(descriptions) + name = metadata.get("shortname") + # short name is more reasonable here for name, since it's an abbreviation + # for the project and unique + if not name: + name = metadata.get("name") + if name: + common_data = dict( + datasource_id="sourceforge_metadata", + type="sourceforge", + name=metadata.get("shortname", metadata.get("name")), + description=description, + homepage_url=metadata.get("external_homepage", metadata.get("url")), + license_detections=[], + ) + + devs = metadata.get("developers") or [] + for dev in devs: + parties = common_data.get("parties") + if not parties: + common_data["parties"] = [] + if dev.get("name"): + common_data["parties"].append( + scan_models.Party( + name=dev.get("name"), role="contributor", url=dev.get("url") + ).to_dict() + ) + + categories = metadata.get("categories", {}) + languages = categories.get("language", []) + langs = [] + for lang in languages: + lshort = lang.get("shortname") + if lshort: + langs.append(lshort) + langs = ", ".join(langs) + common_data["primary_language"] = langs or None + + extracted_license_statement = [] + licenses = categories.get("license") or [] + for lic in licenses: + license_name = lic.get("fullname") + # full name is first priority than shortname since shortname is like gpl, it doesn't show detailed gpl version etc. + if license_name: + extracted_license_statement.append(lic.get("shortname")) + if license_name: + extracted_license_statement.append(license_name) + if extracted_license_statement: + common_data["extracted_license_statement"] = extracted_license_statement + + keywords = [] + topics = categories.get("topic", []) + for topic in topics: + keywords.append(topic.get("shortname")) + common_data["keywords"] = keywords or None + package = scan_models.Package.from_package_data( + package_data=common_data, + datafile_path=uri, + ) + package.set_purl(purl) + yield package diff --git a/minecode/visitors/ubuntu.py b/minecode/miners/ubuntu.py similarity index 78% rename from minecode/visitors/ubuntu.py rename to minecode/miners/ubuntu.py index 3a8d9e74..3736b0ac 100644 --- a/minecode/visitors/ubuntu.py +++ b/minecode/miners/ubuntu.py @@ -2,9 +2,6 @@ # Copyright (c) 2014 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
# -from __future__ import absolute_import -from __future__ import unicode_literals - # http://askubuntu.com/questions/139032/how-to-programmatically-fetch-a-list-of-applications-from-the-software-center # http://askubuntu.com/questions/112004/is-there-any-web-api-for-software-center-available diff --git a/minecode/model_utils.py b/minecode/model_utils.py index 74b2f975..16f79baf 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -2,10 +2,14 @@ import logging import sys -from minecode.models import ScannableURI +from django.utils import timezone + from commoncode import fileutils +from packagedcode.models import PackageData from packageurl import normalize_qualifiers +from minecode.models import ScannableURI +from minecode.utils import stringify_null_purl_fields from packagedb.models import DependentPackage from packagedb.models import Package from packagedb.models import PackageContentType @@ -14,9 +18,6 @@ from packagedb.models import Resource from packagedb.serializers import DependentPackageSerializer from packagedb.serializers import PartySerializer -from packagedcode.models import PackageData -from minecode.utils import stringify_null_purl_fields -from django.utils import timezone TRACE = False @@ -28,23 +29,25 @@ # These are the list of default pipelines to run when we scan a Package for # indexing DEFAULT_PIPELINES = ( - 'scan_single_package', - 'fingerprint_codebase', + "scan_single_package", + "fingerprint_codebase", ) # These are the list of supported addon pipelines to run when we scan a Package for # indexing. SUPPORTED_ADDON_PIPELINES = ( - 'collect_strings_gettext', - 'collect_symbols_ctags', - 'collect_symbols_pygments', - 'collect_symbols_tree_sitter', - 'inspect_elf_binaries', - 'scan_for_virus', + "collect_strings_gettext", + "collect_symbols_ctags", + "collect_symbols_pygments", + "collect_symbols_tree_sitter", + "inspect_elf_binaries", + "scan_for_virus", ) -def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, priority=0, reindex_uri=False): +def add_package_to_scan_queue( + package, pipelines=DEFAULT_PIPELINES, priority=0, reindex_uri=False +): """ Add a Package `package` to the scan queue to run the list of provided `pipelines` with a given `priority`. A ScannableURI with a `priority` of 100 @@ -53,7 +56,7 @@ def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, priority=0, If `reindex_uri` is True, force rescanning of the package """ if not pipelines: - raise Exception('pipelines required to add package to scan queue') + raise Exception("pipelines required to add package to scan queue") uri = package.download_url _, scannable_uri_created = ScannableURI.objects.get_or_create( uri=uri, @@ -63,7 +66,7 @@ def add_package_to_scan_queue(package, pipelines=DEFAULT_PIPELINES, priority=0, priority=priority, ) if scannable_uri_created: - logger.debug(' + Inserted ScannableURI\t: {}'.format(uri)) + logger.debug(f" + Inserted ScannableURI\t: {uri}") def merge_packages(existing_package, new_package_data, replace=False): @@ -84,22 +87,22 @@ def merge_packages(existing_package, new_package_data, replace=False): # We remove `purl` from `existing_mapping` because we use the other purl # fields (type, namespace, name, version, etc.) to generate the purl. 
- existing_mapping.pop('purl') + existing_mapping.pop("purl") # FIXME REMOVE this workaround when a ScanCode bug fixed with # https://github.com/aboutcode-org/scancode-toolkit/commit/9b687e6f9bbb695a10030a81be7b93c8b1d816c2 - qualifiers = new_package_data.get('qualifiers') + qualifiers = new_package_data.get("qualifiers") if isinstance(qualifiers, dict): # somehow we get an dict on the new value instead of a string # this not likely the best place to fix this - new_package_data['qualifiers'] = normalize_qualifiers(qualifiers, encode=True) + new_package_data["qualifiers"] = normalize_qualifiers(qualifiers, encode=True) new_mapping = new_package_data fields_to_skip = ( - 'package_uid', - 'declared_license_expression_spdx', - 'other_license_expression_spdx', + "package_uid", + "declared_license_expression_spdx", + "other_license_expression_spdx", ) updated_fields = [] @@ -107,10 +110,16 @@ def merge_packages(existing_package, new_package_data, replace=False): new_value = new_mapping.get(existing_field) if TRACE: logger.debug( - '\n'.join([ - 'existing_field:', repr(existing_field), - ' existing_value:', repr(existing_value), - ' new_value:', repr(new_value)]) + "\n".join( + [ + "existing_field:", + repr(existing_field), + " existing_value:", + repr(existing_value), + " new_value:", + repr(new_value), + ] + ) ) # FIXME: handle Booleans??? though there are none for now @@ -118,48 +127,52 @@ def merge_packages(existing_package, new_package_data, replace=False): # If the checksum from `new_package` is different than the one # existing checksum in `existing_package`, there is a big data # inconsistency issue and an Exception is raised - if (existing_field in ('md5', 'sha1', 'sha256', 'sha512') and - existing_value and - new_value and - existing_value != new_value): + if ( + existing_field in ("md5", "sha1", "sha256", "sha512") + and existing_value + and new_value + and existing_value != new_value + ): raise Exception( - '\n'.join([ - 'Mismatched {} for {}:'.format(existing_field, existing_package.uri), - ' existing_value: {}'.format(existing_value), - ' new_value: {}'.format(new_value) - ]) + "\n".join( + [ + f"Mismatched {existing_field} for {existing_package.uri}:", + f" existing_value: {existing_value}", + f" new_value: {new_value}", + ] + ) ) if not new_value: if TRACE: - logger.debug(' No new value: skipping') + logger.debug(" No new value: skipping") continue if not existing_value or replace: if TRACE and not existing_value: - logger.debug( - ' No existing value: set to new: {}'.format(new_value)) + logger.debug(f" No existing value: set to new: {new_value}") if TRACE and replace: - logger.debug( - ' Existing value and replace: set to new: {}'.format(new_value)) + logger.debug(f" Existing value and replace: set to new: {new_value}") - if existing_field == 'parties': + if existing_field == "parties": # If `existing_field` is `parties`, then we update the `Party` table parties = new_value existing_parties = Party.objects.filter(package=existing_package) - serialized_existing_parties = PartySerializer(existing_parties, many=True).data + serialized_existing_parties = PartySerializer( + existing_parties, many=True + ).data if replace: # Delete existing Party objects existing_parties.delete() for party in parties: _party, _created = Party.objects.get_or_create( package=existing_package, - type=party['type'], - role=party['role'], - name=party['name'], - email=party['email'], - url=party['url'], + type=party["type"], + role=party["role"], + name=party["name"], + email=party["email"], + 
url=party["url"], ) entry = dict( field=existing_field, @@ -168,23 +181,27 @@ def merge_packages(existing_package, new_package_data, replace=False): ) updated_fields.append(entry) continue - elif existing_field == 'dependencies': + elif existing_field == "dependencies": # If `existing_field` is `dependencies`, then we update the `DependentPackage` table dependencies = new_value - existing_dependencies = DependentPackage.objects.filter(package=existing_package) - serialized_existing_dependencies = DependentPackageSerializer(existing_dependencies, many=True).data + existing_dependencies = DependentPackage.objects.filter( + package=existing_package + ) + serialized_existing_dependencies = DependentPackageSerializer( + existing_dependencies, many=True + ).data if replace: # Delete existing DependentPackage objects existing_dependencies.delete() for dependency in dependencies: _dep, _created = DependentPackage.objects.get_or_create( package=existing_package, - purl=dependency['purl'], - extracted_requirement=dependency['extracted_requirement'], - scope=dependency['scope'], - is_runtime=dependency['is_runtime'], - is_optional=dependency['is_optional'], - is_resolved=dependency['is_resolved'], + purl=dependency["purl"], + extracted_requirement=dependency["extracted_requirement"], + scope=dependency["scope"], + is_runtime=dependency["is_runtime"], + is_optional=dependency["is_optional"], + is_resolved=dependency["is_resolved"], ) entry = dict( field=existing_field, @@ -193,9 +210,9 @@ def merge_packages(existing_package, new_package_data, replace=False): ) updated_fields.append(entry) continue - elif existing_field == 'package_content': + elif existing_field == "package_content": # get new_value from extra_data - new_value = new_mapping.extra_data.get('package_content') + new_value = new_mapping.extra_data.get("package_content") if not new_value: continue elif existing_field in fields_to_skip: @@ -206,9 +223,7 @@ def merge_packages(existing_package, new_package_data, replace=False): # `existing_field` is a regular field on the Package model and can # be updated normally. entry = dict( - field=existing_field, - old_value=existing_value, - new_value=new_value + field=existing_field, old_value=existing_value, new_value=new_value ) updated_fields.append(entry) setattr(existing_package, existing_field, new_value) @@ -216,7 +231,7 @@ def merge_packages(existing_package, new_package_data, replace=False): existing_package.save() if TRACE: - logger.debug(' Nothing done') + logger.debug(" Nothing done") return updated_fields @@ -235,30 +250,30 @@ def merge_or_create_package(scanned_package, visit_level, override=False): created = False merged = False package = None - map_error = '' + map_error = "" mining_level = visit_level if override: # this will force the data override - visit_level =+1 + visit_level = +1 if not isinstance(scanned_package, PackageData): - msg = 'Not a ScanCode PackageData type:' + repr(scanned_package) - map_error += msg + '\n' + msg = "Not a ScanCode PackageData type:" + repr(scanned_package) + map_error += msg + "\n" logger.error(msg) raise RuntimeError(msg) if not scanned_package.download_url: # TODO: there could be valid cases where we have no download URL # and still want to create a package??? 
- msg = 'No download_url for package:' + repr(scanned_package) - map_error += msg + '\n' + msg = "No download_url for package:" + repr(scanned_package) + map_error += msg + "\n" logger.error(msg) return package, created, merged, map_error package_uri = scanned_package.download_url - logger.debug('Package URI: {}'.format(package_uri)) - history = scanned_package.extra_data.get('history', []) + logger.debug(f"Package URI: {package_uri}") + history = scanned_package.extra_data.get("history", []) stored_package = None # Check if we already have an existing PackageDB record to update @@ -286,7 +301,8 @@ def merge_or_create_package(scanned_package, visit_level, override=False): updated_fields = merge_packages( existing_package=stored_package, new_package_data=scanned_package.to_dict(), - replace=False) + replace=False, + ) # for a foreign key, such as dependencies and parties, we will adopt the # same logic. In this case, parties or dependencies coming from a scanned # package are only added if there is no parties or dependencies in the @@ -300,7 +316,8 @@ def merge_or_create_package(scanned_package, visit_level, override=False): updated_fields = merge_packages( existing_package=stored_package, new_package_data=scanned_package.to_dict(), - replace=True) + replace=True, + ) # for a foreign key, such as dependencies and parties, we will adopt the # same logic. In this case, parties or dependencies coming from a scanned # package will override existing values. If there are parties in the scanned @@ -311,15 +328,17 @@ def merge_or_create_package(scanned_package, visit_level, override=False): if updated_fields: data = { - 'updated_fields': updated_fields, + "updated_fields": updated_fields, } - stored_package.append_to_history('Package field values have been updated.', data=data) + stored_package.append_to_history( + "Package field values have been updated.", data=data + ) # TODO: append updated_fields information to the package's history stored_package.last_modified_date = timezone.now() stored_package.save() - logger.debug(' + Updated package\t: {}'.format(package_uri)) + logger.debug(f" + Updated package\t: {package_uri}") package = stored_package merged = True @@ -337,7 +356,7 @@ def merge_or_create_package(scanned_package, visit_level, override=False): version=scanned_package.version, ) existing_related_package = existing_related_packages.first() - package_content = scanned_package.extra_data.get('package_content') + package_content = scanned_package.extra_data.get("package_content") package_data = dict( # FIXME: we should get the file_name in the @@ -380,7 +399,9 @@ def merge_or_create_package(scanned_package, visit_level, override=False): stringify_null_purl_fields(package_data) created_package = Package.objects.create(**package_data) - created_package.append_to_history('New Package created from URI: {}'.format(package_uri)) + created_package.append_to_history( + f"New Package created from URI: {package_uri}" + ) # This is used in the case of Maven packages created from the priority queue for h in history: @@ -388,12 +409,9 @@ def merge_or_create_package(scanned_package, visit_level, override=False): if existing_related_package: related_package_sets_count = existing_related_package.package_sets.count() - if ( - related_package_sets_count == 0 - or ( - related_package_sets_count > 0 - and created_package.package_content == PackageContentType.BINARY - ) + if related_package_sets_count == 0 or ( + related_package_sets_count > 0 + and created_package.package_content == PackageContentType.BINARY ): 
# Binary packages can only be part of one set package_set = PackageSet.objects.create() @@ -433,15 +451,15 @@ def merge_or_create_package(scanned_package, visit_level, override=False): created_package.save() package = created_package created = True - logger.debug(' + Inserted package\t: {}'.format(package_uri)) + logger.debug(f" + Inserted package\t: {package_uri}") return package, created, merged, map_error def update_or_create_resource(package, resource_data): """ - Using Resource data from `resource_data`, create or update the - corresponding purldb Resource from `package`. + Create or update the corresponding purldb Resource from `package` using + Resource data from `resource_data`. Return a 3-tuple of the corresponding purldb Resource of `resource_data`, `resource`, as well as booleans representing whether the Resource was @@ -450,9 +468,9 @@ def update_or_create_resource(package, resource_data): updated = False created = False resource = None - path = resource_data.get('path') + path = resource_data.get("path") - extra_data = copy.deepcopy(resource_data.get('extra_data', {})) + extra_data = copy.deepcopy(resource_data.get("extra_data", {})) extra_data.pop("directory_content", None) extra_data.pop("directory_structure", None) @@ -463,21 +481,21 @@ def update_or_create_resource(package, resource_data): resource = Resource( package=package, path=path, - is_file=resource_data.get('type') == 'file', - name=resource_data.get('name'), - extension=resource_data.get('extension'), - size=resource_data.get('size'), - md5=resource_data.get('md5'), - sha1=resource_data.get('sha1'), - sha256=resource_data.get('sha256'), - mime_type=resource_data.get('mime_type'), - file_type=resource_data.get('file_type'), - programming_language=resource_data.get('programming_language'), - is_binary=resource_data.get('is_binary'), - is_text=resource_data.get('is_text'), - is_archive=resource_data.get('is_archive'), - is_media=resource_data.get('is_media'), - is_key_file=resource_data.get('is_key_file'), + is_file=resource_data.get("type") == "file", + name=resource_data.get("name"), + extension=resource_data.get("extension"), + size=resource_data.get("size"), + md5=resource_data.get("md5"), + sha1=resource_data.get("sha1"), + sha256=resource_data.get("sha256"), + mime_type=resource_data.get("mime_type"), + file_type=resource_data.get("file_type"), + programming_language=resource_data.get("programming_language"), + is_binary=resource_data.get("is_binary"), + is_text=resource_data.get("is_text"), + is_archive=resource_data.get("is_archive"), + is_media=resource_data.get("is_media"), + is_key_file=resource_data.get("is_key_file"), extra_data=extra_data, ) created = True diff --git a/minecode/models.py b/minecode/models.py index 1e64d99c..a728b1af 100644 --- a/minecode/models.py +++ b/minecode/models.py @@ -7,10 +7,10 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from datetime import timedelta import logging import sys import uuid +from datetime import timedelta from django.conf import settings from django.db import models @@ -19,13 +19,11 @@ import django_rq from minecode import map_router -from minecode import visit_router # UnusedImport here! 
-# But importing the mappers and visitors module triggers routes registration -from minecode import mappers # NOQA -from minecode import visitors # NOQA - +# But importing the miners module triggers routes registration +from minecode import miners # NOQA +from minecode import visit_router from packagedb.models import Package logger = logging.getLogger(__name__) @@ -45,13 +43,11 @@ def get_canonical(uri): in the URI it is removed from the canonical output. """ import urlpy + normalized = urlpy.parse(uri).canonical().defrag().sanitize().punycode() # Taken from an old version of urlpy (latest does not have the PORTS dict # See: https://github.com/seomoz/url-py/blob/1d0efdda102cc48ce9dbcc41154296cea1d28c1f/url.py#L46 - PORTS = { - 'http': 80, - 'https': 443 - } + PORTS = {"http": 80, "https": 443} if normalized.port == PORTS.get(normalized.scheme, None): normalized.remove_default_port() return normalized.unicode @@ -62,46 +58,47 @@ class BaseURI(models.Model): A base abstract model to store URI for crawling, scanning and indexing. Also used as a processing "to do" queue for visiting and mapping these URIs. """ + uri = models.CharField( max_length=2048, db_index=True, - help_text='URI for this resource. This is the unmodified original URI.', + help_text="URI for this resource. This is the unmodified original URI.", ) canonical = models.CharField( max_length=3000, db_index=True, - help_text='Canonical form of the URI for this resource that must be ' - 'unique across all ResourceURI.', + help_text="Canonical form of the URI for this resource that must be " + "unique across all ResourceURI.", ) source_uri = models.CharField( max_length=2048, null=True, blank=True, - help_text='Optional: real source remote URI for this visit.' - 'For example for a package repository index is a typical source ' - 'via which a first level of package data is fetched. And it is ' - 'not the URI in the uri field. It is just the source of the fetch' - 'Or the source may be a mirror URI used for fetching.' + help_text="Optional: real source remote URI for this visit. " + "For example, for a package repository, the index is a typical source " + "via which a first level of package data is fetched. And it is " + "not the URI in the uri field. It is just the source of the fetch. " + "Or the source may be a mirror URI used for fetching.", ) priority = models.PositiveIntegerField( # Using default because NULL is ordered first on Postgres. default=0, db_index=True, - help_text='Absolute procdssing priority of a URI (default to zero), ' - 'higher number means higher priority, zero means lowest ' - 'priority.', + help_text="Absolute processing priority of a URI (default to zero), " + "higher number means higher priority, zero means lowest " + "priority.", ) wip_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Work In Progress. This is a timestamp set at the start of a ' - 'visit or mapping or indexing or null when no processing is ' - 'in progress.', + help_text="Work In Progress. 
This is a timestamp set at the start of a " + "visit or mapping or indexing or null when no processing is " + "in progress.", ) file_name = models.CharField( @@ -109,8 +106,8 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='File name of a resource sometimes part of the URI proper ' - 'and sometimes only available through an HTTP header.', + help_text="File name of a resource sometimes part of the URI proper " + "and sometimes only available through an HTTP header.", ) # FIXME: 2147483647 is the max size which means we cannot store more than 2GB files @@ -118,7 +115,7 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='Size in bytes of the file represented by this ResourceURI.', + help_text="Size in bytes of the file represented by this ResourceURI.", ) sha1 = models.CharField( @@ -126,8 +123,8 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='SHA1 checksum hex-encoded (as in the sha1sum command) of the ' - 'content of the file represented by this ResourceURI.', + help_text="SHA1 checksum hex-encoded (as in the sha1sum command) of the " + "content of the file represented by this ResourceURI.", ) md5 = models.CharField( @@ -135,8 +132,8 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='MD5 checksum hex-encoded (as in the md5sum command) of the ' - 'content of the file represented by this ResourceURI.', + help_text="MD5 checksum hex-encoded (as in the md5sum command) of the " + "content of the file represented by this ResourceURI.", ) sha256 = models.CharField( @@ -144,18 +141,18 @@ class BaseURI(models.Model): null=True, blank=True, db_index=True, - help_text='SHA256 checksum hex-encoded (as in the sha256sum command) of the ' - 'content of the file represented by this ResourceURI.', + help_text="SHA256 checksum hex-encoded (as in the sha256sum command) of the " + "content of the file represented by this ResourceURI.", ) last_modified_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the last modified date of the remote ' - 'resource represented by this URI such as the modified date ' - 'of a file, the lastmod value on a sitemap or the modified ' - 'date returned by an HTTP resource.', + help_text="Timestamp set to the last modified date of the remote " + "resource represented by this URI such as the modified date " + "of a file, the lastmod value on a sitemap or the modified " + "date returned by an HTTP resource.", ) class Meta: @@ -175,15 +172,14 @@ def normalize_fields(self, exclude=None): sha1 = self.sha1 if sha1 and len(sha1) != 40: logger.warning( - 'ResourceURI.normalize_fields() for URI: "{}" - ' - 'Invalid SHA1 length: "{}": SHA1 ignored!' - .format(self.uri, sha1)) + f'ResourceURI.normalize_fields() for URI: "{self.uri}" - ' + f'Invalid SHA1 length: "{sha1}": SHA1 ignored!' + ) self.sha1 = None # TODO: Use the QuerySet.as_manager() for more flexibility and chaining. 
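For readers unfamiliar with urlpy, the PORTS workaround in get_canonical() above exists because the canonical form would otherwise keep an explicit default port, so two equivalent URIs would canonicalize differently. A hedged illustration of the intent, with a made-up URI (the exact urlpy output is not verified here):

# Both spellings should canonicalize to the same string once the
# default port 80 is stripped for the http scheme.
a = get_canonical("http://example.com:80/pub/repodata/repomd.xml")
b = get_canonical("http://example.com/pub/repodata/repomd.xml")
assert a == b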
class ResourceURIManager(models.Manager): - def insert(self, uri, **extra_fields): """ Create and return a new ResourceURI after computing its canonical URI @@ -214,8 +210,7 @@ def needs_revisit(self, uri, hours): if existing: return False - revisitable = self.get_revisitables( - hours=hours).filter(uri=uri).exists() + revisitable = self.get_revisitables(hours=hours).filter(uri=uri).exists() if revisitable: return True else: @@ -236,15 +231,11 @@ def visited(self): return self.filter(wip_date__isnull=True, last_visit_date__isnull=False) def successfully_visited(self): - """ - Limit the QuerySet to ResourceURIs that were visited successfully. - """ + """Limit the QuerySet to ResourceURIs that were visited successfully.""" return self.visited().filter(has_visit_error=False) def unsuccessfully_visited(self): - """ - Limit the QuerySet to ResourceURIs that were visited with errors. - """ + """Limit the QuerySet to ResourceURIs that were visited with errors.""" return self.visited().filter(has_visit_error=True) def get_revisitables(self, hours): @@ -252,12 +243,11 @@ def get_revisitables(self, hours): Limit the QuerySet to ResourceURIs that have not been visited since the number of `hours`, and therefore considered revisitable. """ - revisitables = self.visited().filter( - last_visit_date__lt=timezone.now() - timedelta(hours=hours) - ).exclude( - is_mappable=True, last_map_date__isnull=True - ).exclude( - is_visitable=False + revisitables = ( + self.visited() + .filter(last_visit_date__lt=timezone.now() - timedelta(hours=hours)) + .exclude(is_mappable=True, last_map_date__isnull=True) + .exclude(is_visitable=False) ) return revisitables @@ -279,7 +269,7 @@ def get_visitables(self): visitables = never_visited # NOTE: this matches an index for efficient ordering - visitables = visitables.order_by('-priority', '-uri') + visitables = visitables.order_by("-priority", "-uri") return visitables def get_next_visitable(self): @@ -296,7 +286,6 @@ def get_next_visitable(self): ResourceURI. ResourceURI that have not yet been visited are sorted by decreasing priority. """ - # We use select_for_update to ensure an atomic query. We ignore # locked rows by using skip_locked=True available since Django # 1.11. @@ -318,7 +307,7 @@ def get_next_visitable(self): # Mark the URI as wip: Callers mark this done by resetting # wip_date to null resource_uri.wip_date = timezone.now() - resource_uri.save(update_fields=['wip_date']) + resource_uri.save(update_fields=["wip_date"]) return resource_uri def never_mapped(self): @@ -326,7 +315,9 @@ def never_mapped(self): Limit the QuerySet to ResourceURIs that have never been mapped. This is usually the state of a ResourceURI after its succesful visit. """ - return self.successfully_visited().filter(last_map_date__isnull=True, wip_date__isnull=True) + return self.successfully_visited().filter( + last_map_date__isnull=True, wip_date__isnull=True + ) def mapped(self): """ @@ -336,15 +327,11 @@ def mapped(self): return self.filter(wip_date__isnull=True, last_map_date__isnull=False) def successfully_mapped(self): - """ - Limit the QuerySet to ResourceURIs that were mapped successfully. - """ + """Limit the QuerySet to ResourceURIs that were mapped successfully.""" return self.mapped().filter(has_map_error=False) def unsuccessfully_mapped(self): - """ - Limit the QuerySet to ResourceURIs that were mapped with errors. 
- """ + """Limit the QuerySet to ResourceURIs that were mapped with errors.""" return self.mapped().filter(has_map_error=True) def get_mappables(self): @@ -355,7 +342,7 @@ def get_mappables(self): """ qs = self.never_mapped().filter(is_mappable__exact=True, has_map_error=False) # NOTE: this matches an index for efficient ordering - qs = qs.order_by('-priority') + qs = qs.order_by("-priority") return qs @@ -378,9 +365,9 @@ class ResourceURI(BaseURI): mining_level = models.PositiveIntegerField( default=0, - help_text='A numeric indication of the depth and breadth of data ' - 'collected through this ResourceURI visit. Higher means ' - 'more and deeper data.', + help_text="A numeric indication of the depth and breadth of data " + "collected through this ResourceURI visit. Higher means " + "more and deeper data.", ) # This is a text blob that contains either HTML, JSON or anything @@ -389,9 +376,9 @@ class ResourceURI(BaseURI): data = models.TextField( null=True, blank=True, - help_text='Text content of the file represented by this ' - 'ResourceURI. This contains the data that was fetched or ' - 'extracted from a remote ResourceURI such as HTML or JSON.', + help_text="Text content of the file represented by this " + "ResourceURI. This contains the data that was fetched or " + "extracted from a remote ResourceURI such as HTML or JSON.", ) package_url = models.CharField( @@ -399,105 +386,99 @@ class ResourceURI(BaseURI): null=True, blank=True, db_index=True, - help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""" + help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""", ) last_visit_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of the last visit. Used to track visit status.', + help_text="Timestamp set to the date of the last visit. Used to track visit status.", ) is_visitable = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'this URI is visitable in the sense that there is a visitor ' - 'route available to process it.' + help_text="When set to True (Yes), this field indicates that " + "this URI is visitable in the sense that there is a visitor " + "route available to process it.", ) has_visit_error = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'an error has occured when visiting this URI.' + help_text="When set to True (Yes), this field indicates that " + "an error has occured when visiting this URI.", ) visit_error = models.TextField( null=True, blank=True, - help_text='Visit errors messages. When present this means the visit failed.', + help_text="Visit errors messages. When present this means the visit failed.", ) last_map_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of the last mapping. ' - 'Used to track mapping status.', + help_text="Timestamp set to the date of the last mapping. " + "Used to track mapping status.", ) is_mappable = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'this URI is mappable in the sense that there is a mapper ' - 'route available to process it.' 
+ help_text="When set to True (Yes), this field indicates that " + "this URI is mappable in the sense that there is a mapper " + "route available to process it.", ) has_map_error = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'an error has occured when mapping this URI.' + help_text="When set to True (Yes), this field indicates that " + "an error has occured when mapping this URI.", ) map_error = models.TextField( null=True, blank=True, - help_text='Mapping errors messages. When present this means the mapping failed.', + help_text="Mapping errors messages. When present this means the mapping failed.", ) objects = ResourceURIManager() class Meta: - verbose_name = 'Resource URI' - unique_together = ['canonical', 'last_visit_date'] + verbose_name = "Resource URI" + unique_together = ["canonical", "last_visit_date"] indexes = [ # to get the next visitable models.Index( fields=[ - 'is_visitable', - 'last_visit_date', - 'wip_date', - 'has_visit_error', + "is_visitable", + "last_visit_date", + "wip_date", + "has_visit_error", ] ), # to get the next mappable models.Index( fields=[ - 'is_mappable', - 'last_visit_date', - 'wip_date', - 'last_map_date', - 'has_visit_error', - 'has_map_error', + "is_mappable", + "last_visit_date", + "wip_date", + "last_map_date", + "has_visit_error", + "has_map_error", ] ), # ordered by for the main queue query e.g. '-priority' - models.Index( - fields=[ - '-priority' - ] - ) + models.Index(fields=["-priority"]), ] def _set_defauts(self): - """ - Set defaults for computed fields. - """ + """Set defaults for computed fields.""" uri = self.uri if not self.canonical: self.canonical = get_canonical(uri) @@ -505,28 +486,24 @@ def _set_defauts(self): self.is_mappable = map_router.is_routable(uri) def save(self, *args, **kwargs): - """ - Save, adding defaults for computed fields and validating fields. - """ + """Save, adding defaults for computed fields and validating fields.""" self._set_defauts() self.normalize_fields() self.has_map_error = True if self.map_error else False self.has_visit_error = True if self.visit_error else False - super(ResourceURI, self).save(*args, **kwargs) + super().save(*args, **kwargs) class ScannableURIManager(models.Manager): - def get_scannables(self): """ Return an ordered query set of all scannable ScannableURIs. Note: this does not evaluate the query set and does not lock the database for update. """ - qs = self.filter(scan_status__exact=ScannableURI.SCAN_NEW, - scan_error=None) + qs = self.filter(scan_status__exact=ScannableURI.SCAN_NEW, scan_error=None) # NOTE: this matches an index for efficient ordering - qs = qs.order_by('-priority') + qs = qs.order_by("-priority") return qs def get_next_scannable(self): @@ -585,7 +562,7 @@ def __get_next_candidate(self, qs): # Mark the URI as wip: Callers mark this done by resetting # wip_date to null canidate_uri.wip_date = timezone.now() - canidate_uri.save(update_fields=['wip_date']) + canidate_uri.save(update_fields=["wip_date"]) return canidate_uri def get_processables(self): @@ -596,15 +573,17 @@ def get_processables(self): Note: this does not evaluate the query set and does not lock the database for update. 
""" - qs = self.filter(scan_status__in=[ - ScannableURI.SCAN_SUBMITTED, - ScannableURI.SCAN_IN_PROGRESS, - ScannableURI.SCAN_COMPLETED - ], - wip_date=None, scan_error=None, + qs = self.filter( + scan_status__in=[ + ScannableURI.SCAN_SUBMITTED, + ScannableURI.SCAN_IN_PROGRESS, + ScannableURI.SCAN_COMPLETED, + ], + wip_date=None, + scan_error=None, ) # NOTE: this matches an index for efficient ordering - qs = qs.order_by('-scan_status', '-priority') + qs = qs.order_by("-scan_status", "-priority") return qs def get_next_processable(self): @@ -619,33 +598,38 @@ def get_next_processable(self): return self.__get_next_candidate(self.get_processables()) def statistics(self): - """ - Return a statistics mapping with summary counts of ScannableURI grouped by status. - """ - statuses = list(self.values('scan_status').annotate( - count=models.Count('scan_status')).order_by('scan_status'),) + """Return a statistics mapping with summary counts of ScannableURI grouped by status.""" + statuses = list( + self.values("scan_status") + .annotate(count=models.Count("scan_status")) + .order_by("scan_status"), + ) for stat in statuses: - stat['scan_status'] = ScannableURI.SCAN_STATUSES_BY_CODE[stat['scan_status']] + stat["scan_status"] = ScannableURI.SCAN_STATUSES_BY_CODE[ + stat["scan_status"] + ] stats = { - 'total': self.count(), - 'processables': self.get_processables().count(), - 'scannables': self.get_scannables().count(), - 'by_status': statuses, + "total": self.count(), + "processables": self.get_processables().count(), + "scannables": self.get_scannables().count(), + "by_status": statuses, } most_recent = dict( - most_recent_submitted=self._recent( - scan_status=ScannableURI.SCAN_SUBMITTED), - most_recent_indexed=self._recent( - scan_status=ScannableURI.SCAN_INDEXED), + most_recent_submitted=self._recent(scan_status=ScannableURI.SCAN_SUBMITTED), + most_recent_indexed=self._recent(scan_status=ScannableURI.SCAN_INDEXED), most_recent_failed=self._recent( - scan_status=ScannableURI.SCAN_FAILED, extra_value="scan_error",), + scan_status=ScannableURI.SCAN_FAILED, + extra_value="scan_error", + ), most_recent_in_progress=self._recent( - scan_status=ScannableURI.SCAN_IN_PROGRESS), - most_recent_completed=self._recent( - scan_status=ScannableURI.SCAN_COMPLETED), + scan_status=ScannableURI.SCAN_IN_PROGRESS + ), + most_recent_completed=self._recent(scan_status=ScannableURI.SCAN_COMPLETED), most_recent_index_errors=self._recent( - scan_status=ScannableURI.SCAN_INDEX_FAILED, extra_value="index_error",), + scan_status=ScannableURI.SCAN_INDEX_FAILED, + extra_value="index_error", + ), ) stats.update(most_recent) return stats @@ -656,8 +640,9 @@ def _recent(self, scan_status, extra_value=None, most_recent=10): ``scan_status``. Include an optional ``extra value`` field name. 
""" - recent_uris = self.filter(scan_status=scan_status).order_by( - '-scan_date')[:most_recent] + recent_uris = self.filter(scan_status=scan_status).order_by("-scan_date")[ + :most_recent + ] for scauri in recent_uris: recent = dict( # this is NOT a field requiring this loop @@ -702,6 +687,7 @@ class ScannableURI(BaseURI): - update the matching index for the PackageDB as needed with fingerprints from the scan - set status and timestamps as needed """ + uuid = models.UUIDField( default=uuid.uuid4, unique=True, @@ -712,14 +698,14 @@ class ScannableURI(BaseURI): null=True, blank=True, db_index=True, - help_text='Timestamp set to the date when a scan was taken by a worker', + help_text="Timestamp set to the date when a scan was taken by a worker", ) pipelines = models.JSONField( default=list, blank=True, editable=False, - help_text='A list of ScanCode.io pipeline names to be run for this scan', + help_text="A list of ScanCode.io pipeline names to be run for this scan", ) SCAN_NEW = 0 @@ -732,53 +718,51 @@ class ScannableURI(BaseURI): SCAN_INDEX_FAILED = 7 SCAN_STATUS_CHOICES = [ - (SCAN_NEW, 'new'), - (SCAN_SUBMITTED, 'submitted'), - (SCAN_IN_PROGRESS, 'in progress'), - (SCAN_COMPLETED, 'scanned'), - (SCAN_INDEXED, 'indexed'), - (SCAN_FAILED, 'failed'), - (SCAN_TIMEOUT, 'timeout'), - (SCAN_INDEX_FAILED, 'scan index failed'), + (SCAN_NEW, "new"), + (SCAN_SUBMITTED, "submitted"), + (SCAN_IN_PROGRESS, "in progress"), + (SCAN_COMPLETED, "scanned"), + (SCAN_INDEXED, "indexed"), + (SCAN_FAILED, "failed"), + (SCAN_TIMEOUT, "timeout"), + (SCAN_INDEX_FAILED, "scan index failed"), ] SCAN_STATUSES_BY_CODE = dict(SCAN_STATUS_CHOICES) SCAN_STATUS_CODES_BY_SCAN_STATUS = { - status: code - for code, status - in SCAN_STATUS_CHOICES + status: code for code, status in SCAN_STATUS_CHOICES } scan_status = models.IntegerField( default=SCAN_NEW, choices=SCAN_STATUS_CHOICES, db_index=True, - help_text='Status of the scan for this URI.', + help_text="Status of the scan for this URI.", ) reindex_uri = models.BooleanField( default=False, null=True, blank=True, - help_text='Flag indicating whether or not this URI should be rescanned and reindexed.', + help_text="Flag indicating whether or not this URI should be rescanned and reindexed.", ) scan_error = models.TextField( null=True, blank=True, - help_text='Scan errors messages. When present this means the scan failed.', + help_text="Scan errors messages. When present this means the scan failed.", ) index_error = models.TextField( null=True, blank=True, - help_text='Indexing errors messages. When present this means the indexing failed.', + help_text="Indexing errors messages. When present this means the indexing failed.", ) package = models.ForeignKey( Package, - help_text='The Package that this ScannableURI is for', + help_text="The Package that this ScannableURI is for", on_delete=models.CASCADE, null=False, ) @@ -786,38 +770,29 @@ class ScannableURI(BaseURI): objects = ScannableURIManager() class Meta: - verbose_name = 'Scannable URI' + verbose_name = "Scannable URI" indexes = [ # to get the scannables models.Index( fields=[ - 'scan_status', - 'scan_date', + "scan_status", + "scan_date", ] ), # ordered by for the main queue query e.g. '-priority' - models.Index( - fields=[ - '-priority' - ] - ) + models.Index(fields=["-priority"]), ] def save(self, *args, **kwargs): - """ - Save, adding defaults for computed fields and validating fields. 
- """ + """Save, adding defaults for computed fields and validating fields.""" if not self.canonical: self.canonical = get_canonical(self.uri) self.normalize_fields() - super(ScannableURI, self).save(*args, **kwargs) + super().save(*args, **kwargs) def process_scan_results( - self, - scan_results_location, - scan_summary_location, - project_extra_data + self, scan_results_location, scan_summary_location, project_extra_data ): from minecode import tasks @@ -846,7 +821,6 @@ def process_scan_results( # TODO: Use the QuerySet.as_manager() for more flexibility and chaining. class PriorityResourceURIManager(models.Manager): - def insert(self, uri, **extra_fields): """ Create and return a new PriorityResourceURI after computing its canonical URI @@ -856,26 +830,17 @@ def insert(self, uri, **extra_fields): """ # TODO: be able to create a request for an existing purl if the previous request has been completed already - priority_resource_uris = self.filter( - uri=uri, - package_url=uri, - **extra_fields - ) - if ( - priority_resource_uris.count() == 0 - or all(p.processed_date for p in priority_resource_uris) + priority_resource_uris = self.filter(uri=uri, package_url=uri, **extra_fields) + if priority_resource_uris.count() == 0 or all( + p.processed_date for p in priority_resource_uris ): priority_resource_uri = self.create( - uri=uri, - package_url=uri, - **extra_fields + uri=uri, package_url=uri, **extra_fields ) return priority_resource_uri def in_progress(self): - """ - Limit the QuerySet to PriorityResourceURI being processed. - """ + """Limit the QuerySet to PriorityResourceURI being processed.""" return self.filter(wip_date__isnull=False) def never_processed(self): @@ -883,17 +848,12 @@ def never_processed(self): Limit the QuerySet to PriorityResourceURIs that have never been processed. This is usually the state of a PriorityResourceURI after upon creation. """ - return self.filter( - processed_date__isnull=True, - wip_date__isnull=True - ).order_by( - 'request_date' + return self.filter(processed_date__isnull=True, wip_date__isnull=True).order_by( + "request_date" ) def get_requests(self): - """ - Return an ordered query set of all processable PriorityResourceURIs. - """ + """Return an ordered query set of all processable PriorityResourceURIs.""" never_processed = self.never_processed() return never_processed @@ -907,12 +867,13 @@ def get_next_request(self): NOTE: this method can only be called from within a transaction.atomic block. """ - priority_resource_uri = self.get_requests( - ).select_for_update(skip_locked=True).first() + priority_resource_uri = ( + self.get_requests().select_for_update(skip_locked=True).first() + ) if not priority_resource_uri: return priority_resource_uri.wip_date = timezone.now() - priority_resource_uri.save(update_fields=['wip_date']) + priority_resource_uri.save(update_fields=["wip_date"]) return priority_resource_uri @@ -937,15 +898,15 @@ class PriorityResourceURI(BaseURI): max_length=2048, null=True, blank=True, - help_text='URI for this resource. This is the unmodified original URI.', + help_text="URI for this resource. 
This is the unmodified original URI.", ) canonical = models.CharField( max_length=3000, null=True, blank=True, - help_text='Canonical form of the URI for this resource that must be ' - 'unique across all ResourceURI.', + help_text="Canonical form of the URI for this resource that must be " + "unique across all ResourceURI.", ) # This is a text blob that contains either HTML, JSON or anything @@ -954,9 +915,9 @@ class PriorityResourceURI(BaseURI): data = models.TextField( null=True, blank=True, - help_text='Text content of the file represented by this ' - 'ResourceURI. This contains the data that was fetched or ' - 'extracted from a remote ResourceURI such as HTML or JSON.', + help_text="Text content of the file represented by this " + "ResourceURI. This contains the data that was fetched or " + "extracted from a remote ResourceURI such as HTML or JSON.", ) package_url = models.CharField( @@ -964,59 +925,56 @@ null=True, blank=True, db_index=True, - help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""" + help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""", ) request_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this Package info was requested.', + help_text="Timestamp set to the date of when this Package info was requested.", ) processed_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this Package info was requested.', + help_text="Timestamp set to the date of when this Package info was processed.", ) has_processing_error = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'an error has occured when processing this URI.' + help_text="When set to True (Yes), this field indicates that " + "an error has occurred when processing this URI.", ) processing_error = models.TextField( null=True, blank=True, - help_text='Processing errors messages. When present this means the processing failed.', + help_text="Processing error messages. When present this means the processing failed.", ) addon_pipelines = models.JSONField( default=list, blank=True, editable=False, - help_text='A list of addon ScanCode.io pipeline to run.', + help_text="A list of addon ScanCode.io pipelines to run.", ) objects = PriorityResourceURIManager() class Meta: - verbose_name = 'Priority Resource URI' + verbose_name = "Priority Resource URI" def save(self, *args, **kwargs): - """ - Save, adding defaults for computed fields and validating fields. - """ + """Save, adding defaults for computed fields and validating fields.""" self.normalize_fields() - super(PriorityResourceURI, self).save(*args, **kwargs) + super().save(*args, **kwargs) # TODO: Use the QuerySet.as_manager() for more flexibility and chaining.
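# A minimal worker-side sketch (an editorial example, not part of this diff) of
# the claim-and-process queue pattern used by PriorityResourceURIManager above.
# get_next_request() locks a row with select_for_update(skip_locked=True), so it
# must run inside a transaction.atomic block; handle_request() below is a
# hypothetical callback standing in for the real processing step, and the wip
# claim is released by clearing wip_date once the request is done.
from django.db import transaction
from django.utils import timezone

from minecode.models import PriorityResourceURI


def process_one_priority_request(handle_request):
    with transaction.atomic():
        request = PriorityResourceURI.objects.get_next_request()
    if not request:
        return  # nothing queued
    try:
        handle_request(request.package_url)
    finally:
        # mark the request as processed and release the wip claim
        request.processed_date = timezone.now()
        request.wip_date = None
        request.save(update_fields=["processed_date", "wip_date"])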
class ImportableURIManager(models.Manager): - def insert(self, uri, data, package_url, **extra_fields): """ Create and return a new ImportableURI @@ -1024,26 +982,17 @@ """ # TODO: be able to create a request for an existing purl if the previous request has been completed already - importable_uris = self.filter( - uri=uri, - **extra_fields - ) - if ( - importable_uris.count() == 0 - or all(p.processed_date for p in importable_uris) + importable_uris = self.filter(uri=uri, **extra_fields) + if importable_uris.count() == 0 or all( + p.processed_date for p in importable_uris ): importable_uri = self.create( - uri=uri, - data=data, - package_url=package_url, - **extra_fields + uri=uri, data=data, package_url=package_url, **extra_fields ) return importable_uri def in_progress(self): - """ - Limit the QuerySet to ImportableURI being processed. - """ + """Limit the QuerySet to ImportableURIs being processed.""" return self.filter(wip_date__isnull=False) def never_processed(self): @@ -1051,17 +1000,12 @@ Limit the QuerySet to ImportableURIs that have never been processed. This is usually the state of an ImportableURI upon creation. """ - return self.filter( - processed_date__isnull=True, - wip_date__isnull=True - ).order_by( - 'request_date' + return self.filter(processed_date__isnull=True, wip_date__isnull=True).order_by( + "request_date" ) def get_requests(self): - """ - Return an ordered query set of all processable ImportableURIs. - """ + """Return an ordered query set of all processable ImportableURIs.""" never_processed = self.never_processed() return never_processed @@ -1079,9 +1023,10 @@ def get_next_request(self): if not importable_uri: return importable_uri.wip_date = timezone.now() - importable_uri.save(update_fields=['wip_date']) + importable_uri.save(update_fields=["wip_date"]) return importable_uri + # TODO: have a second queue for crawling maven repo, that tracks which pages and namespaces we visited # when we hit the point of a package page, we add it to the queue that creates skinny packages for the package we visited. @@ -1092,7 +1037,7 @@ class ImportableURI(BaseURI): null=True, blank=True, db_index=True, - help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""" + help_text="""Package URL for this resource. It stands for a package "mostly universal" URL.""", ) # This is a text blob that contains either HTML, JSON or anything @@ -1101,49 +1046,47 @@ data = models.TextField( null=True, blank=True, - help_text='Text content of the file represented by this ' - 'ResourceURI. This contains the data that was fetched or ' - 'extracted from a remote ResourceURI such as HTML or JSON.', + help_text="Text content of the file represented by this " + "ResourceURI.
This contains the data that was fetched or " + "extracted from a remote ResourceURI such as HTML or JSON.", ) request_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this Package info was requested.', + help_text="Timestamp set to the date of when this Package info was requested.", ) processed_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this Package info was processed.', + help_text="Timestamp set to the date of when this Package info was processed.", ) has_processing_error = models.BooleanField( db_index=True, default=False, - help_text='When set to True (Yes), this field indicates that ' - 'an error has occured when processing this URI.' + help_text="When set to True (Yes), this field indicates that " + "an error has occurred when processing this URI.", ) processing_error = models.TextField( null=True, blank=True, - help_text='Processing errors messages. When present this means the processing failed.', + help_text="Processing error messages. When present this means the processing failed.", ) objects = ImportableURIManager() class Meta: - verbose_name = 'Importable URI' + verbose_name = "Importable URI" def save(self, *args, **kwargs): - """ - Save, adding defaults for computed fields and validating fields. - """ + """Save, adding defaults for computed fields and validating fields.""" self.normalize_fields() - super(ImportableURI, self).save(*args, **kwargs) + super().save(*args, **kwargs) class ProcessingError(BaseURI): @@ -1151,21 +1094,19 @@ max_length=100, null=True, blank=True, - help_text='The name of the service running where the error occured.' + help_text="The name of the service running where the error occurred.", ) date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text='Timestamp set to the date of when this error occured.', + help_text="Timestamp set to the date of when this error occurred.", ) error_message = models.TextField( - null=True, - blank=True, - help_text='The message associated with this error' + null=True, blank=True, help_text="The message associated with this error" ) class Meta: - verbose_name = 'Processing Error' + verbose_name = "Processing Error" diff --git a/minecode/permissions.py b/minecode/permissions.py index 83815cea..f4a3ce65 100644 --- a/minecode/permissions.py +++ b/minecode/permissions.py @@ -2,8 +2,7 @@ class IsScanQueueWorkerAPIUser(permissions.BasePermission): - """ - Allow access to a user who is a part of the `scan_queue_workers` group - """ + """Allow access to a user who is a part of the `scan_queue_workers` group""" + def has_permission(self, request, view): - return request.user.groups.filter(name='scan_queue_workers').exists() + return request.user.groups.filter(name="scan_queue_workers").exists() diff --git a/minecode/route.py b/minecode/route.py index 59a0cd97..0b0445c7 100644 --- a/minecode/route.py +++ b/minecode/route.py @@ -8,10 +8,9 @@ # -from functools import wraps import inspect import re - +from functools import wraps """ Given a URI regex (or some string), this module can route execution to a @@ -36,7 +35,7 @@ """ -class Rule(object): +class Rule: """ A rule is a mapping between a pattern (typically a URI) and a callable (typically a function). @@ -48,8 +47,8 @@ class Rule(object): def __init__(self, pattern, endpoint): # To ensure the pattern will match entirely, we wrap the pattern # with start of line ^ and end of line $.
- self.pattern = pattern.lstrip('^').rstrip('$') - self.pattern_match = re.compile('^' + self.pattern + '$').match + self.pattern = pattern.lstrip("^").rstrip("$") + self.pattern_match = re.compile("^" + self.pattern + "$").match # ensure the endpoint is callable assert callable(endpoint) @@ -61,35 +60,26 @@ def __init__(self, pattern, endpoint): self.endpoint = endpoint def __repr__(self): - return 'Rule(r"""{}""", {}.{})'.format( - self.pattern, self.endpoint.__module__, self.endpoint.__name__) + return f'Rule(r"""{self.pattern}""", {self.endpoint.__module__}.{self.endpoint.__name__})' def match(self, string): - """ - Match a string with the rule pattern, return True is matching. - """ + """Match a string with the rule pattern, return True if matching.""" return self.pattern_match(string) class RouteAlreadyDefined(TypeError): - """ - Raised when this route Rule already exists in the route map. - """ + """Raised when this route Rule already exists in the route map.""" class NoRouteAvailable(TypeError): - """ - Raised when there are no route available. - """ + """Raised when no route is available.""" class MultipleRoutesDefined(TypeError): - """ - Raised when there are more than one route possible. - """ + """Raised when more than one route is possible.""" -class Router(object): +class Router: """ A router is: - a container for a route map, consisting of several rules, stored in an @@ -104,9 +94,7 @@ class Router(object): """ def __init__(self, route_map=None): - """ - 'route_map' is an ordered mapping of pattern -> Rule. - """ + """'route_map' is an ordered mapping of pattern -> Rule.""" self.route_map = route_map or dict() # lazy cached pre-compiled regex match() for all route patterns self._is_routable = None @@ -131,14 +119,18 @@ def append(self, pattern, endpoint): def route(self, *patterns): """ - Decorator to make a callable 'endpoint' routed to one or more patterns. + Return a decorator to make a callable 'endpoint' routed to one or more + patterns. Example: + ------- >>> my_router = Router() >>> @my_router.route('http://nexb.com', 'http://deja.com') ... def somefunc(uri): ...
pass + """ + def decorator(endpoint): assert patterns for pat in patterns: @@ -147,6 +139,7 @@ def decorator(endpoint): @wraps(endpoint) def decorated(*args, **kwargs): return self.process(*args, **kwargs) + return decorated return decorator @@ -186,7 +179,7 @@ def resolve(self, string): # this can happen when multiple patterns match the same string # we raise an exception with enough debugging information pats = repr([r.pattern for r in candidates]) - msg = '%(string)r matches multiple patterns %(pats)r' % locals() + msg = f"{string} matches multiple patterns {pats}" raise MultipleRoutesDefined(msg) return candidates[0].endpoint @@ -201,7 +194,7 @@ def is_routable(self, string): if not self._is_routable: # build an alternation regex - routables = '^(' + '|'.join(pat for pat in self.route_map) + ')$' + routables = "^(" + "|".join(pat for pat in self.route_map) + ")$" self._is_routable = re.compile(routables, re.UNICODE).match return bool(self._is_routable(string)) diff --git a/minecode/rsync.py b/minecode/rsync.py index 164b4e5d..c67bd9ea 100644 --- a/minecode/rsync.py +++ b/minecode/rsync.py @@ -23,7 +23,7 @@ # logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) # logger.setLevel(logging.DEBUG) -RSYNC_COMMAND = 'rsync' +RSYNC_COMMAND = "rsync" def modules(input_file): @@ -38,22 +38,22 @@ def modules(input_file): for line in inp: if not line: continue - if line.startswith(' '): + if line.startswith(" "): # this is the motd section continue line = line.strip() if line: - name, _desc = line.split('\t', 1) + name, _desc = line.split("\t", 1) yield name.strip() -octals = re.compile(r'#(\d{3})').findall +octals = re.compile(r"#(\d{3})").findall def decode_path(p): """Decode an rsync path with octal encodings""" for oc in set(octals(p)): - p = p.replace('#' + oc, octal2char(oc)) + p = p.replace("#" + oc, octal2char(oc)) return p @@ -63,30 +63,29 @@ def octal2char(s): def decode_ts(s): - """ - Convert an rsync timestamp (which is local tz) to an UTC ISO timestamp. - """ + """Convert an rsync timestamp (which is local tz) to an UTC ISO timestamp.""" tzinfo = tz.tzutc() - ar = arrow.get(s, 'YYYY/MM/DD HH:mm:ss').replace(tzinfo=tzinfo).to('utc') + ar = arrow.get(s, "YYYY/MM/DD HH:mm:ss").replace(tzinfo=tzinfo).to("utc") return ar.isoformat() + # note: there is a large number of possible file types, but we do not care for # them: only files and dirs matter; And links, block, pipes, fifo, etc do not. # i.e. we keep only - and d rsync_line = re.compile( - r'^(?P[\-d])' - r'(?P.{9})' - r' +' - r'(?P[\d,]+)' - r' ' - r'(?P\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})' # YYYY/MM/DD HH:mm:ss - r' +' - r'(?P.+$)' + r"^(?P[\-d])" + r"(?P.{9})" + r" +" + r"(?P[\d,]+)" + r" " + r"(?P\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})" # YYYY/MM/DD HH:mm:ss + r" +" + r"(?P.+$)" ).match -Entry = collections.namedtuple('Entry', 'type perm size date path') +Entry = collections.namedtuple("Entry", "type perm size date path") def entry(line): @@ -94,20 +93,20 @@ def entry(line): Return an Entry constructed from an rsync directory listing line. Assumes universal line endings. 
""" - line = line.rstrip('\n') + line = line.rstrip("\n") if not line: return - if 'skipping directory' in line: + if "skipping directory" in line: return rline = rsync_line(line) if not rline: return - typ = rline.group('type') - perm = rline.group('perm') - size = int(rline.group('size').replace(',', '')) - ts = rline.group('ts') + typ = rline.group("type") + perm = rline.group("perm") + size = int(rline.group("size").replace(",", "")) + ts = rline.group("ts") date = decode_ts(ts) - path = rline.group('path') + path = rline.group("path") path = decode_path(path) return dict(Entry(typ, perm, size, date, path)._asdict()) @@ -133,22 +132,21 @@ def fetch_directory(uri, recurse=True): Return the location of a tempfile containing an rsync dir listing for uri. Recursive if recurse is True. Raise an Exception with error details. """ - temp_file = get_temp_file( - file_name='minecode-rsync-dir-', extension='.rsync') - with open(temp_file, 'w') as tmp: + temp_file = get_temp_file(file_name="minecode-rsync-dir-", extension=".rsync") + with open(temp_file, "w") as tmp: file_name = tmp.name - ends = not uri.endswith('/') and '/' or '' - recursive = recurse and '--recursive' or '--no-recursive' - cmd = 'rsync --no-motd %(recursive)s -d "%(uri)s%(ends)s"' % locals() + ends = not uri.endswith("/") and "/" or "" + recursive = recurse and "--recursive" or "--no-recursive" + cmd = f'rsync --no-motd {recursive} -d "{uri}{ends}"' rsync = command.Command(cmd) out, err = rsync.execute() for o in out: tmp.write(o) - err = '\n'.join([e for e in err]) + err = "\n".join([e for e in err]) rc = rsync.returncode if err or rc: - raise Exception('%(cmd) failed. rc:%(tc)d err: %(err)s' % locals()) + raise Exception(f"{cmd} failed. rc:{rc} err: {err}") else: return file_name diff --git a/minecode/saneyaml.py b/minecode/saneyaml.py index 499c1eee..ad2176d0 100644 --- a/minecode/saneyaml.py +++ b/minecode/saneyaml.py @@ -13,11 +13,11 @@ import yaml try: - from yaml import CSafeLoader as SafeLoader from yaml import CSafeDumper as SafeDumper + from yaml import CSafeLoader as SafeLoader except ImportError: - from yaml import SafeLoader from yaml import SafeDumper + from yaml import SafeLoader """ @@ -48,9 +48,7 @@ def load(s): def dump(obj): - """ - Return a safe and sane YAML unicode string representation from `obj`. - """ + """Return a safe and sane YAML unicode string representation from `obj`.""" return yaml.dump( obj, Dumper=SaneDumper, @@ -62,29 +60,25 @@ def dump(obj): encoding=None, indent=4, width=90, - line_break='\n', + line_break="\n", explicit_start=False, explicit_end=False, ) class SaneLoader(SafeLoader): - """ - A safe loader configured with many sane defaults. - """ + """A safe loader configured with many sane defaults.""" def ignore_aliases(self, data): return True def string_loader(loader, node): - """ - Ensure that a scalar type (a value) is returned as a plain unicode string. - """ + """Ensure that a scalar type (a value) is returned as a plain unicode string.""" return loader.construct_scalar(node) -SaneLoader.add_constructor(u'tag:yaml.org,2002:str', string_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:str", string_loader) # Load as strings most scalar types: nulls, ints, (such as in version # 01) floats (such version 2.20) and timestamps conversion (in @@ -94,20 +88,18 @@ def string_loader(loader, node): # must handle type conversion explicitly from unicode to other types # in the loaded objects. 
-SaneLoader.add_constructor(u'tag:yaml.org,2002:null', string_loader) -SaneLoader.add_constructor(u'tag:yaml.org,2002:timestamp', string_loader) -SaneLoader.add_constructor(u'tag:yaml.org,2002:float', string_loader) -SaneLoader.add_constructor(u'tag:yaml.org,2002:int', string_loader) -SaneLoader.add_constructor(u'tag:yaml.org,2002:null', string_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:null", string_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:timestamp", string_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:float", string_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:int", string_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:null", string_loader) # keep boolean conversion # SaneLoader.add_constructor(u'tag:yaml.org,2002:boolean', string_loader) def ordered_loader(loader, node): - """ - Ensure that YAML maps ordered is preserved and loaded in an dict now always ordered - """ + """Ensure that YAML map order is preserved and maps are loaded as an always-ordered dict""" assert isinstance(node, yaml.MappingNode) omap = dict() yield omap @@ -118,8 +110,8 @@ def ordered_loader(loader, node): omap[key] = value -SaneLoader.add_constructor(u'tag:yaml.org,2002:map', ordered_loader) -SaneLoader.add_constructor(u'tag:yaml.org,2002:omap', ordered_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:map", ordered_loader) +SaneLoader.add_constructor("tag:yaml.org,2002:omap", ordered_loader) # Fall back to mapping for anything else, e.g. ignore tags such as # !!Python, ruby and other dangerous mappings: treat them as a mapping @@ -128,53 +120,45 @@ class SaneDumper(SafeDumper): def increase_indent(self, flow=False, indentless=False): - """ - Ensure that lists items are always indented. - """ - return super(SaneDumper, self).increase_indent(flow, indentless=False) + """Ensure that list items are always indented.""" + return super().increase_indent(flow, indentless=False) def ignore_aliases(self, data): - """ - Avoid having aliases created from re-used Python objects. - """ + """Avoid having aliases created from re-used Python objects.""" return True def ordered_dumper(dumper, data): - """ - Ensure that maps are always dumped in the items order. - """ - return dumper.represent_mapping(u'tag:yaml.org,2002:map', data.items()) + """Ensure that maps are always dumped in the items order.""" + return dumper.represent_mapping("tag:yaml.org,2002:map", data.items()) SaneDumper.add_representer(dict, ordered_dumper) def null_dumper(dumper, value): - """ - Always dump nulls as empty string. - """ - return dumper.represent_scalar(u'tag:yaml.org,2002:null', u'') + """Dump nulls as an empty string.""" + return dumper.represent_scalar("tag:yaml.org,2002:null", "") SafeDumper.add_representer(type(None), null_dumper) -def string_dumper(dumper, value, _tag=u'tag:yaml.org,2002:str'): +def string_dumper(dumper, value, _tag="tag:yaml.org,2002:str"): """ Ensure that all scalars are dumped as UTF-8 unicode, folded and quoted in the sanest and most readable way.
""" - if not isinstance(value, basestring): + if not isinstance(value, str): value = repr(value) if isinstance(value, str): - value = value.decode('utf-8') + value = value.decode("utf-8") style = None - multilines = '\n' in value + multilines = "\n" in value if multilines: - literal_style = '|' + literal_style = "|" style = literal_style return dumper.represent_scalar(_tag, value, style=style) @@ -183,19 +167,17 @@ def string_dumper(dumper, value, _tag=u'tag:yaml.org,2002:str'): SaneDumper.add_representer(str, string_dumper) # treat number as strings, not as numbers -SaneDumper.add_representer(int, partial( - string_dumper, _tag=u'tag:yaml.org,2002:int')) -SaneDumper.add_representer(float, partial( - string_dumper, _tag=u'tag:yaml.org,2002:float')) +SaneDumper.add_representer(int, partial(string_dumper, _tag="tag:yaml.org,2002:int")) +SaneDumper.add_representer( + float, partial(string_dumper, _tag="tag:yaml.org,2002:float") +) def boolean_dumper(dumper, value): - """ - Dump booleans as yes or no strings. - """ - value = u'yes' if value else u'no' + """Dump booleans as yes or no strings.""" + value = "yes" if value else "no" style = None - return dumper.represent_scalar(u'tag:yaml.org,2002:bool', value, style=style) + return dumper.represent_scalar("tag:yaml.org,2002:bool", value, style=style) SaneDumper.add_representer(bool, boolean_dumper) diff --git a/minecode/seed.py b/minecode/seed.py index 61892ef3..abba367a 100644 --- a/minecode/seed.py +++ b/minecode/seed.py @@ -18,7 +18,7 @@ unicode = str # NOQA -class Seeder(object): +class Seeder: """ Abstract base class for seeding URIs to visit. Each visitor should create a subclass of Seeder and implement the get_seeds method to yield the top levle @@ -31,9 +31,7 @@ class Seeder(object): revisit_after = 240 # hours def get_seeds(self): - """ - Yield seed URIs strings. Subclass must override. - """ + """Yield seed URIs strings. Subclass must override.""" raise NotImplementedError() @@ -47,8 +45,8 @@ def get_active_seeders(seeders=()): if not seeders: seeders = get_configured_seeders() for seeder in seeders: - if isinstance(seeder, (bytes, unicode)): - module_name, _, class_name = seeder.rpartition('.') + if isinstance(seeder, bytes | unicode): + module_name, _, class_name = seeder.rpartition(".") module = importlib.import_module(module_name) yield getattr(module, class_name)() else: @@ -62,5 +60,6 @@ def get_configured_seeders(): environment. """ from minecode.management.commands import get_settings + # ACTIVE_VISITOR_SEEDS is a list of fully qualified Seeder subclass strings - return get_settings('ACTIVE_SEEDERS') or [] + return get_settings("ACTIVE_SEEDERS") or [] diff --git a/minecode/tasks.py b/minecode/tasks.py index 072ab06f..d8550625 100644 --- a/minecode/tasks.py +++ b/minecode/tasks.py @@ -29,7 +29,6 @@ def process_scan_results( `scan_results_location` and `scan_summary_location` are deleted after the indexing process has finished. 
""" - with open(scan_results_location) as f: scan_data = json.load(f) with open(scan_summary_location) as f: @@ -38,7 +37,7 @@ def process_scan_results( try: scannable_uri = ScannableURI.objects.get(uuid=scannable_uri_uuid) except ScannableURI.DoesNotExist: - raise Exception(f'ScannableURI {scannable_uri_uuid} does not exist!') + raise Exception(f"ScannableURI {scannable_uri_uuid} does not exist!") indexing_errors = index_package( scannable_uri, diff --git a/minecode/tests/__init__.py b/minecode/tests/__init__.py index 8598b637..c7703d25 100644 --- a/minecode/tests/__init__.py +++ b/minecode/tests/__init__.py @@ -9,5 +9,4 @@ import os - FIXTURES_REGEN = os.environ.get("MINECODE_TEST_FIXTURES_REGEN", False) diff --git a/minecode/tests/collectors/__init__.py b/minecode/tests/collectors/__init__.py new file mode 100644 index 00000000..2eb8f9f0 --- /dev/null +++ b/minecode/tests/collectors/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# diff --git a/minecode/tests/test_conan.py b/minecode/tests/collectors/test_conan.py similarity index 84% rename from minecode/tests/test_conan.py rename to minecode/tests/collectors/test_conan.py index 61faa8e7..6f2556da 100644 --- a/minecode/tests/test_conan.py +++ b/minecode/tests/collectors/test_conan.py @@ -9,27 +9,28 @@ import os +from unittest.mock import patch -import saneyaml from django.test import TestCase -from mock import patch + +import saneyaml from packageurl import PackageURL import packagedb +from minecode.collectors import conan from minecode.utils_test import JsonBasedTesting -from minecode.visitors import conan class ConanPriorityQueueTests(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def setUp(self): - super(ConanPriorityQueueTests, self).setUp() + super().setUp() self.package_url1 = PackageURL.from_string("pkg:conan/zlib@1.3.1") - zlib_conanfile_loc = self.get_test_loc( - "conan/zlib/manifest/conanfile.py") - zlib_conandata_loc = self.get_test_loc( - "conan/zlib/manifest/conandata.yml") + zlib_conanfile_loc = self.get_test_loc("conan/zlib/manifest/conanfile.py") + zlib_conandata_loc = self.get_test_loc("conan/zlib/manifest/conandata.yml") zlib_config_loc = self.get_test_loc("conan/zlib/manifest/config.yml") with open(zlib_conanfile_loc) as f: @@ -41,8 +42,7 @@ def setUp(self): with open(zlib_conandata_loc) as f: self.zlib_conandata_contents = f.read() - self.zlib_conandata_contents_dict = saneyaml.load( - self.zlib_conandata_contents) + self.zlib_conandata_contents_dict = saneyaml.load(self.zlib_conandata_contents) @patch("requests.get") def test_get_conan_recipe(self, mock_get): @@ -91,7 +91,7 @@ def test_get_download_info(self): self.assertEqual(result_download_url, expected_zlib_download_url) self.assertEqual(result_sha256, expected_zlib_sha256) - @patch("minecode.visitors.conan.get_conan_recipe") + @patch("minecode.collectors.conan.get_conan_recipe") def test_map_conan_package(self, mock_get_conan_recipe): mock_get_conan_recipe.return_value = ( self.zlib_conanfile_contents, @@ -101,7 +101,7 @@ def test_map_conan_package(self, mock_get_conan_recipe): package_count 
= packagedb.models.Package.objects.all().count() self.assertEqual(package_count, 0) - conan.map_conan_package(self.package_url1, ('test_pipelines')) + conan.map_conan_package(self.package_url1, ("test_pipelines")) package_count = packagedb.models.Package.objects.all().count() self.assertEqual(package_count, 1) package = packagedb.models.Package.objects.all().first() diff --git a/minecode/tests/test_generic.py b/minecode/tests/collectors/test_generic.py similarity index 76% rename from minecode/tests/test_generic.py rename to minecode/tests/collectors/test_generic.py index 69300c86..ec58d2a6 100644 --- a/minecode/tests/test_generic.py +++ b/minecode/tests/collectors/test_generic.py @@ -8,12 +8,12 @@ # from django.test import TestCase as DjangoTestCase -from packagedcode.maven import _parse + from packageurl import PackageURL +from minecode.collectors import generic from minecode.route import NoRouteAvailable from minecode.utils_test import JsonBasedTesting -from minecode.visitors import generic from packagedb.models import Package @@ -22,7 +22,7 @@ def test_process_request(self): package_count = Package.objects.all().count() self.assertEqual(0, package_count) - purl = 'pkg:generic/test@1.0.0?download_url=http://example.com/test.tar.gz' + purl = "pkg:generic/test@1.0.0?download_url=http://example.com/test.tar.gz" error_msg = generic.process_request(purl) self.assertEqual(None, error_msg) @@ -30,10 +30,9 @@ def test_process_request(self): self.assertEqual(1, package_count) package = Package.objects.first() - self.assertEqual('test', package.name) - self.assertEqual('1.0.0', package.version) - self.assertEqual('http://example.com/test.tar.gz', - package.download_url) + self.assertEqual("test", package.name) + self.assertEqual("1.0.0", package.version) + self.assertEqual("http://example.com/test.tar.gz", package.download_url) def test_process_request_no_download_url(self): package_count = Package.objects.all().count() @@ -48,29 +47,27 @@ def test_map_generic_package(self): package_count = Package.objects.all().count() self.assertEqual(0, package_count) - purl = 'pkg:generic/test@1.0.0?download_url=http://example.com/test.tar.gz' + purl = "pkg:generic/test@1.0.0?download_url=http://example.com/test.tar.gz" package_url = PackageURL.from_string(purl) - error_msg = generic.map_generic_package(package_url, ('test_pipeline')) + error_msg = generic.map_generic_package(package_url, ("test_pipeline")) - self.assertEqual('', error_msg) + self.assertEqual("", error_msg) package_count = Package.objects.all().count() self.assertEqual(1, package_count) package = Package.objects.first() - self.assertEqual('test', package.name) - self.assertEqual('1.0.0', package.version) - self.assertEqual('http://example.com/test.tar.gz', - package.download_url) + self.assertEqual("test", package.name) + self.assertEqual("1.0.0", package.version) + self.assertEqual("http://example.com/test.tar.gz", package.download_url) def test_map_fetchcode_supported_package(self): package_count = Package.objects.all().count() self.assertEqual(0, package_count) purl = PackageURL.from_string("pkg:generic/udhcp@0.9.1") - error_msg = generic.map_fetchcode_supported_package( - purl, ('test_pipeline')) + error_msg = generic.map_fetchcode_supported_package(purl, ("test_pipeline")) - self.assertEqual('', error_msg) + self.assertEqual("", error_msg) package_count = Package.objects.all().count() self.assertEqual(1, package_count) diff --git a/minecode/tests/test_gnu.py b/minecode/tests/collectors/test_gnu.py similarity index 89% rename from 
minecode/tests/test_gnu.py rename to minecode/tests/collectors/test_gnu.py index db9d7249..3dec9b7b 100644 --- a/minecode/tests/test_gnu.py +++ b/minecode/tests/collectors/test_gnu.py @@ -9,20 +9,22 @@ import os +from unittest.mock import patch from django.test import TestCase -from mock import patch +from minecode.collectors import gnu from minecode.utils_test import JsonBasedTesting -from minecode.visitors import gnu from packagedb.models import Package class GnuPriorityQueueTests(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def setUp(self): - super(GnuPriorityQueueTests, self).setUp() + super().setUp() glibc_data_loc = self.get_test_loc("gnu/glibc/index.html") with open(glibc_data_loc) as f: diff --git a/minecode/tests/collectors/test_maven.py b/minecode/tests/collectors/test_maven.py new file mode 100644 index 00000000..1cbd2f71 --- /dev/null +++ b/minecode/tests/collectors/test_maven.py @@ -0,0 +1,523 @@ +import os +from unittest import mock +from unittest.mock import patch + +from django.test import TestCase as DjangoTestCase + +from packagedcode.maven import _parse +from packageurl import PackageURL + +import packagedb +from minecode.collectors import maven +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting + + +class MavenPriorityQueueTests(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def setUp(self): + super().setUp() + + self.expected_pom_loc = self.get_test_loc("maven/pom/classworlds-1.1.pom") + with open(self.expected_pom_loc) as f: + self.expected_pom_contents = f.read() + + self.scan_package = _parse( + "maven_pom", + "maven", + "Java", + text=self.expected_pom_contents, + ) + + def test_get_pom_text(self, regen=FIXTURES_REGEN): + pom_contents = maven.get_pom_text( + namespace=self.scan_package.namespace, + name=self.scan_package.name, + version=self.scan_package.version, + ) + if regen: + with open(self.expected_pom_loc, "w") as f: + f.write(pom_contents) + self.assertEqual(self.expected_pom_contents, pom_contents) + + pom_contents = maven.get_pom_text( + namespace="", + name="does-not-exist", + version="1.0", + ) + self.assertFalse(pom_contents) + + def test_get_package_sha1(self): + sha1 = maven.get_package_sha1(self.scan_package) + expected_sha1 = "60c708f55deeb7c5dfce8a7886ef09cbc1388eca" + self.assertEqual(expected_sha1, sha1) + + def test_map_maven_package(self): + package_count = packagedb.models.Package.objects.all().count() + self.assertEqual(0, package_count) + package_url = PackageURL.from_string(self.scan_package.purl) + maven.map_maven_package( + package_url, packagedb.models.PackageContentType.BINARY, ("test_pipeline") + ) + package_count = packagedb.models.Package.objects.all().count() + self.assertEqual(1, package_count) + package = packagedb.models.Package.objects.all().first() + expected_purl_str = "pkg:maven/classworlds/classworlds@1.1" + self.assertEqual(expected_purl_str, package.purl) + + def test_map_maven_package_custom_repo_url(self): + package_count = packagedb.models.Package.objects.all().count() + self.assertEqual(0, package_count) + custom_repo_purl = "pkg:maven/org.eclipse.core/runtime@20070801?repository_url=https://packages.atlassian.com/mvn/maven-atlassian-external/" + package_url = PackageURL.from_string(custom_repo_purl) + maven.map_maven_package( + package_url, 
packagedb.models.PackageContentType.BINARY, ("test_pipeline") + ) + package_count = packagedb.models.Package.objects.all().count() + self.assertEqual(1, package_count) + package = packagedb.models.Package.objects.all().first() + expected_repo_url = "https://packages.atlassian.com/mvn/maven-atlassian-external//org/eclipse/core/runtime/20070801/runtime-20070801.jar" + self.assertEqual(expected_repo_url, package.download_url) + + def test_process_request(self): + purl_str = "pkg:maven/org.apache.twill/twill-core@0.12.0" + download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar" + purl_sources_str = f"{purl_str}?classifier=sources" + sources_download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar" + package_count = packagedb.models.Package.objects.all().count() + self.assertEqual(0, package_count) + maven.process_request(purl_str) + package_count = packagedb.models.Package.objects.all().count() + self.assertEqual(2, package_count) + purls = [ + (package.purl, package.download_url) + for package in packagedb.models.Package.objects.all() + ] + self.assertIn((purl_str, download_url), purls) + self.assertIn((purl_sources_str, sources_download_url), purls) + + def test_fetch_parent(self, regen=FIXTURES_REGEN): + pom_loc = self.get_test_loc("maven/pom/ant-antlr-1.10.1.pom") + with open(pom_loc) as f: + pom_text = f.read() + parent_pom_text = maven.fetch_parent(pom_text) + expected_loc = self.get_test_loc("maven/pom/ant-parent-1.10.1.pom") + + if regen: + with open(expected_loc, "w") as f: + f.write(parent_pom_text) + + with open(expected_loc) as f: + expected_pom_text = f.read() + self.assertEqual(expected_pom_text, parent_pom_text) + + def test_get_ancestry(self): + pom_loc = self.get_test_loc("maven/pom/pulsar-client-1x-2.5.1.pom") + with open(pom_loc) as f: + pom_text = f.read() + ancestor_pom_texts = list(maven.get_ancestry(pom_text)) + expected_ancestor_pom_texts = [] + for expected_loc in [ + self.get_test_loc("maven/pom/apache-18.pom"), + self.get_test_loc("maven/pom/pulsar-2.5.1.pom"), + self.get_test_loc("maven/pom/pulsar-client-1x-base-2.5.1.pom"), + ]: + with open(expected_loc) as f: + expected_pom_text = f.read() + expected_ancestor_pom_texts.append(expected_pom_text) + self.assertEqual(expected_ancestor_pom_texts, ancestor_pom_texts) + + def test_merge_parent(self, regen=FIXTURES_REGEN): + pom_loc = self.get_test_loc("maven/pom/ant-antlr-1.10.1.pom") + with open(pom_loc) as f: + pom_text = f.read() + package = _parse("maven_pom", "maven", "Java", text=pom_text) + expected_before_loc = self.get_test_loc( + "maven/pom/ant-antlr-1.10.1-package_before.json" + ) + self.check_expected_results(package.to_dict(), expected_before_loc, regen=regen) + + parent_pom_loc = self.get_test_loc("maven/pom/ant-parent-1.10.1.pom") + with open(parent_pom_loc) as f: + parent_pom_text = f.read() + parent_package = _parse("maven_pom", "maven", "Java", text=parent_pom_text) + package = maven.merge_parent(package, parent_package) + expected_after_loc = self.get_test_loc( + "maven/pom/ant-antlr-1.10.1-package_after.json" + ) + self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen) + + def test_merge_ancestors(self, regen=FIXTURES_REGEN): + pom_loc = self.get_test_loc("maven/pom/pulsar-client-1x-2.5.1.pom") + with open(pom_loc) as f: + pom_text = f.read() + package = _parse("maven_pom", "maven", "Java", text=pom_text) + expected_before_loc = self.get_test_loc( + 
"maven/pom/pulsar-client-1x-2.5.1-package_before.json" + ) + self.check_expected_results(package.to_dict(), expected_before_loc, regen=regen) + + ancestor_pom_texts = [] + for loc in [ + self.get_test_loc("maven/pom/apache-18.pom"), + self.get_test_loc("maven/pom/pulsar-2.5.1.pom"), + self.get_test_loc("maven/pom/pulsar-client-1x-base-2.5.1.pom"), + ]: + with open(loc) as f: + pom_text = f.read() + ancestor_pom_texts.append(pom_text) + + maven.merge_ancestors(ancestor_pom_texts, package) + expected_after_loc = self.get_test_loc( + "maven/pom/pulsar-client-1x-2.5.1-package_after.json" + ) + self.check_expected_results(package.to_dict(), expected_after_loc, regen=regen) + + @mock.patch("minecode.collectors.maven.get_pom_text") + def test_get_merged_ancestor_package_from_maven_package( + self, get_pom_text_mock, regen=FIXTURES_REGEN + ): + get_pom_text_mock.return_value = "" + ancestor_pom_texts = [] + with patch("minecode.collectors.maven.get_ancestry") as mock_get_ancestry: + for loc in [ + self.get_test_loc("maven/pom/apache-18.pom"), + self.get_test_loc("maven/pom/pulsar-2.5.1.pom"), + self.get_test_loc("maven/pom/pulsar-client-1x-base-2.5.1.pom"), + ]: + with open(loc) as f: + pom_text = f.read() + ancestor_pom_texts.append(pom_text) + mock_get_ancestry.return_value = ancestor_pom_texts + db_package = packagedb.models.Package.objects.create( + name="pulsar-client", + namespace="org.apache.pulsar", + version="2.5.1", + type="maven", + download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar-client/2.5.1/pulsar-client-2.5.1.jar", + ) + merged_package = maven.get_merged_ancestor_package_from_maven_package( + package=db_package + ) + expected_loc = self.get_test_loc( + "maven/pom/pulsar-client-merged-ancestor-package.json" + ) + self.check_expected_results( + merged_package.to_dict(), expected_loc, regen=regen + ) + + +class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") + + def test_check_if_file_name_is_linked_on_page(self): + links = ["foo/", "bar/", "baz/"] + self.assertTrue(maven.check_if_file_name_is_linked_on_page("foo/", links)) + self.assertFalse(maven.check_if_file_name_is_linked_on_page("qux/", links)) + + def test_check_if_page_has_pom_files(self): + links1 = ["foo/", "bar.jar", "bar.pom"] + links2 = ["foo/", "bar.jar"] + self.assertTrue(maven.check_if_page_has_pom_files(links1)) + self.assertFalse(maven.check_if_page_has_pom_files(links2)) + + def test_check_if_page_has_directories(self): + links1 = ["foo/", "bar/", "baz/"] + links2 = ["../", "bar.pom", "bar.jar"] + self.assertTrue(maven.check_if_page_has_directories(links1)) + self.assertFalse(maven.check_if_page_has_directories(links2)) + + def test_check_if_package_version_page(self): + links1 = ["../", "bar.pom", "bar.jar"] + links2 = ["../", "foo/", "bar/", "baz/"] + self.assertTrue(maven.check_if_package_version_page(links1)) + self.assertFalse(maven.check_if_package_version_page(links2)) + + def test_check_if_package_page(self): + links1 = ["../", "maven-metadata.xml"] + links2 = ["../", "bar.pom", "bar.jar"] + self.assertTrue(maven.check_if_package_page(links1)) + self.assertFalse(maven.check_if_package_page(links2)) + + def test_check_if_maven_root(self): + links1 = ["../", "archetype-catalog.xml"] + links2 = ["../", "bar.pom", "bar.jar"] + self.assertTrue(maven.check_if_maven_root(links1)) + self.assertFalse(maven.check_if_maven_root(links2)) + + @mock.patch("requests.get") + def test_check_on_page(self, mock_request_get): 
+ checker = maven.check_if_page_has_pom_files + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = '
parent-7.11.0.pom' + self.assertTrue( + maven.check_on_page( + "https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/", checker + ) + ) + + @mock.patch("requests.get") + def test_is_maven_root(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'archetype-catalog.xml' + self.assertTrue(maven.is_maven_root("https://repo1.maven.org/maven2/")) + + @mock.patch("requests.get") + def test_is_package_page(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'maven-metadata.xml' + self.assertTrue( + maven.is_package_page("https://repo1.maven.org/maven2/xml-apis/xml-apis/") + ) + + @mock.patch("requests.get") + def test_is_package_version_page(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = """ + ../ + parent-7.11.0.pom + """ + self.assertTrue( + maven.is_package_version_page( + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/" + ) + ) + + def test_url_parts(self): + url = "https://example.com/foo/bar/baz.jar" + scheme, netloc, path_segments = maven.url_parts(url) + self.assertEqual("https", scheme) + self.assertEqual("example.com", netloc) + self.assertEqual(["foo", "bar", "baz.jar"], path_segments) + + def test_create_url(self): + scheme = "https" + netloc = "example.com" + path_segments = ["foo", "bar", "baz.jar"] + url = "https://example.com/foo/bar/baz.jar" + self.assertEqual(url, maven.create_url(scheme, netloc, path_segments)) + + @mock.patch("requests.get") + def test_get_maven_root(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = 'archetype-catalog.xml' + self.assertEqual( + "https://repo1.maven.org/maven2", + maven.get_maven_root( + "https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/" + ), + ) + + @mock.patch("requests.get") + def test_determine_namespace_name_version_from_url(self, mock_request_get): + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2" + root_url = "https://repo1.maven.org/maven2" + + package_page_text = """ + 1.0.b2/ + 2005-09-20 05:53 - + maven-metadata.xml + 2012-06-26 17:01 567 + """ + package_page = mock.Mock(ok=True, text=package_page_text) + + package_version_page_text = """ + ../ - + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + """ + package_version_page = mock.Mock(ok=True, text=package_version_page_text) + mock_request_get.side_effect = [ + mock.Mock(ok=True, text=""), + mock.Mock(ok=True, text=""), + package_page, + mock.Mock(ok=True, text=""), + package_version_page, + ] + + namespace, package_name, package_version = ( + maven.determine_namespace_name_version_from_url(url, root_url) + ) + self.assertEqual("xml-apis", namespace) + self.assertEqual("xml-apis", package_name) + self.assertEqual("1.0.b2", package_version) + + @mock.patch("requests.get") + def test_add_to_import_queue(self, mock_request_get): + from minecode.models import ImportableURI + + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/" + root_url = "https://repo1.maven.org/maven2" + + package_page_text = """ + 1.0.b2/ + 2005-09-20 05:53 - + maven-metadata.xml + 2012-06-26 17:01 567 + """ + package_page = mock.Mock(ok=True, text=package_page_text) + + package_version_page_text = """ + ../ - + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + """ + package_version_page = mock.Mock(ok=True, text=package_version_page_text) + mock_request_get.side_effect = [ + package_page, + mock.Mock(ok=True, text=""), + mock.Mock(ok=True, text=""), + 
package_page, + mock.Mock(ok=True, text=""), + package_version_page, + ] + + self.assertEqual(0, ImportableURI.objects.all().count()) + maven.add_to_import_queue(url, root_url) + self.assertEqual(1, ImportableURI.objects.all().count()) + importable_uri = ImportableURI.objects.get(uri=url) + self.assertEqual("pkg:maven/xml-apis/xml-apis", importable_uri.package_url) + + def test_filter_only_directories(self): + timestamps_by_links = { + "../": "-", + "foo/": "-", + "foo.pom": "2023-09-28", + } + expected = { + "foo/": "-", + } + self.assertEqual(expected, maven.filter_only_directories(timestamps_by_links)) + + def test_filter_for_artifacts(self): + timestamps_by_links = { + "../": "2023-09-28", + "foo.pom": "2023-09-28", + "foo.ejb3": "2023-09-28", + "foo.ear": "2023-09-28", + "foo.aar": "2023-09-28", + "foo.apk": "2023-09-28", + "foo.gem": "2023-09-28", + "foo.jar": "2023-09-28", + "foo.nar": "2023-09-28", + "foo.so": "2023-09-28", + "foo.swc": "2023-09-28", + "foo.tar": "2023-09-28", + "foo.tar.gz": "2023-09-28", + "foo.war": "2023-09-28", + "foo.xar": "2023-09-28", + "foo.zip": "2023-09-28", + } + expected = { + "foo.ejb3": "2023-09-28", + "foo.ear": "2023-09-28", + "foo.aar": "2023-09-28", + "foo.apk": "2023-09-28", + "foo.gem": "2023-09-28", + "foo.jar": "2023-09-28", + "foo.nar": "2023-09-28", + "foo.so": "2023-09-28", + "foo.swc": "2023-09-28", + "foo.tar": "2023-09-28", + "foo.tar.gz": "2023-09-28", + "foo.war": "2023-09-28", + "foo.xar": "2023-09-28", + "foo.zip": "2023-09-28", + } + self.assertEqual(expected, maven.filter_for_artifacts(timestamps_by_links)) + + def test_collect_links_from_text(self): + filter = maven.filter_only_directories + text = """ + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + """ + expected = {"1.0.b2/": "2005-09-20 05:53", "1.2.01/": "2010-02-03 21:05"} + self.assertEqual(expected, maven.collect_links_from_text(text, filter=filter)) + + def test_create_absolute_urls_for_links(self): + filter = maven.filter_only_directories + text = """ + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + """ + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/" + expected = { + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/": "2005-09-20 05:53", + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/": "2010-02-03 21:05", + } + self.assertEqual( + expected, maven.create_absolute_urls_for_links(text, url, filter=filter) + ) + + @mock.patch("requests.get") + def test_get_directory_links(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = """ + ../ + 1.0.b2/ + 2005-09-20 05:53 - + 1.2.01/ + 2010-02-03 21:05 - + """ + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/" + expected = { + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/": "2005-09-20 05:53", + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/": "2010-02-03 21:05", + } + self.assertEqual(expected, maven.get_directory_links(url)) + + @mock.patch("requests.get") + def test_get_artifact_links(self, mock_request_get): + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = """ + ../ + xml-apis-1.0.b2.jar + 2005-09-20 05:53 109318 + xml-apis-1.0.b2.pom + 2005-09-20 05:53 2249 + """ + url = "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/" + expected = { + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar": "2005-09-20 05:53", + } + self.assertEqual(expected, maven.get_artifact_links(url)) + + def 
test_crawl_to_package(self): + pass + + def test_crawl_maven_repo_from_root(self): + pass + + @mock.patch("requests.get") + def test_get_artifact_sha1(self, mock_request_get): + sha1 = "3136ca936f64c9d68529f048c2618bd356bf85c9" + mock_request_get.return_value.ok = True + mock_request_get.return_value.text = sha1 + self.assertEqual( + sha1, + maven.get_artifact_sha1( + "https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1" + ), + ) + + def test_get_classifier_from_artifact_url(self): + artifact_url = "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar" + package_version_page_url = ( + "https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/" + ) + package_name = "livereload-jvm" + package_version = "0.2.0" + classifier = maven.get_classifier_from_artifact_url( + artifact_url, package_version_page_url, package_name, package_version + ) + self.assertEqual("onejar", classifier) diff --git a/minecode/tests/collectors/test_npm.py b/minecode/tests/collectors/test_npm.py new file mode 100644 index 00000000..d517e278 --- /dev/null +++ b/minecode/tests/collectors/test_npm.py @@ -0,0 +1,61 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +import os + +from django.test import TestCase as DjangoTestCase + +from packagedcode.npm import NpmPackageJsonHandler +from packageurl import PackageURL + +import packagedb +from minecode.collectors import npm +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting + + +class NpmPriorityQueueTests(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def setUp(self): + super().setUp() + self.expected_json_loc = self.get_test_loc("npm/lodash_package-expected.json") + with open(self.expected_json_loc) as f: + self.expected_json_contents = json.load(f) + + self.scan_package = NpmPackageJsonHandler._parse( + json_data=self.expected_json_contents, + ) + + def test_get_package_json(self, regen=FIXTURES_REGEN): + json_contents = npm.get_package_json( + namespace=self.scan_package.namespace, + name=self.scan_package.name, + version=self.scan_package.version, + ) + if regen: + with open(self.expected_json_loc, "w") as f: + json.dump(json_contents, f, indent=3, separators=(",", ":")) + self.assertEqual(self.expected_json_contents, json_contents) + + def test_map_npm_package(self): + package_count = packagedb.models.Package.objects.all().count() + self.assertEqual(0, package_count) + package_url = PackageURL.from_string(self.scan_package.purl) + npm.map_npm_package(package_url, ("test_pipeline")) + package_count = packagedb.models.Package.objects.all().count() + self.assertEqual(1, package_count) + package = packagedb.models.Package.objects.all().first() + expected_purl_str = "pkg:npm/lodash@4.17.21" + expected_download_url = "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz" + self.assertEqual(expected_purl_str, package.purl) + self.assertEqual(expected_download_url, package.download_url) diff --git a/minecode/tests/miners/__init__.py b/minecode/tests/miners/__init__.py new file mode 100644 index 00000000..2eb8f9f0 --- /dev/null +++ 
b/minecode/tests/miners/__init__.py @@ -0,0 +1,8 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# diff --git a/minecode/tests/test_apache.py b/minecode/tests/miners/test_apache.py similarity index 50% rename from minecode/tests/test_apache.py rename to minecode/tests/miners/test_apache.py index 306c08bb..5b403243 100644 --- a/minecode/tests/test_apache.py +++ b/minecode/tests/miners/test_apache.py @@ -7,183 +7,189 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from collections import OrderedDict import json import os import re +from collections import OrderedDict +from unittest.mock import patch from django.test import TestCase as DjangoTestCase -from mock import Mock -from mock import patch -from minecode import mappers -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting -from minecode.visitors import apache +from minecode import miners +from minecode.miners import apache from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class ApacheVistorTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_ApacheDistIndexVisitor(self): - uri = 'http://apache.org/dist/zzz/find-ls.gz' - test_loc = self.get_test_loc('apache/find-ls.gz') - with patch('requests.get') as mock_http_get: + uri = "http://apache.org/dist/zzz/find-ls.gz" + test_loc = self.get_test_loc("apache/find-ls.gz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = apache.ApacheDistIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'apache/find-ls.gz_uris-expected.json') + expected_loc = self.get_test_loc("apache/find-ls.gz_uris-expected.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_ApacheChecksumVisitor(self): - uri = 'http://archive.apache.org/dist/abdera/1.1.3/apache-abdera-1.1.3-src.zip.md5' - test_loc = self.get_test_loc('apache/apache-abdera-1.1.3-src.zip.md5') - with patch('requests.get') as mock_http_get: + uri = "http://archive.apache.org/dist/abdera/1.1.3/apache-abdera-1.1.3-src.zip.md5" + test_loc = self.get_test_loc("apache/apache-abdera-1.1.3-src.zip.md5") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, data, _ = apache.ApacheChecksumVisitor(uri) self.assertEqual(None, uris) - self.assertEqual(b'0b5f2c334916c289f06c03f8577a9879', data) + self.assertEqual(b"0b5f2c334916c289f06c03f8577a9879", data) def test_ApacheChecksumVisitor_2(self): - uri = 'http://archive.apache.org/dist/groovy/2.4.6/distribution/apache-groovy-docs-2.4.6.zip.md5' - test_loc = self.get_test_loc('apache/apache-groovy-docs-2.4.6.zip.md5') - with patch('requests.get') as mock_http_get: + uri = "http://archive.apache.org/dist/groovy/2.4.6/distribution/apache-groovy-docs-2.4.6.zip.md5" + test_loc = self.get_test_loc("apache/apache-groovy-docs-2.4.6.zip.md5") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = 
mocked_requests_get(uri, test_loc) uris, data, _ = apache.ApacheChecksumVisitor(uri) self.assertEqual(None, uris) - self.assertEqual(b'c7a2d3becea1d28b518528f8204b8d2a', data) + self.assertEqual(b"c7a2d3becea1d28b518528f8204b8d2a", data) def test_ApacheProjectsJsonVisitor(self): - uri = 'https://projects.apache.org/json/foundation/projects.json' - test_loc = self.get_test_loc('apache/projects.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/foundation/projects.json" + test_loc = self.get_test_loc("apache/projects.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again uris, result, _ = apache.ApacheProjectsJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/projects_uris-expected.json') + expected_loc = self.get_test_loc("apache/projects_uris-expected.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) self.check_expected_results(result, test_loc, regen=FIXTURES_REGEN) def test_ApacheSingleProjectJsonVisitor(self): - uri = 'https://projects.apache.org/json/projects/ant-dotnet.json' - test_loc = self.get_test_loc('apache/ant-dotnet.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/projects/ant-dotnet.json" + test_loc = self.get_test_loc("apache/ant-dotnet.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again _, result, _ = apache.ApacheSingleProjectJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/ant-dotnet_expected.json') + expected_loc = self.get_test_loc("apache/ant-dotnet_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_ApacheSingleProjectJsonVisitor_error1_json(self): - uri = 'https://projects.apache.org/json/projects/felix.json' - test_loc = self.get_test_loc('apache/felix.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/projects/felix.json" + test_loc = self.get_test_loc("apache/felix.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again _, result, _ = apache.ApacheSingleProjectJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/felix_expected.json') + expected_loc = self.get_test_loc("apache/felix_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_ApacheSingleProjectJsonVisitor_error2_json(self): - uri = 'https://projects.apache.org/json/projects/attic-mrunit.json' - test_loc = self.get_test_loc('apache/attic-mrunit.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/projects/attic-mrunit.json" + test_loc = self.get_test_loc("apache/attic-mrunit.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again _, result, _ = apache.ApacheSingleProjectJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/attic-mrunit_expected.json') + expected_loc = self.get_test_loc("apache/attic-mrunit_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def 
test_ApacheSingleProjectJsonVisitor_error3_json(self): - uri = 'https://projects.apache.org/json/projects/metamodel.json' - test_loc = self.get_test_loc('apache/metamodel.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/projects/metamodel.json" + test_loc = self.get_test_loc("apache/metamodel.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again _, result, _ = apache.ApacheSingleProjectJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/metamodel_expected.json') + expected_loc = self.get_test_loc("apache/metamodel_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) def test_ApachePodlingsJsonVisitor(self): - uri = 'https://projects.apache.org/json/foundation/podlings.json' - test_loc = self.get_test_loc('apache/podlings.json') - with patch('requests.get') as mock_http_get: + uri = "https://projects.apache.org/json/foundation/podlings.json" + test_loc = self.get_test_loc("apache/podlings.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # note: remove the "()" below once this visitor route is made active again uris, result, _ = apache.ApachePodlingsJsonVisitor()(uri) - expected_loc = self.get_test_loc('apache/podlings_expected_uris.json') + expected_loc = self.get_test_loc("apache/podlings_expected_uris.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - expected_loc = self.get_test_loc('apache/podlings_expected.json') + expected_loc = self.get_test_loc("apache/podlings_expected.json") self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) class ApacheMapperTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_package_from_download(self): - package = mappers.apache.build_package_from_download( - 'http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip', - 'pkg:apache/groovy@2.4.6') - expected_loc = self.get_test_loc('apache/map-groovy_expected.json') + package = miners.apache.build_package_from_download( + "http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip", + "pkg:apache/groovy@2.4.6", + ) + expected_loc = self.get_test_loc("apache/map-groovy_expected.json") self.check_expected_results( - package.to_dict(), expected_loc, regen=FIXTURES_REGEN) + package.to_dict(), expected_loc, regen=FIXTURES_REGEN + ) def test_build_package_from_download2(self): - package = mappers.apache.build_package_from_download( - 'http://archive.apache.org/dist/turbine/maven/turbine-webapp-2.3.3-1.0.0-source-release.zip', - 'pkg:apache/turbine-webapp@2.3.3-1.0.0-source-release') - expected_loc = self.get_test_loc( - 'apache/map-turbine-webapp_expected.json') + package = miners.apache.build_package_from_download( + "http://archive.apache.org/dist/turbine/maven/turbine-webapp-2.3.3-1.0.0-source-release.zip", + "pkg:apache/turbine-webapp@2.3.3-1.0.0-source-release", + ) + expected_loc = self.get_test_loc("apache/map-turbine-webapp_expected.json") self.check_expected_results( - package.to_dict(), expected_loc, regen=FIXTURES_REGEN) + package.to_dict(), expected_loc, regen=FIXTURES_REGEN + ) # TODO: add tests for checksums def test_build_packages_from_projects_json(self): 
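+ # projects.json is the foundation-wide listing from projects.apache.org; the
+ # mapper presumably builds one Package per project entry in that mapping.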
- with open(self.get_test_loc('apache/projects.json')) as projectsjson_meta: - metadata = json.load( - projectsjson_meta, object_pairs_hook=OrderedDict) - packages = mappers.apache.build_packages_from_projects(metadata) + with open(self.get_test_loc("apache/projects.json")) as projectsjson_meta: + metadata = json.load(projectsjson_meta, object_pairs_hook=OrderedDict) + packages = miners.apache.build_packages_from_projects(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('apache/projects_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("apache/projects_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_build_packages_from_one_podling_json(self): - with open(self.get_test_loc('apache/podling_amaterasu.json')) as podlings_meta: + with open(self.get_test_loc("apache/podling_amaterasu.json")) as podlings_meta: metadata = json.load(podlings_meta, object_pairs_hook=OrderedDict) - packages = mappers.apache.build_packages_from_podlings( - metadata, purl='pkg:apache-podlings/amaterasu') + packages = miners.apache.build_packages_from_podlings( + metadata, purl="pkg:apache-podlings/amaterasu" + ) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'apache/podling_amaterasu_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("apache/podling_amaterasu_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) # TODO: add real mapper class tests def test_regex_1(self): - regex = re.compile(r'^https?://(archive\.)?apache\.org/dist/.*$') + regex = re.compile(r"^https?://(archive\.)?apache\.org/dist/.*$") result = re.match( - regex, 'http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip') + regex, + "http://archive.apache.org/dist/groovy/2.4.6/sources/apache-groovy-src-2.4.6.zip", + ) self.assertTrue(result) def test_regex_2(self): - regex = re.compile(r'^https?://(archive\.)?apache\.org/dist/.*$') + regex = re.compile(r"^https?://(archive\.)?apache\.org/dist/.*$") result = re.match( - regex, 'https://apache.org/dist/chemistry/opencmis/1.1.0/chemistry-opencmis-dist-1.1.0-server-webapps.zip') + regex, + "https://apache.org/dist/chemistry/opencmis/1.1.0/chemistry-opencmis-dist-1.1.0-server-webapps.zip", + ) self.assertTrue(result) diff --git a/minecode/tests/miners/test_bitbucket.py b/minecode/tests/miners/test_bitbucket.py new file mode 100644 index 00000000..f123f343 --- /dev/null +++ b/minecode/tests/miners/test_bitbucket.py @@ -0,0 +1,145 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects.
+# + +import json +import os +import re +from collections import OrderedDict +from unittest.mock import patch + +from minecode.miners.bitbucket import BitbucketDetailsVisitorPaginated +from minecode.miners.bitbucket import BitbucketIndexVisitor +from minecode.miners.bitbucket import BitbucketSingleRepoVisitor +from minecode.miners.bitbucket import build_bitbucket_download_packages +from minecode.miners.bitbucket import build_bitbucket_repo_package +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class BitbucketVisitorTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_BitbucketIndexVisitor(self): + uri = "https://api.bitbucket.org/2.0/repositories?pagelen=10" + test_loc = self.get_test_loc("bitbucket/visit/index-repositories.json") + + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _ = BitbucketIndexVisitor(uri) + + expected_uri_loc = self.get_test_loc( + "bitbucket/visit/index-repositories_expected_uris.json" + ) + self.check_expected_uris(uris, expected_uri_loc, regen=FIXTURES_REGEN) + + expected_data_loc = self.get_test_loc( + "bitbucket/visit/index-repositories_expected_data.json" + ) + self.check_expected_results(data, expected_data_loc, regen=FIXTURES_REGEN) + + def test_BitbucketSingleRepoVisitor(self): + uri = "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/" + test_loc = self.get_test_loc("bitbucket/visit/singlerepo.json") + + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _ = BitbucketSingleRepoVisitor(uri) + + expected_data_loc = self.get_test_loc( + "bitbucket/visit/singlerepo_expected_data.json" + ) + self.check_expected_results(data, expected_data_loc, regen=FIXTURES_REGEN) + + expected_uris_loc = self.get_test_loc( + "bitbucket/visit/singlerepo_expected_uris.json" + ) + self.check_expected_uris(uris, expected_uris_loc, regen=FIXTURES_REGEN) + + def test_BitbucketDetailsVisitorPaginated(self): + uri = "https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/tags?pagelen=2" + test_loc = self.get_test_loc("bitbucket/visit/paginated_tags.json") + + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _ = BitbucketDetailsVisitorPaginated(uri) + + expected_data_loc = self.get_test_loc( + "bitbucket/visit/paginated_tags_expected_data.json" + ) + self.check_expected_results(data, expected_data_loc, regen=FIXTURES_REGEN) + + expected_uris_loc = self.get_test_loc( + "bitbucket/visit/paginated_tags_expected_uris.json" + ) + self.check_expected_uris(uris, expected_uris_loc, regen=FIXTURES_REGEN) + + +class BitbucketMapperTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_pattern_match_without_download(self): + url = "https://api.bitbucket.org/2.0/repositories/phlogistonjohn/tweakmsg" + pattern = r"https://api.bitbucket.org/2.0/repositories/.*(?= 5.0.37.2)" in result["Build-Depends"]) + self.assertTrue("cmake" in result["Build-Depends"]) + + @expectedFailure + def test_debcon_get_paragraph_data_from_file_control_invalid(self): + control_file = self.get_test_loc("debian/debutils/control_invalid") + result = debcon.get_paragraph_data_from_file(control_file) + 
self.assertEqual({}, result) + + @expectedFailure + def test_debcon_get_paragraph_data_from_file_with_non_existing_path(self): + control_file = "path_invalid" + with self.assertRaises(Exception) as context: + debcon.get_paragraph_data_from_file(control_file) + self.assertTrue("No such file or directory" in context.exception) + + def test_parse_deb822_dsc(self): + dsc_file = self.get_test_loc("debian/debutils/3dldf_2.0.3+dfsg-2.dsc") + result = debcon.get_paragraph_data_from_file(dsc_file) + expected_loc = self.get_test_loc( + "debian/debutils/3dldf_2.0.3+dfsg-2.dsc-expected" + ) + self.check_expected_deb822(result, expected_loc, regen=FIXTURES_REGEN) + + ################################################################# + + def test_parse_email(self): + content = "Debian TeX Maintainers " + name, email = debutils.parse_email(content) + self.assertEqual("Debian TeX Maintainers", name) + self.assertEqual("debian-tex-maint@lists.debian.org", email) + + def test_parse_email_2(self): + # Space left Purposefully + content = " Debian TeX Maintainers " + name, email = debutils.parse_email(content) + self.assertEqual("Debian TeX Maintainers", name) + self.assertEqual(None, email) + + def test_parse_email_3(self): + # Space left Purposefully + content = "< debian-tex-maint@lists.debian.org >" + name, email = debutils.parse_email(content) + self.assertEqual(None, name) + self.assertEqual("debian-tex-maint@lists.debian.org", email) + + def test_comma_separated(self): + tags = "implemented-in::perl, role::program, use::converting, works-with::pim" + result = list(debutils.comma_separated(tags)) + self.assertEqual( + [ + "implemented-in::perl", + "role::program", + "use::converting", + "works-with::pim", + ], + result, + ) + + +class DebianReleaseTest(BaseDebianTest): + def test_parse_release(self): + release_file = self.get_test_loc("debian/release/Release") + result = list(debian.parse_release(release_file)) + expected_loc = self.get_test_loc("debian/release/Release_expected") + self.check_expected_deb822(result, expected_loc) + + def test_parse_release_with_md5(self): + release_file = self.get_test_loc("debian/release/Release_with_md5") + result = list(debian.parse_release(release_file)) + expected_loc = self.get_test_loc("debian/release/Release_with_md5_expected") + self.check_expected_deb822(result, expected_loc) + + @expectedFailure + def test_visit_debian_release(self): + uri = "http://ftp.debian.org/debian/dists/Debian8.3/Release" + test_loc = self.get_test_loc("debian/release/visited_Release") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = debian.DebianReleaseVisitor(uri) + result = json.loads(data) + + release_file = self.get_test_loc("debian/release/visited_Release-expected.json") + self.check_expected_deb822(result, release_file) + + +class DebianCopyrightTest(BaseDebianTest): + # TODO: There is an exception for the current debian copyright parser + @expectedFailure + def test_parse_copyright_only_basic(self): + copyright_file = self.get_test_loc("debian/copyright/basic_copyright") + copyrights = [info for info in debian.parse_copyright_only(copyright_file)] + self.assertTrue("Copyright 1998 John Doe " in copyrights) + self.assertTrue("Copyright 1998 Jane Doe " in copyrights) + + @expectedFailure + def test_parse_copyright_only_with_incorrect_file(self): + copyright_file = self.get_test_loc("debian/copyright/invalid_copyright") + with self.assertRaises(Exception) as context: + [info for info in 
debian.parse_copyright_only(copyright_file)] + self.assertTrue("no paragraphs in input" in context.exception) + + @expectedFailure + def test_parse_copyright_only_with_incorrect_path(self): + copyright_file = "path_invalid" + with self.assertRaises(Exception) as context: + [info for info in debian.parse_copyright_only(copyright_file)] + self.assertTrue("No such file or directory" in context.exception) + + @expectedFailure + def test_parse_copyright_allinfo_basic(self): + copyright_file = self.get_test_loc("debian/copyright/basic_copyright") + copyright_data = [ + info for info in debian.parse_copyright_allinfo(copyright_file) + ] + expected = [ + { + "files": ("*",), + "license": "GPL-2+", + "copyright": "Copyright 1998 John Doe ", + }, + { + "files": ("debian/*",), + "license": "GPL-2+", + "copyright": "Copyright 1998 Jane Doe ", + }, + ] + self.assertEqual(expected, copyright_data) + + @expectedFailure + def test_parse_copyright_allinfo_with_invalid_file(self): + copyright_file = self.get_test_loc("debian/copyright/invalid_copyright") + with self.assertRaises(Exception) as context: + [info for info in debian.parse_copyright_allinfo(copyright_file)] + self.assertTrue("no paragraphs in input" in context.exception) + + @expectedFailure + def test_parse_copyright_allinfo_with_incorrect_path(self): + copyright_file = "path_invalid" + with self.assertRaises(Exception) as context: + [info for info in debian.parse_copyright_allinfo(copyright_file)] + self.assertTrue("No such file or directory" in context.exception) + + @expectedFailure + def test_parse_license_basic(self): + copyright_file = self.get_test_loc("debian/copyright/basic_copyright") + licenses, licensetexts = debian.parse_license(copyright_file) + expected = { + "GPL-2+": [ + "This program is free software; you can redistribute it\n" + "and/or modify it under the terms of the GNU General Public\n" + "License as published by the Free Software Foundation; either\n" + "version 2 of the License, or (at your option) any later\n" + "version.\n\n" + "This program is distributed in the hope that it will be\n" + "useful, but WITHOUT ANY WARRANTY; without even the implied\n" + "warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n" + "PURPOSE. See the GNU General Public License for more\ndetails.\n\n" + "You should have received a copy of the GNU General Public\n" + "License along with this package; if not, write to the Free\n" + "Software Foundation, Inc., 51 Franklin St, Fifth Floor,\n" + "Boston, MA 02110-1301 USA\n\n" + "On Debian systems, the full text of the GNU General Public\n" + "License version 2 can be found in the file\n" + "`/usr/share/common-licenses/GPL-2'." 
+ ] + } + self.assertEqual(expected, licenses) + self.assertEqual([], licensetexts) + + @expectedFailure + def test_parse_license_with_invalid_file(self): + copyright_file = self.get_test_loc("debian/copyright/invalid_copyright") + with self.assertRaises(Exception) as context: + debian.parse_license(copyright_file) + self.assertTrue("no paragraphs in input" in context.exception) + + @expectedFailure + def test_parse_license_with_incorrect_path(self): + copyright_file = "path_invalid" + with self.assertRaises(Exception) as context: + debian.parse_license(copyright_file) + self.assertTrue("No such file or directory" in context.exception) + + +class DebianSourcesTest(BaseDebianTest): + def test_collect_source_packages(self): + index_file = self.get_test_loc("debian/sources/debian_Sources") + source_info = [info for info in debian.collect_source_packages(index_file)] + expected_loc = self.get_test_loc("debian/sources/debian_Sources_visit_expected") + self.check_objects_expected(source_info, expected_loc, regen=FIXTURES_REGEN) + + def test_collect_source_packages_ubuntu(self): + index_file = self.get_test_loc("debian/sources/ubuntu_Sources") + source_info = [info for info in debian.collect_source_packages(index_file)] + expected_loc = self.get_test_loc("debian/sources/ubuntu_Sources_visit_expected") + self.check_objects_expected(source_info, expected_loc, regen=FIXTURES_REGEN) + + @expectedFailure + def test_DebianSourcesVisitor(self): + uri = ( + "http://ftp.debian.org/debian/dists/jessie-backports/main/source/Sources.gz" + ) + test_loc = self.get_test_loc("debian/sources/Sources.gz") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = debian.DebianSourcesVisitor(uri) + expected_loc = self.get_test_loc("debian/sources/Sources.gz-expected.json") + self.check_expected_uris(list(uris), expected_loc) + + @expectedFailure + def test_DebianSourcesVisitor_with_invalid_file(self): + uri = "http://ftp.debian.org/debian/dists/jessie-backports/main/source/invalid_files/Sources.gz" + test_loc = self.get_test_loc("debian/invalid_files/ls-lR.gz") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _ = debian.DebianSourcesVisitor(uri) + self.assertEqual(0, len(list(uris))) + + @expectedFailure + def test_build_source_file_packages(self): + with open(self.get_test_loc("debian/sources/debian_Sources")) as packs: + packages = debian.build_source_file_packages(packs.read()) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "debian/sources/debian_Sources_mapped-expected-packages.json" + ) + self.check_expected_results(packages, expected_loc) + + +class DebianPackagesTest(BaseDebianTest): + def test_parse_packages_index(self): + index_file = self.get_test_loc("debian/packages/debian_Packages") + package_info = [info for info in debian.parse_packages_index(index_file)] + expected_loc = self.get_test_loc( + "debian/packages/debian_Packages-visit-expected.json" + ) + self.check_objects_expected(package_info, expected_loc, regen=FIXTURES_REGEN) + + @expectedFailure + def test_parse_packages_from_debian_Packages(self): + with open(self.get_test_loc("debian/packages/debian_Packages")) as packs: + packages = debian.parse_packages(packs.read()) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "debian/packages/debian_Packages-expected.json" + ) + self.check_expected_results(packages, expected_loc, 
regen=FIXTURES_REGEN) + + @expectedFailure + def test_parse_packages_from_ubuntu_Packages(self): + with open(self.get_test_loc("debian/packages/ubuntu_Packages")) as packs: + packages = debian.parse_packages(packs.read()) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "debian/packages/ubuntu_Packages-expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + @expectedFailure + def test_parse_packages_from_installed_status(self): + with open(self.get_test_loc("debian/status/simple_status")) as packs: + packages = debian.parse_packages(packs.read()) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "debian/packages/ubuntu_Packages-expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + +class DebianLSLRTest(BaseDebianTest): + def test_DebianDirectoryIndexVisitor_from_debian(self): + uri = "http://ftp.debian.org/debian/ls-lR.gz" + test_loc = self.get_test_loc("debian/lslr/ls-lR_debian") + temp_gz_location = self.get_tmp_gz_file(test_loc) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, temp_gz_location) + uris, _, _ = debian.DebianDirectoryIndexVisitor(uri) + expected_loc = self.get_test_loc("debian/lslr/ls-lR_debian.gz-expected.json") + self.check_expected_uris(list(uris), expected_loc) + + def test_DebianDirectoryIndexVisitor_from_ubuntu(self): + uri = "http://archive.ubuntu.com/ubuntu/ls-lR.gz" + test_loc = self.get_test_loc("debian/lslr/ls-lR_ubuntu") + temp_gz_location = self.get_tmp_gz_file(test_loc) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, temp_gz_location) + uris, _, _ = debian.DebianDirectoryIndexVisitor(uri) + expected_loc = self.get_test_loc("debian/lslr/ls-lR_ubuntu.gz-expected.json") + self.check_expected_uris(list(uris), expected_loc) + + +class DebianDescriptionTest(BaseDebianTest): + @expectedFailure + def test_DebianDescriptionVisitor(self): + uri = "http://ftp.debian.org/debian/pool/main/7/7kaa/7kaa_2.14.3-1.dsc" + test_loc = self.get_test_loc("debian/dsc/7kaa_2.14.3-1.dsc") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = debian.DebianDescriptionVisitor(uri) + result = json.loads(data) + dsc_file = self.get_test_loc("debian/dsc/description_expected.json") + self.check_expected_deb822(result, dsc_file) + + @expectedFailure + def test_parse_description(self): + with open( + self.get_test_loc("debian/dsc/description.json") + ) as debian_description_meta: + metadata = json.load(debian_description_meta) + packages = debian.parse_description(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("debian/dsc/description-expected.json") + self.check_expected_results(packages, expected_loc) + + +class DebianMapperTest(BaseDebianTest): + @expectedFailure + def test_get_dependencies(self): + test = { + "build1": "build", + "build2": "build2", + "build3": "buildnot", + } + keys = ["build1", "build2"] + result = debian.get_dependencies(test, keys) + self.assertEqual(2, len(result)) + self.assertEqual("build", result[0].purl) + self.assertEqual(None, result[0].requirement) + self.assertEqual("build2", result[1].purl) + self.assertEqual(None, result[1].requirement) + + def test_get_programming_language(self): + tags = [ + "role::program", + "implemented-in::perl", + "use::converting", + "works-with::pim", + ] + 
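+ # debtags values use the "facet::value" form; the assertion below assumes the
+ # language is taken from the "implemented-in::" facet, here "perl".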
result = debian.get_programming_language(tags) + self.assertEqual("perl", result) diff --git a/minecode/tests/test_dockerhub.py b/minecode/tests/miners/test_dockerhub.py similarity index 52% rename from minecode/tests/test_dockerhub.py rename to minecode/tests/miners/test_dockerhub.py index 66e48ec0..4785cc08 100644 --- a/minecode/tests/test_dockerhub.py +++ b/minecode/tests/miners/test_dockerhub.py @@ -10,80 +10,70 @@ import json import os from collections import OrderedDict +from unittest.mock import patch - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode.visitors import dockerhub +from minecode import miners +from minecode.miners import dockerhub from minecode.tests import FIXTURES_REGEN -from minecode import mappers +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class DockerHubTest(JsonBasedTesting): - - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) class DockerHubVistorTest(DockerHubTest): - def test_searching_condition(self): combinations = dockerhub.get_search_conditions() - expected_file = self.get_test_loc('dockerhub/conditions_expected') - self.check_expected_results( - combinations, expected_file, regen=FIXTURES_REGEN) + expected_file = self.get_test_loc("dockerhub/conditions_expected") + self.check_expected_results(combinations, expected_file, regen=FIXTURES_REGEN) def test_seeds(self): seed = dockerhub.DockerHubSeed() seeds = list(seed.get_seeds()) - expected_file = self.get_test_loc('dockerhub/seeds_expected') + expected_file = self.get_test_loc("dockerhub/seeds_expected") self.check_expected_results(seeds, expected_file, regen=FIXTURES_REGEN) def test_visit_dockerhub_exlpore_page(self): - uri = 'https://hub.docker.com/explore/?page=1' - test_loc = self.get_test_loc('dockerhub/Explore_DockerHub_Page1.html') - with patch('requests.get') as mock_http_get: + uri = "https://hub.docker.com/explore/?page=1" + test_loc = self.get_test_loc("dockerhub/Explore_DockerHub_Page1.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = dockerhub.DockHubExplorePageVisitor(uri) - expected_loc = self.get_test_loc( - 'dockerhub/visitor_explore_page1_expected') + expected_loc = self.get_test_loc("dockerhub/visitor_explore_page1_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_dockerhub_project(self): - uri = 'https://hub.docker.com/_/elixir/' - test_loc = self.get_test_loc('dockerhub/library_elixir.html') - with patch('requests.get') as mock_http_get: + uri = "https://hub.docker.com/_/elixir/" + test_loc = self.get_test_loc("dockerhub/library_elixir.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = dockerhub.DockHubProjectHTMLVisitor(uri) result = json.loads(data, object_pairs_hook=OrderedDict) - expected_file = self.get_test_loc( - 'dockerhub/visitor_library_elixir_expected') - self.check_expected_results( - result, expected_file, regen=FIXTURES_REGEN) + expected_file = self.get_test_loc("dockerhub/visitor_library_elixir_expected") + self.check_expected_results(result, expected_file, regen=FIXTURES_REGEN) def test_visit_dockerhub_search_api(self): - uri = 'https://index.docker.io/v1/search?q=1a&n=100&page=2' - 
test_loc = self.get_test_loc('dockerhub/search.json') - with patch('requests.get') as mock_http_get: + uri = "https://index.docker.io/v1/search?q=1a&n=100&page=2" + test_loc = self.get_test_loc("dockerhub/search.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = dockerhub.DockHubLibraryRESTJsonVisitor(uri) - expected_loc = self.get_test_loc('dockerhub/visitor_search_expected') + expected_loc = self.get_test_loc("dockerhub/visitor_search_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) class DockerHubMapperTest(DockerHubTest): - def test_build_packages_fromjson(self): - with open(self.get_test_loc('dockerhub/elixir.json')) as dockerhub_metadata: + with open(self.get_test_loc("dockerhub/elixir.json")) as dockerhub_metadata: metadata = dockerhub_metadata.read() - packages = mappers.dockerhub.build_packages_from_jsonfile( - metadata, 'https://registry.hub.docker.com/v2/repositories/library') + packages = miners.dockerhub.build_packages_from_jsonfile( + metadata, "https://registry.hub.docker.com/v2/repositories/library" + ) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'dockerhub/expected_dockerhubmapper.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("dockerhub/expected_dockerhubmapper.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_eclipse.py b/minecode/tests/miners/test_eclipse.py new file mode 100644 index 00000000..5466bbb9 --- /dev/null +++ b/minecode/tests/miners/test_eclipse.py @@ -0,0 +1,129 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import json +import os +import unittest +from unittest.mock import patch + +import requests + +from minecode import miners +from minecode.miners import URI +from minecode.miners import eclipse +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class EclipseVistorTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_visit_eclipse_projects(self): + uri = "https://projects.eclipse.org/list-of-projects" + test_loc = self.get_test_loc("eclipse/projects.eclipse.org.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = eclipse.EclipseProjectVisitors(uri) + expected_loc = self.get_test_loc("eclipse/eclipse_projects_expected") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_eclipse_project(self): + uri = "https://projects.eclipse.org/projects/modeling.m2t.acceleo" + test_loc = self.get_test_loc("eclipse/Acceleo_projects.eclipse.org.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = eclipse.EclipseSingleProjectVisitor(uri) + with open( + self.get_test_loc("eclipse/acceleo_expected.html"), "rb" + ) as data_file: + self.assertEqual(data_file.read(), data) + + def test_visit_eclipse_git_repo(self): + uri = "http://git.eclipse.org/c" + test_loc = self.get_test_loc("eclipse/Eclipse_Git_repositories.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = eclipse.EclipseGitVisitor(uri) + expected_loc = self.get_test_loc("eclipse/eclipse_git_repos_expected") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_eclipse_packages(self): + uri = "http://www.eclipse.org/downloads/packages/all" + test_loc = self.get_test_loc("eclipse/All_Releases_Packages.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = eclipse.EclipsePackagesVisitor(uri) + expected_loc = self.get_test_loc("eclipse/eclipse_packages_expected") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_eclipse_package_releases(self): + uri = "http://www.eclipse.org/downloads/packages/release/Neon/R" + test_loc = self.get_test_loc("eclipse/Neon_R.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = eclipse.EclipseReleaseVisitor(uri) + expected_loc = self.get_test_loc("eclipse/Neon_R-expected.json") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_eclipse_projects_json(self): + uri = "http://projects.eclipse.org/json/projects/all" + test_loc = self.get_test_loc("eclipse/birt.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _error = eclipse.EclipseProjectsJsonVisitor(uri) + + expected_uris = [ + URI( + uri="http://projects.eclipse.org/json/project/birt", + source_uri="http://projects.eclipse.org/json/projects/all", + package_url="pkg:eclipse/birt", + ) + ] + self.assertEqual(expected_uris, list(uris)) + + expected_loc = self.get_test_loc("eclipse/birt-expected.json") + self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) + + 
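+ # The live-network check below is skipped by default; it only guards against
+ # the visitor's requests timeout being set too low.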
@unittest.skip("This requires a live internet connection to test requests timeouts") + def test_visitor_eclipse_projects_json_download_timeout_error(self): + uri = "http://projects.eclipse.org/json/projects/all" + try: + eclipse.EclipseProjectsJsonVisitor(uri) + except requests.Timeout: + self.fail( + "Time out error happens when download the url, " + "this should be fixed by increaseing the timeout." + ) + + +class TestEclipseMap(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_packages(self): + with open(self.get_test_loc("eclipse/birt.json")) as eclipse_metadata: + metadata = json.load(eclipse_metadata) + packages = miners.eclipse.build_packages_with_json(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("eclipse/eclipse_birt_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_eclipse_html_packages(self): + with open( + self.get_test_loc("eclipse/Acceleo_projects.eclipse.org.html") + ) as eclipse_metadata: + metadata = eclipse_metadata.read() + packages = miners.eclipse.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("eclipse/Acceleo_projects_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_fdroid.py b/minecode/tests/miners/test_fdroid.py similarity index 58% rename from minecode/tests/test_fdroid.py rename to minecode/tests/miners/test_fdroid.py index 021b5f53..f81543e2 100644 --- a/minecode/tests/test_fdroid.py +++ b/minecode/tests/miners/test_fdroid.py @@ -9,51 +9,51 @@ import json import os +from unittest.mock import patch -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode.mappers import fdroid as fdroid_mapper -from minecode.visitors import fdroid as fdroid_visitor -from minecode.visitors import URI +from minecode.miners import URI +from minecode.miners import fdroid from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class TestFdroidVisitor(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_FdroidPackageRepoVisitor(self): - uri = 'https://f-droid.org/repo/index-v2.json' - test_loc = self.get_test_loc('fdroid/index-v2.json') - with patch('requests.get') as mock_http_get: + uri = "https://f-droid.org/repo/index-v2.json" + test_loc = self.get_test_loc("fdroid/index-v2.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _errors = fdroid_visitor.FdroidIndexVisitor(uri) + uris, data, _errors = fdroid.FdroidIndexVisitor(uri) # this is a non-persistent visitor, lets make sure we dont return any data assert not data expected_loc = self.get_test_loc( - 'fdroid/index-v2-expected-visit.json',) + "fdroid/index-v2-expected-visit.json", + ) self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) class TestFdroidMapper(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_packages(self): - with 
open(self.get_test_loc('fdroid/index-v2-visited.json')) as fdroid_data: + with open(self.get_test_loc("fdroid/index-v2-visited.json")) as fdroid_data: visited_uris = json.load(fdroid_data) visited_uris = [URI(**uri) for uri in visited_uris] purl_data = [(u.package_url, json.loads(u.data)) for u in visited_uris] packages = [] for purl, data in purl_data: - pkgs = list(fdroid_mapper.build_packages(purl, data)) + pkgs = list(fdroid.build_packages(purl, data)) packages.extend(pkgs) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'fdroid/index-v2-visited-expected-mapped.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("fdroid/index-v2-visited-expected-mapped.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_freebsd.py b/minecode/tests/miners/test_freebsd.py similarity index 53% rename from minecode/tests/test_freebsd.py rename to minecode/tests/miners/test_freebsd.py index 9f3aaa00..5f25171a 100644 --- a/minecode/tests/test_freebsd.py +++ b/minecode/tests/miners/test_freebsd.py @@ -9,60 +9,59 @@ import os -import yaml +from unittest.mock import patch -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import freebsd +from minecode import miners +from minecode.miners import freebsd from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class FreeBSDVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_freebsd_seed(self): - uri = 'https://pkg.freebsd.org' - test_loc = self.get_test_loc('freebsd/FreeBSD.org.html') - with patch('requests.get') as mock_http_get: + uri = "https://pkg.freebsd.org" + test_loc = self.get_test_loc("freebsd/FreeBSD.org.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = freebsd.FreeBSDBaseHTMLVisitors(uri) - expected_loc = self.get_test_loc('freebsd/FreeBSD.org.html_expected') + expected_loc = self.get_test_loc("freebsd/FreeBSD.org.html_expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_freebsd_subHTML(self): - uri = 'https://pkg.freebsd.org/FreeBSD:10:i386/release_0/' - test_loc = self.get_test_loc('freebsd/FreeBSD-10-i386_release_0_.html') - with patch('requests.get') as mock_http_get: + uri = "https://pkg.freebsd.org/FreeBSD:10:i386/release_0/" + test_loc = self.get_test_loc("freebsd/FreeBSD-10-i386_release_0_.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = freebsd.FreeBSDSubHTMLVisitors(uri) expected_loc = self.get_test_loc( - 'freebsd/FreeBSD-10-i386_release_0_.html_expected') + "freebsd/FreeBSD-10-i386_release_0_.html_expected" + ) self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_freebsd_indexvisitor(self): - uri = 'https://pkg.freebsd.org/FreeBSD:10:i386/release_0/packagesite.txz' - test_loc = self.get_test_loc('freebsd/packagesite.txz') - with patch('requests.get') as mock_http_get: + uri = "https://pkg.freebsd.org/FreeBSD:10:i386/release_0/packagesite.txz" + test_loc = 
self.get_test_loc("freebsd/packagesite.txz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = freebsd.FreeBSDIndexVisitors(uri) - expected_loc = self.get_test_loc('freebsd/indexfile_expected') + expected_loc = self.get_test_loc("freebsd/indexfile_expected") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) class FreedesktopMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_map_index_file(self): - with open(self.get_test_loc('freebsd/mapper_input1')) as freebsd_metadata: + with open(self.get_test_loc("freebsd/mapper_input1")) as freebsd_metadata: metadata = freebsd_metadata.read() - packages = mappers.freebsd.build_packages(metadata) + packages = miners.freebsd.build_packages(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'freebsd/indexfile_expected_mapper.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("freebsd/indexfile_expected_mapper.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_freedesktop.py b/minecode/tests/miners/test_freedesktop.py new file mode 100644 index 00000000..4932d02a --- /dev/null +++ b/minecode/tests/miners/test_freedesktop.py @@ -0,0 +1,62 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import os +from unittest.mock import patch + +from minecode import miners +from minecode.miners import freedesktop +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class FreedesktopTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + +class FreedesktopVistorTest(FreedesktopTest): + def test_visit_software_html_page(self): + uri = "https://www.freedesktop.org/wiki/Software" + test_loc = self.get_test_loc("freedesktop/Software.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = freedesktop.FreedesktopHTMLVisitor(uri) + expected_loc = self.get_test_loc("freedesktop/freedesktop_software_expected") + self.check_expected_uris(uris, expected_loc) + + +class FreedesktopMapperTest(FreedesktopTest): + def test_map_software_html_page_hal(self): + with open(self.get_test_loc("freedesktop/hal.html")) as freedesktop_metadata: + metadata = freedesktop_metadata.read() + packages = miners.freedesktop.build_packages( + metadata, + "https://www.freedesktop.org/wiki/Software/hal", + purl="pkg:freedesktop/hal", + ) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("freedesktop/hal_project_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_map_software_html_page_libinput(self): + with open( + self.get_test_loc("freedesktop/libinput.html") + ) as freedesktop_metadata: + metadata = freedesktop_metadata.read() + packages = miners.freedesktop.build_packages( + metadata, + "https://www.freedesktop.org/wiki/Software/libinput/", + purl="pkg:freedesktop/libinput", + ) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("freedesktop/libinput_project_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_github.py b/minecode/tests/miners/test_github.py similarity index 51% rename from minecode/tests/test_github.py rename to minecode/tests/miners/test_github.py index b673a408..0ac02059 100644 --- a/minecode/tests/test_github.py +++ b/minecode/tests/miners/test_github.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. 
@@ -9,44 +8,42 @@ # import os - -from mock import MagicMock -from mock import Mock -from mock import patch +from unittest.mock import MagicMock +from unittest.mock import patch from github.Download import Download -from github.MainClass import Github from github.Repository import Repository -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import github +from minecode import miners +from minecode.miners import github from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class GithubVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) - @patch('github.MainClass.Github.get_repo') + @patch("github.MainClass.Github.get_repo") def test_GithubRepoVisitor(self, mock_get_repo): repository = MagicMock(spec=Repository) - repository.name = 'grit' + repository.name = "grit" repository.size = 7954 repository.id = 1 - repository.description = '**Grit is no longer maintained. Check out libgit2/rugged.** Grit gives you object oriented read/write access to Git repositories via Ruby.' - repository.language = 'Ruby' - repository.homepage = 'http://grit.rubyforge.org/' + repository.description = "**Grit is no longer maintained. Check out libgit2/rugged.** Grit gives you object oriented read/write access to Git repositories via Ruby." + repository.language = "Ruby" + repository.homepage = "http://grit.rubyforge.org/" repository._issues_url = None repository._git_url = None - repository.html_url = 'https://github.com/mojombo/grit' + repository.html_url = "https://github.com/mojombo/grit" repository.svn_url = None repository.etag = None - repository.clone_url = 'https://github.com/mojombo/grit.git' + repository.clone_url = "https://github.com/mojombo/grit.git" repository.watchers = None - repository.full_name = 'mojombo/grit' - repository.ssh_url = 'git@github.com:mojombo/grit.git' + repository.full_name = "mojombo/grit" + repository.ssh_url = "git@github.com:mojombo/grit.git" repository.owner = None repository.blobs_url = None repository.master_branch = None @@ -54,17 +51,17 @@ def test_GithubRepoVisitor(self, mock_get_repo): repository.pushed_at = None download = MagicMock(spec=Download) - download.name = 'grit-1.0.1.gem' + download.name = "grit-1.0.1.gem" download.redirect = None download.description = None - download.url = 'https://api.github.com/repos/mojombo/grit/downloads/5' + download.url = "https://api.github.com/repos/mojombo/grit/downloads/5" download.size = 1861632 download.s3_url = None download.created_at = None download.download_count = 187 download.redirect = None download.signature = None - download.html_url = 'https://github.com/downloads/mojombo/grit/grit-1.0.1.gem' + download.html_url = "https://github.com/downloads/mojombo/grit/grit-1.0.1.gem" download.bucket = None download.acl = None download.accesskeyid = None @@ -72,44 +69,44 @@ def test_GithubRepoVisitor(self, mock_get_repo): repository.get_downloads.return_value = iter([download]) tag = MagicMock() - tag.name = 'tags' - tag.zipball_url = 'https://api.github.com/repos/mojombo/grit/zipball/v2.5.0' - tag.tarball_url = 'https://api.github.com/repos/mojombo/grit/tarball/v2.5.0' - tag.name = 'v2.5.0' + tag.name = "tags" + tag.zipball_url = "https://api.github.com/repos/mojombo/grit/zipball/v2.5.0" + 
tag.tarball_url = "https://api.github.com/repos/mojombo/grit/tarball/v2.5.0" + tag.name = "v2.5.0" tag.commit = None repository.get_tags.return_value = iter([tag]) label = MagicMock() - label.name = 'label 1' + label.name = "label 1" repository.get_labels.return_value = iter([label]) mock_get_repo.return_value = repository - uri = 'https://api.github.com/repos/mojombo/grit' + uri = "https://api.github.com/repos/mojombo/grit" _, data, _ = github.GithubSingleRepoVisitor(uri) - expected_loc = self.get_test_loc('github/mojombo_grit_expected.json') + expected_loc = self.get_test_loc("github/mojombo_grit_expected.json") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) - @patch('github.MainClass.Github.get_repo') + @patch("github.MainClass.Github.get_repo") def test_GithubRepoVisitor_without_tag_without_download(self, mock_get_repo): repository = MagicMock(spec=Repository) - repository.name = 'calendar_builder' + repository.name = "calendar_builder" repository.size = 188 repository.id = 367 repository.description = None - repository.language = 'Ruby' + repository.language = "Ruby" repository.homepage = None repository._issues_url = None repository._git_url = None - repository.html_url = 'https://github.com/collectiveidea/calendar_builder' + repository.html_url = "https://github.com/collectiveidea/calendar_builder" repository.svn_url = None repository.etag = '"e10b78ff74a199fcf802be4afc333275"' - repository.clone_url = 'git@github.com:collectiveidea/calendar_builder.git' + repository.clone_url = "git@github.com:collectiveidea/calendar_builder.git" repository.watchers = None - repository.full_name = 'collectiveidea/calendar_builder' - repository.ssh_url = 'git@github.com:collectiveidea/calendar_builder.git' + repository.full_name = "collectiveidea/calendar_builder" + repository.ssh_url = "git@github.com:collectiveidea/calendar_builder.git" repository.owner = None - repository.blobs_url = 'https://api.github.com/repos/collectiveidea/calendar_builder/git/blobs{/sha}' + repository.blobs_url = "https://api.github.com/repos/collectiveidea/calendar_builder/git/blobs{/sha}" repository.master_branch = None repository.updated_at = None repository.pushed_at = None @@ -119,49 +116,51 @@ def test_GithubRepoVisitor_without_tag_without_download(self, mock_get_repo): repository.get_labels.return_value = None master_branch = MagicMock() - master_branch.name = 'master' + master_branch.name = "master" refactoring_branch = MagicMock() - refactoring_branch.name = 'refactoring' - repository.get_branches.return_value = iter( - [master_branch, refactoring_branch]) + refactoring_branch.name = "refactoring" + repository.get_branches.return_value = iter([master_branch, refactoring_branch]) mock_get_repo.return_value = repository - uri = 'https://api.github.com/repos/collectiveidea/calendar_builder' + uri = "https://api.github.com/repos/collectiveidea/calendar_builder" _, data, _ = github.GithubSingleRepoVisitor(uri) - expected_loc = self.get_test_loc( - 'github/calendar_builder-expected.json') + expected_loc = self.get_test_loc("github/calendar_builder-expected.json") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) def test_GithubReposVisitor(self): - uri = 'https://api.github.com/repositories?since=0' - test_loc = self.get_test_loc('github/repo_since0.json') - with patch('requests.get') as mock_http_get: + uri = "https://api.github.com/repositories?since=0" + test_loc = self.get_test_loc("github/repo_since0.json") + with patch("requests.get") as mock_http_get: 
mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = github.GithubReposVisitor(uri) - expected_loc = self.get_test_loc('github/repo_since0_expected.json') + expected_loc = self.get_test_loc("github/repo_since0_expected.json") self.check_expected_results(data, expected_loc) class GithubMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_github_repo_mapper1(self): - with open(self.get_test_loc('github/calendar_builder.json')) as json_metadata: + with open(self.get_test_loc("github/calendar_builder.json")) as json_metadata: metadata = json_metadata.read() - packages = mappers.github.build_github_packages( - metadata, 'https://api.github.com/repos/collectiveidea/calendar_builder') + packages = miners.github.build_github_packages( + metadata, "https://api.github.com/repos/collectiveidea/calendar_builder" + ) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'github/mapper_calendar_builder_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("github/mapper_calendar_builder_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_github_repo_mapper2(self): - with open(self.get_test_loc('github/mojombo_grit_from_visitor_4mapper_input.json')) as json_metadata: + with open( + self.get_test_loc("github/mojombo_grit_from_visitor_4mapper_input.json") + ) as json_metadata: metadata = json_metadata.read() - packages = mappers.github.build_github_packages( - metadata, 'https://api.github.com/repos/mojombo/grit') + packages = miners.github.build_github_packages( + metadata, "https://api.github.com/repos/mojombo/grit" + ) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'github/mojombo_grit_result_mapper_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "github/mojombo_grit_result_mapper_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_gitlab.py b/minecode/tests/miners/test_gitlab.py similarity index 50% rename from minecode/tests/test_gitlab.py rename to minecode/tests/miners/test_gitlab.py index 5450f0a6..70cc6153 100644 --- a/minecode/tests/test_gitlab.py +++ b/minecode/tests/miners/test_gitlab.py @@ -9,52 +9,56 @@ import os import unittest +from unittest.mock import patch -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode.visitors import gitlab +from minecode import miners +from minecode.miners import gitlab from minecode.tests import FIXTURES_REGEN -from minecode import mappers +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class GitlabTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) class GitlabVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) - @unittest.skip('The test is to test fetching remotely through http connection') + @unittest.skip("The test is to test fetching 
remotely through http connection") def test_visit_api_header_getheaders(self): - uri = 'https://gitlab.com/api/v4/projects' + uri = "https://gitlab.com/api/v4/projects" uris, _, _ = gitlab.GitlabAPIHeaderVisitor(uri) - expected_loc = self.get_test_loc('gitlab/expected_projects.json') + expected_loc = self.get_test_loc("gitlab/expected_projects.json") self.check_expected_uris(uris, expected_loc) def test_visit_metacpan_api_projects(self): - uri = 'https://gitlab.com/api/v4/projects?page=1&per_page=70&statistics=true' - test_loc = self.get_test_loc('gitlab/projects_visitor.json') - with patch('requests.get') as mock_http_get: + uri = "https://gitlab.com/api/v4/projects?page=1&per_page=70&statistics=true" + test_loc = self.get_test_loc("gitlab/projects_visitor.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = gitlab.GitlabAPIVisitor(uri) - expected_loc = self.get_test_loc( - 'gitlab/expected_projects_visitor.json') + expected_loc = self.get_test_loc("gitlab/expected_projects_visitor.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) class GitlabMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_map_software_html_page_hal(self): - with open(self.get_test_loc('gitlab/microservice-express-mongo.json')) as gitlab_json: + with open( + self.get_test_loc("gitlab/microservice-express-mongo.json") + ) as gitlab_json: metadata = gitlab_json.read() - packages = mappers.gitlab.build_packages_from_json(metadata) + packages = miners.gitlab.build_packages_from_json(metadata) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'gitlab/microservice-express-mongo_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "gitlab/microservice-express-mongo_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_golang.py b/minecode/tests/miners/test_golang.py new file mode 100644 index 00000000..285fe8e0 --- /dev/null +++ b/minecode/tests/miners/test_golang.py @@ -0,0 +1,99 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
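# A small demonstration (not from the patch) of why the GitHub tests earlier
# in this patch build their fakes with MagicMock(spec=Repository): with spec=,
# reading an attribute that the real class does not define raises
# AttributeError instead of silently returning a fresh child mock, so typos in
# the code under test fail loudly. The local Repository class below is a
# stand-in for PyGithub's github.Repository.Repository.
from unittest.mock import MagicMock


class Repository:
    name = None
    clone_url = None


repo = MagicMock(spec=Repository)
repo.name = "grit"  # setting attributes that exist on the spec is fine
repo.clone_url = "https://github.com/mojombo/grit.git"
assert repo.name == "grit"

try:
    repo.clone_urll  # misspelled attribute: rejected by the spec
except AttributeError:
    pass
else:
    raise AssertionError("spec'd mock should reject unknown attributes")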
+# + +import json +import os +from unittest.mock import patch + +from packageurl import PackageURL + +from minecode.miners.golang import GodocIndexVisitor +from minecode.miners.golang import GodocSearchVisitor +from minecode.miners.golang import build_golang_package +from minecode.miners.golang import parse_package_path +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class GoLangVisitorTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_GoLangGoDocAPIVisitor(self): + uri = "https://api.godoc.org/packages" + test_loc = self.get_test_loc("golang/packages.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = GodocIndexVisitor(uri) + expected_loc = self.get_test_loc("golang/packages_expected_uris.json") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_GodocSearchVisitor(self): + uri = "https://api.godoc.org/search?q=github.com/golang" + test_loc = self.get_test_loc("golang/godoc_search.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = GodocSearchVisitor(uri) + expected_loc = self.get_test_loc("golang/godoc_search_expected_uris.json") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_GodocSearchVisitor_with_non_github_urls(self): + uri = "https://api.godoc.org/search?q=github.com/golang*" + test_loc = self.get_test_loc("golang/godoc_search_off_github.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = GodocSearchVisitor(uri) + expected_loc = self.get_test_loc( + "golang/godoc_search_off_github_expected_uris.json" + ) + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_parse_package_path(self): + test_path = "github.com/lambdasoup/go-netlink/log" + purl = PackageURL.from_string( + "pkg:golang/github.com/lambdasoup/go-netlink" + "?vcs_repository=https://github.com/lambdasoup/go-netlink" + ) + expected = purl, "github.com/lambdasoup/go-netlink" + assert expected == parse_package_path(test_path) + + +class GoLangMapperTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_golang_package(self): + purl = "pkg:golang/github.com/golang/glog?vcs_repository=https://github.com/golang/glog" + with open(self.get_test_loc("golang/glog.json")) as f: + package_data = json.load(f) + package = build_golang_package(package_data, purl) + package = package.to_dict() + expected_loc = self.get_test_loc("golang/glog_expected.json") + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_build_golang_package_bitbucket(self): + purl = "pkg:bitbucket/bitbucket.org/zombiezen/yaml?vcs_repository=https://bitbucket.org/zombiezen/yaml" + with open(self.get_test_loc("golang/math3.json")) as f: + package_data = json.load(f) + package = build_golang_package(package_data, purl) + package = package.to_dict() + expected_loc = self.get_test_loc("golang/math3_expected.json") + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_build_golang_package_non_well_known(self): + purl = "pkg:golang/winterdrache.de/bindings/sdl" + with open(self.get_test_loc("golang/winter.json")) 
as f: + package_data = json.load(f) + package = build_golang_package(package_data, purl) + package = package.to_dict() + expected_loc = self.get_test_loc("golang/winter_expected.json") + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_googlecode.py b/minecode/tests/miners/test_googlecode.py new file mode 100644 index 00000000..9938fbe0 --- /dev/null +++ b/minecode/tests/miners/test_googlecode.py @@ -0,0 +1,131 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +import os +from unittest.mock import patch + +from minecode import miners +from minecode.miners import URI +from minecode.miners import googlecode +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class GoogleNewAPIVisitorsTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_visit_google_download_zip_visitor(self): + uri = "https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip" + test_loc = self.get_test_loc("googlecode/google-code-archive.txt.zip") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = googlecode.GooglecodeArchiveVisitor(uri) + expected_loc = self.get_test_loc( + "googlecode/expected_google-code-archive.txt.zip.json" + ) + self.check_expected_uris(uris, expected_loc) + + def test_visit_google_projectpages(self): + uri = "https://code.google.com/archive/search?q=domain:code.google.com" + test_loc = self.get_test_loc("googlecode/v2_api/GoogleCodeProjectHosting.htm") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) + expected_loc = self.get_test_loc( + "googlecode/v2_api/expected_googleprojects.json" + ) + self.check_expected_uris(uris, expected_loc) + + def test_visit_google_projectpage2(self): + uri = "https://code.google.com/archive/search?q=domain:code.google.com&page=2" + test_loc = self.get_test_loc( + "googlecode/v2_api/GoogleCodeProjectHosting_page2.htm" + ) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) + expected_loc = self.get_test_loc( + "googlecode/v2_api/expected_googleproject_page2.json" + ) + self.check_expected_uris(uris, expected_loc) + + def test_visit_google_download_json(self): + uri = "https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/project.json" + test_loc = self.get_test_loc("googlecode/v2_api/project.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = googlecode.GoogleProjectJsonVisitor(uri) + self.assertEqual( + [ + URI( + uri="https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/downloads-page-1.json" + ) + ], + list(uris), + ) + + def test_visit_google_json(self): + uri = 
"https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/downloads-page-1.json" + test_loc = self.get_test_loc("googlecode/v2_api/downloads-page-1.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) + expected_loc = self.get_test_loc( + "googlecode/v2_api/hg4j_download_expected.json" + ) + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_googleapi_project_json(self): + uri = "https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2%2Fapache-extras.org%2F124799961-qian%2Fproject.json?alt=media" + test_loc = self.get_test_loc( + "googlecode/v2_apache-extras.org_124799961-qian_project.json" + ) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) + expected_loc = self.get_test_loc( + "googlecode/expected_v2_apache-extras.org_124799961-qian_project2.json" + ) + self.check_expected_results(data, expected_loc) + + +class GoogleNewAPIMappersTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_packages_from_v2_projects_json(self): + with open( + self.get_test_loc("googlecode/v2_api/project.json") + ) as projectsjson_meta: + metadata = json.load(projectsjson_meta) + packages = miners.googlecode.build_packages_from_projectsjson_v2(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "googlecode/v2_api/package_expected_project.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_packages_from_v1_projects_json(self): + with open( + self.get_test_loc( + "googlecode/v2_apache-extras.org_124799961-qian_project.json" + ) + ) as projectsjson_meta: + metadata = json.load(projectsjson_meta) + packages = miners.googlecode.build_packages_from_projectsjson_v1(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "googlecode/mapper_expected_v2_apache-extras.org_124799961-qian_project.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_gstreamer.py b/minecode/tests/miners/test_gstreamer.py new file mode 100644 index 00000000..14b528b3 --- /dev/null +++ b/minecode/tests/miners/test_gstreamer.py @@ -0,0 +1,66 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import os +import re +from unittest.mock import patch + +from minecode import miners +from minecode.miners import gstreamer +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class GstreamerVistorTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_visit_gstreamer_source_root(self): + uri = "https://gstreamer.freedesktop.org/src/" + test_loc = self.get_test_loc("gstreamer/src_root.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = gstreamer.GstreamerHTMLVisitor(uri) + expected_loc = self.get_test_loc("gstreamer/src_root.html-expected") + self.check_expected_uris(uris, expected_loc) + + def test_visit_Gstreamer_subpath_contains_file_resources(self): + uri = "https://gstreamer.freedesktop.org/src/gst-openmax/pre/" + test_loc = self.get_test_loc("gstreamer/src_gst-openmax_pre.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = gstreamer.GstreamerHTMLVisitor(uri) + expected_loc = self.get_test_loc("gstreamer/src_gst-openmax_pre.html-expected") + self.check_expected_uris(uris, expected_loc) + + +class GstreamerMappersTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_package_mapper_regex(self): + regex = re.compile( + r"^https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+[.tar\.bz2|\.sha1sum|\.md5|\.gz|\.tar\.xz|\.asc]$" + ) + result = re.match( + regex, + "https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2", + ) + self.assertTrue(result) + + def test_build_package_from_url(self): + packages = miners.gstreamer.build_package_from_url( + "https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2" + ) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("gstreamer/mapper_build_from_url-expected") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_haxe.py b/minecode/tests/miners/test_haxe.py similarity index 52% rename from minecode/tests/test_haxe.py rename to minecode/tests/miners/test_haxe.py index 0c27d4c0..7156f19e 100644 --- a/minecode/tests/test_haxe.py +++ b/minecode/tests/miners/test_haxe.py @@ -9,59 +9,57 @@ import json import os +from unittest.mock import patch -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import haxe +from minecode import miners +from minecode.miners import haxe from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class HaxeVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_haxe_projects(self): - uri = 'https://lib.haxe.org/all' - test_loc = self.get_test_loc('haxe/all_haxelibs.html') - with patch('requests.get') as mock_http_get: + uri = "https://lib.haxe.org/all" + test_loc = self.get_test_loc("haxe/all_haxelibs.html") + with patch("requests.get") as mock_http_get: 
mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = haxe.HaxeProjectsVisitor(uri) - expected_loc = self.get_test_loc('haxe/all_haxelibs.html-expected') + expected_loc = self.get_test_loc("haxe/all_haxelibs.html-expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_haxe_versions(self): - uri = 'https://lib.haxe.org/p/openfl/versions' - test_loc = self.get_test_loc('haxe/all_versions_openfl.html') - with patch('requests.get') as mock_http_get: + uri = "https://lib.haxe.org/p/openfl/versions" + test_loc = self.get_test_loc("haxe/all_versions_openfl.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = haxe.HaxeVersionsVisitor(uri) - expected_loc = self.get_test_loc( - 'haxe/all_versions_openfl.html-expected') + expected_loc = self.get_test_loc("haxe/all_versions_openfl.html-expected") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_visit_haxe_package_json(self): - uri = 'https://lib.haxe.org/p/openfl/8.5.1/raw-files/openfl/package.json' - test_loc = self.get_test_loc('haxe/openfl-8.5.1-package.json') - with patch('requests.get') as mock_http_get: + uri = "https://lib.haxe.org/p/openfl/8.5.1/raw-files/openfl/package.json" + test_loc = self.get_test_loc("haxe/openfl-8.5.1-package.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = haxe.HaxePackageJsonVisitor(uri) - expected_loc = self.get_test_loc( - 'haxe/openfl-8.5.1-package.json-expected') + expected_loc = self.get_test_loc("haxe/openfl-8.5.1-package.json-expected") self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) class HaxeMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_project_package_json(self): - with open(self.get_test_loc('haxe/project_package.json')) as projectsjson_meta: + with open(self.get_test_loc("haxe/project_package.json")) as projectsjson_meta: metadata = json.load(projectsjson_meta) - packages = mappers.haxe.build_packages_with_json(metadata) + packages = miners.haxe.build_packages_with_json(metadata) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('haxe/project_package.json-expected') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("haxe/project_package.json-expected") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_maven.py b/minecode/tests/miners/test_maven.py new file mode 100644 index 00000000..b122ec7c --- /dev/null +++ b/minecode/tests/miners/test_maven.py @@ -0,0 +1,899 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
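# Every assertion in the maven tests below (and in the tests above) threads a
# regen=FIXTURES_REGEN flag into check_expected_results. That helper lives in
# minecode.utils_test and is not shown in this patch; the sketch below is only
# the usual shape of such a fixture-regeneration helper, under that
# assumption: when regen is true it rewrites the expected-results file from
# the current output, otherwise it asserts against the stored file.
import json


def check_expected_results(results, expected_loc, regen=False):
    if regen:
        # refresh the fixture from the current results
        with open(expected_loc, "w") as out:
            json.dump(results, out, indent=2)
    with open(expected_loc) as inp:
        expected = json.load(inp)
    assert expected == results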
+#
+
+import json
+import os
+import re
+from operator import itemgetter
+from unittest.mock import patch
+
+from django.test import TestCase as DjangoTestCase
+
+import packagedb
+from minecode.management.commands.run_map import map_uri
+from minecode.management.commands.run_visit import visit_uri
+from minecode.miners import maven
+from minecode.models import ResourceURI
+from minecode.tests import FIXTURES_REGEN
+from minecode.utils_test import JsonBasedTesting
+from minecode.utils_test import mocked_requests_get
+from minecode.utils_test import model_to_dict
+
+# TODO: add tests from /maven-indexer/indexer-core/src/test/java/org/apache/maven/index/artifact
+
+
+def sort_deps(results):
+    """
+    FIXME: UGLY TEMP WORKAROUND: we sort the results because of a PyMaven bug
+    See https://github.com/sassoftware/pymaven/issues/11
+    """
+    if "dependencies" in results:
+        results["dependencies"].sort()
+    elif results and "metadata" in results[0]:
+        for result in results:
+            result["metadata"]["dependencies"].sort()
+
+
+class MavenMiscTest(JsonBasedTesting, DjangoTestCase):
+    test_data_dir = os.path.join(
+        os.path.dirname(os.path.dirname(__file__)), "testfiles"
+    )
+
+    def test_get_entries(self):
+        index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz")
+        fields = (
+            list(maven.ENTRY_FIELDS.keys())
+            + list(maven.ENTRY_FIELDS_OTHER.keys())
+            + list(maven.ENTRY_FIELDS_IGNORED.keys())
+        )
+        fields = set(fields)
+        result = list(maven.get_entries(index, fields=fields))
+        expected_loc = self.get_test_loc("maven/index/expected_entries.json")
+        self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)
+
+    def test_get_entries_increment(self):
+        index = self.get_test_loc(
+            "maven/index/increment/nexus-maven-repository-index.445.gz"
+        )
+        fields = (
+            list(maven.ENTRY_FIELDS.keys())
+            + list(maven.ENTRY_FIELDS_OTHER.keys())
+            + list(maven.ENTRY_FIELDS_IGNORED.keys())
+        )
+        fields = set(fields)
+        result = list(maven.get_entries(index, fields=fields))
+        expected_loc = self.get_test_loc("maven/index/increment/expected_entries.json")
+        self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)
+
+    def test_get_entries_buggy(self):
+        index = self.get_test_loc("maven/index/buggy/nexus-maven-repository-index.gz")
+        fields = (
+            list(maven.ENTRY_FIELDS.keys())
+            + list(maven.ENTRY_FIELDS_OTHER.keys())
+            + list(maven.ENTRY_FIELDS_IGNORED.keys())
+        )
+        fields = set(fields)
+        result = list(maven.get_entries(index, fields=fields))
+        expected_loc = self.get_test_loc("maven/index/buggy/expected_entries.json")
+        self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)
+
+    def test_get_artifacts_full(self):
+        index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz")
+
+        fields = (
+            list(maven.ENTRY_FIELDS)
+            + list(maven.ENTRY_FIELDS_OTHER)
+            + list(maven.ENTRY_FIELDS_IGNORED)
+        )
+        fields = set(fields)
+
+        result = [
+            a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True)
+        ]
+        expected_loc = self.get_test_loc("maven/index/expected_artifacts.json")
+        self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)
+
+    def test_get_artifacts_increment(self):
+        index = self.get_test_loc(
+            "maven/index/increment/nexus-maven-repository-index.445.gz"
+        )
+        fields = (
+            list(maven.ENTRY_FIELDS.keys())
+            + list(maven.ENTRY_FIELDS_OTHER.keys())
+            + list(maven.ENTRY_FIELDS_IGNORED.keys())
+        )
+        fields = set(fields)
+        result = [
+            a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True)
+        ]
+        expected_loc =
self.get_test_loc( + "maven/index/increment/expected_artifacts.json" + ) + self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) + + def test_get_artifacts_buggy(self): + index = self.get_test_loc("maven/index/buggy/nexus-maven-repository-index.gz") + fields = ( + list(maven.ENTRY_FIELDS.keys()) + + list(maven.ENTRY_FIELDS_OTHER.keys()) + + list(maven.ENTRY_FIELDS_IGNORED.keys()) + ) + fields = set(fields) + result = [ + a.to_dict() for a in maven.get_artifacts(index, fields, include_all=True) + ] + expected_loc = self.get_test_loc("maven/index/buggy/expected_artifacts.json") + self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) + + def test_get_artifacts_defaults(self): + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + result = [a.to_dict() for a in maven.get_artifacts(index)] + expected_loc = self.get_test_loc("maven/index/expected_artifacts-defaults.json") + self.check_expected_results(result, expected_loc) + + def test_get_artifacts_no_worthyness(self): + index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + + def worth(a): + return True + + result = [a.to_dict() for a in maven.get_artifacts(index, worthyness=worth)] + expected_loc = self.get_test_loc( + "maven/index/expected_artifacts-all-worthy.json" + ) + self.check_expected_results(result, expected_loc) + + def test_get_artifacts_defaults_increment(self): + index = self.get_test_loc( + "maven/index/increment/nexus-maven-repository-index.445.gz" + ) + result = [a.to_dict() for a in maven.get_artifacts(index)] + expected_loc = self.get_test_loc( + "maven/index/increment/expected_artifacts-defaults.json" + ) + self.check_expected_results(result, expected_loc) + + def test_get_artifacts_defaults_buggy(self): + index = self.get_test_loc("maven/index/buggy/nexus-maven-repository-index.gz") + result = [a.to_dict() for a in maven.get_artifacts(index)] + expected_loc = self.get_test_loc( + "maven/index/buggy/expected_artifacts-defaults.json" + ) + self.check_expected_results(result, expected_loc) + + def test_build_artifact(self): + entry = { + "i": "0-alpha-1-20050407.154541-1.pom|1131488721000|-1|2|2|0|pom", + "m": "1318447185654", + "u": "org.apache|maven|archetypes|1|0-alpha-1-20050407.154541-1.pom", + } + + result = maven.build_artifact(entry, include_all=True) + result = result.to_dict() + expected = dict( + [ + ("group_id", "org.apache"), + ("artifact_id", "maven"), + ("version", "archetypes"), + ("packaging", "0-alpha-1-20050407.154541-1.pom"), + ("classifier", "1"), + ("extension", "pom"), + ("last_modified", "2005-11-08T22:25:21+00:00"), + ("size", None), + ("sha1", None), + ("name", None), + ("description", None), + ("src_exist", False), + ("jdoc_exist", False), + ("sig_exist", False), + ("sha256", None), + ("osgi", dict()), + ("classes", []), + ] + ) + + self.assertEqual(expected.items(), result.items()) + + def test_build_url_and_filename_1(self): + test = { + "group_id": "de.alpharogroup", + "artifact_id": "address-book-domain", + "version": "3.12.0", + "classifier": None, + "extension": "jar", + } + expected = ( + "https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/3.12.0/address-book-domain-3.12.0.jar", + "address-book-domain-3.12.0.jar", + ) + self.assertEqual(expected, maven.build_url_and_filename(**test)) + + def test_build_url_and_filename_2(self): + test = { + "group_id": "de.alpharogroup", + "artifact_id": "address-book-data", + "version": "3.12.0", + "classifier": None, + "extension": "pom", + } + expected = ( + 
"https://repo1.maven.org/maven2/de/alpharogroup/address-book-data/3.12.0/address-book-data-3.12.0.pom", + "address-book-data-3.12.0.pom", + ) + self.assertEqual(expected, maven.build_url_and_filename(**test)) + + def test_build_url_and_filename_3(self): + test = { + "group_id": "de.alpharogroup", + "artifact_id": "address-book-rest-web", + "version": "3.12.0", + "classifier": None, + "extension": "war", + } + expected = ( + "https://repo1.maven.org/maven2/de/alpharogroup/address-book-rest-web/3.12.0/address-book-rest-web-3.12.0.war", + "address-book-rest-web-3.12.0.war", + ) + self.assertEqual(expected, maven.build_url_and_filename(**test)) + + def test_build_url_and_filename_4(self): + test = { + "group_id": "uk.com.robust-it", + "artifact_id": "cloning", + "version": "1.9.5", + "classifier": "sources", + "extension": "jar", + } + expected = ( + "https://repo1.maven.org/maven2/uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar", + "cloning-1.9.5-sources.jar", + ) + self.assertEqual(expected, maven.build_url_and_filename(**test)) + + def test_build_url_and_filename_with_alternate_base(self): + test = { + "group_id": "uk.com.robust-it", + "artifact_id": "cloning", + "version": "1.9.5", + "classifier": "sources", + "extension": "jar", + "base_repo_url": "maven-index://", + } + expected = ( + "maven-index:///uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar", + "cloning-1.9.5-sources.jar", + ) + self.assertEqual(expected, maven.build_url_and_filename(**test)) + + def test_build_maven_xml_url(self): + test = {"group_id": "de.alpharogroup", "artifact_id": "address-book-domain"} + expected = "https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/maven-metadata.xml" + self.assertEqual(expected, maven.build_maven_xml_url(**test)) + + +class MavenVisitorTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_MavenNexusIndexVisitor_uris(self): + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz" + test_loc = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) + expected_loc = self.get_test_loc("maven/index/expected_uris.json") + self.check_expected_uris( + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) + + def test_MavenNexusIndexPropertiesVisitor(self): + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties" + test_loc = self.get_test_loc( + "maven/index/increment/nexus-maven-repository-index.properties" + ) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = maven.MavenNexusPropertiesVisitor(uri) + expected_loc = self.get_test_loc( + "maven/index/increment/expected_properties_uris.json" + ) + self.check_expected_uris( + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) + + def test_MavenNexusIndexVisitor_uris_increment(self): + uri = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz" + ) + test_loc = self.get_test_loc( + "maven/index/increment/nexus-maven-repository-index.445.gz" + ) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) + expected_loc = 
self.get_test_loc("maven/index/increment/expected_uris.json") + self.check_expected_uris( + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) + + def test_MavenNexusIndexVisitor_uris_buggy(self): + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz" + test_loc = self.get_test_loc( + "maven/index/buggy/nexus-maven-repository-index.gz" + ) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) + expected_loc = self.get_test_loc("maven/index/buggy/expected_uris.json") + self.check_expected_uris( + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) + + def test_visit_uri_does_not_fail_on_incorrect_sha1(self): + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz" + resource_uri = ResourceURI.objects.insert(uri=uri) + + before = [p.id for p in ResourceURI.objects.all()] + test_loc = self.get_test_loc( + "maven/index/buggy/nexus-maven-repository-index.gz" + ) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + visit_uri(resource_uri) + + if before: + visited = ResourceURI.objects.exclude(id__in=before) + else: + visited = ResourceURI.objects.all() + + results = [model_to_dict(rec, fields=["uri", "sha1"]) for rec in visited] + results = sorted(results, key=itemgetter("uri")) + expected_loc = self.get_test_loc("maven/index/buggy/expected_visited_uris.json") + self.check_expected_results(results, expected_loc, regen=FIXTURES_REGEN) + visited.delete() + + def test_MavenPOMVisitor_data(self): + uri = "https://repo1.maven.org/maven2/classworlds/classworlds/1.1-alpha-2/classworlds-1.1-alpha-2.pom" + test_loc = self.get_test_loc("maven/pom/classworlds-1.1-alpha-2.pom") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _ = maven.MavenPOMVisitor(uri) + self.assertEqual(None, uris) + expected = open(test_loc, "rb").read() + self.assertEqual(expected, data) + + +class MavenEnd2EndTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_MavenNexusIndexVisitor_with_run_visit_then_map_end2end(self): + # setup + before = sorted(p.id for p in ResourceURI.objects.all()) + uri = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz" + ) + + resource_uri = ResourceURI.objects.insert(uri=uri) + test_index = self.get_test_loc("maven/index/nexus-maven-repository-index.gz") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_index) + visit_uri(resource_uri) + + if before: + visited = ResourceURI.objects.exclude(id__in=before) + else: + visited = ResourceURI.objects.all() + + results = list(model_to_dict(rec, exclude=["id"]) for rec in visited) + results = sorted(results, key=itemgetter("uri")) + expected_loc = self.get_test_loc("maven/end2end/expected_visited_uris.json") + self.check_expected_results(results, expected_loc, regen=FIXTURES_REGEN) + + pre_visited_uris = ResourceURI.objects.filter( + uri__contains="maven-index://" + ).exclude(id__in=before) + + self.assertTrue( + all(ru.last_visit_date and not ru.last_map_date for ru in pre_visited_uris) + ) + + package_ids_before = sorted( + p.id for p in packagedb.models.Package.objects.all() + ) + + # now onto mapping the previsited URIs + # setup + # test proper + for 
res_uri in pre_visited_uris: + map_uri(res_uri) + + newly_mapped = packagedb.models.Package.objects.filter( + download_url__startswith="https://repo1.maven.org/maven2" + ).exclude(id__in=package_ids_before) + # check that the saved packages are there as planned + self.assertEqual(19, newly_mapped.count()) + + package_results = list(pac.to_dict() for pac in newly_mapped) + expected_loc = self.get_test_loc("maven/end2end/expected_mapped_packages.json") + self.check_expected_results( + package_results, + expected_loc, + fields_to_remove=["package_sets"], + regen=FIXTURES_REGEN, + ) + + # check that the map status has been updated correctly + visited_then_mapped = ResourceURI.objects.filter(uri__contains="maven-index://") + self.assertTrue(all(ru.last_map_date for ru in visited_then_mapped)) + + def test_visit_and_map_using_pom_with_unicode(self): + uri = "https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.22/commons-jaxrs-1.22.pom" + test_loc = self.get_test_loc("maven/end2end_unicode/commons-jaxrs-1.22.pom") + + before_uri = [p.id for p in ResourceURI.objects.all()] + before_pkg = [p.id for p in packagedb.models.Package.objects.all()] + + resource_uri = ResourceURI.objects.insert(uri=uri) + + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + # visit test proper: this should insert all the test_uris + visit_uri(resource_uri) + map_uri(resource_uri) + + if before_uri: + visited = ResourceURI.objects.exclude(id__in=before_uri) + else: + visited = ResourceURI.objects.all() + + uri_results = sorted(model_to_dict(rec, exclude=["id"]) for rec in visited) + expected_loc = self.get_test_loc( + "maven/end2end_unicode/expected_visited_commons-jaxrs-1.22.json" + ) + self.check_expected_results(uri_results, expected_loc, regen=FIXTURES_REGEN) + + if before_pkg: + mapped = packagedb.models.Package.objects.exclude(id__in=before_pkg) + else: + mapped = packagedb.models.Package.objects.all() + + package_results = sorted(pac.to_dict() for pac in mapped) + expected_loc = self.get_test_loc( + "maven/end2end_unicode/expected_mapped_commons-jaxrs-1.22.json" + ) + self.check_expected_results(package_results, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_and_map_using_pom_with_unicode_multisteps(self): + # this test deals with a single POM and the results from + # the index and the pom visit yielding packages + + # Step 1: map some index data + before_pkg = [p.id for p in packagedb.models.Package.objects.all()] + + # this is a pre-visited as from the Maven index URI + index_uri_test_loc = self.get_test_loc( + "maven/end2end_multisteps/commons-jaxrs-1.21-index-data.json" + ) + index_uri = json.load(open(index_uri_test_loc, "rb")) + idx_resource_uri = ResourceURI.objects.insert(**index_uri) + + map_uri(idx_resource_uri) + + if before_pkg: + mapped = packagedb.models.Package.objects.exclude(id__in=before_pkg) + else: + mapped = packagedb.models.Package.objects.all() + + package_results = sorted( + (pac.to_dict() for pac in mapped), key=lambda d: list(d.keys()) + ) + expected_loc = self.get_test_loc( + "maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json" + ) + self.check_expected_results( + package_results, + expected_loc, + fields_to_remove=["package_sets"], + regen=FIXTURES_REGEN, + ) + + # Step 2: map a POM + + # this is a pre-visited URI as from a POM + pom_uri_test_loc = self.get_test_loc( + "maven/end2end_multisteps/commons-jaxrs-1.21-pom-data.json" + ) + pom_uri = json.load(open(pom_uri_test_loc, 
"rb")) + pom_resource_uri = ResourceURI.objects.insert(**pom_uri) + map_uri(pom_resource_uri) + + if before_pkg: + mapped = packagedb.models.Package.objects.exclude(id__in=before_pkg) + else: + mapped = packagedb.models.Package.objects.all() + + package_results = sorted( + (pac.to_dict() for pac in mapped), key=lambda d: list(d.keys()) + ) + expected_loc = self.get_test_loc( + "maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json" + ) + self.check_expected_results( + package_results, + expected_loc, + fields_to_remove=["package_sets"], + regen=FIXTURES_REGEN, + ) + + def test_visit_and_map_with_index(self): + uri = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties" + test_loc = self.get_test_loc( + "maven/end2end_index/nexus-maven-repository-index.properties" + ) + + before_uri = [p.id for p in ResourceURI.objects.all()] + + resource_uri = ResourceURI.objects.insert(uri=uri) + + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + # visit test proper: this should insert all the test_uris + visit_uri(resource_uri) + + if before_uri: + visited = ResourceURI.objects.exclude(id__in=before_uri).order_by("uri") + else: + visited = ResourceURI.objects.all().order_by("uri") + + uri_results = list(model_to_dict(rec, exclude=["id"]) for rec in visited) + expected_loc = self.get_test_loc( + "maven/end2end_index/expected_visited_index.json" + ) + self.check_expected_results(uri_results, expected_loc, regen=FIXTURES_REGEN) + + uri = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.543.gz" + ) + # Use a small index file for test cases + test_loc = self.get_test_loc( + "maven/end2end_index/nexus-maven-repository-index.163.gz" + ) + + resource_uri = ResourceURI.objects.get(uri=uri) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + # visit test proper: this should insert all the test_uris + visit_uri(resource_uri) + + if before_uri: + visited = ResourceURI.objects.exclude(id__in=before_uri).order_by("uri") + else: + visited = ResourceURI.objects.all().order_by("uri") + + uri_results = list(model_to_dict(rec, exclude=["id"]) for rec in visited) + expected_loc = self.get_test_loc( + "maven/end2end_index/expected_visited_increment_index.json" + ) + self.check_expected_results(uri_results, expected_loc, regen=FIXTURES_REGEN) + + +class MavenXmlMetadataVisitorTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_visit_maven_medatata_xml_file(self): + uri = ( + "https://repo1.maven.org/maven2/st/digitru/identity-core/maven-metadata.xml" + ) + test_loc = self.get_test_loc("maven/maven-metadata/maven-metadata.xml") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = maven.MavenMetaDataVisitor(uri) + expected_loc = self.get_test_loc("maven/maven-metadata/expected_maven_xml.json") + self.check_expected_uris(uris, expected_loc) + + +class MavenHtmlIndexVisitorTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_visit_maven_medatata_html_index_jcenter_1(self): + uri = "http://jcenter.bintray.com/" + test_loc = self.get_test_loc("maven/html/jcenter.bintray.com.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = 
maven.MavenMetaDataVisitor(uri) + expected_loc = self.get_test_loc( + "maven/html/visitor_expected_jcenter.bintray.com2.html.json" + ) + self.check_expected_uris(uris, expected_loc) + + def test_visit_maven_medatata_html_index_jcenter_2(self): + uri = "http://jcenter.bintray.com/Action/app/" + test_loc = self.get_test_loc("maven/html/app.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = maven.MavenMetaDataVisitor(uri) + expected_loc = self.get_test_loc("maven/html/visitor_expected_app.html.json") + self.check_expected_uris(uris, expected_loc) + + def test_visit_maven_medatata_html_index_jcenter_3(self): + uri = ( + "http://jcenter.bintray.com/'com/virtualightning'/stateframework-compiler/" + ) + test_loc = self.get_test_loc("maven/html/stateframework-compiler.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = maven.MavenMetaDataVisitor(uri) + expected_loc = self.get_test_loc( + "maven/html/visitor_expected_stateframework-compiler.html.json" + ) + self.check_expected_uris(uris, expected_loc) + + +# FIXME: we should not need to call a visitor for testing a mapper +class MavenMapperVisitAndMapTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_visit_and_build_package_from_pom_axis(self): + uri = "https://repo1.maven.org/maven2/axis/axis/1.4/axis-1.4.pom" + test_loc = self.get_test_loc("maven/mapper/axis-1.4.pom") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = maven.MavenPOMVisitor(uri) + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc("maven/mapper/axis-1.4.pom.package.json") + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_and_build_package_from_pom_commons_pool(self): + uri = "https://repo1.maven.org/maven2/commons-pool/commons-pool/1.5.7/commons-pool-1.5.7.pom" + test_loc = self.get_test_loc("maven/mapper/commons-pool-1.5.7.pom") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = maven.MavenPOMVisitor(uri) + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/mapper/commons-pool-1.5.7.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_and_build_package_from_pom_struts(self): + uri = "https://repo1.maven.org/maven2/struts-menu/struts-menu/2.4.2/struts-menu-2.4.2.pom" + test_loc = self.get_test_loc("maven/mapper/struts-menu-2.4.2.pom") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = maven.MavenPOMVisitor(uri) + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/mapper/struts-menu-2.4.2.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_and_build_package_from_pom_mysql(self): + uri = "https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.27/mysql-connector-java-5.1.27.pom" + test_loc = self.get_test_loc("maven/mapper/mysql-connector-java-5.1.27.pom") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = maven.MavenPOMVisitor(uri) + package = 
maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/mapper/mysql-connector-java-5.1.27.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_and_build_package_from_pom_xbean(self): + uri = "https://repo1.maven.org/maven2/xbean/xbean-jmx/2.0/xbean-jmx-2.0.pom" + test_loc = self.get_test_loc("maven/mapper/xbean-jmx-2.0.pom") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = maven.MavenPOMVisitor(uri) + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc("maven/mapper/xbean-jmx-2.0.pom.package.json") + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_and_build_package_from_pom_maven_all(self): + uri = "https://repo1.maven.org/maven2/date/yetao/maven/maven-all/1.0-RELEASE/maven-all-1.0-RELEASE.pom" + test_loc = self.get_test_loc("maven/mapper/maven-all-1.0-RELEASE.pom") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = maven.MavenPOMVisitor(uri) + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/mapper/maven-all-1.0-RELEASE.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_visit_and_build_package_from_pom_with_unicode(self): + uri = "https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.21/commons-jaxrs-1.21.pom" + test_loc = self.get_test_loc("maven/mapper/commons-jaxrs-1.21.pom") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = maven.MavenPOMVisitor(uri) + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/mapper/commons-jaxrs-1.21.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + +class MavenMapperGetPackageTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_get_package_from_pom_1(self): + test_loc = self.get_test_loc("maven/parsing/parse/jds-3.0.1.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/parse/jds-3.0.1.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_2(self): + test_loc = self.get_test_loc( + "maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom" + ) + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_3(self): + test_loc = self.get_test_loc("maven/parsing/parse/jds-2.17.0718b.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/parse/jds-2.17.0718b.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_4(self): + test_loc = self.get_test_loc("maven/parsing/parse/maven-javanet-plugin-1.7.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + 
"maven/parsing/parse/maven-javanet-plugin-1.7.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_5(self): + test_loc = self.get_test_loc("maven/parsing/loop/coreplugin-1.0.0.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/loop/coreplugin-1.0.0.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_6(self): + test_loc = self.get_test_loc("maven/parsing/loop/argus-webservices-2.7.0.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/loop/argus-webservices-2.7.0.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_7(self): + test_loc = self.get_test_loc("maven/parsing/loop/pkg-2.0.13.1005.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/loop/pkg-2.0.13.1005.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_8(self): + test_loc = self.get_test_loc("maven/parsing/loop/ojcms-beans-0.1-beta.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/loop/ojcms-beans-0.1-beta.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_9(self): + test_loc = self.get_test_loc("maven/parsing/loop/jacuzzi-annotations-0.2.1.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/loop/jacuzzi-annotations-0.2.1.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_10(self): + test_loc = self.get_test_loc("maven/parsing/loop/argus-webservices-2.8.0.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/loop/argus-webservices-2.8.0.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_11(self): + test_loc = self.get_test_loc("maven/parsing/loop/jacuzzi-database-0.2.1.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/loop/jacuzzi-database-0.2.1.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_12(self): + test_loc = self.get_test_loc("maven/parsing/empty/common-object-1.0.2.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/empty/common-object-1.0.2.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def test_get_package_from_pom_13(self): + test_loc = self.get_test_loc("maven/parsing/empty/osgl-http-1.1.2.pom") + data = open(test_loc).read() + package = maven.get_package(data).to_dict() + expected_loc = self.get_test_loc( + "maven/parsing/empty/osgl-http-1.1.2.pom.package.json" + ) + self.check_expected_results(package, expected_loc, regen=FIXTURES_REGEN) + + def 
test_regex_maven_pom_mapper_1(self): + regex = re.compile(r"^https?://repo1.maven.org/maven2/.*\.pom$") + result = re.match( + regex, + "https://repo1.maven.org/maven2/com/google/appengine/appengine-api-1.0-sdk/1.2.0/appengine-api-1.0-sdk-1.2.0.pom", + ) + self.assertTrue(result) + + def test_MavenNexusIndexVisitor_uris_increment_contain_correct_purl(self): + uri = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz" + ) + test_loc = self.get_test_loc( + "maven/index/increment2/nexus-maven-repository-index.457.gz" + ) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) + uris = [u for i, u in enumerate(uris) if i % 500 == 0] + expected_loc = self.get_test_loc("maven/index/increment2/expected_uris.json") + self.check_expected_uris( + uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN + ) + + def test_MavenNexusIndexVisitor_then_get_mini_package_from_index_data(self): + uri = ( + "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz" + ) + test_loc = self.get_test_loc( + "maven/index/increment2/nexus-maven-repository-index.457.gz" + ) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = maven.MavenNexusIndexVisitor(uri) + results = [] + for i, u in enumerate(uris): + # only get a few records + if i % 500 == 0: + minip = maven.get_mini_package(u.data, u.uri, u.package_url) + results.append(minip and minip.to_dict() or minip) + expected_loc = self.get_test_loc( + "maven/index/increment2/expected_mini_package.json" + ) + self.check_expected_results(results, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_npm.py b/minecode/tests/miners/test_npm.py new file mode 100644 index 00000000..bb409f3b --- /dev/null +++ b/minecode/tests/miners/test_npm.py @@ -0,0 +1,170 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import json +import os +import re +from unittest.mock import patch + +from minecode import miners +from minecode import route +from minecode.miners import npm +from minecode.models import ResourceURI +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class TestNPMVisit(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + # FIXME: use smaller test files + def test_NpmRegistryVisitor(self): + uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=2300000" + test_loc = self.get_test_loc("npm/replicate_doc1.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _errors = npm.NpmRegistryVisitor(uri) + # this is a non-persistent visitor; let's make sure we don't return any data + assert not data + expected_loc = self.get_test_loc("npm/expected_doclimit_visitor.json") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_NpmRegistryVisitor_OverLimit(self): + uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=2300000" + test_loc = self.get_test_loc("npm/over_limit.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = npm.NpmRegistryVisitor(uri) + expected_loc = self.get_test_loc("npm/expected_over_limit.json") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_NpmRegistryVisitor_1000records(self): + uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=77777" + test_loc = self.get_test_loc("npm/1000_records.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = npm.NpmRegistryVisitor(uri) + expected_loc = self.get_test_loc("npm/expected_1000_records.json") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + +class TestNPMMapper(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_packages(self): + with open(self.get_test_loc("npm/0flux.json")) as npm_metadata: + metadata = json.load(npm_metadata) + packages = miners.npm.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/0flux_npm_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_package2(self): + with open(self.get_test_loc("npm/2112.json")) as npm_metadata: + metadata = json.load(npm_metadata) + packages = miners.npm.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/npm_2112_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_package3(self): + with open(self.get_test_loc("npm/microdata.json")) as npm_metadata: + metadata = json.load(npm_metadata) + packages = miners.npm.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/microdata-node_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_package_with_visitor_data(self): + uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=77777" +
test_loc = self.get_test_loc("npm/1000_records.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = npm.NpmRegistryVisitor(uri) + uris_list = list(uris) + assert len(uris_list) == 1001 + # Pick an arbitrary record from the 0-1000 range (index 29) + metadata = uris_list[29].data + packages = miners.npm.build_packages(json.loads(metadata)) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/29_record_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + # Pick an arbitrary record from the 0-1000 range (index 554) + metadata = uris_list[554].data + packages = miners.npm.build_packages(json.loads(metadata)) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/554_record_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_package_with_ticket_439(self): + uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=10&since=7333426" + test_loc = self.get_test_loc("npm/ticket_439.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = npm.NpmRegistryVisitor(uri) + uris_list = list(uris) + assert len(uris_list) == 11 + # Pick index one, the first actual record: it is the problem package "angular2-autosize". + # The zero element in the JSON is only the URL for the next visitor to use; its data is empty. + metadata = uris_list[1].data + packages = miners.npm.build_packages(json.loads(metadata)) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/expected_ticket_439.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_package_verify_ticket_440(self): + uri = "https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=10&since=7632607" + test_loc = self.get_test_loc("npm/ticket_440_records.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = npm.NpmRegistryVisitor(uri) + uris_list = list(uris) + assert len(uris_list) == 11 + # Pick index one instead of zero: it is the problem package "npm-research" + # (https://registry.npmjs.org/npm-research). The zero element in the JSON + # is only the URL for the next visitor to use. + metadata = uris_list[1].data + packages = miners.npm.build_packages(json.loads(metadata)) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/expected_ticket_440.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_npm_mapper(self): + test_uri = "https://registry.npmjs.org/angular-compare-validator" + router = route.Router() + router.append(test_uri, miners.npm.NpmPackageMapper) + test_loc = self.get_test_loc("npm/mapper/index.json") + with open(test_loc, "rb") as test_file: + test_data = test_file.read().decode("utf-8") + + test_res_uri = ResourceURI(uri=test_uri, data=test_data) + packages = miners.npm.NpmPackageMapper(test_uri, test_res_uri) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/mapper/index.expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_package_for_jsonp_filter(self): + with open(self.get_test_loc("npm/jsonp-filter.json")) as npm_metadata: + metadata = json.load(npm_metadata) + packages =
miners.npm.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("npm/jsonp-filter-expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_regex_npm_mapper(self): + regex = re.compile(r"^https://registry.npmjs.org/[^\/]+$") + result = re.match( + regex, "https://registry.npmjs.org/react-mobile-navigation-modal" + ) + self.assertTrue(result) diff --git a/minecode/tests/miners/test_nuget.py b/minecode/tests/miners/test_nuget.py new file mode 100644 index 00000000..5f7e659a --- /dev/null +++ b/minecode/tests/miners/test_nuget.py @@ -0,0 +1,113 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import json +import os +import re +from unittest.mock import patch + +from minecode import miners +from minecode.miners import nuget +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class NugetVisitorsTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_NugetQueryVisitor(self): + uri = "https://api-v2v3search-0.nuget.org/query" + test_loc = self.get_test_loc("nuget/query.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = nuget.NugetQueryVisitor(uri) + expected_loc = self.get_test_loc("nuget/nuget_query_expected") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_PackagesPageVisitor(self): + uri = "https://api-v2v3search-0.nuget.org/query?skip=0" + test_loc = self.get_test_loc("nuget/query_search.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = nuget.PackagesPageVisitor(uri) + expected_loc = self.get_test_loc("nuget/nuget_page_json_expected") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_NugetAPIJsonVisitor(self): + uri = "https://api.nuget.org/v3/registration1/entityframework/6.1.3.json" + test_loc = self.get_test_loc("nuget/entityframework.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = nuget.NugetAPIJsonVisitor(uri) + expected_loc = self.get_test_loc("nuget/nuget_downlloadvisitor_json_expected") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_NugetHTMLPageVisitor(self): + uri = "https://www.nuget.org/packages?page=1" + test_loc = self.get_test_loc("nuget/packages.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = nuget.NugetHTMLPageVisitor(uri) + expected_loc = self.get_test_loc("nuget/packages.html.expected.json") + self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) + + def test_NugetHTMLPackageVisitor(self): + uri = "https://www.nuget.org/packages/log4net" + test_loc = self.get_test_loc("nuget/log4net.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, 
_errors = nuget.NugetHTMLPackageVisitor(uri) + self.assertTrue(b"Apache-2.0 License " in data) + self.assertTrue(b"log4net is a tool to help the programmer" in data) + + +class TestNugetMap(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_packages(self): + with open(self.get_test_loc("nuget/entityframework2.json")) as nuget_metadata: + metadata = json.load(nuget_metadata) + packages = miners.nuget.build_packages_with_json(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("nuget/nuget_mapper_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_regex_1(self): + regex = re.compile(r"^https://api.nuget.org/packages/.*\.nupkg$") + result = re.match( + regex, "https://api.nuget.org/packages/entityframework.4.3.1.nupkg" + ) + self.assertTrue(result) + + def test_regex_2(self): + regex = re.compile(r"^https://api.nuget.org/v3/catalog.+\.json$") + result = re.match( + regex, + "https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json", + ) + self.assertTrue(result) + + def test_build_packages_from_html(self): + uri = "https://www.nuget.org/packages/log4net" + test_loc = self.get_test_loc("nuget/log4net.html") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _errors = nuget.NugetHTMLPackageVisitor(uri) + packages = miners.nuget.build_packages_from_html( + data, + uri, + ) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("nuget/nuget_mapper_log4net_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_openssl.py b/minecode/tests/miners/test_openssl.py similarity index 53% rename from minecode/tests/test_openssl.py rename to minecode/tests/miners/test_openssl.py index 7230557e..a42c4def 100644 --- a/minecode/tests/test_openssl.py +++ b/minecode/tests/miners/test_openssl.py @@ -7,62 +7,61 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -from datetime import datetime import os - -from mock import Mock -from mock import patch +from datetime import datetime +from unittest.mock import patch from django.test import TestCase as DjangoTestCase -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode.mappers.openssl import build_packages +from minecode.miners import openssl +from minecode.miners.openssl import build_packages from minecode.models import ResourceURI -from minecode.visitors import openssl from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class OpenSSLVisitorsTest(JsonBasedTesting): - - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_OpenSSLVisitor(self): - uri = 'https://ftp.openssl.org/' - test_loc = self.get_test_loc('openssl/Index.html') - with patch('requests.get') as mock_http_get: + uri = "https://ftp.openssl.org/" + test_loc = self.get_test_loc("openssl/Index.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _data, _errors = openssl.OpenSSLVisitor(uri) - expected_loc = self.get_test_loc( - 'openssl/expected_uri_openssl_index.json') + expected_loc = self.get_test_loc("openssl/expected_uri_openssl_index.json") self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) def test_OpenSSLVisitor_sub_folder(self): - uri = 'https://ftp.openssl.org/source/' - test_loc = self.get_test_loc('openssl/Indexof_source.html') - with patch('requests.get') as mock_http_get: + uri = "https://ftp.openssl.org/source/" + test_loc = self.get_test_loc("openssl/Indexof_source.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _data, _errors = openssl.OpenSSLVisitor(uri) expected_loc = self.get_test_loc( - 'openssl/expected_uri_openssl_sourceindex.json') + "openssl/expected_uri_openssl_sourceindex.json" + ) self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) class OpenSSLTest(JsonBasedTesting, DjangoTestCase): - - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_OpenSSL_mapper(self): - uri = 'https://ftp.openssl.org/snapshot/openssl-1.0.2-stable-SNAP-20180518.tar.gz' - last_modified_date = '2014-11-19 17:49' - last_modified_date = datetime.strptime( - last_modified_date, '%Y-%m-%d %H:%M') + uri = ( + "https://ftp.openssl.org/snapshot/openssl-1.0.2-stable-SNAP-20180518.tar.gz" + ) + last_modified_date = "2014-11-19 17:49" + last_modified_date = datetime.strptime(last_modified_date, "%Y-%m-%d %H:%M") resource_uri = ResourceURI.objects.insert( - uri=uri, size='527', last_modified_date=last_modified_date) + uri=uri, size="527", last_modified_date=last_modified_date + ) packages = build_packages(resource_uri) packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'openssl/openssl_mapper_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + expected_loc = self.get_test_loc("openssl/openssl_mapper_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_openwrt.py b/minecode/tests/miners/test_openwrt.py similarity index 52% rename from 
minecode/tests/test_openwrt.py rename to minecode/tests/miners/test_openwrt.py index 06d957cc..8bf124a1 100644 --- a/minecode/tests/test_openwrt.py +++ b/minecode/tests/miners/test_openwrt.py @@ -10,97 +10,103 @@ import json import os from unittest.case import expectedFailure +from unittest.mock import patch -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import openwrt +from minecode import miners +from minecode.miners import openwrt from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class OpenWRTVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_openwrt_download_pages(self): - uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/' - test_loc = self.get_test_loc( - 'openwrt/Index_of_chaos_calmer_15.05_.html') - with patch('requests.get') as mock_http_get: + uri = "https://downloads.openwrt.org/chaos_calmer/15.05/" + test_loc = self.get_test_loc("openwrt/Index_of_chaos_calmer_15.05_.html") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = openwrt.OpenWrtDownloadPagesVisitor(uri) - expected_loc = self.get_test_loc('openwrt/chaos_calmer_15.05_expected') + expected_loc = self.get_test_loc("openwrt/chaos_calmer_15.05_expected") self.check_expected_uris(uris, expected_loc) def test_visitor_openwrt_download_pages2(self): - uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/' + uri = "https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/" test_loc = self.get_test_loc( - 'openwrt/Index_of_chaos_calmer_15.05_adm5120_rb1xx_packages_base_.html') - with patch('requests.get') as mock_http_get: + "openwrt/Index_of_chaos_calmer_15.05_adm5120_rb1xx_packages_base_.html" + ) + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = openwrt.OpenWrtDownloadPagesVisitor(uri) - expected_loc = self.get_test_loc( - 'openwrt/chaos_calmer_15.05_expected_2') + expected_loc = self.get_test_loc("openwrt/chaos_calmer_15.05_expected_2") self.check_expected_uris(uris, expected_loc) @expectedFailure def test_visitor_openwrt_packages_gz(self): - uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/Packages.gz' - test_loc = self.get_test_loc('openwrt/Packages.gz') - with patch('requests.get') as mock_http_get: + uri = "https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/Packages.gz" + test_loc = self.get_test_loc("openwrt/Packages.gz") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = openwrt.OpenWrtPackageIndexVisitor(uri) - expected_loc = self.get_test_loc('openwrt/Packages_gz_expected') + expected_loc = self.get_test_loc("openwrt/Packages_gz_expected") self.check_expected_uris(uris, expected_loc) @expectedFailure def test_visitor_openwrt_ipk(self): - uri = 'https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/6to4_12-2_all.ipk' - test_loc = self.get_test_loc('openwrt/6to4_12-2_all.ipk') - with patch('requests.get') as mock_http_get: + uri = 
"https://downloads.openwrt.org/chaos_calmer/15.05/adm5120/rb1xx/packages/base/6to4_12-2_all.ipk" + test_loc = self.get_test_loc("openwrt/6to4_12-2_all.ipk") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = openwrt.OpenWrtPackageIndexVisitor(uri) result = json.loads(data) - json_file = self.get_test_loc('openwrt/6to4_12-2_all_ipk_expected') + json_file = self.get_test_loc("openwrt/6to4_12-2_all_ipk_expected") self.check_expected_results(result, json_file, regen=FIXTURES_REGEN) @expectedFailure def test_visitor_openwrt_ipk2(self): - uri = 'https://downloads.openwrt.org/kamikaze/7.09/brcm-2.4/packages/wpa-cli_0.5.7-1_mipsel.ipk' - test_loc = self.get_test_loc('openwrt/wpa-cli_0.5.7-1_mipsel.ipk') - with patch('requests.get') as mock_http_get: + uri = "https://downloads.openwrt.org/kamikaze/7.09/brcm-2.4/packages/wpa-cli_0.5.7-1_mipsel.ipk" + test_loc = self.get_test_loc("openwrt/wpa-cli_0.5.7-1_mipsel.ipk") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) _, data, _ = openwrt.OpenWrtPackageIndexVisitor(uri) result = json.loads(data) - json_file = self.get_test_loc( - 'openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected') + json_file = self.get_test_loc("openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected") self.check_expected_results(result, json_file) class OpenWRTMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) @expectedFailure def test_build_packages_1(self): - with open(self.get_test_loc('openwrt/6to4_12-2_all_ipk_expected')) as openwrt_ipk_meta: + with open( + self.get_test_loc("openwrt/6to4_12-2_all_ipk_expected") + ) as openwrt_ipk_meta: metadata = json.load(openwrt_ipk_meta) - packages = mappers.openwrt.build_packages(metadata) + packages = miners.openwrt.build_packages(metadata) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'openwrt/6to4_12-2_all_ipk_expected_mapper.json') + "openwrt/6to4_12-2_all_ipk_expected_mapper.json" + ) self.check_expected_results(packages, expected_loc) @expectedFailure def test_build_packages_2(self): - with open(self.get_test_loc('openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected')) as openwrt_ipk_meta: + with open( + self.get_test_loc("openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected") + ) as openwrt_ipk_meta: metadata = json.load(openwrt_ipk_meta) - packages = mappers.openwrt.build_packages(metadata) + packages = miners.openwrt.build_packages(metadata) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected_mapper.json') + "openwrt/wpa-cli_0.5.7-1_mipsel.ipk_expected_mapper.json" + ) self.check_expected_results(packages, expected_loc) diff --git a/minecode/tests/test_packagist.py b/minecode/tests/miners/test_packagist.py similarity index 52% rename from minecode/tests/test_packagist.py rename to minecode/tests/miners/test_packagist.py index 4303e171..4103e810 100644 --- a/minecode/tests/test_packagist.py +++ b/minecode/tests/miners/test_packagist.py @@ -9,40 +9,43 @@ import json import os +from unittest.mock import patch -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import packagist +from minecode import miners +from minecode.miners import 
packagist from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class PackagistVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_visit_packagistlist(self): - uri = 'https://packagist.org/packages/list.json' - test_loc = self.get_test_loc('packagist/list.json') - with patch('requests.get') as mock_http_get: + uri = "https://packagist.org/packages/list.json" + test_loc = self.get_test_loc("packagist/list.json") + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) uris, _, _ = packagist.PackagistListVisitor(uri) - expected_loc = self.get_test_loc('packagist/packagist_list_expected') + expected_loc = self.get_test_loc("packagist/packagist_list_expected") self.check_expected_uris(uris, expected_loc) class TestPackagistMap(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) def test_build_packages(self): - with open(self.get_test_loc('packagist/00f100_cakephp-opauth.json')) as packagist_package: + with open( + self.get_test_loc("packagist/00f100_cakephp-opauth.json") + ) as packagist_package: metadata = json.load(packagist_package) - packages = mappers.packagist.build_packages_with_json(metadata) + packages = miners.packagist.build_packages_with_json(metadata) packages = [p.to_dict() for p in packages] expected_loc = self.get_test_loc( - 'packagist/packaglist_00f100_cakephp-opauth_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "packagist/packaglist_00f100_cakephp-opauth_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/miners/test_pypi.py b/minecode/tests/miners/test_pypi.py new file mode 100644 index 00000000..d6143c17 --- /dev/null +++ b/minecode/tests/miners/test_pypi.py @@ -0,0 +1,222 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + + +import json +import os +from unittest.mock import patch + +from django.test import TestCase as DjangoTestCase + +from minecode import miners +from minecode.management.commands.run_map import map_uri +from minecode.models import ResourceURI +from minecode.route import Router +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get +from packagedb.models import Package + + +class TestPypiVisit(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + # Reference note (not a test): the pattern for mocking an XML-RPC + # ServerProxy, as used by test_PypiIndexVisitor below. + ''' +import unittest +import xmlrpc.client +from unittest.mock import patch + +class TestFoo(unittest.TestCase): + """ + A simple test + """ + @patch('xmlrpc.client.ServerProxy') + def test_first(self, mock_serverproxy): + m = mock_serverproxy.return_value + m.multiply.return_value = 6 + server = xmlrpc.client.ServerProxy("http://kushaldas.in/") + res = server.multiply(2, 3) + self.assertEqual(res, 6) +''' + + @patch("xmlrpc.client.ServerProxy") + def test_PypiIndexVisitor(self, mock_serverproxyclass): + package_list = [ + "0", + "0-._.-._.-._.-._.-._.-._.-0", + "0.0.1", + "00print_lol", + "vmnet", + "vmo", + "vmock", + "vmonere", + "VMPC", + ] + instance = mock_serverproxyclass.return_value + instance.list_packages.return_value = iter(package_list) + uri = "https://pypi.python.org/pypi/" + uris, _data, _error = miners.pypi.PypiIndexVisitor(uri) + self.assertIsNone(_data) + + expected_loc = self.get_test_loc("pypi/pypiindexvisitor-expected.json") + self.check_expected_uris(uris, expected_loc) + + def test_PypiPackageVisitor(self): + uri = "https://pypi.python.org/pypi/CAGE/json" + test_loc = self.get_test_loc("pypi/cage.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _error = miners.pypi.PypiPackageVisitor(uri) + + expected_loc = self.get_test_loc("pypi/expected_uris-cage.json") + self.check_expected_uris(uris, expected_loc) + + def test_PypiPackageVisitor_2(self): + uri = "https://pypi.python.org/pypi/boolean.py/json" + test_loc = self.get_test_loc("pypi/boolean.py.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, _errors = miners.pypi.PypiPackageVisitor(uri) + + expected_loc = self.get_test_loc("pypi/expected_uris-boolean.py.json") + self.check_expected_uris(uris, expected_loc) + + def test_PypiPackageReleaseVisitor_cage12(self): + uri = "https://pypi.python.org/pypi/CAGE/1.1.2/json" + test_loc = self.get_test_loc("pypi/cage_1.1.2.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri) + + expected_loc = self.get_test_loc("pypi/expected_uris-cage_1.1.2.json") + self.check_expected_uris(uris, expected_loc) + + expected_loc = self.get_test_loc("pypi/expected_data-cage_1.1.2.json") + self.check_expected_results(data, expected_loc) + + def test_PypiPackageReleaseVisitor_cage13(self): + uri = "https://pypi.python.org/pypi/CAGE/1.1.3/json" + test_loc = self.get_test_loc("pypi/cage_1.1.3.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri) + + expected_loc = self.get_test_loc("pypi/expected_uris-cage_1.1.3.json") + self.check_expected_uris(uris, expected_loc) + + expected_loc =
self.get_test_loc("pypi/expected_data-cage_1.1.3.json") + self.check_expected_results(data, expected_loc) + + def test_PypiPackageReleaseVisitor_boolean(self): + uri = "https://pypi.python.org/pypi/boolean.py/2.0.dev3/json" + test_loc = self.get_test_loc("pypi/boolean.py-2.0.dev3.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, data, _error = miners.pypi.PypiPackageReleaseVisitor(uri) + + expected_loc = self.get_test_loc("pypi/expected_uris-boolean.py-2.0.dev3.json") + self.check_expected_uris(uris, expected_loc) + + expected_loc = self.get_test_loc("pypi/expected_data-boolean.py-2.0.dev3.json") + self.check_expected_results(data, expected_loc) + + +class MockResourceURI: + def __init__(self, uri, data): + self.uri = uri + self.data = data + self.package_url = None + + +class TestPypiMap(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_packages_lxml(self): + with open(self.get_test_loc("pypi/lxml-3.2.0.json")) as pypi_meta: + metadata = json.load(pypi_meta) + packages = miners.pypi.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("pypi/expected-lxml-3.2.0.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_packages_boolean(self): + with open(self.get_test_loc("pypi/boolean.py-2.0.dev3.json")) as pypi_meta: + metadata = json.load(pypi_meta) + packages = miners.pypi.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("pypi/expected-boolean.py-2.0.dev3.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_packages_cage13(self): + with open(self.get_test_loc("pypi/cage_1.1.3.json")) as pypi_meta: + metadata = json.load(pypi_meta) + packages = miners.pypi.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("pypi/expected-CAGE-1.1.3.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_packages_cage12(self): + with open(self.get_test_loc("pypi/cage_1.1.2.json")) as pypi_meta: + metadata = json.load(pypi_meta) + packages = miners.pypi.build_packages(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("pypi/expected-CAGE-1.1.2.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_PypiPackageMapper_cage(self): + data = open(self.get_test_loc("pypi/cage_1.1.2.json")).read() + uri = "https://pypi.python.org/pypi/CAGE/1.1.2/json" + resuri = MockResourceURI(uri, data) + packages = miners.pypi.PypiPackageMapper(uri, resuri) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("pypi/expected-CAGE-1.1.2.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_PypiPackageMapper_lxml(self): + data = open(self.get_test_loc("pypi/lxml-3.2.0.json")).read() + uri = "https://pypi.python.org/pypi/lxml/3.2.0/json" + resuri = MockResourceURI(uri, data) + packages = miners.pypi.PypiPackageMapper(uri, resuri) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("pypi/expected-lxml-3.2.0.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_pypi_map(self): + # setup: add a mappable URI + with 
open(self.get_test_loc("pypi/map/3to2-1.1.1.json")) as mappable: + resuri = ResourceURI(**json.load(mappable)) + resuri.save() + + # sanity check + packages = miners.pypi.PypiPackageMapper(resuri.uri, resuri) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("pypi/map/expected-3to2-1.1.1.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + # build a mock router + router = Router() + router.append( + "https://pypi.python.org/pypi/3to2/1.1.1/json", + miners.pypi.PypiPackageMapper, + ) + + # sanity check + expected_mapped_package_uri = "https://pypi.python.org/packages/8f/ab/58a363eca982c40e9ee5a7ca439e8ffc5243dde2ae660ba1ffdd4868026b/3to2-1.1.1.zip" + self.assertEqual( + 0, Package.objects.filter(download_url=expected_mapped_package_uri).count() + ) + + # test proper + map_uri(resuri, _map_router=router) + mapped = Package.objects.filter(download_url=expected_mapped_package_uri) + self.assertEqual(1, mapped.count()) diff --git a/minecode/tests/miners/test_repodata.py b/minecode/tests/miners/test_repodata.py new file mode 100644 index 00000000..959f1321 --- /dev/null +++ b/minecode/tests/miners/test_repodata.py @@ -0,0 +1,85 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import os + +from commoncode.testcase import FileBasedTesting + +from minecode.miners import repodata + + +class TestRepoData(FileBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_get_pkg_infos(self): + filelists_xml = self.get_test_loc("repodata_rpms/repodata/filelists.xml") + primary_xml = self.get_test_loc("repodata_rpms/repodata/primary.xml") + other_xml = self.get_test_loc("repodata_rpms/repodata/other.xml") + expected = [ + { + "build_time": "1442515098", + "buildhost": "c1bk.rdu2.centos.org", + "href": "python-ceilometerclient-1.5.0-1.el7.src.rpm", + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "group": "Development/Languages", + "end_header_range": "4876", + "archive_size": "99648", + "package_size": "101516", + "epoch": "0", + "changelogs": [ + { + "date": "1387195200", + "changelog": "- Update to upstream 1.0.8\n- New dependency: python-six", + "author": "Jakub Ruzicka 1.0.8-1", + } + ], + "rel": "1.el7", + "type": "rpm", + "files": [ + {"name": "python-ceilometerclient-1.5.0.tar.gz"}, + {"name": "python-ceilometerclient.spec"}, + ], + "description": None, + "installed_size": "99230", + "file_time": "1446590411", + "arch": "src", + "name": "python-ceilometerclient", + "license": "ASL 2.0", + "url": "https://github.com/openstack/python-ceilometerclient", + "checksum": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "directories": [], + "summary": "Python API and CLI for OpenStack Ceilometer", + "start_header_range": "880", + "required_rpms": [ + {"name": "python-d2to1"}, + { + "ver": "2.5.0", + "epoch": "0", + "flags": "GE", + "name": "python-oslo-sphinx", + }, + {"name": "python-pbr"}, + {"name": "python-setuptools"}, + {"name": "python-sphinx"}, + {"name": "python2-devel"}, + ], + "sourcerpm": None, + "ver": "1.5.0", + } + ] + result = repodata.get_pkg_infos(filelists_xml, primary_xml, other_xml) + 
self.assertEqual(expected, result) + + def test_get_url_for_tag(self): + expected = "repodata/4c31e7e12c7aa42cf4d7d0b6ab7166fad76b5e40ea18f911e4a820cfa68d1541-filelists.xml.gz" + repomdxml_file = self.get_test_loc("repodata_rpms/repodata/repomd.xml") + output = repodata.get_url_for_tag(repomdxml_file, "filelists") + self.assertEqual(expected, output) diff --git a/minecode/tests/test_repodata_rpms.py b/minecode/tests/miners/test_repodata_rpms.py similarity index 56% rename from minecode/tests/test_repodata_rpms.py rename to minecode/tests/miners/test_repodata_rpms.py index 8ea5f8d8..d61aedc8 100644 --- a/minecode/tests/test_repodata_rpms.py +++ b/minecode/tests/miners/test_repodata_rpms.py @@ -1,5 +1,4 @@ #!/usr/bin/python -# -*- coding: utf-8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. @@ -9,24 +8,21 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals import os +from minecode.miners import repodata_rpms from minecode.utils_test import MiningTestCase -from minecode.visitors import repodata_rpms class RepodataRPMVisitorsTest(MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "testfiles") def test_collect_rsync_urls(self): - directory_listing_loc = self.get_test_loc( - 'repodata_rpms/centos_dir_listing') - base_url = 'http://mirrors.kernel.org/centos/' + directory_listing_loc = self.get_test_loc("repodata_rpms/centos_dir_listing") + base_url = "http://mirrors.kernel.org/centos/" uris = repodata_rpms.collect_rsync_urls( - directory_listing_loc, base_url, file_names=('repomd.xml',)) + directory_listing_loc, base_url, file_names=("repomd.xml",) + ) uris = list(uris) self.assertEqual(1, len(uris)) diff --git a/minecode/tests/miners/test_repomd_parser.py b/minecode/tests/miners/test_repomd_parser.py new file mode 100644 index 00000000..6db6bf55 --- /dev/null +++ b/minecode/tests/miners/test_repomd_parser.py @@ -0,0 +1,286 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + +import os +from unittest.mock import patch + +from packagedcode.rpm import EVR + +from minecode.miners import URI +from minecode.miners.repodata import combine_dicts_using_pkgid +from minecode.miners.repodata import combine_list_of_dicts +from minecode.miners.repomd import collect_rpm_packages_from_repomd +from minecode.miners.repomd import generate_rpm_objects +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get_for_uris + +# TODO: add redhat repo test! 
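+#
+# Note on the mocking pattern used throughout the tests below: each test maps
+# the repomd.xml URL and its sibling metadata URLs to local fixture files, then
+# routes requests.get through mocked_requests_get_for_uris. A minimal sketch,
+# where the example.org URL and the fixture path are hypothetical:
+#
+#     uri2loc = {"http://example.org/repodata/repomd.xml": "fixtures/repomd.xml"}
+#     with patch("requests.get") as mock_http_get:
+#         mock_http_get.side_effect = lambda *args, **kwargs: (
+#             mocked_requests_get_for_uris(uri2loc, *args, **kwargs)
+#         )
+#         uris, packages, error = collect_rpm_packages_from_repomd(
+#             "http://example.org/repodata/repomd.xml"
+#         )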
+ + +class TestRepomdParser(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_combine_list_of_dicts(self): + expected = {"a": "1", "b": "2", "c": "3"} + output = combine_list_of_dicts([{"a": "1"}, {"b": "2"}, {"c": "3"}]) + self.assertEqual(expected, output) + + def test_generate_rpm_objects(self): + packages = [ + { + "name": "python-ceilometerclient", + "arch": "src", + "ver": "1.5.0", + "rel": "1.el7", + "href": "/python-ceilometerclient-1.5.0-1.el7.src.rpm", + } + ] + repomdxml_url = ( + "http://vault.centos.org/7.1.1503/cloud/Source/openstack-liberty" + ) + rpms = list(generate_rpm_objects(packages, repomdxml_url)) + self.assertEqual(1, len(rpms)) + rpm = rpms[0] + self.assertEqual("python-ceilometerclient", rpm.name) + self.assertEqual(EVR(version="1.5.0", release="1.el7").to_string(), rpm.version) + + def test_collect_rpm_packages_from_repomd_cloudera(self): + uri2loc = { + "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/repomd.xml" + ), + "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/filelists.xml.gz" + ), + "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/other.xml.gz" + ), + "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/primary.xml.gz" + ), + } + + uri = "http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) + _uris, packages, _error = collect_rpm_packages_from_repomd(uri) + + expected_loc = self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_collect_rpm_packages_from_repomd_centos(self): + uri2loc = { + "http://vault.centos.org/3.8/updates/x86_64/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/centos/repomd.xml" + ), + "http://vault.centos.org/3.8/updates/x86_64/repodata/filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/centos/filelists.xml.gz" + ), + "http://vault.centos.org/3.8/updates/x86_64/repodata/other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/centos/other.xml.gz" + ), + "http://vault.centos.org/3.8/updates/x86_64/repodata/primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/centos/primary.xml.gz" + ), + } + + uri = "http://vault.centos.org/3.8/updates/x86_64/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) + uris, packages, _error = collect_rpm_packages_from_repomd(uri) + + expected_uris = [ + URI( + uri="http://vault.centos.org/3.8/updates/x86_64/RPMS/wireshark-0.99.2-EL3.1.x86_64.rpm" + ), + URI( + uri="http://vault.centos.org/3.8/updates/x86_64/RPMS/wireshark-gnome-0.99.2-EL3.1.x86_64.rpm" + ), + URI( + uri="http://vault.centos.org/3.8/updates/x86_64/RPMS/XFree86-100dpi-fonts-4.3.0-111.EL.x86_64.rpm" + ), + ] + self.assertEqual(expected_uris, uris) + + expected_loc = self.get_test_loc( + 
"repodata_rpms/repomd_parser/centos/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_collect_rpm_packages_from_repomd_cloudera_2(self): + uri2loc = { + "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/repomd.xml" + ), + "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/filelists.xml.gz" + ), + "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/primary.xml.gz" + ), + "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/other.xml.gz" + ), + } + + uri = "http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) + _uris, packages, _error = collect_rpm_packages_from_repomd(uri) + + expected_loc = self.get_test_loc( + "repodata_rpms/repomd_parser/cloudera2/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_collect_rpm_packages_from_repomd_postgresql(self): + uri2loc = { + "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/repomd.xml" + ), + "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/d5b4a2d13632cceb2a13a42fdb2887a22c1e262e6eeeb7270a80beec453392cd-filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/d5b4a2d13632cceb2a13a42fdb2887a22c1e262e6eeeb7270a80beec453392cd-filelists.xml.gz" + ), + "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/fc8c4fa6295d68abddcf5bba71435ecf585c439b86d7e75e0ba9bf3951f914b5-other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/fc8c4fa6295d68abddcf5bba71435ecf585c439b86d7e75e0ba9bf3951f914b5-other.xml.gz" + ), + "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/d5cb2a54df0aa000ac2a007b1d9b0d1f2e6a924d2d97584acbe654e59aa993e8-primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/d5cb2a54df0aa000ac2a007b1d9b0d1f2e6a924d2d97584acbe654e59aa993e8-primary.xml.gz" + ), + } + + uri = "http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) + uris, packages, error = collect_rpm_packages_from_repomd(uri) + self.assertEqual(None, error) + expected_uris = [ + URI( + uri="http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/skytools-92-debuginfo-3.1.5-1.rhel6.x86_64.rpm" + ), + URI( + uri="http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repmgr92-2.0.2-4.rhel6.x86_64.rpm" + ), + URI( + uri="http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/pgagent_92-3.2.1-1.rhel6.x86_64.rpm" + ), + ] + + self.assertEqual(expected_uris, uris) + expected_loc = self.get_test_loc( + "repodata_rpms/repomd_parser/postgresql/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_collect_rpm_packages_from_repomd_opensuse(self): + uri2loc = { + 
"http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/repomd.xml" + ), + "http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/09ed18eaa761fe64c863137db5c51fdb4e60fbb29d6c9b0c424e3119ba4875cd-filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/09ed18eaa761fe64c863137db5c51fdb4e60fbb29d6c9b0c424e3119ba4875cd-filelists.xml.gz" + ), + "http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/9c100bbff252834349ca677813f333881ce9d2ca9db8091ce387156ba7a22859-other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/9c100bbff252834349ca677813f333881ce9d2ca9db8091ce387156ba7a22859-other.xml.gz" + ), + "http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/314da4321afcff987bd3e28672e60f1a2324f2698480b84812f7ec0a1aef4041-primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/314da4321afcff987bd3e28672e60f1a2324f2698480b84812f7ec0a1aef4041-primary.xml.gz" + ), + } + + uri = "http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) + _uris, packages, _error = collect_rpm_packages_from_repomd(uri) + + expected_loc = self.get_test_loc( + "repodata_rpms/repomd_parser/opensuse/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_collect_rpm_packages_from_repomd_pgpool(self): + uri2loc = { + "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/repomd.xml": self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/repomd.xml" + ), + "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/filelists.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/filelists.xml.gz" + ), + "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/other.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/other.xml.gz" + ), + "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/primary.xml.gz": self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/primary.xml.gz" + ), + } + + uri = "http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/repomd.xml" + with patch("requests.get") as mock_http_get: + mock_http_get.side_effect = ( + lambda *args, **kwargs: mocked_requests_get_for_uris( + uri2loc, *args, **kwargs + ) + ) + _uris, packages, _error = collect_rpm_packages_from_repomd(uri) + + expected_loc = self.get_test_loc( + "repodata_rpms/repomd_parser/pgpool/expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_combine_dicts_using_pkgid(self): + all_dicts = [ + { + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "name": "python-ceilometerclient", + }, + { + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "ver": "1.5.0", + }, + { + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "rel": "1.el7", + }, + ] + expected = [ + { + "pkgid": "36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5", + "name": "python-ceilometerclient", + "rel": "1.el7", + "ver": "1.5.0", + } + ] + output = combine_dicts_using_pkgid(all_dicts) + self.assertEqual(expected, output) diff --git a/minecode/tests/miners/test_rubygems.py b/minecode/tests/miners/test_rubygems.py new file mode 100644 index 
00000000..0b3b0740 --- /dev/null +++ b/minecode/tests/miners/test_rubygems.py @@ -0,0 +1,345 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + + +import codecs +import json +import os +from unittest.mock import patch + +from django.test import TestCase as DjangoTestCase + +from commoncode.fileutils import file_name + +from minecode import miners +from minecode import route +from minecode import visit_router +from minecode.miners.rubygems import RubyGemsApiManyVersionsVisitor +from minecode.miners.rubygems import RubyGemsApiVersionsJsonMapper +from minecode.miners.rubygems import RubyGemsIndexVisitor +from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataMapper +from minecode.miners.rubygems import RubyGemsPackageArchiveMetadataVisitor +from minecode.miners.rubygems import build_rubygem_packages_from_api_data +from minecode.miners.rubygems import build_rubygem_packages_from_metadata +from minecode.miners.rubygems import get_gem_metadata +from minecode.models import ResourceURI +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get +from minecode.utils_test import model_to_dict + +# +# TODO: also parse Gemspec +# ('rubygems/address_standardization.gemspec', 'rubygems/address_standardization.gemspec.json'), +# ('rubygems/arel.gemspec', 'rubygems/arel.gemspec.json'), + + +class RubyGemsVisitorTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_check_gem_file_visitor_routes(self): + routes = [ + "https://rubygems.org/downloads/m2r-2.1.0.gem", # https + "http://rubygems.org/downloads/m2r-2.1.0.gem", # http + "https://rubygems.org/downloads/O365RubyEasy-0.0.1.gem", # upper + ] + + for gem_file_visitor_route in routes: + self.assertTrue(visit_router.resolve(gem_file_visitor_route)) + + def test_RubyGemsIndexVisitor_latest(self): + uri = "http://rubygems.org/specs.4.8.gz" + test_loc = self.get_test_loc("rubygems/index/latest_specs.4.8.gz") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, _ = RubyGemsIndexVisitor(uri) + expected_loc = self.get_test_loc( + "rubygems/index/latest_specs.4.8.gz.expected.json" + ) + uris_list = list(uris) + self.assertTrue(len(uris_list) > 1000) + self.check_expected_uris(uris_list[0:1000], expected_loc, regen=FIXTURES_REGEN) + + def test_RubyGemsApiVersionVisitor(self): + uri = "https://rubygems.org/api/v1/versions/0xffffff.json" + test_loc = self.get_test_loc("rubygems/apiv1/0xffffff.api.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = RubyGemsApiManyVersionsVisitor(uri) + expected_loc = self.get_test_loc("rubygems/apiv1/expected_0xffffff.api.json") + self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) + + def test_RubyGemsApiVersionVisitor2(self): + uri = "https://rubygems.org/api/v1/versions/a1630ty_a1630ty.json" + test_loc = self.get_test_loc("rubygems/apiv1/a1630ty_a1630ty.api.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, 
data, _ = RubyGemsApiManyVersionsVisitor(uri) + expected_loc = self.get_test_loc( + "rubygems/apiv1/expected_a1630ty_a1630ty.api.json" + ) + self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) + + def test_RubyGemsApiVersionVisitor3(self): + uri = "https://rubygems.org/api/v1/versions/zuck.json" + test_loc = self.get_test_loc("rubygems/apiv1/zuck.api.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = RubyGemsApiManyVersionsVisitor(uri) + expected_loc = self.get_test_loc("rubygems/apiv1/expected_zuck.api.json") + self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) + + def test_RubyGemsPackageArchiveMetadataVisitor(self): + uri = "https://rubygems.org/downloads/a_okay-0.1.0.gem" + test_loc = self.get_test_loc("rubygems/a_okay-0.1.0.gem", copy=True) + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, _ = RubyGemsPackageArchiveMetadataVisitor(uri) + expected_loc = self.get_test_loc("rubygems/a_okay-0.1.0.gem.metadata") + with open(expected_loc) as expect_file: + self.assertEqual(expect_file.read(), data) + + +class RubyGemsApiMapperTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_rubygem_packages_from_api_data_1(self): + with open(self.get_test_loc("rubygems/apiv1/0xffffff.api.json")) as api: + apidata = json.load(api) + packages = build_rubygem_packages_from_api_data(apidata, "0xffffff") + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("rubygems/apiv1/0xffffff.api.package.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_rubygem_packages_from_api_data_2(self): + with open(self.get_test_loc("rubygems/apiv1/zuck.api.json")) as api: + apidata = json.load(api) + packages = build_rubygem_packages_from_api_data(apidata, "zuck") + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("rubygems/apiv1/zuck.api.package.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_rubygem_packages_from_api_data_3(self): + with open(self.get_test_loc("rubygems/apiv1/a1630ty_a1630ty.api.json")) as api: + apidata = json.load(api) + packages = miners.rubygems.build_rubygem_packages_from_api_data( + apidata, "a1630ty_a1630ty" + ) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "rubygems/apiv1/a1630ty_a1630ty.api.package.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_rubygem_packages_from_api_data_with_deps(self): + with open(self.get_test_loc("rubygems/apiv1/action_tracker.api.json")) as api: + apidata = json.load(api) + packages = miners.rubygems.build_rubygem_packages_from_api_data( + apidata, "action_tracker" + ) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "rubygems/apiv1/action_tracker.api.package.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_RubyGemsApiVersionsJsonMapper(self): + test_uri = "https://rubygems.org/api/v1/versions/a1630ty_a1630ty.json" + router = route.Router() + router.append(test_uri, RubyGemsApiVersionsJsonMapper) + test_loc = self.get_test_loc("rubygems/apiv1/a1630ty_a1630ty.api.json") + with codecs.open(test_loc, encoding="utf-8") as ltest_file: + test_data = ltest_file.read() + + 
test_res_uri = ResourceURI(uri=test_uri, data=test_data) + packages = RubyGemsApiVersionsJsonMapper(test_uri, test_res_uri) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "rubygems/apiv1/a1630ty_a1630ty.api.mapped.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + +class RubyGemsArchiveMapperTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_RubyGemsPackageArchiveMetadataMapper(self): + test_uri = "https://rubygems.org/downloads/mysmallidea-address_standardization-0.4.1.gem" + router = route.Router() + router.append(test_uri, RubyGemsPackageArchiveMetadataMapper) + test_loc = self.get_test_loc( + "rubygems/mysmallidea-address_standardization-0.4.1.gem.metadata" + ) + with codecs.open(test_loc, encoding="utf-8") as test_file: + test_data = test_file.read() + + test_res_uri = ResourceURI(uri=test_uri, data=test_data) + packages = RubyGemsPackageArchiveMetadataMapper(test_uri, test_res_uri) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc( + "rubygems/mysmallidea-address_standardization-0.4.1.gem.mapped.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def check_mapped_packages( + self, test_loc, expected_loc, extract=True, regen=FIXTURES_REGEN + ): + test_loc = self.get_test_loc(test_loc, copy=True) + + if extract: + metadata = get_gem_metadata(test_loc) + else: + with open(test_loc) as tl: + metadata = tl.read() + + download_url = "https://rubygems.org/downloads/{}".format( + file_name(test_loc).replace(".metadata", "") + ) + results = build_rubygem_packages_from_metadata(metadata, download_url) + results = [p.to_dict() for p in results] + + expected_loc = self.get_test_loc(expected_loc) + if regen: + with codecs.open(expected_loc, "wb", encoding="UTF-8") as ex: + json.dump(results, ex, indent=2) + + with open(expected_loc) as ex: + expected = json.load(ex) + + assert expected == results + + def test_build_rubygem_packages_from_metadata_plain(self): + self.check_mapped_packages( + "rubygems/0mq-0.4.1.gem.metadata", + "rubygems/0mq-0.4.1.gem.package.json", + extract=False, + ) + + def test_build_rubygem_packages_from_metadata_0(self): + self.check_mapped_packages( + "rubygems/a_okay-0.1.0.gem", "rubygems/a_okay-0.1.0.gem.package.json" + ) + + def test_build_rubygem_packages_from_metadata_1(self): + self.check_mapped_packages( + "rubygems/archive-tar-minitar-0.5.2.gem", + "rubygems/archive-tar-minitar-0.5.2.gem.package.json", + ) + + def test_build_rubygem_packages_from_metadata_2(self): + self.check_mapped_packages( + "rubygems/blankslate-3.1.3.gem", + "rubygems/blankslate-3.1.3.gem.package.json", + ) + + def test_build_rubygem_packages_from_metadata_3(self): + self.check_mapped_packages( + "rubygems/m2r-2.1.0.gem", "rubygems/m2r-2.1.0.gem.package.json" + ) + + def test_build_rubygem_packages_from_metadata_4(self): + self.check_mapped_packages( + "rubygems/mysmallidea-address_standardization-0.4.1.gem", + "rubygems/mysmallidea-address_standardization-0.4.1.gem.package.json", + ) + + def test_build_rubygem_packages_from_metadata_5(self): + self.check_mapped_packages( + "rubygems/mysmallidea-mad_mimi_mailer-0.0.9.gem", + "rubygems/mysmallidea-mad_mimi_mailer-0.0.9.gem.package.json", + ) + + def test_build_rubygem_packages_from_metadata_6(self): + self.check_mapped_packages( + "rubygems/ng-rails-csrf-0.1.0.gem", + "rubygems/ng-rails-csrf-0.1.0.gem.package.json", + ) + 
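The mapper and visitor tests in these files all follow one fixture-driven pattern: patch requests.get so that mocked_requests_get serves a response recorded under testfiles/, run the visitor or mapper, then compare its output to an expected-JSON file that is rewritten whenever FIXTURES_REGEN is set. The sketch below distills that pattern outside the diff; my_visitor, the example/ fixture paths, and the endpoint URL are hypothetical stand-ins for illustration only.

import os
from unittest.mock import patch

import requests

from minecode.tests import FIXTURES_REGEN
from minecode.utils_test import JsonBasedTesting
from minecode.utils_test import mocked_requests_get


def my_visitor(uri):
    # Hypothetical stand-in for a minecode visitor: fetch the URI and
    # return a (uris, data, error) triple the way the real visitors do.
    response = requests.get(uri)
    return [], response.text, None


class MyVisitorTest(JsonBasedTesting):
    # testfiles/ sits next to the tests package, one level above miners/.
    test_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), "testfiles"
    )

    def test_my_visitor(self):
        uri = "https://example.org/api/packages.json"
        test_loc = self.get_test_loc("example/packages.json")
        with patch("requests.get") as mock_http_get:
            # Serve the recorded fixture instead of hitting the network.
            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
            _uris, data, _error = my_visitor(uri)
        # Compare against the expected file; regen=True rewrites it in place.
        expected_loc = self.get_test_loc("example/packages.expected.json")
        self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN)

The check_mapped_packages helper above hand-rolls the same regen-or-compare logic with json.dump, after optionally extracting the gem metadata with get_gem_metadata.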
+ def test_build_rubygem_packages_from_metadata_7(self): + self.check_mapped_packages( + "rubygems/small_wonder-0.1.10.gem", + "rubygems/small_wonder-0.1.10.gem.package.json", + ) + + def test_build_rubygem_packages_from_metadata_8(self): + self.check_mapped_packages( + "rubygems/small-0.2.gem", "rubygems/small-0.2.gem.package.json" + ) + + def test_build_rubygem_packages_from_metadata_9(self): + self.check_mapped_packages( + "rubygems/sprockets-vendor_gems-0.1.3.gem", + "rubygems/sprockets-vendor_gems-0.1.3.gem.package.json", + ) + + def test_build_rubygem_packages_from_metadata_with_deps(self): + self.check_mapped_packages( + "rubygems/action_tracker-1.0.2.gem", + "rubygems/action_tracker-1.0.2.gem.package.json", + ) + + +class RubyEnd2EndTest(JsonBasedTesting, DjangoTestCase): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_visit_and_map_end2end(self): + import packagedb + from minecode.management.commands.run_map import map_uri + from minecode.management.commands.run_visit import visit_uri + + uri = "https://rubygems.org/downloads/sprockets-vendor_gems-0.1.3.gem" + test_loc = self.get_test_loc( + "rubygems/sprockets-vendor_gems-0.1.3.gem", copy=True + ) + + before_uri = [p.id for p in ResourceURI.objects.all()] + before_pkg = [p.id for p in packagedb.models.Package.objects.all()] + + resource_uri = ResourceURI.objects.insert(uri=uri) + + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + # visit test proper: this should process all the test uris + visit_uri(resource_uri) + map_uri(resource_uri) + + if before_uri: + visited = ResourceURI.objects.exclude(id__in=before_uri) + else: + visited = ResourceURI.objects.all() + + uri_results = [model_to_dict(rec, exclude=["id"]) for rec in visited] + expected_loc = self.get_test_loc( + "rubygems/sprockets-vendor_gems-0.1.3.gem.visited.json" + ) + self.check_expected_results(uri_results, expected_loc, regen=FIXTURES_REGEN) + + if before_pkg: + mapped = packagedb.models.Package.objects.exclude(id__in=before_pkg) + else: + mapped = packagedb.models.Package.objects.all() + + package_results = [pac.to_dict() for pac in mapped] + expected_loc = self.get_test_loc( + "rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json" + ) + self.check_expected_results( + package_results, + expected_loc, + fields_to_remove=["package_sets"], + regen=FIXTURES_REGEN, + ) diff --git a/minecode/tests/miners/test_sourceforge.py b/minecode/tests/miners/test_sourceforge.py new file mode 100644 index 00000000..8bf0ad24 --- /dev/null +++ b/minecode/tests/miners/test_sourceforge.py @@ -0,0 +1,112 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
+# + +import json +import os +from unittest.mock import patch + +from minecode import miners +from minecode.miners import sourceforge +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get + + +class SourceforgeVisitorsTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_visit_sf_sitemap_index_new(self): + uri = "http://sourceforge.net/sitemap.xml" + test_loc = self.get_test_loc("sourceforge/sitemap.xml") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _data, error = sourceforge.SourceforgeSitemapIndexVisitor(uri) + + expected_loc = self.get_test_loc("sourceforge/expected_sf_sitemap_new.json") + self.check_expected_uris(uris, expected_loc) + self.assertIsNone(error) + + def test_visit_sf_sitemap_page_new(self): + uri = "http://sourceforge.net/sitemap-1.xml" + test_loc = self.get_test_loc("sourceforge/sitemap-1.xml") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, error = sourceforge.SourceforgeSitemapPageVisitor(uri) + + expected_loc = self.get_test_loc( + "sourceforge/expected_sf_sitemap_page_new.json" + ) + self.check_expected_uris(uris, expected_loc) + self.assertIsNone(error) + + def test_visit_sf_sitemap_page6(self): + uri = "https://sourceforge.net/sitemap-6.xml" + test_loc = self.get_test_loc("sourceforge/sitemap-6.xml") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + uris, _, error = sourceforge.SourceforgeSitemapPageVisitor(uri) + + expected_loc = self.get_test_loc("sourceforge/expected_sitemap-6.json") + self.check_expected_uris(uris, expected_loc) + self.assertIsNone(error) + + def test_visit_sf_project_json_api_new(self): + uri = "https://sourceforge.net/api/project/name/netwiki/json" + test_loc = self.get_test_loc("sourceforge/netwiki.json") + with patch("requests.get") as mock_http_get: + mock_http_get.return_value = mocked_requests_get(uri, test_loc) + _, data, error = sourceforge.SourceforgeProjectJsonVisitor(uri) + + expected_loc = self.get_test_loc("sourceforge/expected_netwiki.json") + self.check_expected_results(data, expected_loc) + self.assertIsNone(error) + + +class SourceforgeMappersTest(JsonBasedTesting): + test_data_dir = os.path.join( + os.path.dirname(os.path.dirname(__file__)), "testfiles" + ) + + def test_build_packages(self): + with open(self.get_test_loc("sourceforge/odanur.json")) as sourceforge_metadata: + metadata = json.load(sourceforge_metadata) + packages = miners.sourceforge.build_packages_from_metafile(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("sourceforge/mapper_odanur_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_packages2(self): + with open( + self.get_test_loc("sourceforge/openstunts.json") + ) as sourceforge_metadata: + metadata = json.load(sourceforge_metadata) + packages = miners.sourceforge.build_packages_from_metafile(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("sourceforge/mapper_openstunts_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_packages3(self): + with open(self.get_test_loc("sourceforge/monoql.json")) as sourceforge_metadata: + 
metadata = json.load(sourceforge_metadata) + packages = miners.sourceforge.build_packages_from_metafile(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("sourceforge/mapper_omonoql_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) + + def test_build_packages4(self): + with open( + self.get_test_loc("sourceforge/niftyphp.json") + ) as sourceforge_metadata: + metadata = json.load(sourceforge_metadata) + packages = miners.sourceforge.build_packages_from_metafile(metadata) + packages = [p.to_dict() for p in packages] + expected_loc = self.get_test_loc("sourceforge/mapper_niftyphp_expected.json") + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_api.py b/minecode/tests/test_api.py index f3507a12..5996a6d1 100644 --- a/minecode/tests/test_api.py +++ b/minecode/tests/test_api.py @@ -10,43 +10,49 @@ import json import os -from django.contrib.auth.models import Group, User +from django.contrib.auth.models import Group +from django.contrib.auth.models import User from django.core import signing from django.test import TestCase + from rest_framework import status from rest_framework.test import APIClient from minecode.models import ScannableURI from minecode.utils import get_webhook_url from minecode.utils_test import JsonBasedTesting -from packagedb.models import Package, Resource +from packagedb.models import Package +from packagedb.models import Resource class ScannableURIAPITestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.scan_queue_worker_user = User.objects.create_user( username="username", email="e@mail.com", - password="secret" + password="secret", # NOQA ) scan_queue_workers_group, _ = Group.objects.get_or_create( - name='scan_queue_workers') + name="scan_queue_workers" + ) scan_queue_workers_group.user_set.add(self.scan_queue_worker_user) - self.scan_queue_worker_auth = f"Token {self.scan_queue_worker_user.auth_token.key}" + self.scan_queue_worker_auth = ( + f"Token {self.scan_queue_worker_user.auth_token.key}" + ) self.scan_queue_worker_client = APIClient(enforce_csrf_checks=True) self.scan_queue_worker_client.credentials( - HTTP_AUTHORIZATION=self.scan_queue_worker_auth) - self.scan_queue_worker_user_id_str = str( - self.scan_queue_worker_user.id) + HTTP_AUTHORIZATION=self.scan_queue_worker_auth + ) + self.scan_queue_worker_user_id_str = str(self.scan_queue_worker_user.id) # create a staff user self.staff_user = User.objects.create_user( username="staff_username", email="staff_e@mail.com", - password="secret", - is_staff=True + password="secret", # NOQA + is_staff=True, ) self.staff_auth = f"Token {self.staff_user.auth_token.key}" self.staff_client = APIClient(enforce_csrf_checks=True) @@ -56,7 +62,7 @@ def setUp(self): self.regular_user = User.objects.create_user( username="regular_username", email="regular_e@mail.com", - password="secret", + password="secret", # NOQA ) self.regular_auth = f"Token {self.regular_user.auth_token.key}" self.regular_client = APIClient(enforce_csrf_checks=True) @@ -65,161 +71,159 @@ def setUp(self): self.anonymous_client = APIClient() self.package1 = Package.objects.create( - download_url='https://test-url.com/package1.tar.gz', - type='type1', - name='name1', - version='1.0', + download_url="https://test-url.com/package1.tar.gz", + type="type1", + name="name1", + 
version="1.0", ) self.scannable_uri1 = ScannableURI.objects.create( - uri='https://test-url.com/package1.tar.gz', - package=self.package1 + uri="https://test-url.com/package1.tar.gz", package=self.package1 ) self.package2 = Package.objects.create( - download_url='https://test-url.com/package2.tar.gz', - type='type2', - name='name2', - version='2.0', + download_url="https://test-url.com/package2.tar.gz", + type="type2", + name="name2", + version="2.0", ) self.scannable_uri2 = ScannableURI.objects.create( - uri='https://test-url.com/package2.tar.gz', - package=self.package2 + uri="https://test-url.com/package2.tar.gz", package=self.package2 ) self.package3 = Package.objects.create( - download_url='https://test-url.com/package3.tar.gz', - type='type3', - name='name3', - version='3.0', + download_url="https://test-url.com/package3.tar.gz", + type="type3", + name="name3", + version="3.0", ) self.scannable_uri3 = ScannableURI.objects.create( - uri='https://test-url.com/package3.tar.gz', - package=self.package3 + uri="https://test-url.com/package3.tar.gz", package=self.package3 ) def test_api_scannable_uri_permissions(self): - response = self.anonymous_client.get('/api/scan_queue/') + response = self.anonymous_client.get("/api/scan_queue/") self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - response = self.anonymous_client.get( - '/api/scan_queue/get_next_download_url/') + response = self.anonymous_client.get("/api/scan_queue/get_next_download_url/") self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - response = self.anonymous_client.post('/api/scan_queue/update_status/') + response = self.anonymous_client.post("/api/scan_queue/update_status/") self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) - response = self.regular_client.get('/api/scan_queue/') + response = self.regular_client.get("/api/scan_queue/") self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) - response = self.regular_client.get( - '/api/scan_queue/get_next_download_url/') + response = self.regular_client.get("/api/scan_queue/get_next_download_url/") self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) - response = self.regular_client.post('/api/scan_queue/update_status/') + response = self.regular_client.post("/api/scan_queue/update_status/") self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) def test_api_scannable_uri_list_endpoint(self): - response = self.scan_queue_worker_client.get('/api/scan_queue/') + response = self.scan_queue_worker_client.get("/api/scan_queue/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(3, response.data.get('count')) + self.assertEqual(3, response.data.get("count")) - response = self.staff_client.get('/api/scan_queue/') + response = self.staff_client.get("/api/scan_queue/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(3, response.data.get('count')) + self.assertEqual(3, response.data.get("count")) def test_api_scannable_uri_get_next_download_url(self): def check_webhook_url(self, webhook_url): - webhook_url = response.data.get('webhook_url') - key = webhook_url.rstrip('/').split('/')[-1] - self.assertIn('/api/scan_queue/index_package_scan/', webhook_url) - self.assertEqual(signing.loads(key), str( - self.scan_queue_worker_user.id)) + webhook_url = response.data.get("webhook_url") + key = webhook_url.rstrip("/").split("/")[-1] + self.assertIn("/api/scan_queue/index_package_scan/", webhook_url) + self.assertEqual(signing.loads(key), 
str(self.scan_queue_worker_user.id)) response = self.scan_queue_worker_client.get( - '/api/scan_queue/get_next_download_url/') + "/api/scan_queue/get_next_download_url/" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get( - 'scannable_uri_uuid'), self.scannable_uri1.uuid) - self.assertEqual(response.data.get( - 'download_url'), self.scannable_uri1.uri) - check_webhook_url(self, response.data.get('webhook_url')) + self.assertEqual( + response.data.get("scannable_uri_uuid"), self.scannable_uri1.uuid + ) + self.assertEqual(response.data.get("download_url"), self.scannable_uri1.uri) + check_webhook_url(self, response.data.get("webhook_url")) response = self.scan_queue_worker_client.get( - '/api/scan_queue/get_next_download_url/') + "/api/scan_queue/get_next_download_url/" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get( - 'scannable_uri_uuid'), self.scannable_uri2.uuid) - self.assertEqual(response.data.get( - 'download_url'), self.scannable_uri2.uri) - check_webhook_url(self, response.data.get('webhook_url')) + self.assertEqual( + response.data.get("scannable_uri_uuid"), self.scannable_uri2.uuid + ) + self.assertEqual(response.data.get("download_url"), self.scannable_uri2.uri) + check_webhook_url(self, response.data.get("webhook_url")) response = self.scan_queue_worker_client.get( - '/api/scan_queue/get_next_download_url/') + "/api/scan_queue/get_next_download_url/" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get( - 'scannable_uri_uuid'), self.scannable_uri3.uuid) - self.assertEqual(response.data.get( - 'download_url'), self.scannable_uri3.uri) - check_webhook_url(self, response.data.get('webhook_url')) + self.assertEqual( + response.data.get("scannable_uri_uuid"), self.scannable_uri3.uuid + ) + self.assertEqual(response.data.get("download_url"), self.scannable_uri3.uri) + check_webhook_url(self, response.data.get("webhook_url")) response = self.scan_queue_worker_client.get( - '/api/scan_queue/get_next_download_url/') + "/api/scan_queue/get_next_download_url/" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get('scannable_uri_uuid'), '') - self.assertEqual(response.data.get('download_url'), '') - self.assertEqual(response.data.get('webhook_url'), '') + self.assertEqual(response.data.get("scannable_uri_uuid"), "") + self.assertEqual(response.data.get("download_url"), "") + self.assertEqual(response.data.get("webhook_url"), "") - response = self.staff_client.get( - '/api/scan_queue/get_next_download_url/') + response = self.staff_client.get("/api/scan_queue/get_next_download_url/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get('scannable_uri_uuid'), '') - self.assertEqual(response.data.get('download_url'), '') - self.assertEqual(response.data.get('webhook_url'), '') + self.assertEqual(response.data.get("scannable_uri_uuid"), "") + self.assertEqual(response.data.get("download_url"), "") + self.assertEqual(response.data.get("webhook_url"), "") def test_api_scannable_uri_update_status(self): scannable_uri1_uuid = self.scannable_uri1.uuid scannable_uri2_uuid = self.scannable_uri2.uuid - scannable_uri1_update_status_url = f'/api/scan_queue/{scannable_uri1_uuid}/update_status/' - scannable_uri2_update_status_url = f'/api/scan_queue/{scannable_uri2_uuid}/update_status/' + scannable_uri1_update_status_url = ( + 
f"/api/scan_queue/{scannable_uri1_uuid}/update_status/" + ) + scannable_uri2_update_status_url = ( + f"/api/scan_queue/{scannable_uri2_uuid}/update_status/" + ) - self.assertEqual(ScannableURI.SCAN_NEW, - self.scannable_uri1.scan_status) + self.assertEqual(ScannableURI.SCAN_NEW, self.scannable_uri1.scan_status) data = { "scannable_uri_uuid": scannable_uri1_uuid, - "scan_status": 'failed', - 'scan_log': 'scan_log', + "scan_status": "failed", + "scan_log": "scan_log", } response = self.scan_queue_worker_client.post( - scannable_uri1_update_status_url, data=data) + scannable_uri1_update_status_url, data=data + ) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri1.refresh_from_db() - self.assertEqual(ScannableURI.SCAN_FAILED, - self.scannable_uri1.scan_status) - self.assertEqual('scan_log', self.scannable_uri1.scan_error) + self.assertEqual(ScannableURI.SCAN_FAILED, self.scannable_uri1.scan_status) + self.assertEqual("scan_log", self.scannable_uri1.scan_error) - data = { - 'scan_status': '' - } + data = {"scan_status": ""} response = self.scan_queue_worker_client.post( - scannable_uri2_update_status_url, data=data) - expected_response = {'error': 'missing scan_status'} + scannable_uri2_update_status_url, data=data + ) + expected_response = {"error": "missing scan_status"} self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertEqual(expected_response, response.data) - data = { - 'scan_status': 'invalid' - } + data = {"scan_status": "invalid"} response = self.scan_queue_worker_client.post( - scannable_uri2_update_status_url, data=data) - expected_response = {'error': 'invalid scan_status: invalid'} + scannable_uri2_update_status_url, data=data + ) + expected_response = {"error": "invalid scan_status: invalid"} self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertEqual(expected_response, response.data) data = {} response = self.scan_queue_worker_client.post( - '/api/scan_queue/asdf/', data=data) - self.assertEqual(response.status_code, - status.HTTP_405_METHOD_NOT_ALLOWED) + "/api/scan_queue/asdf/", data=data + ) + self.assertEqual(response.status_code, status.HTTP_405_METHOD_NOT_ALLOWED) def test_api_scannable_uri_update_status_update_finished_scannable_uri(self): scannable_uri_uuid = self.scannable_uri3.uuid @@ -231,16 +235,13 @@ def test_api_scannable_uri_update_status_update_finished_scannable_uri(self): ]: self.scannable_uri3.scan_status = scan_status self.scannable_uri3.save() - data = { - 'scannable_uri_uuid': scannable_uri_uuid, - 'scan_status': 'scanned' - } + data = {"scannable_uri_uuid": scannable_uri_uuid, "scan_status": "scanned"} response = self.scan_queue_worker_client.post( - f'/api/scan_queue/{scannable_uri_uuid}/update_status/', data=data + f"/api/scan_queue/{scannable_uri_uuid}/update_status/", data=data ) expected_response = { - 'error': 'cannot update status for scannable_uri ' - f'{self.scannable_uri3.uuid}: scannable_uri has finished ' + "error": "cannot update status for scannable_uri " + f"{self.scannable_uri3.uuid}: scannable_uri has finished " f'with status "{ScannableURI.SCAN_STATUSES_BY_CODE[scan_status]}"' } self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) @@ -255,49 +256,51 @@ def test_api_scannable_uri_index_package_scan(self): self.assertFalse(self.package2.declared_license_expression) self.assertFalse(self.package2.copyright) self.assertEqual(0, Resource.objects.all().count()) - scan_file_location = self.get_test_loc('scancodeio/get_scan_data.json') + 
scan_file_location = self.get_test_loc("scancodeio/get_scan_data.json") summary_file_location = self.get_test_loc( - 'scancodeio/scan_summary_response.json') + "scancodeio/scan_summary_response.json" + ) project_extra_data = { - 'scannable_uri_uuid': self.scannable_uri2.uuid, - 'md5': 'md5', - 'sha1': 'sha1', - 'sha256': 'sha256', - 'sha512': 'sha512', - 'size': 100, + "scannable_uri_uuid": self.scannable_uri2.uuid, + "md5": "md5", + "sha1": "sha1", + "sha256": "sha256", + "sha512": "sha512", + "size": 100, } with ( open(scan_file_location) as scan_file, - open(summary_file_location) as summary_file + open(summary_file_location) as summary_file, ): results = json.load(scan_file) summary = json.load(summary_file) data = { - 'project': { - 'extra_data': project_extra_data, + "project": { + "extra_data": project_extra_data, }, - 'results': results, - 'summary': summary, + "results": results, + "summary": summary, } webhook_url = get_webhook_url( - 'index_package_scan', self.scan_queue_worker_user.id) + "index_package_scan", self.scan_queue_worker_user.id + ) response = self.scan_queue_worker_client.post( - webhook_url, data=data, format='json') + webhook_url, data=data, format="json" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) self.scannable_uri2.refresh_from_db() - self.assertEqual(ScannableURI.SCAN_INDEXED, - self.scannable_uri2.scan_status) + self.assertEqual(ScannableURI.SCAN_INDEXED, self.scannable_uri2.scan_status) self.package2.refresh_from_db() - self.assertEqual('md5', self.package2.md5) - self.assertEqual('sha1', self.package2.sha1) - self.assertEqual('sha256', self.package2.sha256) - self.assertEqual('sha512', self.package2.sha512) + self.assertEqual("md5", self.package2.md5) + self.assertEqual("sha1", self.package2.sha1) + self.assertEqual("sha256", self.package2.sha256) + self.assertEqual("sha512", self.package2.sha512) self.assertEqual(100, self.package2.size) + self.assertEqual("apache-2.0", self.package2.declared_license_expression) self.assertEqual( - 'apache-2.0', self.package2.declared_license_expression) - self.assertEqual( - 'Copyright (c) Apache Software Foundation', self.package2.copyright) + "Copyright (c) Apache Software Foundation", self.package2.copyright + ) self.assertFalse(self.scannable_uri2.scan_error) self.assertEqual(64, Resource.objects.all().count()) diff --git a/minecode/tests/test_bitbucket.py b/minecode/tests/test_bitbucket.py deleted file mode 100644 index 1284c4ac..00000000 --- a/minecode/tests/test_bitbucket.py +++ /dev/null @@ -1,147 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -from collections import OrderedDict -import json -import os -import re - -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode.mappers.bitbucket import build_bitbucket_download_packages -from minecode.mappers.bitbucket import build_bitbucket_repo_package - -from minecode.visitors.bitbucket import BitbucketDetailsVisitorPaginated -from minecode.visitors.bitbucket import BitbucketIndexVisitor -from minecode.visitors.bitbucket import BitbucketSingleRepoVisitor - -from minecode.tests import FIXTURES_REGEN - - -class BitbucketVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_BitbucketIndexVisitor(self): - uri = 'https://api.bitbucket.org/2.0/repositories?pagelen=10' - test_loc = self.get_test_loc('bitbucket/visit/index-repositories.json') - - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _ = BitbucketIndexVisitor(uri) - - expected_uri_loc = self.get_test_loc( - 'bitbucket/visit/index-repositories_expected_uris.json') - self.check_expected_uris(uris, expected_uri_loc, regen=FIXTURES_REGEN) - - expected_data_loc = self.get_test_loc( - 'bitbucket/visit/index-repositories_expected_data.json') - self.check_expected_results( - data, expected_data_loc, regen=FIXTURES_REGEN) - - def test_BitbucketSingleRepoVisitor(self): - uri = 'https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/' - test_loc = self.get_test_loc('bitbucket/visit/singlerepo.json') - - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _ = BitbucketSingleRepoVisitor(uri) - - expected_data_loc = self.get_test_loc( - 'bitbucket/visit/singlerepo_expected_data.json') - self.check_expected_results( - data, expected_data_loc, regen=FIXTURES_REGEN) - - expected_uris_loc = self.get_test_loc( - 'bitbucket/visit/singlerepo_expected_uris.json') - self.check_expected_uris(uris, expected_uris_loc, regen=FIXTURES_REGEN) - - def test_BitbucketDetailsVisitorPaginated(self): - uri = 'https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/tags?pagelen=2' - test_loc = self.get_test_loc('bitbucket/visit/paginated_tags.json') - - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _ = BitbucketDetailsVisitorPaginated(uri) - - expected_data_loc = self.get_test_loc( - 'bitbucket/visit/paginated_tags_expected_data.json') - self.check_expected_results( - data, expected_data_loc, regen=FIXTURES_REGEN) - - expected_uris_loc = self.get_test_loc( - 'bitbucket/visit/paginated_tags_expected_uris.json') - self.check_expected_uris(uris, expected_uris_loc, regen=FIXTURES_REGEN) - - -class BitbucketMapperTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_pattern_match_without_download(self): - url = 'https://api.bitbucket.org/2.0/repositories/phlogistonjohn/tweakmsg' - pattern = 'https://api.bitbucket.org/2.0/repositories/.*(?= 5.0.37.2)' in result['Build-Depends']) - self.assertTrue('cmake' in result['Build-Depends']) - - @expectedFailure - def test_debcon_get_paragraph_data_from_file_control_invalid(self): - control_file = self.get_test_loc('debian/debutils/control_invalid') - result = debcon.get_paragraph_data_from_file(control_file) - self.assertEqual({}, result) - - @expectedFailure - def 
test_debcon_get_paragraph_data_from_file_with_non_existing_path(self): - control_file = 'path_invalid' - with self.assertRaises(Exception) as context: - debcon.get_paragraph_data_from_file(control_file) - self.assertTrue('No such file or directory' in context.exception) - - def test_parse_deb822_dsc(self): - dsc_file = self.get_test_loc('debian/debutils/3dldf_2.0.3+dfsg-2.dsc') - result = debcon.get_paragraph_data_from_file(dsc_file) - expected_loc = self.get_test_loc( - 'debian/debutils/3dldf_2.0.3+dfsg-2.dsc-expected') - self.check_expected_deb822(result, expected_loc, regen=FIXTURES_REGEN) - - ################################################################# - - def test_parse_email(self): - content = 'Debian TeX Maintainers ' - name, email = debutils.parse_email(content) - self.assertEqual('Debian TeX Maintainers', name) - self.assertEqual('debian-tex-maint@lists.debian.org', email) - - def test_parse_email_2(self): - # Space left Purposefully - content = ' Debian TeX Maintainers ' - name, email = debutils.parse_email(content) - self.assertEqual('Debian TeX Maintainers', name) - self.assertEqual(None, email) - - def test_parse_email_3(self): - # Space left Purposefully - content = '< debian-tex-maint@lists.debian.org >' - name, email = debutils.parse_email(content) - self.assertEqual(None, name) - self.assertEqual("debian-tex-maint@lists.debian.org", email) - - def test_comma_separated(self): - tags = 'implemented-in::perl, role::program, use::converting, works-with::pim' - result = list(debutils.comma_separated(tags)) - self.assertEqual([u'implemented-in::perl', u'role::program', - u'use::converting', u'works-with::pim'], result) - - -class DebianReleaseTest(BaseDebianTest): - - def test_parse_release(self): - release_file = self.get_test_loc('debian/release/Release') - result = list(debian_visitor.parse_release(release_file)) - expected_loc = self.get_test_loc('debian/release/Release_expected') - self.check_expected_deb822(result, expected_loc) - - def test_parse_release_with_md5(self): - release_file = self.get_test_loc('debian/release/Release_with_md5') - result = list(debian_visitor.parse_release(release_file)) - expected_loc = self.get_test_loc( - 'debian/release/Release_with_md5_expected') - self.check_expected_deb822(result, expected_loc) - - @expectedFailure - def test_visit_debian_release(self): - uri = 'http://ftp.debian.org/debian/dists/Debian8.3/Release' - test_loc = self.get_test_loc('debian/release/visited_Release') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = debian_visitor.DebianReleaseVisitor(uri) - result = json.loads(data) - - release_file = self.get_test_loc( - 'debian/release/visited_Release-expected.json') - self.check_expected_deb822(result, release_file) - - -class DebianCopyrightTest(BaseDebianTest): - - # TODO: There is an exception for the current debian copyright parser - @expectedFailure - def test_parse_copyright_only_basic(self): - copyright_file = self.get_test_loc('debian/copyright/basic_copyright') - copyrights = [ - info for info in debian_visitor.parse_copyright_only(copyright_file)] - self.assertTrue( - 'Copyright 1998 John Doe ' in copyrights) - self.assertTrue( - 'Copyright 1998 Jane Doe ' in copyrights) - - @expectedFailure - def test_parse_copyright_only_with_incorrect_file(self): - copyright_file = self.get_test_loc( - 'debian/copyright/invalid_copyright') - with self.assertRaises(Exception) as context: - [info for info in debian_visitor.parse_copyright_only( - 
copyright_file)] - self.assertTrue('no paragraphs in input' in context.exception) - - @expectedFailure - def test_parse_copyright_only_with_incorrect_path(self): - copyright_file = 'path_invalid' - with self.assertRaises(Exception) as context: - [info for info in debian_visitor.parse_copyright_only( - copyright_file)] - self.assertTrue('No such file or directory' in context.exception) - - @expectedFailure - def test_parse_copyright_allinfo_basic(self): - copyright_file = self.get_test_loc('debian/copyright/basic_copyright') - copyright_data = [ - info for info in debian_visitor.parse_copyright_allinfo(copyright_file)] - expected = [ - {'files': (u'*',), - 'license': u'GPL-2+', - 'copyright': 'Copyright 1998 John Doe ' - }, - {'files': (u'debian/*',), - 'license': u'GPL-2+', - 'copyright': 'Copyright 1998 Jane Doe ' - } - ] - self.assertEqual(expected, copyright_data) - - @expectedFailure - def test_parse_copyright_allinfo_with_invalid_file(self): - copyright_file = self.get_test_loc( - 'debian/copyright/invalid_copyright') - with self.assertRaises(Exception) as context: - [info for info in debian_visitor.parse_copyright_allinfo( - copyright_file)] - self.assertTrue('no paragraphs in input' in context.exception) - - @expectedFailure - def test_parse_copyright_allinfo_with_incorrect_path(self): - copyright_file = 'path_invalid' - with self.assertRaises(Exception) as context: - [info for info in debian_visitor.parse_copyright_allinfo( - copyright_file)] - self.assertTrue('No such file or directory' in context.exception) - - @expectedFailure - def test_parse_license_basic(self): - copyright_file = self.get_test_loc('debian/copyright/basic_copyright') - licenses, licensetexts = debian_visitor.parse_license(copyright_file) - expected = { - 'GPL-2+': [ - "This program is free software; you can redistribute it\n" - "and/or modify it under the terms of the GNU General Public\n" - "License as published by the Free Software Foundation; either\n" - "version 2 of the License, or (at your option) any later\n" - "version.\n\n" - "This program is distributed in the hope that it will be\n" - "useful, but WITHOUT ANY WARRANTY; without even the implied\n" - "warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR\n" - "PURPOSE. See the GNU General Public License for more\ndetails.\n\n" - "You should have received a copy of the GNU General Public\n" - "License along with this package; if not, write to the Free\n" - "Software Foundation, Inc., 51 Franklin St, Fifth Floor,\n" - "Boston, MA 02110-1301 USA\n\n" - "On Debian systems, the full text of the GNU General Public\n" - "License version 2 can be found in the file\n" - "`/usr/share/common-licenses/GPL-2'." 
- ]} - self.assertEqual(expected, licenses) - self.assertEqual([], licensetexts) - - @expectedFailure - def test_parse_license_with_invalid_file(self): - copyright_file = self.get_test_loc( - 'debian/copyright/invalid_copyright') - with self.assertRaises(Exception) as context: - debian_visitor.parse_license(copyright_file) - self.assertTrue('no paragraphs in input' in context.exception) - - @expectedFailure - def test_parse_license_with_incorrect_path(self): - copyright_file = 'path_invalid' - with self.assertRaises(Exception) as context: - debian_visitor.parse_license(copyright_file) - self.assertTrue('No such file or directory' in context.exception) - - -class DebianSourcesTest(BaseDebianTest): - - def test_collect_source_packages(self): - index_file = self.get_test_loc('debian/sources/debian_Sources') - source_info = [ - info for info in debian_visitor.collect_source_packages(index_file)] - expected_loc = self.get_test_loc( - 'debian/sources/debian_Sources_visit_expected') - self.check_objects_expected( - source_info, expected_loc, regen=FIXTURES_REGEN) - - def test_collect_source_packages_ubuntu(self): - index_file = self.get_test_loc('debian/sources/ubuntu_Sources') - source_info = [ - info for info in debian_visitor.collect_source_packages(index_file)] - expected_loc = self.get_test_loc( - 'debian/sources/ubuntu_Sources_visit_expected') - self.check_objects_expected( - source_info, expected_loc, regen=FIXTURES_REGEN) - - @expectedFailure - def test_DebianSourcesVisitor(self): - uri = 'http://ftp.debian.org/debian/dists/jessie-backports/main/source/Sources.gz' - test_loc = self.get_test_loc('debian/sources/Sources.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = debian_visitor.DebianSourcesVisitor(uri) - expected_loc = self.get_test_loc( - 'debian/sources/Sources.gz-expected.json') - self.check_expected_uris(list(uris), expected_loc) - - @expectedFailure - def test_DebianSourcesVisitor_with_invalid_file(self): - uri = 'http://ftp.debian.org/debian/dists/jessie-backports/main/source/invalid_files/Sources.gz' - test_loc = self.get_test_loc('debian/invalid_files/ls-lR.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _ = debian_visitor.DebianSourcesVisitor(uri) - self.assertEqual(0, len(list(uris))) - - @expectedFailure - def test_build_source_file_packages(self): - with open(self.get_test_loc('debian/sources/debian_Sources')) as packs: - packages = debian_mapper.build_source_file_packages(packs.read()) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'debian/sources/debian_Sources_mapped-expected-packages.json') - self.check_expected_results(packages, expected_loc) - - -class DebianPackagesTest(BaseDebianTest): - - def test_parse_packages_index(self): - index_file = self.get_test_loc('debian/packages/debian_Packages') - package_info = [ - info for info in debian_visitor.parse_packages_index(index_file)] - expected_loc = self.get_test_loc( - 'debian/packages/debian_Packages-visit-expected.json') - self.check_objects_expected( - package_info, expected_loc, regen=FIXTURES_REGEN) - - @expectedFailure - def test_parse_packages_from_debian_Packages(self): - with open(self.get_test_loc('debian/packages/debian_Packages')) as packs: - packages = debian_mapper.parse_packages(packs.read()) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 
'debian/packages/debian_Packages-expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - @expectedFailure - def test_parse_packages_from_ubuntu_Packages(self): - with open(self.get_test_loc('debian/packages/ubuntu_Packages')) as packs: - packages = debian_mapper.parse_packages(packs.read()) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'debian/packages/ubuntu_Packages-expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - @expectedFailure - def test_parse_packages_from_installed_status(self): - with open(self.get_test_loc('debian/status/simple_status')) as packs: - packages = debian_mapper.parse_packages(packs.read()) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'debian/packages/ubuntu_Packages-expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - -class DebianLSLRTest(BaseDebianTest): - - def test_DebianDirectoryIndexVisitor_from_debian(self): - uri = 'http://ftp.debian.org/debian/ls-lR.gz' - test_loc = self.get_test_loc('debian/lslr/ls-lR_debian') - temp_gz_location = self.get_tmp_gz_file(test_loc) - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get( - uri, temp_gz_location) - uris, _, _ = debian_visitor.DebianDirectoryIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'debian/lslr/ls-lR_debian.gz-expected.json') - self.check_expected_uris(list(uris), expected_loc) - - def test_DebianDirectoryIndexVisitor_from_ubuntu(self): - uri = 'http://archive.ubuntu.com/ubuntu/ls-lR.gz' - test_loc = self.get_test_loc('debian/lslr/ls-lR_ubuntu') - temp_gz_location = self.get_tmp_gz_file(test_loc) - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get( - uri, temp_gz_location) - uris, _, _ = debian_visitor.DebianDirectoryIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'debian/lslr/ls-lR_ubuntu.gz-expected.json') - self.check_expected_uris(list(uris), expected_loc) - - -class DebianDescriptionTest(BaseDebianTest): - - @expectedFailure - def test_DebianDescriptionVisitor(self): - uri = 'http://ftp.debian.org/debian/pool/main/7/7kaa/7kaa_2.14.3-1.dsc' - test_loc = self.get_test_loc('debian/dsc/7kaa_2.14.3-1.dsc') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = debian_visitor.DebianDescriptionVisitor(uri) - result = json.loads(data) - dsc_file = self.get_test_loc('debian/dsc/description_expected.json') - self.check_expected_deb822(result, dsc_file) - - @expectedFailure - def test_parse_description(self): - with open(self.get_test_loc('debian/dsc/description.json')) as debian_description_meta: - metadata = json.load(debian_description_meta) - packages = debian_mapper.parse_description(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'debian/dsc/description-expected.json') - self.check_expected_results(packages, expected_loc) - - -class DebianMapperTest(BaseDebianTest): - - @expectedFailure - def test_get_dependencies(self): - test = { - 'build1': 'build', - 'build2': 'build2', - 'build3': 'buildnot', - } - keys = ['build1', 'build2'] - result = debian_mapper.get_dependencies(test, keys) - self.assertEqual(2, len(result)) - self.assertEqual('build', result[0].purl) - self.assertEqual(None, result[0].requirement) - self.assertEqual('build2', result[1].purl) - self.assertEqual(None, 
result[1].requirement) - - def test_get_programming_language(self): - tags = ['role::program', 'implemented-in::perl', - 'use::converting', 'works-with::pim'] - result = debian_mapper.get_programming_language(tags) - self.assertEqual('perl', result) diff --git a/minecode/tests/test_eclipse.py b/minecode/tests/test_eclipse.py deleted file mode 100644 index db744de9..00000000 --- a/minecode/tests/test_eclipse.py +++ /dev/null @@ -1,123 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json -import os -import unittest - -from mock import Mock -from mock import patch -import requests - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import URI -from minecode.visitors import eclipse -from minecode.tests import FIXTURES_REGEN - - -class EclipseVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_visit_eclipse_projects(self): - uri = 'https://projects.eclipse.org/list-of-projects' - test_loc = self.get_test_loc('eclipse/projects.eclipse.org.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = eclipse.EclipseProjectVisitors(uri) - expected_loc = self.get_test_loc('eclipse/eclipse_projects_expected') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_eclipse_project(self): - uri = 'https://projects.eclipse.org/projects/modeling.m2t.acceleo' - test_loc = self.get_test_loc( - 'eclipse/Acceleo_projects.eclipse.org.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = eclipse.EclipseSingleProjectVisitor(uri) - with open(self.get_test_loc('eclipse/acceleo_expected.html'), 'rb') as data_file: - self.assertEqual(data_file.read(), data) - - def test_visit_eclipse_git_repo(self): - uri = 'http://git.eclipse.org/c' - test_loc = self.get_test_loc('eclipse/Eclipse_Git_repositories.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = eclipse.EclipseGitVisitor(uri) - expected_loc = self.get_test_loc('eclipse/eclipse_git_repos_expected') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_eclipse_packages(self): - uri = 'http://www.eclipse.org/downloads/packages/all' - test_loc = self.get_test_loc('eclipse/All_Releases_Packages.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = eclipse.EclipsePackagesVisitor(uri) - expected_loc = self.get_test_loc('eclipse/eclipse_packages_expected') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_eclipse_package_releases(self): - uri = 'http://www.eclipse.org/downloads/packages/release/Neon/R' - test_loc = self.get_test_loc('eclipse/Neon_R.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = eclipse.EclipseReleaseVisitor(uri) - expected_loc = 
self.get_test_loc('eclipse/Neon_R-expected.json') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_eclipse_projects_json(self): - uri = 'http://projects.eclipse.org/json/projects/all' - test_loc = self.get_test_loc('eclipse/birt.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _error = eclipse.EclipseProjectsJsonVisitor(uri) - - expected_uris = [ - URI(uri=u'http://projects.eclipse.org/json/project/birt', - source_uri=u'http://projects.eclipse.org/json/projects/all', - package_url=u'pkg:eclipse/birt')] - self.assertEqual(expected_uris, list(uris)) - - expected_loc = self.get_test_loc('eclipse/birt-expected.json') - self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN) - - @unittest.skip('This requires a live internet connection to test requests timeouts') - def test_visitor_eclipse_projects_json_download_timeout_error(self): - uri = 'http://projects.eclipse.org/json/projects/all' - try: - eclipse.EclipseProjectsJsonVisitor(uri) - except requests.Timeout: - self.fail( - "Time out error happens when download the url, " - "this should be fixed by increaseing the timeout.") - - -class TestEclipseMap(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_build_packages(self): - with open(self.get_test_loc('eclipse/birt.json')) as eclipse_metadata: - metadata = json.load(eclipse_metadata) - packages = mappers.eclipse.build_packages_with_json(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('eclipse/eclipse_birt_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_eclipse_html_packages(self): - with open(self.get_test_loc('eclipse/Acceleo_projects.eclipse.org.html')) as eclipse_metadata: - metadata = eclipse_metadata.read() - packages = mappers.eclipse.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'eclipse/Acceleo_projects_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_filter.py b/minecode/tests/test_filter.py index 03cb7697..87b67903 100644 --- a/minecode/tests/test_filter.py +++ b/minecode/tests/test_filter.py @@ -9,19 +9,19 @@ import os -from minecode.utils_test import MiningTestCase from minecode.filter import sf_net +from minecode.utils_test import MiningTestCase class FilterTest(MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def test_filter(self): - inputf = self.get_test_loc('filter_sf/tst_sfnet.csv') - exf = self.get_test_loc('filter_sf/tst_sfnet2.csv') - expected = open(exf, 'rb').read() + inputf = self.get_test_loc("filter_sf/tst_sfnet.csv") + exf = self.get_test_loc("filter_sf/tst_sfnet2.csv") + expected = open(exf, "rb").read() tdir = self.get_temp_dir() - output = os.path.join(tdir, 'out.csv') + output = os.path.join(tdir, "out.csv") sf_net(inputf, output) - test = open(output, 'rb').read() + test = open(output, "rb").read() self.assertEqual(expected, test) diff --git a/minecode/tests/test_freedesktop.py b/minecode/tests/test_freedesktop.py deleted file mode 100644 index 278c3da3..00000000 --- a/minecode/tests/test_freedesktop.py +++ /dev/null @@ -1,67 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. 
-# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import os - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import freedesktop -from minecode.tests import FIXTURES_REGEN - - -class FreedesktopTest(JsonBasedTesting): - - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - -class FreedesktopVistorTest(FreedesktopTest): - - def test_visit_software_html_page(self): - uri = 'https://www.freedesktop.org/wiki/Software' - test_loc = self.get_test_loc('freedesktop/Software.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = freedesktop.FreedesktopHTMLVisitor(uri) - expected_loc = self.get_test_loc( - 'freedesktop/freedesktop_software_expected') - self.check_expected_uris(uris, expected_loc) - - -class FreedesktopMapperTest(FreedesktopTest): - - def test_map_software_html_page_hal(self): - with open(self.get_test_loc('freedesktop/hal.html')) as freedesktop_metadata: - metadata = freedesktop_metadata.read() - packages = mappers.freedesktop.build_packages( - metadata, - 'https://www.freedesktop.org/wiki/Software/hal', - purl='pkg:freedesktop/hal') - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'freedesktop/hal_project_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_map_software_html_page_libinput(self): - with open(self.get_test_loc('freedesktop/libinput.html')) as freedesktop_metadata: - metadata = freedesktop_metadata.read() - packages = mappers.freedesktop.build_packages( - metadata, - 'https://www.freedesktop.org/wiki/Software/libinput/', - purl='pkg:freedesktop/libinput') - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'freedesktop/libinput_project_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_golang.py b/minecode/tests/test_golang.py deleted file mode 100644 index d63be206..00000000 --- a/minecode/tests/test_golang.py +++ /dev/null @@ -1,100 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-#
-
-import json
-import os
-
-from mock import Mock
-from mock import patch
-
-from packageurl import PackageURL
-
-from minecode.utils_test import mocked_requests_get
-from minecode.utils_test import JsonBasedTesting
-
-from minecode.visitors.golang import GodocIndexVisitor
-from minecode.visitors.golang import GodocSearchVisitor
-from minecode.visitors.golang import parse_package_path
-from minecode.mappers.golang import build_golang_package
-from minecode.tests import FIXTURES_REGEN
-
-
-class GoLangVisitorTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
-
-    def test_GoLangGoDocAPIVisitor(self):
-        uri = 'https://api.godoc.org/packages'
-        test_loc = self.get_test_loc('golang/packages.json')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _, _ = GodocIndexVisitor(uri)
-        expected_loc = self.get_test_loc('golang/packages_expected_uris.json')
-        self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_GodocSearchVisitor(self):
-        uri = 'https://api.godoc.org/search?q=github.com/golang'
-        test_loc = self.get_test_loc('golang/godoc_search.json')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _, _ = GodocSearchVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'golang/godoc_search_expected_uris.json')
-        self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_GodocSearchVisitor_with_non_github_urls(self):
-        uri = 'https://api.godoc.org/search?q=github.com/golang*'
-        test_loc = self.get_test_loc('golang/godoc_search_off_github.json')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _, _ = GodocSearchVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'golang/godoc_search_off_github_expected_uris.json')
-        self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_parse_package_path(self):
-        test_path = 'github.com/lambdasoup/go-netlink/log'
-        purl = PackageURL.from_string(
-            'pkg:golang/github.com/lambdasoup/go-netlink'
-            '?vcs_repository=https://github.com/lambdasoup/go-netlink')
-        expected = purl, 'github.com/lambdasoup/go-netlink'
-        assert expected == parse_package_path(test_path)
-
-
-class GoLangMapperTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
-
-    def test_build_golang_package(self):
-        purl = 'pkg:golang/github.com/golang/glog?vcs_repository=https://github.com/golang/glog'
-        with open(self.get_test_loc('golang/glog.json')) as f:
-            package_data = json.load(f)
-        package = build_golang_package(package_data, purl)
-        package = package.to_dict()
-        expected_loc = self.get_test_loc('golang/glog_expected.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_build_golang_package_bitbucket(self):
-        purl = 'pkg:bitbucket/bitbucket.org/zombiezen/yaml?vcs_repository=https://bitbucket.org/zombiezen/yaml'
-        with open(self.get_test_loc('golang/math3.json')) as f:
-            package_data = json.load(f)
-        package = build_golang_package(package_data, purl)
-        package = package.to_dict()
-        expected_loc = self.get_test_loc('golang/math3_expected.json')
-        self.check_expected_results(
-            package, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_build_golang_package_non_well_known(self):
-        purl = 'pkg:golang/winterdrache.de/bindings/sdl'
-        with open(self.get_test_loc('golang/winter.json')) as f:
-
package_data = json.load(f) - package = build_golang_package(package_data, purl) - package = package.to_dict() - expected_loc = self.get_test_loc('golang/winter_expected.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_googlecode.py b/minecode/tests/test_googlecode.py deleted file mode 100644 index ae0f974b..00000000 --- a/minecode/tests/test_googlecode.py +++ /dev/null @@ -1,115 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import json -import os - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import URI -from minecode.visitors import googlecode -from minecode.tests import FIXTURES_REGEN - - -class GoogleNewAPIVisitorsTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_visit_google_download_zip_visitor(self): - uri = 'https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip' - test_loc = self.get_test_loc('googlecode/google-code-archive.txt.zip') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = googlecode.GooglecodeArchiveVisitor(uri) - expected_loc = self.get_test_loc( - 'googlecode/expected_google-code-archive.txt.zip.json') - self.check_expected_uris(uris, expected_loc) - - def test_visit_google_projectpages(self): - uri = 'https://code.google.com/archive/search?q=domain:code.google.com' - test_loc = self.get_test_loc( - 'googlecode/v2_api/GoogleCodeProjectHosting.htm') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) - expected_loc = self.get_test_loc( - 'googlecode/v2_api/expected_googleprojects.json') - self.check_expected_uris(uris, expected_loc) - - def test_visit_google_projectpage2(self): - uri = 'https://code.google.com/archive/search?q=domain:code.google.com&page=2' - test_loc = self.get_test_loc( - 'googlecode/v2_api/GoogleCodeProjectHosting_page2.htm') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) - expected_loc = self.get_test_loc( - 'googlecode/v2_api/expected_googleproject_page2.json') - self.check_expected_uris(uris, expected_loc) - - def test_visit_google_download_json(self): - uri = 'https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/project.json' - test_loc = self.get_test_loc('googlecode/v2_api/project.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = googlecode.GoogleProjectJsonVisitor(uri) - self.assertEqual( - [URI(uri=u'https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/downloads-page-1.json')], list(uris)) - - def test_visit_google_json(self): - uri = 'https://storage.googleapis.com/google-code-archive/v2/code.google.com/hg4j/downloads-page-1.json' - 
test_loc = self.get_test_loc('googlecode/v2_api/downloads-page-1.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) - expected_loc = self.get_test_loc( - 'googlecode/v2_api/hg4j_download_expected.json') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_googleapi_project_json(self): - uri = 'https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2%2Fapache-extras.org%2F124799961-qian%2Fproject.json?alt=media' - test_loc = self.get_test_loc( - 'googlecode/v2_apache-extras.org_124799961-qian_project.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = googlecode.GoogleDownloadsPageJsonVisitor(uri) - expected_loc = self.get_test_loc( - 'googlecode/expected_v2_apache-extras.org_124799961-qian_project2.json') - self.check_expected_results(data, expected_loc) - - -class GoogleNewAPIMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_build_packages_from_v2_projects_json(self): - with open(self.get_test_loc('googlecode/v2_api/project.json')) as projectsjson_meta: - metadata = json.load(projectsjson_meta) - packages = mappers.googlecode.build_packages_from_projectsjson_v2( - metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'googlecode/v2_api/package_expected_project.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_packages_from_v1_projects_json(self): - with open(self.get_test_loc('googlecode/v2_apache-extras.org_124799961-qian_project.json')) as projectsjson_meta: - metadata = json.load(projectsjson_meta) - packages = mappers.googlecode.build_packages_from_projectsjson_v1( - metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'googlecode/mapper_expected_v2_apache-extras.org_124799961-qian_project.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_gstreamer.py b/minecode/tests/test_gstreamer.py deleted file mode 100644 index 4c1fcd48..00000000 --- a/minecode/tests/test_gstreamer.py +++ /dev/null @@ -1,64 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -import os -import re - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode.visitors import gstreamer -from minecode.tests import FIXTURES_REGEN -from minecode import mappers - - -class GstreamerVistorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_visit_gstreamer_source_root(self): - uri = 'https://gstreamer.freedesktop.org/src/' - test_loc = self.get_test_loc('gstreamer/src_root.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = gstreamer.GstreamerHTMLVisitor(uri) - expected_loc = self.get_test_loc('gstreamer/src_root.html-expected') - self.check_expected_uris(uris, expected_loc) - - def test_visit_Gstreamer_subpath_contains_file_resources(self): - uri = 'https://gstreamer.freedesktop.org/src/gst-openmax/pre/' - test_loc = self.get_test_loc('gstreamer/src_gst-openmax_pre.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = gstreamer.GstreamerHTMLVisitor(uri) - expected_loc = self.get_test_loc( - 'gstreamer/src_gst-openmax_pre.html-expected') - self.check_expected_uris(uris, expected_loc) - - -class GstreamerMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_build_package_mapper_regex(self): - regex = re.compile( - r'^https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*[\w\-\.]+[.tar\.bz2|\.sha1sum|\.md5|\.gz|\.tar\.xz|\.asc]$') - result = re.match( - regex, 'https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2') - self.assertTrue(result) - - def test_build_package_from_url(self): - packages = mappers.gstreamer.build_package_from_url( - 'https://gstreamer.freedesktop.org/src/gst-openmax/pre/gst-openmax-0.10.0.2.tar.bz2') - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'gstreamer/mapper_build_from_url-expected') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_housekeeping.py b/minecode/tests/test_housekeeping.py index 6d9ec6d2..f0053a60 100644 --- a/minecode/tests/test_housekeeping.py +++ b/minecode/tests/test_housekeeping.py @@ -7,89 +7,85 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# -import codecs import json import os from io import StringIO - -from mock import patch +from unittest.mock import patch from django.core import management from django.test import TestCase as DjangoTestCase import packagedb - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - from minecode.management.commands.check_licenses import find_ambiguous_packages from minecode.management.commands.run_map import map_uri from minecode.management.commands.run_visit import visit_uri - from minecode.models import ResourceURI from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting +from minecode.utils_test import mocked_requests_get class PackageLicenseCheckTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def test_find_ambiguous_packages_declared_license(self): packagedb.models.Package.objects.create( - download_url='http://example.com', - name='Foo', - declared_license_expression='apache-2.0 and unknown', - type='maven' + download_url="http://example.com", + name="Foo", + declared_license_expression="apache-2.0 and unknown", + type="maven", ) packages = [p.to_dict() for p in find_ambiguous_packages()] expected_loc = self.get_test_loc( - 'housekeeping/declared_license_search_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "housekeeping/declared_license_search_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_find_ambiguous_packages_license_expression(self): packagedb.models.Package.objects.create( - download_url='http://example.com', - name='Foo', - declared_license_expression='apache-2.0 and unknown', - type='maven' + download_url="http://example.com", + name="Foo", + declared_license_expression="apache-2.0 and unknown", + type="maven", ) packages = [p.to_dict() for p in find_ambiguous_packages()] expected_loc = self.get_test_loc( - 'housekeeping/license_expression_search_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "housekeeping/license_expression_search_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_find_ambiguous_packages_license_expression_ignore_uppercase(self): packagedb.models.Package.objects.create( - download_url='http://example.com', - name='Foo', - declared_license_expression='Unknown', - type='maven' + download_url="http://example.com", + name="Foo", + declared_license_expression="Unknown", + type="maven", ) packages = [p.to_dict() for p in find_ambiguous_packages()] expected_loc = self.get_test_loc( - 'housekeeping/ignore_upper_case_search_expected.json') + "housekeeping/ignore_upper_case_search_expected.json" + ) - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) def test_run_check_licenses_command(self): packagedb.models.Package.objects.create( - download_url='http://example.com', - name='Foo', - declared_license_expression='apache-2.0 and unknown', - type='maven' + download_url="http://example.com", + name="Foo", + declared_license_expression="apache-2.0 and unknown", + type="maven", ) results_loc = self.get_temp_file() - expected_loc = self.get_test_loc('housekeeping/example_expected.json') + expected_loc = 
self.get_test_loc("housekeeping/example_expected.json") output = StringIO() - management.call_command('check_licenses', '-o', - results_loc, stdout=output) + management.call_command("check_licenses", "-o", results_loc, stdout=output) self.assertTrue( - 'Visited 1 packages\nFound 1 possible packages\nFound packages dumped to:' in output.getvalue()) + "Visited 1 packages\nFound 1 possible packages\nFound packages dumped to:" + in output.getvalue() + ) with open(results_loc) as results: res = json.load(results) @@ -98,25 +94,24 @@ def test_run_check_licenses_command(self): def test_run_check_licenses_command_with_empty_package(self): output = StringIO() results_loc = self.get_temp_file() - management.call_command('check_licenses', '-o', - results_loc, stdout=output) + management.call_command("check_licenses", "-o", results_loc, stdout=output) self.assertTrue( - 'Visited 0 packages\nFound 0 possible packages' in output.getvalue()) + "Visited 0 packages\nFound 0 possible packages" in output.getvalue() + ) def test_visit_and_map_using_pom(self): - uri = 'http://repo1.maven.org/maven2/org/bytesoft/bytejta-supports/0.5.0-ALPHA4/bytejta-supports-0.5.0-ALPHA4.pom' - test_loc = self.get_test_loc( - 'housekeeping/bytejta-supports-0.5.0-ALPHA4.pom') + uri = "http://repo1.maven.org/maven2/org/bytesoft/bytejta-supports/0.5.0-ALPHA4/bytejta-supports-0.5.0-ALPHA4.pom" + test_loc = self.get_test_loc("housekeeping/bytejta-supports-0.5.0-ALPHA4.pom") resource_uri = ResourceURI.objects.insert(uri=uri) - with patch('requests.get') as mock_http_get: + with patch("requests.get") as mock_http_get: mock_http_get.return_value = mocked_requests_get(uri, test_loc) # visit test proper: this should insert all the test_uris visit_uri(resource_uri) map_uri(resource_uri) packages = [p.to_dict() for p in find_ambiguous_packages()] expected_loc = self.get_test_loc( - 'housekeeping/bytejta-supports-0.5.0-ALPHA4.pom_search_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) + "housekeeping/bytejta-supports-0.5.0-ALPHA4.pom_search_expected.json" + ) + self.check_expected_results(packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_indexing.py b/minecode/tests/test_indexing.py index d55c420d..ca6e905d 100644 --- a/minecode/tests/test_indexing.py +++ b/minecode/tests/test_indexing.py @@ -24,23 +24,23 @@ class IndexingTest(MiningTestCase, JsonBasedTesting): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package1 = Package.objects.create( - download_url='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', - type='maven', - namespace='', - name='wagon-api', - version='20040705.181715' + download_url="https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar", + type="maven", + namespace="", + name="wagon-api", + version="20040705.181715", ) self.package2 = Package.objects.create( - download_url='https://github.com/nexB/elf-inspector/raw/4333e1601229da87fa88961389d7397af6e027c4/tests/data/dwarf_and_elf/analyze.so.debug', - type='generic', - namespace='', - name='debug', - version='1.23' + download_url="https://github.com/nexB/elf-inspector/raw/4333e1601229da87fa88961389d7397af6e027c4/tests/data/dwarf_and_elf/analyze.so.debug", + type="generic", + namespace="", + name="debug", + version="1.23", ) def test_indexing_index_package_files(self): @@ -53,17 +53,16 @@ def 
test_indexing_index_package_files(self): self.assertEqual(0, Resource.objects.count()) scan_data_loc = self.get_test_loc( - 'indexing/scancodeio_wagon-api-20040705.181715.json') - with open(scan_data_loc, 'rb') as f: + "indexing/scancodeio_wagon-api-20040705.181715.json" + ) + with open(scan_data_loc, "rb") as f: scan_data = json.loads(f.read()) - indexing_errors = indexing.index_package_files( - self.package1, scan_data) + indexing_errors = indexing.index_package_files(self.package1, scan_data) self.assertEqual(0, len(indexing_errors)) self.assertEqual(11, ApproximateDirectoryContentIndex.objects.count()) - self.assertEqual( - 11, ApproximateDirectoryStructureIndex.objects.count()) + self.assertEqual(11, ApproximateDirectoryStructureIndex.objects.count()) self.assertEqual(2, ApproximateResourceContentIndex.objects.count()) self.assertEqual(45, ExactFileIndex.objects.count()) @@ -71,34 +70,38 @@ def test_indexing_index_package_files(self): self.assertEqual(64, len(resources)) resource_data = [r.to_dict() for r in resources] expected_resources_loc = self.get_test_loc( - 'indexing/scancodeio_wagon-api-20040705.181715-expected.json') + "indexing/scancodeio_wagon-api-20040705.181715-expected.json" + ) self.check_expected_results( - resource_data, expected_resources_loc, regen=FIXTURES_REGEN) + resource_data, expected_resources_loc, regen=FIXTURES_REGEN + ) def test_indexing_index_package(self): scan_data_loc = self.get_test_loc( - 'indexing/scancodeio_wagon-api-20040705.181715.json') - with open(scan_data_loc, 'rb') as f: + "indexing/scancodeio_wagon-api-20040705.181715.json" + ) + with open(scan_data_loc, "rb") as f: scan_data = json.load(f) scan_summary_loc = self.get_test_loc( - 'indexing/scancodeio_wagon-api-20040705.181715-summary.json') - with open(scan_summary_loc, 'rb') as f: + "indexing/scancodeio_wagon-api-20040705.181715-summary.json" + ) + with open(scan_summary_loc, "rb") as f: scan_summary = json.load(f) project_extra_data = { - 'md5': 'md5', - 'sha1': 'sha1', - 'sha256': 'sha256', - 'sha512': 'sha512', - 'size': 100, + "md5": "md5", + "sha1": "sha1", + "sha256": "sha256", + "sha512": "sha512", + "size": 100, } # Set up ScannableURI scannable_uri = ScannableURI.objects.create( - uri='https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar', + uri="https://repo1.maven.org/maven2/maven/wagon-api/20040705.181715/wagon-api-20040705.181715.jar", scan_status=ScannableURI.SCAN_COMPLETED, - package=self.package1 + package=self.package1, ) # Ensure that we do not have any Package data updated, Resources, and fingerprints @@ -125,14 +128,14 @@ def test_indexing_index_package(self): ) # Make sure that Package data is updated + self.assertEqual("apache-2.0", self.package1.declared_license_expression) self.assertEqual( - 'apache-2.0', self.package1.declared_license_expression) - self.assertEqual( - 'Copyright (c) Apache Software Foundation', self.package1.copyright) - self.assertEqual('md5', self.package1.md5) - self.assertEqual('sha1', self.package1.sha1) - self.assertEqual('sha256', self.package1.sha256) - self.assertEqual('sha512', self.package1.sha512) + "Copyright (c) Apache Software Foundation", self.package1.copyright + ) + self.assertEqual("md5", self.package1.md5) + self.assertEqual("sha1", self.package1.sha1) + self.assertEqual("sha256", self.package1.sha256) + self.assertEqual("sha512", self.package1.sha512) self.assertEqual(100, self.package1.size) for expected_count, model in [ @@ -142,27 +145,25 @@ def test_indexing_index_package(self): (45, 
ExactFileIndex), ]: self.assertEqual( - expected_count, - model.objects.filter(package=self.package1).count() + expected_count, model.objects.filter(package=self.package1).count() ) def test_indexing_index_package_dwarf(self): - scan_data_loc = self.get_test_loc('indexing/get_scan_data_dwarf.json') - with open(scan_data_loc, 'rb') as f: + scan_data_loc = self.get_test_loc("indexing/get_scan_data_dwarf.json") + with open(scan_data_loc, "rb") as f: scan_data = json.load(f) - scan_summary_loc = self.get_test_loc( - 'indexing/scan_summary_dwarf.json') - with open(scan_summary_loc, 'rb') as f: + scan_summary_loc = self.get_test_loc("indexing/scan_summary_dwarf.json") + with open(scan_summary_loc, "rb") as f: scan_summary = json.load(f) project_extra_data = {} # Set up ScannableURI scannable_uri = ScannableURI.objects.create( - uri='https://github.com/nexB/elf-inspector/raw/4333e1601229da87fa88961389d7397af6e027c4/tests/data/dwarf_and_elf/analyze.so.debug', + uri="https://github.com/nexB/elf-inspector/raw/4333e1601229da87fa88961389d7397af6e027c4/tests/data/dwarf_and_elf/analyze.so.debug", scan_status=ScannableURI.SCAN_COMPLETED, - package=self.package2 + package=self.package2, ) # Run test diff --git a/minecode/tests/test_ls.py b/minecode/tests/test_ls.py index f8c2c8cc..a98af247 100644 --- a/minecode/tests/test_ls.py +++ b/minecode/tests/test_ls.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. @@ -12,30 +11,31 @@ import os -from minecode.utils_test import JsonBasedTesting -from minecode.tests import FIXTURES_REGEN from minecode import ls +from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting class ParseDirectoryListingTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') -# maxDiff = None + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") + # maxDiff = None def test_remove_inode_works_with_no_space_at_line_start(self): - test = '12190083 4 drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl' - expected = u'drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl' + test = "12190083 4 drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl" + expected = "drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl" self.assertEqual(expected, ls.remove_inode(test)) def test_remove_inode_works_even_with_space_at_line_start(self): - test = ' 12190083 4 drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl' - expected = u'drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl' + test = " 12190083 4 drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl" + expected = "drwxrwxr-x 4 svnwc svnwc 4096 May 4 15:57 ./perl" self.assertEqual(expected, ls.remove_inode(test)) - def check_listing(self, test_file, expected_file, from_find=True, regen=FIXTURES_REGEN): + def check_listing( + self, test_file, expected_file, from_find=True, regen=FIXTURES_REGEN + ): test_file = self.get_test_loc(test_file) test_text = open(test_file).read() - results = list(ls.parse_directory_listing( - test_text, from_find=from_find)) + results = list(ls.parse_directory_listing(test_text, from_find=from_find)) for r in results: if r.date: # we remove the year in YYYY-MM-DD to avoid date-sensitive test @@ -47,26 +47,30 @@ def check_listing(self, test_file, expected_file, from_find=True, regen=FIXTURES self.check_expected_results(results, expected_file, regen=regen) def test_parse_listing_from_findls(self): - test_file = 'directories/find-ls' - expected_file = 
'directories/find-ls-expected.json' - self.check_listing(test_file, expected_file, - from_find=True, regen=FIXTURES_REGEN) + test_file = "directories/find-ls" + expected_file = "directories/find-ls-expected.json" + self.check_listing( + test_file, expected_file, from_find=True, regen=FIXTURES_REGEN + ) def test_parse_listing_from_findls_from_apache_does_not_fail_on_first_line(self): - test_file = 'directories/find-ls-apache-start' - expected_file = 'directories/find-ls-apache-start-expected.json' - self.check_listing(test_file, expected_file, - from_find=True, regen=FIXTURES_REGEN) + test_file = "directories/find-ls-apache-start" + expected_file = "directories/find-ls-apache-start-expected.json" + self.check_listing( + test_file, expected_file, from_find=True, regen=FIXTURES_REGEN + ) def test_parse_listing_from_lslr(self): - test_file = 'directories/ls-lr' - expected_file = 'directories/ls-lr-expected.json' - self.check_listing(test_file, expected_file, - from_find=False, regen=FIXTURES_REGEN) + test_file = "directories/ls-lr" + expected_file = "directories/ls-lr-expected.json" + self.check_listing( + test_file, expected_file, from_find=False, regen=FIXTURES_REGEN + ) def test_parse_listing_from_lslr_at_ubuntu(self): - test_file = 'directories/ls-lr-ubuntu' - expected_file = 'directories/ls-lr-ubuntu-expected.json' + test_file = "directories/ls-lr-ubuntu" + expected_file = "directories/ls-lr-ubuntu-expected.json" self.maxDiff = None - self.check_listing(test_file, expected_file, - from_find=False, regen=FIXTURES_REGEN) + self.check_listing( + test_file, expected_file, from_find=False, regen=FIXTURES_REGEN + ) diff --git a/minecode/tests/test_maven.py b/minecode/tests/test_maven.py deleted file mode 100644 index 8518b4bd..00000000 --- a/minecode/tests/test_maven.py +++ /dev/null @@ -1,1362 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -from operator import itemgetter -import json -import os -import re - -from mock import patch -from unittest import mock - -from django.test import TestCase as DjangoTestCase - -from minecode.management.commands.run_map import map_uri -from minecode.management.commands.run_visit import visit_uri -from minecode.mappers import maven as maven_mapper -from minecode.models import ResourceURI -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting -from minecode.utils_test import model_to_dict -from minecode.visitors import maven as maven_visitor -from minecode.tests import FIXTURES_REGEN -import packagedb - -from packagedcode.maven import _parse -from packageurl import PackageURL - -# TODO: add tests from /maven-indexer/indexer-core/src/test/java/org/acche/maven/index/artifact - - -def sort_deps(results): - """ - FIXME: UGLY TEMP WORKAROUND: we sort the results because of a PyMaven bug - See https://github.com/sassoftware/pymaven/issues/11 - """ - if 'dependencies' in results: - results['dependencies'].sort() - elif results and 'metadata' in results[0]: - for result in results: - result['metadata']['dependencies'].sort() - - -class MavenMiscTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_get_entries(self): - index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - fields = list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) - fields = set(fields) - result = list(maven_visitor.get_entries(index, fields=fields)) - expected_loc = self.get_test_loc('maven/index/expected_entries.json') - self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) - - def test_get_entries_increment(self): - index = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.445.gz') - fields = list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) - fields = set(fields) - result = list(maven_visitor.get_entries(index, fields=fields)) - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_entries.json') - self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) - - def test_get_entries_buggy(self): - index = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - fields = list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) - fields = set(fields) - result = list(maven_visitor.get_entries(index, fields=fields)) - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_entries.json') - self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) - - def test_get_artifacts_full(self): - index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - - fields = ( - list(maven_visitor.ENTRY_FIELDS) + - list(maven_visitor.ENTRY_FIELDS_OTHER) + - list(maven_visitor.ENTRY_FIELDS_IGNORED) - ) - fields = set(fields) - - result = [a.to_dict() for a in maven_visitor.get_artifacts( - index, fields, include_all=True)] - expected_loc = self.get_test_loc('maven/index/expected_artifacts.json') - self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) - - def test_get_artifacts_increment(self): - index = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.445.gz') - fields = 
list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) - fields = set(fields) - result = [a.to_dict() for a in maven_visitor.get_artifacts( - index, fields, include_all=True)] - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_artifacts.json') - self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) - - def test_get_artifacts_buggy(self): - index = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - fields = list(maven_visitor.ENTRY_FIELDS.keys( - )) + list(maven_visitor.ENTRY_FIELDS_OTHER.keys()) + list(maven_visitor.ENTRY_FIELDS_IGNORED.keys()) - fields = set(fields) - result = [a.to_dict() for a in maven_visitor.get_artifacts( - index, fields, include_all=True)] - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_artifacts.json') - self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN) - - def test_get_artifacts_defaults(self): - index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - result = [a.to_dict() for a in maven_visitor.get_artifacts(index)] - expected_loc = self.get_test_loc( - 'maven/index/expected_artifacts-defaults.json') - self.check_expected_results(result, expected_loc) - - def test_get_artifacts_no_worthyness(self): - index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - - def worth(a): - return True - - result = [a.to_dict() - for a in maven_visitor.get_artifacts(index, worthyness=worth)] - expected_loc = self.get_test_loc( - 'maven/index/expected_artifacts-all-worthy.json') - self.check_expected_results(result, expected_loc) - - def test_get_artifacts_defaults_increment(self): - index = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.445.gz') - result = [a.to_dict() for a in maven_visitor.get_artifacts(index)] - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_artifacts-defaults.json') - self.check_expected_results(result, expected_loc) - - def test_get_artifacts_defaults_buggy(self): - index = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - result = [a.to_dict() for a in maven_visitor.get_artifacts(index)] - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_artifacts-defaults.json') - self.check_expected_results(result, expected_loc) - - def test_build_artifact(self): - entry = { - u'i': u'0-alpha-1-20050407.154541-1.pom|1131488721000|-1|2|2|0|pom', - u'm': u'1318447185654', - u'u': u'org.apache|maven|archetypes|1|0-alpha-1-20050407.154541-1.pom'} - - result = maven_visitor.build_artifact(entry, include_all=True) - result = result.to_dict() - expected = dict([ - (u'group_id', u'org.apache'), - (u'artifact_id', u'maven'), - (u'version', u'archetypes'), - (u'packaging', u'0-alpha-1-20050407.154541-1.pom'), - (u'classifier', u'1'), - (u'extension', u'pom'), - (u'last_modified', '2005-11-08T22:25:21+00:00'), - (u'size', None), - (u'sha1', None), - (u'name', None), - (u'description', None), - (u'src_exist', False), - (u'jdoc_exist', False), - (u'sig_exist', False), - (u'sha256', None), - (u'osgi', dict()), - (u'classes', []) - ]) - - self.assertEqual(expected.items(), result.items()) - - def test_build_url_and_filename_1(self): - test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-domain', - 'version': '3.12.0', 'classifier': None, 'extension': 'jar'} - expected = 
'https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/3.12.0/address-book-domain-3.12.0.jar', 'address-book-domain-3.12.0.jar' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) - - def test_build_url_and_filename_2(self): - test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-data', - 'version': '3.12.0', 'classifier': None, 'extension': 'pom'} - expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-data/3.12.0/address-book-data-3.12.0.pom', 'address-book-data-3.12.0.pom' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) - - def test_build_url_and_filename_3(self): - test = {'group_id': 'de.alpharogroup', 'artifact_id': 'address-book-rest-web', - 'version': '3.12.0', 'classifier': None, 'extension': 'war'} - expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-rest-web/3.12.0/address-book-rest-web-3.12.0.war', 'address-book-rest-web-3.12.0.war' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) - - def test_build_url_and_filename_4(self): - test = {'group_id': 'uk.com.robust-it', 'artifact_id': 'cloning', - 'version': '1.9.5', 'classifier': 'sources', 'extension': 'jar'} - expected = 'https://repo1.maven.org/maven2/uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar', 'cloning-1.9.5-sources.jar' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) - - def test_build_url_and_filename_with_alternate_base(self): - test = { - 'group_id': 'uk.com.robust-it', 'artifact_id': 'cloning', - 'version': '1.9.5', 'classifier': 'sources', 'extension': 'jar', - 'base_repo_url': 'maven-index://'} - expected = 'maven-index:///uk/com/robust-it/cloning/1.9.5/cloning-1.9.5-sources.jar', 'cloning-1.9.5-sources.jar' - self.assertEqual( - expected, maven_visitor.build_url_and_filename(**test)) - - def test_build_maven_xml_url(self): - test = {'group_id': 'de.alpharogroup', - 'artifact_id': 'address-book-domain'} - expected = 'https://repo1.maven.org/maven2/de/alpharogroup/address-book-domain/maven-metadata.xml' - self.assertEqual(expected, maven_visitor.build_maven_xml_url(**test)) - - -class MavenVisitorTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_MavenNexusIndexVisitor_uris(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' - test_loc = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) - expected_loc = self.get_test_loc('maven/index/expected_uris.json') - self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) - - def test_MavenNexusIndexPropertiesVisitor(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties' - test_loc = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.properties') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusPropertiesVisitor( - uri) - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_properties_uris.json') - self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) - - def test_MavenNexusIndexVisitor_uris_increment(self): - uri = 
'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz' - test_loc = self.get_test_loc( - 'maven/index/increment/nexus-maven-repository-index.445.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/index/increment/expected_uris.json') - self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) - - def test_MavenNexusIndexVisitor_uris_buggy(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' - test_loc = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_uris.json') - self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) - - def test_visit_uri_does_not_fail_on_incorrect_sha1(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' - resource_uri = ResourceURI.objects.insert(uri=uri) - - before = [p.id for p in ResourceURI.objects.all()] - test_loc = self.get_test_loc( - 'maven/index/buggy/nexus-maven-repository-index.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - visit_uri(resource_uri) - - if before: - visited = ResourceURI.objects.exclude(id__in=before) - else: - visited = ResourceURI.objects.all() - - results = [model_to_dict(rec, fields=['uri', 'sha1']) - for rec in visited] - results = sorted(results, key=itemgetter('uri')) - expected_loc = self.get_test_loc( - 'maven/index/buggy/expected_visited_uris.json') - self.check_expected_results( - results, expected_loc, regen=FIXTURES_REGEN) - visited.delete() - - def test_MavenPOMVisitor_data(self): - uri = 'https://repo1.maven.org/maven2/classworlds/classworlds/1.1-alpha-2/classworlds-1.1-alpha-2.pom' - test_loc = self.get_test_loc('maven/pom/classworlds-1.1-alpha-2.pom') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _ = maven_visitor.MavenPOMVisitor(uri) - self.assertEqual(None, uris) - expected = open(test_loc, 'rb').read() - self.assertEqual(expected, data) - - -class MavenEnd2EndTest(JsonBasedTesting, DjangoTestCase): - - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_MavenNexusIndexVisitor_with_run_visit_then_map_end2end(self): - # setup - before = sorted(p.id for p in ResourceURI.objects.all()) - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.445.gz' - - resource_uri = ResourceURI.objects.insert(uri=uri) - test_index = self.get_test_loc( - 'maven/index/nexus-maven-repository-index.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_index) - visit_uri(resource_uri) - - if before: - visited = ResourceURI.objects.exclude(id__in=before) - else: - visited = ResourceURI.objects.all() - - results = list(model_to_dict(rec, exclude=['id']) for rec in visited) - results = sorted(results, key=itemgetter('uri')) - expected_loc = self.get_test_loc( - 'maven/end2end/expected_visited_uris.json') - self.check_expected_results( - results, expected_loc, regen=FIXTURES_REGEN) - - 
pre_visited_uris = ResourceURI.objects.filter( - uri__contains='maven-index://').exclude(id__in=before) - - self.assertTrue( - all(ru.last_visit_date and not ru.last_map_date - for ru in pre_visited_uris)) - - package_ids_before = sorted( - p.id for p in packagedb.models.Package.objects.all()) - - # now onto mapping the previsited URIs - # setup - # test proper - for res_uri in pre_visited_uris: - map_uri(res_uri) - - newly_mapped = packagedb.models.Package.objects.filter( - download_url__startswith='https://repo1.maven.org/maven2').exclude(id__in=package_ids_before) - # check that the saved packages are there as planned - self.assertEqual(19, newly_mapped.count()) - - package_results = list(pac.to_dict() for pac in newly_mapped) - expected_loc = self.get_test_loc( - 'maven/end2end/expected_mapped_packages.json') - self.check_expected_results(package_results, expected_loc, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) - - # check that the map status has been updated correctly - visited_then_mapped = ResourceURI.objects.filter( - uri__contains='maven-index://') - self.assertTrue(all(ru.last_map_date for ru in visited_then_mapped)) - - def test_visit_and_map_using_pom_with_unicode(self): - uri = 'https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.22/commons-jaxrs-1.22.pom' - test_loc = self.get_test_loc( - 'maven/end2end_unicode/commons-jaxrs-1.22.pom') - - before_uri = [p.id for p in ResourceURI.objects.all()] - before_pkg = [p.id for p in packagedb.models.Package.objects.all()] - - resource_uri = ResourceURI.objects.insert(uri=uri) - - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - # visit test proper: this should insert all the test_uris - visit_uri(resource_uri) - map_uri(resource_uri) - - if before_uri: - visited = ResourceURI.objects.exclude(id__in=before_uri) - else: - visited = ResourceURI.objects.all() - - uri_results = sorted(model_to_dict( - rec, exclude=['id']) for rec in visited) - expected_loc = self.get_test_loc( - 'maven/end2end_unicode/expected_visited_commons-jaxrs-1.22.json') - self.check_expected_results( - uri_results, expected_loc, regen=FIXTURES_REGEN) - - if before_pkg: - mapped = packagedb.models.Package.objects.exclude( - id__in=before_pkg) - else: - mapped = packagedb.models.Package.objects.all() - - package_results = sorted(pac.to_dict() for pac in mapped) - expected_loc = self.get_test_loc( - 'maven/end2end_unicode/expected_mapped_commons-jaxrs-1.22.json') - self.check_expected_results( - package_results, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_and_map_using_pom_with_unicode_multisteps(self): - # this test deals with a single POM and the results from - # the index and the pom visit yielding packages - - # Step 1: map some index data - before_pkg = [p.id for p in packagedb.models.Package.objects.all()] - - # this is a pre-visited as from the Maven index URI - index_uri_test_loc = self.get_test_loc( - 'maven/end2end_multisteps/commons-jaxrs-1.21-index-data.json') - index_uri = json.load(open(index_uri_test_loc, 'rb')) - idx_resource_uri = ResourceURI.objects.insert(**index_uri) - - map_uri(idx_resource_uri) - - if before_pkg: - mapped = packagedb.models.Package.objects.exclude( - id__in=before_pkg) - else: - mapped = packagedb.models.Package.objects.all() - - package_results = sorted((pac.to_dict() - for pac in mapped), key=lambda d: list(d.keys())) - expected_loc = self.get_test_loc( - 
'maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-index.json') - self.check_expected_results(package_results, expected_loc, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) - - # Step 2: map a POM - - # this is a pre-visited URI as from a POM - pom_uri_test_loc = self.get_test_loc( - 'maven/end2end_multisteps/commons-jaxrs-1.21-pom-data.json') - pom_uri = json.load(open(pom_uri_test_loc, 'rb')) - pom_resource_uri = ResourceURI.objects.insert(**pom_uri) - map_uri(pom_resource_uri) - - if before_pkg: - mapped = packagedb.models.Package.objects.exclude( - id__in=before_pkg) - else: - mapped = packagedb.models.Package.objects.all() - - package_results = sorted((pac.to_dict() - for pac in mapped), key=lambda d: list(d.keys())) - expected_loc = self.get_test_loc( - 'maven/end2end_multisteps/expected_mapped_commons-jaxrs-1.21-from-pom.json') - self.check_expected_results(package_results, expected_loc, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) - - def test_visit_and_map_with_index(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties' - test_loc = self.get_test_loc( - 'maven/end2end_index/nexus-maven-repository-index.properties') - - before_uri = [p.id for p in ResourceURI.objects.all()] - before_pkg = [p.id for p in packagedb.models.Package.objects.all()] - - resource_uri = ResourceURI.objects.insert(uri=uri) - - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - # visit test proper: this should insert all the test_uris - visit_uri(resource_uri) - - if before_uri: - visited = ResourceURI.objects.exclude( - id__in=before_uri).order_by('uri') - else: - visited = ResourceURI.objects.all().order_by('uri') - - uri_results = list(model_to_dict( - rec, exclude=['id']) for rec in visited) - expected_loc = self.get_test_loc( - 'maven/end2end_index/expected_visited_index.json') - self.check_expected_results( - uri_results, expected_loc, regen=FIXTURES_REGEN) - - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.543.gz' - # Use a small index file for test cases - test_loc = self.get_test_loc( - 'maven/end2end_index/nexus-maven-repository-index.163.gz') - - resource_uri = ResourceURI.objects.get(uri=uri) - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - # visit test proper: this should insert all the test_uris - visit_uri(resource_uri) - - if before_uri: - visited = ResourceURI.objects.exclude( - id__in=before_uri).order_by('uri') - else: - visited = ResourceURI.objects.all().order_by('uri') - - uri_results = list(model_to_dict( - rec, exclude=['id']) for rec in visited) - expected_loc = self.get_test_loc( - 'maven/end2end_index/expected_visited_increment_index.json') - self.check_expected_results( - uri_results, expected_loc, regen=FIXTURES_REGEN) - - -class MavenXmlMetadataVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_visit_maven_medatata_xml_file(self): - uri = 'https://repo1.maven.org/maven2/st/digitru/identity-core/maven-metadata.xml' - test_loc = self.get_test_loc('maven/maven-metadata/maven-metadata.xml') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = maven_visitor.MavenMetaDataVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/maven-metadata/expected_maven_xml.json') - self.check_expected_uris(uris, expected_loc) - 
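[Note: the visitor tests in this patch, including the deleted Maven tests above, all stub the network the same way: requests.get is patched and made to return a canned response that minecode.utils_test.mocked_requests_get builds from a local test file. That helper's implementation is not part of this patch, so the following is only a minimal sketch of how such a fixture can work, assuming the visitors read response.content; the real helper may differ.]

from unittest import mock


def mocked_requests_get(url, test_loc):
    """Hypothetical stand-in: build a fake requests.Response whose body
    is read from a local test file instead of the network."""
    response = mock.Mock()
    response.url = url
    response.status_code = 200
    with open(test_loc, "rb") as f:
        response.content = f.read()
    return response


# Usage, mirroring the pattern repeated throughout these tests:
# with mock.patch("requests.get") as mock_http_get:
#     mock_http_get.return_value = mocked_requests_get(uri, test_loc)
#     uris, data, errors = SomeVisitor(uri)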
- -class MavenHtmlIndexVisitorTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_visit_maven_medatata_html_index_jcenter_1(self): - uri = 'http://jcenter.bintray.com/' - test_loc = self.get_test_loc('maven/html/jcenter.bintray.com.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = maven_visitor.MavenMetaDataVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/html/visitor_expected_jcenter.bintray.com2.html.json') - self.check_expected_uris(uris, expected_loc) - - def test_visit_maven_medatata_html_index_jcenter_2(self): - uri = 'http://jcenter.bintray.com/Action/app/' - test_loc = self.get_test_loc('maven/html/app.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = maven_visitor.MavenMetaDataVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/html/visitor_expected_app.html.json') - self.check_expected_uris(uris, expected_loc) - - def test_visit_maven_medatata_html_index_jcenter_3(self): - uri = "http://jcenter.bintray.com/'com/virtualightning'/stateframework-compiler/" - test_loc = self.get_test_loc('maven/html/stateframework-compiler.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = maven_visitor.MavenMetaDataVisitor(uri) - expected_loc = self.get_test_loc( - 'maven/html/visitor_expected_stateframework-compiler.html.json') - self.check_expected_uris(uris, expected_loc) - - -# FIXME: we should not need to call a visitor for testing a mapper -class MavenMapperVisitAndMapTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_visit_and_build_package_from_pom_axis(self): - uri = 'https://repo1.maven.org/maven2/axis/axis/1.4/axis-1.4.pom' - test_loc = self.get_test_loc('maven/mapper/axis-1.4.pom') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/mapper/axis-1.4.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_and_build_package_from_pom_commons_pool(self): - uri = 'https://repo1.maven.org/maven2/commons-pool/commons-pool/1.5.7/commons-pool-1.5.7.pom' - test_loc = self.get_test_loc('maven/mapper/commons-pool-1.5.7.pom') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/mapper/commons-pool-1.5.7.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_and_build_package_from_pom_struts(self): - uri = 'https://repo1.maven.org/maven2/struts-menu/struts-menu/2.4.2/struts-menu-2.4.2.pom' - test_loc = self.get_test_loc('maven/mapper/struts-menu-2.4.2.pom') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/mapper/struts-menu-2.4.2.pom.package.json') - 
self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_and_build_package_from_pom_mysql(self): - uri = 'https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.27/mysql-connector-java-5.1.27.pom' - test_loc = self.get_test_loc( - 'maven/mapper/mysql-connector-java-5.1.27.pom') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/mapper/mysql-connector-java-5.1.27.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_and_build_package_from_pom_xbean(self): - uri = 'https://repo1.maven.org/maven2/xbean/xbean-jmx/2.0/xbean-jmx-2.0.pom' - test_loc = self.get_test_loc('maven/mapper/xbean-jmx-2.0.pom') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/mapper/xbean-jmx-2.0.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_and_build_package_from_pom_maven_all(self): - uri = 'https://repo1.maven.org/maven2/date/yetao/maven/maven-all/1.0-RELEASE/maven-all-1.0-RELEASE.pom' - test_loc = self.get_test_loc('maven/mapper/maven-all-1.0-RELEASE.pom') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/mapper/maven-all-1.0-RELEASE.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_visit_and_build_package_from_pom_with_unicode(self): - uri = 'https://repo1.maven.org/maven2/edu/psu/swe/commons/commons-jaxrs/1.21/commons-jaxrs-1.21.pom' - test_loc = self.get_test_loc('maven/mapper/commons-jaxrs-1.21.pom') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _ = maven_visitor.MavenPOMVisitor(uri) - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/mapper/commons-jaxrs-1.21.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - -class MavenMapperGetPackageTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_get_package_from_pom_1(self): - test_loc = self.get_test_loc('maven/parsing/parse/jds-3.0.1.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/parse/jds-3.0.1.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_2(self): - test_loc = self.get_test_loc( - 'maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/parse/springmvc-rest-docs-maven-plugin-1.0-RC1.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_3(self): - test_loc = 
self.get_test_loc('maven/parsing/parse/jds-2.17.0718b.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/parse/jds-2.17.0718b.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_4(self): - test_loc = self.get_test_loc( - 'maven/parsing/parse/maven-javanet-plugin-1.7.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/parse/maven-javanet-plugin-1.7.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_5(self): - test_loc = self.get_test_loc('maven/parsing/loop/coreplugin-1.0.0.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/loop/coreplugin-1.0.0.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_6(self): - test_loc = self.get_test_loc( - 'maven/parsing/loop/argus-webservices-2.7.0.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/loop/argus-webservices-2.7.0.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_7(self): - test_loc = self.get_test_loc('maven/parsing/loop/pkg-2.0.13.1005.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/loop/pkg-2.0.13.1005.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_8(self): - test_loc = self.get_test_loc( - 'maven/parsing/loop/ojcms-beans-0.1-beta.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/loop/ojcms-beans-0.1-beta.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_9(self): - test_loc = self.get_test_loc( - 'maven/parsing/loop/jacuzzi-annotations-0.2.1.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/loop/jacuzzi-annotations-0.2.1.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_10(self): - test_loc = self.get_test_loc( - 'maven/parsing/loop/argus-webservices-2.8.0.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/loop/argus-webservices-2.8.0.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_11(self): - test_loc = self.get_test_loc( - 'maven/parsing/loop/jacuzzi-database-0.2.1.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/loop/jacuzzi-database-0.2.1.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_12(self): - test_loc = self.get_test_loc( - 'maven/parsing/empty/common-object-1.0.2.pom') - data = 
open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/empty/common-object-1.0.2.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_13(self): - test_loc = self.get_test_loc('maven/parsing/empty/osgl-http-1.1.2.pom') - data = open(test_loc).read() - package = maven_mapper.get_package(data).to_dict() - expected_loc = self.get_test_loc( - 'maven/parsing/empty/osgl-http-1.1.2.pom.package.json') - self.check_expected_results( - package, expected_loc, regen=FIXTURES_REGEN) - - def test_regex_maven_pom_mapper_1(self): - regex = re.compile(r'^https?://repo1.maven.org/maven2/.*\.pom$') - result = re.match( - regex, 'https://repo1.maven.org/maven2/com/google/appengine/appengine-api-1.0-sdk/1.2.0/appengine-api-1.0-sdk-1.2.0.pom') - self.assertTrue(result) - - def test_MavenNexusIndexVisitor_uris_increment_contain_correct_purl(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz' - test_loc = self.get_test_loc( - 'maven/index/increment2/nexus-maven-repository-index.457.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) - uris = [u for i, u in enumerate(uris) if i % 500 == 0] - expected_loc = self.get_test_loc( - 'maven/index/increment2/expected_uris.json') - self.check_expected_uris( - uris, expected_loc, data_is_json=True, regen=FIXTURES_REGEN) - - def test_MavenNexusIndexVisitor_then_get_mini_package_from_index_data(self): - uri = 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz' - test_loc = self.get_test_loc( - 'maven/index/increment2/nexus-maven-repository-index.457.gz') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = maven_visitor.MavenNexusIndexVisitor(uri) - results = [] - for i, u in enumerate(uris): - # only get a few records - if i % 500 == 0: - minip = maven_mapper.get_mini_package( - u.data, u.uri, u.package_url) - results.append(minip and minip.to_dict() or minip) - expected_loc = self.get_test_loc( - 'maven/index/increment2/expected_mini_package.json') - self.check_expected_results( - results, expected_loc, regen=FIXTURES_REGEN) - - def test_get_package_from_pom_does_create_a_correct_qualifier(self): - 'https://repo1.maven.org/maven2/org/hspconsortium/reference/hspc-reference-auth-server-webapp/1.9.1/hspc-reference-auth-server-webapp-1.9.1.pom' - - -class MavenPriorityQueueTests(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def setUp(self): - super(MavenPriorityQueueTests, self).setUp() - - self.expected_pom_loc = self.get_test_loc( - 'maven/pom/classworlds-1.1.pom') - with open(self.expected_pom_loc) as f: - self.expected_pom_contents = f.read() - - self.scan_package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=self.expected_pom_contents, - ) - - def test_get_pom_text(self, regen=FIXTURES_REGEN): - pom_contents = maven_visitor.get_pom_text( - namespace=self.scan_package.namespace, - name=self.scan_package.name, - version=self.scan_package.version - ) - if regen: - with open(self.expected_pom_loc, 'w') as f: - f.write(pom_contents) - self.assertEqual(self.expected_pom_contents, pom_contents) - - pom_contents = maven_visitor.get_pom_text( - namespace='', - name='does-not-exist', - 
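These maven visitor and mapper tests never touch the network: requests.get is patched and fed by the mocked_requests_get helper from minecode.utils_test, which answers with the content of a local fixture file. A minimal sketch of such a helper, assuming the code under test only reads the ok, status_code, url, content, and text attributes of the response:

from unittest import mock

def mocked_requests_get(uri, test_loc):
    # Stand-in for the requests.Response that requests.get(uri) would
    # return, with the payload read from a local fixture file instead.
    with open(test_loc, "rb") as f:
        content = f.read()
    response = mock.Mock()
    response.ok = True
    response.status_code = 200
    response.url = uri
    response.content = content
    response.text = content.decode("utf-8", errors="replace")
    return response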
version='1.0', - ) - self.assertFalse(pom_contents) - - def test_get_package_sha1(self): - sha1 = maven_visitor.get_package_sha1(self.scan_package) - expected_sha1 = '60c708f55deeb7c5dfce8a7886ef09cbc1388eca' - self.assertEqual(expected_sha1, sha1) - - def test_map_maven_package(self): - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(0, package_count) - package_url = PackageURL.from_string(self.scan_package.purl) - maven_visitor.map_maven_package( - package_url, packagedb.models.PackageContentType.BINARY, ('test_pipeline')) - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(1, package_count) - package = packagedb.models.Package.objects.all().first() - expected_purl_str = 'pkg:maven/classworlds/classworlds@1.1' - self.assertEqual(expected_purl_str, package.purl) - - def test_map_maven_package_custom_repo_url(self): - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(0, package_count) - custom_repo_purl = "pkg:maven/org.eclipse.core/runtime@20070801?repository_url=https://packages.atlassian.com/mvn/maven-atlassian-external/" - package_url = PackageURL.from_string(custom_repo_purl) - maven_visitor.map_maven_package( - package_url, packagedb.models.PackageContentType.BINARY, ('test_pipeline')) - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(1, package_count) - package = packagedb.models.Package.objects.all().first() - expected_repo_url = 'https://packages.atlassian.com/mvn/maven-atlassian-external//org/eclipse/core/runtime/20070801/runtime-20070801.jar' - self.assertEqual(expected_repo_url, package.download_url) - - def test_process_request(self): - purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0' - download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar' - purl_sources_str = f'{purl_str}?classifier=sources' - sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(0, package_count) - maven_visitor.process_request(purl_str) - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(2, package_count) - purls = [ - (package.purl, package.download_url) - for package in packagedb.models.Package.objects.all() - ] - self.assertIn( - (purl_str, download_url), purls - ) - self.assertIn( - (purl_sources_str, sources_download_url), purls - ) - - def test_fetch_parent(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom') - with open(pom_loc) as f: - pom_text = f.read() - parent_pom_text = maven_visitor.fetch_parent(pom_text) - expected_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom') - - if regen: - with open(expected_loc, 'w') as f: - f.write(parent_pom_text) - - with open(expected_loc) as f: - expected_pom_text = f.read() - self.assertEqual(expected_pom_text, parent_pom_text) - - def test_get_ancestry(self): - pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom') - with open(pom_loc) as f: - pom_text = f.read() - ancestor_pom_texts = list(maven_visitor.get_ancestry(pom_text)) - expected_ancestor_pom_texts = [] - for expected_loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') - ]: - with open(expected_loc) as f: - expected_pom_text = f.read() - 
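test_process_request above expects two packages for one request: the binary jar and a companion whose purl carries a classifier=sources qualifier. A short illustration of deriving that sources purl from the binary one with packageurl-python (illustrative only; the mapping code may construct it differently):

from packageurl import PackageURL

binary = PackageURL.from_string("pkg:maven/org.apache.twill/twill-core@0.12.0")
qualifiers = dict(binary.qualifiers or {})
qualifiers["classifier"] = "sources"  # sources jars carry the "sources" classifier
sources = PackageURL(
    type=binary.type,
    namespace=binary.namespace,
    name=binary.name,
    version=binary.version,
    qualifiers=qualifiers,
)
assert str(sources) == "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources"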
expected_ancestor_pom_texts.append(expected_pom_text) - self.assertEqual(expected_ancestor_pom_texts, ancestor_pom_texts) - - def test_merge_parent(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/ant-antlr-1.10.1.pom') - with open(pom_loc) as f: - pom_text = f.read() - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text - ) - expected_before_loc = self.get_test_loc( - 'maven/pom/ant-antlr-1.10.1-package_before.json') - self.check_expected_results( - package.to_dict(), expected_before_loc, regen=regen) - - parent_pom_loc = self.get_test_loc('maven/pom/ant-parent-1.10.1.pom') - with open(parent_pom_loc) as f: - parent_pom_text = f.read() - parent_package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=parent_pom_text - ) - package = maven_visitor.merge_parent(package, parent_package) - expected_after_loc = self.get_test_loc( - 'maven/pom/ant-antlr-1.10.1-package_after.json') - self.check_expected_results( - package.to_dict(), expected_after_loc, regen=regen) - - def test_merge_ancestors(self, regen=FIXTURES_REGEN): - pom_loc = self.get_test_loc('maven/pom/pulsar-client-1x-2.5.1.pom') - with open(pom_loc) as f: - pom_text = f.read() - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text - ) - expected_before_loc = self.get_test_loc( - 'maven/pom/pulsar-client-1x-2.5.1-package_before.json') - self.check_expected_results( - package.to_dict(), expected_before_loc, regen=regen) - - ancestor_pom_texts = [] - for loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') - ]: - with open(loc) as f: - pom_text = f.read() - ancestor_pom_texts.append(pom_text) - - maven_visitor.merge_ancestors(ancestor_pom_texts, package) - expected_after_loc = self.get_test_loc( - 'maven/pom/pulsar-client-1x-2.5.1-package_after.json') - self.check_expected_results( - package.to_dict(), expected_after_loc, regen=regen) - - @mock.patch("minecode.visitors.maven.get_pom_text") - def test_get_merged_ancestor_package_from_maven_package(self, get_pom_text_mock, regen=FIXTURES_REGEN): - get_pom_text_mock.return_value = "" - ancestor_pom_texts = [] - with patch("minecode.visitors.maven.get_ancestry") as mock_get_ancestry: - for loc in [ - self.get_test_loc('maven/pom/apache-18.pom'), - self.get_test_loc('maven/pom/pulsar-2.5.1.pom'), - self.get_test_loc('maven/pom/pulsar-client-1x-base-2.5.1.pom') - ]: - with open(loc) as f: - pom_text = f.read() - ancestor_pom_texts.append(pom_text) - mock_get_ancestry.return_value = ancestor_pom_texts - db_package = packagedb.models.Package.objects.create( - name="pulsar-client", - namespace="org.apache.pulsar", - version="2.5.1", - type="maven", - download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar-client/2.5.1/pulsar-client-2.5.1.jar", - ) - merged_package = maven_visitor.get_merged_ancestor_package_from_maven_package( - package=db_package) - expected_loc = self.get_test_loc( - 'maven/pom/pulsar-client-merged-ancestor-package.json') - self.check_expected_results( - merged_package.to_dict(), expected_loc, regen=regen) - - -class MavenCrawlerFunctionsTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_check_if_file_name_is_linked_on_page(self): - links = ['foo/', 'bar/', 'baz/'] - self.assertTrue( - maven_visitor.check_if_file_name_is_linked_on_page('foo/', links) - ) - self.assertFalse( - 
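The crawler checks exercised here are plain predicates over the link names scraped from a repository index page. Minimal sketches that satisfy the assertions in these tests (the real implementations in minecode.visitors.maven may be stricter):

def check_if_file_name_is_linked_on_page(file_name, links):
    # True when the exact link name appears on the page.
    return file_name in links

def check_if_page_has_pom_files(links):
    # True when at least one link points to a .pom file.
    return any(link.endswith(".pom") for link in links)

def check_if_page_has_directories(links):
    # True when there is at least one sub-directory link besides "../".
    return any(link.endswith("/") and link != "../" for link in links)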
maven_visitor.check_if_file_name_is_linked_on_page('qux/', links) - ) - - def test_check_if_page_has_pom_files(self): - links1 = ['foo/', 'bar.jar', 'bar.pom'] - links2 = ['foo/', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_page_has_pom_files(links1)) - self.assertFalse(maven_visitor.check_if_page_has_pom_files(links2)) - - def test_check_if_page_has_directories(self): - links1 = ['foo/', 'bar/', 'baz/'] - links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_page_has_directories(links1)) - self.assertFalse(maven_visitor.check_if_page_has_directories(links2)) - - def test_check_if_package_version_page(self): - links1 = ['../', 'bar.pom', 'bar.jar'] - links2 = ['../', 'foo/', 'bar/', 'baz/'] - self.assertTrue(maven_visitor.check_if_package_version_page(links1)) - self.assertFalse(maven_visitor.check_if_package_version_page(links2)) - - def test_check_if_package_page(self): - links1 = ['../', 'maven-metadata.xml'] - links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_package_page(links1)) - self.assertFalse(maven_visitor.check_if_package_page(links2)) - - def test_check_if_maven_root(self): - links1 = ['../', 'archetype-catalog.xml'] - links2 = ['../', 'bar.pom', 'bar.jar'] - self.assertTrue(maven_visitor.check_if_maven_root(links1)) - self.assertFalse(maven_visitor.check_if_maven_root(links2)) - - @mock.patch('requests.get') - def test_check_on_page(self, mock_request_get): - checker = maven_visitor.check_if_page_has_pom_files - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = 'parent-7.11.0.pom' - self.assertTrue(maven_visitor.check_on_page( - 'https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/', checker)) - - @mock.patch('requests.get') - def test_is_maven_root(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = 'archetype-catalog.xml' - self.assertTrue(maven_visitor.is_maven_root( - 'https://repo1.maven.org/maven2/')) - - @mock.patch('requests.get') - def test_is_package_page(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = 'maven-metadata.xml' - self.assertTrue(maven_visitor.is_package_page( - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/')) - - @mock.patch('requests.get') - def test_is_package_version_page(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' - ../ - parent-7.11.0.pom - ''' - self.assertTrue(maven_visitor.is_package_version_page( - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/')) - - def test_url_parts(self): - url = 'https://example.com/foo/bar/baz.jar' - scheme, netloc, path_segments = maven_visitor.url_parts(url) - self.assertEqual('https', scheme) - self.assertEqual('example.com', netloc) - self.assertEqual(['foo', 'bar', 'baz.jar'], path_segments) - - def test_create_url(self): - scheme = 'https' - netloc = 'example.com' - path_segments = ['foo', 'bar', 'baz.jar'] - url = 'https://example.com/foo/bar/baz.jar' - self.assertEqual( - url, - maven_visitor.create_url(scheme, netloc, path_segments) - ) - - @mock.patch('requests.get') - def test_get_maven_root(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = 'archetype-catalog.xml' - self.assertEqual( - 'https://repo1.maven.org/maven2', - maven_visitor.get_maven_root( - 'https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - ) - - @mock.patch('requests.get') - def 
test_determine_namespace_name_version_from_url(self, mock_request_get): - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2' - root_url = 'https://repo1.maven.org/maven2' - - package_page_text = ''' - 1.0.b2/ - 2005-09-20 05:53 - - maven-metadata.xml - 2012-06-26 17:01 567 - ''' - package_page = mock.Mock(ok=True, text=package_page_text) - - package_version_page_text = ''' - ../ - - xml-apis-1.0.b2.pom - 2005-09-20 05:53 2249 - ''' - package_version_page = mock.Mock( - ok=True, text=package_version_page_text) - mock_request_get.side_effect = [ - mock.Mock(ok=True, text=''), - mock.Mock(ok=True, text=''), - package_page, - mock.Mock(ok=True, text=''), - package_version_page - ] - - namespace, package_name, package_version = maven_visitor.determine_namespace_name_version_from_url( - url, root_url) - self.assertEqual('xml-apis', namespace) - self.assertEqual('xml-apis', package_name) - self.assertEqual('1.0.b2', package_version) - - @mock.patch('requests.get') - def test_add_to_import_queue(self, mock_request_get): - from minecode.models import ImportableURI - - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' - root_url = 'https://repo1.maven.org/maven2' - - package_page_text = ''' - 1.0.b2/ - 2005-09-20 05:53 - - maven-metadata.xml - 2012-06-26 17:01 567 - ''' - package_page = mock.Mock(ok=True, text=package_page_text) - - package_version_page_text = ''' - ../ - - xml-apis-1.0.b2.pom - 2005-09-20 05:53 2249 - ''' - package_version_page = mock.Mock( - ok=True, text=package_version_page_text) - mock_request_get.side_effect = [ - package_page, - mock.Mock(ok=True, text=''), - mock.Mock(ok=True, text=''), - package_page, - mock.Mock(ok=True, text=''), - package_version_page - ] - - self.assertEqual(0, ImportableURI.objects.all().count()) - maven_visitor.add_to_import_queue(url, root_url) - self.assertEqual(1, ImportableURI.objects.all().count()) - importable_uri = ImportableURI.objects.get(uri=url) - self.assertEqual('pkg:maven/xml-apis/xml-apis', - importable_uri.package_url) - - def test_filter_only_directories(self): - timestamps_by_links = { - '../': '-', - 'foo/': '-', - 'foo.pom': '2023-09-28', - } - expected = { - 'foo/': '-', - } - self.assertEqual( - expected, - maven_visitor.filter_only_directories(timestamps_by_links) - ) - - def test_filter_for_artifacts(self): - timestamps_by_links = { - '../': '2023-09-28', - 'foo.pom': '2023-09-28', - 'foo.ejb3': '2023-09-28', - 'foo.ear': '2023-09-28', - 'foo.aar': '2023-09-28', - 'foo.apk': '2023-09-28', - 'foo.gem': '2023-09-28', - 'foo.jar': '2023-09-28', - 'foo.nar': '2023-09-28', - 'foo.so': '2023-09-28', - 'foo.swc': '2023-09-28', - 'foo.tar': '2023-09-28', - 'foo.tar.gz': '2023-09-28', - 'foo.war': '2023-09-28', - 'foo.xar': '2023-09-28', - 'foo.zip': '2023-09-28', - } - expected = { - 'foo.ejb3': '2023-09-28', - 'foo.ear': '2023-09-28', - 'foo.aar': '2023-09-28', - 'foo.apk': '2023-09-28', - 'foo.gem': '2023-09-28', - 'foo.jar': '2023-09-28', - 'foo.nar': '2023-09-28', - 'foo.so': '2023-09-28', - 'foo.swc': '2023-09-28', - 'foo.tar': '2023-09-28', - 'foo.tar.gz': '2023-09-28', - 'foo.war': '2023-09-28', - 'foo.xar': '2023-09-28', - 'foo.zip': '2023-09-28', - } - self.assertEqual( - expected, maven_visitor.filter_for_artifacts(timestamps_by_links)) - - def test_collect_links_from_text(self): - filter = maven_visitor.filter_only_directories - text = ''' - ../ - 1.0.b2/ - 2005-09-20 05:53 - - 1.2.01/ - 2010-02-03 21:05 - - ''' - expected = { - '1.0.b2/': '2005-09-20 05:53', - '1.2.01/': '2010-02-03 21:05' - } - 
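The filter_for_artifacts expectation above boils down to an extension allowlist: drop the parent link and metadata such as .pom files, keep anything that looks like a deployable artifact. A sketch under that assumption:

# Extensions kept by the expected output above.
ARTIFACT_EXTENSIONS = (
    ".ejb3", ".ear", ".aar", ".apk", ".gem", ".jar", ".nar",
    ".so", ".swc", ".tar", ".tar.gz", ".war", ".xar", ".zip",
)

def filter_for_artifacts(timestamps_by_links):
    # Keep only links whose name ends with a known artifact extension;
    # "../" and "foo.pom" fall through the allowlist.
    return {
        link: timestamp
        for link, timestamp in timestamps_by_links.items()
        if link.endswith(ARTIFACT_EXTENSIONS)
    }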
self.assertEqual( - expected, - maven_visitor.collect_links_from_text(text, filter=filter) - ) - - def test_create_absolute_urls_for_links(self): - filter = maven_visitor.filter_only_directories - text = ''' - ../ - 1.0.b2/ - 2005-09-20 05:53 - - 1.2.01/ - 2010-02-03 21:05 - - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' - expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' - } - self.assertEqual( - expected, - maven_visitor.create_absolute_urls_for_links( - text, url, filter=filter) - ) - - @mock.patch('requests.get') - def test_get_directory_links(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' - ../ - 1.0.b2/ - 2005-09-20 05:53 - - 1.2.01/ - 2010-02-03 21:05 - - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/' - expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/': '2005-09-20 05:53', - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.2.01/': '2010-02-03 21:05' - } - self.assertEqual(expected, maven_visitor.get_directory_links(url)) - - @mock.patch('requests.get') - def test_get_artifact_links(self, mock_request_get): - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = ''' - ../ - xml-apis-1.0.b2.jar - 2005-09-20 05:53 109318 - xml-apis-1.0.b2.pom - 2005-09-20 05:53 2249 - ''' - url = 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/' - expected = { - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar': '2005-09-20 05:53', - } - self.assertEqual(expected, maven_visitor.get_artifact_links(url)) - - def test_crawl_to_package(self): - pass - - def test_crawl_maven_repo_from_root(self): - pass - - @mock.patch('requests.get') - def test_get_artifact_sha1(self, mock_request_get): - sha1 = '3136ca936f64c9d68529f048c2618bd356bf85c9' - mock_request_get.return_value.ok = True - mock_request_get.return_value.text = sha1 - self.assertEqual(sha1, maven_visitor.get_artifact_sha1( - 'https://repo1.maven.org/maven2/xml-apis/xml-apis/1.0.b2/xml-apis-1.0.b2.jar.sha1')) - - def test_get_classifier_from_artifact_url(self): - artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' - package_version_page_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/' - package_name = 'livereload-jvm' - package_version = '0.2.0' - classifier = maven_visitor.get_classifier_from_artifact_url( - artifact_url, - package_version_page_url, - package_name, - package_version - ) - self.assertEqual('onejar', classifier) diff --git a/minecode/tests/test_migrations.py b/minecode/tests/test_migrations.py index 31f04c86..45dd1243 100644 --- a/minecode/tests/test_migrations.py +++ b/minecode/tests/test_migrations.py @@ -53,36 +53,36 @@ def test_populate_has_error_fields(self): "map_error", "has_visit_error", "visit_error", - ).order_by('uri') + ).order_by("uri") ) expected = [ { - 'has_map_error': True, - 'has_visit_error': True, - 'map_error': 'error', - 'uri': 'http://example.com/1', - 'visit_error': 'error' + "has_map_error": True, + "has_visit_error": True, + "map_error": "error", + "uri": "http://example.com/1", + "visit_error": "error", }, { - 'has_map_error': False, - 'has_visit_error': True, - 'map_error': None, - 'uri': 'http://example.com/2', - 'visit_error': 'error' + "has_map_error": False, + "has_visit_error": True, + "map_error": None, + 
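The migration tests in this file rely on the usual TestMigrations recipe: roll the schema back to migrate_from, create rows against the historical model handed to setUpBeforeMigration, then apply migrate_to and assert on the resulting data. A hedged sketch of such a base class (the project's actual helper may differ in detail):

from django.db import connection
from django.db.migrations.executor import MigrationExecutor
from django.test import TestCase

class TestMigrations(TestCase):
    app_name = None
    migrate_from = None
    migrate_to = None

    def setUp(self):
        executor = MigrationExecutor(connection)
        # Roll back to the starting migration and seed historical data.
        executor.migrate([(self.app_name, self.migrate_from)])
        old_apps = executor.loader.project_state(
            [(self.app_name, self.migrate_from)]
        ).apps
        self.setUpBeforeMigration(old_apps)
        # Re-read the graph, then apply the migration under test.
        executor.loader.build_graph()
        executor.migrate([(self.app_name, self.migrate_to)])

    def setUpBeforeMigration(self, apps):
        pass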
"uri": "http://example.com/2", + "visit_error": "error", }, { - 'has_map_error': True, - 'has_visit_error': False, - 'map_error': 'error', - 'uri': 'http://example.com/3', - 'visit_error': None + "has_map_error": True, + "has_visit_error": False, + "map_error": "error", + "uri": "http://example.com/3", + "visit_error": None, }, { - 'has_map_error': False, - 'has_visit_error': False, - 'map_error': None, - 'uri': 'http://example.com/4', - 'visit_error': None + "has_map_error": False, + "has_visit_error": False, + "map_error": None, + "uri": "http://example.com/4", + "visit_error": None, }, ] self.assertEqual(results, expected) @@ -122,57 +122,13 @@ def test_set_is_visitable_for_maven_index_uris(self): ) expected = [ { - 'is_visitable': False, - 'uri': 'maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar' + "is_visitable": False, + "uri": "maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar", }, { - 'is_visitable': False, - 'uri': 'maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar' - } - ] - self.assertEqual(results, expected) - -class TestSetIsVisitableForMavenIndexURIs(TestMigrations): - app_name = "minecode" - migrate_from = "0025_populate_has_error_fields" - migrate_to = "0026_set_is_visitable_for_maven_index_uris" - - def setUpBeforeMigration(self, apps): - # using get_model to avoid circular import - ResourceURI = apps.get_model("minecode", "ResourceURI") - - self.resource_uris = [ - ResourceURI.objects.create( - uri="maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar", - is_visitable=True, - ), - ResourceURI.objects.create( - uri="maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar", - is_visitable=False, - ), - ] - - for resource_uri in self.resource_uris: - resource_uri.save() - - def test_set_is_visitable_for_maven_index_uris(self): - # using get_model to avoid circular import - ResourceURI = apps.get_model("minecode", "ResourceURI") - results = list( - ResourceURI.objects.values( - "uri", - "is_visitable", - ).all() - ) - expected = [ - { - 'is_visitable': False, - 'uri': 'maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.8.jar' + "is_visitable": False, + "uri": "maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar", }, - { - 'is_visitable': False, - 'uri': 'maven-index://repo1.maven.org/zone/src/sheaf/logback-sheaf/1.1.7/logback-sheaf-1.1.7.jar' - } ] self.assertEqual(results, expected) @@ -208,10 +164,10 @@ def test_replace_http_with_https_in_maven_uris(self): ) expected = [ { - 'uri': 'https://repo1.maven.org/maven2/xyz/upperlevel/command/spigot/spigot-command-api/1.1.1/spigot-command-api-1.1.1.pom' + "uri": "https://repo1.maven.org/maven2/xyz/upperlevel/command/spigot/spigot-command-api/1.1.1/spigot-command-api-1.1.1.pom" }, { - 'uri': 'https://repo1.maven.org/maven2/xyz/upperlevel/command/spigot/spigot-command-api/1.1.1/spigot-command-api-1.1.1.pom' - } + "uri": "https://repo1.maven.org/maven2/xyz/upperlevel/command/spigot/spigot-command-api/1.1.1/spigot-command-api-1.1.1.pom" + }, ] self.assertEqual(results, expected) diff --git a/minecode/tests/test_model_utils.py b/minecode/tests/test_model_utils.py index bf64067b..34e72fa4 100644 --- a/minecode/tests/test_model_utils.py +++ b/minecode/tests/test_model_utils.py @@ -10,44 +10,43 @@ import os from django.test import TransactionTestCase + from packagedcode.maven import 
_parse from minecode.model_utils import merge_or_create_package from minecode.model_utils import update_or_create_resource +from minecode.tests import FIXTURES_REGEN from minecode.utils_test import JsonBasedTesting from minecode.utils_test import MiningTestCase -from minecode.tests import FIXTURES_REGEN from packagedb.models import Package from packagedb.models import Resource class ModelUtilsTestCase(MiningTestCase, JsonBasedTesting): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - pom_loc = self.get_test_loc('maven/pom/pulsar-2.5.1.pom') - self.scanned_package = _parse( - 'maven_pom', 'maven', 'Java', location=pom_loc) - self.scanned_package.download_url = 'https://repo1.maven.org/maven2/org/apache/pulsar/pulsar/2.5.1/pulsar-2.5.1.jar' + pom_loc = self.get_test_loc("maven/pom/pulsar-2.5.1.pom") + self.scanned_package = _parse("maven_pom", "maven", "Java", location=pom_loc) + self.scanned_package.download_url = "https://repo1.maven.org/maven2/org/apache/pulsar/pulsar/2.5.1/pulsar-2.5.1.jar" def test_merge_or_create_package_create_package(self): self.assertEqual(0, Package.objects.all().count()) package, created, merged, map_error = merge_or_create_package( - self.scanned_package, - visit_level=50 + self.scanned_package, visit_level=50 ) self.assertEqual(1, Package.objects.all().count()) self.assertEqual(package, Package.objects.all().first()) self.assertTrue(created) self.assertFalse(merged) - self.assertEqual('', map_error) + self.assertEqual("", map_error) self.assertTrue(package.created_date) self.assertTrue(package.last_modified_date) - expected_loc = self.get_test_loc('model_utils/created_package.json') + expected_loc = self.get_test_loc("model_utils/created_package.json") self.check_expected_results( package.to_dict(), expected_loc, - fields_to_remove=['package_sets'], + fields_to_remove=["package_sets"], regen=FIXTURES_REGEN, ) @@ -55,53 +54,55 @@ def test_merge_or_create_package_merge_package(self): # ensure fields get updated # ensure history is properly updated package = Package.objects.create( - type='maven', - namespace='org.apache.pulsar', - name='pulsar', - version='2.5.1', - download_url='https://repo1.maven.org/maven2/org/apache/pulsar/pulsar/2.5.1/pulsar-2.5.1.jar', + type="maven", + namespace="org.apache.pulsar", + name="pulsar", + version="2.5.1", + download_url="https://repo1.maven.org/maven2/org/apache/pulsar/pulsar/2.5.1/pulsar-2.5.1.jar", ) - before_merge_loc = self.get_test_loc('model_utils/before_merge.json') + before_merge_loc = self.get_test_loc("model_utils/before_merge.json") self.check_expected_results( package.to_dict(), before_merge_loc, - fields_to_remove=['package_sets'], + fields_to_remove=["package_sets"], regen=FIXTURES_REGEN, ) package, created, merged, map_error = merge_or_create_package( - self.scanned_package, - visit_level=50 + self.scanned_package, visit_level=50 ) self.assertEqual(1, Package.objects.all().count()) self.assertEqual(package, Package.objects.all().first()) self.assertFalse(created) self.assertTrue(merged) - self.assertEqual('', map_error) - expected_loc = self.get_test_loc('model_utils/after_merge.json') + self.assertEqual("", map_error) + expected_loc = self.get_test_loc("model_utils/after_merge.json") self.check_expected_results( package.to_dict(), expected_loc, - fields_to_remove=['package_sets'], + fields_to_remove=["package_sets"], regen=FIXTURES_REGEN, ) history = package.get_history() self.assertEqual(1, len(history)) entry 
= history[0] - timestamp = entry['timestamp'] - message = entry['message'] + timestamp = entry["timestamp"] + message = entry["message"] self.assertEqual( - 'Package field values have been updated.', + "Package field values have been updated.", message, ) last_modified_date_formatted = package.last_modified_date.strftime( - "%Y-%m-%d-%H:%M:%S") + "%Y-%m-%d-%H:%M:%S" + ) self.assertEqual(timestamp, last_modified_date_formatted) - data = entry['data'] - updated_fields = data['updated_fields'] + data = entry["data"] + updated_fields = data["updated_fields"] expected_updated_fields_loc = self.get_test_loc( - 'model_utils/expected_updated_fields.json') + "model_utils/expected_updated_fields.json" + ) self.check_expected_results( - updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN) + updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN + ) class UpdateORCreateResourceTest(TransactionTestCase): diff --git a/minecode/tests/test_models.py b/minecode/tests/test_models.py index 7b0864cd..7c180808 100644 --- a/minecode/tests/test_models.py +++ b/minecode/tests/test_models.py @@ -13,32 +13,31 @@ from django.test import TestCase from django.utils import timezone -from minecode import visitors -from minecode import mappers - from minecode.models import ResourceURI -from packagedb.models import Package -from minecode.models import get_canonical from minecode.models import ScannableURI +from minecode.models import get_canonical +from packagedb.models import Package class ResourceURIModelTestCase(TestCase): - def setUp(self): self.res = ResourceURI.objects.insert( - uri='http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom') + uri="http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) def test_get_canonical(self): data = ( - ('http://www.nexb.com', 'http://www.nexb.com/'), - ('http://www.nexb.com/', 'http://www.nexb.com/'), - ('http://www.nexb.com/a/b/../../c/', 'http://www.nexb.com/c/'), - ('http://www.nexb.com:80', 'http://www.nexb.com/'), - ('https://www.nexb.com:443', 'https://www.nexb.com/'), - ('http://www.nexb.com:443', 'http://www.nexb.com:443/'), - ('https://www.nexb.com:80', 'https://www.nexb.com:80/'), - ('http://www.nexb.com/A 0.0.1 Alpha/a_0_0_1.zip', - 'http://www.nexb.com/A%200.0.1%20Alpha/a_0_0_1.zip'), + ("http://www.nexb.com", "http://www.nexb.com/"), + ("http://www.nexb.com/", "http://www.nexb.com/"), + ("http://www.nexb.com/a/b/../../c/", "http://www.nexb.com/c/"), + ("http://www.nexb.com:80", "http://www.nexb.com/"), + ("https://www.nexb.com:443", "https://www.nexb.com/"), + ("http://www.nexb.com:443", "http://www.nexb.com:443/"), + ("https://www.nexb.com:80", "https://www.nexb.com:80/"), + ( + "http://www.nexb.com/A 0.0.1 Alpha/a_0_0_1.zip", + "http://www.nexb.com/A%200.0.1%20Alpha/a_0_0_1.zip", + ), ) for test, expected in data: self.assertEqual(expected, get_canonical(test)) @@ -46,23 +45,24 @@ def test_get_canonical(self): def test_is_routable_flags_are_not_overwritten_on_save(self): self.assertTrue(self.res.is_visitable) self.assertTrue(self.res.is_mappable) - self.res.sha1 = 'a' * 40 + self.res.sha1 = "a" * 40 self.res.save() res1 = ResourceURI.objects.get( - uri='http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom') + uri="http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) self.assertTrue(res1.is_visitable) self.assertTrue(res1.is_mappable) res1.save() res2 = ResourceURI.objects.get( - uri='http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom') + 
uri="http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) self.assertTrue(res2.is_visitable) self.assertTrue(res2.is_mappable) class ResourceURIManagerTestCase(TestCase): - def setUp(self): - self.uri = 'https://sourceforge.net/sitemap.xml' + self.uri = "https://sourceforge.net/sitemap.xml" self.resource = ResourceURI.objects.insert(uri=self.uri, priority=100) def test_insert(self): @@ -97,7 +97,7 @@ def test_successful(self): self.resource.last_visit_date = timezone.now() self.resource.save() self.assertTrue(ResourceURI.objects.successfully_visited()) - self.resource.visit_error = 'error' + self.resource.visit_error = "error" self.resource.save() self.assertFalse(ResourceURI.objects.successfully_visited()) @@ -106,7 +106,7 @@ def test_unsuccessful(self): self.resource.last_visit_date = timezone.now() self.resource.save() self.assertFalse(ResourceURI.objects.unsuccessfully_visited()) - self.resource.visit_error = 'error' + self.resource.visit_error = "error" self.resource.save() self.assertTrue(ResourceURI.objects.unsuccessfully_visited()) @@ -114,152 +114,121 @@ def test_needs_revisit_force_revisit_at_0_hours(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertTrue(ResourceURI.objects.needs_revisit( - uri=self.uri, hours=0)) + self.assertTrue(ResourceURI.objects.needs_revisit(uri=self.uri, hours=0)) def test_needs_revisit_very_old_visit(self): self.resource.last_visit_date = timezone.now() - timedelta(days=20) self.resource.save() - self.assertTrue(ResourceURI.objects.needs_revisit( - uri=self.uri, hours=240)) + self.assertTrue(ResourceURI.objects.needs_revisit(uri=self.uri, hours=240)) def test_needs_revisit_near_visit(self): self.resource.last_visit_date = timezone.now() - timedelta(hours=3) self.resource.save() - self.assertTrue(ResourceURI.objects.needs_revisit( - uri=self.uri, hours=2)) + self.assertTrue(ResourceURI.objects.needs_revisit(uri=self.uri, hours=2)) def test_needs_revisit_recent_visit(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertFalse( - ResourceURI.objects.needs_revisit(uri=self.uri, hours=2)) + self.assertFalse(ResourceURI.objects.needs_revisit(uri=self.uri, hours=2)) def test_needs_revisit_never_been_visited(self): - self.assertFalse(ResourceURI.objects.needs_revisit( - uri=self.uri, hours=200)) + self.assertFalse(ResourceURI.objects.needs_revisit(uri=self.uri, hours=200)) class ResourceURIManagerGetRevisitablesUnmappableURITestCase(TestCase): - def setUp(self): - self.uri = 'https://sourceforge.net/sitemap.xml' + self.uri = "https://sourceforge.net/sitemap.xml" self.resource = ResourceURI.objects.insert(uri=self.uri, priority=100) def test_get_revisitables_last_visit_date_now(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=240).count()) def test_get_revisitables_last_visit_date_10_days_ago(self): self.resource.last_visit_date = timezone.now() - timedelta(hours=240) self.resource.save() - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 1, 
ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=240).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=241).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=241).count()) class ResourceURIManagerGetRevisitablesMappableURITestCase(TestCase): - def setUp(self): # this is a mappable ResourceURI - self.uri = 'http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom' + self.uri = ( + "http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) self.resource = ResourceURI.objects.insert(uri=self.uri, priority=100) def test_get_revisitables_unmapped_last_visit_date_now(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=240).count()) def test_get_revisitables_unmapped_last_visit_date_less_than_threshold(self): self.resource.last_visit_date = timezone.now() self.resource.save() - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) def test_get_revisitables_unmapped_last_visit_date_10_days_ago(self): self.resource.last_visit_date = timezone.now() - timedelta(hours=240) self.resource.save() - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=240).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=241).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=241).count()) def test_get_revisitables_mapped_last_visit_date_now(self): self.resource.last_visit_date = timezone.now() self.resource.last_map_date = timezone.now() self.resource.save() - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=240).count()) def test_get_revisitables_mapped_last_visit_date_less_than_threshold(self): self.resource.last_visit_date = timezone.now() self.resource.last_map_date = timezone.now() self.resource.save() - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(0, 
ResourceURI.objects.get_revisitables(hours=1).count()) def test_get_revisitables_mapped_last_visit_date_10_days_ago(self): self.resource.last_visit_date = timezone.now() - timedelta(hours=240) self.resource.last_map_date = timezone.now() self.resource.save() - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=0).count()) - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=1).count()) - self.assertEqual( - 1, ResourceURI.objects.get_revisitables(hours=240).count()) - self.assertEqual( - 0, ResourceURI.objects.get_revisitables(hours=241).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=0).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=1).count()) + self.assertEqual(1, ResourceURI.objects.get_revisitables(hours=240).count()) + self.assertEqual(0, ResourceURI.objects.get_revisitables(hours=241).count()) class ResourceURIManagerGetNextVisitableUnmappableURITestCase(TestCase): - def setUp(self): - self.uri0 = 'https://sourceforge.net/sitemap.xml' - self.uri1 = 'https://sourceforge.net/sitemap-0.xml' + self.uri0 = "https://sourceforge.net/sitemap.xml" + self.uri1 = "https://sourceforge.net/sitemap-0.xml" self.resource0 = ResourceURI.objects.insert(uri=self.uri0, priority=1) self.resource1 = ResourceURI.objects.insert(uri=self.uri1, priority=2) def test_get_next_visitable_unvisited(self): - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) def test_get_next_visitable_none_when_both_visited_less_than_10_days_ago(self): @@ -276,20 +245,19 @@ def test_get_next_visitable_when_both_visited_10_days_ago(self): self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) - def test_get_next_visitable_when_one_unvisited_and_one_visited_less_than_10_days_ago(self): + def test_get_next_visitable_when_one_unvisited_and_one_visited_less_than_10_days_ago( + self, + ): self.resource0.last_visit_date = None self.resource1.last_visit_date = timezone.now() - timedelta(hours=24) self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) self.resource0.last_visit_date = timezone.now() - timedelta(hours=24) @@ -297,18 +265,18 @@ def test_get_next_visitable_when_one_unvisited_and_one_visited_less_than_10_days self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) - def test_get_next_visitable_when_one_visited_more_and_one_visited_less_than_10_days_ago(self): + def test_get_next_visitable_when_one_visited_more_and_one_visited_less_than_10_days_ago( + self, + ): 
self.resource0.last_visit_date = timezone.now() - timedelta(hours=250) self.resource1.last_visit_date = timezone.now() - timedelta(hours=24) self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) self.resource0.last_visit_date = timezone.now() - timedelta(hours=24) @@ -316,27 +284,25 @@ def test_get_next_visitable_when_one_visited_more_and_one_visited_less_than_10_d self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) class ResourceURIManagerGetNextVisitableMappableURITestCase(TestCase): - def setUp(self): # this is a mappable ResourceURI - self.uri0 = 'http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom' - self.uri1 = 'http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.1/mav-all-1.1.pom' - self.resource0 = ResourceURI.objects.insert( - uri=self.uri0, priority=100) - self.resource1 = ResourceURI.objects.insert( - uri=self.uri1, priority=100) + self.uri0 = ( + "http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.0/mav-all-1.0.pom" + ) + self.uri1 = ( + "http://repo1.maven.org/maven2/org/ye/mav/mav-all/1.1/mav-all-1.1.pom" + ) + self.resource0 = ResourceURI.objects.insert(uri=self.uri0, priority=100) + self.resource1 = ResourceURI.objects.insert(uri=self.uri1, priority=100) def test_get_next_visitable_unvisited(self): - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) def test_get_next_visitable_visited_unmapped(self): @@ -355,10 +321,8 @@ def test_get_next_visitable_visited_10_days_ago_mapped(self): self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) def test_get_next_visitable_visited_10_days_ago_one_unmapped(self): @@ -368,8 +332,7 @@ def test_get_next_visitable_visited_10_days_ago_one_unmapped(self): self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource0, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource0, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) self.resource0.last_map_date = None @@ -377,8 +340,7 @@ def test_get_next_visitable_visited_10_days_ago_one_unmapped(self): self.resource0.save() self.resource1.save() - self.assertEqual( - self.resource1, ResourceURI.objects.get_next_visitable()) + self.assertEqual(self.resource1, ResourceURI.objects.get_next_visitable()) self.assertIsNone(ResourceURI.objects.get_next_visitable()) def test_get_next_visitable_recently_visited_mapped(self): @@ -393,14 +355,15 @@ def test_get_next_visitable_recently_visited_mapped(self): class 
ResourceURIManagerGetMappablesTestCase(TestCase): - def setUp(self): - self.uri1 = 'maven-index://repo1.maven.org/o/a/this.jar' - self.uri2 = 'maven-index://repo1.maven.org/o/a/thisother.jar' + self.uri1 = "maven-index://repo1.maven.org/o/a/this.jar" + self.uri2 = "maven-index://repo1.maven.org/o/a/thisother.jar" self.resource1 = ResourceURI.objects.create( - uri=self.uri1, priority=1, last_visit_date=timezone.now()) + uri=self.uri1, priority=1, last_visit_date=timezone.now() + ) self.resource2 = ResourceURI.objects.create( - uri=self.uri2, priority=2, last_visit_date=timezone.now()) + uri=self.uri2, priority=2, last_visit_date=timezone.now() + ) def test_get_mappables(self): assert self.resource1.is_mappable @@ -408,43 +371,57 @@ def test_get_mappables(self): self.assertEqual(2, ResourceURI.objects.get_mappables().count()) self.resource1.last_map_date = timezone.now() self.resource1.save() - resource1 = ResourceURI.objects.get(id=self.resource1.id) - self.assertEqual([self.resource2], list( - ResourceURI.objects.get_mappables())) + # only resource2 should be mappable now + self.assertEqual([self.resource2], list(ResourceURI.objects.get_mappables())) def test_get_mappables__map_error_must_make_a_resourceuri_non_mappable(self): assert self.resource1.is_mappable self.assertEqual(2, ResourceURI.objects.get_mappables().count()) - self.resource1.map_error = 'Some error happened' - self.resource2.map_error = 'Some error happened' + self.resource1.map_error = "Some error happened" + self.resource2.map_error = "Some error happened" self.resource1.save() self.resource2.save() - resource1 = ResourceURI.objects.get(id=self.resource1.id) self.assertEqual([], list(ResourceURI.objects.get_mappables())) class ScannableURIManagerTestCase(TestCase): def setUp(self): - self.test_uri1 = 'http://example.com' + self.test_uri1 = "http://example.com" self.test_package1 = Package.objects.create( - download_url=self.test_uri1, name='Foo', version='12.34') - self.scannable_uri1 = ScannableURI.objects.create(uri=self.test_uri1, package=self.test_package1, - scan_status=ScannableURI.SCAN_NEW) - self.test_uri2 = 'http://elpmaxe.com' + download_url=self.test_uri1, name="Foo", version="12.34" + ) + self.scannable_uri1 = ScannableURI.objects.create( + uri=self.test_uri1, + package=self.test_package1, + scan_status=ScannableURI.SCAN_NEW, + ) + self.test_uri2 = "http://elpmaxe.com" self.test_package2 = Package.objects.create( - download_url=self.test_uri2, name='Bar', version='11.75') - self.scannable_uri2 = ScannableURI.objects.create(uri=self.test_uri2, package=self.test_package2, - scan_status=ScannableURI.SCAN_SUBMITTED) - self.test_uri3 = 'http://nexb.com' + download_url=self.test_uri2, name="Bar", version="11.75" + ) + self.scannable_uri2 = ScannableURI.objects.create( + uri=self.test_uri2, + package=self.test_package2, + scan_status=ScannableURI.SCAN_SUBMITTED, + ) + self.test_uri3 = "http://nexb.com" self.test_package3 = Package.objects.create( - download_url=self.test_uri3, name='Baz', version='5') - self.scannable_uri3 = ScannableURI.objects.create(uri=self.test_uri3, package=self.test_package3, - scan_status=ScannableURI.SCAN_IN_PROGRESS) - self.test_uri4 = 'http://realsite.com' + download_url=self.test_uri3, name="Baz", version="5" + ) + self.scannable_uri3 = ScannableURI.objects.create( + uri=self.test_uri3, + package=self.test_package3, + scan_status=ScannableURI.SCAN_IN_PROGRESS, + ) + self.test_uri4 = "http://realsite.com" self.test_package4 = Package.objects.create( - download_url=self.test_uri4, name='Qux',
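Read together, the two get_mappables tests above imply its filter: a ResourceURI is mappable once visited, while a recorded last_map_date or map_error takes it out of the set. A sketch of an equivalent queryset, assuming map_error stays null until set (the real manager method may also order the results):

def get_mappables(queryset):
    # Mappable: visited, never mapped, and no mapping error recorded.
    return queryset.filter(
        is_mappable=True,
        last_visit_date__isnull=False,
        last_map_date__isnull=True,
        map_error__isnull=True,
    )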
version='87') - self.scannable_uri4 = ScannableURI.objects.create(uri=self.test_uri4, package=self.test_package4, - scan_status=ScannableURI.SCAN_COMPLETED) + download_url=self.test_uri4, name="Qux", version="87" + ) + self.scannable_uri4 = ScannableURI.objects.create( + uri=self.test_uri4, + package=self.test_package4, + scan_status=ScannableURI.SCAN_COMPLETED, + ) def test_ScannableURIManager_get_scannables(self): result = ScannableURI.objects.get_scannables() @@ -473,21 +450,22 @@ def test_ScannableURI_get_next_processable(self): class ScannableURIModelTestCase(TestCase): def setUp(self): - self.test_uri = 'http://example.com' + self.test_uri = "http://example.com" self.test_package = Package.objects.create( - download_url=self.test_uri, name='Foo', version='12.34') + download_url=self.test_uri, name="Foo", version="12.34" + ) def test_ScannableURI_create_basic_record(self): - scannable_uri = ScannableURI.objects.create( - uri=self.test_uri, package=self.test_package) + ScannableURI.objects.create(uri=self.test_uri, package=self.test_package) result = ScannableURI.objects.get(uri=self.test_uri) self.assertEqual(self.test_uri, result.uri) self.assertEqual(self.test_package, result.package) def test_ScannableURI_save(self): - test_error_message = 'error' + test_error_message = "error" scannable_uri = ScannableURI.objects.create( - uri=self.test_uri, package=self.test_package) + uri=self.test_uri, package=self.test_package + ) self.assertFalse(scannable_uri.scan_error) scannable_uri.scan_error = test_error_message scannable_uri.save() @@ -495,9 +473,8 @@ def test_ScannableURI_save(self): self.assertEqual(test_error_message, result.scan_error) def test_ScannableURI_save_set_canonical_uri(self): - scannable_uri = ScannableURI( - uri=self.test_uri, package=self.test_package) + scannable_uri = ScannableURI(uri=self.test_uri, package=self.test_package) self.assertFalse(scannable_uri.canonical) scannable_uri.save() result = ScannableURI.objects.get(uri=self.test_uri) - self.assertEqual('http://example.com/', result.canonical) + self.assertEqual("http://example.com/", result.canonical) diff --git a/minecode/tests/test_npm.py b/minecode/tests/test_npm.py deleted file mode 100644 index 5271308a..00000000 --- a/minecode/tests/test_npm.py +++ /dev/null @@ -1,221 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - - -import codecs -import json -import os -import re - -from django.test import TestCase as DjangoTestCase -from mock import Mock -from mock import patch -from packagedcode.npm import NpmPackageJsonHandler -from packageurl import PackageURL - -import packagedb -from minecode import mappers -from minecode import route -from minecode.models import ResourceURI -from minecode.utils_test import JsonBasedTesting -from minecode.utils_test import mocked_requests_get -from minecode.visitors import npm -from minecode.tests import FIXTURES_REGEN - - -class TestNPMVisit(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - # FIXME: use smaller test files - def test_NpmRegistryVisitor(self): - uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=2300000' - test_loc = self.get_test_loc('npm/replicate_doc1.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _errors = npm.NpmRegistryVisitor(uri) - # this is a non-persistent visitor, lets make sure we dont return any data - assert not data - expected_loc = self.get_test_loc('npm/expected_doclimit_visitor.json') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_NpmRegistryVisitor_OverLimit(self): - uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=2300000' - test_loc = self.get_test_loc('npm/over_limit.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = npm.NpmRegistryVisitor(uri) - expected_loc = self.get_test_loc('npm/expected_over_limit.json') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_NpmRegistryVisitor_1000records(self): - uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=77777' - test_loc = self.get_test_loc('npm/1000_records.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = npm.NpmRegistryVisitor(uri) - expected_loc = self.get_test_loc('npm/expected_1000_records.json') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - -class TestNPMMapper(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_build_packages(self): - with open(self.get_test_loc('npm/0flux.json')) as npm_metadata: - metadata = json.load(npm_metadata) - packages = mappers.npm.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/0flux_npm_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_package2(self): - with open(self.get_test_loc('npm/2112.json')) as npm_metadata: - metadata = json.load(npm_metadata) - packages = mappers.npm.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/npm_2112_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_package3(self): - with open(self.get_test_loc('npm/microdata.json')) as npm_metadata: - metadata = json.load(npm_metadata) - packages = mappers.npm.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/microdata-node_expected.json') - self.check_expected_results( - packages, expected_loc, 
regen=FIXTURES_REGEN) - - def test_build_package_with_visitor_data(self): - uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=77777' - test_loc = self.get_test_loc('npm/1000_records.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = npm.NpmRegistryVisitor(uri) - uris_list = list(uris) - assert len(uris_list) == 1001 - # Randomly pick a record from 0-1000 - metadata = uris_list[29].data - packages = mappers.npm.build_packages(json.loads(metadata)) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/29_record_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - # Randomly pick a record from 0-1000 - metadata = uris_list[554].data - packages = mappers.npm.build_packages(json.loads(metadata)) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/554_record_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_package_with_ticket_439(self): - uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=10&since=7333426' - test_loc = self.get_test_loc('npm/ticket_439.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = npm.NpmRegistryVisitor(uri) - uris_list = list(uris) - assert len(uris_list) == 11 - # Pickup the first one, since it's the one which is the problem package "angular2-autosize" - # The zero element in json is the url for next visitor use, and data is empty and the url is - metadata = uris_list[1].data - packages = mappers.npm.build_packages(json.loads(metadata)) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/expected_ticket_439.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_package_verify_ticket_440(self): - uri = 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=10&since=7632607' - test_loc = self.get_test_loc('npm/ticket_440_records.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = npm.NpmRegistryVisitor(uri) - uris_list = list(uris) - assert len(uris_list) == 11 - # Pickup the index one instead of zero, since it's the one which is the problem package "npm-research", https://registry.npmjs.org/npm-research, - # The zero element in json is the url for next visitor use only - metadata = uris_list[1].data - packages = mappers.npm.build_packages(json.loads(metadata)) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/expected_ticket_440.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_npm_mapper(self): - test_uri = 'https://registry.npmjs.org/angular-compare-validator' - router = route.Router() - router.append(test_uri, mappers.npm.NpmPackageMapper) - test_loc = self.get_test_loc('npm/mapper/index.json') - with open(test_loc, 'rb') as test_file: - test_data = test_file.read().decode('utf-8') - - test_res_uri = ResourceURI(uri=test_uri, data=test_data) - packages = mappers.npm.NpmPackageMapper(test_uri, test_res_uri) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/mapper/index.expected.json') - self.check_expected_results( - packages, expected_loc, 
regen=FIXTURES_REGEN) - - def test_build_package_for_jsonp_filter(self): - with open(self.get_test_loc('npm/jsonp-filter.json')) as npm_metadata: - metadata = json.load(npm_metadata) - packages = mappers.npm.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('npm/jsonp-filter-expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_regex_npm_mapper(self): - regex = re.compile(r'^https://registry.npmjs.org/[^\/]+$') - result = re.match( - regex, 'https://registry.npmjs.org/react-mobile-navigation-modal') - self.assertTrue(result) - - -class NpmPriorityQueueTests(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def setUp(self): - super(NpmPriorityQueueTests, self).setUp() - self.expected_json_loc = self.get_test_loc( - 'npm/lodash_package-expected.json') - with open(self.expected_json_loc) as f: - self.expected_json_contents = json.load(f) - - self.scan_package = NpmPackageJsonHandler._parse( - json_data=self.expected_json_contents, - ) - - def test_get_package_json(self, regen=FIXTURES_REGEN): - json_contents = npm.get_package_json( - namespace=self.scan_package.namespace, - name=self.scan_package.name, - version=self.scan_package.version - ) - if regen: - with open(self.expected_json_loc, 'w') as f: - json.dump(json_contents, f, indent=3, separators=(',', ':')) - self.assertEqual(self.expected_json_contents, json_contents) - - def test_map_npm_package(self): - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(0, package_count) - package_url = PackageURL.from_string(self.scan_package.purl) - npm.map_npm_package(package_url, ('test_pipeline')) - package_count = packagedb.models.Package.objects.all().count() - self.assertEqual(1, package_count) - package = packagedb.models.Package.objects.all().first() - expected_purl_str = 'pkg:npm/lodash@4.17.21' - expected_download_url = 'https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz' - self.assertEqual(expected_purl_str, package.purl) - self.assertEqual(expected_download_url, package.download_url) diff --git a/minecode/tests/test_nuget.py b/minecode/tests/test_nuget.py deleted file mode 100644 index 875a086f..00000000 --- a/minecode/tests/test_nuget.py +++ /dev/null @@ -1,112 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
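The visitor tests above, and the nuget, pypi, and rubygems tests that follow, all share one fixture pattern: patch requests.get so the code under test reads a canned local document instead of the network. Here is a minimal self-contained sketch of that pattern, assuming mocked_requests_get returns a response-like object with .status_code, .content, .text, and .json(); the real helper in minecode.utils_test reads a fixture file and may differ in shape:

    import json
    from unittest import mock

    import requests

    def make_fake_response(body: bytes):
        # Response-like stand-in for what mocked_requests_get plausibly
        # returns (an assumption; the real helper lives in minecode.utils_test).
        response = mock.Mock()
        response.status_code = 200
        response.content = body
        response.text = body.decode("utf-8")
        response.json = lambda: json.loads(body)
        return response

    fixture = b'{"_id": "0flux", "name": "0flux"}'  # inline stand-in for a registry doc

    with mock.patch("requests.get") as mock_http_get:
        mock_http_get.return_value = make_fake_response(fixture)
        # Any code under test that calls requests.get(...) now gets the fixture.
        doc = requests.get("https://registry.npmjs.org/0flux").json()
        assert doc["name"] == "0flux"

Because the patch replaces requests.get itself, the URL is never fetched; only the fixture content matters, which is what lets these tests exercise remote registries offline.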
-# - -import json -import os -import re - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode.visitors import nuget -from minecode.tests import FIXTURES_REGEN - - -class NugetVisitorsTest(JsonBasedTesting): - - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_NugetQueryVisitor(self): - uri = 'https://api-v2v3search-0.nuget.org/query' - test_loc = self.get_test_loc('nuget/query.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = nuget.NugetQueryVisitor(uri) - expected_loc = self.get_test_loc('nuget/nuget_query_expected') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_PackagesPageVisitor(self): - uri = 'https://api-v2v3search-0.nuget.org/query?skip=0' - test_loc = self.get_test_loc('nuget/query_search.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = nuget.PackagesPageVisitor(uri) - expected_loc = self.get_test_loc('nuget/nuget_page_json_expected') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_NugetAPIJsonVisitor(self): - uri = 'https://api.nuget.org/v3/registration1/entityframework/6.1.3.json' - test_loc = self.get_test_loc('nuget/entityframework.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = nuget.NugetAPIJsonVisitor(uri) - expected_loc = self.get_test_loc( - 'nuget/nuget_downlloadvisitor_json_expected') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_NugetHTMLPageVisitor(self): - uri = 'https://www.nuget.org/packages?page=1' - test_loc = self.get_test_loc('nuget/packages.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _, _ = nuget.NugetHTMLPageVisitor(uri) - expected_loc = self.get_test_loc('nuget/packages.html.expected.json') - self.check_expected_uris(uris, expected_loc, regen=FIXTURES_REGEN) - - def test_NugetHTMLPackageVisitor(self): - uri = 'https://www.nuget.org/packages/log4net' - test_loc = self.get_test_loc('nuget/log4net.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _errors = nuget.NugetHTMLPackageVisitor(uri) - self.assertTrue(b'Apache-2.0 License ' in data) - self.assertTrue(b'log4net is a tool to help the programmer' in data) - - -class TestNugetMap(JsonBasedTesting): - - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_build_packages(self): - with open(self.get_test_loc('nuget/entityframework2.json')) as nuget_metadata: - metadata = json.load(nuget_metadata) - packages = mappers.nuget.build_packages_with_json(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('nuget/nuget_mapper_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_regex_1(self): - regex = re.compile(r'^https://api.nuget.org/packages/.*\.nupkg$') - result = re.match( - regex, 'https://api.nuget.org/packages/entityframework.4.3.1.nupkg') - self.assertTrue(result) - - def test_regex_2(self): - regex = re.compile(r'^https://api.nuget.org/v3/catalog.+\.json$') - 
result = re.match( - regex, 'https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json') - self.assertTrue(result) - - def test_build_packages_from_html(self): - uri = 'https://www.nuget.org/packages/log4net' - test_loc = self.get_test_loc('nuget/log4net.html') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, _errors = nuget.NugetHTMLPackageVisitor(uri) - packages = mappers.nuget.build_packages_from_html(data, uri,) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'nuget/nuget_mapper_log4net_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_priority_queue.py b/minecode/tests/test_priority_queue.py index 2c94f02d..45cf7b72 100644 --- a/minecode/tests/test_priority_queue.py +++ b/minecode/tests/test_priority_queue.py @@ -9,9 +9,10 @@ from django.test import TestCase as DjangoTestCase -from minecode.utils_test import JsonBasedTesting -from minecode.models import PriorityResourceURI + from minecode.management.commands import priority_queue +from minecode.models import PriorityResourceURI +from minecode.utils_test import JsonBasedTesting from packagedb.models import Package @@ -20,10 +21,10 @@ def test_process_request(self): package_count = Package.objects.all().count() self.assertEqual(0, package_count) - purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0' - download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar' - purl_sources_str = f'{purl_str}?classifier=sources' - sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' + purl_str = "pkg:maven/org.apache.twill/twill-core@0.12.0" + download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar" + purl_sources_str = f"{purl_str}?classifier=sources" + sources_download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar" p = PriorityResourceURI.objects.create(uri=purl_str) priority_queue.process_request(p) @@ -32,12 +33,7 @@ def test_process_request(self): self.assertEqual(2, package_count) purls = [ - (package.purl, package.download_url) - for package in Package.objects.all() + (package.purl, package.download_url) for package in Package.objects.all() ] - self.assertIn( - (purl_str, download_url), purls - ) - self.assertIn( - (purl_sources_str, sources_download_url), purls - ) + self.assertIn((purl_str, download_url), purls) + self.assertIn((purl_sources_str, sources_download_url), purls) diff --git a/minecode/tests/test_pypi.py b/minecode/tests/test_pypi.py deleted file mode 100644 index 0ed802b4..00000000 --- a/minecode/tests/test_pypi.py +++ /dev/null @@ -1,232 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
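The regex tests just above (test_regex_1, test_regex_2, and the npm test_regex_npm_mapper earlier) pin down which URIs each mapper claims. The same patterns, copied verbatim from those tests, can be exercised standalone:

    import re

    npm_route = re.compile(r'^https://registry.npmjs.org/[^\/]+$')
    nuget_package_route = re.compile(r'^https://api.nuget.org/packages/.*\.nupkg$')
    nuget_catalog_route = re.compile(r'^https://api.nuget.org/v3/catalog.+\.json$')

    assert npm_route.match('https://registry.npmjs.org/react-mobile-navigation-modal')
    assert nuget_package_route.match(
        'https://api.nuget.org/packages/entityframework.4.3.1.nupkg')
    assert nuget_catalog_route.match(
        'https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json')
    # [^\/]+ allows exactly one path segment, so scoped two-segment names do not match:
    assert not npm_route.match('https://registry.npmjs.org/@scope/name')

Note that the dots in these patterns are unescaped and so match any character; the tests only assert positive matches, where that looseness is harmless.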
-# - - -import json -import os - -from django.test import TestCase as DjangoTestCase - -from mock import MagicMock -from mock import Mock -from mock import patch - -from packagedb.models import Package - -from minecode.utils_test import mocked_requests_get -from minecode.utils_test import JsonBasedTesting - -from minecode import mappers -from minecode import visitors -from minecode.visitors import URI -from minecode.models import ResourceURI -from minecode.route import Router -from minecode.tests import FIXTURES_REGEN -from minecode.management.commands.run_map import map_uri - - -class TestPypiVisit(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - ''' -import unittest -import xmlrpc -from mock import patch - -class TestFoo(unittest.TestCase): - """ - A simple test - """ - @patch('xmlrpc.server') - def test_first(self, mock_xmlrpc): - m = mock_xmlrpc.return_value - m.multiply.return_value = 6 - server = xmlrpc.server("http://kushaldas.in/") - res = server.multiply(2, 3) - self.assertEqual(res, 6) -''' - @patch('xmlrpc.client.ServerProxy') - def test_PypiIndexVisitor(self, mock_serverproxyclass): - package_list = ["0", - "0-._.-._.-._.-._.-._.-._.-0", - "0.0.1", - "00print_lol", - "vmnet", - "vmo", - "vmock", - "vmonere", - "VMPC", ] - instance = mock_serverproxyclass.return_value - instance.list_packages.return_value = iter(package_list) - uri = 'https://pypi.python.org/pypi/' - uris, _data, _error = visitors.pypi.PypiIndexVisitor(uri) - self.assertIsNone(_data) - - expected_loc = self.get_test_loc('pypi/pypiindexvisitor-expected.json') - self.check_expected_uris(uris, expected_loc) - - def test_PypiPackageVisitor(self): - uri = 'https://pypi.python.org/pypi/CAGE/json' - test_loc = self.get_test_loc('pypi/cage.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _error = visitors.pypi.PypiPackageVisitor(uri) - - expected_loc = self.get_test_loc('pypi/expected_uris-cage.json') - self.check_expected_uris(uris, expected_loc) - - def test_PypiPackageVisitor_2(self): - uri = 'https://pypi.python.org/pypi/boolean.py/json' - test_loc = self.get_test_loc('pypi/boolean.py.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, _data, _errors = visitors.pypi.PypiPackageVisitor(uri) - - expected_loc = self.get_test_loc('pypi/expected_uris-boolean.py.json') - self.check_expected_uris(uris, expected_loc) - - def test_PypiPackageReleaseVisitor_cage12(self): - uri = 'https://pypi.python.org/pypi/CAGE/1.1.2/json' - test_loc = self.get_test_loc('pypi/cage_1.1.2.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _error = visitors.pypi.PypiPackageReleaseVisitor(uri) - - expected_loc = self.get_test_loc('pypi/expected_uris-cage_1.1.2.json') - self.check_expected_uris(uris, expected_loc) - - expected_loc = self.get_test_loc('pypi/expected_data-cage_1.1.2.json') - self.check_expected_results(data, expected_loc) - - def test_PypiPackageReleaseVisitor_cage13(self): - uri = 'https://pypi.python.org/pypi/CAGE/1.1.3/json' - test_loc = self.get_test_loc('pypi/cage_1.1.3.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _error = visitors.pypi.PypiPackageReleaseVisitor(uri) - - expected_loc = 
self.get_test_loc('pypi/expected_uris-cage_1.1.3.json') - self.check_expected_uris(uris, expected_loc) - - expected_loc = self.get_test_loc('pypi/expected_data-cage_1.1.3.json') - self.check_expected_results(data, expected_loc) - - def test_PypiPackageReleaseVisitor_boolean(self): - uri = 'https://pypi.python.org/pypi/boolean.py/2.0.dev3/json' - test_loc = self.get_test_loc('pypi/boolean.py-2.0.dev3.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - uris, data, _error = visitors.pypi.PypiPackageReleaseVisitor(uri) - - expected_loc = self.get_test_loc( - 'pypi/expected_uris-boolean.py-2.0.dev3.json') - self.check_expected_uris(uris, expected_loc) - - expected_loc = self.get_test_loc( - 'pypi/expected_data-boolean.py-2.0.dev3.json') - self.check_expected_results(data, expected_loc) - - -class MockResourceURI(object): - - def __init__(self, uri, data): - self.uri = uri - self.data = data - self.package_url = None - - -class TestPypiMap(JsonBasedTesting, DjangoTestCase): - - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_build_packages_lxml(self): - with open(self.get_test_loc('pypi/lxml-3.2.0.json')) as pypi_meta: - metadata = json.load(pypi_meta) - packages = mappers.pypi.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('pypi/expected-lxml-3.2.0.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_packages_boolean(self): - with open(self.get_test_loc('pypi/boolean.py-2.0.dev3.json')) as pypi_meta: - metadata = json.load(pypi_meta) - packages = mappers.pypi.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'pypi/expected-boolean.py-2.0.dev3.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_packages_cage13(self): - with open(self.get_test_loc('pypi/cage_1.1.3.json')) as pypi_meta: - metadata = json.load(pypi_meta) - packages = mappers.pypi.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.3.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_packages_cage12(self): - with open(self.get_test_loc('pypi/cage_1.1.2.json')) as pypi_meta: - metadata = json.load(pypi_meta) - packages = mappers.pypi.build_packages(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.2.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_PypiPackageMapper_cage(self): - data = open(self.get_test_loc('pypi/cage_1.1.2.json')).read() - uri = 'https://pypi.python.org/pypi/CAGE/1.1.2/json' - resuri = MockResourceURI(uri, data) - packages = mappers.pypi.PypiPackageMapper(uri, resuri) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('pypi/expected-CAGE-1.1.2.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_PypiPackageMapper_lxml(self): - data = open(self.get_test_loc('pypi/lxml-3.2.0.json')).read() - uri = 'https://pypi.python.org/pypi/lxml/3.2.0/json' - resuri = MockResourceURI(uri, data) - packages = mappers.pypi.PypiPackageMapper(uri, resuri) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('pypi/expected-lxml-3.2.0.json') - self.check_expected_results( - 
packages, expected_loc, regen=FIXTURES_REGEN) - - def test_pypi_map(self): - # setup: add a mappable URI - with open(self.get_test_loc('pypi/map/3to2-1.1.1.json')) as mappable: - resuri = ResourceURI(**json.load(mappable)) - resuri.save() - - # sanity check - packages = mappers.pypi.PypiPackageMapper(resuri.uri, resuri) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc('pypi/map/expected-3to2-1.1.1.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - # build a mock router - router = Router() - router.append('https://pypi.python.org/pypi/3to2/1.1.1/json', - mappers.pypi.PypiPackageMapper) - - # sanity check - expected_mapped_package_uri = 'https://pypi.python.org/packages/8f/ab/58a363eca982c40e9ee5a7ca439e8ffc5243dde2ae660ba1ffdd4868026b/3to2-1.1.1.zip' - self.assertEqual(0, Package.objects.filter( - download_url=expected_mapped_package_uri).count()) - - # test proper - map_uri(resuri, _map_router=router) - mapped = Package.objects.filter( - download_url=expected_mapped_package_uri) - self.assertEqual(1, mapped.count()) diff --git a/minecode/tests/test_repodata.py b/minecode/tests/test_repodata.py deleted file mode 100644 index e91aa009..00000000 --- a/minecode/tests/test_repodata.py +++ /dev/null @@ -1,98 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import os - -from commoncode.testcase import FileBasedTesting - -from minecode.visitors import repodata - - -class TestRepoData(FileBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_get_pkg_infos(self): - filelists_xml = self.get_test_loc( - 'repodata_rpms/repodata/filelists.xml') - primary_xml = self.get_test_loc('repodata_rpms/repodata/primary.xml') - other_xml = self.get_test_loc('repodata_rpms/repodata/other.xml') - expected = [ - { - u'build_time': '1442515098', - u'buildhost': 'c1bk.rdu2.centos.org', - u'href': 'python-ceilometerclient-1.5.0-1.el7.src.rpm', - u'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', - u'group': 'Development/Languages', - u'end_header_range': '4876', - u'archive_size': '99648', - u'package_size': '101516', - 'epoch': '0', - u'changelogs': [ - { - u'date': '1387195200', - u'changelog': '- Update to upstream 1.0.8\n- New dependency: python-six', - u'author': 'Jakub Ruzicka 1.0.8-1' - } - ], - 'rel': '1.el7', - 'type': 'rpm', - u'files': [ - { - u'name': 'python-ceilometerclient-1.5.0.tar.gz' - }, - { - u'name': 'python-ceilometerclient.spec' - } - ], - u'description': None, - u'installed_size': '99230', - u'file_time': '1446590411', - 'arch': 'src', - 'name': 'python-ceilometerclient', - u'license': 'ASL 2.0', - u'url': 'https://github.com/openstack/python-ceilometerclient', - u'checksum': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', - u'directories': [], - u'summary': 'Python API and CLI for OpenStack Ceilometer', - u'start_header_range': '880', - u'required_rpms': [ - { - u'name': 'python-d2to1' - }, - { - u'ver': '2.5.0', - u'epoch': '0', - u'flags': 'GE', - u'name': 'python-oslo-sphinx' - }, - { - u'name': 'python-pbr' - }, - { - u'name': 'python-setuptools' - }, - { - u'name': 'python-sphinx' - }, - { - u'name': 
'python2-devel' - } - ], - u'sourcerpm': None, - 'ver': '1.5.0' - } - ] - result = repodata.get_pkg_infos(filelists_xml, primary_xml, other_xml) - self.assertEqual(expected, result) - - def test_get_url_for_tag(self): - expected = 'repodata/4c31e7e12c7aa42cf4d7d0b6ab7166fad76b5e40ea18f911e4a820cfa68d1541-filelists.xml.gz' - repomdxml_file = self.get_test_loc('repodata_rpms/repodata/repomd.xml') - output = repodata.get_url_for_tag(repomdxml_file, 'filelists') - self.assertEqual(expected, output) diff --git a/minecode/tests/test_repomd_parser.py b/minecode/tests/test_repomd_parser.py deleted file mode 100644 index 0e42d574..00000000 --- a/minecode/tests/test_repomd_parser.py +++ /dev/null @@ -1,239 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import os - -from packagedcode.rpm import EVR - -from mock import Mock -from mock import patch - -from minecode.utils_test import mocked_requests_get_for_uris -from minecode.utils_test import JsonBasedTesting - -from minecode.visitors import URI -from minecode.visitors.repodata import combine_list_of_dicts -from minecode.visitors.repodata import combine_dicts_using_pkgid -from minecode.visitors.repomd_parser import generate_rpm_objects -from minecode.visitors.repomd_parser import collect_rpm_packages_from_repomd -from minecode.tests import FIXTURES_REGEN - -# TODO: add redhat repo test! - - -class TestRepomdParser(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_combine_list_of_dicts(self): - expected = {'a': '1', 'b': '2', 'c': '3'} - output = combine_list_of_dicts([{'a': '1'}, {'b': '2'}, {'c': '3'}]) - self.assertEqual(expected, output) - - def test_generate_rpm_objects(self): - packages = [{'name': 'python-ceilometerclient', 'arch': 'src', 'ver': '1.5.0', - 'rel': '1.el7', 'href': '/python-ceilometerclient-1.5.0-1.el7.src.rpm'}] - repomdxml_url = 'http://vault.centos.org/7.1.1503/cloud/Source/openstack-liberty' - rpms = list(generate_rpm_objects(packages, repomdxml_url)) - self.assertEqual(1, len(rpms)) - rpm = rpms[0] - self.assertEqual('python-ceilometerclient', rpm.name) - self.assertEqual( - EVR(version='1.5.0', release='1.el7').to_string(), rpm.version) - - def test_collect_rpm_packages_from_repomd_cloudera(self): - uri2loc = { - 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/repomd.xml'), - 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/filelists.xml.gz'), - 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/other.xml.gz'), - 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/primary.xml.gz'), - } - - uri = 'http://archive.cloudera.com/cm5/redhat/6/x86_64/cm/5.3.2/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) - _uris, packages, _error = 
collect_rpm_packages_from_repomd(uri) - - expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_collect_rpm_packages_from_repomd_centos(self): - uri2loc = { - 'http://vault.centos.org/3.8/updates/x86_64/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/repomd.xml'), - 'http://vault.centos.org/3.8/updates/x86_64/repodata/filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/filelists.xml.gz'), - 'http://vault.centos.org/3.8/updates/x86_64/repodata/other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/other.xml.gz'), - 'http://vault.centos.org/3.8/updates/x86_64/repodata/primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/primary.xml.gz'), - } - - uri = 'http://vault.centos.org/3.8/updates/x86_64/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) - uris, packages, _error = collect_rpm_packages_from_repomd(uri) - - expected_uris = [ - URI(uri='http://vault.centos.org/3.8/updates/x86_64/RPMS/wireshark-0.99.2-EL3.1.x86_64.rpm'), - URI(uri='http://vault.centos.org/3.8/updates/x86_64/RPMS/wireshark-gnome-0.99.2-EL3.1.x86_64.rpm'), - URI(uri='http://vault.centos.org/3.8/updates/x86_64/RPMS/XFree86-100dpi-fonts-4.3.0-111.EL.x86_64.rpm') - ] - self.assertEqual(expected_uris, uris) - - expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/centos/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_collect_rpm_packages_from_repomd_cloudera_2(self): - uri2loc = { - 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/repomd.xml'), - 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/filelists.xml.gz'), - 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/primary.xml.gz'), - 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/other.xml.gz'), - } - - uri = 'http://archive.cloudera.com/cm5/redhat/5/x86_64/cm/5.2.0/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) - _uris, packages, _error = collect_rpm_packages_from_repomd(uri) - - expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/cloudera2/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_collect_rpm_packages_from_repomd_postgresql(self): - uri2loc = { - 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/repomd.xml'), - 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/d5b4a2d13632cceb2a13a42fdb2887a22c1e262e6eeeb7270a80beec453392cd-filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/d5b4a2d13632cceb2a13a42fdb2887a22c1e262e6eeeb7270a80beec453392cd-filelists.xml.gz'), - 
'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/fc8c4fa6295d68abddcf5bba71435ecf585c439b86d7e75e0ba9bf3951f914b5-other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/fc8c4fa6295d68abddcf5bba71435ecf585c439b86d7e75e0ba9bf3951f914b5-other.xml.gz'), - 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/d5cb2a54df0aa000ac2a007b1d9b0d1f2e6a924d2d97584acbe654e59aa993e8-primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/d5cb2a54df0aa000ac2a007b1d9b0d1f2e6a924d2d97584acbe654e59aa993e8-primary.xml.gz'), - } - - uri = 'http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) - uris, packages, error = collect_rpm_packages_from_repomd(uri) - self.assertEqual(None, error) - expected_uris = [ - URI(uri='http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/skytools-92-debuginfo-3.1.5-1.rhel6.x86_64.rpm'), - URI(uri='http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/repmgr92-2.0.2-4.rhel6.x86_64.rpm'), - URI(uri='http://yum.postgresql.org/9.2/redhat/rhel-6-x86_64/pgagent_92-3.2.1-1.rhel6.x86_64.rpm') - ] - - self.assertEqual(expected_uris, uris) - expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/postgresql/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_collect_rpm_packages_from_repomd_opensuse(self): - uri2loc = { - 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/repomd.xml'), - 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/09ed18eaa761fe64c863137db5c51fdb4e60fbb29d6c9b0c424e3119ba4875cd-filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/09ed18eaa761fe64c863137db5c51fdb4e60fbb29d6c9b0c424e3119ba4875cd-filelists.xml.gz'), - 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/9c100bbff252834349ca677813f333881ce9d2ca9db8091ce387156ba7a22859-other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/9c100bbff252834349ca677813f333881ce9d2ca9db8091ce387156ba7a22859-other.xml.gz'), - 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/314da4321afcff987bd3e28672e60f1a2324f2698480b84812f7ec0a1aef4041-primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/314da4321afcff987bd3e28672e60f1a2324f2698480b84812f7ec0a1aef4041-primary.xml.gz'), - } - - uri = 'http://download.opensuse.org/distribution/12.3/repo/oss/suse/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) - _uris, packages, _error = collect_rpm_packages_from_repomd(uri) - - expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/opensuse/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_collect_rpm_packages_from_repomd_pgpool(self): - uri2loc = { - 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/repomd.xml': - self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/repomd.xml'), - 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/filelists.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/filelists.xml.gz'), - 
'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/other.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/other.xml.gz'), - 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/primary.xml.gz': - self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/primary.xml.gz'), - } - - uri = 'http://pgpool.net/yum/rpms/3.4/redhat/rhel-6-x86_64/repodata/repomd.xml' - with patch('requests.get') as mock_http_get: - mock_http_get.side_effect = lambda * args, **kwargs: mocked_requests_get_for_uris( - uri2loc, *args, **kwargs) - _uris, packages, _error = collect_rpm_packages_from_repomd(uri) - - expected_loc = self.get_test_loc( - 'repodata_rpms/repomd_parser/pgpool/expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_combine_dicts_using_pkgid(self): - all_dicts = [ - {'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', - 'name': 'python-ceilometerclient'}, - {'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', 'ver': '1.5.0'}, - {'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', 'rel': '1.el7'} - ] - expected = [ - {'pkgid': '36547e200627ea25c4e3fb6f9735d58e682f8e35cd815dceed796c83628e60d5', - 'name': 'python-ceilometerclient', - 'rel': '1.el7', - 'ver': '1.5.0'} - ] - output = combine_dicts_using_pkgid(all_dicts) - self.assertEqual(expected, output) diff --git a/minecode/tests/test_route.py b/minecode/tests/test_route.py index 2a20ce7c..e719932d 100644 --- a/minecode/tests/test_route.py +++ b/minecode/tests/test_route.py @@ -15,53 +15,56 @@ class RouteTest(TestCase): - def test_rule(self): - self.assertRaises(AssertionError, Rule, '', '') + self.assertRaises(AssertionError, Rule, "", "") - class non_callable(object): + class non_callable: pass - self.assertRaises(AssertionError, Rule, 'abc', non_callable) + self.assertRaises(AssertionError, Rule, "abc", non_callable) - class RoutableClass(object): - """ A callable class can be routed.""" + class RoutableClass: + """A callable class can be routed.""" def __call__(self): pass ca = RoutableClass() - Rule('asas', ca) - Rule('asas', RoutableClass) + Rule("asas", ca) + Rule("asas", RoutableClass) def func(): pass - Rule('asas', func) + Rule("asas", func) import re - invalid_regex = '(({wewew' + + invalid_regex = "(({wewew" self.assertRaises(re.error, Rule, invalid_regex, func) def test_class_routing(self): uris = route.Router() - @uris.route('this') - class CallableClass(object): - """ A callable class can be routed.""" + @uris.route("this") + class CallableClass: + """A callable class can be routed.""" def __call__(self, uri, *args, **kwargs): return uri - self.assertEqual('this', uris.process('this')) + self.assertEqual("this", uris.process("this")) - def test_that_each_processing_of_routed_class_is_done_with_a_new_instance_that_does_not_share_state(self): + def test_that_each_processing_of_routed_class_is_done_with_a_new_instance_that_does_not_share_state( + self, + ): import time + uris = route.Router() - @uris.route('this', 'that') - class CallableClass(object): - """ A callable class can be routed.""" + @uris.route("this", "that") + class CallableClass: + """A callable class can be routed.""" def __init__(self): # some more or less unique thing for a given instance @@ -72,25 +75,23 @@ def __call__(self, uri): return self.ts # ensure that two routes with the same object are same class - thi = uris.resolve('this') - thi2 = uris.resolve('this') + thi = uris.resolve("this") + thi2 = 
uris.resolve("this") self.assertTrue(thi is thi2) - tha = uris.resolve('that') + tha = uris.resolve("that") self.assertTrue(thi is tha) # ensure that processing of routes for the same registered class # is done by different objects with different state - p1 = uris.process('this') - p2 = uris.process('this') + p1 = uris.process("this") + p2 = uris.process("this") self.assertNotEqual(p1, p2) - p3 = uris.process('this') - p4 = uris.process('that') + p3 = uris.process("this") + p4 = uris.process("that") self.assertNotEqual(p3, p4) def test_that_subclasses_are_routed_correctly_with_append_to_route(self): - - class CallableParentClass(object): - + class CallableParentClass: def __call__(self, uri): return self.myfunc() @@ -98,31 +99,27 @@ def myfunc(self): pass class CallableSubClass1(CallableParentClass): - def myfunc(self): - return 'done1' + return "done1" class CallableSubClass2(CallableParentClass): - def myfunc(self): - return 'done2' + return "done2" uris = route.Router() - uris.append('base', CallableParentClass) - uris.append('this', CallableSubClass1) - uris.append('that', CallableSubClass2) + uris.append("base", CallableParentClass) + uris.append("this", CallableSubClass1) + uris.append("that", CallableSubClass2) - self.assertEqual(None, uris.process('base')) - self.assertEqual('done1', uris.process('this')) - self.assertEqual('done2', uris.process('that')) + self.assertEqual(None, uris.process("base")) + self.assertEqual("done1", uris.process("this")) + self.assertEqual("done2", uris.process("that")) def test_that_subclasses_are_routed_correctly_with_class_decorator(self): uris = route.Router() - class CallableParentClass(object): - """ - Note: The parent class CANNOT be decorated. Only subclasses can - """ + class CallableParentClass: + """Note: The parent class CANNOT be decorated. 
Only subclasses can""" def __call__(self, uri): return self.myfunc() @@ -130,139 +127,134 @@ def __call__(self, uri): def myfunc(self): raise NotImplementedError - @uris.route('this') + @uris.route("this") class CallableSubClass1(CallableParentClass): - def myfunc(self): - return 'done1' + return "done1" - @uris.route('that') + @uris.route("that") class CallableSubClass2(CallableParentClass): - def __call__(self, uri): - return 'done3' + return "done3" - self.assertEqual('done1', uris.process('this')) - self.assertEqual('done3', uris.process('that')) + self.assertEqual("done1", uris.process("this")) + self.assertEqual("done3", uris.process("that")) def test_rule_match(self): - def func(uri): pass - r = Rule('asas', func) - self.assertTrue(r.match('asas')) - self.assertFalse(r.match('bbb')) + r = Rule("asas", func) + self.assertTrue(r.match("asas")) + self.assertFalse(r.match("bbb")) - r = Rule('.*abc', func) - self.assertTrue(r.match('abc')) - self.assertTrue(r.match('123abc')) - self.assertFalse(r.match('bbb')) - self.assertFalse(r.match('abcXYZ')) + r = Rule(".*abc", func) + self.assertTrue(r.match("abc")) + self.assertTrue(r.match("123abc")) + self.assertFalse(r.match("bbb")) + self.assertFalse(r.match("abcXYZ")) - r = Rule('https*://', func) - self.assertTrue(r.match('http://')) - self.assertTrue(r.match('https://')) + r = Rule("https*://", func) + self.assertTrue(r.match("http://")) + self.assertTrue(r.match("https://")) def test_routing_resolving_and_exceptions(self): uris = route.Router() - @uris.route(r'http://nexb\.com') + @uris.route(r"http://nexb\.com") def myroute(uri): pass - @uris.route(r'http://nexb\.com.*') + @uris.route(r"http://nexb\.com.*") def myroute2(uri): pass - self.assertRaises(route.RouteAlreadyDefined, uris.append, - r'http://nexb\.com', myroute) - self.assertRaises(route.RouteAlreadyDefined, uris.append, - r'http://nexb\.com', myroute) + self.assertRaises( + route.RouteAlreadyDefined, uris.append, r"http://nexb\.com", myroute + ) + self.assertRaises( + route.RouteAlreadyDefined, uris.append, r"http://nexb\.com", myroute + ) - self.assertRaises(route.MultipleRoutesDefined, uris.resolve, - r'http://nexb.com') - self.assertRaises(route.NoRouteAvailable, uris.resolve, 'impossible') + self.assertRaises(route.MultipleRoutesDefined, uris.resolve, r"http://nexb.com") + self.assertRaises(route.NoRouteAvailable, uris.resolve, "impossible") def test_route_resolution_and_execution(self): uris = route.Router() - @uris.route(r'http://nexb\.com') + @uris.route(r"http://nexb\.com") def myroute(uri): - return 'r1' + return "r1" - u1 = 'http://nexb.com' - self.assertEqual('r1', myroute(u1)) + u1 = "http://nexb.com" + self.assertEqual("r1", myroute(u1)) - @uris.route(r'http://dejacode\.com') + @uris.route(r"http://dejacode\.com") def myroute2(uri): - return 'r2' + return "r2" - u1 = 'http://nexb.com' + u1 = "http://nexb.com" self.assertEqual(myroute.__name__, uris.resolve(u1).__name__) # these three calls are equivalent: the uri determines what is executed - self.assertEqual('r1', myroute(u1)) - self.assertEqual('r1', myroute2(u1)) - self.assertEqual('r1', uris.process(u1)) + self.assertEqual("r1", myroute(u1)) + self.assertEqual("r1", myroute2(u1)) + self.assertEqual("r1", uris.process(u1)) - u2 = 'http://dejacode.com' + u2 = "http://dejacode.com" self.assertEqual(myroute2.__name__, uris.resolve(u2).__name__) # these three calls are equivalent: the uri determines what is executed - self.assertEqual('r2', myroute2(u2)) - self.assertEqual('r2', myroute(u2)) - self.assertEqual('r2', 
uris.process(u2)) + self.assertEqual("r2", myroute2(u2)) + self.assertEqual("r2", myroute(u2)) + self.assertEqual("r2", uris.process(u2)) def test_that_multiple_patterns_can_be_used_in_a_route_decorator(self): uris = route.Router() - @uris.route(r'http://nexb\.com', - r'http://deja\.com') + @uris.route(r"http://nexb\.com", r"http://deja\.com") def myroute(uri): - return 'r1' + return "r1" - u1 = 'http://nexb.com' - self.assertEqual('r1', myroute(u1)) - u1 = 'http://deja.com' - self.assertEqual('r1', myroute(u1)) + u1 = "http://nexb.com" + self.assertEqual("r1", myroute(u1)) + u1 = "http://deja.com" + self.assertEqual("r1", myroute(u1)) def test_translate_globs_can_be_used_instead_of_regex_patterns(self): uris = route.Router() from fnmatch import translate - @uris.route(translate('http://nexb.com/')) + @uris.route(translate("http://nexb.com/")) def myroute(uri): - return 'r1' + return "r1" - u1 = 'http://nexb.com/' - self.assertEqual('r1', myroute(u1)) + u1 = "http://nexb.com/" + self.assertEqual("r1", myroute(u1)) - @uris.route(translate('http://nexb.com/*/*/')) + @uris.route(translate("http://nexb.com/*/*/")) def myroute2(uri): - return 'r2' + return "r2" - u1 = 'http://nexb.com/somepath/otherpath/' - self.assertEqual('r2', myroute(u1)) - u1 = 'http://nexb.com/somepath/yetanotherotherpath/' - self.assertEqual('r2', myroute(u1)) + u1 = "http://nexb.com/somepath/otherpath/" + self.assertEqual("r2", myroute(u1)) + u1 = "http://nexb.com/somepath/yetanotherotherpath/" + self.assertEqual("r2", myroute(u1)) def test_is_routable(self): uris = route.Router() - @uris.route(r'http://nexb\.com', - r'http://deja\.com') + @uris.route(r"http://nexb\.com", r"http://deja\.com") def myroute(uri): pass - @uris.route(r'http://nexc\.com', - r'http://dejb\.com') + @uris.route(r"http://nexc\.com", r"http://dejb\.com") def myroute2(uri): pass - self.assertTrue(uris.is_routable('http://nexb.com')) - self.assertTrue(uris.is_routable('http://deja.com')) - self.assertTrue(uris.is_routable('http://nexc.com')) - self.assertTrue(uris.is_routable('http://dejb.com')) - self.assertFalse(uris.is_routable('https://deja.com')) + self.assertTrue(uris.is_routable("http://nexb.com")) + self.assertTrue(uris.is_routable("http://deja.com")) + self.assertTrue(uris.is_routable("http://nexc.com")) + self.assertTrue(uris.is_routable("http://dejb.com")) + self.assertFalse(uris.is_routable("https://deja.com")) diff --git a/minecode/tests/test_rsync.py b/minecode/tests/test_rsync.py index 7a3901e4..35b8e50c 100644 --- a/minecode/tests/test_rsync.py +++ b/minecode/tests/test_rsync.py @@ -8,47 +8,53 @@ # # -from unittest import skipIf import os +from unittest import skipIf -from minecode import rsync from minecode import ON_WINDOWS +from minecode import rsync from minecode.utils_test import MiningTestCase class RsyncTest(MiningTestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def test_modules(self): - inp = self.get_test_loc('rsync/rsync_modules') + inp = self.get_test_loc("rsync/rsync_modules") output = list(rsync.modules(inp)) - expected = '''apache CPAN CTAN eclipse flightgear gnualpha gnuftp - mozdev mozilla opencsw simgear sugar xemacs'''.split() + expected = """apache CPAN CTAN eclipse flightgear gnualpha gnuftp + mozdev mozilla opencsw simgear sugar xemacs""".split() self.assertEqual(expected, output) def test_entry_rsync_31(self): # $ rsync --no-motd --recursive rsync/rsync_dir/ lines = [ - 'drwxrwxr-x 4,096 2015/07/23 17:36:47 .', - 
'-rw-rw-r-- 0 2015/07/23 17:36:47 foo', - 'drwxrwxr-x 4,096 2015/07/23 17:36:47 bar', - '-rw-rw-r-- 0 2015/07/23 17:36:47 bar/this', - 'drwxrwxr-x 4,096 2015/07/23 17:36:47 bar/that', - '-rw-rw-r-- 0 2015/07/23 17:36:47 bar/that/baz', + "drwxrwxr-x 4,096 2015/07/23 17:36:47 .", + "-rw-rw-r-- 0 2015/07/23 17:36:47 foo", + "drwxrwxr-x 4,096 2015/07/23 17:36:47 bar", + "-rw-rw-r-- 0 2015/07/23 17:36:47 bar/this", + "drwxrwxr-x 4,096 2015/07/23 17:36:47 bar/that", + "-rw-rw-r-- 0 2015/07/23 17:36:47 bar/that/baz", ] expected = [ - rsync.Entry('d', 'rwxrwxr-x', 4096, - '2015-07-23T17:36:47+00:00', '.')._asdict(), - rsync.Entry('-', 'rw-rw-r--', 0, - '2015-07-23T17:36:47+00:00', 'foo')._asdict(), - rsync.Entry('d', 'rwxrwxr-x', 4096, - '2015-07-23T17:36:47+00:00', 'bar')._asdict(), - rsync.Entry('-', 'rw-rw-r--', 0, - '2015-07-23T17:36:47+00:00', 'bar/this')._asdict(), - rsync.Entry('d', 'rwxrwxr-x', 4096, - '2015-07-23T17:36:47+00:00', 'bar/that')._asdict(), - rsync.Entry('-', 'rw-rw-r--', 0, - '2015-07-23T17:36:47+00:00', 'bar/that/baz')._asdict(), + rsync.Entry( + "d", "rwxrwxr-x", 4096, "2015-07-23T17:36:47+00:00", "." + )._asdict(), + rsync.Entry( + "-", "rw-rw-r--", 0, "2015-07-23T17:36:47+00:00", "foo" + )._asdict(), + rsync.Entry( + "d", "rwxrwxr-x", 4096, "2015-07-23T17:36:47+00:00", "bar" + )._asdict(), + rsync.Entry( + "-", "rw-rw-r--", 0, "2015-07-23T17:36:47+00:00", "bar/this" + )._asdict(), + rsync.Entry( + "d", "rwxrwxr-x", 4096, "2015-07-23T17:36:47+00:00", "bar/that" + )._asdict(), + rsync.Entry( + "-", "rw-rw-r--", 0, "2015-07-23T17:36:47+00:00", "bar/that/baz" + )._asdict(), ] for test, exp in zip(lines, expected): @@ -57,20 +63,27 @@ def test_entry_rsync_31(self): def test_entry(self): lines = [ - '-rw-r--r-- 4399746 2008/11/23 16:03:57 zz/ZZUL P/ZUL.gz', - 'drwxrwxr-x 4096 2004/08/09 00:47:02 pub/sou/a/a7', - '-rwxrwxr-x 4096 2004/08/09 00:47:02 pub/#345sou/a/a7', - 'lrwxrwxrwx 19 2007/11/22 11:37:54 s/c/a/index.html', - 'crwxrwxrwx 19 2007/11/22 11:37:54 dev/pts1', + "-rw-r--r-- 4399746 2008/11/23 16:03:57 zz/ZZUL P/ZUL.gz", + "drwxrwxr-x 4096 2004/08/09 00:47:02 pub/sou/a/a7", + "-rwxrwxr-x 4096 2004/08/09 00:47:02 pub/#345sou/a/a7", + "lrwxrwxrwx 19 2007/11/22 11:37:54 s/c/a/index.html", + "crwxrwxrwx 19 2007/11/22 11:37:54 dev/pts1", ] expected = [ - rsync.Entry('-', 'rw-r--r--', 4399746, - '2008-11-23T16:03:57+00:00', 'zz/ZZUL P/ZUL.gz')._asdict(), - rsync.Entry('d', 'rwxrwxr-x', 4096, - '2004-08-09T00:47:02+00:00', 'pub/sou/a/a7')._asdict(), - rsync.Entry('-', 'rwxrwxr-x', 4096, - '2004-08-09T00:47:02+00:00', 'pub/\xe5sou/a/a7')._asdict(), + rsync.Entry( + "-", + "rw-r--r--", + 4399746, + "2008-11-23T16:03:57+00:00", + "zz/ZZUL P/ZUL.gz", + )._asdict(), + rsync.Entry( + "d", "rwxrwxr-x", 4096, "2004-08-09T00:47:02+00:00", "pub/sou/a/a7" + )._asdict(), + rsync.Entry( + "-", "rwxrwxr-x", 4096, "2004-08-09T00:47:02+00:00", "pub/\xe5sou/a/a7" + )._asdict(), None, None, ] @@ -80,105 +93,210 @@ def test_entry(self): self.assertEqual(exp, result) def test_directory(self): - test_dir = self.get_test_loc('rsync/rsync_wicket.dir') + test_dir = self.get_test_loc("rsync/rsync_wicket.dir") output = list(rsync.directory_entries(test_dir)) expected = [ - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-03-18T19:02:46+00:00', path='.'), - rsync.Entry(type='-', perm='rw-rw-r--', size=5, - date='2014-03-18T19:02:46+00:00', path='.revision'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-02-05T09:34:20+00:00', path='1.4.23'), - rsync.Entry(type='-', 
perm='rw-rw-r--', size=95314, - date='2014-02-05T09:23:44+00:00', path='1.4.23/CHANGELOG-1.4'), - rsync.Entry(type='-', perm='rw-rw-r--', size=3712820, - date='2014-02-05T09:23:44+00:00', path='1.4.23/apache-wicket-1.4.23-source.tgz'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-02-05T09:34:20+00:00', path='1.4.23/binaries'), - rsync.Entry(type='-', perm='rw-rw-r--', size=23622515, date='2014-02-05T09:23:44+00:00', - path='1.4.23/binaries/apache-wicket-1.4.23.tar.gz'), - rsync.Entry(type='-', perm='rw-rw-r--', size=32524295, - date='2014-02-05T09:23:44+00:00', path='1.4.23/binaries/apache-wicket-1.4.23.zip'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-01-27T09:09:40+00:00', path='1.5.11'), - rsync.Entry(type='-', perm='rw-rw-r--', size=115587, - date='2014-01-20T16:53:10+00:00', path='1.5.11/CHANGELOG-1.5'), - rsync.Entry(type='-', perm='rw-rw-r--', size=4116809, - date='2014-01-20T16:53:10+00:00', path='1.5.11/apache-wicket-1.5.11-source.tgz'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-01-27T09:09:39+00:00', path='1.5.11/binaries'), - rsync.Entry(type='-', perm='rw-rw-r--', size=26048500, date='2014-01-20T16:53:10+00:00', - path='1.5.11/binaries/apache-wicket-1.5.11.tar.gz'), - rsync.Entry(type='-', perm='rw-rw-r--', size=36156260, - date='2014-01-20T16:53:10+00:00', path='1.5.11/binaries/apache-wicket-1.5.11.zip'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-02-19T08:36:07+00:00', path='6.14.0'), - rsync.Entry(type='-', perm='rw-rw-r--', size=78058, - date='2014-02-14T15:51:23+00:00', path='6.14.0/CHANGELOG-6.x'), - rsync.Entry(type='-', perm='rw-rw-r--', size=4792619, - date='2014-02-14T15:51:23+00:00', path='6.14.0/apache-wicket-6.14.0.tar.gz'), - rsync.Entry(type='-', perm='rw-rw-r--', size=9038442, - date='2014-02-14T15:51:23+00:00', path='6.14.0/apache-wicket-6.14.0.zip'), - rsync.Entry(type='d', perm='rwxrwxr-x', size=4096, - date='2014-02-19T08:36:05+00:00', path='6.14.0/binaries'), - rsync.Entry(type='-', perm='rw-rw-r--', size=29851252, date='2014-02-14T15:51:23+00:00', - path='6.14.0/binaries/apache-wicket-6.14.0-bin.tar.gz'), - rsync.Entry(type='-', perm='rw-rw-r--', size=29890658, date='2014-02-14T15:51:23+00:00', - path='6.14.0/binaries/apache-wicket-6.14.0-bin.zip') + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-03-18T19:02:46+00:00", + path=".", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=5, + date="2014-03-18T19:02:46+00:00", + path=".revision", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-02-05T09:34:20+00:00", + path="1.4.23", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=95314, + date="2014-02-05T09:23:44+00:00", + path="1.4.23/CHANGELOG-1.4", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=3712820, + date="2014-02-05T09:23:44+00:00", + path="1.4.23/apache-wicket-1.4.23-source.tgz", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-02-05T09:34:20+00:00", + path="1.4.23/binaries", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=23622515, + date="2014-02-05T09:23:44+00:00", + path="1.4.23/binaries/apache-wicket-1.4.23.tar.gz", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=32524295, + date="2014-02-05T09:23:44+00:00", + path="1.4.23/binaries/apache-wicket-1.4.23.zip", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-01-27T09:09:40+00:00", + path="1.5.11", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", 
+ size=115587, + date="2014-01-20T16:53:10+00:00", + path="1.5.11/CHANGELOG-1.5", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=4116809, + date="2014-01-20T16:53:10+00:00", + path="1.5.11/apache-wicket-1.5.11-source.tgz", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-01-27T09:09:39+00:00", + path="1.5.11/binaries", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=26048500, + date="2014-01-20T16:53:10+00:00", + path="1.5.11/binaries/apache-wicket-1.5.11.tar.gz", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=36156260, + date="2014-01-20T16:53:10+00:00", + path="1.5.11/binaries/apache-wicket-1.5.11.zip", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-02-19T08:36:07+00:00", + path="6.14.0", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=78058, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/CHANGELOG-6.x", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=4792619, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/apache-wicket-6.14.0.tar.gz", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=9038442, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/apache-wicket-6.14.0.zip", + ), + rsync.Entry( + type="d", + perm="rwxrwxr-x", + size=4096, + date="2014-02-19T08:36:05+00:00", + path="6.14.0/binaries", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=29851252, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/binaries/apache-wicket-6.14.0-bin.tar.gz", + ), + rsync.Entry( + type="-", + perm="rw-rw-r--", + size=29890658, + date="2014-02-14T15:51:23+00:00", + path="6.14.0/binaries/apache-wicket-6.14.0-bin.zip", + ), ] expected = [dict(x._asdict()) for x in expected] self.assertEqual(expected, output) def test_directory_weird_file_types_are_ignored(self): self.maxDiff = None - inp = self.get_test_loc('rsync/rsync_dev.dir') + inp = self.get_test_loc("rsync/rsync_dev.dir") output = rsync.directory_entries(inp) - results = [e['path'] for e in output if e['type'] == '-'] - expected = ['dev/.udev/rules.d/root.rules'] + results = [e["path"] for e in output if e["type"] == "-"] + expected = ["dev/.udev/rules.d/root.rules"] self.assertEqual(expected, results) - @skipIf(ON_WINDOWS, 'rsync is not available on windows') + @skipIf(ON_WINDOWS, "rsync is not available on windows") def test_fetch_directory(self): self.maxDiff = None - inp = self.get_test_loc('rsync/rsync_dir') + inp = self.get_test_loc("rsync/rsync_dir") output = rsync.fetch_directory(inp) - expected = 'foo bar bar/this bar/that bar/that/baz'.split() + expected = "foo bar bar/this bar/that bar/that/baz".split() with open(output) as f: results = f.read() self.assertTrue(all(e in results for e in expected)) - @skipIf(ON_WINDOWS, 'rsync is not available on windows') + @skipIf(ON_WINDOWS, "rsync is not available on windows") def test_fetch_directory_no_recurse(self): self.maxDiff = None - inp = self.get_test_loc('rsync/rsync_dir') + inp = self.get_test_loc("rsync/rsync_dir") output = rsync.fetch_directory(inp, recurse=False) - expected = ['foo', 'bar'] + expected = ["foo", "bar"] with open(output) as f: results = f.read() self.assertTrue(all(e in results for e in expected)) - self.assertTrue('bar/this' not in results) + self.assertTrue("bar/this" not in results) def get_dirs(self, input_path): - """ - Returns only the type and path from rsync entries. 
- """ - return [(e['type'], e['path']) - for e in rsync.directory_entries(input_path) - if '.svn' not in e['path']] - - @skipIf(ON_WINDOWS, 'rsync is not available on windows') + """Return only the type and path from rsync entries.""" + return [ + (e["type"], e["path"]) + for e in rsync.directory_entries(input_path) + if ".svn" not in e["path"] + ] + + @skipIf(ON_WINDOWS, "rsync is not available on windows") def test_fetch_and_parse_directory_no_recurse(self): self.maxDiff = None - inp = self.get_test_loc('rsync/rsync_dir') + inp = self.get_test_loc("rsync/rsync_dir") output = rsync.fetch_directory(inp, recurse=False) results = self.get_dirs(output) - expected = [('d', '.'), ('-', 'foo'), ('d', 'bar')] + expected = [("d", "."), ("-", "foo"), ("d", "bar")] self.assertEqual(sorted(expected), sorted(results)) def test_directory_output_can_be_parsed_on_protocol_30_and_31(self): self.maxDiff = None - input_30 = self.get_test_loc('rsync/rsync_v3.0.9_protocol30.dir') - input_31 = self.get_test_loc('rsync/rsync_v3.1.0_protocol31.dir') + input_30 = self.get_test_loc("rsync/rsync_v3.0.9_protocol30.dir") + input_31 = self.get_test_loc("rsync/rsync_v3.1.0_protocol31.dir") self.assertEqual(self.get_dirs(input_30), self.get_dirs(input_31)) diff --git a/minecode/tests/test_rubygems.py b/minecode/tests/test_rubygems.py deleted file mode 100644 index 2126d096..00000000 --- a/minecode/tests/test_rubygems.py +++ /dev/null @@ -1,329 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-#
-
-
-import codecs
-import json
-import os
-
-from mock import Mock
-from mock import patch
-
-from commoncode.fileutils import file_name
-from django.test import TestCase as DjangoTestCase
-
-from minecode.utils_test import mocked_requests_get
-from minecode.utils_test import JsonBasedTesting
-from minecode.utils_test import model_to_dict
-
-from minecode import mappers
-from minecode import route
-from minecode.models import ResourceURI
-from minecode import visit_router
-from minecode.mappers.rubygems import build_rubygem_packages_from_api_data
-from minecode.mappers.rubygems import build_rubygem_packages_from_metadata
-from minecode.mappers.rubygems import RubyGemsApiVersionsJsonMapper
-from minecode.mappers.rubygems import RubyGemsPackageArchiveMetadataMapper
-
-from minecode.visitors.rubygems import get_gem_metadata
-from minecode.visitors.rubygems import RubyGemsApiManyVersionsVisitor
-from minecode.visitors.rubygems import RubyGemsIndexVisitor
-from minecode.visitors.rubygems import RubyGemsPackageArchiveMetadataVisitor
-from minecode.tests import FIXTURES_REGEN
-
-
-#
-# TODO: also parse Gemspec
-# ('rubygems/address_standardization.gemspec', 'rubygems/address_standardization.gemspec.json'),
-# ('rubygems/arel.gemspec', 'rubygems/arel.gemspec.json'),
-
-
-class RubyGemsVisitorTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
-
-    def test_check_gem_file_visitor_routes(self):
-        routes = [
-            'https://rubygems.org/downloads/m2r-2.1.0.gem',  # https
-            'http://rubygems.org/downloads/m2r-2.1.0.gem',  # http
-            'https://rubygems.org/downloads/O365RubyEasy-0.0.1.gem',  # upper
-        ]
-
-        for route in routes:
-            self.assertTrue(visit_router.resolve(route))
-
-    def test_RubyGemsIndexVisitor_latest(self):
-        uri = 'http://rubygems.org/specs.4.8.gz'
-        test_loc = self.get_test_loc('rubygems/index/latest_specs.4.8.gz')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _, _ = RubyGemsIndexVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'rubygems/index/latest_specs.4.8.gz.expected.json')
-        uris_list = list(uris)
-        self.assertTrue(len(uris_list) > 1000)
-        self.check_expected_uris(
-            uris_list[0:1000], expected_loc, regen=FIXTURES_REGEN)
-
-    def test_RubyGemsApiVersionVisitor(self):
-        uri = 'https://rubygems.org/api/v1/versions/0xffffff.json'
-        test_loc = self.get_test_loc('rubygems/apiv1/0xffffff.api.json')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            _, data, _ = RubyGemsApiManyVersionsVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'rubygems/apiv1/expected_0xffffff.api.json')
-        self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_RubyGemsApiVersionVisitor2(self):
-        uri = 'https://rubygems.org/api/v1/versions/a1630ty_a1630ty.json'
-        test_loc = self.get_test_loc('rubygems/apiv1/a1630ty_a1630ty.api.json')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            _, data, _ = RubyGemsApiManyVersionsVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'rubygems/apiv1/expected_a1630ty_a1630ty.api.json')
-        self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_RubyGemsApiVersionVisitor3(self):
-        uri = 'https://rubygems.org/api/v1/versions/zuck.json'
-        test_loc = self.get_test_loc('rubygems/apiv1/zuck.api.json')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            _, data, _ = RubyGemsApiManyVersionsVisitor(uri)
-        expected_loc = self.get_test_loc(
-            'rubygems/apiv1/expected_zuck.api.json')
-        self.check_expected_results(data, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_RubyGemsPackageArchiveMetadataVisitor(self):
-        uri = 'https://rubygems.org/downloads/a_okay-0.1.0.gem'
-        test_loc = self.get_test_loc('rubygems/a_okay-0.1.0.gem', copy=True)
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            _, data, _ = RubyGemsPackageArchiveMetadataVisitor(uri)
-        expected_loc = self.get_test_loc('rubygems/a_okay-0.1.0.gem.metadata')
-        with open(expected_loc) as expect_file:
-            self.assertEqual(expect_file.read(), data)
-
-
-class RubyGemsApiMapperTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
-
-    def test_build_rubygem_packages_from_api_data_1(self):
-        with open(self.get_test_loc('rubygems/apiv1/0xffffff.api.json')) as api:
-            apidata = json.load(api)
-        packages = build_rubygem_packages_from_api_data(apidata, '0xffffff')
-        packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'rubygems/apiv1/0xffffff.api.package.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_build_rubygem_packages_from_api_data_2(self):
-        with open(self.get_test_loc('rubygems/apiv1/zuck.api.json')) as api:
-            apidata = json.load(api)
-        packages = build_rubygem_packages_from_api_data(apidata, 'zuck')
-        packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'rubygems/apiv1/zuck.api.package.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_build_rubygem_packages_from_api_data_3(self):
-        with open(self.get_test_loc('rubygems/apiv1/a1630ty_a1630ty.api.json')) as api:
-            apidata = json.load(api)
-        packages = mappers.rubygems.build_rubygem_packages_from_api_data(
-            apidata, 'a1630ty_a1630ty')
-        packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'rubygems/apiv1/a1630ty_a1630ty.api.package.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_build_rubygem_packages_from_api_data_with_deps(self):
-        with open(self.get_test_loc('rubygems/apiv1/action_tracker.api.json')) as api:
-            apidata = json.load(api)
-        packages = mappers.rubygems.build_rubygem_packages_from_api_data(
-            apidata, 'action_tracker')
-        packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'rubygems/apiv1/action_tracker.api.package.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
-
-    def test_RubyGemsApiVersionsJsonMapper(self):
-        test_uri = 'https://rubygems.org/api/v1/versions/a1630ty_a1630ty.json'
-        router = route.Router()
-        router.append(test_uri, RubyGemsApiVersionsJsonMapper)
-        test_loc = self.get_test_loc('rubygems/apiv1/a1630ty_a1630ty.api.json')
-        with codecs.open(test_loc, encoding='utf-8') as ltest_file:
-            test_data = ltest_file.read()
-
-        test_res_uri = ResourceURI(uri=test_uri, data=test_data)
-        packages = RubyGemsApiVersionsJsonMapper(test_uri, test_res_uri)
-        packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'rubygems/apiv1/a1630ty_a1630ty.api.mapped.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
-
-
-class RubyGemsArchiveMapperTest(JsonBasedTesting):
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
-
-    def test_test_RubyGemsPackageArchiveMetadataMapper(self):
-        test_uri = 'https://rubygems.org/downloads/mysmallidea-address_standardization-0.4.1.gem'
-        router = route.Router()
-        router.append(test_uri, RubyGemsPackageArchiveMetadataMapper)
-        test_loc = self.get_test_loc(
-            'rubygems/mysmallidea-address_standardization-0.4.1.gem.metadata')
-        with codecs.open(test_loc, encoding='utf-8') as test_file:
-            test_data = test_file.read()
-
-        test_res_uri = ResourceURI(uri=test_uri, data=test_data)
-        packages = RubyGemsPackageArchiveMetadataMapper(test_uri, test_res_uri)
-        packages = [p.to_dict() for p in packages]
-        expected_loc = self.get_test_loc(
-            'rubygems/mysmallidea-address_standardization-0.4.1.gem.mapped.json')
-        self.check_expected_results(
-            packages, expected_loc, regen=FIXTURES_REGEN)
-
-    def check_mapped_packages(self, test_loc, expected_loc, extract=True, regen=FIXTURES_REGEN):
-
-        test_loc = self.get_test_loc(test_loc, copy=True)
-
-        if extract:
-            metadata = get_gem_metadata(test_loc)
-        else:
-            with open(test_loc) as tl:
-                metadata = tl.read()
-
-        download_url = 'https://rubygems.org/downloads/{}'.format(
-            file_name(test_loc).replace('.metadata', ''))
-        results = build_rubygem_packages_from_metadata(metadata, download_url)
-        results = [p.to_dict() for p in results]
-
-        expected_loc = self.get_test_loc(expected_loc)
-        if regen:
-            with codecs.open(expected_loc, 'wb', encoding='UTF-8') as ex:
-                json.dump(results, ex, indent=2)
-
-        with open(expected_loc) as ex:
-            expected = json.load(ex)
-
-        assert expected == results
-
-    def test_build_rubygem_packages_from_metadata_plain(self):
-        self.check_mapped_packages(
-            'rubygems/0mq-0.4.1.gem.metadata',
-            'rubygems/0mq-0.4.1.gem.package.json',
-            extract=False)
-
-    def test_build_rubygem_packages_from_metadata_0(self):
-        self.check_mapped_packages(
-            'rubygems/a_okay-0.1.0.gem',
-            'rubygems/a_okay-0.1.0.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_1(self):
-        self.check_mapped_packages(
-            'rubygems/archive-tar-minitar-0.5.2.gem',
-            'rubygems/archive-tar-minitar-0.5.2.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_2(self):
-        self.check_mapped_packages(
-            'rubygems/blankslate-3.1.3.gem',
-            'rubygems/blankslate-3.1.3.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_3(self):
-        self.check_mapped_packages(
-            'rubygems/m2r-2.1.0.gem',
-            'rubygems/m2r-2.1.0.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_4(self):
-        self.check_mapped_packages(
-            'rubygems/mysmallidea-address_standardization-0.4.1.gem',
-            'rubygems/mysmallidea-address_standardization-0.4.1.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_5(self):
-        self.check_mapped_packages(
-            'rubygems/mysmallidea-mad_mimi_mailer-0.0.9.gem',
-            'rubygems/mysmallidea-mad_mimi_mailer-0.0.9.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_6(self):
-        self.check_mapped_packages(
-            'rubygems/ng-rails-csrf-0.1.0.gem',
-            'rubygems/ng-rails-csrf-0.1.0.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_7(self):
-        self.check_mapped_packages(
-            'rubygems/small_wonder-0.1.10.gem',
-            'rubygems/small_wonder-0.1.10.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_8(self):
-        self.check_mapped_packages(
-            'rubygems/small-0.2.gem',
-            'rubygems/small-0.2.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_9(self):
-        self.check_mapped_packages(
-            'rubygems/sprockets-vendor_gems-0.1.3.gem',
-            'rubygems/sprockets-vendor_gems-0.1.3.gem.package.json')
-
-    def test_build_rubygem_packages_from_metadata_with_deps(self):
-        self.check_mapped_packages(
-            'rubygems/action_tracker-1.0.2.gem',
-            'rubygems/action_tracker-1.0.2.gem.package.json')
-
-
-class RubyEnd2EndTest(JsonBasedTesting, DjangoTestCase):
-
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
-
-    def test_visit_and_map_end2end(self):
-        from minecode.management.commands.run_visit import visit_uri
-        from minecode.management.commands.run_map import map_uri
-        import packagedb
-
-        uri = 'https://rubygems.org/downloads/sprockets-vendor_gems-0.1.3.gem'
-        test_loc = self.get_test_loc(
-            'rubygems/sprockets-vendor_gems-0.1.3.gem', copy=True)
-
-        before_uri = [p.id for p in ResourceURI.objects.all()]
-        before_pkg = [p.id for p in packagedb.models.Package.objects.all()]
-
-        resource_uri = ResourceURI.objects.insert(uri=uri)
-
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            # visit test proper: this should process all the test uris
-            visit_uri(resource_uri)
-            map_uri(resource_uri)
-
-        if before_uri:
-            visited = ResourceURI.objects.exclude(id__in=before_uri)
-        else:
-            visited = ResourceURI.objects.all()
-
-        uri_results = [model_to_dict(rec, exclude=['id']) for rec in visited]
-        expected_loc = self.get_test_loc(
-            'rubygems/sprockets-vendor_gems-0.1.3.gem.visited.json')
-        self.check_expected_results(
-            uri_results, expected_loc, regen=FIXTURES_REGEN)
-
-        if before_pkg:
-            mapped = packagedb.models.Package.objects.exclude(
-                id__in=before_pkg)
-        else:
-            mapped = packagedb.models.Package.objects.all()
-
-        package_results = [pac.to_dict() for pac in mapped]
-        expected_loc = self.get_test_loc(
-            'rubygems/sprockets-vendor_gems-0.1.3.gem.mapped.json')
-        self.check_expected_results(package_results, expected_loc, fields_to_remove=[
-            'package_sets'], regen=FIXTURES_REGEN)
diff --git a/minecode/tests/test_run_map.py b/minecode/tests/test_run_map.py
index 7bca14f1..f3ad262a 100644
--- a/minecode/tests/test_run_map.py
+++ b/minecode/tests/test_run_map.py
@@ -15,36 +15,38 @@
 from packagedcode.models import Package as ScannedPackage

+import packagedb
 from minecode.management.commands.run_map import map_uri
 from minecode.model_utils import merge_packages
 from minecode.models import ResourceURI
 from minecode.models import ScannableURI
 from minecode.route import Router
+from minecode.tests import FIXTURES_REGEN
 from minecode.utils_test import JsonBasedTesting
 from minecode.utils_test import MiningTestCase
-from minecode.tests import FIXTURES_REGEN
-import packagedb


 class RunMapTest(JsonBasedTesting, MiningTestCase):
-    BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles')
+    BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles")
     maxDiff = None

     def test_map_uri(self):
         # setup
         # build a mock mapper and register it in a router
-        uri = 'http://testdomap.com'
+        uri = "http://testdomap.com"

         def mock_mapper(uri, resource_uri):
-            return [ScannedPackage(
-                type='maven',
-                namespace='org.apache.spark',
-                name='spark-streaming_2.10',
-                version='1.2.0',
-                qualifiers=dict(extension='pom'),
-                download_url='http://testdomap.com',
-                sha1='beef'
-            )]
+            return [
+                ScannedPackage(
+                    type="maven",
+                    namespace="org.apache.spark",
+                    name="spark-streaming_2.10",
+                    version="1.2.0",
+                    qualifiers=dict(extension="pom"),
+                    download_url="http://testdomap.com",
+                    sha1="beef",
+                )
+            ]

         router = Router()
         router.append(uri, mock_mapper)
@@ -56,33 +58,37 @@ def mock_mapper(uri, resource_uri):
         resource_uri = ResourceURI.objects.insert(
             uri=uri,
             last_visit_date=timezone.now(),
-            package_url='pkg:maven/org.apache.spark/spark-streaming_2.10@1.2.0?extension=pom')
+            package_url="pkg:maven/org.apache.spark/spark-streaming_2.10@1.2.0?extension=pom",
+        )
         assert ResourceURI.objects.get(uri=uri) == resource_uri
         resource_uri.is_mappable = True
         resource_uri.save()

         # ensure that we are clear of Package before
         before = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, before.count())

         # test proper
         map_uri(resource_uri, _map_router=router)
         mapped = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(1, mapped.count())

         mapped_package = mapped.first()
         self.assertEqual(
-            'pkg:maven/org.apache.spark/spark-streaming_2.10@1.2.0?extension=pom', mapped_package.package_url)
+            "pkg:maven/org.apache.spark/spark-streaming_2.10@1.2.0?extension=pom",
+            mapped_package.package_url,
+        )

         # test history
         history = mapped_package.get_history()
         self.assertIsNotNone(history)
         self.assertEqual(1, len(history))
         entry = history[0]
-        message = entry.get('message')
-        self.assertEqual(
-            'New Package created from URI: {}'.format(uri), message)
+        message = entry.get("message")
+        self.assertEqual(f"New Package created from URI: {uri}", message)

         # check that the ResourceURI status has been updated correctly
         resource_uri = ResourceURI.objects.get(uri=uri)
@@ -90,13 +96,13 @@ def mock_mapper(uri, resource_uri):
         self.assertFalse(resource_uri.last_map_date is None)

         # check that a ScannableURI has been created
-        scannable = ScannableURI.objects.filter(uri='http://testdomap.com')
+        scannable = ScannableURI.objects.filter(uri="http://testdomap.com")
         self.assertEqual(1, scannable.count())

     def test_map_uri_continues_after_raised_exception(self):
         # setup
         # build a mock mapper and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_mapper(uri, resource_uri):
             raise Exception()
@@ -109,20 +115,23 @@ def mock_mapper(uri, resource_uri):

         # seed ResourceURI with a uri
         resource_uri = ResourceURI.objects.insert(
-            uri=uri, last_visit_date=timezone.now())
+            uri=uri, last_visit_date=timezone.now()
+        )
         assert ResourceURI.objects.get(uri=uri) == resource_uri
         resource_uri.is_mappable = True
         resource_uri.save()

         # ensure that we are clear of Package before
         before = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, before.count())

         # test proper
         map_uri(resource_uri, _map_router=router)
         mapped = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, mapped.count())

         # check that the ResourceURI status has been updated correctly
@@ -132,16 +141,16 @@ def mock_mapper(uri, resource_uri):
         self.assertTrue(resource_uri.map_error is not None)

         # check that a ScannableURI has not been created
-        scannable = ScannableURI.objects.filter(uri='http://testdomap.com')
+        scannable = ScannableURI.objects.filter(uri="http://testdomap.com")
         self.assertEqual(0, scannable.count())

     def test_map_uri_continues_if_unknown_type_in_package_iterator(self):
         # setup
         # build a mock mapper and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_mapper(uri, resource_uri):
-            return ['some string']
+            return ["some string"]

         router = Router()
         router.append(uri, mock_mapper)
@@ -151,45 +160,45 @@ def mock_mapper(uri, resource_uri):

         # seed ResourceURI with a uri
         resource_uri = ResourceURI.objects.insert(
-            uri=uri, last_visit_date=timezone.now())
+            uri=uri, last_visit_date=timezone.now()
+        )
         assert ResourceURI.objects.get(uri=uri) == resource_uri
         resource_uri.is_mappable = True
         resource_uri.save()

         # ensure that we are clear of Package before
         before = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, before.count())

         # test proper
         map_uri(resource_uri, _map_router=router)
         mapped = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, mapped.count())

         # check that the ResourceURI status has been updated correctly
         resource_uri = ResourceURI.objects.get(uri=uri)
         self.assertEqual(None, resource_uri.wip_date)
         self.assertFalse(resource_uri.last_map_date is None)
-        self.assertTrue(
-            'Not a ScanCode PackageData type' in resource_uri.map_error)
+        self.assertTrue("Not a ScanCode PackageData type" in resource_uri.map_error)

         # check that a ScannableURI has not been created
-        scannable = ScannableURI.objects.filter(uri='http://testdomap.com')
+        scannable = ScannableURI.objects.filter(uri="http://testdomap.com")
         self.assertEqual(0, scannable.count())

     def test_map_uri_continues_if_no_download_url_in_package_iterator(self):
         # setup
         # build a mock mapper and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         class MP(ScannedPackage):
             pass

         def mock_mapper(uri, resource_uri):
-            return [
-                MP(type='generic', name='foo', sha1='beef')
-            ]
+            return [MP(type="generic", name="foo", sha1="beef")]

         router = Router()
         router.append(uri, mock_mapper)
@@ -199,51 +208,50 @@ def mock_mapper(uri, resource_uri):

         # seed ResourceURI with a uri
         resource_uri = ResourceURI.objects.insert(
-            uri=uri, last_visit_date=timezone.now())
+            uri=uri, last_visit_date=timezone.now()
+        )
         assert ResourceURI.objects.get(uri=uri) == resource_uri
         resource_uri.is_mappable = True
         resource_uri.save()

         # ensure that we are clear of Package before
         before = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, before.count())

         # test proper
         map_uri(resource_uri, _map_router=router)
         mapped = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, mapped.count())

         # check that the ResourceURI status has been updated correctly
         resource_uri = ResourceURI.objects.get(uri=uri)
         self.assertEqual(None, resource_uri.wip_date)
         self.assertFalse(resource_uri.last_map_date is None)
-        self.assertTrue(
-            'No download_url for package' in resource_uri.map_error)
+        self.assertTrue("No download_url for package" in resource_uri.map_error)

         # check that a ScannableURI has not been created
-        scannable = ScannableURI.objects.filter(uri='http://testdomap.com')
+        scannable = ScannableURI.objects.filter(uri="http://testdomap.com")
         self.assertEqual(0, scannable.count())

     def test_map_uri_continues_after_raised_exception_in_package_iterator(self):
         # setup
         # build a mock mapper and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         class MP(ScannedPackage):
-
             def to_dict(self, **kwargs):
-                raise Exception('ScannedPackage issue')
+                raise Exception("ScannedPackage issue")

             def __getattribute__(self, item):
-                raise Exception('ScannedPackage issue')
+                raise Exception("ScannedPackage issue")
                 return ScannedPackage.__getattribute__(self, item)

         def mock_mapper(uri, resource_uri):
-            return [
-                MP(type='generic', name='foo', download_url=uri, sha1='beef')
-            ]
+            return [MP(type="generic", name="foo", download_url=uri, sha1="beef")]

         router = Router()
         router.append(uri, mock_mapper)
@@ -253,77 +261,80 @@ def mock_mapper(uri, resource_uri):

         # seed ResourceURI with a uri
         resource_uri = ResourceURI.objects.insert(
-            uri=uri, last_visit_date=timezone.now())
+            uri=uri, last_visit_date=timezone.now()
+        )
         assert ResourceURI.objects.get(uri=uri) == resource_uri
         resource_uri.is_mappable = True
         resource_uri.save()

         # ensure that we are clear of Package before
         before = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, before.count())

         # test proper
         map_uri(resource_uri, _map_router=router)
         mapped = packagedb.models.Package.objects.filter(
-            download_url='http://testdomap.com')
+            download_url="http://testdomap.com"
+        )
         self.assertEqual(0, mapped.count())

         # check that the ResourceURI status has been updated correctly
         resource_uri = ResourceURI.objects.get(uri=uri)
         self.assertEqual(None, resource_uri.wip_date)
         self.assertFalse(resource_uri.last_map_date is None)
-        self.assertTrue('ScannedPackage issue' in resource_uri.map_error)
-        self.assertTrue('Failed to map while' in resource_uri.map_error)
+        self.assertTrue("ScannedPackage issue" in resource_uri.map_error)
+        self.assertTrue("Failed to map while" in resource_uri.map_error)

         # check that a ScannableURI has not been created
-        scannable = ScannableURI.objects.filter(uri='http://testdomap.com')
+        scannable = ScannableURI.objects.filter(uri="http://testdomap.com")
         self.assertEqual(0, scannable.count())

     def test_map_uri_with_no_route_defined_does_not_map(self):
         # setup
         # build a mock mapper and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_mapper(uri, resource_uri):
             return [
                 ScannedPackage(
-                    uri='http://test.com',
-                    type='generic',
-                    name='testpack',
+                    uri="http://test.com",
+                    type="generic",
+                    name="testpack",
                 )
             ]

         router = Router()
-        router.append('http://nexb.com', mock_mapper)
+        router.append("http://nexb.com", mock_mapper)

         resource_uri = ResourceURI.objects.create(uri=uri)

         # test proper
         map_uri(resource_uri, _map_router=router)
         try:
-            ResourceURI.objects.get(uri='http://test.com')
-            self.fail('URI should not have been created')
+            ResourceURI.objects.get(uri="http://test.com")
+            self.fail("URI should not have been created")
         except ResourceURI.DoesNotExist:
             pass

     def test_run_map_command(self):
         output = StringIO()
-        management.call_command('run_map', exit_on_empty=True, stdout=output)
-        self.assertEqual('', output.getvalue())
+        management.call_command("run_map", exit_on_empty=True, stdout=output)
+        self.assertEqual("", output.getvalue())

     def test_map_uri_does_update_with_same_mining_level(self):
         # setup
         # build a mock mapper and register it in a router
-        download_url = 'http://testdomap2.com'
+        download_url = "http://testdomap2.com"
         new_p = ScannedPackage(
-            type='generic',
-            name='pack',
-            version='0.2',
-            description='Description Updated',
-            download_url=download_url
+            type="generic",
+            name="pack",
+            version="0.2",
+            description="Description Updated",
+            download_url=download_url,
         )
-        uri = 'http://testdomap2.com'
+        uri = "http://testdomap2.com"

         def mock_mapper(uri, resource_uri):
             return [new_p]
@@ -336,9 +347,7 @@ def mock_mapper(uri, resource_uri):

         # seed ResourceURI with a uri
         resource_uri = ResourceURI.objects.insert(
-            uri=uri,
-            last_visit_date=timezone.now(),
-            mining_level=0
+            uri=uri, last_visit_date=timezone.now(), mining_level=0
         )
         assert ResourceURI.objects.get(uri=uri) == resource_uri
         resource_uri.is_mappable = True
@@ -347,19 +356,18 @@ def mock_mapper(uri, resource_uri):
         # ensure that we have an existing Package before
         packagedb.models.Package.objects.insert(
             mining_level=0,
-            type='generic',
-            name='pack',
-            version='0.1',
-            description='Description Existing',
+            type="generic",
+            name="pack",
+            version="0.1",
+            description="Description Existing",
             download_url=download_url,
-            sha1='beef',
+            sha1="beef",
         )

         # test proper
         map_uri(resource_uri, _map_router=router)
-        mapped = packagedb.models.Package.objects.filter(
-            download_url=download_url)
+        mapped = packagedb.models.Package.objects.filter(download_url=download_url)
         self.assertEqual(1, mapped.count())

         mapped_package = mapped.first()
@@ -368,15 +376,16 @@ def mock_mapper(uri, resource_uri):
         self.assertIsNotNone(history)
         self.assertEqual(1, len(history))
         entry = history[0]
-        message = entry.get('message')
-        self.assertEqual('Package field values have been updated.', message)
-        data = entry.get('data')
-        updated_fields = data.get('updated_fields')
+        message = entry.get("message")
+        self.assertEqual("Package field values have been updated.", message)
+        data = entry.get("data")
+        updated_fields = data.get("updated_fields")
         expected_updated_fields_loc = self.get_test_loc(
-            'run_map/test_map_uri_does_update_with_same_mining_level_expected_updated_fields.json'
+            "run_map/test_map_uri_does_update_with_same_mining_level_expected_updated_fields.json"
         )
         self.check_expected_results(
-            updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN)
+            updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN
+        )

         # check that the ResourceURI status has been updated correctly
         resource_uri = ResourceURI.objects.get(uri=uri)
@@ -385,7 +394,8 @@ def mock_mapper(uri, resource_uri):

         # check that the Package has been updated correctly
         expected_loc = self.get_test_loc(
-            'run_map/test_map_uri_does_update_with_same_mining_level-expected.json')
+            "run_map/test_map_uri_does_update_with_same_mining_level-expected.json"
+        )
         result = mapped_package.to_dict()
         self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)
@@ -398,17 +408,17 @@ def mock_mapper(uri, resource_uri):
     def test_map_uri_update_only_empties_with_lesser_new_mining_level(self):
         # setup
         # build a mock mapper and register it in a router
-        download_url = 'http://testdomap3.com'
+        download_url = "http://testdomap3.com"
         new_p = ScannedPackage(
-            type='generic',
-            name='pack',
-            version='0.2',
-            description='Description Updated',
+            type="generic",
+            name="pack",
+            version="0.2",
+            description="Description Updated",
             download_url=download_url,
-            sha1='feed'
+            sha1="feed",
         )
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_mapper(uri, resource_uri):
             return [new_p]
@@ -421,9 +431,7 @@ def mock_mapper(uri, resource_uri):

         # seed ResourceURI with a uri
         resource_uri = ResourceURI.objects.insert(
-            uri=uri,
-            last_visit_date=timezone.now(),
-            mining_level=0
+            uri=uri, last_visit_date=timezone.now(), mining_level=0
         )
         assert ResourceURI.objects.get(uri=uri) == resource_uri
         resource_uri.is_mappable = True
@@ -433,18 +441,17 @@ def mock_mapper(uri, resource_uri):
         packagedb.models.Package.objects.insert(
             # NOTE: existing is 10, new is 0
             mining_level=10,
-            type='generic',
-            name='pack',
-            version='0.1',
-            description='',
+            type="generic",
+            name="pack",
+            version="0.1",
+            description="",
             download_url=download_url,
-            sha1='',
+            sha1="",
         )

         # test proper
         map_uri(resource_uri, _map_router=router)
-        mapped = packagedb.models.Package.objects.filter(
-            download_url=download_url)
+        mapped = packagedb.models.Package.objects.filter(download_url=download_url)
         self.assertEqual(1, mapped.count())

         mapped_package = mapped.first()
@@ -453,21 +460,17 @@ def mock_mapper(uri, resource_uri):
         self.assertIsNotNone(history)
         self.assertEqual(1, len(history))
         entry = history[0]
-        message = entry.get('message')
-        self.assertEqual('Package field values have been updated.', message)
-        data = entry.get('data')
-        updated_fields = data.get('updated_fields')
+        message = entry.get("message")
+        self.assertEqual("Package field values have been updated.", message)
+        data = entry.get("data")
+        updated_fields = data.get("updated_fields")
         expected_updated_fields = [
             {
-                'field': 'description',
-                'new_value': 'Description Updated',
-                'old_value': ''
+                "field": "description",
+                "new_value": "Description Updated",
+                "old_value": "",
             },
-            {
-                'field': 'sha1',
-                'new_value': 'feed',
-                'old_value': ''
-            }
+            {"field": "sha1", "new_value": "feed", "old_value": ""},
         ]
         self.assertEqual(expected_updated_fields, updated_fields)
@@ -478,7 +481,8 @@ def mock_mapper(uri, resource_uri):

         # check that the Package has been updated correctly
         expected_loc = self.get_test_loc(
-            'run_map/test_map_uri_update_only_empties_with_lesser_new_mining_level-expected.json')
+            "run_map/test_map_uri_update_only_empties_with_lesser_new_mining_level-expected.json"
+        )
         result = mapped[0].to_dict()
         self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)
@@ -491,16 +495,16 @@ def mock_mapper(uri, resource_uri):
     def test_map_uri_replace_with_new_with_higher_new_mining_level(self):
         # setup
         # build a mock mapper and register it in a router
-        download_url = 'http://testdomap4.com'
+        download_url = "http://testdomap4.com"
         new_p = ScannedPackage(
-            type='generic',
-            name='pack2',
-            version='0.2',
-            description='Description Updated',
-            download_url=download_url
+            type="generic",
+            name="pack2",
+            version="0.2",
+            description="Description Updated",
+            download_url=download_url,
         )
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_mapper(uri, resource_uri):
             return [new_p]
@@ -513,9 +517,7 @@ def mock_mapper(uri, resource_uri):

         # seed ResourceURI with a uri
         resource_uri = ResourceURI.objects.insert(
-            uri=uri,
-            last_visit_date=timezone.now(),
-            mining_level=10
+            uri=uri, last_visit_date=timezone.now(), mining_level=10
         )
         assert ResourceURI.objects.get(uri=uri) == resource_uri
         resource_uri.is_mappable = True
@@ -525,18 +527,17 @@ def mock_mapper(uri, resource_uri):
         packagedb.models.Package.objects.insert(
             # NOTE: existing is 5, new is 10
             mining_level=5,
-            name='pack',
-            version='0.1',
-            description='',
+            name="pack",
+            version="0.1",
+            description="",
             download_url=download_url,
-            type='generic',
-            sha1='beef',
+            type="generic",
+            sha1="beef",
         )

         # test proper
         map_uri(resource_uri, _map_router=router)
-        mapped = packagedb.models.Package.objects.filter(
-            download_url=download_url)
+        mapped = packagedb.models.Package.objects.filter(download_url=download_url)
         self.assertEqual(1, mapped.count())

         mapped_package = mapped.first()
@@ -545,15 +546,16 @@ def mock_mapper(uri, resource_uri):
         self.assertIsNotNone(history)
         self.assertEqual(1, len(history))
         entry = history[0]
-        message = entry.get('message')
-        self.assertEqual('Package field values have been updated.', message)
-        data = entry.get('data')
-        updated_fields = data.get('updated_fields')
+        message = entry.get("message")
+        self.assertEqual("Package field values have been updated.", message)
+        data = entry.get("data")
+        updated_fields = data.get("updated_fields")
         expected_updated_fields_loc = self.get_test_loc(
-            'run_map/test_map_uri_replace_with_new_with_higher_new_mining_level_expected_updated_fields.json'
+            "run_map/test_map_uri_replace_with_new_with_higher_new_mining_level_expected_updated_fields.json"
         )
         self.check_expected_results(
-            updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN)
+            updated_fields, expected_updated_fields_loc, regen=FIXTURES_REGEN
+        )

         # check that the ResourceURI status has been updated correctly
         resource_uri = ResourceURI.objects.get(uri=uri)
@@ -562,7 +564,8 @@ def mock_mapper(uri, resource_uri):

         # check that the Package has been updated correctly
         expected_loc = self.get_test_loc(
-            'run_map/test_map_uri_replace_with_new_with_higher_new_mining_level-expected.json')
+            "run_map/test_map_uri_replace_with_new_with_higher_new_mining_level-expected.json"
+        )
         result = mapped[0].to_dict()
         self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)
@@ -573,69 +576,71 @@ def mock_mapper(uri, resource_uri):
         self.assertEqual(0, scannable.count())

     def test_merge_packages_no_replace(self):
-        download_url = 'http://testdomap3.com'
+        download_url = "http://testdomap3.com"
         existing_package, _created = packagedb.models.Package.objects.get_or_create(
-            type='generic',
-            name='pack',
-            version='0.1',
-            description='',
+            type="generic",
+            name="pack",
+            version="0.1",
+            description="",
             download_url=download_url,
-            sha1='beef',
+            sha1="beef",
         )
         new_package_data = ScannedPackage(
-            type='generic',
-            name='pack',
-            version='0.2',
-            description='Description Updated',
-            download_url=download_url
+            type="generic",
+            name="pack",
+            version="0.2",
+            description="Description Updated",
+            download_url=download_url,
         ).to_dict()
         merge_packages(existing_package, new_package_data, replace=False)
         expected_loc = self.get_test_loc(
-            'run_map/test_merge_packages_no_replace-expected.json')
+            "run_map/test_merge_packages_no_replace-expected.json"
+        )
         result = existing_package.to_dict()
         self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)

     def test_merge_packages_with_replace(self):
-        download_url = 'http://testdomap3.com'
+        download_url = "http://testdomap3.com"
         existing_package, _created = packagedb.models.Package.objects.get_or_create(
-            type='generic',
-            name='pack',
-            version='0.1',
-            description='',
+            type="generic",
+            name="pack",
+            version="0.1",
+            description="",
             download_url=download_url,
-            sha1='beef',
+            sha1="beef",
         )
         new_package_data = ScannedPackage(
-            type='generic',
-            name='pack',
-            version='0.2',
-            description='Description Updated',
+            type="generic",
+            name="pack",
+            version="0.2",
+            description="Description Updated",
             download_url=download_url,
         ).to_dict()
         merge_packages(existing_package, new_package_data, replace=True)
         expected_loc = self.get_test_loc(
-            'run_map/test_merge_packages_with_replace-expected.json')
+            "run_map/test_merge_packages_with_replace-expected.json"
+        )
         result = existing_package.to_dict()
         self.check_expected_results(result, expected_loc, regen=FIXTURES_REGEN)

     def test_merge_packages_different_sha1(self):
-        download_url = 'http://testdomap3.com'
+        download_url = "http://testdomap3.com"
         existing_package, _created = packagedb.models.Package.objects.get_or_create(
-            type='generic',
-            name='pack',
-            version='0.1',
-            description='',
+            type="generic",
+            name="pack",
+            version="0.1",
+            description="",
             download_url=download_url,
-            sha1='beef',
+            sha1="beef",
         )
         new_package_data = ScannedPackage(
-            type='generic',
-            name='pack',
-            version='0.2',
-            description='Description Updated',
+            type="generic",
+            name="pack",
+            version="0.2",
+            description="Description Updated",
             download_url=download_url,
-            sha1='feed'
+            sha1="feed",
         ).to_dict()
         with self.assertRaises(Exception) as e:
             merge_packages(existing_package, new_package_data)
-            self.assertTrue('Mismatched sha1' in e.exception)
+            self.assertTrue("Mismatched sha1" in e.exception)
diff --git a/minecode/tests/test_run_visit.py b/minecode/tests/test_run_visit.py
index 08f0e480..ae9e6b0c 100644
--- a/minecode/tests/test_run_visit.py
+++ b/minecode/tests/test_run_visit.py
@@ -7,39 +7,55 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #

-
-from operator import itemgetter
+from collections import Counter
 from io import StringIO
-from collections import Counter

 from django.core import management
-from django.forms.models import model_to_dict

-from minecode.utils_test import MiningTestCase
 from minecode.management.commands.run_visit import visit_uri
+from minecode.miners import URI
 from minecode.models import ResourceURI
 from minecode.route import Router
-from minecode.visitors import URI
+from minecode.utils_test import MiningTestCase


 class RunVisitWithCounterTest(MiningTestCase):
-
     def test_visit_uri_with_counter_0_max_uris_3_multi_uri(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-0-max-uris-3-multi-uri1.com', package_url='pkg:npm/foobar@12.3.1'),
-                    URI(uri='http://test-counter-0-max-uris-3-multi-uri2.com',
-                        package_url='pkg:npm/foobar@12.3.2'),
-                    URI(uri='http://test-counter-0-max-uris-3-multi-uri3.com',
-                        package_url='pkg:npm/foobar@12.3.3'),
-                    URI(uri='http://test-counter-0-max-uris-3-multi-uri4.com',
-                        package_url='pkg:npm/foobar@12.3.4'),
-                    URI(uri='http://test-counter-0-max-uris-3-multi-uri5.com',
-                        package_url='pkg:npm/foobar@12.3.5'),
-                    URI(uri='http://test-counter-0-max-uris-3-multi-uri6.com', package_url='pkg:npm/foobar@12.3.5')], None, None
+            return (
+                [
+                    URI(
+                        uri="http://test-counter-0-max-uris-3-multi-uri1.com",
+                        package_url="pkg:npm/foobar@12.3.1",
+                    ),
+                    URI(
+                        uri="http://test-counter-0-max-uris-3-multi-uri2.com",
+                        package_url="pkg:npm/foobar@12.3.2",
+                    ),
+                    URI(
+                        uri="http://test-counter-0-max-uris-3-multi-uri3.com",
+                        package_url="pkg:npm/foobar@12.3.3",
+                    ),
+                    URI(
+                        uri="http://test-counter-0-max-uris-3-multi-uri4.com",
+                        package_url="pkg:npm/foobar@12.3.4",
+                    ),
+                    URI(
+                        uri="http://test-counter-0-max-uris-3-multi-uri5.com",
+                        package_url="pkg:npm/foobar@12.3.5",
+                    ),
+                    URI(
+                        uri="http://test-counter-0-max-uris-3-multi-uri6.com",
+                        package_url="pkg:npm/foobar@12.3.5",
+                    ),
+                ],
+                None,
+                None,
+            )

         router = Router()
         router.append(uri, mock_visitor)
@@ -56,46 +72,62 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 0

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=3, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=3,
+            uri_counter_by_visitor=counter,
+        )

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-3-multi-uri1.com')
+            uri="http://test-counter-0-max-uris-3-multi-uri1.com"
+        )
         self.assertEqual(1, visited.count())
-        self.assertEqual('pkg:npm/foobar@12.3.1', visited[0].package_url)
+        self.assertEqual("pkg:npm/foobar@12.3.1", visited[0].package_url)

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-3-multi-uri2.com')
+            uri="http://test-counter-0-max-uris-3-multi-uri2.com"
+        )
         self.assertEqual(1, visited.count())
-        self.assertEqual('pkg:npm/foobar@12.3.2', visited[0].package_url)
+        self.assertEqual("pkg:npm/foobar@12.3.2", visited[0].package_url)

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-3-multi-uri3.com')
+            uri="http://test-counter-0-max-uris-3-multi-uri3.com"
+        )
         self.assertEqual(1, visited.count())
-        self.assertEqual('pkg:npm/foobar@12.3.3', visited[0].package_url)
+        self.assertEqual("pkg:npm/foobar@12.3.3", visited[0].package_url)

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-3-multi-uri4.com')
+            uri="http://test-counter-0-max-uris-3-multi-uri4.com"
+        )
         self.assertEqual(1, visited.count())
-        self.assertEqual('pkg:npm/foobar@12.3.4', visited[0].package_url)
+        self.assertEqual("pkg:npm/foobar@12.3.4", visited[0].package_url)

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-3-multi-uri5.com')
+            uri="http://test-counter-0-max-uris-3-multi-uri5.com"
+        )
         self.assertEqual(0, visited.count())

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-3-multi-uri6.com')
+            uri="http://test-counter-0-max-uris-3-multi-uri6.com"
+        )
         self.assertEqual(0, visited.count())

     def test_visit_uri_with_counter_0_max_uris_1_multi_uri(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-0-max-uris-1-multi-uri1.com'),
-                    URI(uri='http://test-counter-0-max-uris-1-multi-uri2.com'),
-                    URI(uri='http://test-counter-0-max-uris-1-multi-uri3.com')], None, None
+            return (
+                [
+                    URI(uri="http://test-counter-0-max-uris-1-multi-uri1.com"),
+                    URI(uri="http://test-counter-0-max-uris-1-multi-uri2.com"),
+                    URI(uri="http://test-counter-0-max-uris-1-multi-uri3.com"),
+                ],
+                None,
+                None,
+            )

         router = Router()
         router.append(uri, mock_visitor)
@@ -112,40 +144,53 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 0

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=1, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=1,
+            uri_counter_by_visitor=counter,
+        )

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-1-multi-uri1.com')
+            uri="http://test-counter-0-max-uris-1-multi-uri1.com"
+        )
         self.assertEqual(1, visited.count())

         # MAX_URIS=1 still gives us two URIs
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-1-multi-uri2.com')
+            uri="http://test-counter-0-max-uris-1-multi-uri2.com"
+        )
         self.assertEqual(1, visited.count())

         # ... but not 3
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-0-max-uris-1-multi-uri3.com')
+            uri="http://test-counter-0-max-uris-1-multi-uri3.com"
+        )
         self.assertEqual(0, visited.count())

     def test_visit_uri_with_counter_10_max_uris_10_multi_uri(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-10-max-uris-10-multi-uri1.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri2.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri3.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri4.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri5.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri6.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri7.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri8.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri9.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri10.com'),
-                    URI(uri='http://test-counter-10-max-uris-10-multi-uri11.com')], None, None
+            return (
+                [
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri1.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri2.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri3.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri4.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri5.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri6.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri7.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri8.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri9.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri10.com"),
+                    URI(uri="http://test-counter-10-max-uris-10-multi-uri11.com"),
+                ],
+                None,
+                None,
+            )

         router = Router()
         router.append(uri, mock_visitor)
@@ -162,54 +207,75 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 1

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=1, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=1,
+            uri_counter_by_visitor=counter,
+        )

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri1.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri1.com"
+        )
         self.assertEqual(1, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri2.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri2.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri3.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri3.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri4.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri4.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri5.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri5.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri6.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri6.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri7.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri7.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri8.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri8.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri9.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri9.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri10.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri10.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10-multi-uri11.com')
+            uri="http://test-counter-10-max-uris-10-multi-uri11.com"
+        )
         self.assertEqual(0, visited.count())

     def test_visit_uri_with_counter_3_max_uris_3_multi_uri(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-3-max-uris-3-multi-uri1.com'),
-                    URI(uri='http://test-counter-3-max-uris-3-multi-uri2.com'),
-                    URI(uri='http://test-counter-3-max-uris-3-multi-uri3.com'),
-                    URI(uri='http://test-counter-3-max-uris-3-multi-uri4.com')], None, None
+            return (
+                [
+                    URI(uri="http://test-counter-3-max-uris-3-multi-uri1.com"),
+                    URI(uri="http://test-counter-3-max-uris-3-multi-uri2.com"),
+                    URI(uri="http://test-counter-3-max-uris-3-multi-uri3.com"),
+                    URI(uri="http://test-counter-3-max-uris-3-multi-uri4.com"),
+                ],
+                None,
+                None,
+            )

         router = Router()
         router.append(uri, mock_visitor)
@@ -226,31 +292,45 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 1

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=1, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=1,
+            uri_counter_by_visitor=counter,
+        )

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-3-max-uris-3-multi-uri1.com')
+            uri="http://test-counter-3-max-uris-3-multi-uri1.com"
+        )
         self.assertEqual(1, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-3-max-uris-3-multi-uri2.com')
+            uri="http://test-counter-3-max-uris-3-multi-uri2.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-3-max-uris-3-multi-uri3.com')
+            uri="http://test-counter-3-max-uris-3-multi-uri3.com"
+        )
         self.assertEqual(0, visited.count())
         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-3-max-uris-3-multi-uri3.com')
+            uri="http://test-counter-3-max-uris-3-multi-uri3.com"
+        )
         self.assertEqual(0, visited.count())

     def test_visit_uri_with_counter_1_max_uris_1_multi_uri(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-1-max-uris-1-multi-uri1.com'),
-                    URI(uri='http://test-counter-1-max-uris-1-multi-uri2.com')], None, None
+            return (
+                [
+                    URI(uri="http://test-counter-1-max-uris-1-multi-uri1.com"),
+                    URI(uri="http://test-counter-1-max-uris-1-multi-uri2.com"),
+                ],
+                None,
+                None,
+            )

         router = Router()
         router.append(uri, mock_visitor)
@@ -267,24 +347,30 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 1

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=1, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=1,
+            uri_counter_by_visitor=counter,
+        )

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-1-max-uris-1-multi-uri1.com')
+            uri="http://test-counter-1-max-uris-1-multi-uri1.com"
+        )
         self.assertEqual(1, visited.count())

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-1-max-uris-1-multi-uri2.com')
+            uri="http://test-counter-1-max-uris-1-multi-uri2.com"
+        )
         self.assertEqual(0, visited.count())

     def test_visit_uri_with_counter_10_max_uris_10(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-10-max-uris-10.com')], None, None
+            return [URI(uri="http://test-counter-10-max-uris-10.com")], None, None

         router = Router()
         router.append(uri, mock_visitor)
@@ -301,20 +387,25 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 10

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=10, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=10,
+            uri_counter_by_visitor=counter,
+        )

         visited = ResourceURI.objects.filter(
-            uri='http://test-counter-10-max-uris-10.com')
+            uri="http://test-counter-10-max-uris-10.com"
+        )
         self.assertEqual(1, visited.count())

     def test_visit_uri_with_counter_3_max_uris_3(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-3-max-uris-3.com')], None, None
+            return [URI(uri="http://test-counter-3-max-uris-3.com")], None, None

         router = Router()
         router.append(uri, mock_visitor)
@@ -331,20 +422,23 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 3

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=3, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=3,
+            uri_counter_by_visitor=counter,
+        )

-        visited = ResourceURI.objects.filter(
-            uri='http://test-counter-3-max-uris-3.com')
+        visited = ResourceURI.objects.filter(uri="http://test-counter-3-max-uris-3.com")
         self.assertEqual(1, visited.count())

     def test_visit_uri_with_counter_1_max_uris_1(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-1-max-uris-1.com')], None, None
+            return [URI(uri="http://test-counter-1-max-uris-1.com")], None, None

         router = Router()
         router.append(uri, mock_visitor)
@@ -361,20 +455,23 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 1

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=1, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=1,
+            uri_counter_by_visitor=counter,
+        )

-        visited = ResourceURI.objects.filter(
-            uri='http://test-counter-1-max-uris-1.com')
+        visited = ResourceURI.objects.filter(uri="http://test-counter-1-max-uris-1.com")
         self.assertEqual(1, visited.count())

     def test_visit_uri_with_counter_2_max_uris_1(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-2-max-uris-1.com')], None, None
+            return [URI(uri="http://test-counter-2-max-uris-1.com")], None, None

         router = Router()
         router.append(uri, mock_visitor)
@@ -391,20 +488,23 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 2

-        visit_uri(resource_uri, _visit_router=router,
-                  max_uris=1, uri_counter_by_visitor=counter)
+        visit_uri(
+            resource_uri,
+            _visit_router=router,
+            max_uris=1,
+            uri_counter_by_visitor=counter,
+        )

-        visited = ResourceURI.objects.filter(
-            uri='http://test-counter-2-max-uris-1.com')
+        visited = ResourceURI.objects.filter(uri="http://test-counter-2-max-uris-1.com")
         self.assertEqual(0, visited.count())

     def test_visit_uri_with_counter_1_no_max_uri(self):
         # setup
         # build a test visitor and register it in a router
-        uri = 'http://nexb_visit.com'
+        uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test-counter-2-max-uris-1.com')], None, None
+            return [URI(uri="http://test-counter-2-max-uris-1.com")], None, None

         router = Router()
         router.append(uri, mock_visitor)
@@ -421,28 +521,28 @@ def mock_visitor(uri):
         key = visitor.__module__ + visitor.__name__
         counter[key] += 1

-        visit_uri(
-            resource_uri, _visit_router=router, uri_counter_by_visitor=counter)
+        visit_uri(resource_uri, _visit_router=router, uri_counter_by_visitor=counter)

-        visited = ResourceURI.objects.filter(
-            uri='http://test-counter-2-max-uris-1.com')
+        visited = ResourceURI.objects.filter(uri="http://test-counter-2-max-uris-1.com")
         self.assertEqual(1, visited.count())


 class RunVisitTest(MiningTestCase):
-
     def setUp(self):
-        self.uri = 'http://nexb_visit.com'
+        self.uri = "http://nexb_visit.com"

         def mock_visitor(uri):
-            return [URI(uri='http://test.com')], None, None
+            return [URI(uri="http://test.com")], None, None

         def mock_visitor2(uri):
-            return [
-                URI(uri='http://test.com', package_url='pkg:npm/foobar@12.3.1'),
-                URI(uri='http://test.com', visited=True,
-                    data={'some': 'data'}),
-            ], None, None
+            return (
+                [
+                    URI(uri="http://test.com", package_url="pkg:npm/foobar@12.3.1"),
+                    URI(uri="http://test.com", visited=True, data={"some": "data"}),
+                ],
+                None,
+                None,
+            )

         self.router = Router()
         self.router.append(self.uri, mock_visitor)
@@ -459,36 +559,36 @@ def tearDown(self):
     def test_visit_uri(self):
         visit_uri(self.resource_uri, _visit_router=self.router)
-        visited = ResourceURI.objects.filter(uri='http://test.com')
+        visited = ResourceURI.objects.filter(uri="http://test.com")
         self.assertEqual(1, visited.count())

     def test_visit_uri_with_no_route_defined_does_not_visit(self):
-        resource_uri = ResourceURI.objects.create(
-            uri='http://undefined-route.com')
+        resource_uri = ResourceURI.objects.create(uri="http://undefined-route.com")
         resource_uri.is_visitable = True
         resource_uri.save()
         visit_uri(resource_uri, _visit_router=self.router)
         try:
-            ResourceURI.objects.get(uri='http://test.com')
-            self.fail('URI should not have been created.')
+            ResourceURI.objects.get(uri="http://test.com")
+            self.fail("URI should not have been created.")
         except ResourceURI.DoesNotExist:
             pass

     def test_run_visit_command(self):
         output = StringIO()
-        management.call_command('run_visit', exit_on_empty=True, stdout=output)
-        expected = 'Visited 0 URIs\nInserted 0 new URIs\n'
+        management.call_command("run_visit", exit_on_empty=True, stdout=output)
+        expected = "Visited 0 URIs\nInserted 0 new URIs\n"
         self.assertEqual(expected, output.getvalue())

     def test_visit_uri_always_inserts_new_uri(self):
         # test proper
         visit_uri(self.resource_uri, _visit_router=self.router2)
-        visited = ResourceURI.objects.filter(
-            uri='http://test.com').order_by('-package_url')
+        visited = ResourceURI.objects.filter(uri="http://test.com").order_by(
+            "-package_url"
+        )
         expected = [
-            URI(uri=u'http://test.com', data=u"{'some': 'data'}"),
-            URI(uri=u'http://test.com', package_url='pkg:npm/foobar@12.3.1'),
+            URI(uri="http://test.com", data="{'some': 'data'}"),
+            URI(uri="http://test.com", package_url="pkg:npm/foobar@12.3.1"),
         ]

         results = sorted(URI.from_db(ruri) for ruri in visited)
@@ -496,15 +596,13 @@ def test_visit_uri_always_inserts_new_uri(self):

     def test_visit_uri_always_inserts_new_uri_unless_there_is_pending_for_visit(self):
         # create a uri that is already pending visit
-        resource_uri2 = ResourceURI.objects.insert(uri='http://test.com')
+        resource_uri2 = ResourceURI.objects.insert(uri="http://test.com")
         resource_uri2.is_visitable = True
         resource_uri2.save()

         # test proper
         visit_uri(self.resource_uri, _visit_router=self.router)
-        visited = ResourceURI.objects.filter(uri='http://test.com')
-        expected = [
-            resource_uri2
-        ]
+        visited = ResourceURI.objects.filter(uri="http://test.com")
+        expected = [resource_uri2]
         self.assertEqual(expected, list(visited))
diff --git a/minecode/tests/test_seed.py b/minecode/tests/test_seed.py
index 017cb477..a6d896fe 100644
--- a/minecode/tests/test_seed.py
+++ b/minecode/tests/test_seed.py
@@ -8,41 +8,41 @@
 #


-from datetime import timedelta
 import os
+from datetime import timedelta
 from io import StringIO
+from unittest.mock import patch

 from django.core import management
 from django.utils import timezone
-from mock import patch

+from minecode import seed
 from minecode.management.commands.seed import SEED_PRIORITY
 from minecode.management.commands.seed import insert_seed_uris
 from minecode.models import ResourceURI
-from minecode import seed
 from minecode.utils_test import MiningTestCase


 class RevisitSeedTest(MiningTestCase):
-
     def setUp(self):
         class SampleSeed0(seed.Seeder):
             def get_seeds(self):
-                yield 'https://pypi.python.org/pypi/foo/json'
+                yield "https://pypi.python.org/pypi/foo/json"

         class SampleSeed1(seed.Seeder):
             revisit_after = 1  # hours

             def get_seeds(self):
-                yield 'https://pypi.python.org/pypi/foo/json'
+                yield "https://pypi.python.org/pypi/foo/json"

         self.SampleSeed0 = SampleSeed0()
         self.SampleSeed1 = SampleSeed1()

     def test_insert_seed_uris_revisit_before_10_days_custom_revisit_after(self):
         # we consume generators to insert seed URI
-        list(insert_seed_uris(pattern='.*python.org/pypi/.*',
-                              seeders=[self.SampleSeed1]))
+        list(
+            insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed1])
+        )

         seeded = ResourceURI.objects.all()
         self.assertEqual(1, len(seeded))
@@ -51,15 +51,17 @@ def test_insert_seed_uris_revisit_before_10_days_custom_revisit_after(self):
             s.last_visit_date = timezone.now() - timedelta(minutes=10)
             s.save()

-        list(insert_seed_uris(pattern='.*python.org/pypi/.*',
-                              seeders=[self.SampleSeed1]))
+        list(
+            insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed1])
+        )

         seeded = ResourceURI.objects.all()
         self.assertEqual(1, len(seeded))

     def test_insert_seed_uris_revisit_after_10_days_custom_revisit_after(self):
         # we consume generators to insert seed URI
-        list(insert_seed_uris(pattern='.*python.org/pypi/.*',
-                              seeders=[self.SampleSeed1]))
+        list(
+            insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed1])
+        )

         seeded = ResourceURI.objects.all()
         self.assertEqual(1, len(seeded))
@@ -68,15 +70,17 @@ def test_insert_seed_uris_revisit_after_10_days_custom_revisit_after(self):
             s.last_visit_date = timezone.now() - timedelta(days=10)
             s.save()

-        list(insert_seed_uris(pattern='.*python.org/pypi/.*',
-                              seeders=[self.SampleSeed1]))
+        list(
+            insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed1])
+        )

         seeded = ResourceURI.objects.all()
         self.assertEqual(2, len(seeded))

     def test_insert_seed_uris_revisit_before_10_days_default_revisit_after(self):
         # we consume generators to insert seed URI
-        list(insert_seed_uris(pattern='.*python.org/pypi/.*',
-                              seeders=[self.SampleSeed0]))
+        list(
+            insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed0])
+        )

         seeded = ResourceURI.objects.all()
         self.assertEqual(1, len(seeded))
@@ -85,15 +89,17 @@ def test_insert_seed_uris_revisit_before_10_days_default_revisit_after(self):
             s.last_visit_date = timezone.now() - timedelta(days=9)
             s.save()

-        list(insert_seed_uris(pattern='.*python.org/pypi/.*',
-                              seeders=[self.SampleSeed0]))
+        list(
+            insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed0])
+        )

         seeded = ResourceURI.objects.all()
         self.assertEqual(1, len(seeded))

     def test_insert_seed_uris_revisit_after_10_days_default_revisit_after(self):
         # we consume generators to insert seed URI
-        list(insert_seed_uris(pattern='.*python.org/pypi/.*',
-                              seeders=[self.SampleSeed0]))
+        list(
+            insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed0])
+        )

         seeded = ResourceURI.objects.all()
         self.assertEqual(1, len(seeded))
@@ -102,42 +108,42 @@ def test_insert_seed_uris_revisit_after_10_days_default_revisit_after(self):
             s.last_visit_date = timezone.now() - timedelta(days=10)
             s.save()

-        list(insert_seed_uris(pattern='.*python.org/pypi/.*',
-                              seeders=[self.SampleSeed0]))
+        list(
+            insert_seed_uris(pattern=".*python.org/pypi/.*", seeders=[self.SampleSeed0])
+        )

         seeded = ResourceURI.objects.all()
         self.assertEqual(2, len(seeded))


 class SeedTest(MiningTestCase):
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
+    test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles")

     def setUp(self):
-
         class SampleSeed0(seed.Seeder):
             def get_seeds(self):
-                yield 'https://pypi.python.org/pypi/thatbar/json'
-                yield 'https://pypi.python.org/pypi/that/json'
-                yield 'https://elsewehre.com'
+                yield "https://pypi.python.org/pypi/thatbar/json"
+                yield "https://pypi.python.org/pypi/that/json"
+                yield "https://elsewehre.com"

         class SampleSeed1(seed.Seeder):
             def get_seeds(self):
-                yield 'https://pypi.python.org/pypi/igloo/json'
-                yield 'https://pypi.python.org/pypi/someigloo/json'
+                yield "https://pypi.python.org/pypi/igloo/json"
+                yield "https://pypi.python.org/pypi/someigloo/json"

         class SampleSeed2(seed.Seeder):
             def get_seeds(self):
-                yield 'https://pypi.python.org/pypi/igloo2/json'
-                yield 'https://pypi.python.org/pypi/otherigloo/json'
+                yield "https://pypi.python.org/pypi/igloo2/json"
+                yield "https://pypi.python.org/pypi/otherigloo/json"

         class SampleSeed3(seed.Seeder):
             def get_seeds(self):
-                yield 'https://pypi.python.org/pypi/foo/json'
-                yield 'https://pypi.python.org/pypi/foobar/json'
+                yield "https://pypi.python.org/pypi/foo/json"
+                yield "https://pypi.python.org/pypi/foobar/json"

         class SampleSeed4(seed.Seeder):
             def get_seeds(self):
-                yield 'https://pypi.python.org/pypi/foo/json'
-                yield 'https://pypi.python.org/pypi/foobaz/json'
+                yield "https://pypi.python.org/pypi/foo/json"
+                yield "https://pypi.python.org/pypi/foobaz/json"

         self.SampleSeed0 = SampleSeed0()
         self.SampleSeed1 = SampleSeed1()
@@ -145,14 +151,14 @@ def get_seeds(self):
         self.SampleSeed3 = SampleSeed3()
         self.SampleSeed4 = SampleSeed4()

-    @patch('minecode.seed.get_active_seeders')
+    @patch("minecode.seed.get_active_seeders")
     def test_seed_command(self, mock_get_active_seeders):
         output = StringIO()
         mock_get_active_seeders.return_value = [self.SampleSeed0]

-        before = list(ResourceURI.objects.all().values_list('id'))
+        before = list(ResourceURI.objects.all().values_list("id"))

-        management.call_command('seed', pattern=None, stdout=output)
-        expected = 'Inserted 3 seed URIs\n'
+        management.call_command("seed", pattern=None, stdout=output)
+        expected = "Inserted 3 seed URIs\n"
         self.assertEqual(expected, output.getvalue())

         if before:
@@ -160,32 +166,39 @@ def test_seed_command(self, mock_get_active_seeders):
         else:
             seeded = ResourceURI.objects.all()

-        expected = sorted([
-            'https://pypi.python.org/pypi/thatbar/json',
-            'https://pypi.python.org/pypi/that/json',
-            'https://elsewehre.com',
-        ])
+        expected = sorted(
+            [
+                "https://pypi.python.org/pypi/thatbar/json",
+                "https://pypi.python.org/pypi/that/json",
+                "https://elsewehre.com",
+            ]
+        )
         self.assertEqual(expected, sorted([s.uri for s in seeded]))
         self.assertTrue(not all(s.is_visitable for s in seeded))
         self.assertEqual(3, len([s.is_visitable for s in seeded]))
         self.assertTrue(all(s.priority == SEED_PRIORITY for s in seeded))

-    @patch('minecode.seed.get_active_seeders')
-    def test_insert_seed_uris_inserts_uris_for_active_seeders_with_pattern(self, mock_get_active_seeders):
+    @patch("minecode.seed.get_active_seeders")
+    def test_insert_seed_uris_inserts_uris_for_active_seeders_with_pattern(
+        self, mock_get_active_seeders
+    ):
         mock_get_active_seeders.return_value = [self.SampleSeed1]
-        before = list(ResourceURI.objects.all().values_list('id'))
+        before = list(ResourceURI.objects.all().values_list("id"))
         seeders = seed.get_active_seeders()
-        results = sorted(insert_seed_uris(
-            pattern='.*python.*igloo.json', seeders=seeders))
+        results = sorted(
+            insert_seed_uris(pattern=".*python.*igloo.json", seeders=seeders)
+        )

         if before:
             seeded = ResourceURI.objects.exclude(uri__in=before)
         else:
             seeded = ResourceURI.objects.all()

-        expected = sorted([
-            'https://pypi.python.org/pypi/igloo/json',
-            'https://pypi.python.org/pypi/someigloo/json',
-        ])
+        expected = sorted(
+            [
+                "https://pypi.python.org/pypi/igloo/json",
+                "https://pypi.python.org/pypi/someigloo/json",
+            ]
+        )
         self.assertEqual(expected, sorted(results))

         self.assertEqual(expected, sorted([s.uri for s in seeded]))
@@ -193,7 +206,7 @@ def test_insert_seed_uris_inserts_uris_for_active_seeders_with_pattern(self, moc
         self.assertTrue(all(s.priority == SEED_PRIORITY for s in seeded))

     def test_insert_seed_uris_inserts_uris_for_active_seeders_without_pattern(self):
-        before = list(ResourceURI.objects.all().values_list('id'))
+        before = list(ResourceURI.objects.all().values_list("id"))

         results = list(insert_seed_uris(seeders=[self.SampleSeed1]))
@@ -202,10 +215,12 @@ def test_insert_seed_uris_inserts_uris_for_active_seeders_without_pattern(self):
         else:
             seeded = ResourceURI.objects.all()

-        expected = sorted([
-            'https://pypi.python.org/pypi/igloo/json',
-            'https://pypi.python.org/pypi/someigloo/json',
-        ])
+        expected = sorted(
+            [
+                "https://pypi.python.org/pypi/igloo/json",
+                "https://pypi.python.org/pypi/someigloo/json",
+            ]
+        )
         self.assertEqual(expected, sorted(results))

         self.assertEqual(expected, sorted([s.uri for s in seeded]))
@@ -214,7 +229,7 @@ def test_insert_seed_uris_inserts_uris_for_active_seeders_without_pattern(self):

     def test_insert_seed_uris_does_not_insert_duplicate(self):
         seeders = [self.SampleSeed3, self.SampleSeed4]
-        before = list(ResourceURI.objects.all().values_list('id'))
+        before = list(ResourceURI.objects.all().values_list("id"))

         # seed twice
         seed_results = sorted(insert_seed_uris(seeders=seeders))
         no_seed_results = sorted(insert_seed_uris())
@@ -224,11 +239,13 @@ def test_insert_seed_uris_does_not_insert_duplicate(self):
         else:
             seeded = ResourceURI.objects.all()

-        expected = sorted([
-            'https://pypi.python.org/pypi/foo/json',
-            'https://pypi.python.org/pypi/foobar/json',
-            'https://pypi.python.org/pypi/foobaz/json',
-        ])
+        expected = sorted(
+            [
+                "https://pypi.python.org/pypi/foo/json",
+                "https://pypi.python.org/pypi/foobar/json",
+                "https://pypi.python.org/pypi/foobaz/json",
+            ]
+        )
         self.assertEqual(expected, sorted(seed_results))
         self.assertEqual([], no_seed_results)
@@ -241,13 +258,13 @@ def test_get_active_seeders(self):
         # and needs to be updated each time we enable a new seed
         seeds = [c.__class__.__name__ for c in seed.get_active_seeders()]
         expected = [
-            'MavenSeed',
+            "MavenSeed",
         ]
         assert sorted(expected) == sorted(seeds)

     def test_get_configured_seeders(self):
         seeders = seed.get_configured_seeders()
         expected = [
-            'minecode.visitors.maven.MavenSeed',
+            "minecode.miners.maven.MavenSeed",
         ]
         assert sorted(expected) == sorted(seeders)
diff --git a/minecode/tests/test_sourceforge.py b/minecode/tests/test_sourceforge.py
deleted file mode 100644
index 0397b96f..00000000
--- a/minecode/tests/test_sourceforge.py
+++ /dev/null
@@ -1,116 +0,0 @@
-#
-# Copyright (c) nexB Inc. and others. All rights reserved.
-# purldb is a trademark of nexB Inc.
-# SPDX-License-Identifier: Apache-2.0
-# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
-# See https://github.com/aboutcode-org/purldb for support or download.
-# See https://aboutcode.org for more information about nexB OSS projects.
-#
-
-import json
-import os
-
-from mock import patch
-
-from minecode.utils_test import mocked_requests_get
-from minecode.utils_test import JsonBasedTesting
-
-from minecode import mappers
-from minecode.visitors import sourceforge
-from minecode.tests import FIXTURES_REGEN
-
-
-class SourceforgeVisitorsTest(JsonBasedTesting):
-
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
-
-    def test_visit_sf_sitemap_index_new(self):
-        uri = 'http://sourceforge.net/sitemap.xml'
-        test_loc = self.get_test_loc('sourceforge/sitemap.xml')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _data, error = sourceforge.SourceforgeSitemapIndexVisitor(
-                uri)
-
-        expected_loc = self.get_test_loc(
-            'sourceforge/expected_sf_sitemap_new.json')
-        self.check_expected_uris(uris, expected_loc)
-        self.assertIsNone(error)
-
-    def test_visit_sf_sitemap_page_new(self):
-        uri = 'http://sourceforge.net/sitemap-1.xml'
-        test_loc = self.get_test_loc('sourceforge/sitemap-1.xml')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _, error = sourceforge.SourceforgeSitemapPageVisitor(uri)
-
-        expected_loc = self.get_test_loc(
-            'sourceforge/expected_sf_sitemap_page_new.json')
-        self.check_expected_uris(uris, expected_loc)
-        self.assertIsNone(error)
-
-    def test_visit_sf_sitemap_page6(self):
-        uri = 'https://sourceforge.net/sitemap-6.xml'
-        test_loc = self.get_test_loc('sourceforge/sitemap-6.xml')
-        with patch('requests.get') as mock_http_get:
-            mock_http_get.return_value = mocked_requests_get(uri, test_loc)
-            uris, _, error = sourceforge.SourceforgeSitemapPageVisitor(uri)
-
-        expected_loc = self.get_test_loc('sourceforge/expected_sitemap-6.json')
-
self.check_expected_uris(uris, expected_loc) - self.assertIsNone(error) - - def test_visit_sf_project_json_api_new(self): - uri = 'https://sourceforge.net/api/project/name/netwiki/json' - test_loc = self.get_test_loc('sourceforge/netwiki.json') - with patch('requests.get') as mock_http_get: - mock_http_get.return_value = mocked_requests_get(uri, test_loc) - _, data, error = sourceforge.SourceforgeProjectJsonVisitor(uri) - - expected_loc = self.get_test_loc('sourceforge/expected_netwiki.json') - self.check_expected_results(data, expected_loc) - self.assertIsNone(error) - - -class SourceforgeMappersTest(JsonBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') - - def test_build_packages(self): - with open(self.get_test_loc('sourceforge/odanur.json')) as sourceforge_metadata: - metadata = json.load(sourceforge_metadata) - packages = mappers.sourceforge.build_packages_from_metafile(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'sourceforge/mapper_odanur_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_packages2(self): - with open(self.get_test_loc('sourceforge/openstunts.json')) as sourceforge_metadata: - metadata = json.load(sourceforge_metadata) - packages = mappers.sourceforge.build_packages_from_metafile(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'sourceforge/mapper_openstunts_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_packages3(self): - with open(self.get_test_loc('sourceforge/monoql.json')) as sourceforge_metadata: - metadata = json.load(sourceforge_metadata) - packages = mappers.sourceforge.build_packages_from_metafile(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'sourceforge/mapper_omonoql_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) - - def test_build_packages4(self): - with open(self.get_test_loc('sourceforge/niftyphp.json')) as sourceforge_metadata: - metadata = json.load(sourceforge_metadata) - packages = mappers.sourceforge.build_packages_from_metafile(metadata) - packages = [p.to_dict() for p in packages] - expected_loc = self.get_test_loc( - 'sourceforge/mapper_niftyphp_expected.json') - self.check_expected_results( - packages, expected_loc, regen=FIXTURES_REGEN) diff --git a/minecode/tests/test_tasks.py b/minecode/tests/test_tasks.py index c020d9ef..3ef43943 100644 --- a/minecode/tests/test_tasks.py +++ b/minecode/tests/test_tasks.py @@ -9,39 +9,38 @@ import json import os +from unittest import mock from django.test import TestCase -from unittest import mock +from minecode import tasks from minecode.models import ScannableURI -from packagedb.models import Package from minecode.utils_test import JsonBasedTesting -from minecode import tasks +from packagedb.models import Package class MinecodeTasksTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package1 = Package.objects.create( - download_url='https://test-url.com/package1.tar.gz', - type='type1', - name='name1', - version='1.0', + download_url="https://test-url.com/package1.tar.gz", + type="type1", + name="name1", + version="1.0", ) self.scannable_uri1 = ScannableURI.objects.create( - 
uri='https://test-url.com/package1.tar.gz', - package=self.package1 + uri="https://test-url.com/package1.tar.gz", package=self.package1 ) self.project_extra_data1 = { - 'md5': 'md5', - 'sha1': 'sha1', - 'sha256': 'sha256', - 'sha512': 'sha512', - 'size': 100, + "md5": "md5", + "sha1": "sha1", + "sha256": "sha256", + "sha512": "sha512", + "size": 100, } - @mock.patch('os.remove') + @mock.patch("os.remove") def test_minecode_tasks_process_scan_results(self, mock_delete): mock_delete.side_effect = [None, None] @@ -53,9 +52,10 @@ def test_minecode_tasks_process_scan_results(self, mock_delete): self.assertFalse(self.package1.declared_license_expression) self.assertFalse(self.package1.copyright) self.assertEqual(0, self.package1.resources.count()) - scan_file_location = self.get_test_loc('scancodeio/get_scan_data.json') + scan_file_location = self.get_test_loc("scancodeio/get_scan_data.json") summary_file_location = self.get_test_loc( - 'scancodeio/scan_summary_response.json') + "scancodeio/scan_summary_response.json" + ) tasks.process_scan_results( self.scannable_uri1.uuid, scan_results_location=scan_file_location, @@ -63,23 +63,24 @@ def test_minecode_tasks_process_scan_results(self, mock_delete): project_extra_data=self.project_extra_data1, ) self.package1.refresh_from_db() - self.assertEqual('md5', self.package1.md5) - self.assertEqual('sha1', self.package1.sha1) - self.assertEqual('sha256', self.package1.sha256) - self.assertEqual('sha512', self.package1.sha512) + self.assertEqual("md5", self.package1.md5) + self.assertEqual("sha1", self.package1.sha1) + self.assertEqual("sha256", self.package1.sha256) + self.assertEqual("sha512", self.package1.sha512) self.assertEqual(100, self.package1.size) + self.assertEqual("apache-2.0", self.package1.declared_license_expression) self.assertEqual( - 'apache-2.0', self.package1.declared_license_expression) - self.assertEqual( - 'Copyright (c) Apache Software Foundation', self.package1.copyright) + "Copyright (c) Apache Software Foundation", self.package1.copyright + ) self.assertFalse(self.scannable_uri1.scan_error) self.assertEqual(64, self.package1.resources.count()) def test_minecode_tasks_process_scan_results_scannableuri_does_not_exist(self): - nonexisting_uuid = '420db78a-625f-4622-b1a0-93d1ea853194' - scan_file_location = self.get_test_loc('scancodeio/get_scan_data.json') + nonexisting_uuid = "420db78a-625f-4622-b1a0-93d1ea853194" + scan_file_location = self.get_test_loc("scancodeio/get_scan_data.json") summary_file_location = self.get_test_loc( - 'scancodeio/scan_summary_response.json') + "scancodeio/scan_summary_response.json" + ) project_extra_data = json.dumps(self.project_extra_data1) with self.assertRaises(Exception) as context: @@ -89,5 +90,5 @@ def test_minecode_tasks_process_scan_results_scannableuri_does_not_exist(self): scan_summary_location=summary_file_location, project_extra_data=project_extra_data, ) - expected_message = f'ScannableURI {nonexisting_uuid} does not exist!' + expected_message = f"ScannableURI {nonexisting_uuid} does not exist!" 
self.assertIn(expected_message, str(context.exception)) diff --git a/minecode/tests/test_utils.py b/minecode/tests/test_utils.py index 4f23d11f..912cd173 100644 --- a/minecode/tests/test_utils.py +++ b/minecode/tests/test_utils.py @@ -19,57 +19,55 @@ class UtilsTest(JsonBasedTesting, DjangoTestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def test_stringify_null_purl_fields_with_missing_purl_fields(self): - common_data = { - 'type': None - } + common_data = {"type": None} utils.stringify_null_purl_fields(common_data) self.assertEqual(1, len(common_data)) - self.assertEqual('', common_data['type']) + self.assertEqual("", common_data["type"]) def test_stringify_null_purl_fields(self): common_data = { - 'type': None, - 'namespace': None, - 'name': None, - 'version': None, - 'qualifiers': None, - 'subpath': None + "type": None, + "namespace": None, + "name": None, + "version": None, + "qualifiers": None, + "subpath": None, } utils.stringify_null_purl_fields(common_data) for d in common_data: self.assertIsNotNone(common_data[d]) - self.assertEqual('', common_data[d]) + self.assertEqual("", common_data[d]) def test_set_purl(self): common_data = dict( - type='generic', - name='openssl', - description='The OpenSSL Project is a collaborative effort.', + type="generic", + name="openssl", + description="The OpenSSL Project is a collaborative effort.", ) package = scan_models.Package(**common_data) - package.set_purl('pkg:generic/openssl@1.0.2o') + package.set_purl("pkg:generic/openssl@1.0.2o") self.assertEqual(None, package.namespace) - self.assertEqual('generic', package.type) - self.assertEqual('openssl', package.name) - self.assertEqual('1.0.2o', package.version) + self.assertEqual("generic", package.type) + self.assertEqual("openssl", package.name) + self.assertEqual("1.0.2o", package.version) self.assertEqual({}, package.qualifiers) self.assertEqual(None, package.subpath) def test_is_int(self): self.assertTrue(utils.is_int(0)) - self.assertFalse(utils.is_int('a')) + self.assertFalse(utils.is_int("a")) def test_validate_uuid(self): - invalid_uuid1 = 'invalid' - invalid_uuid2 = '123e4567-e89b-12d3-a456-42665544000G' - valid_uuid = 'c2cf7ef0-d3be-4011-bda7-8eb4a196eef2' + invalid_uuid1 = "invalid" + invalid_uuid2 = "123e4567-e89b-12d3-a456-42665544000G" + valid_uuid = "c2cf7ef0-d3be-4011-bda7-8eb4a196eef2" for uuid, expected_result in [ [invalid_uuid1, False], diff --git a/minecode/tests/test_version.py b/minecode/tests/test_version.py index 8f278e59..e3eaee63 100644 --- a/minecode/tests/test_version.py +++ b/minecode/tests/test_version.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf8 -*- # # Copyright (c) nexB Inc. and others. All rights reserved. # purldb is a trademark of nexB Inc. 
@@ -17,203 +16,202 @@ class VersionHintTestCase(unittest.TestCase): - def version_tester(self, versions, ignore_pre_releases=False): """Test versions mapping of (path, expected)""" for path in versions: self.assertEqual( versions[path], - version_hint(path, ignore_pre_releases=ignore_pre_releases) + version_hint(path, ignore_pre_releases=ignore_pre_releases), ) def test_version_hint_base(self): versions = { - '/xmlgraphics/fop/source/fop-1.0-src.zip': '1.0', - '/xml/xindice/xml-xindice-1.2m1-src.zip': '1.2m1', - '/xmlgraphics/fop/binaries/fop-0.94-bin-jdk1.3.tar.gz': '0.94', - '/xmlgraphics/batik/batik-src-1.7beta1.zip': '1.7beta1', - '/xmlgraphics/batik/batik-1.7-jre13.zip': '1.7', - '/xmlbeans/source/xmlbeans-2.3.0-src.tgz': '2.3.0', - '/xml/xindice/source/xml-xindice-1.2m1-src.tar.gz': '1.2m1', - '/xml/xerces-p/binaries/XML-Xerces-2.3.0-4-win32.zip': '2.3.0-4', - '/xml/xerces-p/source/XML-Xerces-2.3.0-3.tar.gz': '2.3.0-3', - '/xml/xalan-j/source/xalan-j_2_7_0-src-2jars.tar.gz': '2_7_0', - '/xml/security/java-library/xml-security-src-1_0_5D2.zip': '1_0_5D2', - '/xml/commons/binaries/xml-commons-external-1.4.01-bin.zip': '1.4.01', - '/xml/commons/xml-commons-1.0.b2.zip': '1.0.b2', - '/xml/cocoon/3.0/cocoon-all-3.0.0-alpha-1-dist.tar.gz': '3.0.0-alpha-1', - '/xerces/j/source/Xerces-J-tools.2.10.0-xml-schema-1.1-beta.tar.gz': '2.10.0', - '/xerces/c/3/binaries/xerces-c-3.1.1-x86_64-solaris-cc-5.10.tar.gz': '3.1.1', - '/xerces/c/3/binaries/xerces-c-3.1.1-x86_64-windows-vc-8.0.zip': '3.1.1', - '/xerces/c/2/binaries/xerces-c_2_8_0-x86-windows-vc_7_1.zip': '2_8_0', - '/ws/woden/1.0M8/apache-woden-src-1.0M8.tar.gz': '1.0M8', - '/ws/scout/0_7rc1/source/scout-0.7rc1-src.zip': '0.7rc1', - '/ws/juddi/3_0/juddi-portal-bundle-3.0.0.rc1.zip': '3.0.0.rc1', - '/ws/juddi/3_0/juddi-portal-bundle-3.0.0.beta.zip': '3.0.0.beta', - '/ws/juddi/2_0RC7/juddi-tomcat-2.0rc7.zip': '2.0rc7', - '/ws/axis2/tools/1_4_1/axis2-wsdl2code-maven-plugin-1.4.1.jar': '1.4.1', - '/ws/axis/1_4/axis-src-1_4.zip': '1_4', - '/tuscany/java/sca/2.0-M5/apache-tuscany-sca-all-2.0-M5-src.tar.gz': '2.0-M5', - '/ws/axis-c/source/win32/axis-c-1.6b-Win32-trace-src.zip': '1.6b', - '/turbine/turbine-2.3.3-rc1/source/turbine-2.3.3-RC1-src.zip': '2.3.3-RC1', - '/tomcat/tomcat-connectors/jk/binaries/win64/jk-1.2.30/ia64/symbols-1.2.30.zip': '1.2.30', - '/tomcat/tomcat-7/v7.0.0-beta/bin/apache-tomcat-7.0.0-windows-i64.zip': '7.0.0', - '/tomcat/tomcat-4/v4.1.40/bin/apache-tomcat-4.1.40-LE-jdk14.exe': '4.1.40', - '/tapestry/tapestry-src-5.1.0.5.tar.gz': '5.1.0.5', - '/spamassassin/source/Mail-SpamAssassin-rules-3.3.0.r901671.tgz': '3.3.0.r901671', - '/spamassassin/Mail-SpamAssassin-rules-3.3.1.r923257.tgz': '3.3.1.r923257', - '/shindig/1.1-BETA5-incubating/shindig-1.1-BETA5-incubating-source.zip': '1.1-BETA5', - '/servicemix/nmr/1.0.0-m3/apache-servicemix-nmr-1.0.0-m3-src.tar.gz': '1.0.0-m3', - '/qpid/0.6/qpid-dotnet-0-10-0.6.zip': '0.6', - '/openjpa/2.0.0-beta/apache-openjpa-2.0.0-beta-binary.zip': '2.0.0-beta', - '/myfaces/source/portlet-bridge-2.0.0-alpha-2-src-all.tar.gz': '2.0.0-alpha-2', - '/myfaces/source/myfaces-extval20-2.0.3-src.tar.gz': '2.0.3', - '/geronimo/eclipse/updates/plugins/org.apache.geronimo.st.v21.ui_2.1.1.jar': '2.1.1', - '/directory/studio/update/1.x/plugins/org.apache.directory.studio.aciitemeditor_1.5.2.v20091211.jar': '1.5.2.v20091211', - '/db/torque/torque-3.3/source/torque-gen-3.3-RC3-src.zip': '3.3-RC3', - '/cayenne/cayenne-3.0B1.tar.gz': '3.0B1', - '/cayenne/cayenne-3.0M4-macosx.dmg': '3.0M4', - 
'/xmlgraphics/batik/batik-docs-current.zip': 'current', - '/xmlgraphics/batik/batik-docs-previous.zip': 'previous', - '/poi/dev/bin/poi-bin-3.7-beta1-20100620.zip': '3.7-beta1-20100620', - '/excalibur/avalon-logkit/source/excalibur-logkit-2.0.dev-0-src.zip': '2.0.dev-0', - '/db/derby/db-derby-10.4.2.0/derby_core_plugin_10.4.2.zip': '10.4.2', - '/httpd/modpython/win/2.7.1/mp152dll.zip': '2.7.1', - '/perl/mod_perl-1.31/apaci/mod_perl.config.sh': '1.31', - '/xml/xerces-j/old_xerces2/Xerces-J-bin.2.0.0.alpha.zip': '2.0.0.alpha', - '/xml/xerces-p/archives/XML-Xerces-1.7.0_0.tar.gz': '1.7.0_0', - '/httpd/docs/tools-2004-05-04.zip': '2004-05-04', - '/ws/axis2/c/M0_5/axis2c-src-M0.5.tar.gz': 'M0.5', - '/jakarta/poi/dev/src/jakarta-poi-1.8.0-dev-src.zip': '1.8.0-dev', - '/tapestry/tapestry-4.0-beta-8.zip': '4.0-beta-8', - '/openejb/3.0-beta-1/openejb-3.0-beta-1.zip': '3.0-beta-1', - '/tapestry/tapestry-4.0-rc-1.zip': '4.0-rc-1', - '/jakarta/tapestry/source/3.0-rc-3/Tapestry-3.0-rc-3-src.zip': '3.0-rc-3', - '/jakarta/lucene/binaries/lucene-1.3-final.tar.gz': '1.3-final', - '/jakarta/tapestry/binaries/3.0-beta-1a/Tapestry-3.0-beta-1a-bin.zip': '3.0-beta-1a', - '/poi/release/bin/poi-bin-3.0-FINAL-20070503.tar.gz': '3.0-FINAL-20070503', - '/harmony/milestones/M4/apache-harmony-hdk-r603534-linux-x86-32-libstdc++v6-snapshot.tar.gz': 'r603534', - '/ant/antidote/antidote-20050330.tar.bz2': '20050330', - '/apr/not-released/apr_20020725223645.tar.gz': '20020725223645', - '/ibatis/source/ibatis.net/src-revision-709676.zip': 'revision-709676', - '/ws/axis-c/source/win32/axis-c-src-1-2-win32.zip': '1-2', - '/jakarta/slide/most-recent-2.0rc1-binaries/jakarta-slide 2.0rc1 jakarta-tomcat-4.1.30.zip': '2.0rc1', - '/httpd/modpython/win/3.0.1/python2.2.1-apache2.0.43.zip': '2.2.1', - '/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.1.0.cr1_20090319213629.jar': '2.1.0.cr1_20090319213629', - '/jakarta/poi/dev/bin/poi-2.0-pre1-20030517.jar': '2.0-pre1-20030517', - '/jakarta/poi/release/bin/jakarta-poi-1.5.0-FINAL-bin.zip': '1.5.0-FINAL', - '/jakarta/poi/release/bin/poi-bin-2.0-final-20040126.zip': '2.0-final-20040126', - '/activemq/apache-activemq/5.0.0/apache-activemq-5.0.0-sources.jar': '5.0.0', - '/turbine/turbine-2.2/source/jakarta-turbine-2.2-B1.tar.gz': '2.2-B1', - '/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.0.0.cr1.jar': '2.0.0.cr1', - '/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.0.0.final_20090108225011.jar': '2.0.0.final_20090108225011', - '/ws/axis/1_2RC3/axis-src-1_2RC3.zip': '1_2RC3', - '/commons/lang/old/v1.0-b1.1/commons-lang-1.0-b1.1.zip': '1.0-b1.1', - '/commons/net/binaries/commons-net-1.2.0-release.tar.gz': '1.2.0-release', - '/ant/ivyde/2.0.0.final/apache-ivyde-2.0.0.final-200907011148-RELEASE.tgz': '2.0.0.final-200907011148-RELEASE', - '/geronimo/eclipse/updates/plugins/org.apache.geronimo.jetty.j2ee.server.v11_1.0.0.jar': 'v11_1.0.0', - '/jakarta/cactus/binaries/jakarta-cactus-13-1.7.1-fixed.zip': '1.7.1-fixed', - '/jakarta/jakarta-turbine-maven/maven/jars/maven-1.0-b5-dev.20020731.085427.jar': '1.0-b5-dev.20020731.085427', - '/xml/xalan-j/source/xalan-j_2_5_D1-src.tar.gz': '2_5_D1', - '/ws/woden/IBuilds/I20051002_1145/woden-I20051002_1145.tar.bz2': 'I20051002_1145', - '/commons/beanutils/source/commons-beanutils-1.8.0-BETA-src.tar.gz': '1.8.0-BETA', - '/cocoon/BINARIES/cocoon-2.0.3-vm14-bin.tar.gz': '2.0.3-vm14', - '/felix/xliff_filters_v1_2_7_unix.jar': 'v1_2_7', - '/excalibur/releases/200702/excalibur-javadoc-r508111-15022007.tar.gz': 'r508111-15022007', - 
'/geronimo/eclipse/updates/features/org.apache.geronimo.v20.feature_2.0.0.jar': 'v20.feature_2.0.0', - '/geronimo/2.1.6/axis2-jaxws-1.3-G20090406.jar': '1.3-G20090406', - '/cassandra/debian/pool/main/c/cassandra/cassandra_0.4.0~beta1-1.diff.gz': '0.4.0~beta1', - '/ha-api-3.1.6.jar': '3.1.6', - 'ha-api-3.1.6.jar': '3.1.6', - 'fryPOS_20070919.exe': '20070919', + "/xmlgraphics/fop/source/fop-1.0-src.zip": "1.0", + "/xml/xindice/xml-xindice-1.2m1-src.zip": "1.2m1", + "/xmlgraphics/fop/binaries/fop-0.94-bin-jdk1.3.tar.gz": "0.94", + "/xmlgraphics/batik/batik-src-1.7beta1.zip": "1.7beta1", + "/xmlgraphics/batik/batik-1.7-jre13.zip": "1.7", + "/xmlbeans/source/xmlbeans-2.3.0-src.tgz": "2.3.0", + "/xml/xindice/source/xml-xindice-1.2m1-src.tar.gz": "1.2m1", + "/xml/xerces-p/binaries/XML-Xerces-2.3.0-4-win32.zip": "2.3.0-4", + "/xml/xerces-p/source/XML-Xerces-2.3.0-3.tar.gz": "2.3.0-3", + "/xml/xalan-j/source/xalan-j_2_7_0-src-2jars.tar.gz": "2_7_0", + "/xml/security/java-library/xml-security-src-1_0_5D2.zip": "1_0_5D2", + "/xml/commons/binaries/xml-commons-external-1.4.01-bin.zip": "1.4.01", + "/xml/commons/xml-commons-1.0.b2.zip": "1.0.b2", + "/xml/cocoon/3.0/cocoon-all-3.0.0-alpha-1-dist.tar.gz": "3.0.0-alpha-1", + "/xerces/j/source/Xerces-J-tools.2.10.0-xml-schema-1.1-beta.tar.gz": "2.10.0", + "/xerces/c/3/binaries/xerces-c-3.1.1-x86_64-solaris-cc-5.10.tar.gz": "3.1.1", + "/xerces/c/3/binaries/xerces-c-3.1.1-x86_64-windows-vc-8.0.zip": "3.1.1", + "/xerces/c/2/binaries/xerces-c_2_8_0-x86-windows-vc_7_1.zip": "2_8_0", + "/ws/woden/1.0M8/apache-woden-src-1.0M8.tar.gz": "1.0M8", + "/ws/scout/0_7rc1/source/scout-0.7rc1-src.zip": "0.7rc1", + "/ws/juddi/3_0/juddi-portal-bundle-3.0.0.rc1.zip": "3.0.0.rc1", + "/ws/juddi/3_0/juddi-portal-bundle-3.0.0.beta.zip": "3.0.0.beta", + "/ws/juddi/2_0RC7/juddi-tomcat-2.0rc7.zip": "2.0rc7", + "/ws/axis2/tools/1_4_1/axis2-wsdl2code-maven-plugin-1.4.1.jar": "1.4.1", + "/ws/axis/1_4/axis-src-1_4.zip": "1_4", + "/tuscany/java/sca/2.0-M5/apache-tuscany-sca-all-2.0-M5-src.tar.gz": "2.0-M5", + "/ws/axis-c/source/win32/axis-c-1.6b-Win32-trace-src.zip": "1.6b", + "/turbine/turbine-2.3.3-rc1/source/turbine-2.3.3-RC1-src.zip": "2.3.3-RC1", + "/tomcat/tomcat-connectors/jk/binaries/win64/jk-1.2.30/ia64/symbols-1.2.30.zip": "1.2.30", + "/tomcat/tomcat-7/v7.0.0-beta/bin/apache-tomcat-7.0.0-windows-i64.zip": "7.0.0", + "/tomcat/tomcat-4/v4.1.40/bin/apache-tomcat-4.1.40-LE-jdk14.exe": "4.1.40", + "/tapestry/tapestry-src-5.1.0.5.tar.gz": "5.1.0.5", + "/spamassassin/source/Mail-SpamAssassin-rules-3.3.0.r901671.tgz": "3.3.0.r901671", + "/spamassassin/Mail-SpamAssassin-rules-3.3.1.r923257.tgz": "3.3.1.r923257", + "/shindig/1.1-BETA5-incubating/shindig-1.1-BETA5-incubating-source.zip": "1.1-BETA5", + "/servicemix/nmr/1.0.0-m3/apache-servicemix-nmr-1.0.0-m3-src.tar.gz": "1.0.0-m3", + "/qpid/0.6/qpid-dotnet-0-10-0.6.zip": "0.6", + "/openjpa/2.0.0-beta/apache-openjpa-2.0.0-beta-binary.zip": "2.0.0-beta", + "/myfaces/source/portlet-bridge-2.0.0-alpha-2-src-all.tar.gz": "2.0.0-alpha-2", + "/myfaces/source/myfaces-extval20-2.0.3-src.tar.gz": "2.0.3", + "/geronimo/eclipse/updates/plugins/org.apache.geronimo.st.v21.ui_2.1.1.jar": "2.1.1", + "/directory/studio/update/1.x/plugins/org.apache.directory.studio.aciitemeditor_1.5.2.v20091211.jar": "1.5.2.v20091211", + "/db/torque/torque-3.3/source/torque-gen-3.3-RC3-src.zip": "3.3-RC3", + "/cayenne/cayenne-3.0B1.tar.gz": "3.0B1", + "/cayenne/cayenne-3.0M4-macosx.dmg": "3.0M4", + "/xmlgraphics/batik/batik-docs-current.zip": "current", + 
"/xmlgraphics/batik/batik-docs-previous.zip": "previous", + "/poi/dev/bin/poi-bin-3.7-beta1-20100620.zip": "3.7-beta1-20100620", + "/excalibur/avalon-logkit/source/excalibur-logkit-2.0.dev-0-src.zip": "2.0.dev-0", + "/db/derby/db-derby-10.4.2.0/derby_core_plugin_10.4.2.zip": "10.4.2", + "/httpd/modpython/win/2.7.1/mp152dll.zip": "2.7.1", + "/perl/mod_perl-1.31/apaci/mod_perl.config.sh": "1.31", + "/xml/xerces-j/old_xerces2/Xerces-J-bin.2.0.0.alpha.zip": "2.0.0.alpha", + "/xml/xerces-p/archives/XML-Xerces-1.7.0_0.tar.gz": "1.7.0_0", + "/httpd/docs/tools-2004-05-04.zip": "2004-05-04", + "/ws/axis2/c/M0_5/axis2c-src-M0.5.tar.gz": "M0.5", + "/jakarta/poi/dev/src/jakarta-poi-1.8.0-dev-src.zip": "1.8.0-dev", + "/tapestry/tapestry-4.0-beta-8.zip": "4.0-beta-8", + "/openejb/3.0-beta-1/openejb-3.0-beta-1.zip": "3.0-beta-1", + "/tapestry/tapestry-4.0-rc-1.zip": "4.0-rc-1", + "/jakarta/tapestry/source/3.0-rc-3/Tapestry-3.0-rc-3-src.zip": "3.0-rc-3", + "/jakarta/lucene/binaries/lucene-1.3-final.tar.gz": "1.3-final", + "/jakarta/tapestry/binaries/3.0-beta-1a/Tapestry-3.0-beta-1a-bin.zip": "3.0-beta-1a", + "/poi/release/bin/poi-bin-3.0-FINAL-20070503.tar.gz": "3.0-FINAL-20070503", + "/harmony/milestones/M4/apache-harmony-hdk-r603534-linux-x86-32-libstdc++v6-snapshot.tar.gz": "r603534", + "/ant/antidote/antidote-20050330.tar.bz2": "20050330", + "/apr/not-released/apr_20020725223645.tar.gz": "20020725223645", + "/ibatis/source/ibatis.net/src-revision-709676.zip": "revision-709676", + "/ws/axis-c/source/win32/axis-c-src-1-2-win32.zip": "1-2", + "/jakarta/slide/most-recent-2.0rc1-binaries/jakarta-slide 2.0rc1 jakarta-tomcat-4.1.30.zip": "2.0rc1", + "/httpd/modpython/win/3.0.1/python2.2.1-apache2.0.43.zip": "2.2.1", + "/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.1.0.cr1_20090319213629.jar": "2.1.0.cr1_20090319213629", + "/jakarta/poi/dev/bin/poi-2.0-pre1-20030517.jar": "2.0-pre1-20030517", + "/jakarta/poi/release/bin/jakarta-poi-1.5.0-FINAL-bin.zip": "1.5.0-FINAL", + "/jakarta/poi/release/bin/poi-bin-2.0-final-20040126.zip": "2.0-final-20040126", + "/activemq/apache-activemq/5.0.0/apache-activemq-5.0.0-sources.jar": "5.0.0", + "/turbine/turbine-2.2/source/jakarta-turbine-2.2-B1.tar.gz": "2.2-B1", + "/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.0.0.cr1.jar": "2.0.0.cr1", + "/ant/ivyde/updatesite/features/org.apache.ivy.feature_2.0.0.final_20090108225011.jar": "2.0.0.final_20090108225011", + "/ws/axis/1_2RC3/axis-src-1_2RC3.zip": "1_2RC3", + "/commons/lang/old/v1.0-b1.1/commons-lang-1.0-b1.1.zip": "1.0-b1.1", + "/commons/net/binaries/commons-net-1.2.0-release.tar.gz": "1.2.0-release", + "/ant/ivyde/2.0.0.final/apache-ivyde-2.0.0.final-200907011148-RELEASE.tgz": "2.0.0.final-200907011148-RELEASE", + "/geronimo/eclipse/updates/plugins/org.apache.geronimo.jetty.j2ee.server.v11_1.0.0.jar": "v11_1.0.0", + "/jakarta/cactus/binaries/jakarta-cactus-13-1.7.1-fixed.zip": "1.7.1-fixed", + "/jakarta/jakarta-turbine-maven/maven/jars/maven-1.0-b5-dev.20020731.085427.jar": "1.0-b5-dev.20020731.085427", + "/xml/xalan-j/source/xalan-j_2_5_D1-src.tar.gz": "2_5_D1", + "/ws/woden/IBuilds/I20051002_1145/woden-I20051002_1145.tar.bz2": "I20051002_1145", + "/commons/beanutils/source/commons-beanutils-1.8.0-BETA-src.tar.gz": "1.8.0-BETA", + "/cocoon/BINARIES/cocoon-2.0.3-vm14-bin.tar.gz": "2.0.3-vm14", + "/felix/xliff_filters_v1_2_7_unix.jar": "v1_2_7", + "/excalibur/releases/200702/excalibur-javadoc-r508111-15022007.tar.gz": "r508111-15022007", + 
"/geronimo/eclipse/updates/features/org.apache.geronimo.v20.feature_2.0.0.jar": "v20.feature_2.0.0", + "/geronimo/2.1.6/axis2-jaxws-1.3-G20090406.jar": "1.3-G20090406", + "/cassandra/debian/pool/main/c/cassandra/cassandra_0.4.0~beta1-1.diff.gz": "0.4.0~beta1", + "/ha-api-3.1.6.jar": "3.1.6", + "ha-api-3.1.6.jar": "3.1.6", + "fryPOS_20070919.exe": "20070919", } self.version_tester(versions) def test_versions_with_7z_extensions(self): versions = { - 'http://heanet.dl.sourceforge.net/project/imadering/Imadering_500_211.7z': '500_211', - 'http://cznic.dl.sourceforge.net/project/lttty/LtTTY/LtTTY-0.6.0.2/lttty-src-0.602.7z': '0.602', - '/some/MPlayerGUI_0_6_79.7z': '0_6_79', - 'http://heanet.dl.sourceforge.net/project/qsubedit/0-2-1-23/QSubEdit-win32-0-2-1-23.7z': '0-2-1-23', - 'http://sourceforge.net/projects/vgmtoolbox/files/vgmtoolbox/VGMToolbox%20r930/vgmtoolbox_bin_r930.7z': 'r930', - 'blah/XMTunerSource-0-6-4.7z': '0-6-4', + "http://heanet.dl.sourceforge.net/project/imadering/Imadering_500_211.7z": "500_211", + "http://cznic.dl.sourceforge.net/project/lttty/LtTTY/LtTTY-0.6.0.2/lttty-src-0.602.7z": "0.602", + "/some/MPlayerGUI_0_6_79.7z": "0_6_79", + "http://heanet.dl.sourceforge.net/project/qsubedit/0-2-1-23/QSubEdit-win32-0-2-1-23.7z": "0-2-1-23", + "http://sourceforge.net/projects/vgmtoolbox/files/vgmtoolbox/VGMToolbox%20r930/vgmtoolbox_bin_r930.7z": "r930", + "blah/XMTunerSource-0-6-4.7z": "0-6-4", } self.version_tester(versions) def test_versions_of_debs_and_rpms(self): versions = { - 'bartlby-agent_1.2.3-1_i386.deb': '1.2.3', - 'milestones/6.0/debian/amd64/harmony-6.0-classlib_0.0r946981-1_amd64.deb': '6.0', - 'bartlby-extensions_1.2.3-12_amd64.deb': '1.2.3', - 'bashish-2.0.4.tar.gz': '2.0.4', - 'bashish_2.0.4-1_all.deb': '2.0.4', - 'bashish-2.0.4-1.bashish.generic.noarch.rpm': '2.0.4', - 'bbbike_3.18-1_i386.deb': '3.18', - 'bbbike_3.18-1_amd64.deb': '3.18', - 'blueproximity-1.2.4.tar.gz': '1.2.4', - 'blueproximity_1.2.4-0ubuntu1_feisty1_all.deb': '1.2.4', - 'blueproximity_1.2.4-0ubuntu1_all.deb': '1.2.4', - 'blueproximity-1.2.4-1.fc8.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-1.2_opensuse10_2.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-1.2_opensuse10_3.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-12.1_opensuse10_3.x86_64.rpm': '1.2.4', - 'blueproximity-1.2.4-12.1_opensuse10_3.i586.rpm': '1.2.4', - 'blueproximity-1.2.4-13.1_upensuse10_2.x86_64.rpm': '1.2.4', - 'blueproximity-1.2.4-13.1_opensuse10_2.i586.rpm': '1.2.4', - 'blueproximity-1.2.4-14.1_opensuse10_3.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-14.1_opensuse10_2.noarch.rpm': '1.2.4', - 'blueproximity-1.2.4-2.fc8.noarch.rpm': '1.2.4', - 'bpmcalc4amarok_0.1.2-1_all.deb': '0.1.2', - 'bpmcalc4amarok_0.1.2-1.diff.gz': '0.1.2', + "bartlby-agent_1.2.3-1_i386.deb": "1.2.3", + "milestones/6.0/debian/amd64/harmony-6.0-classlib_0.0r946981-1_amd64.deb": "6.0", + "bartlby-extensions_1.2.3-12_amd64.deb": "1.2.3", + "bashish-2.0.4.tar.gz": "2.0.4", + "bashish_2.0.4-1_all.deb": "2.0.4", + "bashish-2.0.4-1.bashish.generic.noarch.rpm": "2.0.4", + "bbbike_3.18-1_i386.deb": "3.18", + "bbbike_3.18-1_amd64.deb": "3.18", + "blueproximity-1.2.4.tar.gz": "1.2.4", + "blueproximity_1.2.4-0ubuntu1_feisty1_all.deb": "1.2.4", + "blueproximity_1.2.4-0ubuntu1_all.deb": "1.2.4", + "blueproximity-1.2.4-1.fc8.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-1.2_opensuse10_2.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-1.2_opensuse10_3.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-12.1_opensuse10_3.x86_64.rpm": "1.2.4", + 
"blueproximity-1.2.4-12.1_opensuse10_3.i586.rpm": "1.2.4", + "blueproximity-1.2.4-13.1_upensuse10_2.x86_64.rpm": "1.2.4", + "blueproximity-1.2.4-13.1_opensuse10_2.i586.rpm": "1.2.4", + "blueproximity-1.2.4-14.1_opensuse10_3.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-14.1_opensuse10_2.noarch.rpm": "1.2.4", + "blueproximity-1.2.4-2.fc8.noarch.rpm": "1.2.4", + "bpmcalc4amarok_0.1.2-1_all.deb": "0.1.2", + "bpmcalc4amarok_0.1.2-1.diff.gz": "0.1.2", } self.version_tester(versions) def test_versions_without_rc_alpha_beta(self): versions = { - '/commons/beanutils/source/commons-beanutils-1.8.0-BETA-src.tar.gz': '1.8.0', - '/cassandra/debian/pool/main/c/cassandra/cassandra_0.4.0~beta1-1.diff.gz': '0.4.0', - '/xmlgraphics/batik/batik-src-1.7beta1.zip': '1.7', - '/xml/cocoon/3.0/cocoon-all-3.0.0-alpha-1-dist.tar.gz': '3.0.0', - '/ws/scout/0_7rc1/source/scout-0.7rc1-src.zip': '0.7', - '/ws/juddi/3_0/juddi-portal-bundle-3.0.0.rc1.zip': '3.0.0', - '/ws/juddi/3_0/juddi-portal-bundle-3.0.0.beta.zip': '3.0.0', - '/ws/juddi/2_0RC7/juddi-tomcat-2.0rc7.zip': '2.0', - '/turbine/turbine-2.3.3-rc1/source/turbine-2.3.3-RC1-src.zip': '2.3.3', - '/jakarta/slide/most-recent-2.0rc1-binaries/jakarta-slide 2.0rc1 jakarta-tomcat-4.1.30.zip': '2.0', - '/jakarta/poi/dev/bin/poi-2.0-pre1-20030517.jar': '2.0', - '/ws/axis/1_2RC3/axis-src-1_2RC3.zip': '1_2', - '/ws/axis-c/source/win32/axis-c-1.6b-Win32-trace-src.zip': '1.6b', - '/xml/commons/xml-commons-1.0.b2.zip': '1.0', - '/commons/lang/old/v1.0-b1.1/commons-lang-1.0-b1.1.zip': '1.0', - '/turbine/turbine-2.2/source/jakarta-turbine-2.2-B1.tar.gz': '2.2', + "/commons/beanutils/source/commons-beanutils-1.8.0-BETA-src.tar.gz": "1.8.0", + "/cassandra/debian/pool/main/c/cassandra/cassandra_0.4.0~beta1-1.diff.gz": "0.4.0", + "/xmlgraphics/batik/batik-src-1.7beta1.zip": "1.7", + "/xml/cocoon/3.0/cocoon-all-3.0.0-alpha-1-dist.tar.gz": "3.0.0", + "/ws/scout/0_7rc1/source/scout-0.7rc1-src.zip": "0.7", + "/ws/juddi/3_0/juddi-portal-bundle-3.0.0.rc1.zip": "3.0.0", + "/ws/juddi/3_0/juddi-portal-bundle-3.0.0.beta.zip": "3.0.0", + "/ws/juddi/2_0RC7/juddi-tomcat-2.0rc7.zip": "2.0", + "/turbine/turbine-2.3.3-rc1/source/turbine-2.3.3-RC1-src.zip": "2.3.3", + "/jakarta/slide/most-recent-2.0rc1-binaries/jakarta-slide 2.0rc1 jakarta-tomcat-4.1.30.zip": "2.0", + "/jakarta/poi/dev/bin/poi-2.0-pre1-20030517.jar": "2.0", + "/ws/axis/1_2RC3/axis-src-1_2RC3.zip": "1_2", + "/ws/axis-c/source/win32/axis-c-1.6b-Win32-trace-src.zip": "1.6b", + "/xml/commons/xml-commons-1.0.b2.zip": "1.0", + "/commons/lang/old/v1.0-b1.1/commons-lang-1.0-b1.1.zip": "1.0", + "/turbine/turbine-2.2/source/jakarta-turbine-2.2-B1.tar.gz": "2.2", } self.version_tester(versions, ignore_pre_releases=True) def test_versions_libpng(self): versions = { - 'libpng-1.0.16rc3-config.tar.gz': '1.0.16', - 'libpng-1.0.16rc4-config.tar.gz': '1.0.16', - 'libpng-1.0.16rc5-config.tar.gz': '1.0.16', - 'libpng-1.0.17rc1-config.tar.gz': '1.0.17', - 'libpng-1.0.18rc1-config.tar.gz': '1.0.18', - 'libpng-1.0.18rc1.tar.gz': '1.0.18', - 'libpng-1.2.17rc3-no-config.tar.gz': '1.2.17', - 'libpng-1.2.17rc4-no-config.tar.gz': '1.2.17', - 'libpng-1.2.19beta1-no-config.tar.gz': '1.2.19', - 'libpng-1.2.19beta12-no-config.tar.gz': '1.2.19', + "libpng-1.0.16rc3-config.tar.gz": "1.0.16", + "libpng-1.0.16rc4-config.tar.gz": "1.0.16", + "libpng-1.0.16rc5-config.tar.gz": "1.0.16", + "libpng-1.0.17rc1-config.tar.gz": "1.0.17", + "libpng-1.0.18rc1-config.tar.gz": "1.0.18", + "libpng-1.0.18rc1.tar.gz": "1.0.18", + "libpng-1.2.17rc3-no-config.tar.gz": "1.2.17", + 
"libpng-1.2.17rc4-no-config.tar.gz": "1.2.17", + "libpng-1.2.19beta1-no-config.tar.gz": "1.2.19", + "libpng-1.2.19beta12-no-config.tar.gz": "1.2.19", } self.version_tester(versions, ignore_pre_releases=True) def test_versions_corner_cases(self): versions = { - '/bar/zaiko_2013-03-14_192300.7z': '2013-03-14_192300', + "/bar/zaiko_2013-03-14_192300.7z": "2013-03-14_192300", } self.version_tester(versions) @expectedFailure def test_versions_corner_cases2(self): versions = { - 'foo/InstallXMTuner0-6-4.msi': '0-6-4', - '/harmony/milestones/6.0/debian/amd64/harmony-6.0-classlib_0.0r946981-1_amd64.deb': '0.0r946981-1', + "foo/InstallXMTuner0-6-4.msi": "0-6-4", + "/harmony/milestones/6.0/debian/amd64/harmony-6.0-classlib_0.0r946981-1_amd64.deb": "0.0r946981-1", } self.version_tester(versions) diff --git a/minecode/tests/testfiles/conan/zlib/manifest/conanfile.py b/minecode/tests/testfiles/conan/zlib/manifest/conanfile.py index 72f2e5fc..8bd397c4 100644 --- a/minecode/tests/testfiles/conan/zlib/manifest/conanfile.py +++ b/minecode/tests/testfiles/conan/zlib/manifest/conanfile.py @@ -1,6 +1,7 @@ class ConanFile: pass + required_conan_version = ">=1.53.0" @@ -10,8 +11,10 @@ class ZlibConan(ConanFile): url = "https://github.com/conan-io/conan-center-index" homepage = "https://zlib.net" license = "Zlib" - description = ("A Massively Spiffy Yet Delicately Unobtrusive Compression Library " - "(Also Free, Not to Mention Unencumbered by Patents)") + description = ( + "A Massively Spiffy Yet Delicately Unobtrusive Compression Library " + "(Also Free, Not to Mention Unencumbered by Patents)" + ) topics = ("zlib", "compression") settings = "os", "arch", "compiler", "build_type" @@ -45,8 +48,12 @@ def layout(self): cmake_layout(self, src_folder="src") def source(self): - get(self, **self.conan_data["sources"][self.version], - destination=self.source_folder, strip_root=True) + get( + self, + **self.conan_data["sources"][self.version], + destination=self.source_folder, + strip_root=True, + ) def generate(self): tc = CMakeToolchain(self) @@ -63,18 +70,27 @@ def generate(self): def _patch_sources(self): apply_conandata_patches(self) - is_apple_clang12 = self.settings.compiler == "apple-clang" and Version(self.settings.compiler.version) >= "12.0" + is_apple_clang12 = ( + self.settings.compiler == "apple-clang" + and Version(self.settings.compiler.version) >= "12.0" + ) if not is_apple_clang12: - for filename in ['zconf.h', 'zconf.h.cmakein', 'zconf.h.in']: + for filename in ["zconf.h", "zconf.h.cmakein", "zconf.h.in"]: filepath = os.path.join(self.source_folder, filename) - replace_in_file(self, filepath, - '#ifdef HAVE_UNISTD_H ' - '/* may be set to #if 1 by ./configure */', - '#if defined(HAVE_UNISTD_H) && (1-HAVE_UNISTD_H-1 != 0)') - replace_in_file(self, filepath, - '#ifdef HAVE_STDARG_H ' - '/* may be set to #if 1 by ./configure */', - '#if defined(HAVE_STDARG_H) && (1-HAVE_STDARG_H-1 != 0)') + replace_in_file( + self, + filepath, + "#ifdef HAVE_UNISTD_H " + "/* may be set to #if 1 by ./configure */", + "#if defined(HAVE_UNISTD_H) && (1-HAVE_UNISTD_H-1 != 0)", + ) + replace_in_file( + self, + filepath, + "#ifdef HAVE_STDARG_H " + "/* may be set to #if 1 by ./configure */", + "#if defined(HAVE_STDARG_H) && (1-HAVE_STDARG_H-1 != 0)", + ) def build(self): self._patch_sources() @@ -84,11 +100,15 @@ def build(self): def _extract_license(self): tmp = load(self, os.path.join(self.source_folder, "zlib.h")) - license_contents = tmp[2:tmp.find("*/", 1)] + license_contents = tmp[2 : tmp.find("*/", 1)] return 
license_contents def package(self): - save(self, os.path.join(self.package_folder, "licenses", "LICENSE"), self._extract_license()) + save( + self, + os.path.join(self.package_folder, "licenses", "LICENSE"), + self._extract_license(), + ) cmake = CMake(self) cmake.install() @@ -104,4 +124,4 @@ def package_info(self): self.cpp_info.libs = [libname] self.cpp_info.names["cmake_find_package"] = "ZLIB" - self.cpp_info.names["cmake_find_package_multi"] = "ZLIB" \ No newline at end of file + self.cpp_info.names["cmake_find_package_multi"] = "ZLIB" diff --git a/minecode/tests/testfiles/fdroid/index-v2-visited-expected-mapped.json b/minecode/tests/testfiles/fdroid/index-v2-visited-expected-mapped.json index ad733c73..3abbe968 100644 --- a/minecode/tests/testfiles/fdroid/index-v2-visited-expected-mapped.json +++ b/minecode/tests/testfiles/fdroid/index-v2-visited-expected-mapped.json @@ -83,11 +83,11 @@ "Time" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/An.stop_10_src.tar.gz", - "size":558337, + "download_url":"https://f-droid.org/repo/An.stop_10.apk", + "size":66218, "sha1":null, "md5":null, - "sha256":"d489eee14c4693a4aa742c490f2566d2d17170a3977cc04993d96ba4588384c8", + "sha256":"78ec7805f5a49b156fbd5f6af174c1cd8ae9900c9c7af2b2df021aca8cd5eae9", "sha512":null, "bug_tracking_url":"https://github.com/jdmonin/anstop/issues", "code_view_url":null, @@ -125,14 +125,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/An.stop@10?download_url=https://f-droid.org/repo/An.stop_10_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/An.stop", - "repository_download_url":"https://f-droid.org/repo/An.stop_10_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/An.stop_10.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/An.stop@10" @@ -221,11 +223,11 @@ "Time" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/An.stop_9_src.tar.gz", - "size":63674, + "download_url":"https://f-droid.org/repo/An.stop_9.apk", + "size":49763, "sha1":null, "md5":null, - "sha256":"af6baad5820f1b86e8aeeec00bd3a46ad929dbae28dd3615e9ef94a555bd309f", + "sha256":"79f5253bab33cf4030b01fec457fd6ffa4fd54b631ee0bc4c1549fbb69ca6680", "sha512":null, "bug_tracking_url":"https://github.com/jdmonin/anstop/issues", "code_view_url":null, @@ -263,14 +265,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/An.stop@9?download_url=https://f-droid.org/repo/An.stop_9_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/An.stop", - "repository_download_url":"https://f-droid.org/repo/An.stop_9_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/An.stop_9.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/An.stop@9" @@ -359,11 +363,11 @@ "Navigation" ], "homepage_url":"https://sourceforge.net/projects/androidspeedo", - "download_url":"https://f-droid.org/repo/SpeedoMeterApp.main_1_src.tar.gz", - "size":3962, + "download_url":"https://f-droid.org/repo/SpeedoMeterApp.main_1.apk", + "size":6618, "sha1":null, "md5":null, - "sha256":"079d03842cbc4730495a916298e7e5b6874a583c504529d194ba0785d38153e4", + 
"sha256":"c838e3b53794fa4958b913a1e540167aa2e52c904f1d462352d86d4124982664", "sha512":null, "bug_tracking_url":null, "code_view_url":null, @@ -401,14 +405,16 @@ "other_license_detections":[], "extracted_license_statement":"PublicDomain", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/SpeedoMeterApp.main@1?download_url=https://f-droid.org/repo/SpeedoMeterApp.main_1_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/SpeedoMeterApp.main", - "repository_download_url":"https://f-droid.org/repo/SpeedoMeterApp.main_1_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/SpeedoMeterApp.main_1.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/SpeedoMeterApp.main@1" @@ -497,11 +503,11 @@ "Multimedia" ], "homepage_url":"http://jimroal.com/slist.htm", - "download_url":"https://f-droid.org/repo/a2dp.Vol_169_src.tar.gz", - "size":1361887, + "download_url":"https://f-droid.org/repo/a2dp.Vol_169.apk", + "size":2748737, "sha1":null, "md5":null, - "sha256":"83ce527b19a42424eef5f6f3dc837a1c8d9a5d6c3e646c67b845f3bf7cb43b4e", + "sha256":"f67ef52502faf8dbcef310c122d9efe7871effdd8f9fe0ca93b9925513152d37", "sha512":null, "bug_tracking_url":"https://github.com/jroal/a2dpvolume/issues", "code_view_url":null, @@ -539,14 +545,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/a2dp.Vol@169?download_url=https://f-droid.org/repo/a2dp.Vol_169_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/a2dp.Vol", - "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_169_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_169.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/a2dp.Vol@169" @@ -635,11 +643,11 @@ "Multimedia" ], "homepage_url":"http://jimroal.com/slist.htm", - "download_url":"https://f-droid.org/repo/a2dp.Vol_137_src.tar.gz", - "size":566140, + "download_url":"https://f-droid.org/repo/a2dp.Vol_137.apk", + "size":826576, "sha1":null, "md5":null, - "sha256":"86fb52d03061de322f07a2dd9d4ee20946f19181c85fba6672009c9d369600be", + "sha256":"fb913cccb0957c5b52caea48c3ef7a3ce1d616219b47eed65482097920fe8cc5", "sha512":null, "bug_tracking_url":"https://github.com/jroal/a2dpvolume/issues", "code_view_url":null, @@ -677,14 +685,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/a2dp.Vol@137?download_url=https://f-droid.org/repo/a2dp.Vol_137_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/a2dp.Vol", - "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_137_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_137.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/a2dp.Vol@137" @@ -773,11 +783,11 @@ "Multimedia" ], "homepage_url":"http://jimroal.com/slist.htm", - "download_url":"https://f-droid.org/repo/a2dp.Vol_135_src.tar.gz", - "size":558871, + "download_url":"https://f-droid.org/repo/a2dp.Vol_135.apk", + "size":769268, "sha1":null, "md5":null, - 
"sha256":"4d3054e02935ec461a3ab070fb1f9101a5f7daaddbdeec4ea191a028d00f28af", + "sha256":"970e93aea1888c80056c46513a16ef214b3f8df0f9105720fd3b1479440327d1", "sha512":null, "bug_tracking_url":"https://github.com/jroal/a2dpvolume/issues", "code_view_url":null, @@ -815,14 +825,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/a2dp.Vol@135?download_url=https://f-droid.org/repo/a2dp.Vol_135_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/a2dp.Vol", - "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_135_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/a2dp.Vol_135.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/a2dp.Vol@135" @@ -913,11 +925,11 @@ "Reading" ], "homepage_url":"http://aarddict.org", - "download_url":"https://f-droid.org/repo/aarddict.android_26_src.tar.gz", - "size":3818485, + "download_url":"https://f-droid.org/repo/aarddict.android_26.apk", + "size":1904989, "sha1":null, "md5":null, - "sha256":"8032cf918f0495204e8e00254fcd04ca4fb9c514323f42442100b37283ac0f59", + "sha256":"b72981914c91641d92508ef801fdd99aebe919590b4f981876f306e37c69af91", "sha512":null, "bug_tracking_url":"https://github.com/aarddict/android/issues", "code_view_url":null, @@ -955,14 +967,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/aarddict.android@26?download_url=https://f-droid.org/repo/aarddict.android_26_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/aarddict.android", - "repository_download_url":"https://f-droid.org/repo/aarddict.android_26_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/aarddict.android_26.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/aarddict.android@26" @@ -1053,11 +1067,11 @@ "Reading" ], "homepage_url":"http://aarddict.org", - "download_url":"https://f-droid.org/repo/aarddict.android_25_src.tar.gz", - "size":3818492, + "download_url":"https://f-droid.org/repo/aarddict.android_25.apk", + "size":1904813, "sha1":null, "md5":null, - "sha256":"f7f158ec6cf3506a0012ec1eac18a6fb2907a63fcafd9edbea4142ffcc189b0e", + "sha256":"bd0737ffd7d25bf23f6bd31f3e3b2aa195c5fe523631b44b2e9d975c69898231", "sha512":null, "bug_tracking_url":"https://github.com/aarddict/android/issues", "code_view_url":null, @@ -1095,14 +1109,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/aarddict.android@25?download_url=https://f-droid.org/repo/aarddict.android_25_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/aarddict.android", - "repository_download_url":"https://f-droid.org/repo/aarddict.android_25_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/aarddict.android_25.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/aarddict.android@25" @@ -1193,11 +1209,11 @@ "Reading" ], "homepage_url":"http://aarddict.org", - "download_url":"https://f-droid.org/repo/aarddict.android_24_src.tar.gz", - 
"size":3818377, + "download_url":"https://f-droid.org/repo/aarddict.android_24.apk", + "size":1904614, "sha1":null, "md5":null, - "sha256":"84b4e90d306c72f78555a7e127623e550493987357db0709f644be29b8bceea3", + "sha256":"a0320f5360b05c6d05b7ba4ffccf6e9b563a2369c68e4da0f5f407cd9ff6479e", "sha512":null, "bug_tracking_url":"https://github.com/aarddict/android/issues", "code_view_url":null, @@ -1235,14 +1251,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/aarddict.android@24?download_url=https://f-droid.org/repo/aarddict.android_24_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/aarddict.android", - "repository_download_url":"https://f-droid.org/repo/aarddict.android_24_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/aarddict.android_24.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/aarddict.android@24" @@ -1347,11 +1365,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_51_src.tar.gz", - "size":3140102, + "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_51.apk", + "size":3302945, "sha1":null, "md5":null, - "sha256":"f718641268c7863e0094f055123ca14b0e16c6501914c71ce985eadcd1965bb7", + "sha256":"1d6dae4beae98f1004519dc2338e98592585ce0ca0aabf5c38f6f214e5253361", "sha512":null, "bug_tracking_url":"https://github.com/communitymedia/mediaphone/issues", "code_view_url":null, @@ -1389,14 +1407,16 @@ "other_license_detections":[], "extracted_license_statement":"LGPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ac.robinson.mediaphone@51?download_url=https://f-droid.org/repo/ac.robinson.mediaphone_51_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ac.robinson.mediaphone", - "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_51_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_51.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ac.robinson.mediaphone@51" @@ -1501,11 +1521,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_50_src.tar.gz", - "size":3131516, + "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_50.apk", + "size":3274124, "sha1":null, "md5":null, - "sha256":"33f4b8590c6d488f709fcc1a9d89a85ae8cfe94e7ac6437d7e4b2fc67225c908", + "sha256":"fd1b70d9a1e24a8471a1bf947dab5bd5735e8cae6c10018b7868c88aa7198f7d", "sha512":null, "bug_tracking_url":"https://github.com/communitymedia/mediaphone/issues", "code_view_url":null, @@ -1543,14 +1563,16 @@ "other_license_detections":[], "extracted_license_statement":"LGPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ac.robinson.mediaphone@50?download_url=https://f-droid.org/repo/ac.robinson.mediaphone_50_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ac.robinson.mediaphone", - "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_50_src.tar.gz", + 
"repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_50.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ac.robinson.mediaphone@50" @@ -1655,11 +1677,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_48_src.tar.gz", - "size":3128518, + "download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_48.apk", + "size":3086625, "sha1":null, "md5":null, - "sha256":"185bf70ff585dc81db325fff92c68e126c34b9ae49e85de3e89091f5fb5cf055", + "sha256":"6198e1e1ff295743980112b190e762d1c642ef1344b40a21803a564fdb6661d6", "sha512":null, "bug_tracking_url":"https://github.com/communitymedia/mediaphone/issues", "code_view_url":null, @@ -1697,14 +1719,16 @@ "other_license_detections":[], "extracted_license_statement":"LGPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ac.robinson.mediaphone@48?download_url=https://f-droid.org/repo/ac.robinson.mediaphone_48_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ac.robinson.mediaphone", - "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_48_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ac.robinson.mediaphone_48.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ac.robinson.mediaphone@48" @@ -1809,11 +1833,11 @@ "Internet" ], "homepage_url":"http://acrdevelopment.org", - "download_url":"https://f-droid.org/repo/acr.browser.lightning_101_src.tar.gz", - "size":1020192, + "download_url":"https://f-droid.org/repo/acr.browser.lightning_101.apk", + "size":3424126, "sha1":null, "md5":null, - "sha256":"c1ac724cab6f12be29c83ead8ab3df04804383f220e42e374b80f9c4a44b3f60", + "sha256":"820f4f9977a20b060b4091db2b35cff8cd360e060f94aa742255c845747a2d7f", "sha512":null, "bug_tracking_url":"https://github.com/anthonycr/Lightning-Browser/issues", "code_view_url":null, @@ -1851,14 +1875,16 @@ "other_license_detections":[], "extracted_license_statement":"MPL-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/acr.browser.lightning@101?download_url=https://f-droid.org/repo/acr.browser.lightning_101_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/acr.browser.lightning", - "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_101_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_101.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/acr.browser.lightning@101" @@ -1963,11 +1989,11 @@ "Internet" ], "homepage_url":"http://acrdevelopment.org", - "download_url":"https://f-droid.org/repo/acr.browser.lightning_100_src.tar.gz", - "size":1811930, + "download_url":"https://f-droid.org/repo/acr.browser.lightning_100.apk", + "size":3050894, "sha1":null, "md5":null, - "sha256":"8995f5e677b3d2585e43624df5fd357bfa40da9c703b61daf57fa3dbd9d7123e", + "sha256":"db8d5bfc217eda28485f69ece19cb12e4c2f4502a7b925b18db79980b31b72af", "sha512":null, "bug_tracking_url":"https://github.com/anthonycr/Lightning-Browser/issues", "code_view_url":null, @@ -2005,14 +2031,16 @@ "other_license_detections":[], "extracted_license_statement":"MPL-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + 
"pkg:fdroid/acr.browser.lightning@100?download_url=https://f-droid.org/repo/acr.browser.lightning_100_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/acr.browser.lightning", - "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_100_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_100.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/acr.browser.lightning@100" @@ -2117,11 +2145,11 @@ "Internet" ], "homepage_url":"http://acrdevelopment.org", - "download_url":"https://f-droid.org/repo/acr.browser.lightning_96_src.tar.gz", - "size":914859, + "download_url":"https://f-droid.org/repo/acr.browser.lightning_96.apk", + "size":2687399, "sha1":null, "md5":null, - "sha256":"e9cc630379a8478dfa6716bcd7c5f75fadf03519b78d6316f909d1c7055b15e2", + "sha256":"a225314a83ea7518e0f8105d602171985fbc884d606dbd9669a63a2928856147", "sha512":null, "bug_tracking_url":"https://github.com/anthonycr/Lightning-Browser/issues", "code_view_url":null, @@ -2159,14 +2187,16 @@ "other_license_detections":[], "extracted_license_statement":"MPL-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/acr.browser.lightning@96?download_url=https://f-droid.org/repo/acr.browser.lightning_96_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/acr.browser.lightning", - "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_96_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/acr.browser.lightning_96.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/acr.browser.lightning@96" @@ -2271,11 +2301,11 @@ "Money" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ademar.bitac_6_src.tar.gz", - "size":4996182, + "download_url":"https://f-droid.org/repo/ademar.bitac_6.apk", + "size":3077855, "sha1":null, "md5":null, - "sha256":"96606dd18416352a408585cfb0946e0b49bbc701366e5365f6d3b5d9084b81d6", + "sha256":"660a7730cfa9f11b488395fd657b1f054881e6b0736b6d7f96a61229c43dc77a", "sha512":null, "bug_tracking_url":"https://gitlab.com/ademar111190/BitAC/-/issues", "code_view_url":null, @@ -2313,14 +2343,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ademar.bitac@6?download_url=https://f-droid.org/repo/ademar.bitac_6_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ademar.bitac", - "repository_download_url":"https://f-droid.org/repo/ademar.bitac_6_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ademar.bitac_6.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ademar.bitac@6" @@ -2425,11 +2457,11 @@ "Money" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ademar.bitac_5_src.tar.gz", - "size":4995815, + "download_url":"https://f-droid.org/repo/ademar.bitac_5.apk", + "size":2919369, "sha1":null, "md5":null, - "sha256":"fe0a4dbf0fcc76df2f85e38758abcc1808fc7fa355ffdf8237c41452670f647e", + "sha256":"efd2052b37ea6cb44836f400926b79d974cf3528e6e6e2c8a72569c63cfdbdd6", "sha512":null, "bug_tracking_url":"https://gitlab.com/ademar111190/BitAC/-/issues", 
"code_view_url":null, @@ -2467,14 +2499,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ademar.bitac@5?download_url=https://f-droid.org/repo/ademar.bitac_5_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ademar.bitac", - "repository_download_url":"https://f-droid.org/repo/ademar.bitac_5_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ademar.bitac_5.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ademar.bitac@5" @@ -2579,11 +2613,11 @@ "Theming" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ademar.textlauncher_8_src.tar.gz", - "size":981806, + "download_url":"https://f-droid.org/repo/ademar.textlauncher_8.apk", + "size":25418, "sha1":null, "md5":null, - "sha256":"d81d49a13ee829998b63114457fc0a086273e634b196bb9dc1dd26426ba0c8a8", + "sha256":"b2a7a09d0e0dafa341912eb4563fe0a8fdc02536e438862d8a83b8ba55c282ec", "sha512":null, "bug_tracking_url":"https://gitlab.com/ademar111190/textlauncher/-/issues", "code_view_url":null, @@ -2621,14 +2655,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ademar.textlauncher@8?download_url=https://f-droid.org/repo/ademar.textlauncher_8_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ademar.textlauncher", - "repository_download_url":"https://f-droid.org/repo/ademar.textlauncher_8_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ademar.textlauncher_8.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ademar.textlauncher@8" @@ -2733,11 +2769,11 @@ "Theming" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/ademar.textlauncher_7_src.tar.gz", - "size":981742, + "download_url":"https://f-droid.org/repo/ademar.textlauncher_7.apk", + "size":14591, "sha1":null, "md5":null, - "sha256":"e0f03ad0f1d1e547b5cec5fe665fd796e0e48db865d595f7d802393b875b74d5", + "sha256":"cfb8c97c5f524d88d854e604d60a83b199b27b124743bbe8e415d027ea3a32be", "sha512":null, "bug_tracking_url":"https://gitlab.com/ademar111190/textlauncher/-/issues", "code_view_url":null, @@ -2775,14 +2811,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ademar.textlauncher@7?download_url=https://f-droid.org/repo/ademar.textlauncher_7_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ademar.textlauncher", - "repository_download_url":"https://f-droid.org/repo/ademar.textlauncher_7_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ademar.textlauncher_7.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ademar.textlauncher@7" @@ -2887,11 +2925,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/agersant.polaris_415293114_src.tar.gz", - "size":25450214, + "download_url":"https://f-droid.org/repo/agersant.polaris_415293114.apk", + "size":4952438, "sha1":null, "md5":null, - "sha256":"4284e29988d213bfcfaedf85b37fe9df15d8af30d6f262e935251df83b1b314a", + 
"sha256":"0b88ef7adfcc601a45db988b9cfc5296cdc4c7c095b07edc43aada63c4e1211d", "sha512":null, "bug_tracking_url":"https://github.com/agersant/polaris-android/issues", "code_view_url":null, @@ -2929,14 +2967,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agersant.polaris@415293114?download_url=https://f-droid.org/repo/agersant.polaris_415293114_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agersant.polaris", - "repository_download_url":"https://f-droid.org/repo/agersant.polaris_415293114_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agersant.polaris_415293114.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agersant.polaris@415293114" @@ -3041,11 +3081,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/agersant.polaris_415293112_src.tar.gz", - "size":25450029, + "download_url":"https://f-droid.org/repo/agersant.polaris_415293112.apk", + "size":4831957, "sha1":null, "md5":null, - "sha256":"8e67cf488d40d88dc4d0f95a8d6e712c1e63940b21a21a6d6ca1f65f2e5dfc8b", + "sha256":"b82a59f1cc21d014c6947d174abb7f533d780ffdfc3794cc193e8268ba93efe3", "sha512":null, "bug_tracking_url":"https://github.com/agersant/polaris-android/issues", "code_view_url":null, @@ -3083,14 +3123,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agersant.polaris@415293112?download_url=https://f-droid.org/repo/agersant.polaris_415293112_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agersant.polaris", - "repository_download_url":"https://f-droid.org/repo/agersant.polaris_415293112_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agersant.polaris_415293112.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agersant.polaris@415293112" @@ -3195,11 +3237,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/agersant.polaris_48_src.tar.gz", - "size":25386695, + "download_url":"https://f-droid.org/repo/agersant.polaris_48.apk", + "size":3831332, "sha1":null, "md5":null, - "sha256":"7b85afd1d1a30ba36b2ad0dce1151d246efaaab708cca580280b50776527f107", + "sha256":"134d18e2d9dbd10a974e2806e27149951c9e4ec1c53e1ba2179ec859ae6b928a", "sha512":null, "bug_tracking_url":"https://github.com/agersant/polaris-android/issues", "code_view_url":null, @@ -3237,14 +3279,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agersant.polaris@48?download_url=https://f-droid.org/repo/agersant.polaris_48_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agersant.polaris", - "repository_download_url":"https://f-droid.org/repo/agersant.polaris_48_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agersant.polaris_48.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agersant.polaris@48" @@ -3333,11 +3377,11 @@ "Multimedia" ], "homepage_url":"https://codeberg.org/agrigolo/chubby-click", - 
"download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_22_src.tar.gz", - "size":1684337, + "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_22.apk", + "size":2274962, "sha1":null, "md5":null, - "sha256":"b90b73ade10a3478842e279a6d23456b8f7b3affae886f1c451d9d90f0d9c412", + "sha256":"cb357c3f12258e40955d1d16ad45a0996b482e07c6bce13587aa6f722c164030", "sha512":null, "bug_tracking_url":"https://codeberg.org/agrigolo/chubby-click/issues", "code_view_url":null, @@ -3375,14 +3419,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agrigolo.chubbyclick@22?download_url=https://f-droid.org/repo/agrigolo.chubbyclick_22_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agrigolo.chubbyclick", - "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_22_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_22.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agrigolo.chubbyclick@22" @@ -3471,11 +3517,11 @@ "Multimedia" ], "homepage_url":"https://codeberg.org/agrigolo/chubby-click", - "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_21_src.tar.gz", - "size":1681634, + "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_21.apk", + "size":2270785, "sha1":null, "md5":null, - "sha256":"bf6c2be1fcb8308c6a0f6e870c3466fff65cc970758ead7a1274985d626dc7ae", + "sha256":"ddeb3fadfac02ccf53c2cf6ac4fc8ecc6cf58dd2411a888c7a7d03a2035cf78e", "sha512":null, "bug_tracking_url":"https://codeberg.org/agrigolo/chubby-click/issues", "code_view_url":null, @@ -3513,14 +3559,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agrigolo.chubbyclick@21?download_url=https://f-droid.org/repo/agrigolo.chubbyclick_21_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/agrigolo.chubbyclick", - "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_21_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_21.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agrigolo.chubbyclick@21" @@ -3609,11 +3657,11 @@ "Multimedia" ], "homepage_url":"https://codeberg.org/agrigolo/chubby-click", - "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_20_src.tar.gz", - "size":1681501, + "download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_20.apk", + "size":2270785, "sha1":null, "md5":null, - "sha256":"37dc91804c33ea5ecfb33e37f3fc68fb8d377d550ab522b5c1726644d501b4a0", + "sha256":"e2bd8970b208b36d9b7b37f44f29d664531d9fe36a4a0e9fa697a6d6589c4166", "sha512":null, "bug_tracking_url":"https://codeberg.org/agrigolo/chubby-click/issues", "code_view_url":null, @@ -3651,14 +3699,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/agrigolo.chubbyclick@20?download_url=https://f-droid.org/repo/agrigolo.chubbyclick_20_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], 
"repository_homepage_url":"https://f-droid.org/en/packages/agrigolo.chubbyclick", - "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_20_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/agrigolo.chubbyclick_20.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/agrigolo.chubbyclick@20" @@ -3747,11 +3797,11 @@ "Internet" ], "homepage_url":"https://susi.ai/", - "download_url":"https://f-droid.org/repo/ai.susi_16_src.tar.gz", - "size":3964656, + "download_url":"https://f-droid.org/repo/ai.susi_16.apk", + "size":14344225, "sha1":null, "md5":null, - "sha256":"22cdb59a9d33eb02e0f44aa5206dcc3194a21b020cd2b3b063e6a718d6878cae", + "sha256":"6f851010809953054e7bb8fdd7e1f86a80e00cef6d91450518a408c4b0b59195", "sha512":null, "bug_tracking_url":"https://github.com/fossasia/susi_android/issues", "code_view_url":null, @@ -3789,14 +3839,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ai.susi@16?download_url=https://f-droid.org/repo/ai.susi_16_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ai.susi", - "repository_download_url":"https://f-droid.org/repo/ai.susi_16_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ai.susi_16.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ai.susi@16" @@ -3885,11 +3937,11 @@ "Internet" ], "homepage_url":"https://susi.ai/", - "download_url":"https://f-droid.org/repo/ai.susi_15_src.tar.gz", - "size":3868859, + "download_url":"https://f-droid.org/repo/ai.susi_15.apk", + "size":11555015, "sha1":null, "md5":null, - "sha256":"c74134a1e1986579d92f381cbd3109c3b77fcd3c52c59e7083069e49e604bbd0", + "sha256":"4f25e1679ced9ea42ceb32677bb0d0310fdf818e6cda9a9f7e8b81e73e73c8e9", "sha512":null, "bug_tracking_url":"https://github.com/fossasia/susi_android/issues", "code_view_url":null, @@ -3927,14 +3979,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ai.susi@15?download_url=https://f-droid.org/repo/ai.susi_15_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ai.susi", - "repository_download_url":"https://f-droid.org/repo/ai.susi_15_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ai.susi_15.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ai.susi@15" @@ -4023,11 +4077,11 @@ "Internet" ], "homepage_url":"https://susi.ai/", - "download_url":"https://f-droid.org/repo/ai.susi_14_src.tar.gz", - "size":3863221, + "download_url":"https://f-droid.org/repo/ai.susi_14.apk", + "size":10331217, "sha1":null, "md5":null, - "sha256":"ef878413b874564939fb0c37e1971a780488df402c46505a72886d2e7b18b0c4", + "sha256":"d0c0443f153b75fcb155aa062d8b8af6109d779c0b8d02946a00c074b5f9c305", "sha512":null, "bug_tracking_url":"https://github.com/fossasia/susi_android/issues", "code_view_url":null, @@ -4065,14 +4119,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/ai.susi@14?download_url=https://f-droid.org/repo/ai.susi_14_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, 
"dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/ai.susi", - "repository_download_url":"https://f-droid.org/repo/ai.susi_14_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/ai.susi_14.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/ai.susi@14" @@ -4163,11 +4219,11 @@ "Time" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_2_src.tar.gz", - "size":183772, + "download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_2.apk", + "size":185892, "sha1":null, "md5":null, - "sha256":"c26b0ed5cbd07d9e839e7f31f7757479e142e0194264c3afb0622b14c3e8f571", + "sha256":"20eea522f8d41dbbe9f8fa7204f076918dd9420c562468e5fd72059e6e66615e", "sha512":null, "bug_tracking_url":"https://github.com/andviane/moon/issues", "code_view_url":null, @@ -4205,14 +4261,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/akk.astro.droid.moonphase@2?download_url=https://f-droid.org/repo/akk.astro.droid.moonphase_2_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/akk.astro.droid.moonphase", - "repository_download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_2_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_2.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/akk.astro.droid.moonphase@2" @@ -4303,11 +4361,11 @@ "Time" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_1_src.tar.gz", - "size":183725, + "download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_1.apk", + "size":190393, "sha1":null, "md5":null, - "sha256":"084f4b57a30ad16b6479c54b4e5d778e38aa9f26712e998718243c0156a756db", + "sha256":"7e053f07f595f78863ddfc73ea55e5b19f1f504c2b8f06f61fc772521488e03f", "sha512":null, "bug_tracking_url":"https://github.com/andviane/moon/issues", "code_view_url":null, @@ -4345,14 +4403,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/akk.astro.droid.moonphase@1?download_url=https://f-droid.org/repo/akk.astro.droid.moonphase_1_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/akk.astro.droid.moonphase", - "repository_download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_1_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/akk.astro.droid.moonphase_1.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/akk.astro.droid.moonphase@1" @@ -4441,11 +4501,11 @@ "System" ], "homepage_url":"http://ed.am/dev/android/export-contacts", - "download_url":"https://f-droid.org/repo/am.ed.exportcontacts_10003_src.tar.gz", - "size":51527, + "download_url":"https://f-droid.org/repo/am.ed.exportcontacts_10003.apk", + "size":58051, "sha1":null, "md5":null, - "sha256":"0a719754dd28701e7782bd63f05e2ddfeb195bcba462efa31e8c28eb79f1b39f", + "sha256":"dcbed1c712db0b614d53e9517a287541349a33ac00d1a391806f839643cce5e9", "sha512":null, "bug_tracking_url":"http://dev.ed.am/export-contacts", "code_view_url":null, @@ -4483,14 +4543,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", 
"notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.ed.exportcontacts@10003?download_url=https://f-droid.org/repo/am.ed.exportcontacts_10003_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.ed.exportcontacts", - "repository_download_url":"https://f-droid.org/repo/am.ed.exportcontacts_10003_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.ed.exportcontacts_10003.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.ed.exportcontacts@10003" @@ -4579,11 +4641,11 @@ "System" ], "homepage_url":"http://ed.am/dev/android/import-contacts", - "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10304_src.tar.gz", - "size":68741, + "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10304.apk", + "size":80326, "sha1":null, "md5":null, - "sha256":"921396b7c6858188406578358db80472c9e0053335662b96999e2fda259eba09", + "sha256":"8680fbf57af95a5ab69d91502b6337549e770d23db318a9ceaed957c88a92845", "sha512":null, "bug_tracking_url":"http://dev.ed.am/import-contacts", "code_view_url":null, @@ -4621,14 +4683,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.ed.importcontacts@10304?download_url=https://f-droid.org/repo/am.ed.importcontacts_10304_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.ed.importcontacts", - "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10304_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10304.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.ed.importcontacts@10304" @@ -4717,11 +4781,11 @@ "System" ], "homepage_url":"http://ed.am/dev/android/import-contacts", - "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10303_src.tar.gz", - "size":76762, + "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10303.apk", + "size":85429, "sha1":null, "md5":null, - "sha256":"605a93496bb1ecbdb30b6af075b9fbd3ee23e61a971360f28dcddda2e567ca24", + "sha256":"f12e21058329841874e08db08eae230dd03a1437466a8f31c9485658bee53e63", "sha512":null, "bug_tracking_url":"http://dev.ed.am/import-contacts", "code_view_url":null, @@ -4759,14 +4823,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.ed.importcontacts@10303?download_url=https://f-droid.org/repo/am.ed.importcontacts_10303_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.ed.importcontacts", - "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10303_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10303.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.ed.importcontacts@10303" @@ -4855,11 +4921,11 @@ "System" ], "homepage_url":"http://ed.am/dev/android/import-contacts", - "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10301_src.tar.gz", - "size":75346, + "download_url":"https://f-droid.org/repo/am.ed.importcontacts_10301.apk", + "size":84309, "sha1":null, "md5":null, 
- "sha256":"22a6b3892c8bc4ce5b087639589f74418b4eaf5984a295a15e0d3b0cc577fd21", + "sha256":"c150a1a6e420e1e1ea535c9d26666d76f2d30bc1038a0cb2e871b359327aebcb", "sha512":null, "bug_tracking_url":"http://dev.ed.am/import-contacts", "code_view_url":null, @@ -4897,14 +4963,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.ed.importcontacts@10301?download_url=https://f-droid.org/repo/am.ed.importcontacts_10301_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.ed.importcontacts", - "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10301_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.ed.importcontacts_10301.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.ed.importcontacts@10301" @@ -4993,11 +5061,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_12_src.tar.gz", - "size":615615, + "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_12.apk", + "size":255897, "sha1":null, "md5":null, - "sha256":"5a52dc5903a482738986a2e6e17e2a9f3ef3841d3e941f743972a3bde37ba63a", + "sha256":"8e13ebb31680b56c802428b09cb7ac630505dd13989ff5de81067d90754d58d2", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mBrowser/issues", "code_view_url":null, @@ -5035,14 +5103,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mbrowser@12?download_url=https://f-droid.org/repo/am.zoom.mbrowser_12_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mbrowser", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_12_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_12.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mbrowser@12" @@ -5131,11 +5201,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_11_src.tar.gz", - "size":611638, + "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_11.apk", + "size":255472, "sha1":null, "md5":null, - "sha256":"5eba16c41e9af0c4a38fe43421be2dd7e855c7b1e302d4df84f136c8c4d33f8a", + "sha256":"30325a66e6d15229c8c29ad5b164c0e130841c67588a03870632cec1de1390ab", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mBrowser/issues", "code_view_url":null, @@ -5173,14 +5243,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mbrowser@11?download_url=https://f-droid.org/repo/am.zoom.mbrowser_11_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mbrowser", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_11_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_11.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mbrowser@11" @@ -5269,11 +5341,11 @@ "Internet" ], "homepage_url":null, - 
"download_url":"https://f-droid.org/repo/am.zoom.mbrowser_10_src.tar.gz", - "size":611065, + "download_url":"https://f-droid.org/repo/am.zoom.mbrowser_10.apk", + "size":254344, "sha1":null, "md5":null, - "sha256":"69b49b63e495d03c430f5e917e01b87dcd593607382adc279be73beb478c9974", + "sha256":"b17d82c7f67d6cc1f1c6f495355fb1ffd8f96eca23fc9d4a0951d208a3171855", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mBrowser/issues", "code_view_url":null, @@ -5311,14 +5383,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mbrowser@10?download_url=https://f-droid.org/repo/am.zoom.mbrowser_10_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mbrowser", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_10_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mbrowser_10.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mbrowser@10" @@ -5407,11 +5481,11 @@ "System" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mlauncher_7_src.tar.gz", - "size":729123, + "download_url":"https://f-droid.org/repo/am.zoom.mlauncher_7.apk", + "size":251861, "sha1":null, "md5":null, - "sha256":"2493ef2a2bb1216990d494012abdb62be872b1bf1460a2ecc55e1ba838202638", + "sha256":"d135c514f0d59cc8eed8ddd72ace781610cf01222dc2985caeecfa002a840cae", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mLauncher/issues", "code_view_url":null, @@ -5449,14 +5523,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mlauncher@7?download_url=https://f-droid.org/repo/am.zoom.mlauncher_7_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mlauncher", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mlauncher_7_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mlauncher_7.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/am.zoom.mlauncher@7" @@ -5545,11 +5621,11 @@ "System" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/am.zoom.mlauncher_6_src.tar.gz", - "size":729127, + "download_url":"https://f-droid.org/repo/am.zoom.mlauncher_6.apk", + "size":251855, "sha1":null, "md5":null, - "sha256":"1a4d77d9a38fb16a7a01b18efafddaaf8f5b23d0bc09c0a26db336521d4e5ff1", + "sha256":"f37d48959ab15f75a140f55a2cd0d29dab27be3fb6cced615c03c50f12f62d36", "sha512":null, "bug_tracking_url":"https://github.com/chelovek84/mLauncher/issues", "code_view_url":null, @@ -5587,14 +5663,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/am.zoom.mlauncher@6?download_url=https://f-droid.org/repo/am.zoom.mlauncher_6_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/am.zoom.mlauncher", - "repository_download_url":"https://f-droid.org/repo/am.zoom.mlauncher_6_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/am.zoom.mlauncher_6.apk", "api_data_url":null, 
"datasource_id":null, "purl":"pkg:fdroid/am.zoom.mlauncher@6" @@ -5683,11 +5761,11 @@ "Multimedia" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/amirz.dngprocessor_5_src.tar.gz", - "size":201270, + "download_url":"https://f-droid.org/repo/amirz.dngprocessor_5.apk", + "size":249134, "sha1":null, "md5":null, - "sha256":"680ea228d34737d4ff1d98d81b7da439f22c8d987c7f0bfdecdfcbd22136e7c7", + "sha256":"8a8e803d239717837e709c84de06bd2d10306836d6443eb1ec98f588786fc631", "sha512":null, "bug_tracking_url":"https://github.com/amirzaidi/DNGProcessor/issues", "code_view_url":null, @@ -5725,14 +5803,16 @@ "other_license_detections":[], "extracted_license_statement":"LGPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/amirz.dngprocessor@5?download_url=https://f-droid.org/repo/amirz.dngprocessor_5_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/amirz.dngprocessor", - "repository_download_url":"https://f-droid.org/repo/amirz.dngprocessor_5_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/amirz.dngprocessor_5.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/amirz.dngprocessor@5" @@ -5839,11 +5919,11 @@ "Theming" ], "homepage_url":"http://www.reddit.com/u/AmirZ", - "download_url":"https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911_src.tar.gz", - "size":1225005, + "download_url":"https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911.apk", + "size":1693305, "sha1":null, "md5":null, - "sha256":"5e6413aa4a619c842bd430a5002abb44cc5be0ade9bab105516c634666eed15d", + "sha256":"7fa44d560dc4577374d45176220de2c0b00a71e09d6d148cdea4e0a52d38404a", "sha512":null, "bug_tracking_url":"https://github.com/amirzaidi/Launcher3/issues", "code_view_url":null, @@ -5881,14 +5961,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/amirz.rootless.nexuslauncher@30911?download_url=https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/amirz.rootless.nexuslauncher", - "repository_download_url":"https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/amirz.rootless.nexuslauncher_30911.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/amirz.rootless.nexuslauncher@30911" @@ -5993,11 +6075,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16_src.tar.gz", - "size":111974, + "download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16.apk", + "size":1971112, "sha1":null, "md5":null, - "sha256":"968c79b32cfc86df5c0b638f8d7bfc5baeb5c6a6f730a0977a0471a5d9c779f0", + "sha256":"db1ee1a0fcddad2b8b99a6f53834718050c54adefa50a08cf64cb00efa50af27", "sha512":null, "bug_tracking_url":"https://github.com/gryphius/androdns/issues", "code_view_url":null, @@ -6035,14 +6117,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + 
"pkg:fdroid/androdns.android.leetdreams.ch.androdns@16?download_url=https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/androdns.android.leetdreams.ch.androdns", - "repository_download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_16.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/androdns.android.leetdreams.ch.androdns@16" @@ -6147,11 +6231,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15_src.tar.gz", - "size":111953, + "download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15.apk", + "size":1971108, "sha1":null, "md5":null, - "sha256":"df576b5e63d85b4d1801eb4814e53d9c2d89a836306dc7e9b639b44ef623786a", + "sha256":"cb30ddf5e59b91b938271983d415656de6e370f23cbdf633786036831ca3d3db", "sha512":null, "bug_tracking_url":"https://github.com/gryphius/androdns/issues", "code_view_url":null, @@ -6189,14 +6273,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/androdns.android.leetdreams.ch.androdns@15?download_url=https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/androdns.android.leetdreams.ch.androdns", - "repository_download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/androdns.android.leetdreams.ch.androdns_15.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/androdns.android.leetdreams.ch.androdns@15" @@ -6301,11 +6387,11 @@ "System" ], "homepage_url":"https://github.com/antlersoft/android-vnc-viewer/wiki/Documentation", - "download_url":"https://f-droid.org/repo/android.androidVNC_13_src.tar.gz", - "size":400409, + "download_url":"https://f-droid.org/repo/android.androidVNC_13.apk", + "size":243294, "sha1":null, "md5":null, - "sha256":"6a27130023302f7aa0974ceac9c9c9b2439b644906269d73042210ba4fbb63ac", + "sha256":"eb2682f9ab9ccc5926d1bd504995af31ce15c14b7810efd3f88ab9c3acefa4f9", "sha512":null, "bug_tracking_url":"https://github.com/antlersoft/android-vnc-viewer/issues", "code_view_url":null, @@ -6343,14 +6429,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.androidVNC@13?download_url=https://f-droid.org/repo/android.androidVNC_13_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.androidVNC", - "repository_download_url":"https://f-droid.org/repo/android.androidVNC_13_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.androidVNC_13.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.androidVNC@13" @@ -6439,11 +6527,11 @@ "Games" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/android.game.prboom_31_src.tar.gz", - "size":6292542, 
+ "download_url":"https://f-droid.org/repo/android.game.prboom_31.apk", + "size":883589, "sha1":null, "md5":null, - "sha256":"eb7058ec653b4884b18c9c9c918764820fb1d11b28d86c3f72cc2bfb18bb4ffd", + "sha256":"0a88b31c5cc465d83fc77703adddf3f4769af32a9c6505b636ae062a7d351bc0", "sha512":null, "bug_tracking_url":null, "code_view_url":null, @@ -6481,14 +6569,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.game.prboom@31?download_url=https://f-droid.org/repo/android.game.prboom_31_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.game.prboom", - "repository_download_url":"https://f-droid.org/repo/android.game.prboom_31_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.game.prboom_31.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.game.prboom@31" @@ -6593,11 +6683,11 @@ "System" ], "homepage_url":"https://fakestandby.jonasbernard.de/", - "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_11_src.tar.gz", - "size":10393451, + "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_11.apk", + "size":1695060, "sha1":null, "md5":null, - "sha256":"9c3ebdd1a733ac2d77106ce273f725549e87dd09dbf9d7d7b45828ad6ec91225", + "sha256":"1621370f48a2ad0a41a3ef528896c3c8c6fa169be9c7deba49d9ca642fbcb887", "sha512":null, "bug_tracking_url":"https://github.com/JonasBernard/FakeStandby/issues", "code_view_url":null, @@ -6635,14 +6725,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.jonas.fakestandby@11?download_url=https://f-droid.org/repo/android.jonas.fakestandby_11_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.jonas.fakestandby", - "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_11_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_11.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.jonas.fakestandby@11" @@ -6747,11 +6839,11 @@ "System" ], "homepage_url":"https://fakestandby.jonasbernard.de/", - "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_10_src.tar.gz", - "size":10390397, + "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_10.apk", + "size":1703252, "sha1":null, "md5":null, - "sha256":"7b5a6d6a6611e2012ea42de0faccac07bf8d177729aa746c5e91c2d19d663fd2", + "sha256":"68782ca0f0fcd686dd3854879bbee3a2072dac798352263956e6079f46129b0a", "sha512":null, "bug_tracking_url":"https://github.com/JonasBernard/FakeStandby/issues", "code_view_url":null, @@ -6789,14 +6881,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.jonas.fakestandby@10?download_url=https://f-droid.org/repo/android.jonas.fakestandby_10_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.jonas.fakestandby", - 
"repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_10_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_10.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.jonas.fakestandby@10" @@ -6901,11 +6995,11 @@ "System" ], "homepage_url":"https://fakestandby.jonasbernard.de/", - "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_9_src.tar.gz", - "size":10388620, + "download_url":"https://f-droid.org/repo/android.jonas.fakestandby_9.apk", + "size":3218772, "sha1":null, "md5":null, - "sha256":"bc61d8b67d2dcf1c50c47a69b7a262e6161baec4767fb4293bb40f973e86a63b", + "sha256":"26ca0a5bc1fc7abf92640c8ab2b92a7bd8ab5b6ecc8d3586aaf3222c110d4201", "sha512":null, "bug_tracking_url":"https://github.com/JonasBernard/FakeStandby/issues", "code_view_url":null, @@ -6943,14 +7037,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.jonas.fakestandby@9?download_url=https://f-droid.org/repo/android.jonas.fakestandby_9_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.jonas.fakestandby", - "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_9_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.jonas.fakestandby_9.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.jonas.fakestandby@9" @@ -7041,11 +7137,11 @@ "Reading" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_5_src.tar.gz", - "size":79269, + "download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_5.apk", + "size":8730238, "sha1":null, "md5":null, - "sha256":"d8980a562d74247599e4e0ebfc6d790efc21a701fc23a687c0fc9d6af6a1f11c", + "sha256":"209b6119126a78aa2b529a0b4a340bb61ab841e4de67015388bf5b852f59d2cc", "sha512":null, "bug_tracking_url":"https://github.com/NachiketaVadera/EBookDownloader/issues", "code_view_url":null, @@ -7083,14 +7179,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/android.nachiketa.ebookdownloader@5?download_url=https://f-droid.org/repo/android.nachiketa.ebookdownloader_5_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/android.nachiketa.ebookdownloader", - "repository_download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_5_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_5.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/android.nachiketa.ebookdownloader@5" @@ -7181,11 +7279,11 @@ "Reading" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_4_src.tar.gz", - "size":79036, + "download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_4.apk", + "size":1669940, "sha1":null, "md5":null, - "sha256":"af48fce72d27dd111025d0e55eb4a1bafbefb709a86ce7afb22f351b1c12e31d", + "sha256":"ae5b380411a2c4333ec7865113d8b19fb2431800741eb0416784281e7113ba1a", "sha512":null, "bug_tracking_url":"https://github.com/NachiketaVadera/EBookDownloader/issues", "code_view_url":null, @@ -7223,14 
@@ -7223,14 +7321,16 @@
       "other_license_detections":[],
       "extracted_license_statement":"MIT",
       "notice_text":null,
-      "source_packages":[],
+      "source_packages":[
+         "pkg:fdroid/android.nachiketa.ebookdownloader@4?download_url=https://f-droid.org/repo/android.nachiketa.ebookdownloader_4_src.tar.gz"
+      ],
       "file_references":[],
       "is_private":false,
       "is_virtual":false,
       "extra_data":{},
       "dependencies":[],
       "repository_homepage_url":"https://f-droid.org/en/packages/android.nachiketa.ebookdownloader",
-      "repository_download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_4_src.tar.gz",
+      "repository_download_url":"https://f-droid.org/repo/android.nachiketa.ebookdownloader_4.apk",
       "api_data_url":null,
       "datasource_id":null,
       "purl":"pkg:fdroid/android.nachiketa.ebookdownloader@4"
@@ -7319,11 +7419,11 @@
          "Graphics"
       ],
       "homepage_url":null,
-      "download_url":"https://f-droid.org/repo/anupam.acrylic_19_src.tar.gz",
-      "size":717863,
+      "download_url":"https://f-droid.org/repo/anupam.acrylic_19.apk",
+      "size":542178,
       "sha1":null,
       "md5":null,
-      "sha256":"8d7c23bab765ed1c765feeff5c6f357c06775224fa40a1d7da58b3be2668eca5",
+      "sha256":"df01309e3641fac77cd9bd356558e122e31f1317f988dfb4144ebad949e0ac84",
       "sha512":null,
       "bug_tracking_url":"https://github.com/valerio-bozzolan/AcrylicPaint/issues",
       "code_view_url":null,
@@ -7361,14 +7461,16 @@
       "other_license_detections":[],
       "extracted_license_statement":"GPL-3.0-or-later",
       "notice_text":null,
-      "source_packages":[],
+      "source_packages":[
+         "pkg:fdroid/anupam.acrylic@19?download_url=https://f-droid.org/repo/anupam.acrylic_19_src.tar.gz"
+      ],
       "file_references":[],
       "is_private":false,
       "is_virtual":false,
       "extra_data":{},
       "dependencies":[],
       "repository_homepage_url":"https://f-droid.org/en/packages/anupam.acrylic",
-      "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_19_src.tar.gz",
+      "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_19.apk",
       "api_data_url":null,
       "datasource_id":null,
       "purl":"pkg:fdroid/anupam.acrylic@19"
@@ -7457,11 +7559,11 @@
          "Graphics"
       ],
       "homepage_url":null,
-      "download_url":"https://f-droid.org/repo/anupam.acrylic_18_src.tar.gz",
-      "size":717113,
+      "download_url":"https://f-droid.org/repo/anupam.acrylic_18.apk",
+      "size":540073,
       "sha1":null,
       "md5":null,
-      "sha256":"5ea3a0c4a0ec2a1f771debbd3dcc5287274205afca6ba7c5f4dfcd42f39250a1",
+      "sha256":"b014d0a5febd73ee883b69f1054c015d222003559332a7b489d0fd1e49eca408",
       "sha512":null,
       "bug_tracking_url":"https://github.com/valerio-bozzolan/AcrylicPaint/issues",
       "code_view_url":null,
@@ -7499,14 +7601,16 @@
       "other_license_detections":[],
       "extracted_license_statement":"GPL-3.0-or-later",
       "notice_text":null,
-      "source_packages":[],
+      "source_packages":[
+         "pkg:fdroid/anupam.acrylic@18?download_url=https://f-droid.org/repo/anupam.acrylic_18_src.tar.gz"
+      ],
       "file_references":[],
       "is_private":false,
       "is_virtual":false,
       "extra_data":{},
       "dependencies":[],
       "repository_homepage_url":"https://f-droid.org/en/packages/anupam.acrylic",
-      "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_18_src.tar.gz",
+      "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_18.apk",
       "api_data_url":null,
       "datasource_id":null,
       "purl":"pkg:fdroid/anupam.acrylic@18"
@@ -7595,11 +7699,11 @@
          "Graphics"
       ],
       "homepage_url":null,
-      "download_url":"https://f-droid.org/repo/anupam.acrylic_17_src.tar.gz",
-      "size":460693,
+      "download_url":"https://f-droid.org/repo/anupam.acrylic_17.apk",
+      "size":461248,
       "sha1":null,
       "md5":null,
"sha256":"b561e4348ab05b5f8094181f31d99d664496759680b2afd393ad5a74d511eeab", + "sha256":"b06660fd279f443c74aa35b2fc79bd1689f04fb8a23a892d4cf73340de8a261e", "sha512":null, "bug_tracking_url":"https://github.com/valerio-bozzolan/AcrylicPaint/issues", "code_view_url":null, @@ -7637,14 +7741,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/anupam.acrylic@17?download_url=https://f-droid.org/repo/anupam.acrylic_17_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/anupam.acrylic", - "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_17_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/anupam.acrylic_17.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/anupam.acrylic@17" @@ -7751,11 +7857,11 @@ "Multimedia" ], "homepage_url":"https://github.com/alextran1502/immich", - "download_url":"https://f-droid.org/repo/app.alextran.immich_54_src.tar.gz", - "size":47553968, + "download_url":"https://f-droid.org/repo/app.alextran.immich_54.apk", + "size":57002798, "sha1":null, "md5":null, - "sha256":"ca85cdb973ac46acc20b375226d7227d6010f9c8dfaafd962903181833d4cd96", + "sha256":"b6d1717613cbeb60cbf342b5543f009f9a04008fc9999ad34dc3ba87909d24be", "sha512":null, "bug_tracking_url":"https://github.com/alextran1502/immich/issues", "code_view_url":null, @@ -7793,14 +7899,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.alextran.immich@54?download_url=https://f-droid.org/repo/app.alextran.immich_54_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.alextran.immich", - "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_54_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_54.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.alextran.immich@54" @@ -7907,11 +8015,11 @@ "Multimedia" ], "homepage_url":"https://github.com/alextran1502/immich", - "download_url":"https://f-droid.org/repo/app.alextran.immich_53_src.tar.gz", - "size":47547734, + "download_url":"https://f-droid.org/repo/app.alextran.immich_53.apk", + "size":56920878, "sha1":null, "md5":null, - "sha256":"5e348ab1d8d101ae1b844d8a58b8b6da89ad41933eaf9a5110ab8ff78d765d61", + "sha256":"bd22db2ef4364a84d03b90f0d1770015fd9d500728311e2caa7adb1c8fed58a2", "sha512":null, "bug_tracking_url":"https://github.com/alextran1502/immich/issues", "code_view_url":null, @@ -7949,14 +8057,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.alextran.immich@53?download_url=https://f-droid.org/repo/app.alextran.immich_53_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.alextran.immich", - "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_53_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_53.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.alextran.immich@53" @@ -8063,11 
@@ -8063,11 +8173,11 @@
          "Multimedia"
       ],
       "homepage_url":"https://github.com/alextran1502/immich",
-      "download_url":"https://f-droid.org/repo/app.alextran.immich_52_src.tar.gz",
-      "size":45165676,
+      "download_url":"https://f-droid.org/repo/app.alextran.immich_52.apk",
+      "size":56757038,
       "sha1":null,
       "md5":null,
-      "sha256":"173a3e7accf415037e20179f0ceb6053f12080eef63e9ab02daca277c7404b9e",
+      "sha256":"410b2fa7495cb7c8e2b6ba6b296f629e2ea2ca00524e47811339924711ac4ffc",
       "sha512":null,
       "bug_tracking_url":"https://github.com/alextran1502/immich/issues",
       "code_view_url":null,
@@ -8105,14 +8215,16 @@
       "other_license_detections":[],
       "extracted_license_statement":"MIT",
       "notice_text":null,
-      "source_packages":[],
+      "source_packages":[
+         "pkg:fdroid/app.alextran.immich@52?download_url=https://f-droid.org/repo/app.alextran.immich_52_src.tar.gz"
+      ],
       "file_references":[],
       "is_private":false,
       "is_virtual":false,
       "extra_data":{},
       "dependencies":[],
       "repository_homepage_url":"https://f-droid.org/en/packages/app.alextran.immich",
-      "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_52_src.tar.gz",
+      "repository_download_url":"https://f-droid.org/repo/app.alextran.immich_52.apk",
       "api_data_url":null,
       "datasource_id":null,
       "purl":"pkg:fdroid/app.alextran.immich@52"
@@ -8201,11 +8313,11 @@
          "Money"
       ],
       "homepage_url":"https://crescent.cash/",
-      "download_url":"https://f-droid.org/repo/app.crescentcash.src_120_src.tar.gz",
-      "size":12692630,
+      "download_url":"https://f-droid.org/repo/app.crescentcash.src_120.apk",
+      "size":9839433,
       "sha1":null,
       "md5":null,
-      "sha256":"e9e21882e7f29dab34dfb53697f0d43866819c37b2305fbdd6ac0568e2fd09b4",
+      "sha256":"a02b64c18d0ec43adfe87686551416a4c68f36db3a25122fc474797eb4880aee",
       "sha512":null,
       "bug_tracking_url":"https://gitlab.com/pokkst/crescentcash/issues",
       "code_view_url":null,
@@ -8243,14 +8355,16 @@
       "other_license_detections":[],
       "extracted_license_statement":"MIT",
       "notice_text":null,
-      "source_packages":[],
+      "source_packages":[
+         "pkg:fdroid/app.crescentcash.src@120?download_url=https://f-droid.org/repo/app.crescentcash.src_120_src.tar.gz"
+      ],
       "file_references":[],
       "is_private":false,
       "is_virtual":false,
       "extra_data":{},
       "dependencies":[],
       "repository_homepage_url":"https://f-droid.org/en/packages/app.crescentcash.src",
-      "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_120_src.tar.gz",
+      "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_120.apk",
       "api_data_url":null,
       "datasource_id":null,
       "purl":"pkg:fdroid/app.crescentcash.src@120"
@@ -8339,11 +8453,11 @@
          "Money"
       ],
       "homepage_url":"https://crescent.cash/",
-      "download_url":"https://f-droid.org/repo/app.crescentcash.src_118_src.tar.gz",
-      "size":12652634,
+      "download_url":"https://f-droid.org/repo/app.crescentcash.src_118.apk",
+      "size":9837377,
       "sha1":null,
       "md5":null,
-      "sha256":"bd4d9664749a285a16fa6436311640f45745443d6c1aa9acd8d5f256e62cd822",
+      "sha256":"04b4db2625a90d7f413e963d4509342bbc870479700764b2a53cd87ad875b3f8",
       "sha512":null,
       "bug_tracking_url":"https://gitlab.com/pokkst/crescentcash/issues",
       "code_view_url":null,
@@ -8381,14 +8495,16 @@
       "other_license_detections":[],
       "extracted_license_statement":"MIT",
       "notice_text":null,
-      "source_packages":[],
+      "source_packages":[
+         "pkg:fdroid/app.crescentcash.src@118?download_url=https://f-droid.org/repo/app.crescentcash.src_118_src.tar.gz"
+      ],
       "file_references":[],
       "is_private":false,
       "is_virtual":false,
       "extra_data":{},
       "dependencies":[],
"repository_homepage_url":"https://f-droid.org/en/packages/app.crescentcash.src", - "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_118_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_118.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crescentcash.src@118" @@ -8477,11 +8593,11 @@ "Money" ], "homepage_url":"https://crescent.cash/", - "download_url":"https://f-droid.org/repo/app.crescentcash.src_117_src.tar.gz", - "size":12675494, + "download_url":"https://f-droid.org/repo/app.crescentcash.src_117.apk", + "size":9802865, "sha1":null, "md5":null, - "sha256":"bdb15b54aaef02ac84ab51508936297c2825f9a2735c59db39a945b71e2bb83b", + "sha256":"b376c41a96ac242da44944db44ada51f99b6818191b7f50ae0d44c97d8bae3af", "sha512":null, "bug_tracking_url":"https://gitlab.com/pokkst/crescentcash/issues", "code_view_url":null, @@ -8519,14 +8635,16 @@ "other_license_detections":[], "extracted_license_statement":"MIT", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crescentcash.src@117?download_url=https://f-droid.org/repo/app.crescentcash.src_117_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crescentcash.src", - "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_117_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crescentcash.src_117.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crescentcash.src@117" @@ -8615,11 +8733,11 @@ "Games" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000_src.tar.gz", - "size":1912837, + "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000.apk", + "size":5435991, "sha1":null, "md5":null, - "sha256":"4ab9f2bfbd621ef624b11f22e23ad531d57a21c383bd1331f451b0bf5a517952", + "sha256":"2ae2f52f2487751b4519fa1670b282c974fa9c8544a621793c94f712106e849e", "sha512":null, "bug_tracking_url":"https://github.com/yourealwaysbe/forkyz/issues", "code_view_url":null, @@ -8657,14 +8775,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3200000?download_url=https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crossword.yourealwaysbe.forkyz", - "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3200000.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3200000" @@ -8753,11 +8873,11 @@ "Games" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000_src.tar.gz", - "size":1480726, + "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000.apk", + "size":5427462, "sha1":null, "md5":null, - "sha256":"2f73a7f95bf303c8000fec515d69a67f7fcfe4bf73ab745db551dcfd29f9fe0a", + "sha256":"a815d9aa7c680402d76b9c8fe291fc74f564f3e03bb170dc68c6275d3c3c7500", "sha512":null, 
"bug_tracking_url":"https://github.com/yourealwaysbe/forkyz/issues", "code_view_url":null, @@ -8795,14 +8915,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3100000?download_url=https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crossword.yourealwaysbe.forkyz", - "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3100000.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3100000" @@ -8891,11 +9013,11 @@ "Games" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000_src.tar.gz", - "size":1480460, + "download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000.apk", + "size":5427462, "sha1":null, "md5":null, - "sha256":"31cd5689b9e2fd0aab497226b946e3ee799a10c995f13f3a5e2977891655baa1", + "sha256":"079de513de4ac49eee95bc21f03a15e732f4822dbb054d3a958bdd5269e93680", "sha512":null, "bug_tracking_url":"https://github.com/yourealwaysbe/forkyz/issues", "code_view_url":null, @@ -8933,14 +9055,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3000000?download_url=https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.crossword.yourealwaysbe.forkyz", - "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.crossword.yourealwaysbe.forkyz_3000000.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.crossword.yourealwaysbe.forkyz@3000000" @@ -9029,11 +9153,11 @@ "Security" ], "homepage_url":"http://forum.xda-developers.com/android/apps-games/app-easy-token-source-securid-token-t2805507", - "download_url":"https://f-droid.org/repo/app.easytoken_919_src.tar.gz", - "size":10755431, + "download_url":"https://f-droid.org/repo/app.easytoken_919.apk", + "size":1024784, "sha1":null, "md5":null, - "sha256":"c251369d27760a347b5282ff92cb5d6c40a1e2d40e3cda464ef2e30f3e9c2478", + "sha256":"4ea7fe623d6e3920f920335191cd23adaf2533d5bd10cef10a7a5a80473da79e", "sha512":null, "bug_tracking_url":"https://github.com/cernekee/EasyToken/issues", "code_view_url":null, @@ -9071,14 +9195,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.easytoken@919?download_url=https://f-droid.org/repo/app.easytoken_919_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.easytoken", - "repository_download_url":"https://f-droid.org/repo/app.easytoken_919_src.tar.gz", + 
"repository_download_url":"https://f-droid.org/repo/app.easytoken_919.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.easytoken@919" @@ -9167,11 +9293,11 @@ "Security" ], "homepage_url":"http://forum.xda-developers.com/android/apps-games/app-easy-token-source-securid-token-t2805507", - "download_url":"https://f-droid.org/repo/app.easytoken_909_src.tar.gz", - "size":8025551, + "download_url":"https://f-droid.org/repo/app.easytoken_909.apk", + "size":809099, "sha1":null, "md5":null, - "sha256":"12c9383926b5f9f0d1e4df2efacce6cfdf1515dfdb07e32b902802ddd938c53f", + "sha256":"4d688551eec4e75e6ac3469ab220f4c3d3e40e2254c9916f31fecdb1f04cb27e", "sha512":null, "bug_tracking_url":"https://github.com/cernekee/EasyToken/issues", "code_view_url":null, @@ -9209,14 +9335,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.easytoken@909?download_url=https://f-droid.org/repo/app.easytoken_909_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.easytoken", - "repository_download_url":"https://f-droid.org/repo/app.easytoken_909_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.easytoken_909.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.easytoken@909" @@ -9305,11 +9433,11 @@ "Security" ], "homepage_url":"http://forum.xda-developers.com/android/apps-games/app-easy-token-source-securid-token-t2805507", - "download_url":"https://f-droid.org/repo/app.easytoken_819_src.tar.gz", - "size":8018460, + "download_url":"https://f-droid.org/repo/app.easytoken_819.apk", + "size":804978, "sha1":null, "md5":null, - "sha256":"48910c8fe3e98d6899b11eeafcde96ad30be61af8dda366d0bd1fb41f35b6854", + "sha256":"f0594a80c5dc64f63cea7009683453ddcd65485dbbbbef8d84e4281e44f0e2e2", "sha512":null, "bug_tracking_url":"https://github.com/cernekee/EasyToken/issues", "code_view_url":null, @@ -9347,14 +9475,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-2.0-or-later", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.easytoken@819?download_url=https://f-droid.org/repo/app.easytoken_819_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.easytoken", - "repository_download_url":"https://f-droid.org/repo/app.easytoken_819_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.easytoken_819.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.easytoken@819" @@ -9459,11 +9589,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.fedilab.fedilabtube_45_src.tar.gz", - "size":6582330, + "download_url":"https://f-droid.org/repo/app.fedilab.fedilabtube_45.apk", + "size":12287923, "sha1":null, "md5":null, - "sha256":"ca2fe5033881b074537c01a06ccb31c9dd319010c035d62d4f710fd927ae1452", + "sha256":"bd3f683f838f1fab45192f14aec98379fb3d36960b917b9f2cb7a7e81c6481c8", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/fedilab-tube/issues", "code_view_url":null, @@ -9501,14 +9631,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + 
"pkg:fdroid/app.fedilab.fedilabtube@45?download_url=https://f-droid.org/repo/app.fedilab.fedilabtube_45_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.fedilabtube", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.fedilabtube_45_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.fedilabtube_45.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.fedilabtube@45" @@ -9613,11 +9745,11 @@ "Internet" ], "homepage_url":"https://framagit.org/tom79/mobilizon-android-app", - "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_3_src.tar.gz", - "size":1012014, + "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_3.apk", + "size":3037446, "sha1":null, "md5":null, - "sha256":"62518770423befc370c46d68a9ada76b10cd8cf023f122e638eda2f0e3d69100", + "sha256":"9b7a3d5efee6a925becc2dfbe3a2b2a0e7c1841609c0cbab2481d9557ee3ed4e", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/mobilizon-android-app/issues", "code_view_url":null, @@ -9655,14 +9787,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.mobilizon@3?download_url=https://f-droid.org/repo/app.fedilab.mobilizon_3_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.mobilizon", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_3_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_3.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.mobilizon@3" @@ -9767,11 +9901,11 @@ "Internet" ], "homepage_url":"https://framagit.org/tom79/mobilizon-android-app", - "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_2_src.tar.gz", - "size":999588, + "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_2.apk", + "size":3021062, "sha1":null, "md5":null, - "sha256":"1e52ad2425d1d375d33909e9000088c654ac0e9636e3f52212a2cfaf7907d72f", + "sha256":"defe55d3423cd24a0830090fdfa1ca1f6edfa02bea79a123b7f9635bff92b726", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/mobilizon-android-app/issues", "code_view_url":null, @@ -9809,14 +9943,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.mobilizon@2?download_url=https://f-droid.org/repo/app.fedilab.mobilizon_2_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.mobilizon", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_2_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_2.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.mobilizon@2" @@ -9921,11 +10057,11 @@ "Internet" ], "homepage_url":"https://framagit.org/tom79/mobilizon-android-app", - "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_1_src.tar.gz", - "size":997708, + "download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_1.apk", + "size":3021062, "sha1":null, "md5":null, - 
"sha256":"f1413b554480c0fb9088944a57c01fdeb9b8cd23b05271e7c28fba734f40358c", + "sha256":"84e17b1b523153a91808e16011d0a6d6d9c6061bdb398756d4d56c9ce339b8e1", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/mobilizon-android-app/issues", "code_view_url":null, @@ -9963,14 +10099,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.mobilizon@1?download_url=https://f-droid.org/repo/app.fedilab.mobilizon_1_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.mobilizon", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_1_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.mobilizon_1.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.mobilizon@1" @@ -10059,11 +10197,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_33_src.tar.gz", - "size":1096069, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_33.apk", + "size":3714928, "sha1":null, "md5":null, - "sha256":"7f151a6aecae7fe1e4de472425b658a2de18d19c1dc44fe784e18cb80d39fe17", + "sha256":"bcf251559ee4777f26a9e26b403b1606365893ef96df5edcfcca47a71c377361", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10101,14 +10239,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizeme@33?download_url=https://f-droid.org/repo/app.fedilab.nitterizeme_33_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizeme", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_33_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_33.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizeme@33" @@ -10197,11 +10337,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_32_src.tar.gz", - "size":1398878, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_32.apk", + "size":3732264, "sha1":null, "md5":null, - "sha256":"217889ed1119e7fc862711a2c5b80f9767176866ac507560cd771f750b3cb131", + "sha256":"80608edebc87f3967dbcc14263b0ddbbac31d389970602049f81ea38e247aa82", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10239,14 +10379,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizeme@32?download_url=https://f-droid.org/repo/app.fedilab.nitterizeme_32_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizeme", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_32_src.tar.gz", + 
"repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_32.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizeme@32" @@ -10335,11 +10477,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_31_src.tar.gz", - "size":3236208, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_31.apk", + "size":3678975, "sha1":null, "md5":null, - "sha256":"cfe56f1fa80ceca3058468ef400be55ef75ca76195c4b42d94949a0d3d958c44", + "sha256":"cd7dc93738eb5f7c7c0fa76c616e016ea2cec5bf9572b1f4fb3f24afed345cb6", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10377,14 +10519,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizeme@31?download_url=https://f-droid.org/repo/app.fedilab.nitterizeme_31_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizeme", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_31_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizeme_31.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizeme@31" @@ -10473,11 +10617,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_33_src.tar.gz", - "size":1096611, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_33.apk", + "size":3714928, "sha1":null, "md5":null, - "sha256":"43a03cbdc9c012f2e28717855f0ee18afcd80d618ae3f37e6ac5f12efa906711", + "sha256":"30ae40611f5fbbce0772e3c26d4103d98d62ed22b7dd83736219ace083b1bc46", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10515,14 +10659,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizemelite@33?download_url=https://f-droid.org/repo/app.fedilab.nitterizemelite_33_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizemelite", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_33_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_33.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizemelite@33" @@ -10611,11 +10757,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_32_src.tar.gz", - "size":1399379, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_32.apk", + "size":3732264, "sha1":null, "md5":null, - "sha256":"e199a466c8d1f3ed771b1803123397d54863bb3261e7ef89a45dd3d6c19c06d5", + "sha256":"7c8135e64e6d4e3558dc870f459cdf159c31166799f17d85809b528f1d55feda", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10653,14 +10799,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", 
"notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizemelite@32?download_url=https://f-droid.org/repo/app.fedilab.nitterizemelite_32_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizemelite", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_32_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_32.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizemelite@32" @@ -10749,11 +10897,11 @@ "Internet" ], "homepage_url":"https://fedilab.app/wiki/nitterizeme/", - "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_31_src.tar.gz", - "size":3236897, + "download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_31.apk", + "size":3674879, "sha1":null, "md5":null, - "sha256":"eb79f7871c4bf8ee0c0c6a97b6d86065e0de19bb1d0519f6bff6874ed1e226f6", + "sha256":"3564df4741d8774abffa119d8daea6f6f6f1eef0cd3900cecf85274970105f24", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/nitterizeme/issues", "code_view_url":null, @@ -10791,14 +10939,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.nitterizemelite@31?download_url=https://f-droid.org/repo/app.fedilab.nitterizemelite_31_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.nitterizemelite", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_31_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.nitterizemelite_31.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.nitterizemelite@31" @@ -10887,11 +11037,11 @@ "Navigation" ], "homepage_url":"https://framagit.org/tom79/openmaps", - "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_13_src.tar.gz", - "size":3417207, + "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_13.apk", + "size":3828282, "sha1":null, "md5":null, - "sha256":"89bd8da46c986b9633c8de48bb61f468021cb4036286dd192ae0eeabc933750b", + "sha256":"f8f74eb4efd87d25113e960af3f1469839042fbe4da0eea0075290944cc5fb53", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/openmaps/issues", "code_view_url":null, @@ -10929,14 +11079,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.openmaps@13?download_url=https://f-droid.org/repo/app.fedilab.openmaps_13_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.openmaps", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_13_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_13.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.openmaps@13" @@ -11025,11 +11177,11 @@ "Navigation" ], "homepage_url":"https://framagit.org/tom79/openmaps", - "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_12_src.tar.gz", - "size":3416732, + 
"download_url":"https://f-droid.org/repo/app.fedilab.openmaps_12.apk", + "size":3717905, "sha1":null, "md5":null, - "sha256":"f0eb93ca63d5c4f23e7a403f1d79906ff578451c11d11e084ab680eb604b76a2", + "sha256":"f0e53ba598a761609dcd5dc2a2a2b733b8a8a51611f8a9d1dbc0a777ae339904", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/openmaps/issues", "code_view_url":null, @@ -11067,14 +11219,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.openmaps@12?download_url=https://f-droid.org/repo/app.fedilab.openmaps_12_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.openmaps", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_12_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_12.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.openmaps@12" @@ -11163,11 +11317,11 @@ "Navigation" ], "homepage_url":"https://framagit.org/tom79/openmaps", - "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_11_src.tar.gz", - "size":3407906, + "download_url":"https://f-droid.org/repo/app.fedilab.openmaps_11.apk", + "size":3295949, "sha1":null, "md5":null, - "sha256":"0f5d6fb9f8eeae672fdb73b1c3af613542e56f3a936e8942aa7fe7ebebd53441", + "sha256":"ae76fbd1ee331c9d7c2a734a5b2f88fe38c8850ce4351ef7ae70550f3caa05ee", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/openmaps/issues", "code_view_url":null, @@ -11205,14 +11359,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.openmaps@11?download_url=https://f-droid.org/repo/app.fedilab.openmaps_11_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.openmaps", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_11_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.fedilab.openmaps_11.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.openmaps@11" @@ -11317,11 +11473,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.fedilab.tubelab_45_src.tar.gz", - "size":6581087, + "download_url":"https://f-droid.org/repo/app.fedilab.tubelab_45.apk", + "size":12043946, "sha1":null, "md5":null, - "sha256":"a114d34c5f6066ed3045e4b6096119ab715760cc371e9ace8f532196aec07fe7", + "sha256":"74987196d05f1003732a4cb99d1237d0b1a4831de5f828988d621b0d9a7a6d81", "sha512":null, "bug_tracking_url":"https://framagit.org/tom79/fedilab-tube/issues", "code_view_url":null, @@ -11359,14 +11515,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.fedilab.tubelab@45?download_url=https://f-droid.org/repo/app.fedilab.tubelab_45_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.fedilab.tubelab", - "repository_download_url":"https://f-droid.org/repo/app.fedilab.tubelab_45_src.tar.gz", + 
"repository_download_url":"https://f-droid.org/repo/app.fedilab.tubelab_45.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.fedilab.tubelab@45" @@ -11471,11 +11629,11 @@ "Internet" ], "homepage_url":null, - "download_url":"https://f-droid.org/repo/app.intra_64_src.tar.gz", - "size":1329599, + "download_url":"https://f-droid.org/repo/app.intra_64.apk", + "size":12590857, "sha1":null, "md5":null, - "sha256":"8449121f840d7208363c1bb39daea26afeea7317cd708b85c3b006e7db76fe83", + "sha256":"eb19b25591e54c6e3718e9810d38d42f2883df3a834ca511c05532bdf45dbbf3", "sha512":null, "bug_tracking_url":"https://github.com/Jigsaw-Code/Intra/issues", "code_view_url":null, @@ -11513,14 +11671,16 @@ "other_license_detections":[], "extracted_license_statement":"Apache-2.0", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.intra@64?download_url=https://f-droid.org/repo/app.intra_64_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.intra", - "repository_download_url":"https://f-droid.org/repo/app.intra_64_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.intra_64.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.intra@64" @@ -11609,11 +11769,11 @@ "Reading" ], "homepage_url":"https://librenews.io/", - "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_5_src.tar.gz", - "size":314767, + "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_5.apk", + "size":2240436, "sha1":null, "md5":null, - "sha256":"f01e5322c0161259310186612b98612955b0045b4fc07b9d512fb3bc76451281", + "sha256":"cad1fef2b79f7df832a27dbf991e70c83b1639eaa678c177a250e32e74fc52db", "sha512":null, "bug_tracking_url":"https://github.com/milesmcc/LibreNews-Android/issues", "code_view_url":null, @@ -11651,14 +11811,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.librenews.io.librenews@5?download_url=https://f-droid.org/repo/app.librenews.io.librenews_5_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.librenews.io.librenews", - "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_5_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_5.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.librenews.io.librenews@5" @@ -11747,11 +11909,11 @@ "Reading" ], "homepage_url":"https://librenews.io/", - "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_4_src.tar.gz", - "size":314245, + "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_4.apk", + "size":2239563, "sha1":null, "md5":null, - "sha256":"21e79d2f2aad88aa10d0620dcdb1ee0eb9ffd9830fd7e9d5120e2a1943f5cab2", + "sha256":"856fa473c9f9aed97d9f4122a853288ceca3cc3ccceb33dd29cb0f21c706729b", "sha512":null, "bug_tracking_url":"https://github.com/milesmcc/LibreNews-Android/issues", "code_view_url":null, @@ -11789,14 +11951,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.librenews.io.librenews@4?download_url=https://f-droid.org/repo/app.librenews.io.librenews_4_src.tar.gz" + ], 
"file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.librenews.io.librenews", - "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_4_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_4.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.librenews.io.librenews@4" @@ -11885,11 +12049,11 @@ "Reading" ], "homepage_url":"https://librenews.io/", - "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_3_src.tar.gz", - "size":310961, + "download_url":"https://f-droid.org/repo/app.librenews.io.librenews_3.apk", + "size":1993445, "sha1":null, "md5":null, - "sha256":"8aae95fde276065e50626f5a1d89beffffbe93d6c34502c0ae83106839a03b24", + "sha256":"506b3003a4de2d5eb956a0c995b22efee313c983fdffc5ca3bef940d8428caa6", "sha512":null, "bug_tracking_url":"https://github.com/milesmcc/LibreNews-Android/issues", "code_view_url":null, @@ -11927,14 +12091,16 @@ "other_license_detections":[], "extracted_license_statement":"GPL-3.0-only", "notice_text":null, - "source_packages":[], + "source_packages":[ + "pkg:fdroid/app.librenews.io.librenews@3?download_url=https://f-droid.org/repo/app.librenews.io.librenews_3_src.tar.gz" + ], "file_references":[], "is_private":false, "is_virtual":false, "extra_data":{}, "dependencies":[], "repository_homepage_url":"https://f-droid.org/en/packages/app.librenews.io.librenews", - "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_3_src.tar.gz", + "repository_download_url":"https://f-droid.org/repo/app.librenews.io.librenews_3.apk", "api_data_url":null, "datasource_id":null, "purl":"pkg:fdroid/app.librenews.io.librenews@3" diff --git a/minecode/utils.py b/minecode/utils.py index cbcd4d3e..64d38b2a 100644 --- a/minecode/utils.py +++ b/minecode/utils.py @@ -20,15 +20,12 @@ from django.utils.encoding import force_str import arrow -from arrow.parser import ParserError import requests -from requests.exceptions import InvalidSchema -from requests.exceptions import ConnectionError - +from arrow.parser import ParserError from commoncode.fileutils import create_dir from extractcode.extract import extract - -from minecode.management.commands import get_settings +from requests.exceptions import ConnectionError +from requests.exceptions import InvalidSchema logger = logging.getLogger(__name__) # import sys @@ -41,35 +38,30 @@ def stringify_null_purl_fields(data): Modify `data` in place by ensuring `purl` fields are not None. This is useful for cleaning data before saving to db. """ - purl_fields = ('type', 'namespace', 'name', - 'version', 'qualifiers', 'subpath') + purl_fields = ("type", "namespace", "name", "version", "qualifiers", "subpath") for field in purl_fields: try: if not data[field]: - data[field] = '' + data[field] = "" except KeyError: continue def sha1(content): - """ - Returns the sha1 hash of the given content. - """ + """Return the sha1 hash of the given content.""" h = hashlib.sha1() h.update(content) return h.hexdigest() def md5(content): - """ - Returns the md5 hash of the given content. - """ + """Return the md5 hash of the given content.""" h = hashlib.md5() h.update(content) return h.hexdigest() -class DataObject(object): +class DataObject: """ A data object, using attributes for storage and a to_dict method to get a dict back. 
@@ -90,40 +82,35 @@ def __getitem__(self, item): return self.__dict__.get(item) def __eq__(self, other): - return ( - self.to_dict(other.to_dict()) - ) + return self.to_dict(other.to_dict()) def normalize_trailing_slash(uri): - """ - Appends a trailing slash if the URI is not ending with one already. - """ - if not uri.endswith('/'): - uri += '/' + """Append a trailing slash if the URI is not ending with one already.""" + if not uri.endswith("/"): + uri += "/" return uri def is_ascii(s): - """ - Returns True is the string is ASCII. - """ + """Return True is the string is ASCII.""" return all(ord(c) < 128 for c in s) def clean_html_entities(text): - """ - Reverse of django.utils.html.escape - """ - return text.replace('&', '&').replace('<', '<').replace('>', '>')\ - .replace('"', '"').replace(''', "'") + """Reverse of django.utils.html.escape""" + return ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace(""", '"') + .replace("'", "'") + ) def clean_description(text): - """ - Cleans the description text from HTML entities and from extra whitespaces. - """ - return ' '.join(clean_html_entities(text.strip()).split()) + """Clean the description text from HTML entities and from extra whitespaces.""" + return " ".join(clean_html_entities(text.strip()).split()) def strip_nbsp(s): @@ -131,13 +118,13 @@ def strip_nbsp(s): Replace non breaking space HTML entities with regular space and strip the string. """ - return force_str(s).replace(' ', ' ').strip() + return force_str(s).replace(" ", " ").strip() -CR = '\r' -LF = '\n' +CR = "\r" +LF = "\n" CRLF = CR + LF -CRLF_NO_CR = ' ' + LF +CRLF_NO_CR = " " + LF def unixlinesep(text, preserve=False): @@ -167,7 +154,7 @@ def decode_fuzzy_date(s, _self=None): """ import dateutil - if hasattr(_self, 'testing'): + if hasattr(_self, "testing"): # fixed base date used only for testing for well defined date offsets base = arrow.get(2014, 2, 2) else: @@ -175,39 +162,36 @@ def decode_fuzzy_date(s, _self=None): base = arrow.utcnow() fuzzy = { - 'Last 30 days': -30, - 'Last 7 days': -7, - 'Today': 0, - 'Yesterday': -1, + "Last 30 days": -30, + "Last 7 days": -7, + "Today": 0, + "Yesterday": -1, } formats = [ - 'YYYY-MM-DD HH:mm:ss', - - 'MMM DD, YYYY', - 'MMM D, YYYY', - - 'ddd MMM D HH:mm:ss YYYY', - 'ddd MMM D H:mm:ss YYYY', - 'ddd MMM DD HH:mm:ss YYYY', - 'ddd MMM DD H:mm:ss YYYY', - 'dddd MMM D HH:mm:ss YYYY', - 'dddd MMM D H:mm:ss YYYY', - 'dddd MMM DD HH:mm:ss YYYY', - 'dddd MMM DD H:mm:ss YYYY', - - 'MM/DD/YYYY', + "YYYY-MM-DD HH:mm:ss", + "MMM DD, YYYY", + "MMM D, YYYY", + "ddd MMM D HH:mm:ss YYYY", + "ddd MMM D H:mm:ss YYYY", + "ddd MMM DD HH:mm:ss YYYY", + "ddd MMM DD H:mm:ss YYYY", + "dddd MMM D HH:mm:ss YYYY", + "dddd MMM D H:mm:ss YYYY", + "dddd MMM DD HH:mm:ss YYYY", + "dddd MMM DD H:mm:ss YYYY", + "MM/DD/YYYY", ] # normalize spaces - s = ' '.join(s.split()) - if s == 'Earlier this year': - ar = base.floor('year') + s = " ".join(s.split()) + if s == "Earlier this year": + ar = base.floor("year") elif s in fuzzy: ar = base.replace(days=fuzzy[s]) else: ar = arrow.get(s, formats) - ar = ar.replace(tzinfo=dateutil.tz.tzutc()).to('utc') # NOQA + ar = ar.replace(tzinfo=dateutil.tz.tzutc()).to("utc") # NOQA return ar.isoformat() @@ -224,24 +208,23 @@ def get_http_response(uri, timeout=10): Fetch and return the response object from an HTTP uri. `timeout` is a timeout with precedence over REQUESTS_ARGS settings. 
""" - requests_args = getattr(settings, 'REQUESTS_ARGS', {}) - requests_args['timeout'] = timeout + requests_args = getattr(settings, "REQUESTS_ARGS", {}) + requests_args["timeout"] = timeout - if not uri.lower().startswith('http'): - raise Exception( - 'get_http_response: Not an HTTP URI: %(uri)r' % locals()) + if not uri.lower().startswith("http"): + raise Exception(f"get_http_response: Not an HTTP URI: {uri}") try: response = requests.get(uri, **requests_args) - except (ConnectionError, InvalidSchema) as e: - logger.error( - 'get_http_response: Download failed for %(uri)r' % locals()) + except (ConnectionError, InvalidSchema): + logger.error(f"get_http_response: Download failed for {uri}") raise status = response.status_code if status != 200: - raise Exception('get_http_response: Download failed for %(uri)r ' - 'with %(status)r' % locals()) + raise Exception( + f"get_http_response: Download failed for {uri} " f"with {status}" + ) return response @@ -258,15 +241,15 @@ def get_package_sha1(package, field="repository_download_url"): # Download archive from URL and calculate sha1 response = requests.get(download_url) if response: - sha1_hash = hashlib.new('sha1', response.content) + sha1_hash = hashlib.new("sha1", response.content) sha1 = sha1_hash.hexdigest() return sha1 def fetch_and_write_file_from_url(url): """ - Fetches a file from the `url` and returns the location for the - temporary file. Return None if the url is not reachable. + Fetch a file from the `url` and return the location for the temporary file. + Return None if the url is not reachable. """ response = requests.get(url) if not response.ok: @@ -275,9 +258,8 @@ def fetch_and_write_file_from_url(url): metadata_content = response.text filename = url.split("/")[-1] file_name, _, extension = filename.rpartition(".") - temp_metadata_file = get_temp_file( - file_name=file_name, extension=extension) - with open(temp_metadata_file, 'a') as metadata_file: + temp_metadata_file = get_temp_file(file_name=file_name, extension=extension) + with open(temp_metadata_file, "a") as metadata_file: metadata_file.write(metadata_content) return temp_metadata_file @@ -290,24 +272,20 @@ def validate_sha1(sha1): Return `sha1` if it is valid, None otherwise. """ if sha1 and len(sha1) != 40: - logger.warning( - f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!' - ) + logger.warning(f'Invalid SHA1 length ({len(sha1)}): "{sha1}": SHA1 ignored!') sha1 = None return sha1 -def system_temp_dir(temp_dir=os.getenv('MINECODE_TMP')): - """ - Return the global temp directory.. - """ +def system_temp_dir(temp_dir=os.getenv("MINECODE_TMP")): + """Return the global temp directory..""" if not temp_dir: - temp_dir = os.path.join(tempfile.gettempdir(), 'minecode') + temp_dir = os.path.join(tempfile.gettempdir(), "minecode") create_dir(temp_dir) return temp_dir -def get_temp_dir(base_dir='', prefix=''): +def get_temp_dir(base_dir="", prefix=""): """ Return the path to base a new unique temporary directory, created under the system-wide `system_temp_dir` temp directory and as a subdir of the @@ -321,14 +299,14 @@ def get_temp_dir(base_dir='', prefix=''): return tempfile.mkdtemp(prefix=prefix, dir=base_dir) -def get_temp_file(file_name='data', extension='.file', dir_name=''): +def get_temp_file(file_name="data", extension=".file", dir_name=""): """ Return a file path string to a new, unique and non-existing temporary file that can safely be created without a risk of name collision. """ - if extension and not extension.startswith('.'): - extension = '.' 
+ extension + if extension and not extension.startswith("."): + extension = "." + extension file_name = file_name + extension # create a new temp dir each time @@ -338,9 +316,7 @@ def extract_file(location): - """ - Extract file at location returning the extracted location. - """ + """Extract file at location returning the extracted location.""" target = None try: for event in extract(location): @@ -350,18 +326,16 @@ target = event.target break except Exception as e: - logger.error('extract_file: failed for %(location)r' % locals()) + logger.error(f"extract_file: failed for {location}") raise e return target def parse_date(s): - """ - Return date string in YYYY-MM-DD format from a datetime string - """ + """Return date string in YYYY-MM-DD format from a datetime string""" if s: try: - return arrow.get(s).format('YYYY-MM-DD') + return arrow.get(s).format("YYYY-MM-DD") except ParserError: # If we can't parse a date, it's not a big deal as `release_date` # is not an important field for us @@ -369,8 +343,7 @@ def is_int(s): - """To test if the input para is a int - """ + """Test if the input is an int.""" try: int(s) return True @@ -384,12 +357,11 @@ def form_vcs_url(vcs_tool, vcs_url, revision_tag_or_branch=None, sub_path=None): # <vcs_tool>+<transport>://<host_name>[/<path_to_repository>][@<revision_tag_or_branch>][#<sub_path>] if vcs_url: if vcs_tool: - vcs_url = '+'.join(str(v) for v in [vcs_tool, vcs_url]) + vcs_url = "+".join(str(v) for v in [vcs_tool, vcs_url]) if revision_tag_or_branch: - vcs_url = '@'.join(str(v) - for v in [vcs_url, revision_tag_or_branch]) + vcs_url = "@".join(str(v) for v in [vcs_url, revision_tag_or_branch]) if sub_path: - vcs_url = '#'.join(str(v) for v in [vcs_url, sub_path]) + vcs_url = "#".join(str(v) for v in [vcs_url, sub_path]) return vcs_url @@ -402,7 +374,7 @@ def validate_uuid(uuid_string): # This is from https://stackoverflow.com/questions/4856882/limiting-memory-use-in-a-large-django-queryset/5188179#5188179 -class MemorySavingQuerysetIterator(object): +class MemorySavingQuerysetIterator: def __init__(self, queryset, max_obj_num=1000): self._base_queryset = queryset self._generator = self._setup() @@ -414,10 +386,10 @@ def _setup(self): # the objects we ensure that there are only `max_obj_num` objects in # memory at any given time smaller_queryset = copy.deepcopy(self._base_queryset)[ - i:i+self.max_obj_num] - logger.debug('Grabbing next %s objects from DB' % self.max_obj_num) - for obj in smaller_queryset.iterator(): - yield obj + i : i + self.max_obj_num + ] + logger.debug(f"Grabbing next {self.max_obj_num} objects from DB") + yield from smaller_queryset.iterator() def __iter__(self): return self._generator diff --git a/minecode/utils_test.py b/minecode/utils_test.py index ca2ea0bd..923d9943 100644 --- a/minecode/utils_test.py +++ b/minecode/utils_test.py @@ -7,10 +7,6 @@ # See https://aboutcode.org for more information about nexB OSS projects.
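`form_vcs_url()` above assembles the SPDX-style `<vcs_tool>+<vcs_url>[@<revision_tag_or_branch>][#<sub_path>]` string through successive joins, skipping every part that is missing. A usage sketch with made-up values:

url = form_vcs_url(
    vcs_tool="git",
    vcs_url="https://github.com/nexB/purldb.git",
    revision_tag_or_branch="v2.0.0",
    sub_path="minecode",
)
assert url == "git+https://github.com/nexB/purldb.git@v2.0.0#minecode"

# with only the mandatory URL, every optional join is skipped
assert form_vcs_url(None, "https://example.com/repo.git") == "https://example.com/repo.git"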
# -from collections import OrderedDict - -from itertools import chain -from unittest import TestCase import codecs import json import ntpath @@ -19,20 +15,22 @@ import shutil import stat import tarfile +from collections import OrderedDict +from itertools import chain +from unittest import TestCase from django.apps import apps from django.db import connection from django.db.migrations.executor import MigrationExecutor from django.test import TestCase as DjangoTestCase -from rest_framework.utils.serializer_helpers import ReturnDict -from rest_framework.utils.serializer_helpers import ReturnList from commoncode.testcase import FileBasedTesting +from rest_framework.utils.serializer_helpers import ReturnDict +from rest_framework.utils.serializer_helpers import ReturnList from scancode.cli_test_utils import purl_with_fake_uuid -from minecode.utils import get_temp_dir from minecode.tests import FIXTURES_REGEN - +from minecode.utils import get_temp_dir """ The conventions used for the tests are: @@ -47,14 +45,14 @@ class BaseMiningTestCase(TestCase): - BASE_DIR = os.path.join(os.path.dirname(__file__), 'testfiles') + BASE_DIR = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - if not hasattr(self, 'to_delete'): + if not hasattr(self, "to_delete"): self.to_delete = [] def tearDown(self): - for pth in getattr(self, 'to_delete', []): + for pth in getattr(self, "to_delete", []): self.make_rwe(pth) shutil.rmtree(pth, ignore_errors=True) @@ -87,7 +85,7 @@ def extract_archive(self, location, delete=True): Return the temporary dir where the trace was extracted. The temporary dir is deleted once tests are completed. """ - with open(location, 'rb') as input_tar: + with open(location, "rb") as input_tar: tar = tarfile.open(fileobj=input_tar) extract_dir = self.get_temp_dir(delete) tar.extractall(extract_dir) @@ -96,12 +94,13 @@ def extract_archive(self, location, delete=True): def build_archive(self, real_location, tar_path, outarch): from contextlib import closing - with closing(tarfile.open(outarch, mode='w:bz2')) as out: + + with closing(tarfile.open(outarch, mode="w:bz2")) as out: out.add(real_location, arcname=tar_path) def get_temp_dir(self, delete=True): - assert dir and dir != '' - tmp_dir = get_temp_dir(base_dir='', prefix='minecode-tests-') + assert dir and dir != "" + tmp_dir = get_temp_dir(base_dir="", prefix="minecode-tests-") if delete: self.to_delete.append(tmp_dir) return tmp_dir @@ -148,19 +147,15 @@ class MiningTestCase(BaseMiningTestCase, DjangoTestCase): def remove_vcs(location): - """ - Remove well known version control directories. - """ + """Remove well known version control directories.""" for root, dirs, _files in os.walk(location): - for vcs_dir in 'CVS', '.svn', '.git', '.hg': + for vcs_dir in "CVS", ".svn", ".git", ".hg": if vcs_dir in dirs: shutil.rmtree(os.path.join(root, vcs_dir), False) def to_os_native_path(path): - """ - Normalize a path to use the native OS path separator. - """ + """Normalize a path to use the native OS path separator.""" path = path.replace(posixpath.sep, os.path.sep) path = path.replace(ntpath.sep, os.path.sep) path = path.rstrip(os.path.sep) @@ -168,7 +163,6 @@ def to_os_native_path(path): class MockResponse: - def __init__(self, content, status_code): self.content = content self.status_code = status_code @@ -179,7 +173,7 @@ def mocked_requests_get(url, location): Return a MockResponse object by parsing the content of the file at `location` in a response to request to a single `url`. 
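The `MockResponse` and `mocked_requests_get` helpers above let a test serve a local fixture file in place of a live HTTP call. One way to wire them up with `unittest.mock` (the fixture path here is hypothetical):

from functools import partial
from unittest import mock

import requests

# bind the helper to a local fixture; any URL now yields that file's bytes
fake_get = partial(mocked_requests_get, location="testfiles/fdroid/index.json")

with mock.patch("requests.get", new=fake_get):
    response = requests.get("https://f-droid.org/repo/index.json")
    assert response.status_code == 200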
""" - with open(location, 'rb') as loc: + with open(location, "rb") as loc: return MockResponse(loc.read(), 200) @@ -190,23 +184,21 @@ def mocked_requests_get_for_uris(url_to_location, *args, **kwargs): mapping of url->location. """ location = url_to_location[args[0]] - with open(location, 'rb') as loc: + with open(location, "rb") as loc: return MockResponse(loc.read(), 200) def response_403(url, request): - """ - Returns a HTTP response with status 403. - """ - return {'status_code': 403, 'content': ''} + """Return a HTTP response with status 403.""" + return {"status_code": 403, "content": ""} class JsonBasedTestingMixin(TestCase): def _normalize_results(self, data, fields_to_remove=[]): """ - Returns the `data`, where any `package_uid` value has been normalized - with `purl_with_fake_uuid()` and fields from `fields_to_remove` have - been removed from `data`. + Return `data`, where any `package_uid` value has been normalized with + `purl_with_fake_uuid()` and fields from `fields_to_remove` have been + removed from it. """ if type(data) in (list, ReturnList): return [self._normalize_results(entry, fields_to_remove) for entry in data] @@ -222,8 +214,7 @@ def _normalize_results(self, data, fields_to_remove=[]): ): value = purl_with_fake_uuid(value) if key == "for_packages": - value = [purl_with_fake_uuid(package_uid) - for package_uid in value] + value = [purl_with_fake_uuid(package_uid) for package_uid in value] if key in fields_to_remove: continue normalized_data[key] = value @@ -233,17 +224,21 @@ def _normalize_results(self, data, fields_to_remove=[]): def _remove_fields_from_results(self, data, fields_to_remove): if type(data) in (list, ReturnList): - return [self._remove_fields_from_results(entry, fields_to_remove) for entry in data] + return [ + self._remove_fields_from_results(entry, fields_to_remove) + for entry in data + ] if type(data) in (dict, OrderedDict, ReturnDict): - normalized_data = {} # Remove fields from results and normalize Package UIDs for field in fields_to_remove: - if not field in data: + if field not in data: continue data.pop(field) - def check_expected_results(self, results, expected_loc, fields_to_remove=[], regen=FIXTURES_REGEN): + def check_expected_results( + self, results, expected_loc, fields_to_remove=[], regen=FIXTURES_REGEN + ): """ Check `results` are equal to expected data stored in a JSON file at `expected_loc`. @@ -257,10 +252,10 @@ def check_expected_results(self, results, expected_loc, fields_to_remove=[], reg results = self._normalize_results(results, fields_to_remove) if regen: - with codecs.open(expected_loc, mode='wb', encoding='utf-8') as expect: - json.dump(results, expect, indent=2, separators=(',', ':')) + with codecs.open(expected_loc, mode="wb", encoding="utf-8") as expect: + json.dump(results, expect, indent=2, separators=(",", ":")) - with codecs.open(expected_loc, mode='rb', encoding='utf-8') as expect: + with codecs.open(expected_loc, mode="rb", encoding="utf-8") as expect: expected = json.load(expect) results = json.loads(json.dumps(results)) @@ -270,9 +265,9 @@ def check_expected_results(self, results, expected_loc, fields_to_remove=[], reg class JsonBasedTesting(JsonBasedTestingMixin, FileBasedTesting): def _normalize_results(self, data, fields_to_remove=[]): """ - Returns the `data`, where any `package_uid` value has been normalized - with `purl_with_fake_uuid()` and fields from `fields_to_remove` have - been removed from `data`. 
+ Return the `data`, where any `package_uid` value has been normalized + with `purl_with_fake_uuid()` and fields from `fields_to_remove` have + been removed from `data`. """ if type(data) in (list, ReturnList): return [self._normalize_results(entry, fields_to_remove) for entry in data] @@ -288,8 +283,7 @@ ): value = purl_with_fake_uuid(value) if key == "for_packages": - value = [purl_with_fake_uuid(package_uid) - for package_uid in value] + value = [purl_with_fake_uuid(package_uid) for package_uid in value] if key in fields_to_remove: continue normalized_data[key] = value @@ -299,40 +293,21 @@ def _remove_fields_from_results(self, data, fields_to_remove): if type(data) in (list, ReturnList): - return [self._remove_fields_from_results(entry, fields_to_remove) for entry in data] + return [ + self._remove_fields_from_results(entry, fields_to_remove) + for entry in data + ] if type(data) in (dict, OrderedDict, ReturnDict): - normalized_data = {} # Remove fields from results and normalize Package UIDs for field in fields_to_remove: - if not field in data: + if field not in data: continue data.pop(field) - def check_expected_results(self, results, expected_loc, fields_to_remove=[], regen=FIXTURES_REGEN): - """ - Check `results` are equal to expected data stored in a JSON - file at `expected_loc`. - `results` can be a JSON string or a regular Python structure. - - Regen the expected JSON if `regen` is True. - """ - if isinstance(results, str): - results = json.loads(results) - - results = self._normalize_results(results, fields_to_remove) - - if regen: - with codecs.open(expected_loc, mode='wb', encoding='utf-8') as expect: - json.dump(results, expect, indent=2, separators=(',', ':')) - - with codecs.open(expected_loc, mode='rb', encoding='utf-8') as expect: - expected = json.load(expect) - - results = json.loads(json.dumps(results)) - self.assertEqual(expected, results) - - def check_expected_uris(self, uris, expected_loc, data_is_json=False, regen=FIXTURES_REGEN): + def check_expected_uris( + self, uris, expected_loc, data_is_json=False, regen=FIXTURES_REGEN + ): """ Check a `uris` iterable of URIs matches the data stored in the JSON file at `expected_loc`. @@ -340,23 +315,20 @@ results = [] for uri in uris: uri_dict = uri.to_dict(data_is_json=data_is_json) - if uri_dict.get('date'): + if uri_dict.get("date"): # Parse date since date will be used as Date field in # ResourceURI object, to make it as string format is just for # test comparation. # FIXME: we should ONLY have strings there!!! - uri_dict['date'] = str(uri_dict.get('date')) + uri_dict["date"] = str(uri_dict.get("date")) results.append(uri_dict) self.check_expected_results( - results=results, expected_loc=expected_loc, regen=regen) + results=results, expected_loc=expected_loc, regen=regen + ) def model_to_dict(instance, fields=None, exclude=None): """ - Copied from django.forms.models. model_to_dict - license: bsd-new - see ABOUT file for details - Return a mapping containing the data in ``instance``. ``fields`` is an optional list of field names. If provided, only the @@ -368,18 +340,22 @@ Note that all field with the word "date" in their name is converted to a boolean value to abstract test results from dates. + + Copied from django.forms.models.
model_to_dict + license: bsd-new + see ABOUT file for details """ opts = instance._meta data = dict() for f in chain(opts.concrete_fields, opts.private_fields, opts.many_to_many): - if not getattr(f, 'editable', False): + if not getattr(f, "editable", False): continue if fields and f.name not in fields: continue if exclude and f.name in exclude: continue value = f.value_from_object(instance) - if 'date' in f.name: + if "date" in f.name: value = bool(value) data[f.name] = value return data @@ -397,9 +373,7 @@ def app(self): def setUp(self): assert ( self.migrate_from and self.migrate_to - ), "TestCase '{}' must define migrate_from and migrate_to properties".format( - type(self).__name__ - ) + ), f"TestCase '{type(self).__name__}' must define migrate_from and migrate_to properties" self.migrate_from = [(self.app, self.migrate_from)] self.migrate_to = [(self.app, self.migrate_to)] executor = MigrationExecutor(connection) diff --git a/minecode/version.py b/minecode/version.py index 220bc087..5d25ea9a 100644 --- a/minecode/version.py +++ b/minecode/version.py @@ -10,40 +10,39 @@ import re - -VERSION_PATTERNS_REGEX = [re.compile(x, re.IGNORECASE) for x in [ - # v123413.feature_111.22.11.121 - 'v\d+\.feature\_(\d+\.){1,3}\d+', - - # YYYY-MM-DD_12345 - '\d{4}-\d{2}-\d{2}_\d+', - - # FIXME: this a single regex that should be split - '(M?(v\d+(\-|\_))?\d+\.){1,3}\d+[A-Za-z0-9]*' - '((\.|\-|_|~)(b|B|rc|r|v|RC|alpha|beta|m|pre|vm|G)?\d+((\-|\.)\d+)?)?' - '(' - '(\.|\-)' - '(' - '(' - '(alpha|dev|beta|rc|final|pre)' - '(\-|\_)\d+[A-Za-z]?(\-RELEASE)?' - ')' - '|alpha' - '|dev(\.\d+\.\d+)?' - '|beta|final|release|fixed' - '|(cr\d(\_\d*)?)' - ')' - ')?', - - '[A-Za-z]?(\d+\_){1,3}\d+\_?[A-Za-z]{0,2}\d+', - '(b|rc|r|v|RC|alpha|beta|m|pre|revision-)\d+(\-\d+)?', - 'current|previous|latest|alpha|beta', - '\d+-\d+-\d+-\d+', - '\d{4}-\d{2}-\d{2}', - '\d+-\d+-\d+', - '(\d(\-|\_)){1,2}\d', - '\d{5,14}', -]] +VERSION_PATTERNS_REGEX = [ + re.compile(x, re.IGNORECASE) + for x in [ + # v123413.feature_111.22.11.121 + r"v\d+\.feature\_(\d+\.){1,3}\d+", + # YYYY-MM-DD_12345 + r"\d{4}-\d{2}-\d{2}_\d+", + # FIXME: this is a single regex that should be split + r"(M?(v\d+(\-|\_))?\d+\.){1,3}\d+[A-Za-z0-9]*" + r"((\.|\-|_|~)(b|B|rc|r|v|RC|alpha|beta|m|pre|vm|G)?\d+((\-|\.)\d+)?)?" + "(" + r"(\.|\-)" + "(" + "(" + "(alpha|dev|beta|rc|final|pre)" + r"(\-|\_)\d+[A-Za-z]?(\-RELEASE)?" + ")" + "|alpha" + r"|dev(\.\d+\.\d+)?"
+ "|beta|final|release|fixed" + r"|(cr\d(\_\d*)?)" + ")" + ")?", + r"[A-Za-z]?(\d+\_){1,3}\d+\_?[A-Za-z]{0,2}\d+", + r"(b|rc|r|v|RC|alpha|beta|m|pre|revision-)\d+(\-\d+)?", + "current|previous|latest|alpha|beta", + r"\d+-\d+-\d+-\d+", + r"\d{4}-\d{2}-\d{2}", + r"\d+-\d+-\d+", + r"(\d(\-|\_)){1,2}\d", + r"\d{5,14}", + ] +] def version_hint(path, ignore_pre_releases=False, remove_v_prefix=False): @@ -58,7 +57,7 @@ def version_hint(path, ignore_pre_releases=False, remove_v_prefix=False): if not stripped: return for pattern in VERSION_PATTERNS_REGEX: - segments = stripped.split('/') + segments = stripped.split("/") # skip the first path segment unless there's only one segment first_segment = 1 if len(segments) > 1 else 0 interesting_segments = segments[first_segment:] @@ -70,47 +69,87 @@ def version_hint(path, ignore_pre_releases=False, remove_v_prefix=False): fixed = fix_packages_version(path, vs) if ignore_pre_releases: fixed = strip_pre_releases(fixed) - if remove_v_prefix and fixed.startswith('v'): + if remove_v_prefix and fixed.startswith("v"): fixed = fixed[1:] return fixed -NON_VERSION_TAGS = ('win32', 'am64', 'x86_64', 'i386', 'i586', 'i586', 'x86', - 'macosx',) +NON_VERSION_TAGS = ( + "win32", + "am64", + "x86_64", + "i386", + "i586", + "i586", + "x86", + "macosx", +) -NON_VT_RES = [re.compile(re.escape(t), re.IGNORECASE) - for t in NON_VERSION_TAGS] +NON_VT_RES = [re.compile(re.escape(t), re.IGNORECASE) for t in NON_VERSION_TAGS] def strip_version_tags(path): """Remove well known tags that are not part of the version.""" for ret in NON_VT_RES: - path = ret.sub('', path) + path = ret.sub("", path) return path ARCHIVE_FILE_EXTENSIONS = ( - '.7z', '.7zip', '.tar.gz', '.tar.bz2', '.tar.xz', '.tgz', '.tbz', - '.tbz2', '.tz', '.txz', '.zip', '.rar', '.tar', '.gz', '.bz2', '.jar', - '.tar.lzma', '.war', '.lib', '.a', '.ear', '.sar', '.tlz', - '.xz', '.lzma', '.exe', '.rpm', '.deb', '.msi', '.z', '.pkg', + ".7z", + ".7zip", + ".tar.gz", + ".tar.bz2", + ".tar.xz", + ".tgz", + ".tbz", + ".tbz2", + ".tz", + ".txz", + ".zip", + ".rar", + ".tar", + ".gz", + ".bz2", + ".jar", + ".tar.lzma", + ".war", + ".lib", + ".a", + ".ear", + ".sar", + ".tlz", + ".xz", + ".lzma", + ".exe", + ".rpm", + ".deb", + ".msi", + ".z", + ".pkg", ) -ARCHIVE_FILE_EXT_RES = [re.compile(re.escape(e) + '$', re.IGNORECASE) - for e in ARCHIVE_FILE_EXTENSIONS] +ARCHIVE_FILE_EXT_RES = [ + re.compile(re.escape(e) + "$", re.IGNORECASE) for e in ARCHIVE_FILE_EXTENSIONS +] def strip_extensions(path): - """"Remove well known archive extensions from end of path.""" + """Remove well known archive extensions from end of path.""" for rext in ARCHIVE_FILE_EXT_RES: - path = rext.sub('', path) + path = rext.sub("", path) return path # these extensions are used for common RPMs and Deb packages -PACKAGE_EXTENSIONS = ('.deb', '.rpm', '.srpm', '.diff.gz',) +PACKAGE_EXTENSIONS = ( + ".deb", + ".rpm", + ".srpm", + ".diff.gz", +) def fix_packages_version(path, version_string): @@ -120,8 +159,8 @@ def fix_packages_version(path, version_string): becomes 1.2.4 instead of 1.2.4-1 """ if path.endswith(PACKAGE_EXTENSIONS): - if version_string.count('-') == 1: - left, _right = version_string.split('-') + if version_string.count("-") == 1: + left, _right = version_string.split("-") return left # return as-is in all other cases return version_string @@ -130,9 +169,14 @@ def fix_packages_version(path, version_string): PRE_RELEASE_TAGS = [] -for pt in ('pre', 'rc', 'alpha', 'beta', 'b1', 'b2', 'b3', 'b4', 'b5'): +for pt in ("pre", "rc", "alpha", "beta", 
"b1", "b2", "b3", "b4", "b5"): # common punctuation prefixes before the tag - for pp in ('_', '-', '.', '~',): + for pp in ( + "_", + "-", + ".", + "~", + ): # variants with prefix before the bare variant PRE_RELEASE_TAGS.append(pp + pt.upper()) PRE_RELEASE_TAGS.append(pp + pt) @@ -142,9 +186,7 @@ def fix_packages_version(path, version_string): def strip_pre_releases(version_string): - """ - Return a version string stripped from alpha, beta, rc and pre parts. - """ + """Return a version string stripped from alpha, beta, rc and pre parts.""" if not any(t in version_string for t in PRE_RELEASE_TAGS): return version_string for tag in PRE_RELEASE_TAGS: diff --git a/minecode/visitors/apache.py b/minecode/visitors/apache.py deleted file mode 100644 index 382f5058..00000000 --- a/minecode/visitors/apache.py +++ /dev/null @@ -1,337 +0,0 @@ -# -# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -from itertools import chain - -from packageurl import PackageURL - -from minecode import ls -from minecode import seed -from minecode import visit_router - -from minecode.visitors import HttpVisitor -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI - - -""" -Collect data from Apache.org. -There are two primary sources of data: - -1. directory listings of the downloads distribution web site apache.org/dist - and archive.apache.org. These map well to packages but we get little or no - data beside a checksum and some name and painfully extracted version. - This data could also be fetched for the most recent ones (since 2012) from: - https://dist.apache.org/repos/dist/release/ which is an SVN repo - And svn ls -R https://dist.apache.org/repos/dist/release/ could be more - efficient and easier to parse incrementally? - -2. JSON data collated by the Foundation to provide project information. These - are for projects and do not map very well to a package or download (but - rather to several of thems at once) - -The JSON data comes from https://projects.apache.org/about.html and -is created with this code: -https://svn.apache.org/repos/asf/comdev/projects.apache.org/trunk/ . - -This JSON data is project-level except for releases-files.json .. but this is -just based on parsing the find-ls directory listing so bring nothing new. - -- http://home.apache.org/public/public_ldap_projects.json : seems to be the - origin for the projects.json and podlings.json data - -- These is a list of VCS repositories. Each key maps rather well to a package - name. But the key (some package name?) may not match a project: - https://projects.apache.org/json/foundation/repositories.json - This comes from http://git.apache.org/ - -- This more or less maps to top-level projects but does not relate to packages - https://projects.apache.org/json/foundation/committees.json - -- This list podling projects with only few details and does not map to packages - https://projects.apache.org/json/foundation/podlings.json - -- This should contain an entry for each project but does not. Yet each JSON - contains also the releases.json and repositories.json content for that project. 
- https://projects.apache.org/json/projects/ - -- This seems to be the origin of most project data: - https://svn.apache.org/repos/asf/comdev/projects.apache.org/trunk/data/projects.xml - -- Another source of the JSON may be: - https://whimsy.apache.org/public/ -""" - - -class ApacheSeed(seed.Seeder): - - def get_seeds(self): - # note: this is the same as below and does not list archived files - # https://archive.apache.org/dist/zzz/find-ls.gz - # to get these we need to rsync or use other techniques - yield 'https://apache.org/dist/zzz/find-ls.gz' - - # FIXME: we cannot relate this to a download package: disabled for now - # yield 'https://projects.apache.org/json/foundation/projects.json' - # yield 'https://projects.apache.org/json/foundation/podlings.json' - - -CHECKSUM_EXTS = '.sha256', '.sha512', '.md5', '.sha', '.sha1', - -# only keep downloads with certain extensions for some archives, packages and checksums -ARCHIVE_EXTS = ( - # archives - '.jar', '.zip', '.tar.gz', '.tgz', '.tar.bz2', '.war', '.tar.xz', '.tgz', '.tar', - # packages - # '.deb', '.rpm', '.msi', '.exe', - '.whl', '.gem', '.nupkg', - # '.dmg', - # '.nbm', -) - -IGNORED_PATH_CONTAINS = ( - 'META/', # # - # doc - '/documentation/', - '/doc/', # # - '-doc.', # # - '-doc-', # # - - '/docs/', # # - '-docs.', # # - '-docs-', # # - - 'javadoc', # # - 'fulldoc', # # - 'apidoc', # # - '-manual.', - '-asdocs.', # # - - # eclipse p2/update sites are redundant - # redundant - 'updatesite/', # # - 'eclipse-update-site', # # - 'update/eclipse', # # - 'sling/eclipse', # # - 'eclipse.site-', - - # large multi-origin binary distributions - '-distro.', - '-bin-withdeps.', - '-bin-with-deps', - - # these are larger distributions with third-parties - 'apache-airavata-distribution', - 'apache-airavata-server', - 'apache-mahout-distribution', - '/syncope-standalone-', - - 'binaries/conda', - - # obscure - 'perl/contrib', - # index data - 'zzz', - # doc - 'ant/manual' -) - - -# TODO: ignore these globs too: - -# openoffice/*/binaries is very large -# /*/apache-log4j-*-site.zip - - -SOURCE_INDICATORS = ( - '_src.', - '-src.', - '-source.', - '-sources.', - '-source-release', - '/source/', - '/sources/', - '/src/', - '_sources.', -) - - -BINARY_INDICATORS = ( -) - - -@visit_router.route('https?://apache.org/dist/zzz/find\-ls\.gz') -class ApacheDistIndexVisitor(NonPersistentHttpVisitor): - """ - Collect URIs for all packages in the "find -ls" index available from Apache - dist sites. - """ - def get_uris(self, content): - import gzip - with gzip.open(content, 'rt') as f: - content = f.read() - - url_template = 'https://apache.org/dist/{path}' - - archive_checksum_extensions = tuple(chain.from_iterable( - [[ae + cke for ae in ARCHIVE_EXTS] for cke in CHECKSUM_EXTS])) - kept_extensions = archive_checksum_extensions + ARCHIVE_EXTS - - for entry in ls.parse_directory_listing(content, from_find=True): - # skip directories, links and special files - if entry.type != ls.FILE: - continue - path = entry.path - - # ignore several downloads - if (not path.endswith(kept_extensions) - or any(i in path for i in IGNORED_PATH_CONTAINS)): - continue - # only checksums need further visit, the archive will be scanned only - is_visited = not path.endswith(CHECKSUM_EXTS) - - yield URI( - visited=is_visited, - source_uri=self.uri, - uri=url_template.format(path=path), - package_url=build_purl(path), - size=entry.size - ) - - -def build_purl(uri): - """ - Return a PackageURL built from an Apache download URL or path. 
- - URLs start with this prefix 'https://apache.org/dist/' - """ - # FIXME: this is the essence of collecting name and versions for Apache and - # this need to be super robust - segments = [p for p in uri.split('/') if p] - version = None - project_name = segments[0] - # The path typically contains the version but where is highly inconsistent - # - bahir/bahir-spark/2.1.1/apache-bahir-2.1.1-src.zip - # - groovy/2.4.15/sources/apache-groovy-src-2.4.15.zip - # FIXME: this is not correct - if len(segments) > 1 and ('/distribution/' in uri or '/sources/' in uri): - version = segments[1] - - package_url = PackageURL( - type='apache', - # TODO: namespace='', - name=project_name, - version=version) - - return package_url - - -@visit_router.route('https?://(archive\.)apache.org/dist/.*\.(md5|sha1?|sha256|sha512)',) -class ApacheChecksumVisitor(HttpVisitor): - """ - Collect files that contain archive checksums. - """ - def dumps(self, content): - if content: - # the format can be md5sum-like this way: - # c7a2d3becea1d28b518528f8204b8d2a apache-groovy-docs-2.4.6.zip - # with split on space to get the checksum value. - content = content.split() - if content: - content = content[0] - else: - content = '' - return content - - -# FIXME: we cannot relate this to a download package: disabled for now -# @visit_router.route('https://projects.apache.org/json/foundation/projects.json') -class ApacheProjectsJsonVisitor(HttpJsonVisitor): - """ - Collect URIs for all Apache projects. - - The json format is like: - "abdera": { - "bug-database": "https://issues.apache.org/jira/browse/ABDERA", - "category": "xml", - "created": "2008-12-25", - "description": "The goal of the Apache Abdera project ....", - "doap": "http://svn.apache.org/repos/asf/abdera/java/trunk/doap_Abdera.rdf", - "download-page": "http://abdera.apache.org/#downloads", - "homepage": "http://abdera.apache.org", - "license": "http://usefulinc.com/doap/licenses/asl20", - "mailing-list": "http://abdera.apache.org/project.html#lists", - "name": "Apache Abdera", - "pmc": "abdera", - "programming-language": "Java", - "release": [ - { - "created": "2008-04-11", - "name": "Apache Abdera 0.4", - "revision": "1.7.1" - } - ], - "repository": [ - "http://svn.apache.org/repos/asf/abdera" - ], - "shortdesc": "An open source Atom implementation" - }, - """ - def get_uris(self, content): - url_template = 'https://projects.apache.org/json/projects/{name}.json' - for project_name, project_meta in content.items(): - package_url = PackageURL(type='apache', name=project_name) - yield URI( - uri=url_template.format(name=project_name), - package_url=package_url.to_string(), - date=project_meta.get('created')) - - -# FIXME: we cannot relate this to a download package: disabled for now -# @visit_router.route('https://projects.apache.org/json/projects/.*json') -class ApacheSingleProjectJsonVisitor(HttpJsonVisitor): - """ - Collect json content from single project json file. It does not - return any URI as the json contains the project meatadata only, so - this visitor is getting the json to pass to mapper. - """ - pass - - -# FIXME: what can we do with a homepage and nam, packagedb wise?? -# @visit_router.route('https://projects.apache.org/json/foundation/podlings.json') -class ApachePodlingsJsonVisitor(HttpJsonVisitor): - """ - Collect name and homepage for all podlings aka "incubator" projects. 
- - The json format is like: - "airflow": { - "description": "Airflow is a workflow automation and scheduling ...", - "homepage": "http://airflow.incubator.apache.org/", - "name": "Apache Airflow (Incubating)", - "pmc": "incubator", - "podling": true, - "started": "2016-03" - }, - """ - def get_uris(self, content): - for project_name, project_meta in content.items(): - if 'homepage' not in project_meta: - continue - - package_url = PackageURL( - type='apache', - namespace='incubator', - name=project_name) - - yield URI( - uri=project_meta.get('homepage'), - package_url=package_url.to_string(), - data=project_meta, - source_uri=self.uri, - visited=True) diff --git a/minecode/visitors/bitbucket.py b/minecode/visitors/bitbucket.py deleted file mode 100644 index 4595c23d..00000000 --- a/minecode/visitors/bitbucket.py +++ /dev/null @@ -1,199 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -import logging - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -""" -Mercurial support is retiring in mid 2020 and only git is -available after that. -https://bitbucket.org/blog/sunsetting-mercurial-support-in-bitbucket - - -TODO: collect actual packages.... -TODO: collect counts and more: -watchers count: - https://api.bitbucket.org/2.0/repositories/mikael/stellaris/watchers?fields=size -forks count: - https://api.bitbucket.org/2.0/repositories/mikael/stellaris/forks?fields=size -tags: - https://api.bitbucket.org/2.0/repositories/mchaput/whoosh/refs/tags -then the tag download is with: - https://bitbucket.org/pypa/setuptools/get/.zip - https://bitbucket.org/pypa/setuptools/get/20.1.1.tar.bz2 - -the latest commit to get a download link: - https://api.bitbucket.org/2.0/repositories/pypa/setuptools/commits - This gets the count of commits. - the link is then: https://bitbucket.org/pypa/setuptools/get/.tar.bz2 - -the downloads if any: -https://api.bitbucket.org/2.0/repositories/pypa/setuptools/downloads -each download has a count and a URL such as: -https://api.bitbucket.org/2.0/repositories/pypa/setuptools/downloads/setuptools-19.6b1.zip - this URL can also be built using the filename as: - https://bitbucket.org/pypa/setuptools/downloads/setuptools-19.6b1.zip - -Also there is no value to add repos that are empty and have no downloads. -Therefore we should better: -1. collect repo data as a "template" only record -2. effectively create package IFF there are commits and/or downloads. 
-2.1 if commits and no tags: make a single package using the latest commit -2.2 if tags: use these for packages -2.3 if downloads: use these packages - -NB: we can also get only certain fields: -https://api.bitbucket.org/2.0/repositories/pypa/setuptools?pagelen=1&fields=size,links,full_name -https://api.bitbucket.org/2.0/repositories/pypa/setuptools/watchers?pagelen=1&fields=size,values.links -""" - - -class BitbucketSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://api.bitbucket.org/2.0/repositories?pagelen=400' - - -# TODO: review mapper -@visit_router.route('https://api\.bitbucket\.org/2\.0/repositories\?pagelen=.*',) -class BitbucketIndexVisitor(HttpJsonVisitor): - """ - Collect repository data through paginated API calls. - The index contains repo-level data for every repo. - """ - def get_uris(self, content): - next_page = content.get('next') - if next_page: - yield URI(uri=next_page, source_uri=self.uri) - - -@visit_router.route('https://api\.bitbucket\.org/2\.0/repositories/[^\/]*/[^\/\?]*/?') -class BitbucketSingleRepoVisitor(HttpJsonVisitor): - """ - Collect data for a single repository. - Note: this is strictly equivalent to one item of the index paginated calls. - """ - def get_uris(self, content): - return get_repo_uris(content, source_uri=self.uri) - - -@visit_router.route('https://api.bitbucket.org/2.0/repositories/[^\/]*/[^\/]*/(refs/tags|downloads).*') -class BitbucketDetailsVisitorPaginated(HttpJsonVisitor): - """ - Collect repository details for data that are paginated. - """ - def get_uris(self, content): - next_page = content.get('next') - if next_page: - purl = get_purl(self.uri) - yield URI(uri=next_page, source_uri=self.uri, package_url=purl) - - -@visit_router.route( - 'https://api\.bitbucket\.org/2\.0/repositories/[^\/]*/[^\/]*/(watchers|forks|commits).*') -class BitbucketDetailsVisitor(HttpJsonVisitor): - """ - Collect repository details for data that are not paginated. - """ - pass - - -def get_repo_ns_name(url_like): - """ - Return a namespace and name for a bitbucket repo given something that looks - like a bitbucket URL. - - For example: - >>> get_repo_ns_name('https://api.bitbucket.org/2.0/repositories/bastiand/mercurialeclipse/refs/tags?pagelen=2') - ('bastiand', 'mercurialeclipse') - >>> get_repo_ns_name('https://bitbucket.org/bastiand/mercurialeclipse/src') - ('bastiand', 'mercurialeclipse') - >>> get_repo_ns_name('/bastiand/mercurialeclipse/src') - ('bastiand', 'mercurialeclipse') - """ - if url_like.startswith('https://api.bitbucket.org'): - head, _, path = url_like.partition('2.0/repositories') - if head: - segments = [p for p in path.split('/') if p] - if len(segments) >= 2: - ns = segments[0] - name = segments[1] - return ns, name - - if url_like.startswith('https://bitbucket.org/'): - head, _, path = url_like.partition('bitbucket.org/') - if head: - segments = [p for p in path.split('/') if p] - if len(segments) >= 2: - ns = segments[0] - name = segments[1] - return ns, name - - segments = [p for p in url_like.strip('/').split('/') if p] - if len(segments) >= 2: - ns = segments[0] - name = segments[1] - return ns, name - - -def get_purl(url_like): - """ - Return a Package URL string created from a bitbucket url or url-like. - """ - ns_name = get_repo_ns_name(url_like) - if not ns_name: - return - ns, name = ns_name - return PackageURL(type='bitbucket', namespace=ns, name=name).to_string() - - -def get_repo_uris(repo_data, source_uri): - """ - Yield URIs from a single repository `repo_data` data. 
- """ - full_name = repo_data.get('full_name', '').strip() - package_url = get_purl(full_name) - links = repo_data.get('links', {}) - repo_uri = links.get('html', {}).get('href') - if not repo_uri: - repo_uri = 'https://bitbucket.org/{full_name}'.format(full_name=full_name) - - # Yield URI for latest commits, tags and downloads as candidate packages. - commits_url = links.get('commits', {}).get('href') - # we only care about the latest commit - commits_url += '?pagelen=1' - yield URI(uri=commits_url, package_url=package_url, source_uri=source_uri) - - # for counts only: these should go to the package template - for link in ('forks', 'watchers'): - url = links.get(link, {}).get('href') - if url: - # we get a single fields and only one page - url += '?pagelen=1&fields=size' - yield URI(uri=url, package_url=package_url, source_uri=source_uri) - - for link in ('refs/tags', 'downloads'): - url = links.get(link, {}).get('href') - if url: - # paginated, we want them all - url += '?pagelen=100' - yield URI(uri=url, package_url=package_url, source_uri=source_uri) diff --git a/minecode/visitors/bower.py b/minecode/visitors/bower.py deleted file mode 100644 index e9f605a9..00000000 --- a/minecode/visitors/bower.py +++ /dev/null @@ -1,74 +0,0 @@ -# -# Copyright (c) 2017 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI - - -class BowerSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://registry.bower.io/packages' - - -@visit_router.route('https://registry.bower.io/packages') -class BowerTopJsonVisitor(HttpJsonVisitor): - """ - Collect URIs for all packages from the json returned. - """ - - def get_uris(self, content): - """ - The json content is a list with name and url, like the following format: - ... - { - "name": "bello", - "url": "https://github.com/QiaoBuTang/bello.git" - }, - { - "name": "bello-gfw", - "url": "https://gitcafe.com/GilbertSun/bello.git" - }, - ... - The url could be in the following formats like github, loglg, gitcafe, bitbuckets etc. - # FIXME: We should cover all urls beyond the above four categories. 
- """ - github_base_url = 'https://raw.githubusercontent.com/{owner}/{name}/master/bower.json' - lolg_base_url = 'https://lolg.it/{owner}/{name}/raw/master/bower.json' - gitcafe_base_url = 'https://coding.net/u/{owner}/p/{name}/git/raw/master/bower.json' - bitbucket_base_url = 'https://bitbucket.org/{owner}/{name}/raw/master/bower.json' - base_url_map = { - 'https://github.com/': github_base_url, - 'https://lolg.it/': lolg_base_url, - 'https://gitcafe.com/': gitcafe_base_url, - 'https://bitbucket.org/': bitbucket_base_url - } - for entry in content: - name = entry.get('name') - url = entry.get('url') - if name in url: - owner = None - package_url = PackageURL(type='bower', name=name).to_string() - for host_name, base_url in base_url_map.iteritems(): - if url.startswith(host_name): - owner = url[len(host_name): url.index(name) - 1] - yield URI(uri=base_url.format(owner=owner, name=name), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://raw.githubusercontent.com/.*/master/bower.json', - 'https://lolg.it/.*/master/bower.json', - 'https://coding.net/.*/master/bower.json', - 'https://bitbucket.org/*/master/bower.json') -class BowerJsonVisitor(HttpJsonVisitor): - """ - Collect content of the json itself by the visitor. - """ - pass diff --git a/minecode/visitors/cpan.py b/minecode/visitors/cpan.py deleted file mode 100644 index 107771d2..00000000 --- a/minecode/visitors/cpan.py +++ /dev/null @@ -1,191 +0,0 @@ -# -# Copyright (c) by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -import json - -from bs4 import BeautifulSoup -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import HttpVisitor -from minecode.visitors import URI - - -class CpanSeed(seed.Seeder): - - def get_seeds(self): - yield 'http://www.cpan.org/modules/01modules.index.html' - author_search_template = 'https://fastapi.metacpan.org/author/_search?q=email:{char}*&size=5000' - for char in 'abcdefghijklmnopqrstuvwxyz'.split(): - yield author_search_template.format(char) - -# The idea of CPAN API visitor is based on -# https://github.com/metacpan/metacpan-api/blob/master/docs/API-docs.md -# -# From the doc: You can certainly scroll if you are fetching less than 5,000 -# items. You might want to do this if you are expecting a large data set, but -# will still need to run many requests to get all of the required data. -# -# To get all results for sure it's over 5000, we should use search twice based -# on author and release. -# -# First get all authors by searching email from a-z, then get all releases based -# on each author. It will make the returned result a small set. - -# For example: - -# First try to reach the author search, the following search URL will get all -# authors whose email starts with 'a', this will loop from 'a' to 'z. - -# https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000 - -# If we get the Author ID in above returned json, we can pass to release search -# URL as follows, it will get all releases from the passing author. 
- -# https://fastapi.metacpan.org/release/_search?q=author:ABERNDT&size=5000 - - -@visit_router.route('https://fastapi.metacpan.org/author/_search\?q=email:[a-z]\*&size=5000') -class MetaCpanAuthorURLVisitors(HttpJsonVisitor): - """ - Run search on author's email, and parse the returned json content and form - the MetaCpanRleaseURLVisitors' URL by adding AUTHOR condition. For example: - https://fastapi.metacpan.org/author/_search?q=email:a*&size=5000 a* stands - for all email which starts with 'a', and it's the same with 'A' as email is - case insensitive. The visitor will cover all cases from a to z, and yield - the search URLs by passing each author in the release searching URL - """ - - def get_uris(self, content): - release_visitor_template = 'https://fastapi.metacpan.org/release/_search?q=author:{id}&size=5000' - hits = content.get('hits', {}) - inner_hits = hits.get('hits', []) - for hit in inner_hits: - _id = hit.get('_id') - if not _id: - continue - yield URI(uri=release_visitor_template.format(id=_id), source_uri=self.uri) - - -@visit_router.route('https://fastapi.metacpan.org/release/_search\?q=author:\w+&size=5000') -class MetaCpanRleaseURLVisitors(HttpJsonVisitor): - """ - Run the release results by searching the passing AUTHOR ID. The visitor will - yield the json whose author ID is the passing author info. The - implementation if the class is empty, it just returns for mapper use of the - json content. - """ - pass - - -@visit_router.route('http://www.cpan.org/modules/01modules.index.html') -class CpanModulesVisitors(HttpVisitor): - """ - Return URIs by parsing the HTML page of cpan modules page. - """ - def get_uris(self, content): - """ - Return the uris of authors pages, the returning URIs will be an input of - CpanProjectHTMLVisitors - """ - page = BeautifulSoup(content, 'lxml') - url_template = 'http://www.cpan.org/{path}' - for a in page.find_all(name='a'): - if 'href' not in a.attrs: - continue - - url = a['href'] - if not url: - continue - - if url.startswith('../authors'): - if url.endswith(('.zip', '.tar.gz')): - # Skip tar.gz since it will be captured by the CpanProjectHTMLVisitors - continue - else: - url = url_template.format(path=url[3:]) - yield URI(uri=url, source_uri=self.uri) - - -@visit_router.route('http://www.cpan.org/authors/.*/') -class CpanProjectHTMLVisitors(HttpVisitor): - """ - Visit the HTML page of cpan project page and return the Packages info, HTML - data and error. - """ - def get_uris(self, content): - """ - Return the uris by looking for the tar.gz in the html, and then forming - the uri for meta and readme files - """ - page = BeautifulSoup(content, 'lxml') - if self.uri.endswith('/'): - url_template = self.uri + '{path}' - else: - url_template = self.uri + '/{path}' - for a in page.find_all(name='a'): - if 'href' not in a.attrs: - continue - - url = a['href'] - if not url: - continue - - if url.startswith(('/', '?')): - continue # Avoid the directory and other non-file links - else: - name = url - name = name.replace('tar.gz', ''). 
replace('.readme', '').replace('.meta', '') - partions = name.rpartition('-') - name = partions[0] - version = partions[-1] - package_url = None - if name and version: - package_url = PackageURL(type='cpan', name=name, version=version).to_string() - url = url_template.format(path=url) - yield URI(uri=url, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('http://www.cpan.org/.*.meta') -class CpanMetaVisitors(HttpVisitor): - """ - Visit the meta file and return the meta data of the Package The goal - of this visitor is to get the content instead of returning any valid - uris. - """ - pass - - -@visit_router.route('http://www.cpan.org/.*.readme') -class CpanReadmeVisitors(HttpVisitor): - """ - Visit the readme file and translate to json and dump it and return for mapper use. - """ - - def dumps(self, content): - """ - Return the json by parsing the readme content - """ - # Handle bytes properly in python3 - if type(content) == bytes: - content = content.decode('utf-8') - - lines = content.splitlines() - readme_dict = dict() - body = [] - head = None - for line in lines: - if len(line) > 1 and line.isupper() and line[0] != ' ': - if head: - readme_dict[head] = '\n'.join(body).lstrip('\n').rstrip('\n') - head = line - body = [] - else: - body.append(line.strip()) - return json.dumps(readme_dict) diff --git a/minecode/visitors/cran.py b/minecode/visitors/cran.py deleted file mode 100644 index db91c100..00000000 --- a/minecode/visitors/cran.py +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) 2017 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - - -from bs4 import BeautifulSoup - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpVisitor -from minecode.visitors import URI - - -class CranSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://cloud.r-project.org/web/packages/available_packages_by_date.html' - - -@visit_router.route('https://cloud.r-project.org/web/packages/available_packages_by_date.html') -class CranPackagesVisitors(HttpVisitor): - """ - Return URIs by parsing the HTML content of the page - """ - def get_uris(self, content): - base_url = 'https://cloud.r-project.org/web/packages/{package}/index.html' - a_blocks = BeautifulSoup(content, 'lxml').find_all('a') - for a in a_blocks: - package = a.text - package_url = PackageURL(type='cran', name=package).to_string() - yield URI(uri=base_url.format(package=package), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://cloud.r-project.org/web/packages/[\w\-\.]/index.html') -class CranSinglePackageVisitor(HttpVisitor): - """ - Return only the HTML content of the page, and will be parsed in mapper - """ - pass diff --git a/minecode/visitors/debian.py b/minecode/visitors/debian.py deleted file mode 100644 index 1e1ff956..00000000 --- a/minecode/visitors/debian.py +++ /dev/null @@ -1,756 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - - -import attr -import gzip -import json -import logging -import requests - -from commoncode import fileutils -import debian_inspector -from debian_inspector import debcon -from debian_inspector import copyright as debcopy -from debian_inspector.version import Version as DebVersion -from packagedcode.models import PackageData -from packagedcode.debian import DebianDscFileHandler -from packagedcode.debian_copyright import StandaloneDebianCopyrightFileHandler -from packageurl import PackageURL - -from minecode import ls -from minecode import seed -from minecode import visit_router -from minecode import priority_router -from minecode.visitors import HttpVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI -from minecode.utils import fetch_and_write_file_from_url -from minecode.utils import get_package_sha1 -from packagedb.models import make_relationship -from packagedb.models import PackageContentType -from packagedb.models import PackageRelation - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - -""" -Collect Debian and Debian derivative packages (such as Ubuntu). -There are two approaches: -1. get the directory listings of all available packages (and files) -2. get and navigate through the tree of Debian control files -""" - - -DEBIAN_BASE_URL = "https://deb.debian.org/debian/pool/main/" -DEBIAN_METADATA_URL = "https://metadata.ftp-master.debian.org/changelogs/main/" - -UBUNTU_BASE_URL = "http://archive.ubuntu.com/ubuntu/pool/main/" -UBUNTU_METADATA_URL = "http://changelogs.ubuntu.com/changelogs/pool/main/" - -# Other URLs and sources to consider -# 'http://ftp.debian.org/debian/' -# rsync://archive.debian.org/debian-archive -# http://sources.debian.net/doc/api/ -# Packages.gz files: https://get.videolan.org/debian/i386/Packages.gz -# https://debian-handbook.info/browse/stable/sect.setup-apt-package-repository.html - - -class DebianSeed(seed.Seeder): - - def get_seeds(self): - yield 'http://ftp.debian.org/debian/ls-lR.gz' - yield 'http://archive.ubuntu.com/ubuntu/ls-lR.gz' - - -def is_collectible(file_name): - """ - Return True if a `file_name` is collectible. - """ - # 'Contents-*.gz' are mapping/indexes of installed files to the actual package that provides them. - # TODO: add tests! - - return (file_name and ( - file_name in ('Packages.gz', 'Release', 'Sources.gz',) - or file_name.endswith(('.deb', '.dsc',)) - or (file_name.startswith('Contents-') and file_name.endswith('.gz')) - )) - - -def is_debian_url(uri): - return 'debian.org' in uri - - -def is_ubuntu_url(uri): - return 'ubuntu' in uri - - -@visit_router.route( - 'http://ftp.debian.org/.*/ls\-lR\.gz', - 'http://.*/ubuntu/ls\-lR\.gz', - # mirrors - 'http://ftp.[a-z][a-z].debian.org/.*/ls\-lR\.gz', -) -class DebianDirectoryIndexVisitor(NonPersistentHttpVisitor): - """ - Collect package URIs from Debian-like repos with an ls-LR directory listing. 
- """ - - def get_uris(self, content): - with gzip.open(content, 'rt') as f: - content = f.read() - - url_template = self.uri.replace('ls-lR.gz', '{path}') - - for entry in ls.parse_directory_listing(content): - if entry.type != ls.FILE: - continue - - path = entry.path.lstrip('/') - file_name = fileutils.file_name(path) - - if not is_collectible(file_name): - continue - - if is_debian_url(self.uri): - namespace = 'debian' - elif is_ubuntu_url(self.uri): - namespace = 'ubuntu' - else: - logger.error( - 'Unknown Debian URI namespace: {}'.format(self.uri)) - continue - - if file_name.endswith(('.deb', '.udeb', '.tar.gz', '.tar.xz', '.tar.bz2', '.tar.lzma')): - name, version, arch = debian_inspector.package.get_nva( - file_name) - package_url = PackageURL( - type='deb', - namespace=namespace, - name=name, - version=str(version), - qualifiers=dict(arch=arch) if arch else None).to_string() - else: - package_url = None - - yield URI( - uri=url_template.format(path=path), - package_url=None or package_url, - file_name=file_name, - date=entry.date, - size=entry.size, - source_uri=self.uri) - - -def parse_release(location): - """ - Return a dictionary of data message from the debian Release file at `location`. - - A Release file contains return value like these: - Origin: Debian - Label: Debian - Suite: stable - Version: 8.3 - Codename: jessie - Date: Sat, 23 Jan 2016 13:17:38 UTC - Architectures: amd64 arm64 armel armhf i386 mips mipsel powerpc ppc64el s390x - Components: main contrib non-free - Description: Debian 8.3 Released 23 January 2016 - MD5Sum: - f08bebee4d8727f4320c0ed6984a01c9 1194884 contrib/Contents-amd64 - c7f0b9213c9031cf89343a1bb8dbca3a 88565 contrib/Contents-amd64.gz - 36d2e8055b0cc185c8c5b081b414f4ce 1021655 contrib/Contents-arm64 - 20bb294fefef1ab19e20ff0de7976ee2 72539 contrib/Contents-arm64.gz - d2e1f415e05f53742b7133dd10ccf3af 1035687 contrib/Contents-armel - 5f24794a69552fbb10f303e33d35d380 73710 contrib/Contents-armel.gz - d70a5e2db762a9eb493607e16f8c423e 1028590 contrib/Contents-armhf - - The MD5Sum key will return a list instead of a string value, element in the - list is a dictionary keyed by: - - md5sum - size - name - """ - return debcon.get_paragraphs_data_from_file(location) - - -def parse_copyright_only(location): - """ - Return a DebianCopyright from the Debian copyright file at `location`. - """ - return debcopy.DebianCopyright.from_file(location) - - -def parse_copyright_allinfo(location): - """ - Return a DebianCopyright from the Debian copyright file at `location`. - """ - return debcopy.DebianCopyright.from_file(location) - - -def parse_license(location): - """ - Return a list of License paragraphs from Debian copyright file at location. - """ - copyparas = debcopy.DebianCopyright.from_file(location) - return [para for para in copyparas.paragraphs - if isinstance(para, debian_inspector.copyright.CopyrightLicenseParagraph)] - - -def collect_source_packages(location): - """ - Yield one Paragraph object per package from a plain text 'Sources' file at - location. - - The source info is a dictionary, the content is like this: - 'Package': 'album' - 'Binary': 'album' - 'Version': '4.12-3' - 'Build-Depends': 'debhelper (>= 9)' - 'Architecture': 'all' - 'Format': '3.0 (quilt)' - """ - return debcon.get_paragraphs_data_from_file(location) - - -def parse_packages_index(location): - """ - Yield one Paragraph object per package from a plain text 'Packages' file at - location. 
- - A typical Debian Packages file looks like this: - http://ftp.debian.org/debian/dists/unstable/main/binary-mips/Packages.gz - """ - return debcon.get_paragraphs_data_from_file(location) - - -@visit_router.route('http://ftp.debian.org/debian/dists/.*/Sources.gz') -class DebianSourcesVisitor(NonPersistentHttpVisitor): - """ - Collect package URIs from a Sources gz data file. - """ - - def get_uris(self, content): - base_url = 'http://ftp.debian.org/debian' - with gzip.open(content, 'rb') as f: - text = f.read() - for source in debcon.get_paragraphs_data(text): - dir_info = source.get('Directory') - if not dir_info: - continue - package = source.get('Package') - version = source.get('Version') - - package_url = None - if package and version: - package_url = PackageURL( - type='deb', namespace='debian', name=package, - version=version).to_string() - - dir_info = dir_info.lstrip('/') - dir_url = base_url + '/{}'.format(dir_info) - yield URI(uri=dir_url, package_url=package_url, source_uri=self.uri) - - -# TODO add .xz support -@visit_router.route('http://ftp.debian.org/debian/dists/.*Packages.gz') -class DebianPackagesVisitor(NonPersistentHttpVisitor): - """ - Collect URIs to actual .deb Packages and the content itself from a Packages gz data file. - """ - - def get_uris(self, content): - base_url = 'http://ftp.debian.org/debian' - with gzip.open(content, 'rb') as f: - text = f.read() - - for package in debcon.get_paragraphs_data(text): - file_info = package.get('Filename') - if not file_info: - continue - - package = package.get('Package') - version = package.get('Version') - - if package and version: - package_url = PackageURL( - type='deb', - namespace='debian', - name=package, - version=version).to_string() - else: - package_url = None - - # FIXME: we we do not keep the actual content... we should! - file_info = file_info.lstrip('/') - dir_url = base_url + file_info - yield URI( - uri=dir_url, - package_url=package_url, - source_uri=self.uri) - - -@visit_router.route('http://ftp.debian.org/debian/pool/.*\.dsc') -class DebianDescriptionVisitor(HttpVisitor): - """ - Collect package data from a .dsc Package description file. - There is no URI we can get from description file directly. - """ - - def dumps(self, content): - dsc = debcon.Debian822.from_string(content) - # FIXME: this does not make sense as this is a mapping-time thing - return json.dumps(dsc.to_dict()) - - -@visit_router.route('http://ftp.debian.org/debian/.*/Release') -class DebianReleaseVisitor(HttpVisitor): - """ - Collect Release file content from a Release data file. - """ - pass - - -@priority_router.route('pkg:deb/.*') -def process_request(purl_str, **kwargs): - """ - Process `priority_resource_uri` containing a maven Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from debian and - using it to create a new PackageDB entry. The binary package is then added to the - scan queue afterwards. We also get the Package information for the - accompanying source package and add it to the PackageDB and scan queue, if - available. - - Return an error string for errors that occur, or empty string if there is no error. 
- """ - from minecode.model_utils import DEFAULT_PIPELINES - - source_purl = kwargs.get("source_purl", None) - addon_pipelines = kwargs.get('addon_pipelines', []) - pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) - - try: - package_url = PackageURL.from_string(purl_str) - source_package_url = None - if source_purl: - source_package_url = PackageURL.from_string(source_purl) - - except ValueError as e: - error = f'error occured when parsing purl: {purl_str} source_purl: {source_purl} : {e}' - return error - - has_version = bool(package_url.version) - if has_version: - error = map_debian_metadata_binary_and_source( - package_url=package_url, - source_package_url=source_package_url, - pipelines=pipelines, - priority=priority, - ) - - return error - - -def map_debian_package(debian_package, package_content, pipelines, priority=0): - """ - Add a debian `package_url` to the PackageDB. - - Return an error string if errors have occured in the process. - """ - from minecode.model_utils import add_package_to_scan_queue - from minecode.model_utils import merge_or_create_package - - db_package = None - error = '' - - purl = debian_package.package_url - if package_content == PackageContentType.BINARY: - download_url = debian_package.binary_archive_url - elif package_content == PackageContentType.SOURCE_ARCHIVE: - download_url = debian_package.source_archive_url - - response = requests.get(download_url) - if not response.ok: - msg = f'Package metadata does not exist on debian: {download_url}' - error += msg + '\n' - logger.error(msg) - return db_package, error - - purl_package = PackageData( - type=purl.type, - namespace=purl.namespace, - name=purl.name, - version=purl.version, - qualifiers=purl.qualifiers, - ) - - package, error_metadata = get_debian_package_metadata(debian_package) - if not package: - error += error_metadata - return db_package, error - - package_copyright, error_copyright = get_debian_package_copyright( - debian_package) - package.update_purl_fields(package_data=purl_package, replace=True) - if package_copyright: - update_license_copyright_fields( - package_from=package_copyright, - package_to=package, - replace=True, - ) - else: - error += error_metadata - - # This will be used to download and scan the package - package.download_url = download_url - - # Set package_content value - package.extra_data['package_content'] = package_content - - # If sha1 exists for an archive, we know we can create the package - # Use purl info as base and create packages for binary and source package - sha1 = get_package_sha1(package=package, field="download_url") - if sha1: - package.sha1 = sha1 - db_package, _, _, _ = merge_or_create_package(package, visit_level=50) - else: - msg = f'Failed to retrieve package archive: {purl.to_string()} from url: {download_url}' - error += msg + '\n' - logger.error(msg) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue(db_package, pipelines, priority) - - return db_package, error - - -def get_debian_package_metadata(debian_package): - """ - Given a DebianPackage object with package url and source package url - information, get the .dsc package metadata url, fetch the .dsc file, - parse and return the PackageData object containing the package metadata - for that Debian package. - - If there are errors, return None and a string containing the error - information. 
- """ - error = '' - - metadata_url = debian_package.package_metadata_url - temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) - if not temp_metadata_file: - msg = f'Package metadata does not exist on debian: {metadata_url}' - error += msg + '\n' - logger.error(msg) - return None, error - - packages = DebianDscFileHandler.parse(location=temp_metadata_file) - package = list(packages).pop() - - package.qualifiers = debian_package.package_url.qualifiers - - return package, error - - -def get_debian_package_copyright(debian_package): - """ - Given a DebianPackage object with package url and source package url - information, get the debian copyright file url, fetch and run license - detection, and return the PackageData object containing the package - metadata for that Debian package. - - If there are errors, return None and a string containing the error - information. - """ - error = '' - - metadata_url = debian_package.package_copyright_url - temp_metadata_file = fetch_and_write_file_from_url(url=metadata_url) - if not temp_metadata_file: - msg = f'Package metadata does not exist on debian: {metadata_url}' - error += msg + '\n' - logger.error(msg) - return None, error - - packages = StandaloneDebianCopyrightFileHandler.parse( - location=temp_metadata_file) - package = list(packages).pop() - - package.qualifiers = debian_package.package_url.qualifiers - - return package, error - - -def update_license_copyright_fields(package_from, package_to, replace=True): - fields_to_update = [ - 'copyright', - 'holder', - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', - 'extracted_license_statement' - ] - - for field in fields_to_update: - value = getattr(package_from, field) - if value and replace: - setattr(package_to, field, value) - - -def map_debian_metadata_binary_and_source(package_url, source_package_url, pipelines, priority=0): - """ - Get metadata for the binary and source release of the Debian package - `package_url` and save it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. 
- """ - error = '' - - if "repository_url" in package_url.qualifiers: - base_url = package_url.qualifiers["repository_url"] - elif package_url.namespace == 'ubuntu': - base_url = UBUNTU_BASE_URL - else: - base_url = DEBIAN_BASE_URL - - if "api_data_url" in package_url.qualifiers: - metadata_base_url = package_url.qualifiers["api_data_url"] - elif package_url.namespace == 'ubuntu': - metadata_base_url = UBUNTU_METADATA_URL - else: - metadata_base_url = DEBIAN_METADATA_URL - - package_urls = dict( - package_url=package_url, - source_package_url=source_package_url, - archive_base_url=base_url, - metadata_base_url=metadata_base_url, - ) - debian_package, emsg = DebianPackage.from_purls(package_urls) - if emsg: - return emsg - - binary_package, emsg = map_debian_package( - debian_package, - PackageContentType.BINARY, - pipelines, - priority, - ) - if emsg: - error += emsg - - package_url.qualifiers['classifier'] = 'sources' - source_package, emsg = map_debian_package( - debian_package, - PackageContentType.SOURCE_ARCHIVE, - pipelines, - priority, - ) - if emsg: - error += emsg - - if binary_package and source_package: - make_relationship( - from_package=binary_package, - to_package=source_package, - relationship=PackageRelation.Relationship.SOURCE_PACKAGE, - ) - - return error - - -@attr.s -class DebianPackage: - """ - Contains the package url and source package url for a debian package - necessary to get source, binary, metadata and copyright urls for it. - """ - - archive_base_url = attr.ib(type=str) - metadata_base_url = attr.ib(type=str) - package_url = attr.ib(type=str) - source_package_url = attr.ib(type=str) - metadata_directory_url = attr.ib(type=str, default=None) - archive_directory_url = attr.ib(type=str, default=None) - - @classmethod - def from_purls(cls, package_urls): - """ - Set the directory URLs for metadata and package archives. - """ - debian_package = cls(**package_urls) - error = debian_package.set_debian_directories() - return debian_package, error - - @property - def package_archive_version(self): - """ - Get the useful part of the debian package version used in - source, binary, metadata and copyright URLs optionally. - """ - debvers = DebVersion.from_string(self.package_url.version) - if debvers.revision != "0": - purl_version = f"{debvers.upstream}-{debvers.revision}" - else: - purl_version = debvers.upstream - return purl_version - - @property - def binary_archive_url(self): - """ - Get the .deb debian binary archive url for this debian package. - """ - purl_version = self.package_archive_version - arch = self.package_url.qualifiers.get("arch") - if arch: - archive_name = f"{self.package_url.name}_{purl_version}_{arch}.deb" - else: - archive_name = f"{self.package_url.name}_{purl_version}.deb" - binary_package_url = self.archive_directory_url + f"{archive_name}" - return binary_package_url - - @property - def source_archive_url(self): - """ - Get the debian source tarball archive url for this debian package. 
- """ - debian_source_archive_formats = [ - ".tar.xz", ".tar.gz", ".orig.tar.xz", ".orig.tar.gz", ".orig.tar.bz2" - ] - - source_version = self.package_archive_version - if not self.source_package_url: - source_package_name = self.package_url.name - else: - source_package_name = self.source_package_url.name - if self.source_package_url.version: - source_version = self.source_package_url.version - - for archive_format in debian_source_archive_formats: - if ".orig" in archive_format: - base_version_source = source_version.split('-')[0] - archive_name = f"{source_package_name}_{base_version_source}" + \ - archive_format - else: - archive_name = f"{source_package_name}_{source_version}" + \ - archive_format - source_package_url = self.archive_directory_url + archive_name - response = requests.get(source_package_url) - if response.ok: - break - - return source_package_url - - @property - def package_metadata_url(self): - """ - Get the .dsc metadata file url for this debian package. - """ - metadata_version = self.package_archive_version - if not self.source_package_url: - metadata_package_name = self.package_url.name - else: - metadata_package_name = self.source_package_url.name - if self.source_package_url.version: - metadata_version = self.source_package_url.version - - base_version_metadata = metadata_version.split('+')[0] - metadata_dsc_package_url = self.archive_directory_url + \ - f"{metadata_package_name}_{base_version_metadata}.dsc" - response = requests.get(metadata_dsc_package_url) - if not response.ok: - metadata_dsc_package_url = self.archive_directory_url + \ - f"{metadata_package_name}_{metadata_version}.dsc" - - return metadata_dsc_package_url - - @property - def package_copyright_url(self): - """ - Get the debian copyright file url containing license and copyright - declarations for this debian package. 
- """ - # Copyright files for ubuntu are named just `copyright` and placed under a name-version folder - # instead of having the name-version in the copyright file itself - copyright_file_string = "_copyright" - if self.package_url.namespace == "ubuntu": - copyright_file_string = "/copyright" - - metadata_version = self.package_archive_version - if not self.source_package_url: - metadata_package_name = self.package_url.name - else: - metadata_package_name = self.source_package_url.name - if self.source_package_url.version: - metadata_version = self.source_package_url.version - - copyright_package_url = self.metadata_directory_url + \ - f"{metadata_package_name}_{metadata_version}{copyright_file_string}" - response = requests.get(copyright_package_url) - if not response.ok: - base_version_metadata = metadata_version.split('+')[0] - copyright_package_url = self.metadata_directory_url + \ - f"{metadata_package_name}_{base_version_metadata}{copyright_file_string}" - - return copyright_package_url - - def set_debian_directories(self): - """ - Compute and set base urls for metadata and archives, to get - source/binary - """ - error = '' - - archive_base_url = self.archive_base_url - metadata_base_url = self.metadata_base_url - - index_folder = None - if self.package_url.name.startswith('lib'): - name_wout_lib = self.package_url.name.replace("lib", "") - index_folder = 'lib' + name_wout_lib[0] - else: - index_folder = self.package_url.name[0] - - msg = "No directory exists for package at: " - - package_directory = f"{archive_base_url}{index_folder}/{self.package_url.name}/" - metadata_directory = f"{metadata_base_url}{index_folder}/{self.package_url.name}/" - - response = requests.get(package_directory) - if not response.ok: - if not self.source_package_url: - error = msg + str(package_directory) - return error - - if self.source_package_url.name.startswith('lib'): - name_wout_lib = self.source_package_url.name.replace("lib", "") - index_folder = 'lib' + name_wout_lib[0] - else: - index_folder = self.source_package_url.name[0] - - package_directory = f"{archive_base_url}{index_folder}/{self.source_package_url.name}/" - metadata_directory = f"{metadata_base_url}{index_folder}/{self.source_package_url.name}/" - - response = requests.get(package_directory) - if not response.ok: - error = msg + str(package_directory) - return error - - self.archive_directory_url = package_directory - self.metadata_directory_url = metadata_directory diff --git a/minecode/visitors/dockerhub.py b/minecode/visitors/dockerhub.py deleted file mode 100644 index 92601a5c..00000000 --- a/minecode/visitors/dockerhub.py +++ /dev/null @@ -1,136 +0,0 @@ -# -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -import json -import string -from bs4 import BeautifulSoup - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpVisitor -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI - - -def get_search_conditions(): - """ Return a list of combination of char and char, char and number, number and number. - By doing this, we can pass the conditions to the query API of docker hub, the API does not - support the single char, so we combine two chars as a list. - For example: ['aa', 'ab', .....'a1', 'a2'.....'z9'...] 
- """ - char_list = [] - for char in string.ascii_lowercase: - char_list.append(char) - int_list = [] - for i in range(0, 10): - int_list.append(str(i)) - char_list.extend(int_list) - - conditions = [] - for c in char_list: - for second_c in char_list: - conditions.append(c + second_c) - return conditions - - -class DockerHubSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://hub.docker.com/explore/?page=1' - search_uril_format = 'https://index.docker.io/v1/search?q={condition}&n=100&page=1' - for condition in get_search_conditions(): - # yield a combination of query conditions, the API accepts at least - # two chars for searching conditions. - yield search_uril_format.format(condition=condition) - - -@visit_router.route('https://hub.docker.com/explore/\?page=\d?') -class DockHubExplorePageVisitor(HttpVisitor): - """ - Visit the HTML page of DockerHub Explore Page and yield each uri of the project, and yield the next page of DockHub. - """ - - def get_uris(self, content): - dockhub_library_html_template = 'https://hub.docker.com/{project}' - dockhub_library_restapi_template = 'https://registry.hub.docker.com/v2/repositories/library/{project}' - dockhub_next_page_template = 'https://hub.docker.com/explore/?page={page}' - page_legal = False - page = BeautifulSoup(content, 'lxml') - for a in page.find_all(name='a'): - if 'href' not in a.attrs: - continue - href = a['href'] - if href and href.startswith('/_/'): - page_legal = True - project_name = href[1:] - package_url = PackageURL(type='docker', name=project_name.replace('_/', 'library/').rstrip('/')).to_string() - yield URI(uri=dockhub_library_html_template.format(project=project_name), package_url=package_url, source_uri=self.uri) - yield URI(uri=dockhub_library_restapi_template.format(project=href.partition('/_/')[-1]), package_url=package_url, source_uri=self.uri) - if page_legal: - current_page = int(self.uri.partition('=')[-1]) - next_page = current_page + 1 - yield URI(uri=dockhub_next_page_template.format(page=next_page), source_uri=self.uri) - - -@visit_router.route('https://hub.docker.com/_/[\w\-\.]+/') -class DockHubProjectHTMLVisitor(HttpVisitor): - - def dumps(self, content): - """ - Return the json by parsing the HTML project page - """ - metadata_dict = dict() - page = BeautifulSoup(content, 'lxml') - for div in page.find_all(name='div'): - for span in div.find_all(name='span'): - if span.string == 'Short Description': - next_sibling = div.next_sibling - if next_sibling: - for sibling_span in next_sibling.find_all(name='span'): - sibling_text = sibling_span.string - metadata_dict['summary'] = sibling_text - for h1 in div.find_all(name='h1'): - if h1.string == 'License': - licenses_paras = [] - next_sibling = h1.next_sibling - while(next_sibling): - if next_sibling.string: - licenses_paras.append(next_sibling.string) - next_sibling = next_sibling.next_sibling - if licenses_paras: - metadata_dict['license_text'] = ''.join(licenses_paras) - return json.dumps(metadata_dict) - - -@visit_router.route('https://registry.hub.docker.com/v2/repositories/library/[\w\-\.]+/') -class DockHubLibraryRESTJsonVisitor(HttpJsonVisitor): - """ - Return URIs by parsing the json content of API of Dock Hub library - Note that this class is reuse the parent's function to return json data. 
- """ - - -@visit_router.route('https://index.docker.io/v1/search\?q=\w\w&n=100&page=\d+') -class DockHubGetAllProjectsFromSearchVisitor(HttpJsonVisitor): - def get_uris(self, content): - base_url = 'https://hub.docker.com/v2/repositories/{name}' - num_page = content.get('num_pages') - current_page = content.get('page') - if num_page and current_page: - if int(current_page) < int(num_page): - next_page = int(current_page) + 1 - yield URI(uri=(self.uri.rpartition('=')[0] + '=' + str(next_page)), source_uri=self.uri) - results = content.get('results', {}) - for result in results: - name = result.get('name') - # TODO: This will be used when new Package definition is merged. - star_count = result.get('star_count') - if name: - package_url = PackageURL(type='docker', name=name).to_string() - yield URI(uri=base_url.format(name=name), package_url=package_url, source_uri=self.uri) diff --git a/minecode/visitors/eclipse.py b/minecode/visitors/eclipse.py deleted file mode 100644 index 13fdb8ab..00000000 --- a/minecode/visitors/eclipse.py +++ /dev/null @@ -1,158 +0,0 @@ -# -# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -from bs4 import BeautifulSoup - -from commoncode import fileutils -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import HttpVisitor -from minecode.visitors import URI - - -class EclipseSeed(seed.Seeder): - - def get_seeds(self): - yield 'http://projects.eclipse.org/json/projects/all' - - -@visit_router.route('https://projects.eclipse.org/list-of-projects') -class EclipseProjectVisitors(HttpVisitor): - """ - Visit the HTML page of eclipse projects page and return the Packages info, json data and error. - """ - - def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - for a in page.find_all(name='a'): - if 'href' not in a.attrs: - continue - href = a['href'] - if href and href.startswith('https://projects.eclipse.org/projects/'): - # if the herf content starts with Eclipse single project suffix, generate a URI with the href content - project_name = href.replace('https://projects.eclipse.org/projects/', '') - package_url = PackageURL(type='eclipse', name=project_name).to_string() - yield URI(uri=href, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://projects.eclipse.org/projects/.*') -class EclipseSingleProjectVisitor(HttpVisitor): - """ - Visit the HTML page of single eclipse project. - This is to get the HTML page as metadata, as it's single project and the URI is already collected by - EclipseProjectVisitors https://projects.eclipse.org/list-of-projects, so it won't return any new URI - and the goal is to return HTML page. - - For example:https://projects.eclipse.org/projects/modeling.m2t.accele - """ - pass - - -@visit_router.route('http://git.eclipse.org/c') -class EclipseGitVisitor(HttpVisitor): - """ - Visitor Eclipse Git HTML page and return URIs in the Git HTML page. 
- """ - - def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - for td in page.find_all(name='td'): - if 'class' not in td.attrs: - continue - if td.attrs.get('class') != ['sublevel-repo']: - continue - - for a in td.findChildren(name='a'): - href = a['href'] - name = a.contents[0] - package_url = PackageURL(type='eclipse', name=name).to_string() - yield URI(uri=href, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('http://www.eclipse.org/downloads/packages/all') -class EclipsePackagesVisitor(HttpVisitor): - """ - Visit the Eclipse packages HTML page and return URIs parsed from HTML page. - """ - - def fetch(self, uri, timeout=40): - """ - Fetch and return the content found at a remote uri with an extra timeout - """ - return HttpVisitor.fetch(self, uri, timeout=timeout) - - def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - for td in page.find_all(name='span'): - if 'class' not in td.attrs: - continue - if td.attrs.get('class') != ['field-content']: - continue - - a = td.find(name='a') - href = a['href'] - name = a.contents[0] - # Skip some of the nodes if it's a HTML tag but not a string - if name and isinstance(name, str): - package_url = PackageURL(type='eclipse', name=name).to_string() - yield URI(uri=href, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('http://www.eclipse.org/downloads/packages/release/.*') -class EclipseReleaseVisitor(HttpVisitor): - """ - Visit the Eclipse release HTML page and return expected Package URIs. - """ - - def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - suffix_list = ['-win32.zip', '-win64.exe', '-win32-x86_64.zip', '-linux-gtk-x86_64.tar.gz', - '-linux-gtk-x86_64.tar.gz', '-macosx-cocoa-x86_64.tar.gz', '-linux-gtk.tar.gz', '-x86_64.tar.gz'] - for div in page.find_all(name='div'): - for a in div.find_all(name='a'): - url = a.get('href') - if url and 'download.php?file=' in url: - file_name = fileutils.file_name(url) - name = file_name - for suffix in suffix_list: - name = name.replace(suffix, '') - package_url = PackageURL(type='eclipse', name=name).to_string() - yield URI(uri=url, file_name=file_name, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('http://projects.eclipse.org/json/projects/all') -class EclipseProjectsJsonVisitor(HttpJsonVisitor): - """ - Visit the Ecipse json API and return expected project specified URIs. - """ - - def fetch(self, uri, timeout=40): - """ - Fetch and return the content found at a remote uri with an extra timeout - """ - return HttpJsonVisitor.fetch(self, uri, timeout=timeout) - - def get_uris(self, content): - url_template = 'http://projects.eclipse.org/json/project/{name}' - projects = content.get('projects', {}) - for project in projects: - # TODO: are we sure there is not more data available in this JSON? - package_url = PackageURL(type='eclipse', name=project).to_string() - yield URI(uri=url_template.format(name=project), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('http://projects.eclipse.org/json/project/.*') -class EclipseSingleProjectJsonVisitor(HttpJsonVisitor): - """ - Visit json of a single Eclipse project. This is to return the json - itself without any URIs, as the URI itself is returned by - EclipseProjectsJsonVisitor. - """ - pass diff --git a/minecode/visitors/fdroid.py b/minecode/visitors/fdroid.py deleted file mode 100644 index 81384d80..00000000 --- a/minecode/visitors/fdroid.py +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright (c) nexB Inc. 
and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import codecs -import json -import xmlrpc - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.utils import get_temp_file -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI -from minecode.visitors import Visitor -from minecode.visitors import NonPersistentHttpVisitor - -""" -Visitors for F-Droid package repositories. - -NOTE: the license of F-Droid package data needs to be clarified. -See https://gitlab.com/fdroid/fdroiddata/-/issues/2826 for details - -F-Droid packages come with a main JSON index and possible increment/diffs. -- https://f-droid.org/repo/index-v2.json - -- this is a legacy XMl index https://f-droid.org/repo/index.xml - -- This top level file lists index and diffs https://f-droid.org/repo/entry.json - -- This is a diff example: https://f-droid.org/repo/diff/1666980277000.json - -- Each apk is available from a URL using this form: - - https://f-droid.org/repo/app.seeneva.reader_3.apk - https://f-droid.org/repo/{application_id}_{version_code}.apk - -The {application_id}_{version_code}.apk "file name" for each tarball and -apk file name is listed in the index. -""" - - -class FdroidSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://f-droid.org/repo/index-v2.json' - - -def build_purl(package_id, version_code, filename): - """ - Return a PackageURL for an F-Droid package. - """ - return PackageURL( - type='fdroid', - name=package_id, - version=version_code, - qualifiers=dict(filename=filename) - ) - - -@visit_router.route('https://f-droid.org/repo/index-v2.json') -class FdroidIndexVisitor(NonPersistentHttpVisitor): - """ - Collect package metadata URIs from the F-Droid index for each package. - We treat each apk and corresponding source tarball as a different package. - """ - - def get_uris(self, content): - """ - Yield a URI for each F-Droid package. - """ - json_location = content - with open(json_location) as c: - content = json.loads(c.read()) - - packages = content['packages'] - - for package_id, package_data in packages.items(): - purl = PackageURL(type='fdroid', name=package_id).to_string() - yield URI( - uri=purl, - package_url=purl, - source_uri=self.uri, - data=json.dumps(package_data, separators=( - ',', ':'), ensure_ascii=False), - # note: visited is True since there nothing more to visit - visited=True - ) diff --git a/minecode/visitors/freebsd.py b/minecode/visitors/freebsd.py deleted file mode 100644 index 241db9ad..00000000 --- a/minecode/visitors/freebsd.py +++ /dev/null @@ -1,89 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
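
# A small usage sketch for build_purl() above, assuming the packageurl-python
# API; the application id and version code mirror the
# {application_id}_{version_code}.apk naming convention described in the
# module docstring and are illustrative.
from packageurl import PackageURL

purl = PackageURL(
    type='fdroid',
    name='app.seeneva.reader',
    version='3',
    qualifiers=dict(filename='app.seeneva.reader_3.apk'),
)
# pkg:fdroid/app.seeneva.reader@3?filename=app.seeneva.reader_3.apk
print(purl.to_string())
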
-#
-
-
-import logging
-import os
-
-from bs4 import BeautifulSoup
-
-from minecode import seed
-from minecode import visit_router
-from minecode.utils import extract_file
-from minecode.visitors import HttpVisitor
-from minecode.visitors import NonPersistentHttpVisitor
-from minecode.visitors import URI
-
-logger = logging.getLogger(__name__)
-handler = logging.StreamHandler()
-logger.addHandler(handler)
-logger.setLevel(logging.INFO)
-
-
-class FreeBSDSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'https://pkg.freebsd.org'
-
-
-@visit_router.route('https://pkg.freebsd.org')
-class FreeBSDBaseHTMLVisitors(HttpVisitor):
-    """
-    Visit the FreeBSD repository home page and yield a URI for each FreeBSD repo.
-    """
-
-    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        base_url = 'https://pkg.freebsd.org/{path}/'
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs:
-                continue
-            href = a['href']
-            # the useful sub-links look like: FreeBSD:13:aarch64
-            if href and href.startswith('FreeBSD%3A'):
-                url = base_url.format(path=href)
-                yield URI(uri=url, source_uri=self.uri)
-
-
-@visit_router.route('https://pkg.freebsd.org/.*/')
-class FreeBSDSubHTMLVisitors(HttpVisitor):
-    """
-    Visit a sub-repo URL and yield all URIs found in the page and in its child pages.
-    """
-
-    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        base_url = self.uri + '{path}'
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs or 'title' not in a.attrs:
-                # the parent link doesn't have a title.
-                continue
-            href = a['href']
-            url = base_url.format(path=href)
-            yield URI(uri=url, source_uri=self.uri)
-
-
-@visit_router.route('https://pkg.freebsd.org/.*packagesite.txz')
-class FreeBSDIndexVisitors(NonPersistentHttpVisitor):
-    """
-    Extract the packagesite.txz index file and return the data of its
-    packagesite.yaml file.
-    """
-
-    def dumps(self, content):
-        """
-        Extract the packagesite.yaml file and return its content.
-        """
-        extracted_location = extract_file(content)
-        manifest_file = os.path.join(extracted_location, 'packagesite.yaml')
-        if os.path.exists(manifest_file):
-            with open(manifest_file) as file_handler:
-                return file_handler.read()
-        else:
-            logger.warning(
-                'The packagesite.yaml file does not exist in the index file: ' + content)
diff --git a/minecode/visitors/freedesktop.py b/minecode/visitors/freedesktop.py
deleted file mode 100644
index 52987855..00000000
--- a/minecode/visitors/freedesktop.py
+++ /dev/null
@@ -1,48 +0,0 @@
-#
-# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
-#
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-from bs4 import BeautifulSoup
-
-from packageurl import PackageURL
-
-from minecode import seed
-from minecode import visit_router
-from minecode.visitors import HttpVisitor
-from minecode.visitors import URI
-
-
-class FreedesktopSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'https://www.freedesktop.org/wiki/Software'
-
-
-@visit_router.route('https://www.freedesktop.org/wiki/Software')
-class FreedesktopHTMLVisitor(HttpVisitor):
-    """
-    Visit the Freedesktop Software HTML page and return URIs parsed from the HTML page.
- """ - def get_uris(self, content): - url_template = 'https://www.freedesktop.org/wiki/Software/{name}' - page = BeautifulSoup(content, 'lxml') - for div in page.find_all(name='div'): - for a in div.find_all(name='a'): - if 'href' not in a.attrs: - continue - href = a['href'] - if href and href.startswith('./'): - project_name = href.replace('./', '').strip('/') - package_url = PackageURL(type='freedesktop', name=project_name).to_string() - yield URI(uri=url_template.format(name=project_name), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://www.freedesktop.org/wiki/Software/.*') -class FreedesktopProjectHTMLVisitor(HttpVisitor): - """ - Visit the Freedesktop Project HTML page. - """ - pass diff --git a/minecode/visitors/github.py b/minecode/visitors/github.py deleted file mode 100644 index 665db404..00000000 --- a/minecode/visitors/github.py +++ /dev/null @@ -1,212 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -from collections import OrderedDict -from datetime import date -from datetime import datetime -import json -import logging - -from github.MainClass import Github -from github.Repository import Repository -from github.Download import Download -from packageurl import PackageURL - -from minecode import priority_router -from minecode import visit_router, seed -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI -from minecode.visitors.generic import map_fetchcode_supported_package - - -logger = logging.getLogger(__name__) - -TRACE = False -if TRACE: - handler = logging.StreamHandler() - logger.addHandler(handler) - logger.setLevel(logging.INFO) - - -class GithubSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://api.github.com/repositories?since=0' - - -@visit_router.route('https://api.github.com/repositories\?since=\d+') -class GithubReposVisitor(HttpJsonVisitor): - """ - Visitor to run repositories request to get all repositories by increasing since symbol 100 each loop time. - Refer to: https://developer.github.com/v3/repos/#list-all-public-repositories - https://api.github.com/repositories - """ - def get_uris(self, content): - repo_request_base = 'https://api.github.com/repositories?since=' - has_content = False - if content: - for entry in content: - has_content = True - url = entry.get('url') - # Take full_name instead of name here since we want to keep more info, especially when forming the package url - # "name": "grit", - # "full_name": "mojombo/grit", - name = entry.get('full_name') - if url: - package_url = None - if name: - package_url = PackageURL(type='github', name=name).to_string() - # Yield URI for GithubSingleRepoVisitor use - yield URI(uri=url, package_url=package_url, source_uri=self.uri) - if not has_content: - logger.info('The content of the response is empty, the processing might be finished for URI: {}'.format(self.uri)) - else: - uri = self.uri - current_id = uri.replace('https://api.github.com/repositories?since=', '') - current_id = int(current_id) - # 100 is fixed since each page has 100 entries. Plus 100 means to go from next page. 
-            new_id = current_id + 100
-            new_url = repo_request_base + str(new_id)
-            yield URI(uri=new_url, source_uri=self.uri)
-
-
-@visit_router.route('https://api.github.com/repos/[\w\-\.]+/[\w\-\.]+')
-class GithubSingleRepoVisitor(HttpJsonVisitor):
-    """
-    Visitor to get the JSON for one repo and enrich it with more content from
-    the GitHub API.
-    For example: https://api.github.com/repos/mojombo/grit
-    """
-
-    def fetch(self, uri, timeout=None):
-        """
-        Use a dedicated fetch function instead of inheriting it from the
-        HttpJsonVisitor class: the JSON itself has lots of URL info, and the
-        GitHub API can get this content without explicitly accessing the URLs
-        inside the JSON. The main idea is to fetch the download_url...
-        """
-        full_name = uri.replace('https://api.github.com/repos/', '')
-        g = Github()
-        repo = g.get_repo(full_name)
-
-        common_data = OrderedDict(
-            name=repo.name,
-            description=repo.description,
-            blobs_url=repo.blobs_url,
-            language=repo.language,
-            size=repo.size,
-            homepage=repo.homepage,
-            html_url=repo.html_url,
-            etag=repo.etag,
-            full_name=repo.full_name,
-            repo_id=repo.id,
-            ssh_url=repo.ssh_url,
-            source_url=repo.svn_url,
-            clone_url=repo.clone_url,
-            watchers_count=repo.watchers,
-            master_branch=repo.master_branch,
-            updated_at=json_serial_date_obj(repo.updated_at),
-            pushed_at=json_serial_date_obj(repo.pushed_at),
-        )
-
-        if repo.owner:
-            common_data['owner'] = repo.owner.name
-        if repo._issues_url:
-            common_data['issue_url'] = repo._issues_url.value
-
-        if repo._git_url:
-            common_data['git_url'] = repo._git_url.value
-
-        if repo.organization:
-            common_data['organization'] = repo.organization.name
-
-        downloads = []
-        if repo.get_downloads():
-            for download in list(repo.get_downloads()):
-                downloads.append(OrderedDict(
-                    name=download.name,
-                    url=download.url,
-                    size=download.size,
-                    s3_url=download.s3_url,
-                    created_at=json_serial_date_obj(download.created_at),
-                    download_count=download.download_count,
-                    description=download.description,
-                    redirect=download.redirect,
-                    signature=download.signature,
-                    html_url=download.html_url,
-                    bucket=download.bucket,
-                    acl=download.acl,
-                    accesskeyid=download.accesskeyid,
-                    expirationdate=json_serial_date_obj(download.expirationdate),
-                ))
-        common_data['downloads'] = downloads
-
-        tags = []
-        if repo.get_tags():
-            for tag in list(repo.get_tags()):
-                tag_info = OrderedDict(
-                    name=tag.name,
-                    tarball_url=tag.tarball_url,
-                    zipball_url=tag.zipball_url,
-                )
-                if tag.commit:
-                    tag_info['sha1'] = tag.commit.sha
-                tags.append(tag_info)
-        common_data['tags'] = tags
-
-        if not common_data.get('tags') and not common_data.get('downloads'):
-            # If there are no downloads and no tags, build the download_url by
-            # appending archive/master.zip to the HTML URL.
-            # For example, the base html_url is: https://github.com/collectiveidea/calendar_builder
-            # The final download_url is https://github.com/collectiveidea/calendar_builder/archive/master.zip
-            branches_download_urls = []
-            download_url_bases = '{html_url}/archive/{branch_name}.zip'
-            if repo.get_branches():
-                for branch in list(repo.get_branches()):
-                    branches_download_urls.append(download_url_bases.format(html_url=common_data.get('html_url'), branch_name=branch.name))
-            common_data['branches_download_urls'] = branches_download_urls
-
-        common_data['labels'] = []
-        if repo.get_labels():
-            for label in repo.get_labels():
-                common_data['labels'].append(label.name)
-
-        return json.dumps(common_data)
-
-
-def json_serial_date_obj(obj):
-    """JSON serializer for date objects"""
-    if obj and isinstance(obj, (datetime, date)):
-        return obj.isoformat()
-
-
-# Indexing GitHub PURLs requires a GitHub API token.
-# Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`.
-@priority_router.route('pkg:github/.*')
-def process_request_dir_listed(purl_str, **kwargs):
-    """
-    Process `priority_resource_uri` containing a GitHub Package URL (PURL).
-
-    This involves obtaining Package information for the PURL using
-    https://github.com/aboutcode-org/fetchcode and using it to create a new
-    PackageDB entry. The package is then added to the scan queue afterwards.
-    """
-    from minecode.model_utils import DEFAULT_PIPELINES
-
-    addon_pipelines = kwargs.get('addon_pipelines', [])
-    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
-    priority = kwargs.get('priority', 0)
-
-    try:
-        package_url = PackageURL.from_string(purl_str)
-    except ValueError as e:
-        error = f"error occurred when parsing {purl_str}: {e}"
-        return error
-
-    error_msg = map_fetchcode_supported_package(package_url, pipelines, priority)
-
-    if error_msg:
-        return error_msg
diff --git a/minecode/visitors/gitlab.py b/minecode/visitors/gitlab.py
deleted file mode 100644
index 82e7eb9b..00000000
--- a/minecode/visitors/gitlab.py
+++ /dev/null
@@ -1,86 +0,0 @@
-#
-# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
-#
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-from packageurl import PackageURL
-
-from minecode import seed
-from minecode.utils import get_http_response
-from minecode import visit_router
-
-from minecode.visitors import HttpJsonVisitor
-from minecode.visitors import HttpVisitor
-from minecode.visitors import URI
-
-
-class GitlabSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'https://gitlab.com/api/v4/projects'
-
-
-@visit_router.route('https://gitlab.com/api/v4/projects')
-class GitlabAPIHeaderVisitor(HttpVisitor):
-    """
-    Get the headers of the API response, parse the page size and total page
-    count from them, and yield URLs for further visiting by GitlabAPIVisitor.
-    """
-
-    def fetch(self, uri, timeout=10):
-        """
-        Return only the headers of the response.
-        """
-        return get_http_response(uri, timeout).headers
-
-    def get_uris(self, content):
-        new_page_template = 'https://gitlab.com/api/v4/projects?page={next_page}&per_page={per_page}&statistics=true'
-
-        page_size = content.get('X-Per-Page')
-        total_pages = content.get('X-Total-Pages')
-        if page_size and total_pages:
-            total_pages = int(total_pages)
-            for i in range(total_pages):
-                # Use the loop to yield the URI of the next page for the visitor.
-                nextpage_url = new_page_template.format(next_page=i + 1, per_page=page_size)
-                yield URI(uri=nextpage_url, source_uri=self.uri, visited=False)
-
-
-@visit_router.route('https://gitlab.com/api/v4/projects\?page=\d+&per_page=\d+&statistics=true')
-class GitlabAPIVisitor(HttpJsonVisitor):
-    """
-    Return URIs from the JSON content of one API page returned from the GitLab
-    API. This yields the "web_url" from each package in the current JSON page.
-    """
-
-    def get_uris(self, content):
-        """Yield URIs from the JSON content. The passed content is JSON like
-        this example:
-        [
-            {
-                "id": 6377679,
-                ...
-                "web_url": "https://gitlab.com/prithajnath/cnn-keras",
-                ...
-            },
-            {
-                ..
-                "web_url": "https://gitlab.com/janpoboril/rules-bug",
-                ...
-            }
-            ...
-        ]
-        Each element in the list is a dictionary; we use its web_url for the
-        visitor and also return the data.
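
# A minimal sketch of the header-driven pagination used by
# GitlabAPIHeaderVisitor above: GitLab returns X-Per-Page and X-Total-Pages
# response headers, and one page URL is generated per page. The header values
# here are illustrative.
headers = {'X-Per-Page': '100', 'X-Total-Pages': '3'}

page_template = 'https://gitlab.com/api/v4/projects?page={next_page}&per_page={per_page}&statistics=true'
page_size = headers.get('X-Per-Page')
total_pages = int(headers.get('X-Total-Pages'))
urls = [page_template.format(next_page=i + 1, per_page=page_size) for i in range(total_pages)]
# pages 1, 2 and 3, each with per_page=100
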
- """ - - if not content: - # If the page is empty, just return - return - for element in content: - # The element is one package in the list of current returned page. - url = element.get('web_url') - if url: - project_name = url.rpartition('/')[-1] - package_url = PackageURL(type='gitlab', name=project_name).to_string() - yield URI(uri=url, package_url=package_url, data=element, source_uri=self.uri, visited=False) diff --git a/minecode/visitors/golang.py b/minecode/visitors/golang.py deleted file mode 100644 index 12f5a6b5..00000000 --- a/minecode/visitors/golang.py +++ /dev/null @@ -1,221 +0,0 @@ -# -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -import json - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router - -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI - - -class GoLangSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://api.godoc.org/packages' - - -@visit_router.route('https://api.godoc.org/packages') -class GodocIndexVisitor(NonPersistentHttpVisitor): - """ - Collect Golang URIs for packages available in the Go doc index. - """ - - def get_uris(self, content): - """ - Return URIs to search the API further for a package - """ - seen_paths = set() - for path, package in get_packages(content): - package_url, path = parse_package_path(path) - if path in seen_paths: - continue - seen_paths.add(path) - - # note the addition of a * at the end of the search string... - # without this the returned data are sparse - details_url = 'https://api.godoc.org/search?q={path}*'.format(**locals()) - host = get_well_known_host(path) - # If the path belongs github/bitbucket, yield a repo too - if host: - # keep github, bitbucket... as type: - repo_type, _, _ = host.lower().partition('.') # NOQA - repo_url = 'https://{namespace}/{name}'.format(**package_url.to_dict()) - repo_purl = PackageURL( - type=repo_type, - namespace=package_url.namespace, - name=package_url.name, - qualifiers=dict(package_url=package_url.to_string()) - ).to_string() - - yield URI(uri=repo_url, package_url=repo_purl, source_uri=self.uri) - - yield URI(uri=details_url, - package_url=package_url.to_string(), - source_uri=self.uri) - - else: - yield URI(uri=details_url, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://api\.godoc\.org/search\?q=.*') -class GodocSearchVisitor(NonPersistentHttpVisitor): - """ - Collect URIs and data through the godoc searchi API. - """ - - def get_uris(self, content): - seen_paths = set() - for path, package in get_packages(content): - package_url, path = parse_package_path(path) - if path in seen_paths: - continue - seen_paths.add(path) - - purl = package_url.to_string() - yield URI( - # NOTE: here we use a previsited PURL as URI - uri=purl, - package_url=purl, - source_uri=self.uri, - # the data contains some popcounts and a description - data=package, - visited=True) - - -def get_packages(packages_json_location): - """ - Yield a path and mapping of Go package raw data from a JSON data location. 
- { - "name": "aws", - "path": "github.com/aws/aws-sdk-go/aws", - "import_count": 13623, - "synopsis": "Package aws provides the core SDK's utilities and shared types.", - "stars": 4218, - "score": 0.99 - }, - """ - with open(packages_json_location) as f: - data = json.load(f) - for package in data.get('results', []): - path = package['path'] - if path and not is_standard_import(path): - yield path, package - - -def is_standard_import(path): - """ - Return True if a Go import path is for a standard library import - """ - standard_packages = ( - 'archive', - 'bufio', - 'builtin', - 'bytes', - 'compress', - 'container', - 'context', - 'crypto', - 'database', - 'debug', - 'encoding', - 'expvar', - 'flag', - 'fmt', - 'go', - 'hash', - 'html', - 'image', - 'index', - 'io', - 'log', - 'math', - 'mime', - 'net', - 'os', - 'path', - 'plugin', - 'reflect', - 'regexp', - 'runtime', - 'sort', - 'strconv', - 'strings', - 'sync', - 'syscall', - 'testing', - 'text', - 'time', - 'unsafe', - 'golang.org/x/benchmarks', - 'golang.org/x/blog', - 'golang.org/x/build', - 'golang.org/x/crypto', - 'golang.org/x/debug', - 'golang.org/x/image', - 'golang.org/x/mobile', - 'golang.org/x/net', - 'golang.org/x/perf', - 'golang.org/x/review', - 'golang.org/x/sync', - 'golang.org/x/sys', - 'golang.org/x/text', - 'golang.org/x/time', - 'golang.org/x/tools', - 'golang.org/x/tour', - 'golang.org/x/exp' - ) - - return path.startswith(standard_packages) - - -repo_hosters = 'bitbucket.org/', 'github.com/', 'gitlab.com/' - - -def get_well_known_host(path): - """ - Return a host if this path is from a well known hoster or None. - """ - if path.startswith(repo_hosters): - host, _, _ = path.partition('.') - return host - - -def parse_package_path(path): - """ - Return a PackageURL and transformed path given a path to a Go import. - """ - path = path or '' - segments = path.split('/') - - host = get_well_known_host(path) - qualifiers = None - if host: - # keep only the first few segments - segments = segments[:3] - repo_url = 'https://' + '/'.join(segments) - qualifiers = dict(vcs_repository=repo_url) - namespace = None - if len(segments) > 1: - namespace = segments[:-1] - namespace = '/'.join(namespace) - - name = segments[-1] - - path = '/'.join(segments) - - package_url = PackageURL( - type='golang', - namespace=namespace, - name=name, - qualifiers=qualifiers - ) - - return package_url, path diff --git a/minecode/visitors/googlecode.py b/minecode/visitors/googlecode.py deleted file mode 100644 index b924e5fc..00000000 --- a/minecode/visitors/googlecode.py +++ /dev/null @@ -1,152 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) 2014 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
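
# A quick sketch of what parse_package_path() above returns for a well-known
# hoster import path; the path is illustrative. The path is truncated to its
# first three segments and the purl carries a vcs_repository qualifier:
package_url, path = parse_package_path('github.com/aws/aws-sdk-go/aws')
# path == 'github.com/aws/aws-sdk-go'
# package_url.namespace == 'github.com/aws'
# package_url.name == 'aws-sdk-go'
# package_url.qualifiers == {'vcs_repository': 'https://github.com/aws/aws-sdk-go'}
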
-# - -from __future__ import absolute_import -from __future__ import unicode_literals - -from datetime import datetime -import os - -from bs4 import BeautifulSoup - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.utils import extract_file -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import HttpVisitor -from minecode.visitors import URI - - -class GooglecodeSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://code.google.com/archive/search?q=domain:code.google.com' - yield 'https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip' - - -@visit_router.route('https://storage.googleapis.com/google-code-archive/google-code-archive.txt.zip') -class GooglecodeArchiveVisitor(NonPersistentHttpVisitor): - """ - Fetch the googlecode archive file and extract it, and read the text file and get the URLs - """ - - def get_uris(self, content): - """ - Return URIs by extracting and parsing the text file. - - Please refer to: https://github.com/pombredanne/swh-fetcher-googlecode - - For example, with Google - Cloud Storage URL gs://google-code-archive/v2/code.google/hg4j/project.json, - you can get the file's contents by URL-escaping the string and adding it to - googleapis.com. e.g. - https://www.googleapis.com/storage/v1/ - b/google-code-archive/o/v2%2Fcode.google.com%2Fhg4j%2Fproject.json?alt=media - """ - extracted_location = extract_file(content) - text_file = os.path.join(extracted_location, 'google-code-archive.txt') - url_base = 'https://www.googleapis.com/storage/v1/b/{project_info}?alt=media' - if os.path.exists(text_file): - with open(text_file) as project_file: - for project_line in project_file: - if not project_line: - continue - project_line = project_line.strip() - if project_line.startswith('gs://google-code-archive/v2') and project_line.endswith('/project.json'): - project_line = project_line.replace('gs://google-code-archive/v2', '') - package_name = project_line.replace('/project.json', '') - package_url = PackageURL(type='googlecode', name=package_name.strip('/')).to_string() - project_line = 'google-code-archive/o/v2' + project_line.replace('/', '%2F') - url = url_base.format(project_info=project_line) - yield URI(uri=url, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2.*project.json\?alt=media') -class GoogleAPIProjectJsonVisitor(HttpJsonVisitor): - """ - Fetch the json of the API URL and this will be used for mapper use. - """ - pass - - -@visit_router.route('https://code.google.com/archive/search\?q=domain:code.google.com', - 'https://code.google.com/archive/search\?q=domain:code.google.com&page=[0-9]*') -class GoogleProjectPagesVisitor(HttpVisitor): - """ - Parse the passing google projects list pages, and return all project json url - which the project belongs to in the current page, and the next page url. 
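
# A sketch of the gs:// to googleapis.com URL rewrite described above: the
# object path under the bucket is URL-escaped ('/' becomes %2F) and appended
# to the storage API URL. The project line below is illustrative.
line = 'gs://google-code-archive/v2/code.google.com/hg4j/project.json'
object_path = line.replace('gs://google-code-archive/v2', '')
escaped = 'google-code-archive/o/v2' + object_path.replace('/', '%2F')
url = 'https://www.googleapis.com/storage/v1/b/{}?alt=media'.format(escaped)
# -> https://www.googleapis.com/storage/v1/b/google-code-archive/o/v2%2Fcode.google.com%2Fhg4j%2Fproject.json?alt=media
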
- """ - - def get_uris(self, content): - """ - Return URIs for pagnitions of project lists - """ - page = BeautifulSoup(content, 'lxml') - projectjson_url_template = 'https://storage.googleapis.com/google-code-archive/v2/code.google.com/{project}/project.json' - for page in page.find_all('a'): - url = page['href'] - if url and 'https://code.google.com/archive/p/' in url: - project_name = url.replace('https://code.google.com/archive/p/', '') - project_api_url = projectjson_url_template.format(project=project_name) - package_url = PackageURL(type='googlecode', name=project_name.strip('/')).to_string() - yield URI(uri=project_api_url, package_url=package_url, source_uri=self.uri) - if page.text.startswith('Next'): - yield URI(uri=url, source_uri=self.uri) - - -@visit_router.route('https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/project.json') -class GoogleProjectJsonVisitor(HttpJsonVisitor): - """ - Collect the project json for mapper use and also return the download page json url. - """ - - def get_uris(self, content): - """ - Return the download json URL - """ - yield URI(uri=self.uri.replace('project.json', 'downloads-page-1.json')) - - -@visit_router.route('https://storage.googleapis.com/google-code-archive/v2/code.google.com/.*/downloads-page-[0-9]*.json') -class GoogleDownloadsPageJsonVisitor(HttpJsonVisitor): - """ - Collect download URIs and the next page related to the current download page. - """ - - def get_uris(self, content): - """Yield the next download page based on current page number and total page number. - and yield the download urls in the json, for example: - https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/hg4j/hg4j_1.2m2.jar - """ - url = self.uri - page_num = content.get('pageNumber') - total_pages = content.get('totalPages') - name_template = 'downloads-page-{page}.json' - filename = name_template.format(page=str(page_num)) - new_filename = name_template.format(page=str(page_num + 1)) - - assert filename in url - if page_num < total_pages: - new_page_url = url.replace(filename, new_filename) - yield URI(uri=new_page_url, source_uri=self.uri, ) - - download_url_template = url.replace(filename, '') + '{file_name}' - for download in content.get('downloads', []): - file_name = download.get('filename') - package_url = PackageURL(type='googlecode', name=file_name).to_string() - if '_' in file_name and '.' in file_name: - partitions = file_name.partition('_') - package_name = partitions[0] - version = partitions[-1].rpartition('.')[0] - package_url = PackageURL(type='googlecode', name=package_name, version=version).to_string() - download_url = download_url_template.format(file_name=file_name) - last_modified_date = None - release_date = download.get('releaseDate') - if release_date: - last_modified_date = datetime.fromtimestamp(release_date) - yield URI(uri=download_url, package_url=package_url, file_name=file_name, source_uri=self.uri, date=last_modified_date, size=download.get('fileSize'), sha1=download.get('sha1Checksum')) diff --git a/minecode/visitors/gstreamer.py b/minecode/visitors/gstreamer.py deleted file mode 100644 index 1f8637a9..00000000 --- a/minecode/visitors/gstreamer.py +++ /dev/null @@ -1,62 +0,0 @@ -# -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. 
-# - -from __future__ import absolute_import -from __future__ import unicode_literals - -from bs4 import BeautifulSoup - -from commoncode.fileutils import file_base_name -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpVisitor -from minecode.visitors import URI - - -class GstreamerSeed(seed.Seeder): - is_active = False - - def get_seeds(self): - yield 'https://gstreamer.freedesktop.org/src/' - - -@visit_router.route('https://gstreamer.freedesktop.org/src/([\w\-\.]+/)*') -class GstreamerHTMLVisitor(HttpVisitor): - """ - Visit the HTML page of gstreamer. Yield the uri which can be used for the next visitor use or the uri stands for the file resource. - The regex is to match: - https://gstreamer.freedesktop.org/src/ - https://gstreamer.freedesktop.org/src/gst-openmax/pre/ - """ - - def get_uris(self, content): - page = BeautifulSoup(content, 'lxml') - url_template = self.uri + '{sub_path}' - for a in page.find_all(name='a'): - if 'href' not in a.attrs: - continue - href = a['href'] - if href: - # For parent folder link or other unrelated links, ignore - if href.startswith('/') or href.startswith('?'): - continue - if href.endswith('/'): - # If the path is folder, yield it for the next visitor use. - yield URI(uri=url_template.format(sub_path=href), source_uri=self.uri) - else: - # If it's the file resource, form the package_url and yield the URI with package url info - # For example: gst-openmax-0.10.0.4.tar.bz2 - file_name = href - file_name_without_prefix = file_base_name(file_name) - if '-' in file_name_without_prefix: - project_name_versions = file_name.rpartition('-') - project_name = project_name_versions[0] - version = project_name_versions[-1] - else: - project_name = file_name - version = None - package_url = PackageURL(type='gstreamer', name=project_name, version=version).to_string() - yield URI(uri=url_template.format(sub_path=href), package_url=package_url, file_name=file_name, source_uri=self.uri) diff --git a/minecode/visitors/haxe.py b/minecode/visitors/haxe.py deleted file mode 100644 index 9395ada1..00000000 --- a/minecode/visitors/haxe.py +++ /dev/null @@ -1,83 +0,0 @@ -# -# Copyright (c) 2018 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -from bs4 import BeautifulSoup - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import HttpVisitor -from minecode.visitors import URI - - -class HaxeSeed(seed.Seeder): - is_active = False - - def get_seeds(self): - yield 'https://lib.haxe.org/all' - - -@visit_router.route('https://lib.haxe.org/all') -class HaxeProjectsVisitor(HttpVisitor): - """ - Visit the Haxe all projects page and yield uri of each project. - """ - - def get_uris(self, content): - """ - Parse the HTML to get project name, and format the url with this project name into a version URL. 
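
# A sketch of the name/version split used by GstreamerHTMLVisitor above:
# strip the archive extension, then split on the last '-' to separate the
# project name from the version. The filename is illustrative.
file_name = 'gst-openmax-0.10.0.4.tar.bz2'
base = file_name
for ext in ('.tar.bz2', '.tar.gz', '.tar.xz'):
    if base.endswith(ext):
        base = base[:-len(ext)]
        break
name, _, version = base.rpartition('-')
# name == 'gst-openmax', version == '0.10.0.4'
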
-        For example: https://lib.haxe.org/p/openfl/versions/
-        """
-        version_url_template = 'https://lib.haxe.org{project_href}versions'
-        page = BeautifulSoup(content, 'lxml')
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs:
-                continue
-            href = a['href']
-            if href and href.startswith('/p/'):
-                project_name = href.replace('/p/', '').rstrip('/')
-                package_url = PackageURL(type='haxe', name=project_name).to_string()
-                yield URI(uri=version_url_template.format(project_href=href), package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('https://lib.haxe.org/p/[\w\-\.]+/versions')
-class HaxeVersionsVisitor(HttpVisitor):
-    """
-    Visit the version page of a project and yield the URI of each version.
-    For example: https://lib.haxe.org/p/openfl/versions
-    """
-
-    def get_uris(self, content):
-        """
-        Yield the haxelib JSON URL for each version, for example:
-        https://lib.haxe.org/p/openfl/8.6.4/raw-files/openfl/package.json
-        """
-        version_url_template = 'https://lib.haxe.org/p/{project}/{version}/raw-files/{project}/package.json'
-        page = BeautifulSoup(content, 'lxml')
-        for a in page.find_all(name='a'):
-            if 'href' not in a.attrs:
-                continue
-            href = a['href']
-            if href and href.startswith('/p/') and href.endswith('/'):
-                # Parse the href if it contains the version info:
-                project_version = href.replace('/p/', '').rstrip('/')
-                project_version = project_version.split('/')
-                if len(project_version) == 2:
-                    # there is only one slash between project and version, e.g. openfl/8.6.3
-                    project = project_version[0]
-                    version = project_version[1]
-                    package_url = PackageURL(type='haxe', name=project, version=version).to_string()
-                    yield URI(uri=version_url_template.format(project=project, version=version), package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('https://lib.haxe.org/p/[\w\-\.]+/[\w\-\.]+/raw-files/[\w\-\.]+/package.json')
-class HaxePackageJsonVisitor(HttpJsonVisitor):
-    """
-    Empty visitor to get the package JSON content only.
-    """
-    pass
diff --git a/minecode/visitors/maven.py b/minecode/visitors/maven.py
deleted file mode 100644
index d08ffb21..00000000
--- a/minecode/visitors/maven.py
+++ /dev/null
@@ -1,1648 +0,0 @@
-#
-# Copyright (c) nexB Inc. and others. All rights reserved.
-# purldb is a trademark of nexB Inc.
-# SPDX-License-Identifier: Apache-2.0
-# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
-# See https://github.com/aboutcode-org/purldb for support or download.
-# See https://aboutcode.org for more information about nexB OSS projects.
-# - -from collections import namedtuple -from typing import Dict -from urllib.parse import urlparse -import gzip -import hashlib -import io -import json -import logging -import os -import re - -from bs4 import BeautifulSoup -from dateutil import tz -import arrow -import requests - -from jawa.util.utf import decode_modified_utf8 -import javaproperties - -from packageurl import PackageURL -from packagedcode.maven import build_filename -from packagedcode.maven import build_url -from packagedcode.maven import get_urls -from packagedcode.maven import get_maven_pom -from packagedcode.maven import _parse - -from minecode import priority_router -from minecode import seed -from minecode import visit_router -from minecode.visitors import java_stream -from minecode.visitors import HttpVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI -from minecode.utils import validate_sha1 -from packagedb.models import make_relationship -from packagedb.models import PackageContentType -from packagedb.models import PackageRelation -from packagedb.models import make_relationship - -""" -This module handles the Maven repositories such as central and other -nexus-based maven repositories. This is dubbed the maven2 format for the -repository and support the v4 POM format. - -Old Maven1 format repositories are not supported (e.g. with jars, -sources, poms directories and POM format v2/v3). -""" - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -TRACE = False -TRACE_DEEP = False - -if TRACE: - logger.setLevel(logging.DEBUG) - -MAVEN_BASE_URL = 'https://repo1.maven.org/maven2' - - -class GzipFileWithTrailing(gzip.GzipFile): - """ - A subclass of gzip.GzipFile supporting files with trailing garbage. Ignore - the garbage. - """ - # TODO: what is first_file?? - first_file = True - gzip_magic = b'\037\213' - has_trailing_garbage = False - - def _read_gzip_header(self): - # read the first two bytes - magic = self.fileobj.read(2) - # rewind two bytes back - self.fileobj.seek(-2, os.SEEK_CUR) - is_gzip = magic != self.gzip_magic - if is_gzip and not self.first_file: - self.first_file = False - self.has_trailing_garbage = True - raise EOFError('Trailing garbage found') - - self.first_file = False - gzip.GzipFile._read_gzip_header(self) - - -class MavenSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz' - yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties' - # yield 'https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.457.gz' - # yield 'http://jcenter.bintray.com/' - # yield 'https://repo2.maven.org/maven2/.index/nexus-maven-repository-index.gz' - # other repos: http://stackoverflow.com/a/161846/302521 - # 1. google has a mirror https://www.infoq.com/news/2015/11/maven-central-at-google - # https://maven-central.storage.googleapis.com/repos/central/data/.index/nexus-maven-repository-index.properties - # 2. apache has a possible mirro at http://repo.maven.apache.org/maven2/.index/nexus-maven-repository-index.properties - # 3. 
ibiblio has an out of date mirror that has no directory listing and was last updated on 20161121171437 - # clojars is not a mirror, but its own repo: https://clojars.org/repo/.index/ - # other mirrors https://www.google.com/search?q=allinurl%3A%20.index%2Fnexus-maven-repository-index.properties&pws=0&gl=us&gws_rd=cr - # also has a npm mirrors: https://maven-eu.nuxeo.org/nexus/#view-repositories;npmjs~browsestorage - - -def get_pom_text(namespace, name, version, qualifiers={}, base_url=MAVEN_BASE_URL): - """ - Return the contents of the POM file of the package described by the purl - field arguments in a string. - """ - # Create URLs using purl fields - if qualifiers and not isinstance(qualifiers, Dict): - return - urls = get_urls( - namespace=namespace, - name=name, - version=version, - qualifiers=qualifiers, - base_url=base_url, - ) - if not urls: - return - # Get and parse POM info - pom_url = urls['api_data_url'] - # TODO: manage different types of errors (404, etc.) - response = requests.get(pom_url) - if not response: - return - return response.text - - -def fetch_parent(pom_text, base_url=MAVEN_BASE_URL): - """ - Return the parent pom text of `pom_text`, or None if `pom_text` has no parent. - """ - if not pom_text: - return - pom = get_maven_pom(text=pom_text) - if ( - pom.parent - and pom.parent.group_id - and pom.parent.artifact_id - and pom.parent.version.version - ): - parent_namespace = pom.parent.group_id - parent_name = pom.parent.artifact_id - parent_version = str(pom.parent.version.version) - parent_pom_text = get_pom_text( - namespace=parent_namespace, - name=parent_name, - version=parent_version, - qualifiers={}, - base_url=base_url, - ) - return parent_pom_text - - -def get_ancestry(pom_text, base_url=MAVEN_BASE_URL): - """ - Return a list of pom text of the ancestors of `pom`. The list is ordered - from oldest ancestor to newest. The list is empty is there is no parent pom. - """ - ancestors = [] - has_parent = True - while has_parent: - parent_pom_text = fetch_parent(pom_text=pom_text, base_url=base_url) - if not parent_pom_text: - has_parent = False - else: - ancestors.append(parent_pom_text) - pom_text = parent_pom_text - return reversed(ancestors) - - -def get_merged_ancestor_package_from_maven_package(package, base_url=MAVEN_BASE_URL): - """ - Merge package details of a package with its ancestor pom - and return the merged package. - """ - if not package: - return - pom_text = get_pom_text( - name=package.name, - namespace=package.namespace, - version=package.version, - qualifiers=package.qualifiers, - base_url=base_url, - ) - merged_package = merge_ancestors( - ancestor_pom_texts=get_ancestry(pom_text), - package=package, - ) - return merged_package - - -def merge_parent(package, parent_package): - """ - Merge `parent_package` data into `package` and return `package. - """ - mergeable_fields = ( - 'declared_license_expression', - 'homepage_url', - 'parties', - ) - for field in mergeable_fields: - # If `field` is empty on the package we're looking at, populate - # those fields with values from the parent package. 
- if not getattr(package, field): - value = getattr(parent_package, field) - setattr(package, field, value) - - msg = f'Field `{field}` has been updated using values obtained from the parent POM {parent_package.purl}' - history = package.extra_data.get('history') - if history: - package.extra_data['history'].append(msg) - else: - package.extra_data['history'] = [msg] - - return package - - -def merge_ancestors(ancestor_pom_texts, package): - """ - Merge metadata from `ancestor_pom_text` into `package`. - - The order of POM content in `ancestor_pom_texts` is expected to be in the - order of oldest ancestor to newest. - """ - for ancestor_pom_text in ancestor_pom_texts: - ancestor_package = _parse( - datasource_id='maven_pom', - package_type='maven', - primary_language='Java', - text=ancestor_pom_text, - ) - package = merge_parent(package, ancestor_package) - return package - - -def map_maven_package(package_url, package_content, pipelines, priority=0, reindex_metadata=False): - """ - Add a maven `package_url` to the PackageDB. - - Return an error string if errors have occured in the process. - - if ``reindex_metadata`` is True, only reindex metadata and DO NOT rescan the full package. - """ - from minecode.model_utils import add_package_to_scan_queue, merge_or_create_package - - db_package = None - error = '' - - if 'repository_url' in package_url.qualifiers: - base_url = package_url.qualifiers['repository_url'] - else: - base_url = MAVEN_BASE_URL - - pom_text = get_pom_text( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - qualifiers=package_url.qualifiers, - base_url=base_url, - ) - if not pom_text: - msg = f'Package does not exist on maven: {package_url}' - error += msg + '\n' - logger.error(msg) - return db_package, error - - package = _parse( - 'maven_pom', - 'maven', - 'Java', - text=pom_text, - base_url=base_url, - ) - ancestor_pom_texts = get_ancestry(pom_text=pom_text, base_url=base_url) - package = merge_ancestors( - ancestor_pom_texts=ancestor_pom_texts, package=package) - - urls = get_urls( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - qualifiers=package_url.qualifiers, - base_url=base_url, - ) - # In the case of looking up a maven package with qualifiers of - # `classifiers=sources`, the purl of the package created from the pom does - # not have the qualifiers, so we need to set them. Additionally, the download - # url is not properly generated since it would be missing the sources bit - # from the filename. 
- package.qualifiers = package_url.qualifiers - package.download_url = urls['repository_download_url'] - package.repository_download_url = urls['repository_download_url'] - - # Set package_content value - package.extra_data['package_content'] = package_content - - # If sha1 exists for a jar, we know we can create the package - # Use pom info as base and create packages for binary and source package - - # Check to see if binary is available - sha1 = get_package_sha1(package) - if sha1: - package.sha1 = sha1 - override = reindex_metadata - db_package, _, _, _ = merge_or_create_package( - package, visit_level=50, override=override) - else: - msg = f'Failed to retrieve JAR: {package_url}' - error += msg + '\n' - logger.error(msg) - - if not reindex_metadata: - # Submit package for scanning - if db_package: - add_package_to_scan_queue( - package=db_package, - pipelines=pipelines, - priority=priority - ) - - return db_package, error - - -def map_maven_binary_and_source(package_url, pipelines, priority=0, reindex_metadata=False): - """ - Get metadata for the binary and source release of the Maven package - `package_url` and save it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. - """ - error = '' - package, emsg = map_maven_package( - package_url=package_url, - package_content=PackageContentType.BINARY, - pipelines=pipelines, - priority=priority, - reindex_metadata=reindex_metadata, - ) - if emsg: - error += emsg - - source_package_url = package_url - source_package_url.qualifiers['classifier'] = 'sources' - source_package, emsg = map_maven_package( - package_url=source_package_url, - package_content=PackageContentType.SOURCE_ARCHIVE, - pipelines=pipelines, - priority=priority, - reindex_metadata=reindex_metadata, - ) - if emsg: - error += emsg - - if not reindex_metadata and package and source_package: - make_relationship( - from_package=source_package, - to_package=package, - relationship=PackageRelation.Relationship.SOURCE_PACKAGE, - ) - - return error - - -def map_maven_packages(package_url, pipelines): - """ - Given a valid `package_url` with no version, get metadata for the binary and - source release for each version of the Maven package `package_url` and save - it to the PackageDB. - - Return an error string for errors that occur, or empty string if there is no error. - """ - error = '' - namespace = package_url.namespace - name = package_url.name - # Find all versions of this package - query_params = f'g:{namespace}+AND+a:{name}' - url = f'https://search.maven.org/solrsearch/select?q={query_params}&core=gav' - response = requests.get(url) - if response: - package_listings = response.json().get('response', {}).get('docs', []) - for listing in package_listings: - purl = PackageURL( - type='maven', - namespace=listing.get('g'), - name=listing.get('a'), - version=listing.get('v'), - ) - emsg = map_maven_binary_and_source(purl, pipelines) - if emsg: - error += emsg - return error - - -def get_package_sha1(package): - """ - Return the sha1 value for `package` by checking if the sha1 file exists for - `package` on maven and returning the contents if it does. - If the sha1 is invalid, we download the package's JAR and calculate the sha1 - from that. 
- """ - download_url = package.repository_download_url - sha1_download_url = f'{download_url}.sha1' - response = requests.get(sha1_download_url) - if response.ok: - sha1_contents = response.text.strip().split() - sha1 = sha1_contents[0] - sha1 = validate_sha1(sha1) - if not sha1: - # Download JAR and calculate sha1 if we cannot get it from the repo - response = requests.get(download_url) - if response: - sha1_hash = hashlib.new('sha1', response.content) - sha1 = sha1_hash.hexdigest() - return sha1 - - -@priority_router.route('pkg:maven/.*') -def process_request(purl_str, **kwargs): - """ - Process `priority_resource_uri` containing a maven Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from maven and - using it to create a new PackageDB entry. The package is then added to the - scan queue afterwards. We also get the Package information for the - accompanying source package and add it to the PackageDB and scan queue, if - available. - - Return an error string for errors that occur, or empty string if there is no error. - """ - from minecode.model_utils import DEFAULT_PIPELINES - - addon_pipelines = kwargs.get('addon_pipelines', []) - pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) - - try: - package_url = PackageURL.from_string(purl_str) - except ValueError as e: - error = f'error occured when parsing {purl_str}: {e}' - return error - - has_version = bool(package_url.version) - if has_version: - reindex_metadata = kwargs.get("reindex_metadata", False) - error = map_maven_binary_and_source( - package_url, - pipelines, - reindex_metadata=reindex_metadata, - priority=priority, - ) - else: - error = map_maven_packages(package_url, pipelines) - - return error - - -collect_links = re.compile(r'href="([^"]+)"').findall -collect_links_and_artifact_timestamps = re.compile( - r'\s+(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}|-)' -).findall - - -def check_if_file_name_is_linked_on_page(file_name, links, **kwargs): - """ - Return True if `file_name` is in `links` - """ - return any(l.endswith(file_name) for l in links) - - -def check_if_page_has_pom_files(links, **kwargs): - """ - Return True of any entry in `links` ends with .pom. - """ - return any(l.endswith('.pom') for l in links) - - -def check_if_page_has_directories(links, **kwargs): - """ - Return True if any entry, excluding "../", ends with /. - """ - return any(l.endswith('/') for l in links if l != '../') - - -def check_if_package_version_page(links, **kwargs): - """ - Return True if `links` contains pom files and has no directories - """ - return check_if_page_has_pom_files( - links=links - ) and not check_if_page_has_directories(links=links) - - -def check_if_package_page(links, **kwargs): - return check_if_file_name_is_linked_on_page( - file_name='maven-metadata.xml', links=links - ) and not check_if_page_has_pom_files(links=links) - - -def check_if_maven_root(links, **kwargs): - """ - Return True if "archetype-catalog.xml" is in `links`, as the root of a Maven - repo contains "archetype-catalog.xml". - """ - return check_if_file_name_is_linked_on_page( - file_name='archetype-catalog.xml', links=links - ) - - -def check_on_page(url, checker): - """ - Return True if there is a link on `url` that is the same as `file_name`, - False otherwise. 
- """ - response = requests.get(url) - if response: - links = collect_links(response.text) - return checker(links=links) - return False - - -def is_maven_root(url): - """ - Return True if `url` is the root of a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_maven_root) - - -def is_package_page(url): - """ - Return True if `url` is a package page on a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_package_page) - - -def is_package_version_page(url): - """ - Return True if `url` is a package version page on a Maven repo, False otherwise. - """ - return check_on_page(url, check_if_package_version_page) - - -def url_parts(url): - parsed_url = urlparse(url) - scheme = parsed_url.scheme - netloc = parsed_url.netloc - path_segments = [p for p in parsed_url.path.split('/') if p] - return scheme, netloc, path_segments - - -def create_url(scheme, netloc, path_segments): - url_template = f'{scheme}://{netloc}' - path = '/'.join(path_segments) - return f'{url_template}/{path}' - - -def get_maven_root(url): - """ - Given `url`, that is a URL to namespace, package, or artifact in a Maven - repo, return the URL to the root of that repo. If a Maven root cannot be - determined, return None. - - >>> get_maven_root('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - 'https://repo1.maven.org/maven2' - """ - scheme, netloc, path_segments = url_parts(url) - for i in range(len(path_segments)): - segments = path_segments[: i + 1] - url_segment = create_url(scheme, netloc, segments) - if is_maven_root(url_segment): - return url_segment - return None - - -def determine_namespace_name_version_from_url(url, root_url=None): - """ - Return a 3-tuple containing strings of a Package namespace, name, and - version, determined from `url`, where `url` points to namespace, package, - specific package version, or artifact on a Maven repo. - - Return None if a Maven root cannot be determined from `url`. - - >>> determine_namespace_name_version_from_url('https://repo1.maven.org/maven2/net/shibboleth/parent/7.11.0/') - ('net.shibboleth', 'parent', '7.11.0') - """ - if not root_url: - root_url = get_maven_root(url) - if not root_url: - raise Exception(f'Error: not a Maven repository: {url}') - - _, remaining_path_segments = url.split(root_url) - remaining_path_segments = remaining_path_segments.split('/') - remaining_path_segments = [p for p in remaining_path_segments if p] - - namespace_segments = [] - package_name = '' - package_version = '' - for i in range(len(remaining_path_segments)): - segment = remaining_path_segments[i] - segments = remaining_path_segments[: i + 1] - path = '/'.join(segments) - url_segment = f'{root_url}/{path}' - if is_package_page(url_segment): - package_name = segment - elif is_package_version_page(url_segment): - package_version = segment - else: - namespace_segments.append(segment) - namespace = '.'.join(namespace_segments) - return namespace, package_name, package_version - - -def add_to_import_queue(url, root_url): - """ - Create ImportableURI for the Maven repo package page at `url`. 
- """ - from minecode.models import ImportableURI - - data = None - response = requests.get(url) - if response: - data = response.text - namespace, name, _ = determine_namespace_name_version_from_url( - url, root_url) - purl = PackageURL( - type='maven', - namespace=namespace, - name=name, - ) - importable_uri = ImportableURI.objects.insert(url, data, purl) - if importable_uri: - logger.info(f'Inserted {url} into ImportableURI queue') - - -def filter_only_directories(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are directory names (which end with `/`), - """ - timestamps_by_links_filtered = {} - for link, timestamp in timestamps_by_links.items(): - if link != '../' and link.endswith('/'): - timestamps_by_links_filtered[link] = timestamp - return timestamps_by_links_filtered - - -valid_artifact_extensions = [ - 'ejb3', - 'ear', - 'aar', - 'apk', - 'gem', - 'jar', - 'nar', - # 'pom', - 'so', - 'swc', - 'tar', - 'tar.gz', - 'war', - 'xar', - 'zip', -] - - -def filter_for_artifacts(timestamps_by_links): - """ - Given a mapping of `timestamps_by_links`, where the links are the filenames - of Maven artifacts, return a mapping of filenames whose extension is in - `valid_artifact_extensions` and their timestamps. - """ - timestamps_by_links_filtered = {} - for link, timestamp in timestamps_by_links.items(): - for ext in valid_artifact_extensions: - if link.endswith(ext): - timestamps_by_links_filtered[link] = timestamp - return timestamps_by_links_filtered - - -def collect_links_from_text(text, filter): - """ - Return a mapping of link locations and their timestamps, given HTML `text` - content, that is filtered using `filter`. - """ - links_and_timestamps = collect_links_and_artifact_timestamps(text) - timestamps_by_links = {} - for link, timestamp in links_and_timestamps: - if timestamp == '-': - timestamp = '' - timestamps_by_links[link] = timestamp - - timestamps_by_links = filter(timestamps_by_links=timestamps_by_links) - return timestamps_by_links - - -def create_absolute_urls_for_links(text, url, filter): - """ - Given the `text` contents from `url`, return a mapping of absolute URLs to - links from `url` and their timestamps, that is then filtered by `filter`. 
- """ - timestamps_by_absolute_links = {} - url = url.rstrip('/') - timestamps_by_links = collect_links_from_text(text, filter) - for link, timestamp in timestamps_by_links.items(): - if not link.startswith(url): - link = f'{url}/{link}' - timestamps_by_absolute_links[link] = timestamp - return timestamps_by_absolute_links - - -def get_directory_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ - timestamps_by_directory_links = {} - response = requests.get(url) - if response: - timestamps_by_directory_links = create_absolute_urls_for_links( - response.text, url=url, filter=filter_only_directories - ) - return timestamps_by_directory_links - - -def get_artifact_links(url): - """ - Return a list of absolute directory URLs of the hyperlinks from `url` - """ - timestamps_by_artifact_links = [] - response = requests.get(url) - if response: - timestamps_by_artifact_links = create_absolute_urls_for_links( - response.text, url=url, filter=filter_for_artifacts - ) - return timestamps_by_artifact_links - - -def crawl_to_package(url, root_url): - """ - Given a maven repo `url`, - """ - if is_package_page(url): - add_to_import_queue(url, root_url) - return - - for link in get_directory_links(url): - crawl_to_package(link, root_url) - - -def crawl_maven_repo_from_root(root_url): - """ - Given the `url` to a maven root, traverse the repo depth-first and add - packages to the import queue. - """ - crawl_to_package(root_url, root_url) - - -def get_artifact_sha1(artifact_url): - """ - Return the SHA1 value of the Maven artifact located at `artifact_url`. - """ - sha1 = None - artifact_sha1_url = f'{artifact_url}.sha1' - response = requests.get(artifact_sha1_url) - if response: - sha1_contents = response.text.strip().split() - sha1 = sha1_contents[0] - sha1 = validate_sha1(sha1) - return sha1 - - -def get_classifier_from_artifact_url( - artifact_url, package_version_page_url, package_name, package_version -): - """ - Return the classifier from a Maven artifact URL `artifact_url`, otherwise - return None if a classifier cannot be determined from `artifact_url` - """ - classifier = None - # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0 - package_version_page_url = package_version_page_url.rstrip('/') - # https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0 - leading_url_portion = f'{package_version_page_url}/{package_name}-{package_version}' - # artifact_url = 'https://repo1.maven.org/maven2/net/alchim31/livereload-jvm/0.2.0/livereload-jvm-0.2.0-onejar.jar' - # ['', '-onejar.jar'] - _, remaining_url_portion = artifact_url.split(leading_url_portion) - # ['-onejar', 'jar'] - remaining_url_portions = remaining_url_portion.split('.') - if remaining_url_portions and remaining_url_portions[0]: - # '-onejar' - classifier = remaining_url_portions[0] - if classifier.startswith('-'): - # 'onejar' - classifier = classifier[1:] - return classifier - - -@visit_router.route('http://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') -@visit_router.route('https://repo1\.maven\.org/maven2/\.index/nexus-maven-repository-index.properties') -class MavenNexusPropertiesVisitor(NonPersistentHttpVisitor): - """ - Fetch the property files, parse the create the URI for each increment index - """ - - def get_uris(self, content): - """ - Parse a NEXUS index properties file and yield increment index URIs - This file is a Java properties file with rows likes this: - nexus.index.incremental-15=526 - 
-
-
-@visit_router.route(
-    'https?://.*/nexus-maven-repository-index.gz',
-    # increments
-    'https?://.*/nexus-maven-repository-index\.\d+\.gz')
-class MavenNexusIndexVisitor(NonPersistentHttpVisitor):
-    """
-    Download and process a Nexus Maven index file.
-    WARNING: Processing is rather long: a full index is ~600MB.
-    """
-
-    def get_uris(self, content):
-        """
-        Yield a combo of pre-visited URIs with a special maven-index://
-        scheme together with other regular fetchable URIs for POMs and
-        JARs found in a Maven index.
-
-        For NonPersistentHttpVisitor, content is the path to the temp Gzipped
-        index file, not the actual file content.
-        """
-        index_location = content
-
-        artifacts = get_artifacts(
-            index_location, worthyness=is_worthy_artifact)
-
-        for artifact in artifacts:
-            # we cannot do much without these
-            group_id = artifact.group_id
-            artifact_id = artifact.artifact_id
-            version = artifact.version
-            extension = artifact.extension
-
-            if not (group_id and artifact_id and version and extension):
-                continue
-
-            qualifiers = {}
-            if extension and extension != 'jar':
-                qualifiers['type'] = extension
-
-            classifier = artifact.classifier
-            if classifier:
-                qualifiers['classifier'] = classifier
-
-            package_url = PackageURL(
-                type='maven',
-                namespace=group_id,
-                name=artifact_id,
-                version=version,
-                qualifiers=qualifiers or None,
-            )
-
-            # FIXME: also use the Artifact.src_exist flags too?
-
-            # build a URL: This is the real JAR download URL
-            # FIXME: this should be set at the time of creating Artifacts
-            # instead, together with the filename... especially since we could
-            # use different REPOs.
-            jar_download_url, file_name = build_url_and_filename(
-                group_id, artifact_id, version, extension, classifier)
-
-            # FIXME: should this be set in the yielded URI too?
-            last_mod = artifact.last_modified
-
-            # We yield a pre-visited URI for each JAR
-            mock_maven_index_uri = build_url(
-                group_id, artifact_id, version, file_name,
-                base_url='maven-index://repo1.maven.org')
-
-            artifact_data = artifact.to_dict()
-            artifact_data['download_url'] = jar_download_url
-            artifact_as_json = json.dumps(artifact_data, separators=(',', ':'))
-
-            yield URI(
-                # this is the Maven index URI
-                source_uri=self.uri,
-                # FIXME: remove these mock URIs after migration
-                uri=mock_maven_index_uri,
-                package_url=package_url.to_string(),
-                visited=True,
-                mining_level=0,
-                file_name=file_name,
-                size=artifact.size,
-                sha1=artifact.sha1,
-                date=last_mod,
-                data=artifact_as_json,
-            )
-
-            package_url = PackageURL(
-                type='maven',
-                namespace=group_id,
-                name=artifact_id,
-                version=version,
-            )
-
-            # Also yield a POM for this: there are no artifacts for the POM
-            # of a Jar in the repo, only for parent POMs, therefore we create
-            # a download URL with the pom extension.
-            pom_download_url, pom_file_name = build_url_and_filename(
-                group_id, artifact_id, version, extension='pom', classifier='')
-            yield URI(
-                # this is the Maven index URI
-                source_uri=self.uri,
-                uri=pom_download_url,
-                # use the same PURL as the main jar
-                package_url=package_url.to_string(),
-                visited=False,
-                mining_level=20,
-                file_name=pom_file_name,
-                size=0,
-                date=last_mod,
-            )
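# How the qualifiers built above shape the resulting PURL, using the same
# packageurl library imported by this module. The coordinates come from the
# livereload-jvm example URLs in get_classifier_from_artifact_url() above.
from packageurl import PackageURL

purl = PackageURL(
    type='maven',
    namespace='net.alchim31',
    name='livereload-jvm',
    version='0.2.0',
    qualifiers={'classifier': 'onejar'},
)
assert purl.to_string() == 'pkg:maven/net.alchim31/livereload-jvm@0.2.0?classifier=onejar'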
-
-
-@visit_router.route('https?://jcenter\.bintray\.com/(.+/)*')
-class MavenHTMLPageVisitor(HttpVisitor):
-    """
-    Parse the HTML page and yield all necessary uris from the page and its sub pages.
-    Note that the route regex uses . to match any character except newlines,
-    because of cases like http://jcenter.bintray.com/'com/virtualightning'/
-    (this case is covered in the tests too).
-    """
-
-    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        for pre in page.find_all(name='pre'):
-            for a in pre.find_all(name='a'):
-                url = a.get('href')
-                if not url:
-                    continue
-                # Remove the : symbol since it's a special char for the bintray repo.
-                if url.startswith(':'):
-                    url = url[1:]
-                filename = None  # default is a folder, where the filename is None.
-                if not url.endswith('/'):
-                    # a file
-                    filename = url
-                yield URI(
-                    uri=self.uri + url,
-                    visited=False,
-                    file_name=filename,
-                    source_uri=self.uri,
-                )
-
-
-@visit_router.route('https?://.*/maven-metadata\.xml')
-class MavenMetaDataVisitor(HttpVisitor):
-    """
-    Parse the maven-metadata.xml file and yield uris of jars and poms.
-    """
-
-    def get_uris(self, content):
-        # FIXME: this may not be correct. The only thing we can infer from the
-        # maven metadata is what the groupid/artifactid and available versions
-        # are. The actual download files likely need to be obtained from a
-        # directory listing or inferred from parsing the POM???
-
-        base_url = self.uri.partition('maven-metadata.xml')[0] + '{version}/'
-        pom_url = base_url + '{artifactId}-{version}.pom'
-
-        # FIXME: this may not exist or may use another extension?? and this should be PREVISITED
-        jar_url = base_url + '{artifactId}-{version}.jar'
-        # FIXME: sources may not exist?? and this should be PREVISITED
-        source_url = base_url + '{artifactId}-{version}-sources.jar'
-
-        # FIXME: why use BeautifulSoup for valid XML???
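# As the FIXME above asks, maven-metadata.xml is valid XML, so the stdlib
# could parse it without BeautifulSoup. A minimal sketch (not the original
# code) using xml.etree.ElementTree on a typical metadata payload:
import xml.etree.ElementTree as ET

metadata = """<metadata>
  <groupId>net.shibboleth</groupId>
  <artifactId>parent</artifactId>
  <versioning><versions><version>7.11.0</version></versions></versioning>
</metadata>"""

tree = ET.fromstring(metadata)
group_id = tree.findtext('groupId')
artifact_id = tree.findtext('artifactId')
versions = [v.text for v in tree.iter('version')]
assert (group_id, artifact_id, versions) == ('net.shibboleth', 'parent', ['7.11.0'])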
- page = BeautifulSoup(content, 'lxml-xml') - - group_id = page.find(name='groupId') - artifact_id = page.find(name='artifactId') - if not (group_id and artifact_id): - return - - group_id = group_id.string - artifact_id = artifact_id.string - - for version in page.find_all('version'): - version = version.string - - # FIXME: we may not get the proper extensions and classifiers and miss the qualifiers - package_url = PackageURL( - type='maven', - namespace=group_id, - name=artifact_id, - version=version).to_string() - - # the JAR proper as previsited - yield URI( - source_uri=self.uri, - uri=jar_url.format(version=version, artifactId=artifact_id), - package_url=package_url, - visited=True, - ) - - # the source as previsited - yield URI( - source_uri=self.uri, - uri=source_url.format(version=version, artifactId=artifact_id), - package_url=package_url, - visited=True, - ) - - # the POM needs to be visited - yield URI( - source_uri=self.uri, - uri=pom_url.format(version=version, artifactId=artifact_id), - package_url=package_url, - visited=False, - ) - - -# TODO: consider switching to HTTPS -def build_url_and_filename(group_id, artifact_id, version, extension, classifier, - base_repo_url='https://repo1.maven.org/maven2'): - """ - Return a tuple of (url, filename) for the download URL of a Maven - artifact built from its coordinates. - """ - file_name = build_filename(artifact_id, version, extension, classifier) - url = build_url(group_id, artifact_id, version, file_name, base_repo_url) - return url, file_name - - -# TODO: consider switching to HTTPS -def build_maven_xml_url(group_id, artifact_id, - base_repo_url='https://repo1.maven.org/maven2'): - """ - Return a download URL for a Maven artifact built from its - coordinates. - """ - group_id = group_id.replace('.', '/') - path = '{group_id}/{artifact_id}'.format(**locals()) - return '{base_repo_url}/{path}/maven-metadata.xml'.format(**locals()) - - -@visit_router.route('https?://repo1.maven.org/maven2/.*\.pom') -class MavenPOMVisitor(HttpVisitor): - """ - Visit a POM. The POM XML is stored as data and there is nothing - special to do for this visitor. - """ - pass - - -def is_worthy_artifact(artifact): - """ - We only care for certain artifacts that are worthy of indexing. 
- - Maven has some intricate interrelated values for these fields - type, extension, packaging, classifier, language - See http://maven.apache.org/ref/3.2.5/maven-core/artifact-handlers.html - - These are the defaults: - - type extension packaging classifier language - -------------------------------------------------------------- - pom = type = type none - jar = type = type java - maven-plugin jar = type java - ejb jar ejb = type java - ejb3 = type ejb3 = type java - war = type = type java - ear = type = type java - rar = type = type java - par = type = type java - java-source jar = type sources java - javadoc jar = type javadoc java - ejb-client jar ejb client java - test-jar jar jar tests java - """ - if artifact.version == 'archetypes': - # we skip these entirely, they have a different shape - return - - worthy_ext_pack = set([ - # packaging, classifier, extension - (u'jar', u'sources', u'jar'), - (u'jar', None, u'jar'), - (u'bundle', None, u'jar'), - (u'war', None, u'war'), - (u'zip', u'source-release', u'zip'), - (u'maven-plugin', None, u'jar'), - (u'aar', None, u'aar'), - (u'jar', u'sources-commercial', u'jar'), - (u'zip', u'src', u'zip'), - (u'tar.gz', u'src', u'tar.gz'), - (u'jar', None, u'zip'), - (u'zip', u'project-src', u'zip'), - (u'jar', u'src', u'jar'), - ]) - - return (artifact.packaging, - artifact.classifier, - artifact.extension,) in worthy_ext_pack - - -def is_source(classifier): - """ - Return True if the `artifact` Artifact is a source artifact. - - """ - return classifier and ('source' in classifier or 'src' in classifier) - -######################################################################## -# DOCUMENTAION OF the FIELDS aka. Records: -# -# Constants and information for field names can be found in -# https://github.com/apache/maven-indexer/tree/ecddb3c18ee1ee1357a01bffa7f9cb5252f21209 -# in these classes: -# - org.apache.maven.index.ArtifactInfoRecord -# - org.apache.maven.index.ArtifactInfo -# - org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator -# See also org.apache.maven.index.reader -# -# Note: these are the field names found in the Maven central index in -# July 2016: -# i u 1 m n d del -# allGroups allGroupsList rootGroups rootGroupsList -# IDXINFO DESCRIPTOR -# -# Bundle-Description Bundle-DocURL Bundle-License Bundle-Name Bundle- -# SymbolicName Bundle-Version Export-Package Export-Service Import- -# Package Require-Bundle - - -ENTRY_FIELDS = { - 'u': 'Artifact UINFO: Unique groupId, artifactId, version, classifier, extension (or packaging). using', - 'i': 'Artifact INFO: data using | separator', - '1': 'Artifact SHA1 checksum, hex encoded as in sha1sum', - 'm': 'Artifact record last modified, a long as a string representing a Java time for the entry record', - 'n': 'Artifact name', - 'd': 'Artifact description', -} - -# we IGNORE these fields for now. They can be included optionally. -ENTRY_FIELDS_OTHER = { - # rarely present, mostly is repos other than central - 'c': 'Artifact Classes (tokenized on newlines only) a list of LF-separated paths, without .class extension', - - 'sha256': 'sha256 of artifact? 
part of OSGI?', - - # OSGI stuffs, not always there but could be useful metadata - 'Bundle-SymbolicName': 'Bundle-SymbolicName (indexed, stored)', - 'Bundle-Version': 'Bundle-Version (indexed, stored)', - 'Bundle-Description': 'Bundle-Description (indexed, stored)', - 'Bundle-Name': 'Bundle-Name (indexed, stored)', - 'Bundle-License': 'Bundle-License (indexed, stored)', - 'Bundle-DocURL': 'Bundle-DocURL (indexed, stored)', - 'Require-Bundle': 'Require-Bundle (indexed, stored)', -} - -# we ignore these fields entirely for now. -ENTRY_FIELDS_IGNORED = { - - 'IDXINFO': '', - 'DESCRIPTOR': '', - - 'allGroups': '', - 'allGroupsList': '', - 'rootGroups': '', - 'rootGroupsList': '', - - # FIXME: we should deal with these - 'del': 'Deleted marker, will contain UINFO if document is deleted from index', - - 'Export-Package': 'Export-Package (indexed, stored)', - 'Export-Service': 'Export-Service (indexed, stored)', - 'Import-Package': 'Import-Package (indexed, stored)', - # maven-plugin stuffs - 'px': 'MavenPlugin prefix (as keyword, stored)', - 'gx': 'MavenPlugin goals (as keyword, stored)', -} - - -def get_artifacts(location, fields=frozenset(ENTRY_FIELDS), - worthyness=is_worthy_artifact, include_all=False): - """ - Yield artifact mappings from a Gzipped Maven nexus index data file - at location. - """ - for entry in get_entries(location, fields): - artifact = build_artifact(entry, include_all) - # at this stage we know enough to decide is this data is worthy of being an - # artifact for now we care only about a few things: POMs and binary Jars. - if artifact and worthyness(artifact): - yield artifact - - -_artifact_base_fields = ( - 'group_id', - 'artifact_id', - 'version', - 'packaging', - 'classifier', - 'extension', - 'last_modified', - 'size', - 'sha1', - 'name', - 'description', - 'src_exist', - 'jdoc_exist', - 'sig_exist', -) - -_artifact_extended_fields = ( - 'sha256', - 'osgi', - 'classes', -) - -# FIXME: named tuples are suboptimal here for a simple dictionary - - -def to_dict(self): - return self._asdict() - - -Artifact = namedtuple('Artifact', _artifact_base_fields) -Artifact.to_dict = to_dict - -ArtifactExtended = namedtuple( - 'ArtifactExtended', _artifact_base_fields + _artifact_extended_fields) -ArtifactExtended.to_dict = to_dict - - -def build_artifact(entry, include_all=False): - """ - Return a Maven artifact mapping collected from a single entry - mapping or None. 
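# The namedtuple-with-attached-to_dict pattern used above, restated in
# miniature: _asdict() already produces a mapping, so to_dict is a thin
# alias attached to the class. Illustration only, with a reduced field set.
from collections import namedtuple

MiniArtifact = namedtuple('MiniArtifact', ['group_id', 'artifact_id', 'version'])
MiniArtifact.to_dict = lambda self: self._asdict()

art = MiniArtifact('net.shibboleth', 'parent', '7.11.0')
assert dict(art.to_dict()) == {
    'group_id': 'net.shibboleth',
    'artifact_id': 'parent',
    'version': '7.11.0',
}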
- """ - - SEP = '|' - NA = 'NA' - NULL = 'null' - - # UINFO - # See org.apache.maven.index.reader.RecordExpander.expandUinfo - # See org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator.updateArtifactInfo - uinfo = entry.get('u') - if not uinfo: - # not much we can do without this - return - - uinfo = uinfo.split(SEP) - gid = uinfo[0] - aid = uinfo[1] - version = uinfo[2] - - classifier = uinfo[3] - if classifier == NA: - classifier = None - - extension = None - if len(uinfo) > 4: - extension = uinfo[4] - - # INFO - # See org.apache.maven.index.reader.RecordExpander.expandAddedArtifact - # See org.apache.maven.index.creator.MinimalArtifactInfoIndexCreator.updateArtifactInfo - - packaging = None - size = 0 - # record last modified is at entry.get('m') and we ignore this - last_modified = None - src_exist = False - jdoc_exist = False - sig_exist = False - - info = entry.get('i') - if info: - info = info.split(SEP) - - packaging = info[0] - if packaging in (NA, NULL): - packaging = None - - # this is the artifact last modified - # create a date/time stamp string from a long as a string - lm = info[1] - if lm and lm.isdigit() and lm != '0': - last_modified = java_time_ts(int(lm)) - - size = info[2] - size = int(size) if size and size.isdigit() else None - - # for *Exists fields of INFO: see org.apache.maven.index.ArtifactAvailability - # not present locally: '0': False, - # present locally: '1': True, ==> the only one we care for - # not available: '2': False, - PRESENT = '1' - src_exist = info[3] == PRESENT - jdoc_exist = info[4] == PRESENT - - if len(info) > 6: - extension = info[6] - else: - # FIXME: is this likely incorrect see worthyness check - if classifier or packaging in ('pom', 'war', 'ear'): - extension = packaging - else: - extension = 'jar' - sig_exist = info[5] == PRESENT - - # other MISC fields - sha1 = entry.get('1') - name = entry.get('n') - description = entry.get('d') - - if not include_all: - artifact = Artifact( - group_id=gid, artifact_id=aid, version=version, - packaging=packaging, classifier=classifier, extension=extension, - last_modified=last_modified, size=size, sha1=sha1, - name=name, description=description, - src_exist=src_exist, jdoc_exist=jdoc_exist, sig_exist=sig_exist, - ) - - else: - # TODO: should this be part of the base set? - sha256 = entry.get('sha256') - - # OSGI: Rarely there. Note that we ignore 'Export-', 'Import-', on - # purpose: these are big and messey for now - osgi = dict() - for key, value in entry.items(): - if key.startswith('Bundle-') and value: - # TODO: could also include 'Require-Bundle' - osgi[key] = value.strip() - - # Classes: Rarely there, but eventually useful in the future - # Can be quite big too - classes = entry.get('c', '').splitlines(False) - - artifact = ArtifactExtended( - group_id=gid, artifact_id=aid, version=version, - packaging=packaging, classifier=classifier, extension=extension, - last_modified=last_modified, size=size, sha1=sha1, - name=name, description=description, - src_exist=src_exist, jdoc_exist=jdoc_exist, sig_exist=sig_exist, - sha256=sha256, osgi=osgi, classes=classes - ) - - return artifact - - -def get_entries(location, fields=frozenset(ENTRY_FIELDS)): - """ - Yield Maven index entry mappings from a Gzipped Maven nexus index - data file at `location`. Only includes `fields` names. 
- """ - buffer_size = 128 * 1024 * 1024 - if TRACE_DEEP: - entry = None - entries_count = 0 - keys = set() - keys_update = keys.update - - with GzipFileWithTrailing(location, 'rb') as compressed: - # using io.BufferedReader for increased perfs - with io.BufferedReader(compressed, buffer_size=buffer_size) as nexus_index: - jstream = java_stream.DataInputStream(nexus_index) - - # FIXME: we do nothing with these two - # NOTE: this reads 1+8=9 bytes of the stream - _index_version, _last_modified = decode_index_header(jstream) - while True: - try: - entry = decode_entry(jstream, fields) - if TRACE_DEEP: - if entry: - keys_update(entry) - entries_count += 1 - - if entry: - yield entry - - except EOFError: - if TRACE_DEEP: - print( - 'Index version: %(_index_version)r last_modified: %(_last_modified)r' % locals()) - print( - 'Processed %(entries_count)d docs. Last entry: %(entry)r' % locals()) - print('Unique keys:') - for k in sorted(keys): - print(k) - break - - -def decode_index_header(jstream): - """ - Return the index header from a `jstream` Java-like stream as a tuple - of (index_version, last_updated_date) where index_version is an int - and last_updated_date is a an UTC ISO timestamp string or an empty - string. - """ - -# this.chunkName = chunkName.trim(); -# this.dataInputStream = new DataInputStream( new GZIPInputStream( inputStream, 2 * 1024 ) ); -# this.version = ( (int) dataInputStream.readByte() ) & 0xff; -# this.timestamp = new Date( dataInputStream.readLong() ); - - supported_format_version = 1 - # one byte - index_version = int(jstream.read_byte()) - assert supported_format_version == index_version - # eight byte - timestamp = jstream.read_long() - last_modified = timestamp != -1 and java_time_ts(timestamp) or '' - return int(index_version), last_modified - - -def decode_entry(jstream, fields=()): - """ - Read and return one entry mapping of name -> values from a Maven - index `jstream` Java-like stream. Note that the stream is not a - standard Java stream for UTF data. - - Only includes `fields` names. - - An entry starts with an integer which is the number of fields for - this entry. - - Then we have this data layout for each field: - - - field storage type: one byte flag which is then compared to - constants. These are flags for Lucene indexing: INDEXED, STORED, - TOKENIZED, ANALYZED it ends up being two booleans: indexed and - stored and we do not care for these. - - - field name: a Java UTF-8 string (using a len on 2 bytes, then the - name proper). Constants for field names are in ArtifactInfoRecord - and ArtifactInfo. The entry for these is available in ENTRY_FIELDS - for reference. 
- - - field value: a Java UTF-8-encoded string using the Maven Index special encoding - - one int which is the length of the UTF string in bytes - - the utf-8 string proper using Java conventions - """ - - read = jstream.read - read_int = jstream.read_int - read_byte = jstream.read_byte - read_utf = jstream.read_utf - - has_fields = bool(fields) - entry = {} - # this read 4 bytes - field_count = read_int() - for _ in range(field_count): - # Flags for lucene: INDEXED, STORED, TOKENIZED, ANALYZED: ignored - # this is a mask and one off: - # field_indexed = 1 - # field_tokenized = 2 - # field_stored = 4 - # this reads 1 byte: total 5 - _indexing_type = read_byte() - - # all field names are ASCII chars, so even though this is UTF-8 - # encoded, this is ascii Constants for field names are in - # ArtifactInfoRecord and ArtifactInfo - # FIXME: we should discard things we do not care for in terms of fields right away - - # Read a regular "Java Modified UTF-8" as unicode. - # this read 2 bytes which are the len then the len. total 7 + len - name = decode_modified_utf8(read_utf()) - - # Read a Maven Nexus index special "Java Modified UTF-8" as - # unicode: Regular Java write/readUTF is a string length on 2 - # bytes followed by a UTF-encoded stream of bytes of that - # length. The Nexus Maven index use a full int rather than a 2 - # bytes int bypassing the 65K char limit for length of the - # standard Java readUTF. - # this read 4 bytes which is a len - value_length = read_int() - # this read bytes len - value = decode_modified_utf8(read(value_length)) - - # why do we skip some fields - if has_fields: - if name in fields: - entry[name] = value - else: - entry[name] = value - - return entry - - -def java_time_ts(tm): - """ - Convert a Java time long (as milliseconds since epoch) to an UTC ISO - timestamp. - """ - tzinfo = tz.tzutc() - ar = arrow.get(tm / 1000).replace(tzinfo=tzinfo).to('utc') - return ar.isoformat() - -################################################################################ -# These are CLI/shell test and stat utilities -################################################################################ - - -def _spit_json(location, target): - with open(target, 'w') as t: - t.write('[\n') - for i, artifact in enumerate(get_artifacts(location)): - if i % 1000 == 0: - print('number or artifacts:', i) - t.write(json.dumps(artifact.to_dict(), separators=(',', ':'))) - t.write(',\n') - - t.write(']\n') - - print('total number or artifacts:', i) - - -def _artifact_stats(location): - """ - Print artifacts stats from a Gzipped Maven nexus index data file - at location. 
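# Worked example of the Java-time conversion in java_time_ts() above: Java
# longs count milliseconds since the epoch, so divide by 1000 before
# building a UTC datetime. This sketch uses only the stdlib instead of the
# arrow library used by the original.
from datetime import datetime, timezone

java_millis = 1502064000000  # 2017-08-07T00:00:00Z
ts = datetime.fromtimestamp(java_millis / 1000, tz=timezone.utc)
assert ts.isoformat() == '2017-08-07T00:00:00+00:00'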
- """ - from collections import Counter - pom_packs = Counter() - pom_classifs = Counter() - pom_extensions = Counter() - combos = Counter() - - pom_worthy = 0 - - for i, artifact in enumerate(get_artifacts(location)): - combos[(artifact.packaging, artifact.classifier, artifact.extension)] += 1 - - if artifact.packaging: - pom_packs[artifact.packaging] += 1 - - if artifact.classifier: - pom_classifs[artifact.classifier] += 1 - - if artifact.extension: - pom_extensions[artifact.extension] += 1 - - if is_worthy_artifact(artifact): - pom_worthy += 1 - - if i % 10000 == 0: - print('number or artifacts:', i) - - print() - print('Total number of artifacts:', i) - print('Total number of worthy artifacts:', pom_worthy) - - print('Top packaging:') - for n, c in pom_packs.most_common(): - print(n, ':', c) - - print('Top classifiers:') - for n, c in pom_classifs.most_common(): - print(n, ':', c) - - print('Top extensions:') - for n, c in pom_extensions.most_common(): - print(n, ':', c) - - print('Top Combos: packaging, classifier, extension') - for n, c in combos.most_common(): - print(n, ':', c) - - """ - Latest stats on 2017-08-07: -Total number or artifacts: 5844648 -Total number of POMs: 302603 -Total number of worthy POMs: 300879 -Total number of JARs: 5158191 -Total number of POMs with names: 278521 with description: 151034 -Total number of JARs with names: 4762013 with description: 3144938 -Total number of Other with names: 360646 with description: 228119 -Unique POM packagings: [None, u'${packaging.type}', u'${packagingType}', - u'0-alpha-1-20050407.154541-1.pom', u'aar', u'apk', u'application-assembly', - u'bundle', u'feature', u'gem', u'hk2-jar', u'it-packaging', u'izpack-jar', - u'jar', u'jboss-sar', u'maven-archetype', u'maven-plugin', u'mule-extension', - u'mule-plugin', u'nar', u'nbm-application', u'pom', u'so', u'swc', u'tar', - u'tar.gz', u'war', u'xar', u'zip'] -Unique POM classifiers: [None, u'1', u'DEAD', u'M6a', u'bsd', u'changelog', -u'dtddoc', u'it', u'java', u'javadoc', u'jdbc3', u'pom'] - """ - - -def _entries_stats(location): - """ - Print entries stats from a Gzipped Maven nexus index data file - at location. - """ - from collections import Counter - field_names = Counter() - field_names_update = field_names.update - - field_sets = Counter() - field_sets_update = field_sets.update - - for i, entry in enumerate(get_entries(location, ())): - keys = tuple(entry.keys()) - field_names_update(keys) - field_sets_update([keys]) - if i % 10000 == 0: - print() - print('number of entries:', i) - print('field names stats:', field_names) - - print() - print('Total number of entries:', i) - print() - print('All field names:', field_names.most_common()) - print() - print('All field name sets:', field_sets.most_common()) - print() diff --git a/minecode/visitors/npm.py b/minecode/visitors/npm.py deleted file mode 100644 index 118a10aa..00000000 --- a/minecode/visitors/npm.py +++ /dev/null @@ -1,194 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. 
-# - -import logging -import json - -from packagedcode.npm import npm_api_url -from packagedcode.npm import split_scoped_package_name -from packagedcode.npm import NpmPackageJsonHandler -from packageurl import PackageURL -import requests - -from minecode import seed -from minecode import priority_router -from minecode import visit_router -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI -from packagedb.models import PackageContentType - - -""" -Collect NPM packages from npm registries. -""" - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -class NpmSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since=0' - - -@visit_router.route('https://replicate.npmjs.com/registry/_changes\?include_docs=true&limit=\d+&since=\d+') -class NpmRegistryVisitor(NonPersistentHttpVisitor): - """ - Yield one URI for the next batch of changes to re-visit. Yield one URI for - each NPM package (that contains all the versions for this package) as - previsited for mapping. - """ - - def get_uris(self, content): - """ - Yield a URI for the next index sequence to visit and one URI for each - package fetched in a batch. - """ - next_visitable_index_url_template = ( - 'https://replicate.npmjs.com/registry/_changes?include_docs=true&limit=1000&since={last_seq}') - - json_location = content - with open(json_location) as c: - content = json.loads(c.read()) - - try: - last_seq = content['last_seq'] - except KeyError: - # provide a more meaningful message in case the JSON is incorrect - raise Exception( - 'NpmRegistryVisitor: Missing "last_seq" field: Aborting.') - - # Always yield an index URI, even if there is no results to avoid stopping the index visits - yield URI(uri=next_visitable_index_url_template.format(last_seq=last_seq), source_uri=self.uri) - - try: - results = content['results'] - except KeyError: - # provide a more meaningful message in case the JSON is incorrect - raise Exception( - 'NpmRegistryVisitor: Missing "results" field: Aborting.') - - for result in results: - doc = result.get('doc') - # verify if this record is a package record (as opposed to - # some couchdb design document that we would ignore) - is_package_record = 'versions' in doc and 'name' in doc - if not is_package_record: - continue - - # remove the readme field from the data: this is big and mostly - # useless for now - doc.pop('readme', None) - - name = doc.get('name') - - namespace, name = split_scoped_package_name(name) - package_api_url = npm_api_url(namespace, name) - - package_url = PackageURL( - type='npm', - namespace=namespace, - name=name).to_string() - - # here: this is ready for mapping - yield URI( - uri=package_api_url, - package_url=package_url, - source_uri=self.uri, - data=json.dumps(doc, separators=( - ',', ':'), ensure_ascii=False), - # note: visited is True since there nothing more to visit - visited=True) - - -def get_package_json(namespace, name, version): - """ - Return the contents of the package.json file of the package described by the purl - field arguments in a string. 
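# A compact illustration of the couchdb-style pagination in
# NpmRegistryVisitor above: each _changes response carries a last_seq value
# that seeds the next index request. The payload is made up for the example.
import json

template = (
    'https://replicate.npmjs.com/registry/_changes'
    '?include_docs=true&limit=1000&since={last_seq}'
)
payload = json.loads('{"last_seq": 12345, "results": []}')
next_url = template.format(last_seq=payload['last_seq'])
assert next_url.endswith('&since=12345')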
- """ - # Create URLs using purl fields - url = npm_api_url( - namespace=namespace, - name=name, - version=version, - ) - - try: - response = requests.get(url) - response.raise_for_status() - return response.json() - except requests.exceptions.HTTPError as err: - logger.error(f"HTTP error occurred: {err}") - - -def map_npm_package(package_url, pipelines, priority=0): - """ - Add a npm `package_url` to the PackageDB. - - Return an error string if any errors are encountered during the process - """ - from minecode.model_utils import add_package_to_scan_queue - from minecode.model_utils import merge_or_create_package - - package_json = get_package_json( - namespace=package_url.namespace, - name=package_url.name, - version=package_url.version, - ) - - if not package_json: - error = f'Package does not exist on npmjs: {package_url}' - logger.error(error) - return error - - package = NpmPackageJsonHandler._parse( - json_data=package_json - ) - package.extra_data['package_content'] = PackageContentType.SOURCE_ARCHIVE - - db_package, _, _, error = merge_or_create_package(package, visit_level=0) - - # Submit package for scanning - if db_package: - add_package_to_scan_queue( - package=db_package, - pipelines=pipelines, - priority=priority - ) - - return error - - -@priority_router.route('pkg:npm/.*') -def process_request(purl_str, **kwargs): - """ - Process `priority_resource_uri` containing a npm Package URL (PURL) as a - URI. - - This involves obtaining Package information for the PURL from npm and - using it to create a new PackageDB entry. The package is then added to the - scan queue afterwards. - """ - from minecode.model_utils import DEFAULT_PIPELINES - - addon_pipelines = kwargs.get('addon_pipelines', []) - pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) - priority = kwargs.get('priority', 0) - - package_url = PackageURL.from_string(purl_str) - if not package_url.version: - return - - error_msg = map_npm_package(package_url, pipelines, priority) - - if error_msg: - return error_msg diff --git a/minecode/visitors/nuget.py b/minecode/visitors/nuget.py deleted file mode 100644 index ce64ea93..00000000 --- a/minecode/visitors/nuget.py +++ /dev/null @@ -1,161 +0,0 @@ -# -# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - - -from bs4 import BeautifulSoup - -from commoncode import fileutils -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import HttpVisitor -from minecode.visitors import URI - - -class NugetSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://api-v2v3search-0.nuget.org/query' - yield 'https://www.nuget.org/packages?page=1' - - -@visit_router.route('https://api-v2v3search-0.nuget.org/query') -class NugetQueryVisitor(HttpJsonVisitor): - """ - 'https://api-v2v3search-0.nuget.org/query' is a query URL which has metadata for - Nuget packages and we can query for all the packages by using the pagination - technique. For example 'https://api-v2v3search-0.nuget.org/query?skip=40' will - skip the first 40 packages in the order and returns JSON data for the packages - from 40-60. - 'https://api-v2v3search-0.nuget.org/query' could be the latest version, as the - url 'https://api-v3search-0.nuget.org/query' is not accessible now. 
- """ - def get_uris(self, content): - """ - Return all the URLs for query results through pagination. - Starts with number '0', increment count by '20'. - The total count is found by 'totalHits'. - """ - pkgs_count = content.get('totalHits', 0) - count = 0 - url_template = 'https://api-v2v3search-0.nuget.org/query?skip={count}' - while count < pkgs_count: - url = url_template.format(count=str(count)) - yield URI(uri=url, source_uri=self.uri) - count = count + 20 - - -@visit_router.route('https://api-v2v3search-0.nuget.org/query\?skip=\d+') -class PackagesPageVisitor(HttpJsonVisitor): - """ - Visit the nuget API resources and return all the package URLs available at the passing`uri`. - """ - def get_uris(self, content): - metadata = content['data'] - for packages in metadata: - for version in packages['versions']: - pkg_ver = version['version'] - pkg_url = version['@id'] - version_template = '{pkg_version}.0.json' - version_name = version_template.format(pkg_version=pkg_ver) - name = pkg_url.replace('https://api.nuget.org/v3/registration1/', '').partition('/')[0] - package_url = PackageURL(type='nuget', name=name, version=pkg_ver).to_string() - if version_name in pkg_url: - # sometimes an extra '0' is appended to the version in the URL - # FIXME: this is weird: there must be good reason why this is done??? - pkg_url = pkg_url.replace(version_name, pkg_ver + '.json') - yield URI(uri=pkg_url, package_url=package_url, source_uri=self.uri) - - # Add another case to have registration0 or registration1 in the url, yield the alternative url. - if pkg_url.find('/registration0/') > 0: - pkg_url = pkg_url.replace('/registration0/', '/registration1/') - yield URI(uri=pkg_url, source_uri=self.uri) - - elif pkg_url.find('/registration1/') > 0: - pkg_url = pkg_url.replace('/registration1/', '/registration0/') - yield URI(uri=pkg_url, source_uri=self.uri) - - -@visit_router.route('https://api.nuget.org/.+.json') -class NugetAPIJsonVisitor(HttpJsonVisitor): - """ - Visit packageContent of nuget API json and return a - download URL for the NugetPackage object - - This could cover three cases: - 1. packageContent is not empty. - https://api.nuget.org/v3/registration1/entityframework/4.3.1.json - Visiting above link will return the npkg file: https://api.nuget.org/packages/entityframework.4.3.1.nupkg - and return the json resource for next DownloadVisitor: https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json - - 2. catalogEntry is not empty - https://api.nuget.org/v3/registration1/entityframework/4.3.1.json - Visiting above link will return the npkg file: https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json - - 3. No key matched - The second loop will return the url https://api.nuget.org/v3/catalog0/data/2015.02.07.22.31.06/entityframework.4.3.1.json - by visiting this url it won't create any new uris, the key is to store the json file itself through visitor and used in mapper. 
- """ - def get_uris(self, content): - download_url = content.get('packageContent') - if download_url: - filename = fileutils.file_name(download_url) - withou_prefix = filename.replace('.nupkg', '') - filename_splits = withou_prefix.partition('.') - name = filename_splits[0] - version = None - if len(filename_splits) > 1: - version = filename_splits[-1] - package_url = PackageURL( - type='nuget', - name=name, - version=version) - yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) - - catalog_entry_url = content.get('catalogEntry') - if catalog_entry_url: - yield URI(uri=catalog_entry_url, source_uri=self.uri) - - -@visit_router.route('https://www.nuget.org/packages\?page=\d+') -class NugetHTMLPageVisitor(HttpVisitor): - """ - Visitor to yield the URI of the each package page. - """ - def get_uris(self, content): - url_format = 'https://www.nuget.org/packages/{name}' - soup = BeautifulSoup(content, 'lxml') - has_package = False - for a in soup.find_all('a'): - if a.get('class') and 'package-title' in a.get('class'): - has_package = True - href = a.get('href') - if not href: - continue - # href format is like: "/packages/NUnit/" - name = href.strip('/').partition('/')[-1] - if name: - yield URI(uri=url_format.format(name=name), source_uri=self.uri) - if has_package: - page_id = self.uri.replace('https://www.nuget.org/packages?page=', '').strip('/') - next_pageid = int(page_id) + 1 - nextpage_url_format = 'https://www.nuget.org/packages?page={id}' - yield URI(uri=nextpage_url_format.format(id=next_pageid), source_uri=self.uri) - - -@visit_router.route('https://www.nuget.org/packages/[\w\-\.]+', - 'https://www.nuget.org/packages/[\w\-\.]+/[\w\-\.]+') -class NugetHTMLPackageVisitor(HttpVisitor): - """ - Visitor to fetch the package HTML content - Example: https://www.nuget.org/packages/log4net - or https://www.nuget.org/packages/log4net/2.0.7 - """ - pass diff --git a/minecode/visitors/openssl.py b/minecode/visitors/openssl.py deleted file mode 100644 index 0a7524fc..00000000 --- a/minecode/visitors/openssl.py +++ /dev/null @@ -1,122 +0,0 @@ -# -# Copyright (c) 2016 by nexB, Inc. http://www.nexb.com/ - All rights reserved. -# - -from __future__ import absolute_import -from __future__ import unicode_literals - -from bs4 import BeautifulSoup -from datetime import datetime - -from commoncode import fileutils -from packageurl import PackageURL - -from minecode import priority_router -from minecode import seed -from minecode import visit_router -from minecode.utils import is_int -from minecode.visitors import HttpVisitor -from minecode.visitors import URI -from minecode.visitors.generic import map_fetchcode_supported_package - - -class OpenSSLSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://ftp.openssl.org/' - - -@visit_router.route('https://ftp.openssl.org/', - 'https://ftp.openssl.org/.*/') -class OpenSSLVisitor(HttpVisitor): - """ - Collect package metadata URIs from the open SSL HTML site. - """ - - def get_uris(self, content): - """ - Return URIs objects and the corresponding size, file date info. - """ - page = BeautifulSoup(content, 'lxml') - for a in page.find_all(name='a'): - if 'href' not in a.attrs: - continue - href = a['href'] - if not href: - continue - if href.startswith('?') or href.startswith('/'): - # if href is not valid resource, ignore, for example, it's a - # link to parent link etc. 
-                continue
-            url = self.uri + href
-            next_sibling = a.parent.findNext('td')
-
-            date = None
-            if next_sibling and next_sibling.contents:
-                date = next_sibling.contents[0].strip()
-                # The date format in the listing is like: 2014-11-19 17:48
-                date = datetime.strptime(date, '%Y-%m-%d %H:%M')
-
-            if next_sibling:
-                next_next = next_sibling.findNext('td')
-                if next_next and next_next.contents:
-                    size = next_next.contents[0].strip()
-                    if size and is_int(size):
-                        # By default, if the unit is not shown, it means k.
-                        size = str(int(size) * 1024)
-                    if size.endswith(('M', 'm')):
-                        # If the size is in megabytes, the value may be a float
-                        # instead of an int, for example 5.1M
-                        size = str(
-                            int(float(size.replace('M', '').replace('m', '')) * 1024 * 1024))
-                    elif size.endswith(('G', 'g')):
-                        # if the size is in gigabytes
-                        size = str(
-                            int(float(size.replace('G', '').replace('g', '')) * 1024 * 1024 * 1024))
-                    if size == '-':
-                        # if it's a folder, ignore the size
-                        size = None
-            file_name = None
-            if not url.endswith('/'):
-                file_name = fileutils.file_name(url)
-            if file_name:
-                # If it's a file, pass the url to the mapper by setting
-                # visited to True
-                package_url = None
-                version = None
-                if 'tar.gz' in file_name:
-                    version = file_name.replace('openssl-', '').partition('.tar.gz')[0]
-                    package_url = PackageURL(type='generic', name='openssl', version=version).to_string()
-                yield URI(uri=url, source_uri=self.uri, package_url=package_url, date=date, file_name=file_name, size=size)
-            else:
-                yield URI(uri=url, source_uri=self.uri, date=date, size=size)
-
-
-# Indexing OpenSSL PURLs requires a GitHub API token.
-# Please add your GitHub API key to the `.env` file, for example: `GH_TOKEN=your-github-api`.
-@priority_router.route('pkg:openssl/openssl@.*')
-def process_request_dir_listed(purl_str, **kwargs):
-    """
-    Process `priority_resource_uri` containing an OpenSSL Package URL (PURL)
-    supported by fetchcode.
-
-    This involves obtaining Package information for the PURL using
-    https://github.com/aboutcode-org/fetchcode and using it to create a new
-    PackageDB entry. The package is then added to the scan queue afterwards.
-    """
-    from minecode.model_utils import DEFAULT_PIPELINES
-
-    addon_pipelines = kwargs.get('addon_pipelines', [])
-    pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines)
-    priority = kwargs.get('priority', 0)
-
-    try:
-        package_url = PackageURL.from_string(purl_str)
-    except ValueError as e:
-        error = f"error occurred when parsing {purl_str}: {e}"
-        return error
-
-    error_msg = map_fetchcode_supported_package(package_url, pipelines, priority)
-
-    if error_msg:
-        return error_msg
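# The size-unit handling in OpenSSLVisitor above, restated as a small helper:
# bare numbers in this listing are kilobytes, M and G suffixes scale
# accordingly, and '-' marks a directory. A sketch of the same rules, not
# the original implementation.
def listing_size_to_bytes(size):
    if size == '-':
        return None
    if size[-1] in 'Mm':
        return int(float(size[:-1]) * 1024 * 1024)
    if size[-1] in 'Gg':
        return int(float(size[:-1]) * 1024 * 1024 * 1024)
    return int(size) * 1024

assert listing_size_to_bytes('12') == 12288
assert listing_size_to_bytes('5.1M') == 5347737
assert listing_size_to_bytes('-') is None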
diff --git a/minecode/visitors/openwrt.py b/minecode/visitors/openwrt.py
deleted file mode 100644
index 11d49aeb..00000000
--- a/minecode/visitors/openwrt.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#
-# Copyright (c) 2017 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
-#
-
-from __future__ import absolute_import
-from __future__ import unicode_literals
-
-import gzip
-import json
-import os
-
-from bs4 import BeautifulSoup
-from debian_inspector import debcon
-from packageurl import PackageURL
-
-from minecode import seed
-from minecode import visit_router
-from minecode.utils import extract_file
-from minecode.visitors import HttpVisitor
-from minecode.visitors import NonPersistentHttpVisitor
-from minecode.visitors import URI
-
-
-class OpenWrtSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'https://downloads.openwrt.org/chaos_calmer/15.05/'
-
-
-@visit_router.route('https://downloads.openwrt.org/.*/')
-class OpenWrtDownloadPagesVisitor(HttpVisitor):
-    """
-    Visit an OpenWrt download HTML page and return URIs parsed from the page.
-    """
-    def get_uris(self, content):
-        page = BeautifulSoup(content, 'lxml')
-        for td in page.find_all(name='td'):
-            a = td.find(name='a')
-            if not a:
-                continue
-            href = a['href']
-            if href == '../':  # Ignore the parent url
-                continue
-
-            # Add the uri for the next loop if it ends with "/", which means
-            # it's a folder resource uri
-            if href.endswith('/'):
-                package_url = PackageURL(type='openwrt', name=href.replace('/', '')).to_string()
-                yield URI(uri=self.uri + href, package_url=package_url, source_uri=self.uri)
-            elif href.endswith(('Packages', 'Packages.gz', '.ipk')):
-                yield URI(uri=self.uri + href, source_uri=self.uri)
-
-
-@visit_router.route('https://downloads.openwrt.org/.*/Packages\.gz')
-class OpenWrtPackageIndexVisitor(NonPersistentHttpVisitor):
-    """
-    Visit an OpenWrt Packages.gz index file and collect uris.
-    """
-    def get_uris(self, content):
-        with gzip.open(content, 'rb') as f:
-            content = f.read()
-
-        for package in debcon.get_paragraphs_data(content):
-            file_info = package.get('Filename')
-            if not file_info:
-                continue
-            version = package.get('Version')
-            md5sum = package.get('MD5Sum')
-            sha256sum = package.get('SHA256sum')
-            package_name = package.get('Package')
-            package_url = None
-            if package_name and version:
-                package_url = PackageURL(type='openwrt', name=package_name, version=version).to_string()
-            file_info = file_info.lstrip('/')
-            dir_url = self.uri.replace('Packages.gz', '') + file_info
-            yield URI(uri=dir_url, package_url=package_url, data=json.dumps(str(package)), source_uri=self.uri, md5=md5sum, sha256=sha256sum,)
-
-
-@visit_router.route('https://downloads.openwrt.org/.*\.ipk')
-class OpenWrtIpkPackageArchiveVisitor(NonPersistentHttpVisitor):
-    """
-    Visit an OpenWrt .ipk package archive and return its parsed control data.
-    """
-    def dumps(self, content):
-        """
-        Extract an ipk package archive and its control.tar.gz. Parse the
-        control file and return a JSON string from these data.
-        """
-        extracted_location = extract_file(content)
-        control_targz = os.path.join(extracted_location, 'control.tar.gz')
-        control_extracted_folder = extract_file(control_targz)
-        control_location = os.path.join(control_extracted_folder, 'control')
-        parsed = debcon.Debian822.from_file(control_location)
-        return json.dumps(parsed)
diff --git a/minecode/visitors/packagist.py b/minecode/visitors/packagist.py
deleted file mode 100644
index fb6adac8..00000000
--- a/minecode/visitors/packagist.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#
-# Copyright (c) 2017 by nexB, Inc. http://www.nexb.com/ - All rights reserved.
-# - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI - -""" -Collect packagist packages - -The packagist repo API is at: https://packagist.org/apidoc -""" - - -class PackagistSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://packagist.org/packages/list.json' - - -@visit_router.route('https://packagist.org/packages/list.json') -class PackagistListVisitor(HttpJsonVisitor): - """ - Collect list json resource and yield URIs for searching with package url. - - The yield uri format is like: https://packagist.org/p/[vendor]/[package].json - """ - - def get_uris(self, content): - search_url_template = 'https://packagist.org/p/{vendor}/{package}.json' - packages_entries = content.get('packageNames', {}) - for package in packages_entries: - # FIXME: what does it mean to have no / in the URL? - if '/' not in package: - continue - vp = package.split('/') - vendor = vp[0] - package = vp[1] - package_url = PackageURL(type='composer', name=package).to_string() - yield URI(uri=search_url_template.format(vendor=vendor, package=package), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://packagist.org/p/.*json') -class PackageVisitor(HttpJsonVisitor): - """ - Collect JSON for a package. - """ - # FIXME: what about having a download URL to fetch the real package??? - pass diff --git a/minecode/visitors/pypi.py b/minecode/visitors/pypi.py deleted file mode 100644 index ba9425a6..00000000 --- a/minecode/visitors/pypi.py +++ /dev/null @@ -1,131 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -import codecs -import json -import xmlrpc - -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.utils import get_temp_file -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import URI -from minecode.visitors import Visitor - - -""" -Visitors for Pypi and Pypi-like Python package repositories. - -We have this hierarchy in Pypi: - index (xmlrpc) -> packages (json) -> package releases (json) -> download urls - -Pypi serves a main index via XMLRPC that contains a list of package names. -For each package, a JSON contains details including the list of all releases. -For each release, a JSON contains details for the released version and all the -downloads available for this release. We create Packages at this level as well -as one download URI for each effective download. - -Some information about every release and download is replicated in every JSON -payload and is ignored for simplicity (which is not super efficient). -""" - - -class PypiSeed(seed.Seeder): - - def get_seeds(self): - yield 'https://pypi.python.org/pypi/' - - -@visit_router.route('https://pypi.python.org/pypi/') -class PypiIndexVisitor(Visitor): - """ - Collect package metadata URIs from the top level pypi index for each package. - """ - - def fetch(self, uri, timeout=None): - """ - Specialized fetching using XML RPCs. 
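# The vendor/package split in PackagistListVisitor above in miniature:
# packagist names are 'vendor/package' pairs that map onto the
# p/{vendor}/{package}.json metadata URL.
name = 'symfony/console'
vendor, _, package = name.partition('/')
url = f'https://packagist.org/p/{vendor}/{package}.json'
assert url == 'https://packagist.org/p/symfony/console.json'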
- """ - packages = xmlrpc.client.ServerProxy(uri).list_packages() - content = list(packages) - - temp_file = get_temp_file('PypiIndexVisitor') - with codecs.open(temp_file, mode='wb', encoding='utf-8') as expect: - json.dump(content, expect, indent=2, separators=(',', ':')) - return temp_file - - def dumps(self, content): - """ - The content is huge json and should not be dumped. - """ - return None - - def get_uris(self, content): - with codecs.open(content, mode='rb', encoding='utf-8') as contentfile: - packages_list = json.load(contentfile) - - url_template = 'https://pypi.python.org/pypi/{name}/json' - for name in packages_list: - package_url = PackageURL(type='pypi', name=name).to_string() - yield URI(uri=url_template.format(name=name), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://pypi.python.org/pypi/[^/]+/json') -class PypiPackageVisitor(HttpJsonVisitor): - """ - Collect package metadata URIs for all release of a single Pypi package. - The url will contain only the package name, for example: https://pypi.org/pypi/vmock/json - By parsing the content, the goal is to form the json with version/release: https://pypi.org/pypi/vmock/0.1/json - """ - - def get_uris(self, content): - - url_template = 'https://pypi.python.org/pypi/{name}/{release}/json' - info = content.get('info', {}) - name = info.get('name') - if name: - for release in content['releases']: - package_url = PackageURL( - type='pypi', name=name, version=release).to_string() - yield URI(uri=url_template.format(name=name, release=release), package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https://pypi.python.org/pypi/[^/]+/[^/]+/json') -class PypiPackageReleaseVisitor(HttpJsonVisitor): - """ - Collect package download URIs for all packages archives of one Pypi package - release. The example is: https://pypi.org/pypi/vmock/0.1/json - """ - - def get_uris(self, content): - # TODO: this is likely best ignored entirely??? - # A download_url may be provided for an off-Pypi-download - info = content.get('info', {}) - name = info.get('name') - version = None - download_url = info.get('download_url') - if download_url and download_url != 'UNKNOWN': - version = info.get('version') - package_url = PackageURL( - type='pypi', name=name, version=version).to_string() - yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) - - # Common on-Pypi-download URLs are in the urls block - for download in content.get('urls', {}): - url = download.get('url') - if not url: - continue - package_url = PackageURL( - type='pypi', name=name, version=version).to_string() - yield URI(url, package_url=package_url, file_name=download.get('filename'), - size=download.get('size'), date=download.get('upload_time'), - md5=download.get('md5_digest'), source_uri=self.uri) diff --git a/minecode/visitors/repomd_parser.py b/minecode/visitors/repomd_parser.py deleted file mode 100644 index 78f8fe8a..00000000 --- a/minecode/visitors/repomd_parser.py +++ /dev/null @@ -1,110 +0,0 @@ -# -# Copyright (c) 2016 nexB Inc. and others. All rights reserved. 
-# - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import unicode_literals - -import json -import logging -import os - - -from commoncode import fileutils -from packagedcode.models import PackageData -from packagedcode.rpm import EVR - -from minecode import visit_router -from minecode.utils import extract_file -from minecode.utils import fetch_http -from minecode.utils import get_temp_file -from minecode.visitors import URI -from minecode.visitors import repodata - - -logger = logging.getLogger(__name__) - - -""" -Analyzes the "repomd.xml" of a given repository from the URL given as input -and generates a list of RPM objects -""" - - -def download(uri): - """ - Fetch the file at uri, saving it to a temp file and return the path to - this temp file. - """ - name = fileutils.file_name(uri) - file_ext = fileutils.file_extension(name) - name = name.replace(file_ext, '') - - content = fetch_http(uri) - temp_file = get_temp_file(file_name='minecode-fetched-file-' + name, extension=file_ext) - with open(temp_file, 'wb') as tmp: - tmp.write(content) - file_name = tmp.name - return file_name - - -def generate_rpm_objects(package_infos, base_url): - """ - Yield Packages from an iterable of RPM infos given a base_url. - """ - # FIXME: what does package_infos mean? wheer does it come from? - for infos in package_infos: - package_data = dict( - # FIXME: need to add id back? this is id is some hash which is local to the repo. - # id=infos.get('pkgid'), - type='rpm', - name=infos.get('name'), - version=EVR(epoch=infos.get('epoch'), version=infos.get( - 'ver'), release=infos.get('rel')).to_string(), - description=infos.get('description'), - homepage_url=infos.get('url'), - download_url=repodata.build_rpm_download_url( - base_url, infos.get('href')), - extracted_license_statement = infos.get('license', '') - ) - package = PackageData.from_data(package_data) - if infos.get('source_rpm'): - src_rpm = PackageData(name=infos.get('source_rpm')) - package.related_packages = [src_rpm] - yield package - -# TODO: refactor, this does not make sense, each are different URIs? -# FIXME: the doc and semantics are cryptic too - - -def fetch_repomd_subfile(base_url, repomd_xml, subfile): - """ - Downloads and extract a subfile('filelists.xml.gz', 'primary.xml.gz', - 'other.xml.gz') of any repodata and returns the subfile location. - """ - url = base_url + repodata.get_url_for_tag(repomd_xml, subfile) - target_location = extract_file(download(url)) - return os.path.join(target_location, os.listdir(target_location)[0]) - - -@visit_router.route('.+/repomd.xml') -def collect_rpm_packages_from_repomd(uri): - """ - Collect RPM data from yum repository repomd.xml. 
- """ - base_url = fileutils.parent_directory(fileutils.parent_directory(uri)) - repomd_xml = download(uri) - - filelists_xml = fetch_repomd_subfile(base_url, repomd_xml, 'filelists') - primary_xml = fetch_repomd_subfile(base_url, repomd_xml, 'primary') - other_xml = fetch_repomd_subfile(base_url, repomd_xml, 'other') - - pkg_infos = repodata.get_pkg_infos(filelists_xml, primary_xml, other_xml) - - rpms = list(generate_rpm_objects(pkg_infos, base_url)) - uris = [] - for rpm in rpms: - if rpm.download_url: - uris.append(URI(uri=rpm.download_url)) - return uris, json.dumps([r.to_dict() for r in rpms]), None diff --git a/minecode/visitors/rubygems.py b/minecode/visitors/rubygems.py deleted file mode 100644 index 2bac0c10..00000000 --- a/minecode/visitors/rubygems.py +++ /dev/null @@ -1,145 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/aboutcode-org/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - - -import gzip -import json -import logging -import os - -from rubymarshal import reader -from rubymarshal.classes import UsrMarshal -from packageurl import PackageURL - -from minecode import seed -from minecode import visit_router -from minecode.utils import extract_file -from minecode.visitors import HttpJsonVisitor -from minecode.visitors import NonPersistentHttpVisitor -from minecode.visitors import URI - - -logger = logging.getLogger(__name__) -handler = logging.StreamHandler() -logger.addHandler(handler) -logger.setLevel(logging.INFO) - - -# FIXME: we are missing several API calls: -# http://guides.rubygems.org/rubygems-org-api/ - -class RubyGemsSeed(seed.Seeder): - - def get_seeds(self): - # We keep only specs.4.8.gz and exclude latest_spec.4.8.gz, - # since specs.4.8.gz covers all uris in latest spec. - yield 'http://rubygems.org/specs.4.8.gz' - - -class GemVersion(UsrMarshal): - - def version(self): - return self.values['version'] - - -@visit_router.route('https?://rubygems\.org/specs\.4\.8\.gz') -class RubyGemsIndexVisitor(NonPersistentHttpVisitor): - """ - Collect REST APIs URIs from RubyGems index file. - """ - - def get_uris(self, content): - with gzip.open(content, 'rb') as idx: - index = idx.read() - - # TODO: use a purl!!! - for name, version, platform in reader.loads(index): - json_url = 'https://rubygems.org/api/v1/versions/{name}.json'.format( - **locals()) - - package_url = PackageURL(type='gem', name=name).to_string() - yield URI(uri=json_url, package_url=package_url, source_uri=self.uri) - - # note: this list only has ever a single value - version = version.values[0] - if isinstance(version, bytes): - version = version.decode('utf-8') - - download_url = 'https://rubygems.org/downloads/{name}-{version}' - - if isinstance(platform, bytes): - platform = platform.decode('utf-8') - if platform != 'ruby': - download_url += '-{platform}' - - download_url += '.gem' - download_url = download_url.format(**locals()) - package_url = PackageURL( - type='gem', name=name, version=version).to_string() - yield URI(uri=download_url, package_url=package_url, source_uri=self.uri) - - -@visit_router.route('https?://rubygems\.org/api/v1/versions/[\w\-\.]+.json') -class RubyGemsApiManyVersionsVisitor(HttpJsonVisitor): - """ - Collect the json content of each version. 
-    Yield the URI of each gem based on name, platform and version.
-    The data of the URI is the JSON subset for a single version.
-    """
-
-    def get_uris(self, content):
-        """
-        Yield the URI of each gem's download URL and data.
-        """
-        # FIXME: return actual data too!!!
-        for version_details in content:
-            # get the gem's name by parsing it from the uri
-            name = self.uri[
-                self.uri.index('/versions/') + len('/versions/'):-len('.json')]
-            version = version_details.get('number')
-            gem_name = '%(name)s-%(version)s' % locals()
-            package_url = PackageURL(
-                type='gem', name=name, version=version).to_string()
-            download_url = 'https://rubygems.org/downloads/%(gem_name)s.gem' % locals()
-            yield URI(uri=download_url, source_uri=self.uri, package_url=package_url,
-                      data=json.dumps(version_details))
-
-# TODO: add API dependencies
-# https://rubygems.org/api/v1/dependencies.json?gems=file_validators
-# Also use the V2 API at http://guides.rubygems.org/rubygems-org-api-v2/
-# GET - /api/v2/rubygems/[GEM NAME]/versions/[VERSION NUMBER].(json|yaml)
-
-
-@visit_router.route('https?://rubygems.org/downloads/[\w\-\.]+.gem')
-class RubyGemsPackageArchiveMetadataVisitor(NonPersistentHttpVisitor):
-    """
-    Fetch a RubyGems gem archive, extract it and return its metadata file content.
-    """
-
-    def dumps(self, content):
-        return get_gem_metadata(content)
-
-
-def get_gem_metadata(location):
-    """
-    Return the metadata file content as a string extracted from the gem archive
-    at `location`.
-    """
-    # Extract the compressed file first.
-    extracted_location = extract_file(location)
-    metadata_gz = os.path.join(extracted_location, 'metadata.gz')
-    # Extract the embedded metadata gz file
-    extract_parent_location = extract_file(metadata_gz)
-    # Get the first file in the extracted folder, which is the metadata file location
-    meta_extracted_file = os.path.join(
-        extract_parent_location, os.listdir(extract_parent_location)[0])
-    with open(meta_extracted_file) as meta_file:
-        return meta_file.read()
diff --git a/minecode/visitors/sourceforge.py b/minecode/visitors/sourceforge.py
deleted file mode 100644
index 7b2d7a7a..00000000
--- a/minecode/visitors/sourceforge.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#
-# Copyright (c) nexB Inc. and others. All rights reserved.
-# purldb is a trademark of nexB Inc.
-# SPDX-License-Identifier: Apache-2.0
-# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
-# See https://github.com/aboutcode-org/purldb for support or download.
-# See https://aboutcode.org for more information about nexB OSS projects.
-#
-
-import logging
-import re
-
-from bs4 import BeautifulSoup
-
-from packageurl import PackageURL
-
-from minecode import seed
-from minecode import visit_router
-from minecode.visitors import HttpJsonVisitor
-from minecode.visitors import HttpVisitor
-from minecode.visitors import NonPersistentHttpVisitor
-from minecode.visitors import URI
-
-
-logger = logging.getLogger(__name__)
-handler = logging.StreamHandler()
-logger.addHandler(handler)
-logger.setLevel(logging.INFO)
-
-
-class SourceforgeSeed(seed.Seeder):
-
-    def get_seeds(self):
-        yield 'https://sourceforge.net/sitemap.xml'
-
-
-@visit_router.route('https?://sourceforge.net/sitemap.xml')
-class SourceforgeSitemapIndexVisitor(NonPersistentHttpVisitor):
-    """
-    Collect sub-sitemaps from the main sitemap.
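# An alternative stdlib-only sketch of get_gem_metadata above, assuming the
# standard .gem layout: a plain tar archive whose metadata.gz member holds the
# gemspec YAML.
import gzip
import tarfile


def read_gem_metadata(gem_path):
    """Return the YAML metadata text embedded in a .gem archive."""
    with tarfile.open(gem_path) as gem:
        member = gem.extractfile("metadata.gz")
        return gzip.decompress(member.read()).decode("utf-8")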
Return one URI for each sub-
-    sitemap, for example: https://sourceforge.net/sitemap-167.xml
-
-    Note that the class inherits from NonPersistentHttpVisitor instead of HttpVisitor,
-    as the XML file itself can be over 100MB, so NonPersistentHttpVisitor is more
-    suitable.
-    """
-
-    def get_uris(self, content):
-        """
-        Collect all the sitemap URIs from the master sitemap.
-        """
-        locs = BeautifulSoup(open(content), 'lxml').find_all('loc')
-        # Content passed from NonPersistentHttpVisitor is a temp file path
-        # instead of file content, so opening it to get a file handler is
-        # necessary.
-        for loc in locs:
-            yield URI(uri=loc.text, source_uri=self.uri)
-
-
-@visit_router.route('https?://sourceforge.net/sitemap-\d+.xml')
-class SourceforgeSitemapPageVisitor(HttpVisitor):
-
-    def get_uris(self, content):
-        """
-        Collect all the project URIs from a sub-sitemap.
-        """
-        sitemap_locs = BeautifulSoup(content, 'lxml').find_all('loc')
-        regex = re.compile(
-            r"^https?://sourceforge.net/projects/[a-z0-9.-]+/?$")
-        for loc in sitemap_locs:
-            if loc.text and re.match(regex, loc.text):
-                project_json_baseurl = 'https://sourceforge.net/api/project/name/{}/json'
-                project_name = loc.text.partition(
-                    'https://sourceforge.net/projects/')[-1].strip('/')
-                project_json_url = project_json_baseurl.format(project_name)
-                package_url = PackageURL(
-                    type='sourceforge', name=project_name).to_string()
-                # The priority in the XML has a different meaning than the visitor priority, so skip it.
-                yield URI(uri=project_json_url, package_url=package_url, source_uri=self.uri)
-
-
-@visit_router.route('https?://sourceforge.net/api/project/name/[a-z0-9.-]+/json',
-                    'https?://sourceforge.net/rest/p/[a-z0-9.-]+'
-                    )
-class SourceforgeProjectJsonVisitor(HttpJsonVisitor):
-    """
-    Collect Sourceforge project data through the JSON API.
-    The implementation is empty since it inherits from HttpJsonVisitor and
-    returns JSON data for the mapper.
-    """
-    pass
diff --git a/packagedb/api.py b/packagedb/api.py
index 104d95a7..a6097b2e 100644
--- a/packagedb/api.py
+++ b/packagedb/api.py
@@ -9,18 +9,18 @@
 import logging
 
-import django_filters
 from django.core.exceptions import ValidationError
 from django.db.models import OuterRef
 from django.db.models import Q
 from django.db.models import Subquery
 from django.forms import widgets
 from django.forms.fields import MultipleChoiceField
+
+import django_filters
 from django_filters.filters import Filter
 from django_filters.filters import MultipleChoiceFilter
 from django_filters.filters import OrderingFilter
 from django_filters.rest_framework import FilterSet
-
 from drf_spectacular.plumbing import build_array_type
 from drf_spectacular.plumbing import build_basic_type
 from drf_spectacular.types import OpenApiTypes
@@ -39,12 +39,12 @@
 from univers.version_range import VersionRange
 from univers.versions import InvalidVersion
 
+from minecode import collectors  # NOQA
+
 # UnusedImport here!
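# A condensed sketch of the sub-sitemap parsing done by the deleted
# SourceforgeSitemapPageVisitor above, reusing the same project-URL regex.
import re

from bs4 import BeautifulSoup

PROJECT_URL = re.compile(r"^https?://sourceforge.net/projects/[a-z0-9.-]+/?$")


def iter_project_api_urls(sitemap_xml_text):
    """Yield the JSON API URL for each project <loc> in a SourceForge sub-sitemap."""
    for loc in BeautifulSoup(sitemap_xml_text, "lxml").find_all("loc"):
        if loc.text and PROJECT_URL.match(loc.text):
            name = loc.text.partition("https://sourceforge.net/projects/")[-1].strip("/")
            yield f"https://sourceforge.net/api/project/name/{name}/json"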
-# But importing the mappers and visitors module triggers routes registration +# But importing the collectors module triggers routes registration from minecode import priority_router -from minecode import visitors # NOQA from minecode.models import PriorityResourceURI -from minecode.models import ScannableURI from minecode.route import NoRouteAvailable from packagedb.filters import PackageSearchFilter from packagedb.models import Package @@ -56,7 +56,6 @@ from packagedb.package_managers import get_api_package_name from packagedb.package_managers import get_version_fetcher from packagedb.serializers import CollectPackageSerializer -from packagedb.serializers import is_supported_addon_pipeline from packagedb.serializers import DependentPackageSerializer from packagedb.serializers import IndexPackagesResponseSerializer from packagedb.serializers import IndexPackagesSerializer @@ -64,13 +63,14 @@ from packagedb.serializers import PackageSetAPISerializer from packagedb.serializers import PackageWatchAPISerializer from packagedb.serializers import PackageWatchCreateSerializer -from packagedb.serializers import UpdatePackagesSerializer from packagedb.serializers import PackageWatchUpdateSerializer from packagedb.serializers import PartySerializer -from packagedb.serializers import PurlValidateResponseSerializer from packagedb.serializers import PurlUpdateResponseSerializer +from packagedb.serializers import PurlValidateResponseSerializer from packagedb.serializers import PurlValidateSerializer from packagedb.serializers import ResourceAPISerializer +from packagedb.serializers import UpdatePackagesSerializer +from packagedb.serializers import is_supported_addon_pipeline from packagedb.throttling import StaffUserRateThrottle from purl2vcs.find_source_repo import get_source_package_and_add_to_package_set @@ -85,22 +85,19 @@ class CharMultipleWidget(widgets.TextInput): def value_from_datadict(self, data, files, name): value = widgets.SelectMultiple().value_from_datadict(data, files, name) - if not value or value == ['']: - return '' + if not value or value == [""]: + return "" return value def format_value(self, value): - """ - Return a value as it should appear when rendered in a template. - """ - return ', '.join(value) + """Return a value as it should appear when rendered in a template.""" + return ", ".join(value) class MultipleCharField(MultipleChoiceField): - """ - Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`. - """ + """Overrides `MultipleChoiceField` to fit in `MultipleCharFilter`.""" + widget = CharMultipleWidget def valid_value(self, value): @@ -108,16 +105,13 @@ def valid_value(self, value): class MultipleCharFilter(MultipleChoiceFilter): - """ - Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax. 
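# To illustrate the `?field=a&field=b` syntax handled by MultipleCharFilter:
# a hypothetical request such as GET /api/packages/?name=zlib&name=libpng
# arrives as value=["zlib", "libpng"] and, with lookup_expr="iexact", is ORed
# together, roughly equivalent to:
from django.db.models import Q

lookups = Q(name__iexact="zlib") | Q(name__iexact="libpng")
# qs = Package.objects.filter(lookups)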
- """ + """Filters on multiple values for a CharField type using `?field=a&field=b` URL syntax.""" + field_class = MultipleCharField class MultipleCharInFilter(MultipleCharFilter): - """ - Does a __in = [value] filter instead of field=value filter - """ + """Does a __in = [value] filter instead of field=value filter""" def filter(self, qs, value): if not value: @@ -129,7 +123,7 @@ def filter(self, qs, value): predicate = self.get_filter_predicate(value) old_field_name = next(iter(predicate)) - new_field_name = f'{old_field_name}__in' + new_field_name = f"{old_field_name}__in" predicate[new_field_name] = predicate[old_field_name] predicate.pop(old_field_name) @@ -151,11 +145,11 @@ class CreateListRetrieveUpdateViewSetMixin( To use it, override the class and set the `.queryset` and `.serializer_class` attributes. """ + pass class PackageResourcePurlFilter(Filter): - def filter(self, qs, value): if not value: return qs @@ -173,7 +167,6 @@ def filter(self, qs, value): class PackageResourceUUIDFilter(Filter): - def filter(self, qs, value): if not value: return qs @@ -187,24 +180,24 @@ def filter(self, qs, value): class ResourceFilterSet(FilterSet): - package = PackageResourceUUIDFilter(label='Package UUID') - purl = PackageResourcePurlFilter(label='Package pURL') + package = PackageResourceUUIDFilter(label="Package UUID") + purl = PackageResourcePurlFilter(label="Package pURL") md5 = MultipleCharInFilter( - help_text='Exact MD5. Multi-value supported.', + help_text="Exact MD5. Multi-value supported.", ) sha1 = MultipleCharInFilter( - help_text='Exact SHA1. Multi-value supported.', + help_text="Exact SHA1. Multi-value supported.", ) class ResourceViewSet(viewsets.ReadOnlyModelViewSet): - queryset = Resource.objects.select_related('package') + queryset = Resource.objects.select_related("package") serializer_class = ResourceAPISerializer filterset_class = ResourceFilterSet throttle_classes = [StaffUserRateThrottle, AnonRateThrottle] - lookup_field = 'sha1' + lookup_field = "sha1" - @action(detail=False, methods=['post']) + @action(detail=False, methods=["post"]) def filter_by_checksums(self, request, *args, **kwargs): """ Take a mapping, where the keys are the names of the checksum algorthm @@ -217,7 +210,7 @@ def filter_by_checksums(self, request, *args, **kwargs): - sha1 Example: - + ------- { "sha1": [ "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911", @@ -238,24 +231,23 @@ def filter_by_checksums(self, request, *args, **kwargs): } This will return Resources whose sha1 or md5 matches those values. 
+ """ data = dict(request.data) unsupported_fields = [] for field, value in data.items(): - if field not in ('md5', 'sha1'): + if field not in ("md5", "sha1"): unsupported_fields.append(field) if unsupported_fields: - unsupported_fields_str = ', '.join(unsupported_fields) + unsupported_fields_str = ", ".join(unsupported_fields) response_data = { - 'status': f'Unsupported field(s) given: {unsupported_fields_str}' + "status": f"Unsupported field(s) given: {unsupported_fields_str}" } return Response(response_data, status=status.HTTP_400_BAD_REQUEST) if not data: - response_data = { - 'status': 'No values provided' - } + response_data = {"status": "No values provided"} return Response(response_data, status=status.HTTP_400_BAD_REQUEST) lookups = Q() @@ -263,18 +255,18 @@ def filter_by_checksums(self, request, *args, **kwargs): value = value or [] # We create this intermediate dictionary so we can modify the field # name to have __in at the end - d = {f'{field}__in': value} + d = {f"{field}__in": value} lookups |= Q(**d) qs = Resource.objects.filter(lookups) paginated_qs = self.paginate_queryset(qs) serializer = ResourceAPISerializer( - paginated_qs, many=True, context={'request': request}) + paginated_qs, many=True, context={"request": request} + ) return self.get_paginated_response(serializer.data) class MultiplePackageURLFilter(MultipleCharFilter): - def filter(self, qs, value): if not value: # Even though not a noop, no point filtering if empty. @@ -283,7 +275,7 @@ def filter(self, qs, value): if self.is_noop(qs, value): return qs - if all(v == '' for v in value): + if all(v == "" for v in value): return qs q = Q() @@ -302,48 +294,48 @@ def filter(self, qs, value): PACKAGE_FILTER_SORT_FIELDS = [ - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'download_url', - 'filename', - 'size', - 'release_date', + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "download_url", + "filename", + "size", + "release_date", ] class PackageFilterSet(FilterSet): type = django_filters.CharFilter( - lookup_expr='iexact', - help_text='Exact type. (case-insensitive)', + lookup_expr="iexact", + help_text="Exact type. (case-insensitive)", ) namespace = django_filters.CharFilter( - lookup_expr='iexact', - help_text='Exact namespace. (case-insensitive)', + lookup_expr="iexact", + help_text="Exact namespace. (case-insensitive)", ) name = MultipleCharFilter( - lookup_expr='iexact', - help_text='Exact name. Multi-value supported. (case-insensitive)', + lookup_expr="iexact", + help_text="Exact name. Multi-value supported. (case-insensitive)", ) version = MultipleCharFilter( - help_text='Exact version. Multi-value supported.', + help_text="Exact version. Multi-value supported.", ) md5 = MultipleCharInFilter( - help_text='Exact MD5. Multi-value supported.', + help_text="Exact MD5. Multi-value supported.", ) sha1 = MultipleCharInFilter( - help_text='Exact SHA1. Multi-value supported.', + help_text="Exact SHA1. 
Multi-value supported.",
    )
    purl = MultiplePackageURLFilter(
-        label='Package URL',
+        label="Package URL",
    )
    search = PackageSearchFilter(
-        label='Search',
-        field_name='name',
-        lookup_expr='icontains',
+        label="Search",
+        field_name="name",
+        lookup_expr="icontains",
    )

    sort = OrderingFilter(fields=PACKAGE_FILTER_SORT_FIELDS)
@@ -351,31 +343,31 @@ class PackageFilterSet(FilterSet):
    class Meta:
        model = Package
        fields = (
-            'search',
-            'type',
-            'namespace',
-            'name',
-            'version',
-            'qualifiers',
-            'subpath',
-            'download_url',
-            'filename',
-            'sha1',
-            'sha256',
-            'md5',
-            'size',
-            'release_date',
+            "search",
+            "type",
+            "namespace",
+            "name",
+            "version",
+            "qualifiers",
+            "subpath",
+            "download_url",
+            "filename",
+            "sha1",
+            "sha256",
+            "md5",
+            "size",
+            "release_date",
        )


class PackagePublicViewSet(viewsets.ReadOnlyModelViewSet):
-    queryset = Package.objects.prefetch_related('dependencies', 'parties')
+    queryset = Package.objects.prefetch_related("dependencies", "parties")
    serializer_class = PackageAPISerializer
-    lookup_field = 'uuid'
+    lookup_field = "uuid"
    filterset_class = PackageFilterSet
    throttle_classes = [StaffUserRateThrottle, AnonRateThrottle]

-    @action(detail=True, methods=['get'])
+    @action(detail=True, methods=["get"])
    def latest_version(self, request, *args, **kwargs):
        """
        Return the latest version of the current Package,
@@ -386,44 +378,38 @@ def latest_version(self, request, *args, **kwargs):
        latest_version = package.get_latest_version()
        if latest_version:
            return Response(
-                PackageAPISerializer(latest_version, context={
-                    'request': request}).data
+                PackageAPISerializer(latest_version, context={"request": request}).data
            )
        return Response({})

-    @action(detail=True, methods=['get'])
+    @action(detail=True, methods=["get"])
    def history(self, request, *args, **kwargs):
-        """
-        Return the History field associated with the current Package.
-        """
+        """Return the History field associated with the current Package."""
        package = self.get_object()
        return Response({"history": package.history})

-    @action(detail=True, methods=['get'])
+    @action(detail=True, methods=["get"])
    def resources(self, request, *args, **kwargs):
-        """
-        Return the Resources associated with the current Package.
-        """
+        """Return the Resources associated with the current Package."""
        package = self.get_object()
        qs = Resource.objects.filter(package=package)
        paginated_qs = self.paginate_queryset(qs)
        serializer = ResourceAPISerializer(
-            paginated_qs, many=True, context={'request': request})
+            paginated_qs, many=True, context={"request": request}
+        )
        return self.get_paginated_response(serializer.data)

    @action(detail=True)
    def get_enhanced_package_data(self, request, *args, **kwargs):
-        """
-        Return a mapping of enhanced Package data for a given Package
-        """
+        """Return a mapping of enhanced Package data for a given Package"""
        package = self.get_object()
        package_data = get_enhanced_package(package)
        return Response(package_data)

-    @action(detail=False, methods=['post'])
+    @action(detail=False, methods=["post"])
    def filter_by_checksums(self, request, *args, **kwargs):
        """
        Take a mapping, where the keys are the names of the checksum algorithm
@@ -438,7 +424,7 @@ def filter_by_checksums(self, request, *args, **kwargs):
        - sha512

        Example:
-
+        -------
        {
            "sha1": [
                "b55fd82f80cc1bd0bdabf9c6e3153788d35d7911",
@@ -459,42 +445,42 @@
        }

        This will return Packages whose sha1 or md5 matches those values.
+ """ data = dict(request.data) unsupported_fields = [] - supported_fields = ['md5', 'sha1', 'sha256', - 'sha512', 'enhance_package_data'] + supported_fields = ["md5", "sha1", "sha256", "sha512", "enhance_package_data"] for field, value in data.items(): if field not in supported_fields: unsupported_fields.append(field) if unsupported_fields: - unsupported_fields_str = ', '.join(unsupported_fields) + unsupported_fields_str = ", ".join(unsupported_fields) response_data = { - 'status': f'Unsupported field(s) given: {unsupported_fields_str}' + "status": f"Unsupported field(s) given: {unsupported_fields_str}" } return Response(response_data, status=status.HTTP_400_BAD_REQUEST) - enhance_package_data = data.pop('enhance_package_data', False) + enhance_package_data = data.pop("enhance_package_data", False) if not data: - response_data = { - 'status': 'No values provided' - } + response_data = {"status": "No values provided"} return Response(response_data, status=status.HTTP_400_BAD_REQUEST) lookups = Q() for field, value in data.items(): # Subquery to get the ids of the Packages with the earliest release_date for each `field` - earliest_release_dates = Package.objects.filter( - **{field: OuterRef(field)} - ).order_by('release_date').values('id')[:1] + earliest_release_dates = ( + Package.objects.filter(**{field: OuterRef(field)}) + .order_by("release_date") + .values("id")[:1] + ) value = value or [] lookups |= Q( **{ - f'{field}__in': value, - 'id__in': Subquery(earliest_release_dates), + f"{field}__in": value, + "id__in": Subquery(earliest_release_dates), } ) @@ -502,32 +488,28 @@ def filter_by_checksums(self, request, *args, **kwargs): qs = Package.objects.filter(lookups) paginated_qs = self.paginate_queryset(qs) if enhance_package_data: - serialized_package_data = [get_enhanced_package( - package=package) for package in paginated_qs] + serialized_package_data = [ + get_enhanced_package(package=package) for package in paginated_qs + ] else: serializer = PackageAPISerializer( - paginated_qs, many=True, context={'request': request}) + paginated_qs, many=True, context={"request": request} + ) serialized_package_data = serializer.data return self.get_paginated_response(serialized_package_data) class PackageViewSet(PackagePublicViewSet): - @action(detail=True) def reindex_package(self, request, *args, **kwargs): - """ - Reindex this package instance - """ + """Reindex this package instance""" package = self.get_object() package.reindex() - data = { - 'status': f'{package.package_url} has been queued for reindexing' - } + data = {"status": f"{package.package_url} has been queued for reindexing"} return Response(data) class PackageUpdateSet(viewsets.ViewSet): - """ Take a list of `purls` (where each item is a dictionary containing PURL and content_type). 
@@ -548,17 +530,18 @@ class PackageUpdateSet(viewsets.ViewSet): """ def create(self, request): - res = [] serializer = UpdatePackagesSerializer(data=request.data) if not serializer.is_valid(): - return Response({'errors': serializer.errors}, status=status.HTTP_400_BAD_REQUEST) + return Response( + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST + ) validated_data = serializer.validated_data - packages = validated_data.get('purls', []) - uuid = validated_data.get('uuid', None) + packages = validated_data.get("purls", []) + uuid = validated_data.get("uuid", None) package_set = None if uuid: @@ -568,35 +551,32 @@ def create(self, request): if package_set: package_set = package_set - except: - message = { - 'update_status': f'No Package Set found for {uuid}' - } + except Exception: + message = {"update_status": f"No Package Set found for {uuid}"} return Response(message, status=status.HTTP_400_BAD_REQUEST) for items in packages or []: - res_data = {} - purl = items.get('purl') + purl = items.get("purl") - res_data['purl'] = purl - content_type = items.get('content_type') + res_data["purl"] = purl + content_type = items.get("content_type") content_type_val = PackageContentType.__getitem__(content_type) lookups = purl_to_lookups(purl) filtered_packages = Package.objects.filter(**lookups) - res_data['update_status'] = "Already Exists" + res_data["update_status"] = "Already Exists" if not filtered_packages: if package_set is None: package_set = PackageSet.objects.create() - lookups['package_content'] = content_type_val - lookups['download_url'] = " " + lookups["package_content"] = content_type_val + lookups["download_url"] = " " cr = Package.objects.create(**lookups) package_set.add_to_package_set(cr) - res_data['update_status'] = "Updated" + res_data["update_status"] = "Updated" res.append(res_data) @@ -606,53 +586,51 @@ def create(self, request): UPDATEABLE_FIELDS = [ - 'primary_language', - 'copyright', - - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', + "primary_language", + "copyright", + "declared_license_expression", + "declared_license_expression_spdx", + "license_detections", + "other_license_expression", + "other_license_expression_spdx", + "other_license_detections", # TODO: update extracted license statement and other fields together # all license fields are based off of `extracted_license_statement` and should be treated as a unit # hold off for now - 'extracted_license_statement', - - 'notice_text', - 'api_data_url', - 'bug_tracking_url', - 'code_view_url', - 'vcs_url', - 'source_packages', - 'repository_homepage_url', - 'dependencies', - 'parties', - 'homepage_url', - 'description', + "extracted_license_statement", + "notice_text", + "api_data_url", + "bug_tracking_url", + "code_view_url", + "vcs_url", + "source_packages", + "repository_homepage_url", + "dependencies", + "parties", + "homepage_url", + "description", ] NONUPDATEABLE_FIELDS = [ - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'purl', - 'datasource_id', - 'download_url', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'package_uid', - 'repository_download_url', - 'file_references', - 'history', - 'last_modified_date', + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "purl", + "datasource_id", + "download_url", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "package_uid", + 
"repository_download_url", + "file_references", + "history", + "last_modified_date", ] @@ -675,7 +653,10 @@ def get_enhanced_package(package): # and we can't enhance a package that is not in a package set. return package.to_dict() - elif package_content in [PackageContentType.BINARY, PackageContentType.SOURCE_ARCHIVE]: + elif package_content in [ + PackageContentType.BINARY, + PackageContentType.SOURCE_ARCHIVE, + ]: # Binary packages can only be part of one set # TODO: Can source_archive packages be part of multiple sets? first_package_in_set = package.package_sets.first() @@ -702,14 +683,12 @@ def _get_enhanced_package(package, packages): # always default to PackageContentType.BINARY as we can have None/NULL in the model for now # Reference: https://github.com/aboutcode-org/purldb/issues/490 - package_content = ( - package and package.package_content) or PackageContentType.BINARY + package_content = (package and package.package_content) or PackageContentType.BINARY for peer in packages: # always default to PackageContentType.BINARY as we can have None/NULL in the model for now # Reference: https://github.com/aboutcode-org/purldb/issues/490 - peer_content = ( - peer and peer.package_content) or PackageContentType.BINARY + peer_content = (peer and peer.package_content) or PackageContentType.BINARY if peer_content >= package_content: # We do not want to mix data with peers of the same package content @@ -720,25 +699,24 @@ def _get_enhanced_package(package, packages): package_value = package_data.get(field) peer_value = getattr(peer, field) if not package_value and peer_value: - if field == 'parties': + if field == "parties": peer_value = PartySerializer(peer_value, many=True).data - if field == 'dependencies': - peer_value = DependentPackageSerializer( - peer_value, many=True).data + if field == "dependencies": + peer_value = DependentPackageSerializer(peer_value, many=True).data package_data[field] = peer_value enhanced = True if enhanced: - extra_data = package_data.get('extra_data', {}) - enhanced_by = extra_data.get('enhanced_by', []) + extra_data = package_data.get("extra_data", {}) + enhanced_by = extra_data.get("enhanced_by", []) enhanced_by.append(peer.purl) - extra_data['enhanced_by'] = enhanced_by - package_data['extra_data'] = extra_data + extra_data["enhanced_by"] = enhanced_by + package_data["extra_data"] = extra_data return package_data class PackageSetViewSet(viewsets.ReadOnlyModelViewSet): - queryset = PackageSet.objects.prefetch_related('packages') + queryset = PackageSet.objects.prefetch_related("packages") serializer_class = PackageSetAPISerializer @@ -748,16 +726,17 @@ class PackageWatchViewSet(CreateListRetrieveUpdateViewSetMixin): Add the new package version to the scan queue. Default watch interval is 7 days. 
""" - queryset = PackageWatch.objects.get_queryset().order_by('-id') + + queryset = PackageWatch.objects.get_queryset().order_by("-id") serializer_class = PackageWatchAPISerializer - lookup_field = 'package_url' - lookup_value_regex = r'pkg:[a-zA-Z0-9_]+\/[a-zA-Z0-9_.-]+(?:\/[a-zA-Z0-9_.-]+)*' - http_method_names = ['get', 'post', 'patch'] + lookup_field = "package_url" + lookup_value_regex = r"pkg:[a-zA-Z0-9_]+\/[a-zA-Z0-9_.-]+(?:\/[a-zA-Z0-9_.-]+)*" + http_method_names = ["get", "post", "patch"] def get_serializer_class(self): - if self.action == 'create': + if self.action == "create": return PackageWatchCreateSerializer - elif self.request.method == 'PATCH': + elif self.request.method == "PATCH": return PackageWatchUpdateSerializer return super().get_serializer_class() @@ -801,20 +780,23 @@ class CollectViewSet(viewsets.ViewSet): **Note:** See `Index packages` for bulk indexing/reindexing of packages. """ + serializer_class = CollectPackageSerializer @extend_schema( parameters=[ - OpenApiParameter('purl', str, 'query', - description='PackageURL', required=True), - OpenApiParameter('source_purl', str, 'query', - description='Source PackageURL'), - + OpenApiParameter( + "purl", str, "query", description="PackageURL", required=True + ), + OpenApiParameter( + "source_purl", str, "query", description="Source PackageURL" + ), # There is no OpenApiTypes.LIST https://github.com/tfranzel/drf-spectacular/issues/341 OpenApiParameter( - 'addon_pipelines', + "addon_pipelines", build_array_type(build_basic_type(OpenApiTypes.STR)), - 'query', description='Addon pipelines', + "query", + description="Addon pipelines", ), ], responses={200: PackageAPISerializer()}, @@ -823,23 +805,25 @@ def list(self, request, format=None): serializer = self.serializer_class(data=request.query_params) if not serializer.is_valid(): return Response( - {'errors': serializer.errors}, + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST, ) validated_data = serializer.validated_data - purl = validated_data.get('purl') - sort = validated_data.get('sort') or ['-version',] + purl = validated_data.get("purl") + sort = validated_data.get("sort") or [ + "-version", + ] kwargs = dict() # We want this request to have high priority since the user knows the # exact package they want - kwargs['priority'] = 100 + kwargs["priority"] = 100 - if source_purl := validated_data.get('source_purl', None): + if source_purl := validated_data.get("source_purl", None): kwargs["source_purl"] = source_purl - if addon_pipelines := validated_data.get('addon_pipelines', []): + if addon_pipelines := validated_data.get("addon_pipelines", []): kwargs["addon_pipelines"] = addon_pipelines lookups = purl_to_lookups(purl) @@ -849,7 +833,7 @@ def list(self, request, format=None): errors = priority_router.process(purl, **kwargs) except NoRouteAvailable: message = { - 'status': f'cannot fetch Package data for {purl}: no available handler' + "status": f"cannot fetch Package data for {purl}: no available handler" } return Response(message, status=status.HTTP_400_BAD_REQUEST) @@ -859,7 +843,7 @@ def list(self, request, format=None): message = {} if errors: message = { - 'status': f'error(s) occurred when fetching metadata for {purl}: {errors}' + "status": f"error(s) occurred when fetching metadata for {purl}: {errors}" } return Response(message, status=status.HTTP_400_BAD_REQUEST) @@ -867,7 +851,8 @@ def list(self, request, format=None): get_source_package_and_add_to_package_set(package) serializer = PackageAPISerializer( - packages, many=True, 
context={'request': request}) + packages, many=True, context={"request": request} + ) return Response(serializer.data) @extend_schema( @@ -876,7 +861,7 @@ def list(self, request, format=None): 200: IndexPackagesResponseSerializer(), }, ) - @action(detail=False, methods=['post'], serializer_class=IndexPackagesSerializer) + @action(detail=False, methods=["post"], serializer_class=IndexPackagesSerializer) def index_packages(self, request, *args, **kwargs): """ Collect and index a JSON array of `packages` objects with PURLs to process. @@ -976,12 +961,14 @@ def _reindex_package(package, reindexed_packages, **kwargs): serializer = self.serializer_class(data=request.data) if not serializer.is_valid(): - return Response({'errors': serializer.errors}, status=status.HTTP_400_BAD_REQUEST) + return Response( + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST + ) validated_data = serializer.validated_data - packages = validated_data.get('packages', []) - reindex = validated_data.get('reindex', False) - reindex_set = validated_data.get('reindex_set', False) + packages = validated_data.get("packages", []) + reindex = validated_data.get("reindex", False) + reindex_set = validated_data.get("reindex_set", False) queued_packages = [] unqueued_packages = [] @@ -990,19 +977,31 @@ def _reindex_package(package, reindexed_packages, **kwargs): reindexed_packages = [] requeued_packages = [] - supported_ecosystems = ['maven', 'npm', 'deb', - 'generic', 'gnu', 'openssl', 'github', 'conan'] + supported_ecosystems = [ + "maven", + "npm", + "deb", + "generic", + "gnu", + "openssl", + "github", + "conan", + ] unique_packages, unsupported_packages, unsupported_vers = get_resolved_packages( - packages, supported_ecosystems) + packages, supported_ecosystems + ) if reindex: for package in unique_packages: - purl = package['purl'] + purl = package["purl"] kwargs = dict() - if addon_pipelines := package.get('addon_pipelines'): + if addon_pipelines := package.get("addon_pipelines"): kwargs["addon_pipelines"] = [ - pipe for pipe in addon_pipelines if is_supported_addon_pipeline(pipe)] + pipe + for pipe in addon_pipelines + if is_supported_addon_pipeline(pipe) + ] lookups = purl_to_lookups(purl) packages = Package.objects.filter(**lookups) if packages.count() > 0: @@ -1012,62 +1011,68 @@ def _reindex_package(package, reindexed_packages, **kwargs): if reindex_set: for package_set in package.package_sets.all(): for p in package_set.packages.all(): - _reindex_package( - p, reindexed_packages, **kwargs) + _reindex_package(p, reindexed_packages, **kwargs) else: nonexistent_packages.append(package) - requeued_packages.extend( - [p.package_url for p in reindexed_packages]) + requeued_packages.extend([p.package_url for p in reindexed_packages]) if not reindex or nonexistent_packages: - interesting_packages = nonexistent_packages if nonexistent_packages else unique_packages + interesting_packages = ( + nonexistent_packages if nonexistent_packages else unique_packages + ) for package in interesting_packages: - purl = package['purl'] + purl = package["purl"] is_routable_purl = priority_router.is_routable(purl) if not is_routable_purl: unsupported_packages.append(purl) else: # add to queue extra_fields = dict() - if source_purl := package.get('source_purl'): - extra_fields['source_uri'] = source_purl - if addon_pipelines := package.get('addon_pipelines'): - extra_fields['addon_pipelines'] = [ - pipe for pipe in addon_pipelines if is_supported_addon_pipeline(pipe)] - if priority := package.get('priority'): - 
extra_fields['priority'] = priority + if source_purl := package.get("source_purl"): + extra_fields["source_uri"] = source_purl + if addon_pipelines := package.get("addon_pipelines"): + extra_fields["addon_pipelines"] = [ + pipe + for pipe in addon_pipelines + if is_supported_addon_pipeline(pipe) + ] + if priority := package.get("priority"): + extra_fields["priority"] = priority priority_resource_uri = PriorityResourceURI.objects.insert( - purl, **extra_fields) + purl, **extra_fields + ) if priority_resource_uri: queued_packages.append(purl) else: unqueued_packages.append(purl) response_data = { - 'queued_packages_count': len(queued_packages), - 'queued_packages': queued_packages, - 'requeued_packages_count': len(requeued_packages), - 'requeued_packages': requeued_packages, - 'unqueued_packages_count': len(unqueued_packages), - 'unqueued_packages': unqueued_packages, - 'unsupported_packages_count': len(unsupported_packages), - 'unsupported_packages': unsupported_packages, - 'unsupported_vers_count': len(unsupported_vers), - 'unsupported_vers': unsupported_vers, + "queued_packages_count": len(queued_packages), + "queued_packages": queued_packages, + "requeued_packages_count": len(requeued_packages), + "requeued_packages": requeued_packages, + "unqueued_packages_count": len(unqueued_packages), + "unqueued_packages": unqueued_packages, + "unsupported_packages_count": len(unsupported_packages), + "unsupported_packages": unsupported_packages, + "unsupported_vers_count": len(unsupported_vers), + "unsupported_vers": unsupported_vers, } serializer = IndexPackagesResponseSerializer( - response_data, context={'request': request}) + response_data, context={"request": request} + ) return Response(serializer.data) @extend_schema( parameters=[ - OpenApiParameter('purl', str, 'query', - description='PackageURL', required=True), + OpenApiParameter( + "purl", str, "query", description="PackageURL", required=True + ), ], responses={200: PackageAPISerializer()}, ) - @action(detail=False, methods=['get'], serializer_class=CollectPackageSerializer) + @action(detail=False, methods=["get"], serializer_class=CollectPackageSerializer) def reindex_metadata(self, request, *args, **kwargs): """ Collect or recollect the package metadata of a ``PURL`` string. 
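# A hypothetical invocation of this action; only `purl` is required, and the
# route prefix is an assumption based on the CollectViewSet action name above.
import requests

resp = requests.get(
    "https://<host>/api/collect/reindex_metadata/",
    params={"purl": "pkg:maven/org.apache.commons/commons-lang3@3.12.0"},
    timeout=60,
)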
@@ -1088,18 +1093,18 @@ def reindex_metadata(self, request, *args, **kwargs): serializer = self.serializer_class(data=request.query_params) if not serializer.is_valid(): return Response( - {'errors': serializer.errors}, + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST, ) validated_data = serializer.validated_data - purl = validated_data.get('purl') + purl = validated_data.get("purl") lookups = purl_to_lookups(purl) packages = Package.objects.filter(**lookups) if packages.count() == 0: return Response( - {'status': f'Not recollecting: Package does not exist for {purl}'}, + {"status": f"Not recollecting: Package does not exist for {purl}"}, status=status.HTTP_400_BAD_REQUEST, ) @@ -1110,7 +1115,7 @@ def reindex_metadata(self, request, *args, **kwargs): errors = priority_router.process(purl, **kwargs) except NoRouteAvailable: message = { - 'status': f'cannot fetch Package data for {purl}: no available handler' + "status": f"cannot fetch Package data for {purl}: no available handler" } return Response(message, status=status.HTTP_400_BAD_REQUEST) @@ -1120,12 +1125,13 @@ def reindex_metadata(self, request, *args, **kwargs): message = {} if errors: message = { - 'status': f'error(s) occurred when fetching metadata for {purl}: {errors}' + "status": f"error(s) occurred when fetching metadata for {purl}: {errors}" } return Response(message, status=status.HTTP_400_BAD_REQUEST) serializer = PackageAPISerializer( - packages, many=True, context={'request': request}) + packages, many=True, context={"request": request} + ) return Response(serializer.data) @@ -1149,16 +1155,22 @@ class PurlValidateViewSet(viewsets.ViewSet): - exists - True, if input PURL exists in real world and `check_existence` flag is enabled. """ + serializer_class = PurlValidateSerializer def get_view_name(self): - return 'Validate PURL' + return "Validate PURL" @extend_schema( parameters=[ - OpenApiParameter('purl', str, 'query', description='PackageURL'), - OpenApiParameter('check_existence', bool, 'query', - description='Check existence', default=False), + OpenApiParameter("purl", str, "query", description="PackageURL"), + OpenApiParameter( + "check_existence", + bool, + "query", + description="Check existence", + default=False, + ), ], responses={200: PurlValidateResponseSerializer()}, ) @@ -1166,47 +1178,46 @@ def list(self, request): serializer = self.serializer_class(data=request.query_params) if not serializer.is_valid(): - return Response({'errors': serializer.errors}, status=status.HTTP_400_BAD_REQUEST) + return Response( + {"errors": serializer.errors}, status=status.HTTP_400_BAD_REQUEST + ) validated_data = serializer.validated_data - purl = validated_data.get('purl') - check_existence = validated_data.get('check_existence', False) + purl = validated_data.get("purl") + check_existence = validated_data.get("check_existence", False) message_valid = "The provided PackageURL is valid." message_not_valid = "The provided PackageURL is not valid." - message_valid_and_exists = ( - "The provided Package URL is valid, and the package exists in the upstream repo." - ) + message_valid_and_exists = "The provided Package URL is valid, and the package exists in the upstream repo." message_valid_but_does_not_exist = ( "The provided PackageURL is valid, but does not exist in the upstream repo." ) - message_valid_but_package_type_not_supported = ( - "The provided PackageURL is valid, but `check_existence` is not supported for this package type." 
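# A hypothetical query against PurlValidateViewSet above; the route prefix is
# an assumption. The response carries the purl/valid/exists/message fields
# assembled below.
import requests

resp = requests.get(
    "https://<host>/api/validate/",
    params={"purl": "pkg:pypi/requests@2.31.0", "check_existence": True},
    timeout=60,
)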
- ) + message_valid_but_package_type_not_supported = "The provided PackageURL is valid, but `check_existence` is not supported for this package type." response = {} - response['exists'] = None - response['purl'] = purl - response['valid'] = False - response['message'] = message_not_valid + response["exists"] = None + response["purl"] = purl + response["valid"] = False + response["message"] = message_not_valid # validate purl try: package_url = PackageURL.from_string(purl) except ValueError: serializer = PurlValidateResponseSerializer( - response, context={'request': request}) + response, context={"request": request} + ) return Response(serializer.data, status=status.HTTP_400_BAD_REQUEST) - response['valid'] = True + response["valid"] = True response["message"] = message_valid unsupported_ecosystem = False if check_existence: - response['exists'] = False + response["exists"] = False lookups = purl_to_lookups(purl) packages = Package.objects.filter(**lookups) if packages.exists(): - response['exists'] = True + response["exists"] = True else: versionless_purl = PackageURL( type=package_url.type, @@ -1218,25 +1229,26 @@ def list(self, request): and package_url.type in VERSION_CLASS_BY_PACKAGE_TYPE ): all_versions = get_all_versions_plain(versionless_purl) - if all_versions and (not package_url.version or ( - package_url.version in all_versions) + if all_versions and ( + not package_url.version or (package_url.version in all_versions) ): # True, if requested purl has no version and any version of package exists upstream. # True, if requested purl.version exists upstream. - response['exists'] = True + response["exists"] = True else: unsupported_ecosystem = True - if response['exists']: + if response["exists"]: response["message"] = message_valid_and_exists elif unsupported_ecosystem: - response['exists'] = None + response["exists"] = None response["message"] = message_valid_but_package_type_not_supported else: response["message"] = message_valid_but_does_not_exist serializer = PurlValidateResponseSerializer( - response, context={'request': request}) + response, context={"request": request} + ) return Response(serializer.data) @@ -1251,8 +1263,8 @@ def get_resolved_packages(packages, supported_ecosystems): unsupported_vers = set() for package in packages or []: - purl = package.get('purl') - vers = package.get('vers') + purl = package.get("purl") + vers = package.get("vers") if not purl: continue @@ -1269,7 +1281,7 @@ def get_resolved_packages(packages, supported_ecosystems): if parsed_purl.version: # We prioritize Package requests that have explicit versions - package['priority'] = 100 + package["priority"] = 100 resolved_packages_by_purl[purl] = package continue @@ -1277,24 +1289,26 @@ def get_resolved_packages(packages, supported_ecosystems): if not vers and not parsed_purl.version: if resolved_purls := resolve_all_versions(parsed_purl): for res_purl in resolved_purls: - resolved_packages_by_purl[res_purl] = {'purl': res_purl} + resolved_packages_by_purl[res_purl] = {"purl": res_purl} continue if resolved_purls := resolve_versions(parsed_purl, vers): for res_purl in resolved_purls: - resolved_packages_by_purl[res_purl] = {'purl': res_purl} + resolved_packages_by_purl[res_purl] = {"purl": res_purl} else: unsupported_vers.add(vers) unique_resolved_packages = resolved_packages_by_purl.values() - return list(unique_resolved_packages), list(unsupported_purls), list(unsupported_vers) + return ( + list(unique_resolved_packages), + list(unsupported_purls), + list(unsupported_vers), + ) def 
resolve_all_versions(parsed_purl): - """ - Take versionless and return a list of PURLs for all the released versions. - """ + """Take versionless and return a list of PURLs for all the released versions.""" all_versions = get_all_versions(parsed_purl) or [] return [ @@ -1341,16 +1355,15 @@ def resolve_versions(parsed_purl, vers): result.append(str(package_url)) except InvalidConstraintsError: logger.warning( - f"Invalid constraints sequence in '{vers}' for '{parsed_purl}'") + f"Invalid constraints sequence in '{vers}' for '{parsed_purl}'" + ) return return result def get_all_versions_plain(purl: PackageURL): - """ - Return all the versions available for the given purls. - """ + """Return all the versions available for the given purls.""" if ( purl.type not in VERSION_API_CLASSES_BY_PACKAGE_TYPE or purl.type not in VERSION_CLASS_BY_PACKAGE_TYPE @@ -1387,4 +1400,6 @@ def get_all_versions(purl): VERSION_CLASS_BY_PACKAGE_TYPE = { - pkg_type: range_class.version_class for pkg_type, range_class in RANGE_CLASS_BY_SCHEMES.items()} + pkg_type: range_class.version_class + for pkg_type, range_class in RANGE_CLASS_BY_SCHEMES.items() +} diff --git a/packagedb/api_custom.py b/packagedb/api_custom.py index e793fe8a..6a068788 100644 --- a/packagedb/api_custom.py +++ b/packagedb/api_custom.py @@ -17,6 +17,7 @@ class PageSizePagination(PageNumberPagination): For example: http://api.example.org/accounts/?page=4&page_size=20 """ + page_size = 20 max_page_size = 20 - page_size_query_param = 'page_size' + page_size_query_param = "page_size" diff --git a/packagedb/filters.py b/packagedb/filters.py index f6974764..11a4ff2c 100644 --- a/packagedb/filters.py +++ b/packagedb/filters.py @@ -9,10 +9,11 @@ import shlex -import django_filters from django.core.exceptions import FieldError from django.db.models import Q +import django_filters + # The function and Classes in this file are from https://github.com/aboutcode-org/scancode.io/blob/main/scanpipe/filters.py @@ -49,8 +50,7 @@ def parse_query_string_to_lookups(query_string, default_lookup_expr, default_fie search_value = term field_name = default_field - lookups &= Q( - **{f"{field_name}__{lookup_expr}": search_value}, _negated=negated) + lookups &= Q(**{f"{field_name}__{lookup_expr}": search_value}, _negated=negated) return lookups @@ -85,8 +85,7 @@ def filter(self, qs, value): if "://" not in value and ":" in value: return super().filter(qs, value) - search_fields = ["type", "namespace", - "name", "version", "download_url"] + search_fields = ["type", "namespace", "name", "version", "download_url"] lookups = Q() for field_names in search_fields: lookups |= Q(**{f"{field_names}__{self.lookup_expr}": value}) diff --git a/packagedb/from_purl.py b/packagedb/from_purl.py index b6102a9c..e0adc42a 100644 --- a/packagedb/from_purl.py +++ b/packagedb/from_purl.py @@ -14,23 +14,20 @@ from rest_framework import viewsets from rest_framework.response import Response -from purl2vcs.find_source_repo import get_package_object_from_purl -from purl2vcs.find_source_repo import get_source_repo from packagedb.serializers import PurltoGitRepoResponseSerializer from packagedb.serializers import PurltoGitRepoSerializer +from purl2vcs.find_source_repo import get_package_object_from_purl +from purl2vcs.find_source_repo import get_source_repo @extend_schema( parameters=[ - OpenApiParameter("package_url", str, "query", - description="package url"), + OpenApiParameter("package_url", str, "query", description="package url"), ], responses={200: PurltoGitRepoResponseSerializer()}, ) class 
FromPurlToGitRepoViewSet(viewsets.ViewSet): - """ - Return a ``git_repo`` from a standard PackageURL. - """ + """Return a ``git_repo`` from a standard PackageURL.""" serializer_class = PurltoGitRepoSerializer diff --git a/packagedb/management/commands/create_source_repo_packages.py b/packagedb/management/commands/create_source_repo_packages.py index 1d7d461c..2d32653c 100644 --- a/packagedb/management/commands/create_source_repo_packages.py +++ b/packagedb/management/commands/create_source_repo_packages.py @@ -11,14 +11,13 @@ import sys import openpyxl -from packageurl.contrib.django.utils import purl_to_lookups from minecode.management.commands import VerboseCommand from minecode.model_utils import add_package_to_scan_queue -from purl2vcs.find_source_repo import add_source_package_to_package_set -from purl2vcs.find_source_repo import get_package_object_from_purl from packagedb.models import Package from packagedb.models import PackageContentType +from purl2vcs.find_source_repo import add_source_package_to_package_set +from purl2vcs.find_source_repo import get_package_object_from_purl TRACE = False @@ -75,7 +74,6 @@ def handle(self, *args, **options): for row in rows: # Look up the package the row is for by using the purl to query the db. purl = row["purl"] - source_purl = row["source_purl"] print(f"Processing packages for: {purl}") package = get_package_object_from_purl(package_url=purl) if not package: diff --git a/packagedb/management/commands/fix_purl_values.py b/packagedb/management/commands/fix_purl_values.py index 61c6587b..da0866be 100644 --- a/packagedb/management/commands/fix_purl_values.py +++ b/packagedb/management/commands/fix_purl_values.py @@ -7,22 +7,22 @@ # See https://aboutcode.org for more information about nexB OSS projects. # -from dateutil.parser import parse as dateutil_parse -import copy import logging import sys -from urllib3.util import Retry +import requests +from dateutil.parser import parse as dateutil_parse +from packagedcode.maven import build_filename +from packagedcode.maven import get_urls from packageurl import PackageURL -from packagedcode.maven import get_urls, build_filename from requests import Session from requests.adapters import HTTPAdapter -import requests +from urllib3.util import Retry +from minecode.collectors.maven import collect_links_from_text +from minecode.collectors.maven import filter_for_artifacts from minecode.management.commands import VerboseCommand from minecode.utils import MemorySavingQuerysetIterator -from minecode.visitors.maven import collect_links_from_text -from minecode.visitors.maven import filter_for_artifacts from packagedb.models import Package DEFAULT_TIMEOUT = 30 @@ -34,9 +34,9 @@ logger.setLevel(logging.INFO) session = Session() -session.mount('https://', HTTPAdapter(max_retries=Retry(10))) +session.mount("https://", HTTPAdapter(max_retries=Retry(10))) headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36', + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36", } @@ -45,16 +45,18 @@ def get_timestamps_by_links(package_version_page_url): response = requests.get(package_version_page_url) if response: timestamps_by_links = collect_links_from_text( - response.text, filter=filter_for_artifacts) + response.text, filter=filter_for_artifacts + ) timestamps_by_links = { - link: dateutil_parse(timestamp) for link, timestamp in timestamps_by_links.items() + link: 
dateutil_parse(timestamp) + for link, timestamp in timestamps_by_links.items() } return timestamps_by_links -class MavenArtifact(object): - def __init__(self, namespace, name, version, qualifiers='', ec=[]): - type = 'maven' +class MavenArtifact: + def __init__(self, namespace, name, version, qualifiers="", ec=[]): + type = "maven" self.type = type self.namespace = namespace self.name = name @@ -65,7 +67,7 @@ def __init__(self, namespace, name, version, qualifiers='', ec=[]): namespace=namespace, name=name, version=version, - qualifiers=qualifiers + qualifiers=qualifiers, ) urls = get_urls( namespace=namespace, @@ -73,19 +75,18 @@ def __init__(self, namespace, name, version, qualifiers='', ec=[]): version=version, qualifiers=self.package_url.qualifiers, ) - self.download_url = urls['repository_download_url'] - self.repository_homepage_url = urls['repository_homepage_url'] - self.api_data_url = urls['api_data_url'] + self.download_url = urls["repository_download_url"] + self.repository_homepage_url = urls["repository_homepage_url"] + self.api_data_url = urls["api_data_url"] qualifiers_mapping = self.package_url.qualifiers filename = build_filename( artifact_id=name, version=version, - extension=qualifiers_mapping.get('type') or 'jar', - classifier=qualifiers_mapping.get('classifier'), + extension=qualifiers_mapping.get("type") or "jar", + classifier=qualifiers_mapping.get("classifier"), ) - timestamps_by_links = get_timestamps_by_links( - self.repository_homepage_url) + timestamps_by_links = get_timestamps_by_links(self.repository_homepage_url) self.release_date = timestamps_by_links.get(filename) self.related_artifacts = list( self._populate_related_artifacts( @@ -98,14 +99,14 @@ def __init__(self, namespace, name, version, qualifiers='', ec=[]): @classmethod def _populate_related_artifacts(cls, namespace, name, version, ec): - filtered_ec = [entry for entry in ec if not entry.startswith('.')] + filtered_ec = [entry for entry in ec if not entry.startswith(".")] for entry in filtered_ec: - _, ending = entry.split('-') - split_ending = ending.split('.') + _, ending = entry.split("-") + split_ending = ending.split(".") classifier = None if len(split_ending) > 0: classifier = split_ending[0] - qualifiers = f'classifier={classifier}' + qualifiers = f"classifier={classifier}" yield cls( namespace=namespace, name=name, @@ -115,7 +116,7 @@ def _populate_related_artifacts(cls, namespace, name, version, ec): def query_sha1_on_maven(sha1, timeout=DEFAULT_TIMEOUT): - maven_api_search_url = f'https://search.maven.org/solrsearch/select?q=1:{sha1}' + maven_api_search_url = f"https://search.maven.org/solrsearch/select?q=1:{sha1}" try: response = session.get(maven_api_search_url, timeout=timeout) response.raise_for_status() @@ -125,14 +126,14 @@ def query_sha1_on_maven(sha1, timeout=DEFAULT_TIMEOUT): if not response.ok: return f"API query failed for: {maven_api_search_url}" contents = response.json() - resp = contents.get('response', {}) + resp = contents.get("response", {}) matched_artifacts = [] - if resp.get('numFound', 0) > 0: - for matched_artifact in resp.get('docs', []): - namespace = matched_artifact.get('g', '') - name = matched_artifact.get('a', '') - version = matched_artifact.get('v', '') - ec = matched_artifact.get('ec', []) + if resp.get("numFound", 0) > 0: + for matched_artifact in resp.get("docs", []): + namespace = matched_artifact.get("g", "") + name = matched_artifact.get("a", "") + version = matched_artifact.get("v", "") + ec = matched_artifact.get("ec", []) if not namespace and name 
and version: continue matched_artifacts.append( @@ -147,14 +148,14 @@ def query_sha1_on_maven(sha1, timeout=DEFAULT_TIMEOUT): class Command(VerboseCommand): - help = 'Update maven Package download_url values' + help = "Update maven Package download_url values" def handle(self, *args, **options): - maven_packages = Package.objects.filter( - type='maven', sha1__is_null=False) + maven_packages = Package.objects.filter(type="maven", sha1__isnull=False) maven_packages_count = maven_packages.count() logger.info( - f'Checking {maven_packages_count:,} Maven Package PackageURL values') + f"Checking {maven_packages_count:,} Maven Package PackageURL values" + ) packages_to_delete = [] for package in MemorySavingQuerysetIterator(maven_packages): @@ -197,8 +198,12 @@ def handle(self, *args, **options): package_different_case.qualifiers = artifact_qualifiers package_different_case.download_url = artifact.download_url package_different_case.release_date = artifact.release_date - package_different_case.repository_homepage_url = artifact.repository_homepage_url - package_different_case.repository_download_url = artifact.repository_download_url + package_different_case.repository_homepage_url = ( + artifact.repository_homepage_url + ) + package_different_case.repository_download_url = ( + artifact.repository_download_url + ) package_different_case.api_data_url = artifact.api_data_url package_different_case.sha1 = package.sha1 package_different_case.save() diff --git a/packagedb/management/commands/run_scheduler.py b/packagedb/management/commands/run_scheduler.py index b70f6e94..e8f065f1 100644 --- a/packagedb/management/commands/run_scheduler.py +++ b/packagedb/management/commands/run_scheduler.py @@ -8,6 +8,7 @@ # from django_rq.management.commands import rqscheduler + from packagedb.models import PackageWatch from packagedb.schedules import clear_zombie_watch_schedules from packagedb.schedules import scheduled_job_exists @@ -28,4 +29,4 @@ class Command(rqscheduler.Command): def handle(self, *args, **kwargs): clear_zombie_watch_schedules() init_watch_scheduled() - super(Command, self).handle(*args, **kwargs) + super().handle(*args, **kwargs) diff --git a/packagedb/management/commands/watch_packages.py b/packagedb/management/commands/watch_packages.py index f6297f4e..4c379598 100644 --- a/packagedb/management/commands/watch_packages.py +++ b/packagedb/management/commands/watch_packages.py @@ -7,8 +7,9 @@ # See https://aboutcode.org for more information about nexB OSS projects.
# -from commoncode import cliutils from django.core.management.base import BaseCommand + +from commoncode import cliutils from fetchcode.package_versions import SUPPORTED_ECOSYSTEMS from packageurl import PackageURL from univers.version_range import RANGE_CLASS_BY_SCHEMES @@ -36,8 +37,7 @@ def handle(self, *args, **options): purl_value = options.get("purl") packages_qs = ( - Package.objects.filter( - type__in=PRIORITY_QUEUE_SUPPORTED_ECOSYSTEMS) + Package.objects.filter(type__in=PRIORITY_QUEUE_SUPPORTED_ECOSYSTEMS) .filter(type__in=SUPPORTED_ECOSYSTEMS) .distinct("type", "namespace", "name") .order_by("type", "namespace", "name") diff --git a/packagedb/models.py b/packagedb/models.py index 4ed64760..9b35266e 100644 --- a/packagedb/models.py +++ b/packagedb/models.py @@ -13,8 +13,6 @@ import uuid from collections import OrderedDict -import natsort -from dateutil.parser import parse as dateutil_parse from django.conf import settings from django.contrib.auth.models import UserManager from django.contrib.postgres.fields import ArrayField @@ -27,14 +25,17 @@ from django.dispatch import receiver from django.utils import timezone from django.utils.translation import gettext_lazy as _ -from rest_framework.authtoken.models import Token +import natsort +from dateutil.parser import parse as dateutil_parse from licensedcode.cache import build_spdx_license_expression -from packagedb import schedules from packagedcode.models import normalize_qualifiers from packageurl import PackageURL from packageurl.contrib.django.models import PackageURLMixin from packageurl.contrib.django.models import PackageURLQuerySetMixin +from rest_framework.authtoken.models import Token + +from packagedb import schedules TRACE = False @@ -44,10 +45,8 @@ def sort_version(packages): - """ - Return the packages sorted by version. - """ - return natsort.natsorted(packages, key=lambda p: p.version.replace('.', '~')+'z') + """Return the packages sorted by version.""" + return natsort.natsorted(packages, key=lambda p: p.version.replace(".", "~") + "z") class PackageQuerySet(PackageURLQuerySetMixin, models.QuerySet): @@ -57,14 +56,13 @@ def insert(self, download_url, **extra_fields): Return None if the insertion failed when an identical entry already exist. """ package, created = self.get_or_create( - download_url=download_url, defaults=extra_fields) + download_url=download_url, defaults=extra_fields + ) if created: return package def get_or_none(self, *args, **kwargs): - """ - Return the object matching the given lookup parameters, or None if no match exists. 
- """ + """Return the object matching the given lookup parameters, or None if no match exists.""" try: return self.get(*args, **kwargs) except Package.DoesNotExist: @@ -81,22 +79,21 @@ def paginated(self, per_page=5000): paginator = Paginator(self, per_page=per_page) for page_number in paginator.page_range: page = paginator.page(page_number) - for object in page.object_list: - yield object + yield from page.object_list VCS_CHOICES = [ - ('git', 'git'), - ('svn', 'subversion'), - ('hg', 'mercurial'), - ('bzr', 'bazaar'), - ('cvs', 'cvs'), + ("git", "git"), + ("svn", "subversion"), + ("hg", "mercurial"), + ("bzr", "bazaar"), + ("cvs", "cvs"), ] class LowerCaseField(models.CharField): def __init__(self, *args, **kwargs): - super(LowerCaseField, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) def to_python(self, value): return str(value).lower() @@ -108,12 +105,13 @@ class HistoryMixin(models.Model): is a list containing mappings representing the history for this object. Each mapping contains the field "timestamp" and "message". """ + history = models.JSONField( default=list, blank=True, editable=False, help_text=_( - 'A list of mappings representing the history for this object. ' + "A list of mappings representing the history for this object. " 'Each mapping contains the fields "timestamp" and "message".' ), ) @@ -121,22 +119,20 @@ class HistoryMixin(models.Model): null=True, blank=True, db_index=True, - help_text=_('Timestamp set when a Package is created'), + help_text=_("Timestamp set when a Package is created"), ) last_modified_date = models.DateTimeField( null=True, blank=True, db_index=True, - help_text=_('Timestamp set when a Package is created or modified'), + help_text=_("Timestamp set when a Package is created or modified"), ) class Meta: abstract = True def append_to_history(self, message, data={}, save=False): - """ - Append the ``message`` string to the history of this object. - """ + """Append the ``message`` string to the history of this object.""" time = timezone.now() timestamp = time.strftime("%Y-%m-%d-%H:%M:%S") entry = { @@ -220,10 +216,8 @@ class ExtraDataFieldMixin(models.Model): ) def update_extra_data(self, data): - """ - Updates the `extra_data` field with the provided `data` dict. - """ - if type(data) != dict: + """Update `extra_data` field with the provided `data` dict.""" + if type(data) is not dict: raise ValueError("Argument `data` value must be a dict()") self.extra_data.update(data) @@ -354,8 +348,7 @@ class AbstractPackage(models.Model): copyright = models.TextField( blank=True, null=True, - help_text=_( - "Copyright statements for this package. Typically one per line."), + help_text=_("Copyright statements for this package. 
Typically one per line."), ) holder = models.TextField( blank=True, @@ -456,13 +449,13 @@ class PackageContentType(models.IntegerChoices): # TODO: curation is a special case, based on how the curation identity # fields matches with the current package - CURATION = 1, 'curation' - PATCH = 2, 'patch' - SOURCE_REPO = 3, 'source_repo' - SOURCE_ARCHIVE = 4, 'source_archive' - BINARY = 5, 'binary' - TEST = 6, 'test' - DOC = 7, 'doc' + CURATION = 1, "curation" + PATCH = 2, "patch" + SOURCE_REPO = 3, "source_repo" + SOURCE_ARCHIVE = 4, "source_archive" + BINARY = 5, "binary" + TEST = 6, "test" + DOC = 7, "doc" def get_class_name(obj): @@ -483,9 +476,11 @@ class Package( ) mining_level = models.PositiveIntegerField( default=0, - help_text=_('A numeric indication of the highest depth and breadth ' - 'of package data collected through previous visits. ' - 'Higher means more and deeper collection.'), + help_text=_( + "A numeric indication of the highest depth and breadth " + "of package data collected through previous visits. " + "Higher means more and deeper collection." + ), ) keywords = ArrayField( base_field=models.TextField( @@ -495,15 +490,17 @@ class Package( default=list, blank=True, null=True, - help_text=_('A list of keywords.'), + help_text=_("A list of keywords."), ) root_path = models.CharField( max_length=1024, blank=True, null=True, - help_text=_('The path to the root of the package documented in this manifest ' - 'if any, such as a Maven .pom or a npm package.json parent ' - 'directory.') + help_text=_( + "The path to the root of the package documented in this manifest " + "if any, such as a Maven .pom or a npm package.json parent " + "directory." + ), ) source_packages = ArrayField( base_field=models.TextField( @@ -513,31 +510,33 @@ class Package( default=list, blank=True, null=True, - help_text=_('A list of source package URLs (aka. "purl") for this package. ' - 'For instance an SRPM is the "source package" for a binary RPM.'), + help_text=_( + 'A list of source package URLs (aka. "purl") for this package. ' + 'For instance an SRPM is the "source package" for a binary RPM.' + ), ) last_indexed_date = models.DateTimeField( null=True, blank=True, - help_text='Timestamp set to the date of the last indexing. Used to track indexing status.' + help_text="Timestamp set to the date of the last indexing. Used to track indexing status.", ) index_error = models.TextField( null=True, blank=True, - help_text='Indexing errors messages. When present this means the indexing has failed.', + help_text="Indexing errors messages. 
When present this means the indexing has failed.", ) package_sets = models.ManyToManyField( - 'PackageSet', - related_name='packages', - help_text=_( - 'A set representing the Package sets this Package is a member of.'), + "PackageSet", + related_name="packages", + help_text=_("A set representing the Package sets this Package is a member of."), ) package_content = models.IntegerField( null=True, choices=PackageContentType.choices, help_text=_( - 'Content of this Package as one of: {}'.format( - ', '.join(PackageContentType.labels)) + "Content of this Package as one of: {}".format( + ", ".join(PackageContentType.labels) + ) ), ) summary = models.JSONField( @@ -545,7 +544,7 @@ class Package( blank=True, null=True, help_text=_( - 'A mapping containing a summary and license clarity score for this Package' + "A mapping containing a summary and license clarity score for this Package" ), ) @@ -553,37 +552,37 @@ class Package( # TODO: Think about ordering, unique together, indexes, etc. class Meta: - ordering = ['id'] + ordering = ["id"] unique_together = [ ( - 'download_url', - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath' + "download_url", + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", ) ] indexes = [ # multicolumn index for search on a whole `purl` - models.Index(fields=[ - 'type', 'namespace', 'name', 'version', 'qualifiers', 'subpath' - ]), - models.Index(fields=['type']), - models.Index(fields=['namespace']), - models.Index(fields=['name']), - models.Index(fields=['version']), - models.Index(fields=['qualifiers']), - models.Index(fields=['subpath']), - models.Index(fields=['download_url']), - models.Index(fields=['filename']), - models.Index(fields=['size']), - models.Index(fields=['release_date']), - models.Index(fields=['md5']), - models.Index(fields=['sha1']), - models.Index(fields=['sha256']), - models.Index(fields=['sha512']), + models.Index( + fields=["type", "namespace", "name", "version", "qualifiers", "subpath"] + ), + models.Index(fields=["type"]), + models.Index(fields=["namespace"]), + models.Index(fields=["name"]), + models.Index(fields=["version"]), + models.Index(fields=["qualifiers"]), + models.Index(fields=["subpath"]), + models.Index(fields=["download_url"]), + models.Index(fields=["filename"]), + models.Index(fields=["size"]), + models.Index(fields=["release_date"]), + models.Index(fields=["md5"]), + models.Index(fields=["sha1"]), + models.Index(fields=["sha256"]), + models.Index(fields=["sha512"]), ] def __str__(self): @@ -596,18 +595,17 @@ def purl(self): @property def package_uid(self): purl = PackageURL.from_string(self.package_url) - purl.qualifiers['uuid'] = str(self.uuid) + purl.qualifiers["uuid"] = str(self.uuid) return str(purl) def to_dict(self): from packagedb.serializers import PackageMetadataSerializer + package_metadata = PackageMetadataSerializer(self).data return package_metadata def get_all_versions(self): - """ - Return a list of all the versions of this Package. - """ + """Return a list of all the versions of this Package.""" manager = self.__class__.objects queryset = manager.filter( name=self.name, @@ -617,9 +615,7 @@ def get_all_versions(self): return queryset def get_latest_version(self): - """ - Return the latest version of this Package. - """ + """Return the latest version of this Package.""" sorted_versions = sort_version(self.get_all_versions()) if sorted_versions: return sorted_versions[-1] @@ -630,14 +626,15 @@ def reindex(self, **kwargs): created for this Package. 
The fingerprints and Resources associated with this Package are deleted and recreated from the updated scan data. """ - from minecode.model_utils import add_package_to_scan_queue from minecode.model_utils import DEFAULT_PIPELINES + from minecode.model_utils import add_package_to_scan_queue - addon_pipelines = kwargs.get('addon_pipelines', []) + addon_pipelines = kwargs.get("addon_pipelines", []) pipelines = DEFAULT_PIPELINES + tuple(addon_pipelines) add_package_to_scan_queue( - self, pipelines=pipelines, reindex_uri=True, priority=100) + self, pipelines=pipelines, reindex_uri=True, priority=100 + ) def update_fields(self, save=False, **values_by_fields): """ @@ -661,28 +658,28 @@ def update_fields(self, save=False, **values_by_fields): if not hasattr(self, field): # Raise exception when we we are given a keyword argument that # doesn't correspond to a Package field - raise AttributeError( - f"'{class_name}' has no attribute '{field}'") + raise AttributeError(f"'{class_name}' has no attribute '{field}'") related_model_fields = [ - 'dependencies', - 'parties', - 'resources', + "dependencies", + "parties", + "resources", ] if field in related_model_fields: unsaved_models = [] - if field == 'dependencies': + if field == "dependencies": for dep_data in value: - if isinstance(dep_data, (dict, OrderedDict)): + if isinstance(dep_data, dict | OrderedDict): dep = DependentPackage( package=self, - purl=dep_data.get('purl'), + purl=dep_data.get("purl"), extracted_requirement=dep_data.get( - 'extracted_requirement'), - scope=dep_data.get('scope'), - is_runtime=dep_data.get('is_runtime'), - is_optional=dep_data.get('is_optional'), - is_resolved=dep_data.get('is_resolved'), + "extracted_requirement" + ), + scope=dep_data.get("scope"), + is_runtime=dep_data.get("is_runtime"), + is_optional=dep_data.get("is_optional"), + is_resolved=dep_data.get("is_resolved"), ) elif isinstance(dep_data, DependentPackage): dep = dep_data @@ -692,16 +689,16 @@ def update_fields(self, save=False, **values_by_fields): ) unsaved_models.append(dep) - if field == 'parties': + if field == "parties": for party_data in value: - if isinstance(party_data, (dict, OrderedDict)): + if isinstance(party_data, dict | OrderedDict): party = Party( package=self, - type=party_data.get('type'), - role=party_data.get('role'), - name=party_data.get('name'), - email=party_data.get('email'), - url=party_data.get('url'), + type=party_data.get("type"), + role=party_data.get("role"), + name=party_data.get("name"), + email=party_data.get("email"), + url=party_data.get("url"), ) elif isinstance(party_data, Party): party = party_data @@ -711,28 +708,29 @@ def update_fields(self, save=False, **values_by_fields): ) unsaved_models.append(party) - if field == 'resources': + if field == "resources": for resource_data in value: - if isinstance(resource_data, (dict, OrderedDict)): + if isinstance(resource_data, dict | OrderedDict): resource = Resource( package=self, - path=resource_data.get('path'), - is_file=resource_data.get('type') == 'file', - name=resource_data.get('name'), - extension=resource_data.get('extension'), - size=resource_data.get('size'), - md5=resource_data.get('md5'), - sha1=resource_data.get('sha1'), - sha256=resource_data.get('sha256'), - mime_type=resource_data.get('mime_type'), - file_type=resource_data.get('file_type'), + path=resource_data.get("path"), + is_file=resource_data.get("type") == "file", + name=resource_data.get("name"), + extension=resource_data.get("extension"), + size=resource_data.get("size"), + 
md5=resource_data.get("md5"), + sha1=resource_data.get("sha1"), + sha256=resource_data.get("sha256"), + mime_type=resource_data.get("mime_type"), + file_type=resource_data.get("file_type"), programming_language=resource_data.get( - 'programming_language'), - is_binary=resource_data.get('is_binary'), - is_text=resource_data.get('is_text'), - is_archive=resource_data.get('is_archive'), - is_media=resource_data.get('is_media'), - is_key_file=resource_data.get('is_key_file'), + "programming_language" + ), + is_binary=resource_data.get("is_binary"), + is_text=resource_data.get("is_text"), + is_archive=resource_data.get("is_archive"), + is_media=resource_data.get("is_media"), + is_key_file=resource_data.get("is_key_file"), ) resource.set_scan_results(resource_data) elif isinstance(resource_data, Resource): @@ -746,20 +744,19 @@ def update_fields(self, save=False, **values_by_fields): if unsaved_models: created_models_count = len(unsaved_models) model_count = 0 - if field == 'dependencies': + if field == "dependencies": model_count = self.dependencies.all().count() with transaction.atomic(): self.dependencies.all().delete() - DependentPackage.objects.bulk_create( - unsaved_models) + DependentPackage.objects.bulk_create(unsaved_models) - if field == 'parties': + if field == "parties": model_count = self.parties.all().count() with transaction.atomic(): self.parties.all().delete() Party.objects.bulk_create(unsaved_models) - if field == 'resources': + if field == "resources": model_count = self.resources.all().count() with transaction.atomic(): self.resources.all().delete() @@ -767,17 +764,17 @@ def update_fields(self, save=False, **values_by_fields): msg = f"Replaced {model_count} existing entries of field '{field}' with {created_models_count} new entries." self.append_to_history(msg) - replaced_fields.extend([field, 'history']) + replaced_fields.extend([field, "history"]) else: # Ensure the incoming value is of the correct type - if field == 'qualifiers' and isinstance(value, dict): + if field == "qualifiers" and isinstance(value, dict): value = normalize_qualifiers(value, encode=True) date_fields = [ - 'created_date', - 'last_indexed_date', - 'last_modified_date', - 'release_date', + "created_date", + "last_indexed_date", + "last_modified_date", + "release_date", ] if field in date_fields and isinstance(value, str): value = dateutil_parse(value) @@ -802,14 +799,14 @@ def update_fields(self, save=False, **values_by_fields): if updated_fields and history_entries: data = { - 'updated_fields': history_entries, + "updated_fields": history_entries, } self.append_to_history( - 'Package field values have been updated.', + "Package field values have been updated.", data=data, save=save, ) - updated_fields.append('history') + updated_fields.append("history") if replaced_fields: updated_fields.extend(replaced_fields) @@ -821,11 +818,11 @@ def update_fields(self, save=False, **values_by_fields): return self, updated_fields -party_person = 'person' +party_person = "person" # often loosely defined -party_project = 'project' +party_project = "project" # more formally defined -party_org = 'organization' +party_org = "organization" PARTY_TYPES = ( (party_person, party_person), (party_project, party_project), @@ -834,14 +831,13 @@ def update_fields(self, save=False, **values_by_fields): class Party(models.Model): - """ - A party is a person, project or organization related to a package. 
- """ + """A party is a person, project or organization related to a package.""" + package = models.ForeignKey( Package, - related_name='parties', + related_name="parties", on_delete=models.CASCADE, - help_text=_('The Package that this party is related to') + help_text=_("The Package that this party is related to"), ) type = models.CharField( @@ -849,118 +845,114 @@ class Party(models.Model): blank=True, null=True, choices=PARTY_TYPES, - help_text=_('the type of this party') + help_text=_("the type of this party"), ) role = models.CharField( max_length=32, blank=True, null=True, - help_text=_('A role for this party. Something such as author, ' - 'maintainer, contributor, owner, packager, distributor, ' - 'vendor, developer, owner, etc.') + help_text=_( + "A role for this party. Something such as author, " + "maintainer, contributor, owner, packager, distributor, " + "vendor, developer, owner, etc." + ), ) name = models.CharField( - max_length=70, - blank=True, - null=True, - help_text=_('Name of this party.') + max_length=70, blank=True, null=True, help_text=_("Name of this party.") ) email = models.CharField( - max_length=255, - blank=True, - null=True, - help_text=_('Email for this party.') + max_length=255, blank=True, null=True, help_text=_("Email for this party.") ) url = models.CharField( max_length=1024, blank=True, null=True, - help_text=_('URL to a primary web page for this party.') + help_text=_("URL to a primary web page for this party."), ) def to_dict(self): from packagedb.serializers import PartySerializer + party_data = PartySerializer(self).data return party_data class DependentPackage(models.Model): - """ - An identifiable dependent package package object. - """ + """An identifiable dependent package package object.""" + package = models.ForeignKey( Package, - related_name='dependencies', + related_name="dependencies", on_delete=models.CASCADE, - help_text=_('The Package that this dependent package is related to') + help_text=_("The Package that this dependent package is related to"), ) purl = models.CharField( max_length=2048, blank=True, null=True, - help_text=_('A compact purl package URL') + help_text=_("A compact purl package URL"), ) extracted_requirement = models.CharField( max_length=200, blank=True, null=True, - help_text=_( - 'A string defining version(s)requirements. Package-type specific.') + help_text=_("A string defining version(s)requirements. Package-type specific."), ) scope = models.CharField( max_length=100, blank=True, null=True, - help_text=_('The scope of this dependency, such as runtime, install, etc. ' - 'This is package-type specific and is the original scope string.') + help_text=_( + "The scope of this dependency, such as runtime, install, etc. " + "This is package-type specific and is the original scope string." + ), ) is_runtime = models.BooleanField( - default=True, - help_text=_('True if this dependency is a runtime dependency.') + default=True, help_text=_("True if this dependency is a runtime dependency.") ) is_optional = models.BooleanField( - default=False, - help_text=_('True if this dependency is an optional dependency') + default=False, help_text=_("True if this dependency is an optional dependency") ) is_resolved = models.BooleanField( default=False, - help_text=_('True if this dependency version requirement has ' - 'been resolved and this dependency url points to an ' - 'exact version.') + help_text=_( + "True if this dependency version requirement has " + "been resolved and this dependency url points to an " + "exact version." 
+ ), ) def to_dict(self): from packagedb.serializers import DependentPackageSerializer + depedent_package_data = DependentPackageSerializer(self).data return depedent_package_data class AbstractResource(models.Model): - """ - These model fields should be kept in line with scancode.resource.Resource - """ + """These model fields should be kept in line with scancode.resource.Resource""" path = models.CharField( max_length=2000, help_text=_( - 'The full path value of a resource (file or directory) in the archive it is from.'), + "The full path value of a resource (file or directory) in the archive it is from." + ), ) name = models.CharField( max_length=255, blank=True, - help_text=_( - "File or directory name of this resource with its extension."), + help_text=_("File or directory name of this resource with its extension."), ) extension = models.CharField( @@ -974,7 +966,7 @@ class AbstractResource(models.Model): size = models.BigIntegerField( blank=True, null=True, - help_text=_('Size in bytes.'), + help_text=_("Size in bytes."), ) mime_type = models.CharField( @@ -998,8 +990,7 @@ class AbstractResource(models.Model): max_length=50, blank=True, null=True, - help_text=_( - "Programming language of this resource if this is a code file."), + help_text=_("Programming language of this resource if this is a code file."), ) is_binary = models.BooleanField(default=False) @@ -1010,17 +1001,16 @@ class AbstractResource(models.Model): is_file = models.BooleanField( default=False, - help_text=_( - 'True if this Resource is a file, False if it is a Directory') + help_text=_("True if this Resource is a file, False if it is a Directory"), ) @property def type(self): - return 'file' if self.is_file else 'directory' + return "file" if self.is_file else "directory" @type.setter def type(self, value): - if value == 'file': + if value == "file": self.is_file = True else: self.is_file = False @@ -1079,8 +1069,7 @@ class ScanFieldsModelMixin(models.Model): authors = models.JSONField( blank=True, default=list, - help_text=_( - "List of detected authors (and related detection details)."), + help_text=_("List of detected authors (and related detection details)."), ) package_data = models.JSONField( default=list, @@ -1137,47 +1126,41 @@ def copy_scan_results(self, from_instance, save=False): class Resource( - ExtraDataFieldMixin, - HashFieldsMixin, - ScanFieldsModelMixin, - AbstractResource + ExtraDataFieldMixin, HashFieldsMixin, ScanFieldsModelMixin, AbstractResource ): package = models.ForeignKey( Package, - related_name='resources', + related_name="resources", on_delete=models.CASCADE, - help_text=_('The Package that this Resource is from') + help_text=_("The Package that this Resource is from"), ) git_sha1 = models.CharField( max_length=40, blank=True, null=True, - help_text=_('git SHA1 checksum hex-encoded'), + help_text=_("git SHA1 checksum hex-encoded"), ) class Meta: - unique_together = ( - ('package', 'path'), - ) - ordering = ('id',) + unique_together = (("package", "path"),) + ordering = ("id",) indexes = [ - models.Index(fields=['md5']), - models.Index(fields=['sha1']), - models.Index(fields=['sha256']), - models.Index(fields=['sha512']), - models.Index(fields=['git_sha1']), + models.Index(fields=["md5"]), + models.Index(fields=["sha1"]), + models.Index(fields=["sha256"]), + models.Index(fields=["sha512"]), + models.Index(fields=["git_sha1"]), ] @property def for_packages(self): """Return the list of all Packages associated to this resource.""" - return [ - self.package.package_uid or str(self.package) - 
] def to_dict(self): from packagedb.serializers import ResourceMetadataSerializer + resource_metadata = ResourceMetadataSerializer(self).data return resource_metadata @@ -1213,9 +1196,9 @@ class Relationship(models.TextChoices): relationship = models.CharField( max_length=30, choices=Relationship.choices, - help_text='Relationship between the from and to package ' - 'URLs such as "source_package" when a package ' - 'is the source code package for another package.' + help_text="Relationship between the from and to package " + 'URLs such as "source_package" when a package ' + "is the source code package for another package.", ) def __str__(self): @@ -1226,11 +1209,11 @@ def __str__(self): def make_relationship( - from_package, to_package, relationship, + from_package, + to_package, + relationship, ): - """ - Create and return the from/to package relathionship if it does exists. - """ + """Create and return the from/to package relationship if it does not already exist.""" pkg, _created = PackageRelation.objects.get_or_create( from_package=from_package, to_package=to_package, @@ -1240,9 +1223,8 @@ def make_relationship( class PackageWatch(models.Model): - """ - Model representing a watch on a package to monitor for new versions. - """ + """Model representing a watch on a package to monitor for new versions.""" + DEPTH_CHOICES = ( (1, "Version"), (2, "Metadata"), @@ -1302,14 +1284,14 @@ class PackageWatch(models.Model): choices=DEPTH_CHOICES, default=3, help_text=_( - "Depth of data collection from listing versions up to a full scan."), + "Depth of data collection from listing versions up to a full scan." + ), ) watch_interval = models.PositiveSmallIntegerField( validators=[ MinValueValidator(1, message="Interval must be at least 1 day."), - MaxValueValidator( - 365, message="Interval must be at most 365 days."), + MaxValueValidator(365, message="Interval must be at most 365 days."), ], default=7, help_text=_("Number of days to wait between watches of this package."), @@ -1394,7 +1376,7 @@ def save(self, *args, **kwargs): if schedule: self.schedule_work_id = self.create_new_job() - super(PackageWatch, self).save(*args, **kwargs) + super().save(*args, **kwargs) def delete(self, *args, **kwargs): """Clear associated watch schedule.""" @@ -1417,16 +1399,13 @@ def create_new_job(self): class PackageSet(models.Model): - """ - A group of related Packages - """ + """A group of related Packages""" + uuid = models.UUIDField( verbose_name=_("UUID"), default=uuid.uuid4, unique=True, - help_text=_( - 'The identifier of the Package set' - ) + help_text=_("The identifier of the Package set"), ) def add_to_package_set(self, package): @@ -1435,15 +1414,13 @@ def add_to_package_set(self, package): def get_package_set_members(self): """Return related Packages""" return self.packages.order_by( - 'package_content', + "package_content", ) class ApiUserManager(UserManager): def create_api_user(self, username, first_name="", last_name="", **extra_fields): - """ - Create and return an API-only user. Raise ValidationError. - """ + """Create and return an API-only user. Raise ValidationError.""" username = self.normalize_email(username) email = username self._validate_username(email) @@ -1469,16 +1446,15 @@ def create_api_user(self, username, first_name="", last_name="", **extra_fields) return user def _validate_username(self, email): - """ - Validate username.
If invalid, raise a ValidationError""" try: self.get_by_natural_key(email) except models.ObjectDoesNotExist: pass else: raise exceptions.ValidationError( - f"Error: This email already exists: {email}") + f"Error: This email already exists: {email}" + ) @receiver(models.signals.post_save, sender=settings.AUTH_USER_MODEL) diff --git a/packagedb/package_managers.py b/packagedb/package_managers.py index 14916614..898dc4da 100644 --- a/packagedb/package_managers.py +++ b/packagedb/package_managers.py @@ -8,20 +8,17 @@ # import dataclasses -import json import logging import traceback import xml.etree.ElementTree as ET +from collections.abc import Iterable from datetime import datetime -from typing import Iterable -from typing import List -from typing import Optional -from typing import Set from urllib.parse import urlparse +from django.utils.dateparse import parse_datetime + import requests from dateutil import parser as dateparser -from django.utils.dateparse import parse_datetime from packageurl import PackageURL logger = logging.getLogger(__name__) @@ -38,7 +35,7 @@ @dataclasses.dataclass(frozen=True) class PackageVersion: value: str - release_date: Optional[datetime] = None + release_date: datetime | None = None def to_dict(self): release_date = self.release_date @@ -48,8 +45,8 @@ def to_dict(self): @dataclasses.dataclass class VersionResponse: - valid_versions: Set[str] = dataclasses.field(default_factory=set) - newer_versions: Set[str] = dataclasses.field(default_factory=set) + valid_versions: set[str] = dataclasses.field(default_factory=set) + newer_versions: set[str] = dataclasses.field(default_factory=set) def get_response(url, content_type="json", headers=None): @@ -61,7 +58,7 @@ def get_response(url, content_type="json", headers=None): try: resp = requests.get(url=url, headers=headers) - except: + except Exception: logger.error(traceback.format_exc()) return if not resp.status_code == 200: @@ -124,7 +121,9 @@ def get_until(self, package_name, until=None) -> VersionResponse: else: valid_versions.add(version.value) - return VersionResponse(valid_versions=valid_versions, newer_versions=new_versions) + return VersionResponse( + valid_versions=valid_versions, newer_versions=new_versions + ) def fetch(self, pkg: str) -> Iterable[PackageVersion]: """ @@ -135,16 +134,12 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: def remove_debian_default_epoch(version): - """ - Remove the default epoch from a Debian ``version`` string. - """ + """Remove the default epoch from a Debian ``version`` string.""" return version and version.replace("0:", "") class LaunchpadVersionAPI(VersionAPI): - """ - Fetch versions of Ubuntu debian packages from Launchpad - """ + """Fetch versions of Ubuntu debian packages from Launchpad""" package_type = "deb" @@ -165,7 +160,9 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: for release in entries: source_package_version = release.get("source_package_version") - source_package_version = remove_debian_default_epoch(version=source_package_version) + source_package_version = remove_debian_default_epoch( + version=source_package_version + ) date_published = release.get("date_published") release_date = None if date_published and type(date_published) is str: @@ -182,9 +179,7 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: class PypiVersionAPI(VersionAPI): - """ - Fetch versions of Python pypi packages from the PyPI API. 
- """ + """Fetch versions of Python pypi packages from the PyPI API.""" package_type = "pypi" @@ -238,9 +233,7 @@ def get_latest_date(self, downloads): class CratesVersionAPI(VersionAPI): - """ - Fetch versions of Rust cargo packages from the crates.io API. - """ + """Fetch versions of Rust cargo packages from the crates.io API.""" package_type = "cargo" @@ -255,9 +248,7 @@ def fetch(self, pkg): class RubyVersionAPI(VersionAPI): - """ - Fetch versions of Rubygems packages from the rubygems API. - """ + """Fetch versions of Rubygems packages from the rubygems API.""" package_type = "gem" @@ -280,9 +271,7 @@ def fetch(self, pkg): class NpmVersionAPI(VersionAPI): - """ - Fetch versions of npm packages from the npm registry API. - """ + """Fetch versions of npm packages from the npm registry API.""" package_type = "npm" @@ -300,9 +289,7 @@ def fetch(self, pkg): class DebianVersionAPI(VersionAPI): - """ - Fetch versions of Debian debian packages from the sources.debian.org API - """ + """Fetch versions of Debian debian packages from the sources.debian.org API""" package_type = "deb" @@ -324,9 +311,7 @@ def fetch(self, pkg): class MavenVersionAPI(VersionAPI): - """ - Fetch versions of Maven packages from Maven Central maven-metadata.xml data - """ + """Fetch versions of Maven packages from Maven Central maven-metadata.xml data""" package_type = "maven" @@ -339,7 +324,7 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: yield from self.extract_versions(xml_resp) @staticmethod - def artifact_url(artifact_comps: List[str]) -> str: + def artifact_url(artifact_comps: list[str]) -> str: try: group_id, artifact_id = artifact_comps except ValueError: @@ -365,9 +350,7 @@ def extract_versions(xml_response: ET.ElementTree) -> Iterable[PackageVersion]: class NugetVersionAPI(VersionAPI): - """ - Fetch versions of NuGet packages from the nuget.org API - """ + """Fetch versions of NuGet packages from the nuget.org API""" package_type = "nuget" @@ -396,16 +379,12 @@ def extract_versions(response: dict) -> Iterable[PackageVersion]: def cleaned_version(version): - """ - Return a ``version`` string stripped from leading "v" prefix. 
- """ + """Return a ``version`` string stripped from leading "v" prefix.""" return version.lstrip("vV") class ComposerVersionAPI(VersionAPI): - """ - Fetch versions of PHP Composer packages from the packagist.org API - """ + """Fetch versions of PHP Composer packages from the packagist.org API""" package_type = "composer" @@ -431,9 +410,7 @@ def extract_versions(resp: dict, pkg: str) -> Iterable[PackageVersion]: class HexVersionAPI(VersionAPI): - """ - Fetch versions of Erlang packages from the hex API - """ + """Fetch versions of Erlang packages from the hex API""" package_type = "hex" @@ -451,9 +428,7 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: class GoproxyVersionAPI(VersionAPI): - """ - Fetch versions of Go "golang" packages from the Go proxy API - """ + """Fetch versions of Go "golang" packages from the Go proxy API""" package_type = "golang" @@ -461,7 +436,7 @@ def __init__(self): self.module_name_by_package_name = {} @staticmethod - def trim_go_url_path(url_path: str) -> Optional[str]: + def trim_go_url_path(url_path: str) -> str | None: """ Return a trimmed Go `url_path` removing trailing package references and keeping only the module @@ -512,7 +487,9 @@ def escape_path(path: str) -> str: return escaped_path @staticmethod - def fetch_version_info(version_info: str, escaped_pkg: str) -> Optional[PackageVersion]: + def fetch_version_info( + version_info: str, escaped_pkg: str + ) -> PackageVersion | None: v = version_info.split() if not v: return None @@ -534,12 +511,13 @@ def fetch_version_info(version_info: str, escaped_pkg: str) -> Optional[PackageV f"Error while fetching version info for {escaped_pkg}/{escaped_ver} " f"from goproxy:\n{traceback.format_exc()}" ) - release_date = parse_datetime(response.get("Time", "")) if response else None + release_date = ( + parse_datetime(response.get("Time", "")) if response else None + ) return PackageVersion(value=value, release_date=release_date) def fetch(self, pkg: str) -> Iterable[PackageVersion]: - # escape uppercase in module path escaped_pkg = self.escape_path(pkg) trimmed_pkg = pkg @@ -584,7 +562,9 @@ def fetch(self, pkg: str) -> Iterable[PackageVersion]: } -VERSION_API_CLASSES_BY_PACKAGE_TYPE = {cls.package_type: cls for cls in VERSION_API_CLASSES} +VERSION_API_CLASSES_BY_PACKAGE_TYPE = { + cls.package_type: cls for cls in VERSION_API_CLASSES +} VERSION_API_CLASS_BY_PACKAGE_NAMESPACE = { @@ -617,7 +597,11 @@ def get_api_package_name(purl: PackageURL) -> str: def get_version_fetcher(package_url): if package_url.type == "deb": - versions_fetcher: VersionAPI = VERSION_API_CLASS_BY_PACKAGE_NAMESPACE[package_url.namespace] + versions_fetcher: VersionAPI = VERSION_API_CLASS_BY_PACKAGE_NAMESPACE[ + package_url.namespace + ] else: - versions_fetcher: VersionAPI = VERSION_API_CLASSES_BY_PACKAGE_TYPE[package_url.type] + versions_fetcher: VersionAPI = VERSION_API_CLASSES_BY_PACKAGE_TYPE[ + package_url.type + ] return versions_fetcher diff --git a/packagedb/schedules.py b/packagedb/schedules.py index 65b44eb8..916490ef 100644 --- a/packagedb/schedules.py +++ b/packagedb/schedules.py @@ -20,13 +20,10 @@ def get_next_execution(watch_interval_days, last_watch_date): - """ - Calculate the next execution time based on the watch_interval_days and last_watch_date. 
- """ + """Calculate the next execution time based on the watch_interval_days and last_watch_date.""" current_date_time = datetime.datetime.now(tz=datetime.timezone.utc) if last_watch_date: - next_execution = last_watch_date + \ - datetime.timedelta(days=watch_interval_days) + next_execution = last_watch_date + datetime.timedelta(days=watch_interval_days) if next_execution > current_date_time: return next_execution @@ -35,8 +32,8 @@ def get_next_execution(watch_interval_days, last_watch_date): def schedule_watch(watch): """ - Takes a `PackageWatch` object as input and schedule a - recurring job using `rq_scheduler` to watch the package. + Schedule a recurring job with a `PackageWatch` object using `rq_scheduler` + to watch the package. """ watch_interval = watch.watch_interval last_watch_date = watch.last_watch_date @@ -64,17 +61,14 @@ def clear_job(job): def scheduled_job_exists(job_id): - """ - Check if a scheduled job with the given job ID exists. - """ + """Check if a scheduled job with the given job ID exists.""" return job_id and (job_id in scheduler) def clear_zombie_watch_schedules(logger=log): - """ - Clear scheduled jobs not associated with any PackageWatch object. - """ + """Clear scheduled jobs not associated with any PackageWatch object.""" from packagedb.models import PackageWatch + schedule_ids = PackageWatch.objects.all().values_list("schedule_work_id", flat=True) for job in scheduler.get_jobs(): @@ -84,9 +78,7 @@ def clear_zombie_watch_schedules(logger=log): def is_redis_running(logger=log): - """ - Check the status of the Redis server. - """ + """Check the status of the Redis server.""" try: connection = django_rq.get_connection() return connection.ping() diff --git a/packagedb/serializers.py b/packagedb/serializers.py index 0bb22b03..1433f248 100644 --- a/packagedb/serializers.py +++ b/packagedb/serializers.py @@ -33,44 +33,45 @@ class ResourceAPISerializer(HyperlinkedModelSerializer): package = HyperlinkedRelatedField( - view_name='api:package-detail', lookup_field='uuid', read_only=True) - purl = CharField(source='package.package_url') + view_name="api:package-detail", lookup_field="uuid", read_only=True + ) + purl = CharField(source="package.package_url") class Meta: model = Resource fields = ( - 'package', - 'purl', - 'path', - 'type', - 'name', - 'extension', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'git_sha1', - 'mime_type', - 'file_type', - 'programming_language', - 'is_binary', - 'is_text', - 'is_archive', - 'is_media', - 'is_key_file', - 'detected_license_expression', - 'detected_license_expression_spdx', - 'license_detections', - 'license_clues', - 'percentage_of_license_text', - 'copyrights', - 'holders', - 'authors', - 'package_data', - 'emails', - 'urls', - 'extra_data', + "package", + "purl", + "path", + "type", + "name", + "extension", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "git_sha1", + "mime_type", + "file_type", + "programming_language", + "is_binary", + "is_text", + "is_archive", + "is_media", + "is_key_file", + "detected_license_expression", + "detected_license_expression_spdx", + "license_detections", + "license_clues", + "percentage_of_license_text", + "copyrights", + "holders", + "authors", + "package_data", + "emails", + "urls", + "extra_data", ) read_only_fields = fields @@ -81,37 +82,37 @@ class ResourceMetadataSerializer(HyperlinkedModelSerializer): class Meta: model = Resource fields = ( - 'path', - 'type', - 'name', - 'extension', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'git_sha1', - 'mime_type', - 
'file_type', - 'programming_language', - 'is_binary', - 'is_text', - 'is_archive', - 'is_media', - 'is_key_file', - 'detected_license_expression', - 'detected_license_expression_spdx', - 'license_detections', - 'license_clues', - 'percentage_of_license_text', - 'copyrights', - 'holders', - 'authors', - 'package_data', - 'for_packages', - 'emails', - 'urls', - 'extra_data', + "path", + "type", + "name", + "extension", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "git_sha1", + "mime_type", + "file_type", + "programming_language", + "is_binary", + "is_text", + "is_archive", + "is_media", + "is_key_file", + "detected_license_expression", + "detected_license_expression_spdx", + "license_detections", + "license_clues", + "percentage_of_license_text", + "copyrights", + "holders", + "authors", + "package_data", + "for_packages", + "emails", + "urls", + "extra_data", ) @@ -119,11 +120,11 @@ class PartySerializer(ModelSerializer): class Meta: model = Party fields = ( - 'type', - 'role', - 'name', - 'email', - 'url', + "type", + "role", + "name", + "email", + "url", ) @@ -131,30 +132,28 @@ class DependentPackageSerializer(ModelSerializer): class Meta: model = DependentPackage fields = ( - 'purl', - 'extracted_requirement', - 'scope', - 'is_runtime', - 'is_optional', - 'is_resolved', + "purl", + "extracted_requirement", + "scope", + "is_runtime", + "is_optional", + "is_resolved", ) class PackageInPackageSetAPISerializer(ModelSerializer): """ - This serializes Package instances within a PackageSet that is within a - Package in the PackageAPISerializer + Serialize Package instances within a PackageSet that is within a Package in + the PackageAPISerializer """ + class Meta: model = Package - fields = ( - 'uuid', - ) + fields = ("uuid",) def to_representation(self, instance): - reverse_uri = reverse_lazy( - 'api:package-detail', kwargs={'uuid': instance.uuid}) - request = self.context['request'] + reverse_uri = reverse_lazy("api:package-detail", kwargs={"uuid": instance.uuid}) + request = self.context["request"] return request.build_absolute_uri(reverse_uri) @@ -164,8 +163,8 @@ class PackageSetAPISerializer(ModelSerializer): class Meta: model = PackageSet fields = ( - 'uuid', - 'packages', + "uuid", + "packages", ) @@ -173,11 +172,12 @@ class PackageAPISerializer(HyperlinkedModelSerializer): dependencies = DependentPackageSerializer(many=True) parties = PartySerializer(many=True) resources = HyperlinkedIdentityField( - view_name='api:package-resources', lookup_field='uuid') + view_name="api:package-resources", lookup_field="uuid" + ) history = HyperlinkedIdentityField( - view_name='api:package-history', lookup_field='uuid') - url = HyperlinkedIdentityField( - view_name='api:package-detail', lookup_field='uuid') + view_name="api:package-history", lookup_field="uuid" + ) + url = HyperlinkedIdentityField(view_name="api:package-detail", lookup_field="uuid") package_sets = PackageSetAPISerializer(many=True) package_content = SerializerMethodField() declared_license_expression_spdx = CharField() @@ -186,54 +186,54 @@ class PackageAPISerializer(HyperlinkedModelSerializer): class Meta: model = Package fields = ( - 'url', - 'uuid', - 'filename', - 'package_sets', - 'package_content', - 'purl', - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'primary_language', - 'description', - 'release_date', - 'parties', - 'keywords', - 'homepage_url', - 'download_url', - 'bug_tracking_url', - 'code_view_url', - 'vcs_url', - 'repository_homepage_url', - 'repository_download_url', - 
'api_data_url', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'copyright', - 'holder', - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', - 'extracted_license_statement', - 'notice_text', - 'source_packages', - 'extra_data', - 'package_uid', - 'datasource_id', - 'file_references', - 'dependencies', - 'resources', - 'history', + "url", + "uuid", + "filename", + "package_sets", + "package_content", + "purl", + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "primary_language", + "description", + "release_date", + "parties", + "keywords", + "homepage_url", + "download_url", + "bug_tracking_url", + "code_view_url", + "vcs_url", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "copyright", + "holder", + "declared_license_expression", + "declared_license_expression_spdx", + "license_detections", + "other_license_expression", + "other_license_expression_spdx", + "other_license_detections", + "extracted_license_statement", + "notice_text", + "source_packages", + "extra_data", + "package_uid", + "datasource_id", + "file_references", + "dependencies", + "resources", + "history", ) read_only_fields = fields @@ -243,14 +243,13 @@ def get_package_content(self, obj): class PackageInPackageSetMetadataSerializer(ModelSerializer): """ - This serializes Package instances within a PackageSet that is within a - Package in the PackageMetadataSerializer + Serialize Package instances within a PackageSet that is within a Package in + the PackageMetadataSerializer """ + class Meta: model = Package - fields = ( - 'uuid', - ) + fields = ("uuid",) def to_representation(self, instance): return instance.package_uid @@ -262,8 +261,8 @@ class PackageSetMetadataSerializer(ModelSerializer): class Meta: model = PackageSet fields = ( - 'uuid', - 'packages', + "uuid", + "packages", ) @@ -275,6 +274,7 @@ class PackageMetadataSerializer(ModelSerializer): This differs from PackageSerializer used for the API by the addition of the `package_url` field and the exclusion of the `uuid`, and `filename` fields. 
""" + dependencies = DependentPackageSerializer(many=True) parties = PartySerializer(many=True) package_sets = PackageSetMetadataSerializer(many=True) @@ -285,49 +285,49 @@ class PackageMetadataSerializer(ModelSerializer): class Meta: model = Package fields = ( - 'type', - 'namespace', - 'name', - 'version', - 'qualifiers', - 'subpath', - 'package_sets', - 'package_content', - 'primary_language', - 'description', - 'release_date', - 'parties', - 'keywords', - 'homepage_url', - 'download_url', - 'size', - 'md5', - 'sha1', - 'sha256', - 'sha512', - 'bug_tracking_url', - 'code_view_url', - 'vcs_url', - 'copyright', - 'holder', - 'declared_license_expression', - 'declared_license_expression_spdx', - 'license_detections', - 'other_license_expression', - 'other_license_expression_spdx', - 'other_license_detections', - 'extracted_license_statement', - 'notice_text', - 'source_packages', - 'extra_data', - 'dependencies', - 'package_uid', - 'datasource_id', - 'purl', - 'repository_homepage_url', - 'repository_download_url', - 'api_data_url', - 'file_references', + "type", + "namespace", + "name", + "version", + "qualifiers", + "subpath", + "package_sets", + "package_content", + "primary_language", + "description", + "release_date", + "parties", + "keywords", + "homepage_url", + "download_url", + "size", + "md5", + "sha1", + "sha256", + "sha512", + "bug_tracking_url", + "code_view_url", + "vcs_url", + "copyright", + "holder", + "declared_license_expression", + "declared_license_expression_spdx", + "license_detections", + "other_license_expression", + "other_license_expression_spdx", + "other_license_detections", + "extracted_license_statement", + "notice_text", + "source_packages", + "extra_data", + "dependencies", + "package_uid", + "datasource_id", + "purl", + "repository_homepage_url", + "repository_download_url", + "api_data_url", + "file_references", ) def get_package_content(self, obj): @@ -340,29 +340,28 @@ class PackageSetAPISerializer(ModelSerializer): class Meta: model = PackageSet fields = [ - 'uuid', - 'packages', + "uuid", + "packages", ] class PackageWatchAPISerializer(HyperlinkedModelSerializer): url = HyperlinkedIdentityField( - view_name='api:packagewatch-detail', - lookup_field='package_url' + view_name="api:packagewatch-detail", lookup_field="package_url" ) class Meta: model = PackageWatch fields = [ - 'url', - 'package_url', - 'is_active', - 'depth', - 'watch_interval', - 'creation_date', - 'last_watch_date', - 'watch_error', - 'schedule_work_id', + "url", + "package_url", + "is_active", + "depth", + "watch_interval", + "creation_date", + "last_watch_date", + "watch_error", + "schedule_work_id", ] @@ -371,8 +370,7 @@ class Meta: model = PackageWatch fields = ["package_url", "depth", "watch_interval", "is_active"] extra_kwargs = { - field: {"initial": PackageWatch._meta.get_field( - field).get_default()} + field: {"initial": PackageWatch._meta.get_field(field).get_default()} for field in ["depth", "watch_interval", "is_active"] } @@ -380,7 +378,7 @@ class Meta: class PackageWatchUpdateSerializer(ModelSerializer): class Meta: model = PackageWatch - fields = ['depth', 'watch_interval', 'is_active'] + fields = ["depth", "watch_interval", "is_active"] class CommaListField(ListField): @@ -390,7 +388,7 @@ def to_internal_value(self, data): if isinstance(data, str): split_data = [] for datum in data: - split_data.extend(datum.split(',')) + split_data.extend(datum.split(",")) data = split_data return super().to_internal_value(data) @@ -416,7 +414,7 @@ def validate_purl(self, value): try: 
PackageURL.from_string(value) except ValueError as e: - raise ValidationError(f'purl validation error: {e}') + raise ValidationError(f"purl validation error: {e}") return value def validate_source_purl(self, value): @@ -424,17 +422,25 @@ def validate_source_purl(self, value): try: PackageURL.from_string(value) except ValueError as e: - raise ValidationError(f'purl validation error: {e}') + raise ValidationError(f"purl validation error: {e}") return value def validate_addon_pipelines(self, value): - if invalid_pipelines := [pipe for pipe in value if not is_supported_addon_pipeline(pipe)]: - raise ValidationError(f'Error unsupported addon pipelines: {",".join(invalid_pipelines)}') + if invalid_pipelines := [ + pipe for pipe in value if not is_supported_addon_pipeline(pipe) + ]: + raise ValidationError( + f'Error unsupported addon pipelines: {",".join(invalid_pipelines)}' + ) return value def validate_sort(self, value): - if invalid_sort_fields := [field for field in value if not is_supported_sort_field(field)]: - raise ValidationError(f'Error unsupported sort fields: {",".join(invalid_sort_fields)}') + if invalid_sort_fields := [ + field for field in value if not is_supported_sort_field(field) + ]: + raise ValidationError( + f'Error unsupported sort fields: {",".join(invalid_sort_fields)}' + ) return value @@ -473,34 +479,39 @@ class PurlUpdateResponseSerializer(Serializer): class IndexPackagesResponseSerializer(Serializer): queued_packages_count = IntegerField( - help_text="Number of package urls placed on the index queue.") + help_text="Number of package urls placed on the index queue." + ) queued_packages = ListField( child=CharField(), - help_text="List of package urls that were placed on the index queue." + help_text="List of package urls that were placed on the index queue.", ) requeued_packages_count = IntegerField( - help_text="Number of existing package urls placed on the rescan queue.") + help_text="Number of existing package urls placed on the rescan queue." + ) requeued_packages = ListField( child=CharField(), - help_text="List of existing package urls that were placed on the rescan queue." + help_text="List of existing package urls that were placed on the rescan queue.", ) unqueued_packages_count = IntegerField( - help_text="Number of package urls not placed on the index queue.") + help_text="Number of package urls not placed on the index queue." + ) unqueued_packages = ListField( child=CharField(), - help_text="List of package urls that were not placed on the index queue." + help_text="List of package urls that were not placed on the index queue.", ) unsupported_packages_count = IntegerField( - help_text="Number of package urls that are not processable by the index queue.") + help_text="Number of package urls that are not processable by the index queue." + ) unsupported_packages = ListField( child=CharField(), - help_text="List of package urls that are not processable by the index queue." + help_text="List of package urls that are not processable by the index queue.", ) unsupported_vers_count = IntegerField( - help_text="Number of vers range that are not supported by the univers or package_manager.") + help_text="Number of vers range that are not supported by the univers or package_manager." + ) unsupported_vers = ListField( child=CharField(), - help_text="List of vers range that are not supported by the univers or package_manager." 
+ help_text="List of vers range that are not supported by the univers or package_manager.", ) @@ -534,10 +545,12 @@ class PurltoGitRepoResponseSerializer(Serializer): def is_supported_addon_pipeline(addon_pipeline): from minecode.model_utils import SUPPORTED_ADDON_PIPELINES + return addon_pipeline in SUPPORTED_ADDON_PIPELINES def is_supported_sort_field(field): from packagedb.api import PACKAGE_FILTER_SORT_FIELDS + # A field could have a leading `-` - return field.lstrip('-') in PACKAGE_FILTER_SORT_FIELDS + return field.lstrip("-") in PACKAGE_FILTER_SORT_FIELDS diff --git a/packagedb/tasks.py b/packagedb/tasks.py index 2c390f61..decbb376 100644 --- a/packagedb/tasks.py +++ b/packagedb/tasks.py @@ -67,8 +67,7 @@ def get_and_index_new_purls(package_url): try: local_versions = [version_class(version) for version in local_versions] - all_versions = [version_class(version.value) - for version in all_versions] + all_versions = [version_class(version.value) for version in all_versions] except InvalidVersion as e: return f"InvalidVersion exception: {e}" @@ -101,8 +100,7 @@ def is_supported_watch_ecosystem(watch): watch.watch_error = ( f"`{watch.type}` ecosystem is not supported by {error_message}" ) - watch.last_watch_date = datetime.datetime.now( - tz=datetime.timezone.utc) + watch.last_watch_date = datetime.datetime.now(tz=datetime.timezone.utc) watch.save(update_fields=["last_watch_date"]) return False diff --git a/packagedb/tests/test_api.py b/packagedb/tests/test_api.py index e91dfe83..d7c150b8 100644 --- a/packagedb/tests/test_api.py +++ b/packagedb/tests/test_api.py @@ -15,15 +15,15 @@ from django.test import TestCase from django.urls import reverse from django.utils import timezone -from packageurl.contrib.django.utils import purl_to_lookups + from rest_framework import status from rest_framework.test import APIClient from univers.versions import MavenVersion from minecode.models import PriorityResourceURI from minecode.models import ScannableURI -from minecode.utils_test import JsonBasedTesting from minecode.tests import FIXTURES_REGEN +from minecode.utils_test import JsonBasedTesting from packagedb.models import Package from packagedb.models import PackageContentType from packagedb.models import PackageSet @@ -32,354 +32,337 @@ class ResourceAPITestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package1 = Package.objects.create( - download_url='https://test-url.com/package1.tar.gz', - type='type1', - name='name1', + download_url="https://test-url.com/package1.tar.gz", + type="type1", + name="name1", ) self.package2 = Package.objects.create( - download_url='https://test-url.com/package2.tar.gz', - type='type2', - name='name2', + download_url="https://test-url.com/package2.tar.gz", + type="type2", + name="name2", ) self.resource1 = Resource.objects.create( package=self.package1, - path='package1/contents1.txt', + path="package1/contents1.txt", size=101, - sha1='testsha11', - md5='testmd51', - sha256='testsha2561', - sha512='testsha5121', - git_sha1='testgit_sha11', + sha1="testsha11", + md5="testmd51", + sha256="testsha2561", + sha512="testsha5121", + git_sha1="testgit_sha11", is_file=True, - extra_data=json.dumps({'test1': 'data1'}) + extra_data=json.dumps({"test1": "data1"}), ) self.resource2 = Resource.objects.create( package=self.package2, - path='package2/contents2.txt', + path="package2/contents2.txt", size=102, - sha1='testsha12', - 
md5='testmd52', - sha256='testsha2562', - sha512='testsha5122', - git_sha1='testgit_sha12', + sha1="testsha12", + md5="testmd52", + sha256="testsha2562", + sha512="testsha5122", + git_sha1="testgit_sha12", is_file=True, - extra_data=json.dumps({'test2': 'data2'}) + extra_data=json.dumps({"test2": "data2"}), ) - self.test_url = 'http://testserver/api/packages/{}/' + self.test_url = "http://testserver/api/packages/{}/" self.client = APIClient() def test_api_resource_list_endpoint(self): - response = self.client.get('/api/resources/') + response = self.client.get("/api/resources/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(2, response.data.get("count")) def test_api_resource_retrieve_endpoint(self): - response = self.client.get( - '/api/resources/{}/'.format(self.resource1.sha1)) + response = self.client.get(f"/api/resources/{self.resource1.sha1}/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data.get('package'), - self.test_url.format(str(self.package1.uuid))) - self.assertEqual(response.data.get('purl'), self.package1.package_url) - self.assertEqual(response.data.get('path'), self.resource1.path) - self.assertEqual(response.data.get('size'), self.resource1.size) - self.assertEqual(response.data.get('sha1'), self.resource1.sha1) - self.assertEqual(response.data.get('md5'), self.resource1.md5) - self.assertEqual(response.data.get('sha256'), self.resource1.sha256) - self.assertEqual(response.data.get('sha512'), self.resource1.sha512) - self.assertEqual(response.data.get( - 'git_sha1'), self.resource1.git_sha1) - self.assertEqual(response.data.get('extra_data'), - self.resource1.extra_data) - self.assertEqual(response.data.get('type'), self.resource1.type) - - def test_api_resource_list_endpoint_returns_none_when_filtering_by_non_uuid_value(self): - response = self.client.get( - '/api/resources/?package={}'.format('not-a-uuid')) + self.assertEqual( + response.data.get("package"), self.test_url.format(str(self.package1.uuid)) + ) + self.assertEqual(response.data.get("purl"), self.package1.package_url) + self.assertEqual(response.data.get("path"), self.resource1.path) + self.assertEqual(response.data.get("size"), self.resource1.size) + self.assertEqual(response.data.get("sha1"), self.resource1.sha1) + self.assertEqual(response.data.get("md5"), self.resource1.md5) + self.assertEqual(response.data.get("sha256"), self.resource1.sha256) + self.assertEqual(response.data.get("sha512"), self.resource1.sha512) + self.assertEqual(response.data.get("git_sha1"), self.resource1.git_sha1) + self.assertEqual(response.data.get("extra_data"), self.resource1.extra_data) + self.assertEqual(response.data.get("type"), self.resource1.type) + + def test_api_resource_list_endpoint_returns_none_when_filtering_by_non_uuid_value( + self, + ): + response = self.client.get("/api/resources/?package={}".format("not-a-uuid")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_api_resource_list_endpoint_returns_none_when_filtering_by_wrong_uuid(self): response = self.client.get( - '/api/resources/?package={}'.format('4eb22e66-3e1c-4818-9b5e-858008a7c2b5')) + "/api/resources/?package={}".format("4eb22e66-3e1c-4818-9b5e-858008a7c2b5") + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, 
response.data.get("count")) def test_api_resource_list_endpoint_returns_none_when_filtering_by_blank_uuid(self): - response = self.client.get('/api/resources/?package={}'.format('')) + response = self.client.get("/api/resources/?package={}".format("")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(2, response.data.get("count")) def test_api_resource_list_endpoint_filters_by_package1_uuid(self): - response = self.client.get( - '/api/resources/?package={}'.format(self.package1.uuid)) + response = self.client.get(f"/api/resources/?package={self.package1.uuid}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_resource = response.data.get('results')[0] - self.assertEqual(test_resource.get('package'), - self.test_url.format(str(self.package1.uuid))) - self.assertEqual(test_resource.get('purl'), self.package1.package_url) - self.assertEqual(test_resource.get('path'), self.resource1.path) - self.assertEqual(test_resource.get('size'), self.resource1.size) - self.assertEqual(test_resource.get('sha1'), self.resource1.sha1) - self.assertEqual(test_resource.get('md5'), self.resource1.md5) - self.assertEqual(test_resource.get('sha256'), self.resource1.sha256) - self.assertEqual(test_resource.get('sha512'), self.resource1.sha512) - self.assertEqual(test_resource.get( - 'git_sha1'), self.resource1.git_sha1) - self.assertEqual(test_resource.get('extra_data'), - self.resource1.extra_data) - self.assertEqual(test_resource.get('type'), self.resource1.type) + self.assertEqual(1, response.data.get("count")) + + test_resource = response.data.get("results")[0] + self.assertEqual( + test_resource.get("package"), self.test_url.format(str(self.package1.uuid)) + ) + self.assertEqual(test_resource.get("purl"), self.package1.package_url) + self.assertEqual(test_resource.get("path"), self.resource1.path) + self.assertEqual(test_resource.get("size"), self.resource1.size) + self.assertEqual(test_resource.get("sha1"), self.resource1.sha1) + self.assertEqual(test_resource.get("md5"), self.resource1.md5) + self.assertEqual(test_resource.get("sha256"), self.resource1.sha256) + self.assertEqual(test_resource.get("sha512"), self.resource1.sha512) + self.assertEqual(test_resource.get("git_sha1"), self.resource1.git_sha1) + self.assertEqual(test_resource.get("extra_data"), self.resource1.extra_data) + self.assertEqual(test_resource.get("type"), self.resource1.type) def test_api_resource_list_endpoint_filters_by_package2_uuid(self): - response = self.client.get( - '/api/resources/?package={}'.format(self.package2.uuid)) + response = self.client.get(f"/api/resources/?package={self.package2.uuid}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_resource = response.data.get('results')[0] - self.assertEqual(test_resource.get('package'), - self.test_url.format(str(self.package2.uuid))) - self.assertEqual(test_resource.get('purl'), self.package2.package_url) - self.assertEqual(test_resource.get('path'), self.resource2.path) - self.assertEqual(test_resource.get('size'), self.resource2.size) - self.assertEqual(test_resource.get('sha1'), self.resource2.sha1) - self.assertEqual(test_resource.get('md5'), self.resource2.md5) - self.assertEqual(test_resource.get('sha256'), self.resource2.sha256) - self.assertEqual(test_resource.get('sha512'), self.resource2.sha512) - self.assertEqual(test_resource.get( - 'git_sha1'), 
self.resource2.git_sha1) - self.assertEqual(test_resource.get('extra_data'), - self.resource2.extra_data) - self.assertEqual(test_resource.get('type'), self.resource2.type) + self.assertEqual(1, response.data.get("count")) + + test_resource = response.data.get("results")[0] + self.assertEqual( + test_resource.get("package"), self.test_url.format(str(self.package2.uuid)) + ) + self.assertEqual(test_resource.get("purl"), self.package2.package_url) + self.assertEqual(test_resource.get("path"), self.resource2.path) + self.assertEqual(test_resource.get("size"), self.resource2.size) + self.assertEqual(test_resource.get("sha1"), self.resource2.sha1) + self.assertEqual(test_resource.get("md5"), self.resource2.md5) + self.assertEqual(test_resource.get("sha256"), self.resource2.sha256) + self.assertEqual(test_resource.get("sha512"), self.resource2.sha512) + self.assertEqual(test_resource.get("git_sha1"), self.resource2.git_sha1) + self.assertEqual(test_resource.get("extra_data"), self.resource2.extra_data) + self.assertEqual(test_resource.get("type"), self.resource2.type) def test_api_resource_list_endpoint_returns_none_when_filtering_by_wrong_purl(self): response = self.client.get( - '/api/resources/?purl={}'.format('pkg:npm/test@1.0.0')) - self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) - - def test_api_resource_list_endpoint_returns_none_when_filtering_by_blank_uuid(self): - response = self.client.get('/api/resources/?purl={}'.format('')) + "/api/resources/?purl={}".format("pkg:npm/test@1.0.0") + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_api_resource_list_endpoint_filters_by_package1_purl(self): - response = self.client.get( - '/api/resources/?purl={}'.format(self.package1.package_url)) + response = self.client.get(f"/api/resources/?purl={self.package1.package_url}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_resource = response.data.get('results')[0] - self.assertEqual(test_resource.get('package'), - self.test_url.format(str(self.package1.uuid))) - self.assertEqual(test_resource.get('purl'), self.package1.package_url) - self.assertEqual(test_resource.get('path'), self.resource1.path) - self.assertEqual(test_resource.get('size'), self.resource1.size) - self.assertEqual(test_resource.get('sha1'), self.resource1.sha1) - self.assertEqual(test_resource.get('md5'), self.resource1.md5) - self.assertEqual(test_resource.get('sha256'), self.resource1.sha256) - self.assertEqual(test_resource.get('sha512'), self.resource1.sha512) - self.assertEqual(test_resource.get( - 'git_sha1'), self.resource1.git_sha1) - self.assertEqual(test_resource.get('extra_data'), - self.resource1.extra_data) - self.assertEqual(test_resource.get('type'), self.resource1.type) + self.assertEqual(1, response.data.get("count")) + + test_resource = response.data.get("results")[0] + self.assertEqual( + test_resource.get("package"), self.test_url.format(str(self.package1.uuid)) + ) + self.assertEqual(test_resource.get("purl"), self.package1.package_url) + self.assertEqual(test_resource.get("path"), self.resource1.path) + self.assertEqual(test_resource.get("size"), self.resource1.size) + self.assertEqual(test_resource.get("sha1"), self.resource1.sha1) + self.assertEqual(test_resource.get("md5"), self.resource1.md5) + self.assertEqual(test_resource.get("sha256"), 
self.resource1.sha256) + self.assertEqual(test_resource.get("sha512"), self.resource1.sha512) + self.assertEqual(test_resource.get("git_sha1"), self.resource1.git_sha1) + self.assertEqual(test_resource.get("extra_data"), self.resource1.extra_data) + self.assertEqual(test_resource.get("type"), self.resource1.type) def test_api_resource_list_endpoint_filters_by_package2_purl(self): - response = self.client.get( - '/api/resources/?purl={}'.format(self.package2.package_url)) + response = self.client.get(f"/api/resources/?purl={self.package2.package_url}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_resource = response.data.get('results')[0] - self.assertEqual(test_resource.get('package'), - self.test_url.format(str(self.package2.uuid))) - self.assertEqual(test_resource.get('purl'), self.package2.package_url) - self.assertEqual(test_resource.get('path'), self.resource2.path) - self.assertEqual(test_resource.get('size'), self.resource2.size) - self.assertEqual(test_resource.get('sha1'), self.resource2.sha1) - self.assertEqual(test_resource.get('md5'), self.resource2.md5) - self.assertEqual(test_resource.get('sha256'), self.resource2.sha256) - self.assertEqual(test_resource.get('sha512'), self.resource2.sha512) - self.assertEqual(test_resource.get( - 'git_sha1'), self.resource2.git_sha1) - self.assertEqual(test_resource.get('extra_data'), - self.resource2.extra_data) - self.assertEqual(test_resource.get('type'), self.resource2.type) + self.assertEqual(1, response.data.get("count")) + + test_resource = response.data.get("results")[0] + self.assertEqual( + test_resource.get("package"), self.test_url.format(str(self.package2.uuid)) + ) + self.assertEqual(test_resource.get("purl"), self.package2.package_url) + self.assertEqual(test_resource.get("path"), self.resource2.path) + self.assertEqual(test_resource.get("size"), self.resource2.size) + self.assertEqual(test_resource.get("sha1"), self.resource2.sha1) + self.assertEqual(test_resource.get("md5"), self.resource2.md5) + self.assertEqual(test_resource.get("sha256"), self.resource2.sha256) + self.assertEqual(test_resource.get("sha512"), self.resource2.sha512) + self.assertEqual(test_resource.get("git_sha1"), self.resource2.git_sha1) + self.assertEqual(test_resource.get("extra_data"), self.resource2.extra_data) + self.assertEqual(test_resource.get("type"), self.resource2.type) def test_api_resource_filter_by_checksums(self): sha1s = [ - 'testsha11', - 'testsha12', + "testsha11", + "testsha12", ] - data = { - 'sha1': sha1s - } - response = self.client.post( - '/api/resources/filter_by_checksums/', data=data) - self.assertEqual(2, response.data['count']) - expected = self.get_test_loc( - 'api/resource-filter_by_checksums-expected.json') - self.check_expected_results(response.data['results'], expected, fields_to_remove=[ - "url", "uuid", "package"], regen=FIXTURES_REGEN) + data = {"sha1": sha1s} + response = self.client.post("/api/resources/filter_by_checksums/", data=data) + self.assertEqual(2, response.data["count"]) + expected = self.get_test_loc("api/resource-filter_by_checksums-expected.json") + self.check_expected_results( + response.data["results"], + expected, + fields_to_remove=["url", "uuid", "package"], + regen=FIXTURES_REGEN, + ) - data = { - 'does-not-exist': 'dne' - } - response = self.client.post( - '/api/resources/filter_by_checksums/', data=data) + data = {"does-not-exist": "dne"} + response = self.client.post("/api/resources/filter_by_checksums/", data=data) 
         self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code)
-        expected_status = 'Unsupported field(s) given: does-not-exist'
-        self.assertEqual(expected_status, response.data['status'])
+        expected_status = "Unsupported field(s) given: does-not-exist"
+        self.assertEqual(expected_status, response.data["status"])
 
         data = {}
-        response = self.client.post(
-            '/api/resources/filter_by_checksums/', data=data)
+        response = self.client.post("/api/resources/filter_by_checksums/", data=data)
         self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code)
-        expected_status = 'No values provided'
-        self.assertEqual(expected_status, response.data['status'])
+        expected_status = "No values provided"
+        self.assertEqual(expected_status, response.data["status"])
 
 
 class PackageApiTestCase(JsonBasedTesting, TestCase):
-    test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles')
+    test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles")
 
     def setUp(self):
         self.package_data = {
-            'type': 'generic',
-            'namespace': 'generic',
-            'name': 'Foo',
-            'version': '12.34',
-            'qualifiers': 'test_qual=qual',
-            'subpath': 'test_subpath',
-            'download_url': 'http://example.com',
-            'filename': 'Foo.zip',
-            'sha1': 'testsha1',
-            'md5': 'testmd5',
-            'size': 101,
+            "type": "generic",
+            "namespace": "generic",
+            "name": "Foo",
+            "version": "12.34",
+            "qualifiers": "test_qual=qual",
+            "subpath": "test_subpath",
+            "download_url": "http://example.com",
+            "filename": "Foo.zip",
+            "sha1": "testsha1",
+            "md5": "testmd5",
+            "size": 101,
         }
         self.package = Package.objects.create(**self.package_data)
         self.package.refresh_from_db()
-        self.package.append_to_history('test-message')
+        self.package.append_to_history("test-message")
         self.package.save()
 
         self.package_data2 = {
-            'type': 'npm',
-            'namespace': 'example',
-            'name': 'Bar',
-            'version': '56.78',
-            'qualifiers': '',
-            'subpath': '',
-            'download_url': 'http://somethingelse.org',
-            'filename': 'Bar.zip',
-            'sha1': 'testsha1-2',
-            'md5': 'testmd5-2',
-            'size': 100,
+            "type": "npm",
+            "namespace": "example",
+            "name": "Bar",
+            "version": "56.78",
+            "qualifiers": "",
+            "subpath": "",
+            "download_url": "http://somethingelse.org",
+            "filename": "Bar.zip",
+            "sha1": "testsha1-2",
+            "md5": "testmd5-2",
+            "size": 100,
         }
         self.package2 = Package.objects.create(**self.package_data2)
         self.package2.refresh_from_db()
 
         self.package_data3 = {
-            'type': 'jar',
-            'namespace': 'sample',
-            'name': 'Baz',
-            'version': '90.12',
-            'qualifiers': '',
-            'subpath': '',
-            'download_url': 'http://anotherexample.com',
-            'filename': 'Baz.zip',
-            'sha1': 'testsha1-3',
-            'md5': 'testmd5-3',
-            'size': 100,
+            "type": "jar",
+            "namespace": "sample",
+            "name": "Baz",
+            "version": "90.12",
+            "qualifiers": "",
+            "subpath": "",
+            "download_url": "http://anotherexample.com",
+            "filename": "Baz.zip",
+            "sha1": "testsha1-3",
+            "md5": "testmd5-3",
+            "size": 100,
         }
         self.package3 = Package.objects.create(**self.package_data3)
         self.package3.refresh_from_db()
 
         self.package_data4 = {
-            'type': 'jar',
-            'namespace': 'sample',
-            'name': 'Baz',
-            'version': '90.123',
-            'qualifiers': '',
-            'subpath': '',
-            'download_url': 'http://anothersample.com',
-            'filename': 'Baz.zip',
-            'sha1': 'testsha1-4',
-            'md5': 'testmd5-3',
-            'size': 100,
-            'package_content': PackageContentType.BINARY,
+            "type": "jar",
+            "namespace": "sample",
+            "name": "Baz",
+            "version": "90.123",
+            "qualifiers": "",
+            "subpath": "",
+            "download_url": "http://anothersample.com",
+            "filename": "Baz.zip",
+            "sha1": "testsha1-4",
+            "md5": "testmd5-3",
+
"size": 100, + "package_content": PackageContentType.BINARY, } self.package4 = Package.objects.create(**self.package_data4) self.package4.refresh_from_db() self.package_data5 = { - 'type': 'maven', - 'namespace': 'foot', - 'name': 'baz', - 'version': '90.123', - 'qualifiers': 'classifier=source', - 'subpath': '', - 'download_url': 'http://test-maven.com', - 'filename': 'Baz.zip', - 'sha1': 'testsha1-5', - 'md5': 'testmd5-11', - 'size': 100, - 'package_content': PackageContentType.SOURCE_ARCHIVE, - 'declared_license_expression': 'MIT', + "type": "maven", + "namespace": "foot", + "name": "baz", + "version": "90.123", + "qualifiers": "classifier=source", + "subpath": "", + "download_url": "http://test-maven.com", + "filename": "Baz.zip", + "sha1": "testsha1-5", + "md5": "testmd5-11", + "size": 100, + "package_content": PackageContentType.SOURCE_ARCHIVE, + "declared_license_expression": "MIT", } self.package5 = Package.objects.create(**self.package_data5) self.package5.refresh_from_db() self.package_data6 = { - 'type': 'maven', - 'namespace': 'fooo', - 'name': 'baz', - 'version': '90.123', - 'qualifiers': '', - 'subpath': '', - 'download_url': 'http://test-maven-11.com', - 'filename': 'Baz.zip', - 'sha1': 'testsha1-6', - 'md5': 'testmd5-11', - 'size': 100, - 'package_content': PackageContentType.BINARY, + "type": "maven", + "namespace": "fooo", + "name": "baz", + "version": "90.123", + "qualifiers": "", + "subpath": "", + "download_url": "http://test-maven-11.com", + "filename": "Baz.zip", + "sha1": "testsha1-6", + "md5": "testmd5-11", + "size": 100, + "package_content": PackageContentType.BINARY, } self.package6 = Package.objects.create(**self.package_data6) self.package6.refresh_from_db() self.package_data7 = { - 'type': 'github', - 'namespace': 'glue', - 'name': 'cat', - 'version': '90.123', - 'qualifiers': '', - 'subpath': '', - 'download_url': 'http://test-maven-111.com', - 'filename': 'Baz.zip', - 'sha1': 'testsha1-7', - 'md5': 'testmd5-11', - 'size': 100, - 'copyright': 'BACC', - 'package_content': PackageContentType.SOURCE_REPO, + "type": "github", + "namespace": "glue", + "name": "cat", + "version": "90.123", + "qualifiers": "", + "subpath": "", + "download_url": "http://test-maven-111.com", + "filename": "Baz.zip", + "sha1": "testsha1-7", + "md5": "testmd5-11", + "size": 100, + "copyright": "BACC", + "package_content": PackageContentType.SOURCE_REPO, } self.package7 = Package.objects.create(**self.package_data7) @@ -390,79 +373,74 @@ def setUp(self): self.packageset_1.packages.add(self.package5) self.packageset_1.packages.add(self.package7) - self.test_url = 'http://testserver/api/packages/{}/' + self.test_url = "http://testserver/api/packages/{}/" self.client = APIClient() def test_package_api_list_endpoint(self): - response = self.client.get('/api/packages/') + response = self.client.get("/api/packages/") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(7, response.data.get('count')) + self.assertEqual(7, response.data.get("count")) def test_package_api_list_endpoint_filter(self): for key, value in self.package_data.items(): - response = self.client.get( - '/api/packages/?{}={}'.format(key, value)) + response = self.client.get(f"/api/packages/?{key}={value}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) + self.assertEqual(1, response.data.get("count")) def test_package_api_list_endpoint_filter_by_purl_fields_ignores_case(self): for key, value in self.package_data.items(): # Skip non-purl 
fields - if key not in ['type', 'namespace', 'name']: + if key not in ["type", "namespace", "name"]: continue - response = self.client.get( - '/api/packages/?{}={}'.format(key, value.lower())) + response = self.client.get(f"/api/packages/?{key}={value.lower()}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) + self.assertEqual(1, response.data.get("count")) - response = self.client.get( - '/api/packages/?{}={}'.format(key, value.upper())) + response = self.client.get(f"/api/packages/?{key}={value.upper()}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) + self.assertEqual(1, response.data.get("count")) def test_package_api_list_endpoint_search(self): # Create a dummy package to verify search filter works. Package.objects.create( - type='generic', - namespace='dummy-namespace', - name='dummy-name', - version='12.35', - download_url='https://dummy.com/dummy' + type="generic", + namespace="dummy-namespace", + name="dummy-name", + version="12.35", + download_url="https://dummy.com/dummy", ) + response = self.client.get("/api/packages/?search={}".format("generic")) + assert response.data.get("count") == 2 + response = self.client.get("/api/packages/?search={}".format("dummy")) + assert response.data.get("count") == 1 + response = self.client.get("/api/packages/?search={}".format("DUMMY")) + assert response.data.get("count") == 1 + response = self.client.get("/api/packages/?search={}".format("12.35")) + assert response.data.get("count") == 1 response = self.client.get( - '/api/packages/?search={}'.format('generic')) - assert response.data.get('count') == 2 - response = self.client.get('/api/packages/?search={}'.format('dummy')) - assert response.data.get('count') == 1 - response = self.client.get('/api/packages/?search={}'.format('DUMMY')) - assert response.data.get('count') == 1 - response = self.client.get('/api/packages/?search={}'.format('12.35')) - assert response.data.get('count') == 1 - response = self.client.get( - '/api/packages/?search={}'.format('https://dummy.com/dummy')) - assert response.data.get('count') == 1 + "/api/packages/?search={}".format("https://dummy.com/dummy") + ) + assert response.data.get("count") == 1 def test_package_api_retrieve_endpoint(self): - response = self.client.get( - '/api/packages/{}/'.format(self.package.uuid)) + response = self.client.get(f"/api/packages/{self.package.uuid}/") self.assertEqual(response.status_code, status.HTTP_200_OK) for key, value in response.data.items(): # Handle the API-only `url` key - if key == 'url': - self.assertEqual(value, self.test_url.format( - str(self.package.uuid))) + if key == "url": + self.assertEqual(value, self.test_url.format(str(self.package.uuid))) continue - if key in ['type', 'namespace', 'name', 'version', 'qualifiers', 'subpath']: + if key in ["type", "namespace", "name", "version", "qualifiers", "subpath"]: self.assertEqual(value, getattr(self.package, key)) continue - if key == 'history': - url = reverse('api:package-history', args=[self.package.uuid]) + if key == "history": + url = reverse("api:package-history", args=[self.package.uuid]) self.assertIn(url, value) self.assertTrue(hasattr(self.package, key)) @@ -471,117 +449,127 @@ def test_package_api_retrieve_endpoint(self): def test_api_package_latest_version_action(self): p1 = Package.objects.create( - download_url='http://a.a', type='generic', name='name', version='1.0') + download_url="http://a.a", type="generic", name="name", 
version="1.0" + ) p2 = Package.objects.create( - download_url='http://b.b', type='generic', name='name', version='2.0') + download_url="http://b.b", type="generic", name="name", version="2.0" + ) p3 = Package.objects.create( - download_url='http://c.c', type='generic', name='name', version='3.0') + download_url="http://c.c", type="generic", name="name", version="3.0" + ) response = self.client.get( - reverse('api:package-latest-version', args=[p1.uuid])) - self.assertEqual('3.0', response.data['version']) + reverse("api:package-latest-version", args=[p1.uuid]) + ) + self.assertEqual("3.0", response.data["version"]) response = self.client.get( - reverse('api:package-latest-version', args=[p2.uuid])) - self.assertEqual('3.0', response.data['version']) + reverse("api:package-latest-version", args=[p2.uuid]) + ) + self.assertEqual("3.0", response.data["version"]) response = self.client.get( - reverse('api:package-latest-version', args=[p3.uuid])) - self.assertEqual('3.0', response.data['version']) + reverse("api:package-latest-version", args=[p3.uuid]) + ) + self.assertEqual("3.0", response.data["version"]) def test_api_package_resources_action(self): # create 10 resources for i in range(0, 10): - Resource.objects.create( - package=self.package, path='path{}/'.format(i)) + Resource.objects.create(package=self.package, path=f"path{i}/") response = self.client.get( - reverse('api:package-resources', args=[self.package.uuid])) + reverse("api:package-resources", args=[self.package.uuid]) + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(10, response.data['count']) + self.assertEqual(10, response.data["count"]) - for result, i in zip(response.data['results'], range(0, 10)): - self.assertEqual(result.get('path'), 'path{}/'.format(i)) + for result, i in zip(response.data["results"], range(0, 10)): + self.assertEqual(result.get("path"), f"path{i}/") def test_api_package_list_endpoint_multiple_char_filters(self): - filters = f'?md5={self.package.md5}&md5={self.package2.md5}' - response = self.client.get(f'/api/packages/{filters}') - self.assertEqual(2, response.data['count']) - purls = [result.get('purl') for result in response.data['results']] + filters = f"?md5={self.package.md5}&md5={self.package2.md5}" + response = self.client.get(f"/api/packages/{filters}") + self.assertEqual(2, response.data["count"]) + purls = [result.get("purl") for result in response.data["results"]] self.assertIn(self.package.purl, purls) self.assertIn(self.package2.purl, purls) self.assertNotIn(self.package3.purl, purls) - filters = f'?sha1={self.package2.sha1}&sha1={self.package3.sha1}' - response = self.client.get(f'/api/packages/{filters}') + filters = f"?sha1={self.package2.sha1}&sha1={self.package3.sha1}" + response = self.client.get(f"/api/packages/{filters}") self.assertEqual(2, response.data["count"]) - purls = [result.get('purl') for result in response.data['results']] + purls = [result.get("purl") for result in response.data["results"]] self.assertIn(self.package2.purl, purls) self.assertIn(self.package3.purl, purls) self.assertNotIn(self.package.purl, purls) def test_package_api_filter_by_checksums(self): sha1s = [ - 'testsha1', - 'testsha1-2', - 'testsha1-3', - 'testsha1-4', - 'testsha1-6', + "testsha1", + "testsha1-2", + "testsha1-3", + "testsha1-4", + "testsha1-6", ] data = { - 'sha1': sha1s, + "sha1": sha1s, } - response = self.client.post( - '/api/packages/filter_by_checksums/', data=data) - self.assertEqual(5, response.data['count']) - expected = self.get_test_loc( - 
'api/package-filter_by_checksums-expected.json') - self.check_expected_results(response.data['results'], expected, fields_to_remove=[ - "url", "uuid", "resources", "package_sets", "history"], regen=FIXTURES_REGEN) + response = self.client.post("/api/packages/filter_by_checksums/", data=data) + self.assertEqual(5, response.data["count"]) + expected = self.get_test_loc("api/package-filter_by_checksums-expected.json") + self.check_expected_results( + response.data["results"], + expected, + fields_to_remove=["url", "uuid", "resources", "package_sets", "history"], + regen=FIXTURES_REGEN, + ) data["enhance_package_data"] = True enhanced_response = self.client.post( - '/api/packages/filter_by_checksums/', data=data) - self.assertEqual(5, len(enhanced_response.data['results'])) + "/api/packages/filter_by_checksums/", data=data + ) + self.assertEqual(5, len(enhanced_response.data["results"])) expected = self.get_test_loc( - 'api/package-filter_by_checksums-enhanced-package-data-expected.json') - self.check_expected_results(enhanced_response.data['results'], expected, fields_to_remove=[ - "url", "uuid", "resources", "package_sets", "history"], regen=FIXTURES_REGEN) + "api/package-filter_by_checksums-enhanced-package-data-expected.json" + ) + self.check_expected_results( + enhanced_response.data["results"], + expected, + fields_to_remove=["url", "uuid", "resources", "package_sets", "history"], + regen=FIXTURES_REGEN, + ) - data = { - 'does-not-exist': 'dne' - } - response = self.client.post( - '/api/packages/filter_by_checksums/', data=data) + data = {"does-not-exist": "dne"} + response = self.client.post("/api/packages/filter_by_checksums/", data=data) self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = 'Unsupported field(s) given: does-not-exist' - self.assertEqual(expected_status, response.data['status']) + expected_status = "Unsupported field(s) given: does-not-exist" + self.assertEqual(expected_status, response.data["status"]) data = {} - response = self.client.post( - '/api/packages/filter_by_checksums/', data=data) + response = self.client.post("/api/packages/filter_by_checksums/", data=data) self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = 'No values provided' - self.assertEqual(expected_status, response.data['status']) + expected_status = "No values provided" + self.assertEqual(expected_status, response.data["status"]) class PackageApiReindexingTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - package_download_url = 'http://anotherexample.com' + package_download_url = "http://anotherexample.com" self.package_data = { - 'type': 'maven', - 'namespace': 'sample', - 'name': 'Baz', - 'version': '90.12', - 'qualifiers': '', - 'subpath': '', - 'download_url': package_download_url, - 'filename': 'Baz.zip', - 'sha1': 'testsha1-3', - 'md5': 'testmd5-3', - 'size': 100, + "type": "maven", + "namespace": "sample", + "name": "Baz", + "version": "90.12", + "qualifiers": "", + "subpath": "", + "download_url": package_download_url, + "filename": "Baz.zip", + "sha1": "testsha1-3", + "md5": "testmd5-3", + "size": 100, } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() @@ -592,20 +580,24 @@ def setUp(self): self.scannableuri.scan_status = ScannableURI.SCAN_INDEXED self.scan_uuid = uuid4() self.scannableuri.scan_uuid = self.scan_uuid - 
self.scannableuri.scan_error = 'error' - self.scannableuri.index_error = 'error' + self.scannableuri.scan_error = "error" + self.scannableuri.index_error = "error" self.scan_date = timezone.now() self.scannableuri.scan_date = self.scan_date def test_reindex_package(self): self.assertEqual(1, ScannableURI.objects.all().count()) response = self.client.get( - f'/api/packages/{self.package.uuid}/reindex_package/') + f"/api/packages/{self.package.uuid}/reindex_package/" + ) self.assertEqual( - 'pkg:maven/sample/Baz@90.12 has been queued for reindexing', response.data['status']) + "pkg:maven/sample/Baz@90.12 has been queued for reindexing", + response.data["status"], + ) self.assertEqual(2, ScannableURI.objects.all().count()) new_scannable_uri = ScannableURI.objects.exclude( - pk=self.scannableuri.pk).first() + pk=self.scannableuri.pk + ).first() self.assertEqual(self.package, new_scannable_uri.package) self.assertEqual(True, new_scannable_uri.reindex_uri) self.assertEqual(100, new_scannable_uri.priority) @@ -616,54 +608,54 @@ def test_reindex_package(self): # Ensure previous ScannableURI was not modified self.assertEqual(False, self.scannableuri.reindex_uri) self.assertEqual(0, self.scannableuri.priority) - self.assertEqual('error', self.scannableuri.scan_error) - self.assertEqual('error', self.scannableuri.index_error) + self.assertEqual("error", self.scannableuri.scan_error) + self.assertEqual("error", self.scannableuri.index_error) self.assertEqual(self.scan_date, self.scannableuri.scan_date) class PackageApiPurlFilterTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): self.package_data1 = { - 'type': 'maven', - 'namespace': 'org.apache.commons', - 'name': 'io', - 'version': '1.3.4', - 'download_url': 'http://example1.com', - 'extra_data': json.dumps({'test2': 'data2'}) + "type": "maven", + "namespace": "org.apache.commons", + "name": "io", + "version": "1.3.4", + "download_url": "http://example1.com", + "extra_data": json.dumps({"test2": "data2"}), } self.package_data2 = { - 'type': 'maven', - 'namespace': 'org.apache.commons', - 'name': 'io', - 'version': '2.3.4', - 'download_url': 'http://example2.com', - 'extra_data': json.dumps({'test2': 'data2'}) + "type": "maven", + "namespace": "org.apache.commons", + "name": "io", + "version": "2.3.4", + "download_url": "http://example2.com", + "extra_data": json.dumps({"test2": "data2"}), } self.package_data3 = { - 'type': 'maven', - 'namespace': '', - 'name': 'test', - 'version': '1.0.0', - 'qualifiers': '', - 'package_content': PackageContentType.BINARY, - 'download_url': 'https://example.com/test-1.0.0.jar', + "type": "maven", + "namespace": "", + "name": "test", + "version": "1.0.0", + "qualifiers": "", + "package_content": PackageContentType.BINARY, + "download_url": "https://example.com/test-1.0.0.jar", } self.package_data4 = { - 'type': 'maven', - 'namespace': '', - 'name': 'test', - 'version': '1.0.0', - 'qualifiers': 'classifier=sources', - 'declared_license_expression': 'apache-2.0', - 'copyright': 'Copyright (c) example corp.', - 'holder': 'example corp.', - 'package_content': PackageContentType.SOURCE_ARCHIVE, - 'download_url': 'https://example.com/test-1.0.0-sources.jar', + "type": "maven", + "namespace": "", + "name": "test", + "version": "1.0.0", + "qualifiers": "classifier=sources", + "declared_license_expression": "apache-2.0", + "copyright": "Copyright (c) example corp.", + "holder": 
"example corp.", + "package_content": PackageContentType.SOURCE_ARCHIVE, + "download_url": "https://example.com/test-1.0.0-sources.jar", } self.package1 = Package.objects.create(**self.package_data1) @@ -674,7 +666,7 @@ def setUp(self): self.purl1 = self.package1.package_url self.purl2 = self.package2.package_url - self.missing_purl = 'pkg:PYPI/Django_package@1.11.1.dev1' + self.missing_purl = "pkg:PYPI/Django_package@1.11.1.dev1" self.package_set1 = PackageSet.objects.create() self.package_set1.add_to_package_set(self.package1) @@ -690,192 +682,197 @@ def tearDown(self): Package.objects.all().delete() def test_package_api_purl_filter_by_query_param_invalid_purl(self): - response = self.client.get('/api/packages/?purl={}'.format('11111')) + response = self.client.get("/api/packages/?purl={}".format("11111")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_package_api_purl_filter_by_query_param_no_value(self): - response = self.client.get('/api/packages/?purl={}'.format('')) + response = self.client.get("/api/packages/?purl={}".format("")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(4, response.data.get('count')) + self.assertEqual(4, response.data.get("count")) def test_package_api_purl_filter_by_query_param_non_existant_purl(self): - response = self.client.get( - '/api/packages/?purl={}'.format(self.missing_purl)) + response = self.client.get(f"/api/packages/?purl={self.missing_purl}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(0, response.data.get('count')) + self.assertEqual(0, response.data.get("count")) def test_package_api_purl_filter_by_query_param_no_version(self): response = self.client.get( - '/api/packages/?purl={}'.format('pkg:maven/org.apache.commons/io')) + "/api/packages/?purl={}".format("pkg:maven/org.apache.commons/io") + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) + self.assertEqual(2, response.data.get("count")) def test_package_api_purl_filter_by_query_param1(self): - response = self.client.get('/api/packages/?purl={}'.format(self.purl1)) + response = self.client.get(f"/api/packages/?purl={self.purl1}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data1.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data1.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data1.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data1.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data1.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data1.get('extra_data')) + self.assertEqual(1, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data1.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data1.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data1.get("name")) + self.assertEqual(test_package.get("version"), self.package_data1.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data1.get("download_url") + ) + self.assertEqual( + 
test_package.get("extra_data"), self.package_data1.get("extra_data") + ) def test_package_api_purl_filter_by_query_param2(self): - response = self.client.get('/api/packages/?purl={}'.format(self.purl2)) + response = self.client.get(f"/api/packages/?purl={self.purl2}") self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data2.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data2.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data2.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data2.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data2.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data2.get('extra_data')) + self.assertEqual(1, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data2.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data2.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data2.get("name")) + self.assertEqual(test_package.get("version"), self.package_data2.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data2.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data2.get("extra_data") + ) def test_package_api_purl_filter_by_both_query_params(self): response = self.client.get( - '/api/packages/?purl={}&purl={}'.format(self.purl1, self.purl2)) + f"/api/packages/?purl={self.purl1}&purl={self.purl2}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data1.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data1.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data1.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data1.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data1.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data1.get('extra_data')) - - test_package = response.data.get('results')[1] - self.assertEqual(test_package.get('type'), - self.package_data2.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data2.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data2.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data2.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data2.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data2.get('extra_data')) + self.assertEqual(2, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data1.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data1.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data1.get("name")) + self.assertEqual(test_package.get("version"), self.package_data1.get("version")) + self.assertEqual( + test_package.get("download_url"), 
self.package_data1.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data1.get("extra_data") + ) + + test_package = response.data.get("results")[1] + self.assertEqual(test_package.get("type"), self.package_data2.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data2.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data2.get("name")) + self.assertEqual(test_package.get("version"), self.package_data2.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data2.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data2.get("extra_data") + ) def test_package_api_purl_filter_by_two_purl_values_on_multiple_packages(self): - extra_test_package = Package.objects.create( - download_url='https://extra-pkg.com/download', - type='generic', - name='extra-name', - version='2.2.2' + # extra test package + Package.objects.create( + download_url="https://extra-pkg.com/download", + type="generic", + name="extra-name", + version="2.2.2", ) response = self.client.get( - '/api/packages/?purl={}&purl={}'.format(self.purl1, self.purl2)) + f"/api/packages/?purl={self.purl1}&purl={self.purl2}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(2, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data1.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data1.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data1.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data1.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data1.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data1.get('extra_data')) - - test_package = response.data.get('results')[1] - self.assertEqual(test_package.get('type'), - self.package_data2.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data2.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data2.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data2.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data2.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data2.get('extra_data')) + self.assertEqual(2, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data1.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data1.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data1.get("name")) + self.assertEqual(test_package.get("version"), self.package_data1.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data1.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data1.get("extra_data") + ) + + test_package = response.data.get("results")[1] + self.assertEqual(test_package.get("type"), self.package_data2.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data2.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data2.get("name")) + self.assertEqual(test_package.get("version"), self.package_data2.get("version")) + self.assertEqual( + 
test_package.get("download_url"), self.package_data2.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data2.get("extra_data") + ) def test_package_api_purl_filter_by_one_purl_multiple_params(self): response = self.client.get( - '/api/packages/?purl={}&purl={}'.format(self.purl1, self.missing_purl)) + f"/api/packages/?purl={self.purl1}&purl={self.missing_purl}" + ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(1, response.data.get('count')) - - test_package = response.data.get('results')[0] - self.assertEqual(test_package.get('type'), - self.package_data1.get('type')) - self.assertEqual(test_package.get('namespace'), - self.package_data1.get('namespace')) - self.assertEqual(test_package.get('name'), - self.package_data1.get('name')) - self.assertEqual(test_package.get('version'), - self.package_data1.get('version')) - self.assertEqual(test_package.get('download_url'), - self.package_data1.get('download_url')) - self.assertEqual(test_package.get('extra_data'), - self.package_data1.get('extra_data')) + self.assertEqual(1, response.data.get("count")) + + test_package = response.data.get("results")[0] + self.assertEqual(test_package.get("type"), self.package_data1.get("type")) + self.assertEqual( + test_package.get("namespace"), self.package_data1.get("namespace") + ) + self.assertEqual(test_package.get("name"), self.package_data1.get("name")) + self.assertEqual(test_package.get("version"), self.package_data1.get("version")) + self.assertEqual( + test_package.get("download_url"), self.package_data1.get("download_url") + ) + self.assertEqual( + test_package.get("extra_data"), self.package_data1.get("extra_data") + ) def test_package_api_purl_filter_by_multiple_blank_purl(self): - response = self.client.get( - '/api/packages/?purl={}&purl={}'.format('', '')) + response = self.client.get("/api/packages/?purl={}&purl={}".format("", "")) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(4, response.data.get('count')) + self.assertEqual(4, response.data.get("count")) def test_package_api_get_enhanced_package(self): response = self.client.get( - reverse('api:package-get-enhanced-package-data', args=[self.package3.uuid])) + reverse("api:package-get-enhanced-package-data", args=[self.package3.uuid]) + ) result = response.data - expected = self.get_test_loc('api/enhanced_package.json') - self.check_expected_results(result, expected, fields_to_remove=[ - 'package_sets'], regen=FIXTURES_REGEN) + expected = self.get_test_loc("api/enhanced_package.json") + self.check_expected_results( + result, expected, fields_to_remove=["package_sets"], regen=FIXTURES_REGEN + ) class CollectApiTestCase(JsonBasedTesting, TestCase): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles') + test_data_dir = os.path.join(os.path.dirname(__file__), "testfiles") def setUp(self): - self.package_download_url = 'http://anotherexample.com' + self.package_download_url = "http://anotherexample.com" self.package_data = { - 'type': 'maven', - 'namespace': 'sample', - 'name': 'Baz', - 'version': '90.12', - 'qualifiers': '', - 'subpath': '', - 'download_url': self.package_download_url, - 'filename': 'Baz.zip', - 'sha1': 'testsha1-3', - 'md5': 'testmd5-3', - 'size': 100, + "type": "maven", + "namespace": "sample", + "name": "Baz", + "version": "90.12", + "qualifiers": "", + "subpath": "", + "download_url": self.package_download_url, + "filename": "Baz.zip", + "sha1": "testsha1-3", + "md5": "testmd5-3", + "size": 100, } 
self.package = Package.objects.create(**self.package_data) self.scannableuri = ScannableURI.objects.create( @@ -885,24 +882,24 @@ def setUp(self): self.scannableuri.scan_status = ScannableURI.SCAN_INDEX_FAILED self.scan_uuid = uuid4() self.scannableuri.scan_uuid = self.scan_uuid - self.scannableuri.scan_error = 'error' - self.scannableuri.index_error = 'error' + self.scannableuri.scan_error = "error" + self.scannableuri.index_error = "error" self.scan_request_date = timezone.now() self.scannableuri.scan_request_date = self.scan_request_date - self.package_download_url2 = 'http://somethingelse.org' + self.package_download_url2 = "http://somethingelse.org" self.package_data2 = { - 'type': 'npm', - 'namespace': 'example', - 'name': 'bar', - 'version': '56.78', - 'qualifiers': '', - 'subpath': '', - 'download_url': self.package_download_url2, - 'filename': 'Bar.zip', - 'sha1': 'testsha1-2', - 'md5': 'testmd5-2', - 'size': 100, + "type": "npm", + "namespace": "example", + "name": "bar", + "version": "56.78", + "qualifiers": "", + "subpath": "", + "download_url": self.package_download_url2, + "filename": "Bar.zip", + "sha1": "testsha1-2", + "md5": "testmd5-2", + "size": 100, } self.package2 = Package.objects.create(**self.package_data2) self.scannableuri2 = ScannableURI.objects.create( @@ -912,217 +909,207 @@ def setUp(self): self.scannableuri2.scan_status = ScannableURI.SCAN_INDEX_FAILED self.scan_uuid2 = uuid4() self.scannableuri2.scan_uuid = self.scan_uuid2 - self.scannableuri2.scan_error = 'error' - self.scannableuri2.index_error = 'error' + self.scannableuri2.scan_error = "error" + self.scannableuri2.index_error = "error" self.scan_request_date2 = timezone.now() self.scannableuri2.scan_request_date = self.scan_request_date2 - self.package_download_url3 = 'http://clone.org/clone1.zip' + self.package_download_url3 = "http://clone.org/clone1.zip" self.package_data3 = { - 'type': 'pypi', - 'namespace': '', - 'name': 'clone', - 'version': '1', - 'qualifiers': '', - 'subpath': '', - 'download_url': self.package_download_url3, - 'filename': 'clone1.zip', - 'sha1': 'clone1', - 'md5': '', - 'size': 100, + "type": "pypi", + "namespace": "", + "name": "clone", + "version": "1", + "qualifiers": "", + "subpath": "", + "download_url": self.package_download_url3, + "filename": "clone1.zip", + "sha1": "clone1", + "md5": "", + "size": 100, } self.package3 = Package.objects.create(**self.package_data3) - self.package_download_url4 = 'http://clone.org/clone1-src.zip' + self.package_download_url4 = "http://clone.org/clone1-src.zip" self.package_data4 = { - 'type': 'pypi', - 'namespace': '', - 'name': 'clone', - 'version': '1', - 'qualifiers': 'package=src', - 'subpath': '', - 'download_url': self.package_download_url4, - 'filename': 'clone1-src.zip', - 'sha1': 'clone1-src', - 'md5': '', - 'size': 50, + "type": "pypi", + "namespace": "", + "name": "clone", + "version": "1", + "qualifiers": "package=src", + "subpath": "", + "download_url": self.package_download_url4, + "filename": "clone1-src.zip", + "sha1": "clone1-src", + "md5": "", + "size": 50, } self.package4 = Package.objects.create(**self.package_data4) - self.package_download_url5 = 'http://clone.org/clone1-all.zip' + self.package_download_url5 = "http://clone.org/clone1-all.zip" self.package_data5 = { - 'type': 'pypi', - 'namespace': '', - 'name': 'clone', - 'version': '1', - 'qualifiers': 'package=all', - 'subpath': '', - 'download_url': self.package_download_url5, - 'filename': 'clone1-all.zip', - 'sha1': 'clone1-all', - 'md5': '', - 'size': 25, + 
"type": "pypi", + "namespace": "", + "name": "clone", + "version": "1", + "qualifiers": "package=all", + "subpath": "", + "download_url": self.package_download_url5, + "filename": "clone1-all.zip", + "sha1": "clone1-all", + "md5": "", + "size": 25, } self.package5 = Package.objects.create(**self.package_data5) - def test_package_live(self): - purl_str = 'pkg:maven/org.apache.twill/twill-core@0.12.0' - download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar' - purl_sources_str = f'{purl_str}?classifier=sources' - sources_download_url = 'https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar' - - self.assertEqual(0, Package.objects.filter( - download_url=download_url).count()) - self.assertEqual(0, Package.objects.filter( - download_url=sources_download_url).count()) - response = self.client.get(f'/api/collect/?purl={purl_str}') - self.assertEqual(1, Package.objects.filter( - download_url=download_url).count()) - self.assertEqual(1, Package.objects.filter( - download_url=sources_download_url).count()) - expected = self.get_test_loc('api/twill-core-0.12.0.json') + purl_str = "pkg:maven/org.apache.twill/twill-core@0.12.0" + download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar" + sources_download_url = "https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar" - self.assertEqual(2, len(response.data)) - result = response.data[0] + self.assertEqual(0, Package.objects.filter(download_url=download_url).count()) + self.assertEqual( + 0, Package.objects.filter(download_url=sources_download_url).count() + ) + response = self.client.get(f"/api/collect/?purl={purl_str}") + self.assertEqual(1, Package.objects.filter(download_url=download_url).count()) + self.assertEqual( + 1, Package.objects.filter(download_url=sources_download_url).count() + ) + expected = self.get_test_loc("api/twill-core-0.12.0.json") + results = response.data + self.assertEqual(2, len(results)) # remove fields - result.pop('url') - fields_to_remove = [ - 'uuid', - 'resources', - 'package_sets', - 'history' - ] + for result in results: + result.pop("url") + # sort by filename + results = sorted(results, key=lambda x: x["filename"]) + + fields_to_remove = ["uuid", "resources", "package_sets", "history"] self.check_expected_results( - result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN) + results, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN + ) # Ensure that the created ScannableURI objects have a priority of 100 package = Package.objects.get(download_url=download_url) source_package = Package.objects.get(download_url=sources_download_url) package_scannable_uri = ScannableURI.objects.get(package=package) - source_package_scannable_uri = ScannableURI.objects.get( - package=source_package) + source_package_scannable_uri = ScannableURI.objects.get(package=source_package) self.assertEqual(100, package_scannable_uri.priority) self.assertEqual(100, source_package_scannable_uri.priority) def test_package_live_works_with_purl2vcs(self): purl = "pkg:maven/org.elasticsearch.plugin/elasticsearch-scripting-painless-spi@6.8.15" - download_url = 'https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15.jar' - purl_sources_str = f'{purl}?classifier=sources' - sources_download_url = 
'https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15-sources.jar' - - self.assertEqual(0, Package.objects.filter( - download_url=download_url).count()) - self.assertEqual(0, Package.objects.filter( - download_url=sources_download_url).count()) - response = self.client.get(f'/api/collect/?purl={purl}') - self.assertEqual(1, Package.objects.filter( - download_url=download_url).count()) - self.assertEqual(1, Package.objects.filter( - download_url=sources_download_url).count()) + download_url = "https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15.jar" + sources_download_url = "https://repo1.maven.org/maven2/org/elasticsearch/plugin/elasticsearch-scripting-painless-spi/6.8.15/elasticsearch-scripting-painless-spi-6.8.15-sources.jar" + + self.assertEqual(0, Package.objects.filter(download_url=download_url).count()) + self.assertEqual( + 0, Package.objects.filter(download_url=sources_download_url).count() + ) + response = self.client.get(f"/api/collect/?purl={purl}") + self.assertEqual(1, Package.objects.filter(download_url=download_url).count()) + self.assertEqual( + 1, Package.objects.filter(download_url=sources_download_url).count() + ) expected = self.get_test_loc( - 'api/elasticsearch-scripting-painless-spi-6.8.15.json') + "api/elasticsearch-scripting-painless-spi-6.8.15.json" + ) self.assertEqual(2, len(response.data)) result = response.data[0] # remove fields - result.pop('url') - fields_to_remove = [ - 'uuid', - 'resources', - 'package_sets', - 'history' - ] + result.pop("url") + fields_to_remove = ["uuid", "resources", "package_sets", "history"] self.check_expected_results( - result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN) + result, expected, fields_to_remove=fields_to_remove, regen=FIXTURES_REGEN + ) def test_collect_sort(self): - purl_str = 'pkg:pypi/clone@1' - response = self.client.get(f'/api/collect/?purl={purl_str}&sort=size') + purl_str = "pkg:pypi/clone@1" + response = self.client.get(f"/api/collect/?purl={purl_str}&sort=size") for i, package_data in enumerate(response.data[1:], start=1): - prev_package_data = response.data[i-1] - self.assertTrue(prev_package_data['size'] < package_data['size']) + prev_package_data = response.data[i - 1] + self.assertTrue(prev_package_data["size"] < package_data["size"]) - response = self.client.get(f'/api/collect/?purl={purl_str}&sort=-size') + response = self.client.get(f"/api/collect/?purl={purl_str}&sort=-size") for i, package_data in enumerate(response.data[1:], start=1): - prev_package_data = response.data[i-1] - self.assertTrue(prev_package_data['size'] > package_data['size']) + prev_package_data = response.data[i - 1] + self.assertTrue(prev_package_data["size"] > package_data["size"]) def test_package_api_index_packages_endpoint(self): priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(0, priority_resource_uris_count) packages = [ - {'purl': 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24'}, - {'purl': 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0'}, - {'purl': 'pkg:bitbucket/example/example@1.0.0'}, + {"purl": "pkg:maven/ch.qos.reload4j/reload4j@1.2.24"}, + {"purl": "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0"}, + {"purl": "pkg:bitbucket/example/example@1.0.0"}, ] - data = { - 'packages': packages - } + data = {"packages": packages} response = self.client.post( - 
'/api/collect/index_packages/', data=data, content_type="application/json") - self.assertEqual(2, response.data['queued_packages_count']) + "/api/collect/index_packages/", data=data, content_type="application/json" + ) + self.assertEqual(2, response.data["queued_packages_count"]) expected_queued_packages = [ - 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0', + "pkg:maven/ch.qos.reload4j/reload4j@1.2.24", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0", ] self.assertEqual( - sorted(expected_queued_packages), - sorted(response.data['queued_packages']) - ) - self.assertEqual(0, response.data['unqueued_packages_count']) - self.assertEqual([], response.data['unqueued_packages']) - self.assertEqual(1, response.data['unsupported_packages_count']) - expected_unsupported_packages = [ - 'pkg:bitbucket/example/example@1.0.0' - ] - self.assertEqual(expected_unsupported_packages, - response.data['unsupported_packages']) + sorted(expected_queued_packages), sorted(response.data["queued_packages"]) + ) + self.assertEqual(0, response.data["unqueued_packages_count"]) + self.assertEqual([], response.data["unqueued_packages"]) + self.assertEqual(1, response.data["unsupported_packages_count"]) + expected_unsupported_packages = ["pkg:bitbucket/example/example@1.0.0"] + self.assertEqual( + expected_unsupported_packages, response.data["unsupported_packages"] + ) priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(2, priority_resource_uris_count) # Ensure that we don't add the same packages to the queue if they have # not yet been processed purls = [ - {'purl': 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24'}, - {'purl': 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0'}, - {'purl': 'pkg:bitbucket/example/example@1.0.0'}, + {"purl": "pkg:maven/ch.qos.reload4j/reload4j@1.2.24"}, + {"purl": "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0"}, + {"purl": "pkg:bitbucket/example/example@1.0.0"}, ] - data = { - 'packages': purls - } + data = {"packages": purls} response = self.client.post( - '/api/collect/index_packages/', data=data, content_type="application/json") - self.assertEqual(0, response.data['queued_packages_count']) - self.assertEqual([], response.data['queued_packages']) - self.assertEqual(0, response.data['requeued_packages_count']) - self.assertEqual([], response.data['requeued_packages']) - self.assertEqual(2, response.data['unqueued_packages_count']) + "/api/collect/index_packages/", data=data, content_type="application/json" + ) + self.assertEqual(0, response.data["queued_packages_count"]) + self.assertEqual([], response.data["queued_packages"]) + self.assertEqual(0, response.data["requeued_packages_count"]) + self.assertEqual([], response.data["requeued_packages"]) + self.assertEqual(2, response.data["unqueued_packages_count"]) expected_unqueued_packages = [ - 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0', + "pkg:maven/ch.qos.reload4j/reload4j@1.2.24", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0", ] self.assertEqual( sorted(expected_unqueued_packages), - sorted(response.data['unqueued_packages']) + sorted(response.data["unqueued_packages"]), + ) + self.assertEqual(1, response.data["unsupported_packages_count"]) + expected_unsupported_packages = ["pkg:bitbucket/example/example@1.0.0"] + self.assertEqual( + expected_unsupported_packages, response.data["unsupported_packages"] ) - self.assertEqual(1, response.data['unsupported_packages_count']) - 
expected_unsupported_packages = [ - 'pkg:bitbucket/example/example@1.0.0' - ] - self.assertEqual(expected_unsupported_packages, - response.data['unsupported_packages']) - bad_data = {'does-not-exist': 'dne'} + bad_data = {"does-not-exist": "dne"} response = self.client.post( - '/api/collect/index_packages/', data=bad_data, content_type="application/json") - expected_errors = {'packages': ['This field is required.']} + "/api/collect/index_packages/", + data=bad_data, + content_type="application/json", + ) + expected_errors = {"packages": ["This field is required."]} self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - self.assertEqual(expected_errors, response.data['errors']) + self.assertEqual(expected_errors, response.data["errors"]) @mock.patch("packagedb.api.get_all_versions") def test_package_api_index_packages_endpoint_with_vers(self, mock_get_all_versions): @@ -1169,11 +1156,10 @@ def test_package_api_index_packages_endpoint_with_vers(self, mock_get_all_versio "pkg:maven/ch.qos.reload4j/reload4j@1.2.23", ] self.assertEqual( - sorted(expected_queued_packages), sorted( - response.data["queued_packages"]) + sorted(expected_queued_packages), sorted(response.data["queued_packages"]) ) - self.assertEqual(0, response.data['requeued_packages_count']) - self.assertEqual([], response.data['requeued_packages']) + self.assertEqual(0, response.data["requeued_packages_count"]) + self.assertEqual([], response.data["requeued_packages"]) self.assertEqual(0, response.data["unqueued_packages_count"]) self.assertEqual([], response.data["unqueued_packages"]) self.assertEqual(0, response.data["unsupported_packages_count"]) @@ -1181,7 +1167,9 @@ def test_package_api_index_packages_endpoint_with_vers(self, mock_get_all_versio self.assertEqual(9, priority_resource_uris_count) @mock.patch("packagedb.api.get_all_versions") - def test_package_api_index_packages_endpoint_all_version_index(self, mock_get_all_versions): + def test_package_api_index_packages_endpoint_all_version_index( + self, mock_get_all_versions + ): priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(0, priority_resource_uris_count) packages = [ @@ -1228,11 +1216,10 @@ def test_package_api_index_packages_endpoint_all_version_index(self, mock_get_al "pkg:maven/ch.qos.reload4j/reload4j@1.2.25", ] self.assertEqual( - sorted(expected_queued_packages), sorted( - response.data["queued_packages"]) + sorted(expected_queued_packages), sorted(response.data["queued_packages"]) ) - self.assertEqual(0, response.data['requeued_packages_count']) - self.assertEqual([], response.data['requeued_packages']) + self.assertEqual(0, response.data["requeued_packages_count"]) + self.assertEqual([], response.data["requeued_packages"]) self.assertEqual(0, response.data["unqueued_packages_count"]) self.assertEqual([], response.data["unqueued_packages"]) self.assertEqual(0, response.data["unsupported_packages_count"]) @@ -1246,66 +1233,64 @@ def test_reindex_packages_bulk(self): self.assertEqual(False, self.scannableuri.reindex_uri) self.assertEqual(0, self.scannableuri.priority) self.assertEqual(self.scan_uuid, self.scannableuri.scan_uuid) - self.assertEqual('error', self.scannableuri.scan_error) - self.assertEqual('error', self.scannableuri.index_error) - self.assertEqual(self.scan_request_date, - self.scannableuri.scan_request_date) - self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, - self.scannableuri.scan_status) + self.assertEqual("error", self.scannableuri.scan_error) + self.assertEqual("error", 
self.scannableuri.index_error) + self.assertEqual(self.scan_request_date, self.scannableuri.scan_request_date) + self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, self.scannableuri.scan_status) self.assertEqual(False, self.scannableuri2.reindex_uri) self.assertEqual(0, self.scannableuri2.priority) self.assertEqual(self.scan_uuid2, self.scannableuri2.scan_uuid) - self.assertEqual('error', self.scannableuri2.scan_error) - self.assertEqual('error', self.scannableuri2.index_error) - self.assertEqual(self.scan_request_date2, - self.scannableuri2.scan_request_date) - self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, - self.scannableuri2.scan_status) + self.assertEqual("error", self.scannableuri2.scan_error) + self.assertEqual("error", self.scannableuri2.index_error) + self.assertEqual(self.scan_request_date2, self.scannableuri2.scan_request_date) + self.assertEqual(ScannableURI.SCAN_INDEX_FAILED, self.scannableuri2.scan_status) packages = [ # Existing package { - "purl": 'pkg:maven/sample/Baz@90.12', + "purl": "pkg:maven/sample/Baz@90.12", }, { - "purl": 'pkg:npm/example/bar@56.78', + "purl": "pkg:npm/example/bar@56.78", }, # NOt in DB and unsupported { - "purl": 'pkg:pypi/does/not-exist@1', + "purl": "pkg:pypi/does/not-exist@1", }, ] data = {"packages": packages, "reindex": True} existing_purls = [ - 'pkg:maven/sample/Baz@90.12', - 'pkg:npm/example/bar@56.78', + "pkg:maven/sample/Baz@90.12", + "pkg:npm/example/bar@56.78", ] unsupported_purls = [ - 'pkg:pypi/does/not-exist@1', + "pkg:pypi/does/not-exist@1", ] response = self.client.post( - f'/api/collect/index_packages/', data=data, content_type="application/json") - - self.assertEqual(2, response.data['requeued_packages_count']) - self.assertListEqual(sorted(existing_purls), sorted( - response.data['requeued_packages'])) + "/api/collect/index_packages/", data=data, content_type="application/json" + ) - self.assertEqual(1, response.data['unsupported_packages_count']) + self.assertEqual(2, response.data["requeued_packages_count"]) self.assertListEqual( - unsupported_purls, response.data['unsupported_packages']) + sorted(existing_purls), sorted(response.data["requeued_packages"]) + ) + + self.assertEqual(1, response.data["unsupported_packages_count"]) + self.assertListEqual(unsupported_purls, response.data["unsupported_packages"]) - self.assertEqual(0, response.data['queued_packages_count']) - self.assertEqual([], response.data['queued_packages']) + self.assertEqual(0, response.data["queued_packages_count"]) + self.assertEqual([], response.data["queued_packages"]) self.assertEqual(0, response.data["unqueued_packages_count"]) self.assertEqual([], response.data["unqueued_packages"]) self.assertEqual(4, ScannableURI.objects.all().count()) new_scannable_uris = ScannableURI.objects.exclude( - pk__in=[self.scannableuri.pk, self.scannableuri2.pk]) + pk__in=[self.scannableuri.pk, self.scannableuri2.pk] + ) self.assertEqual(2, new_scannable_uris.count()) for scannable_uri in new_scannable_uris: @@ -1320,147 +1305,143 @@ def test_package_api_index_packages_priority(self): priority_resource_uris_count = PriorityResourceURI.objects.all().count() self.assertEqual(0, priority_resource_uris_count) packages = [ - {'purl': 'pkg:maven/ch.qos.reload4j/reload4j@1.2.24'}, - {'purl': 'pkg:maven/com.esotericsoftware.kryo/kryo'}, + {"purl": "pkg:maven/ch.qos.reload4j/reload4j@1.2.24"}, + {"purl": "pkg:maven/com.esotericsoftware.kryo/kryo"}, ] - data = { - 'packages': packages - } + data = {"packages": packages} response = self.client.post( - '/api/collect/index_packages/', 
data=data, content_type="application/json") - self.assertEqual(14, response.data['queued_packages_count']) + "/api/collect/index_packages/", data=data, content_type="application/json" + ) + self.assertEqual(14, response.data["queued_packages_count"]) expected_kryo_packages = [ - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.10', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.12', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.14', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.16', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.17', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.19', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.20', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.21', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.21.1', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.22', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.23.0', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.23.1', - 'pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0', + "pkg:maven/com.esotericsoftware.kryo/kryo@2.10", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.12", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.14", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.16", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.17", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.19", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.20", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.21", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.21.1", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.22", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.23.0", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.23.1", + "pkg:maven/com.esotericsoftware.kryo/kryo@2.24.0", + ] + expected_queued_packages = expected_kryo_packages + [ + "pkg:maven/ch.qos.reload4j/reload4j@1.2.24" ] - expected_queued_packages = expected_kryo_packages + \ - ['pkg:maven/ch.qos.reload4j/reload4j@1.2.24'] self.assertEqual( - sorted(expected_queued_packages), - sorted(response.data['queued_packages']) + sorted(expected_queued_packages), sorted(response.data["queued_packages"]) ) priority_resource_uri = PriorityResourceURI.objects.get( - package_url='pkg:maven/ch.qos.reload4j/reload4j@1.2.24') + package_url="pkg:maven/ch.qos.reload4j/reload4j@1.2.24" + ) self.assertEqual(100, priority_resource_uri.priority) for purl in expected_kryo_packages: - priority_resource_uri = PriorityResourceURI.objects.get( - package_url=purl) + priority_resource_uri = PriorityResourceURI.objects.get(package_url=purl) self.assertEqual(0, priority_resource_uri.priority) def test_collect_errors(self): - invalid_purl = 'pkg:asdf1' - response = self.client.get(f'/api/collect/?purl={invalid_purl}') + invalid_purl = "pkg:asdf1" + response = self.client.get(f"/api/collect/?purl={invalid_purl}") self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code) - expected_status = {'purl': [ - "purl validation error: purl is missing the required type component: 'pkg:asdf1'."]} - self.assertEqual(expected_status, response.data['errors']) + expected_status = { + "purl": [ + "purl validation error: purl is missing the required type component: 'pkg:asdf1'." 
+            ]
+        }
+        self.assertEqual(expected_status, response.data["errors"])
 
-        unhandled_purl = 'pkg:does-not-exist/does-not-exist@1.0'
-        response = self.client.get(f'/api/collect/?purl={unhandled_purl}')
+        unhandled_purl = "pkg:does-not-exist/does-not-exist@1.0"
+        response = self.client.get(f"/api/collect/?purl={unhandled_purl}")
         self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code)
-        expected_status = f'cannot fetch Package data for {unhandled_purl}: no available handler'
-        self.assertEqual(expected_status, response.data['status'])
+        expected_status = (
+            f"cannot fetch Package data for {unhandled_purl}: no available handler"
+        )
+        self.assertEqual(expected_status, response.data["status"])
 
-        purl_str = 'pkg:maven/does-not-exist@1.0'
-        response = self.client.get(f'/api/collect/?purl={purl_str}')
+        purl_str = "pkg:maven/does-not-exist@1.0"
+        response = self.client.get(f"/api/collect/?purl={purl_str}")
         self.assertEqual(status.HTTP_400_BAD_REQUEST, response.status_code)
         expected_status = (
-            'error(s) occurred when fetching metadata for pkg:maven/does-not-exist@1.0: '
-            'Package does not exist on maven: pkg:maven/does-not-exist@1.0\n'
-            'Package does not exist on maven: pkg:maven/does-not-exist@1.0?classifier=sources\n'
+            "error(s) occurred when fetching metadata for pkg:maven/does-not-exist@1.0: "
+            "Package does not exist on maven: pkg:maven/does-not-exist@1.0\n"
+            "Package does not exist on maven: pkg:maven/does-not-exist@1.0?classifier=sources\n"
         )
-        self.assertEqual(expected_status, response.data['status'])
+        self.assertEqual(expected_status, response.data["status"])
 
 
 class ResourceApiTestCase(TestCase):
-
     def setUp(self):
         self.package_data = {
-            'type': 'generic',
-            'namespace': 'generic',
-            'name': 'Foo',
-            'version': '12.34',
-            'qualifiers': 'test_qual=qual',
-            'subpath': 'test_subpath',
-            'download_url': 'http://example.com',
-            'filename': 'Foo.zip',
-            'sha1': 'testsha1',
-            'md5': 'testmd5',
-            'size': 101,
+            "type": "generic",
+            "namespace": "generic",
+            "name": "Foo",
+            "version": "12.34",
+            "qualifiers": "test_qual=qual",
+            "subpath": "test_subpath",
+            "download_url": "http://example.com",
+            "filename": "Foo.zip",
+            "sha1": "testsha1",
+            "md5": "testmd5",
+            "size": 101,
         }
         self.package = Package.objects.create(**self.package_data)
         self.package.refresh_from_db()
 
         self.resource1 = Resource.objects.create(
-            path='foo',
-            name='foo',
-            sha1='sha1-1',
-            md5='md5-1',
-            package=self.package
+            path="foo", name="foo", sha1="sha1-1", md5="md5-1", package=self.package
         )
         self.resource1.refresh_from_db()
         self.resource2 = Resource.objects.create(
-            path='foo/bar',
-            name='bar',
-            sha1='sha1-2',
-            md5='md5-2',
-            package=self.package
+            path="foo/bar", name="bar", sha1="sha1-2", md5="md5-2", package=self.package
         )
         self.resource2.refresh_from_db()
 
     def test_api_resource_checksum_filter(self):
-        filters = f'?md5={self.resource1.md5}&md5={self.resource2.md5}'
-        response = self.client.get(f'/api/resources/{filters}')
-        self.assertEqual(2, response.data['count'])
-        names = sorted([result.get('name')
-                        for result in response.data['results']])
-        expected_names = sorted([
-            self.resource1.name,
-            self.resource2.name,
-        ])
+        filters = f"?md5={self.resource1.md5}&md5={self.resource2.md5}"
+        response = self.client.get(f"/api/resources/{filters}")
+        self.assertEqual(2, response.data["count"])
+        names = sorted([result.get("name") for result in response.data["results"]])
+        expected_names = sorted(
+            [
+                self.resource1.name,
+                self.resource2.name,
+            ]
+        )
         self.assertEqual(expected_names, names)
 
-        filters = f'?sha1={self.resource1.sha1}&sha1={self.resource2.sha1}'
-        response = self.client.get(f'/api/resources/{filters}')
+        filters = f"?sha1={self.resource1.sha1}&sha1={self.resource2.sha1}"
+        response = self.client.get(f"/api/resources/{filters}")
         self.assertEqual(2, response.data["count"])
-        names = sorted([result.get('name')
-                        for result in response.data['results']])
-        expected_names = sorted([
-            self.resource1.name,
-            self.resource2.name,
-        ])
+        names = sorted([result.get("name") for result in response.data["results"]])
+        expected_names = sorted(
+            [
+                self.resource1.name,
+                self.resource2.name,
+            ]
+        )
         self.assertEqual(expected_names, names)
 
 
 class PackageUpdateSetTestCase(TestCase):
-
     def setUp(self):
         self.package_data = {
-            'type': 'npm',
-            'namespace': '',
-            'name': 'foobar',
-            'version': '1.1.0',
-            'qualifiers': '',
-            'subpath': '',
-            'download_url': '',
-            'filename': 'Foo.zip',
-            'sha1': 'testsha1',
-            'md5': 'testmd5',
-            'size': 101,
-            'package_content': 1
+            "type": "npm",
+            "namespace": "",
+            "name": "foobar",
+            "version": "1.1.0",
+            "qualifiers": "",
+            "subpath": "",
+            "download_url": "",
+            "filename": "Foo.zip",
+            "sha1": "testsha1",
+            "md5": "testmd5",
+            "size": 101,
+            "package_content": 1,
         }
         self.package = Package.objects.create(**self.package_data)
         self.package.refresh_from_db()
@@ -1469,97 +1450,84 @@ def setUp(self):
 
     def test_api_purl_updation(self):
         data = {
-            "purls": [
-                {"purl": "pkg:npm/hologram@1.1.0", "content_type": "CURATION"}],
-            "uuid": str(self.new_package_set_uuid)
+            "purls": [{"purl": "pkg:npm/hologram@1.1.0", "content_type": "CURATION"}],
+            "uuid": str(self.new_package_set_uuid),
         }
         response = self.client.post(
-            f"/api/update_packages/", data=data, content_type="application/json")
+            "/api/update_packages/", data=data, content_type="application/json"
+        )
 
-        expected = [{"purl": "pkg:npm/hologram@1.1.0",
-                     "update_status": "Updated"}]
+        expected = [{"purl": "pkg:npm/hologram@1.1.0", "update_status": "Updated"}]
 
         self.assertEqual(expected, response.data)
 
     def test_api_purl_updation_existing_package(self):
         data = {
-            "purls": [
-                {"purl": "pkg:npm/foobar@1.1.0", "content_type": "PATCH"}
-            ],
-            "uuid": str(self.new_package_set_uuid)
+            "purls": [{"purl": "pkg:npm/foobar@1.1.0", "content_type": "PATCH"}],
+            "uuid": str(self.new_package_set_uuid),
         }
-        expected = [{"purl": "pkg:npm/foobar@1.1.0",
-                     "update_status": "Already Exists"}]
+        expected = [{"purl": "pkg:npm/foobar@1.1.0", "update_status": "Already Exists"}]
 
         response = self.client.post(
-            f"/api/update_packages/", data=data, content_type="application/json")
+            "/api/update_packages/", data=data, content_type="application/json"
+        )
 
         self.assertEqual(expected, response.data)
 
     def test_api_purl_updation_non_existing_uuid(self):
         data = {
-            "purls": [
-                {"purl": "pkg:npm/foobar@1.1.0", "content_type": "SOURCE_REPO"}
-            ],
-            "uuid": "ac9c36f4-a1ed-4824-8448-c6ed8f1da71d"
+            "purls": [{"purl": "pkg:npm/foobar@1.1.0", "content_type": "SOURCE_REPO"}],
+            "uuid": "ac9c36f4-a1ed-4824-8448-c6ed8f1da71d",
        }
 
         expected = {
-            "update_status": "No Package Set found for ac9c36f4-a1ed-4824-8448-c6ed8f1da71d"}
+            "update_status": "No Package Set found for ac9c36f4-a1ed-4824-8448-c6ed8f1da71d"
+        }
 
         response = self.client.post(
-            f"/api/update_packages/", data=data, content_type="application/json")
+            "/api/update_packages/", data=data, content_type="application/json"
+        )
 
         self.assertEqual(expected, response.data)
 
     def test_api_purl_updation_without_uuid(self):
-        data = {
-            "purls": [
-                {"purl": "pkg:npm/jammy@1.1.9", "content_type": "BINARY"}
-            ]
-        }
+ data = {"purls": [{"purl": "pkg:npm/jammy@1.1.9", "content_type": "BINARY"}]} - expected = [{"purl": "pkg:npm/jammy@1.1.9", - "update_status": "Updated"}] + expected = [{"purl": "pkg:npm/jammy@1.1.9", "update_status": "Updated"}] response = self.client.post( - f"/api/update_packages/", data=data, content_type="application/json") + "/api/update_packages/", data=data, content_type="application/json" + ) self.assertEqual(expected, response.data) def test_api_purl_validation_empty_request(self): data = {} response = self.client.post( - f"/api/update_packages/", data=data, content_type="application/json") + "/api/update_packages/", data=data, content_type="application/json" + ) - expected = { - "errors": { - "purls": [ - "This field is required." - ] - } - } + expected = {"errors": {"purls": ["This field is required."]}} self.assertAlmostEqual(expected, response.data) class PurlValidateApiTestCase(TestCase): - def setUp(self): self.package_data = { - 'type': 'npm', - 'namespace': '', - 'name': 'foobar', - 'version': '1,1.0', - 'qualifiers': '', - 'subpath': '', - 'download_url': '', - 'filename': 'Foo.zip', - 'sha1': 'testsha1', - 'md5': 'testmd5', - 'size': 101, + "type": "npm", + "namespace": "", + "name": "foobar", + "version": "1,1.0", + "qualifiers": "", + "subpath": "", + "download_url": "", + "filename": "Foo.zip", + "sha1": "testsha1", + "md5": "testmd5", + "size": 101, } self.package = Package.objects.create(**self.package_data) self.package.refresh_from_db() @@ -1569,13 +1537,13 @@ def test_api_purl_validation(self): "purl": "pkg:npm/foobar@1.1.0", "check_existence": True, } - response1 = self.client.get(f"/api/validate/", data=data1) + response1 = self.client.get("/api/validate/", data=data1) data2 = { "purl": "pkg:npm/?foobar@1.1.0", "check_existence": True, } - response2 = self.client.get(f"/api/validate/", data=data2) + response2 = self.client.get("/api/validate/", data=data2) self.assertEqual(True, response1.data["valid"]) self.assertEqual(True, response1.data["exists"]) @@ -1596,31 +1564,25 @@ def test_api_purl_validation_unsupported_package_type(self): "purl": "pkg:random/foobar@1.1.0", "check_existence": True, } - response1 = self.client.get(f"/api/validate/", data=data1) + response1 = self.client.get("/api/validate/", data=data1) self.assertEqual(True, response1.data["valid"]) self.assertEqual( - "The provided PackageURL is valid, but `check_existence` is not supported for this package type.", response1.data[ - "message"] + "The provided PackageURL is valid, but `check_existence` is not supported for this package type.", + response1.data["message"], ) self.assertEqual(None, response1.data["exists"]) def test_api_purl_validation_empty_request(self): data1 = {} - response1 = self.client.get(f"/api/validate/", data=data1) + response1 = self.client.get("/api/validate/", data=data1) data2 = { "does-not-exist": "dne", } - response2 = self.client.get(f"/api/validate/", data=data2) + response2 = self.client.get("/api/validate/", data=data2) - expected = { - "errors": { - "purl": [ - "This field is required." 
-                ]
-            }
-        }
+        expected = {"errors": {"purl": ["This field is required."]}}
 
         self.assertAlmostEqual(expected, response1.data)
         self.assertEqual(status.HTTP_400_BAD_REQUEST, response1.status_code)
@@ -1630,7 +1592,6 @@ def test_api_purl_validation_empty_request(self):
 
 
 class PackageWatchTestCase(TestCase):
-
     @mock.patch("packagedb.models.PackageWatch.create_new_job")
     def setUp(self, mock_create_new_job):
         mock_create_new_job.return_value = None
@@ -1724,12 +1685,10 @@ def test_api_package_watch_put_not_allowed(self):
             "/api/watch/pkg:npm/foobar/", data=data, content_type="application/json"
         )
 
-        self.assertEqual(status.HTTP_405_METHOD_NOT_ALLOWED,
-                         response1.status_code)
+        self.assertEqual(status.HTTP_405_METHOD_NOT_ALLOWED, response1.status_code)
 
 
 class ToGolangPurlTestCase(TestCase):
-
     def test_to_golang_purl(self):
         response = self.client.get(
             "/api/to_purl/go",
diff --git a/packagedb/tests/test_filters.py b/packagedb/tests/test_filters.py
index c4d5fb65..8d859b2a 100644
--- a/packagedb/tests/test_filters.py
+++ b/packagedb/tests/test_filters.py
@@ -8,27 +8,27 @@
 #
 
 from django.test import TestCase
+
 from packagedb.api import PackageFilterSet
 from packagedb.filters import parse_query_string_to_lookups
 from packagedb.models import Package
 
 
 class PackageDBFilterTest(TestCase):
-
     def test_scanpipe_filters_package_filterset_search(self):
         p1 = Package.objects.create(
-            type='maven',
-            namespace='org.example',
-            name='foo',
-            version='1.0.0',
-            download_url='https://example.com/foo-1.0.0.jar',
+            type="maven",
+            namespace="org.example",
+            name="foo",
+            version="1.0.0",
+            download_url="https://example.com/foo-1.0.0.jar",
         )
-        p2 = Package.objects.create(
-            type='maven',
-            namespace='org.somethingelse',
-            name='foo',
-            version='0.35.7',
-            download_url='https://somethingelse.net/foo-0.35.7.jar',
+        Package.objects.create(
+            type="maven",
+            namespace="org.somethingelse",
+            name="foo",
+            version="0.35.7",
+            download_url="https://somethingelse.net/foo-0.35.7.jar",
         )
 
         filterset = PackageFilterSet(data={})
@@ -80,6 +80,5 @@ def test_packagedb_filters_parse_query_string_to_lookups(self):
         }
 
         for query_string, expected in inputs.items():
-            lookups = parse_query_string_to_lookups(
-                query_string, "icontains", "name")
+            lookups = parse_query_string_to_lookups(query_string, "icontains", "name")
             self.assertEqual(expected, str(lookups))
diff --git a/packagedb/tests/test_migrations.py b/packagedb/tests/test_migrations.py
index cd07a379..4015bacb 100644
--- a/packagedb/tests/test_migrations.py
+++ b/packagedb/tests/test_migrations.py
@@ -96,21 +96,27 @@ def test_package_set_creation(self):
             self.package4,
             self.package5,
         ]
-        self.assertTrue(all(package.package_sets for package in packages_in_package_sets))
+        self.assertTrue(
+            all(package.package_sets for package in packages_in_package_sets)
+        )
 
         package_set1 = PackageSet.objects.get(uuid=self.package_set1)
         self.assertTrue(package_set1)
-        self.assertRaises(PackageSet.DoesNotExist, PackageSet.objects.get, uuid=self.package_set2)
+        self.assertRaises(
+            PackageSet.DoesNotExist, PackageSet.objects.get, uuid=self.package_set2
+        )
 
         self.assertEqual(1, self.package1.package_sets.count())
         self.assertEqual(1, self.package2.package_sets.count())
         self.assertEqual(package_set1.uuid, self.package1.package_sets.first().uuid)
         self.assertEqual(package_set1.uuid, self.package2.package_sets.first().uuid)
 
-        self.assertEqual(0, self.package3.package_sets.count())
+        self.assertEqual(0, self.package3.package_sets.count())
 
-        self.assertEqual(1, self.package4.package_sets.count())
-        self.assertEqual(1, self.package5.package_sets.count())
-        self.assertEqual(self.package4.package_sets.first(), self.package5.package_sets.first())
+        self.assertEqual(1, self.package4.package_sets.count())
+        self.assertEqual(1, self.package5.package_sets.count())
+        self.assertEqual(
+            self.package4.package_sets.first(), self.package5.package_sets.first()
+        )
 
         package_set_for_package4_and_package5 = self.package4.package_sets.first()
         self.assertEqual(2, package_set_for_package4_and_package5.packages.count())
diff --git a/packagedb/tests/test_models.py b/packagedb/tests/test_models.py
index aa9f3334..4babf5ee 100644
--- a/packagedb/tests/test_models.py
+++ b/packagedb/tests/test_models.py
@@ -7,35 +7,32 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
 
-from dateutil.parser import parse as dateutil_parse
+from unittest.mock import patch
 
 from django.db import IntegrityError
 from django.test import TransactionTestCase
 from django.utils import timezone
 
-from packagedb.models import DependentPackage, PackageWatch
+from dateutil.parser import parse as dateutil_parse
+
+from packagedb.models import DependentPackage
 from packagedb.models import Package
+from packagedb.models import PackageWatch
 from packagedb.models import Party
 from packagedb.models import Resource
 
-from unittest.mock import patch
-
 
 class ResourceModelTestCase(TransactionTestCase):
     def setUp(self):
-        self.package = Package.objects.create(download_url='test-pkg.com')
-        self.resource_paths = [
-            'root/',
-            'root/test.json'
-        ]
+        self.package = Package.objects.create(download_url="test-pkg.com")
+        self.resource_paths = ["root/", "root/test.json"]
 
     def tearDown(self):
         Package.objects.all().delete()
         Resource.objects.all().delete()
 
     def test_resource_is_created_on_a_package(self):
-        Resource.objects.create(package=self.package,
-                                path=self.resource_paths[0])
+        Resource.objects.create(package=self.package, path=self.resource_paths[0])
 
         self.assertEqual(1, Resource.objects.all().count())
@@ -50,7 +47,8 @@ def test_duplicate_resources_are_not_created(self):
             Resource.objects.create(package=self.package, path=path)
         for path in self.resource_paths:
             self.assertRaises(
-                IntegrityError, Resource.objects.create, package=self.package, path=path)
+                IntegrityError, Resource.objects.create, package=self.package, path=path
+            )
 
         self.assertEqual(2, Resource.objects.all().count())
 
@@ -58,87 +56,97 @@
 class PackageModelHistoryFieldTestCase(TransactionTestCase):
     def setUp(self):
         self.test_package = Package.objects.create(
-            download_url='https://test.com',
+            download_url="https://test.com",
         )
-        self.message0 = 'test-message0'
-        self.message1 = 'test-message1'
-        self.message2 = 'test-message2'
+        self.message0 = "test-message0"
+        self.message1 = "test-message1"
+        self.message2 = "test-message2"
 
     def test_history_field_append_and_get_one_item(self):
         self.test_package.append_to_history(self.message0)
 
-        expected_date = timezone.now().strftime('%Y-%m-%d')
+        expected_date = timezone.now().strftime("%Y-%m-%d")
         expected_message = self.message0
 
         history = self.test_package.get_history()[0]
 
-        self.assertIn(expected_date, history.get('timestamp'))
-        self.assertEqual(expected_message, history.get('message'))
+        self.assertIn(expected_date, history.get("timestamp"))
+        self.assertEqual(expected_message, history.get("message"))
 
     def test_history_field_append_and_get_multiple_items(self):
         self.test_package.append_to_history(self.message0)
         self.test_package.append_to_history(self.message1)
         self.test_package.append_to_history(self.message2)
 
-        expected_date = timezone.now().strftime('%Y-%m-%d')
+        expected_date = timezone.now().strftime("%Y-%m-%d")
         expected_messages = [
             self.message0,
             self.message1,
             self.message2,
         ]
 
-        for expected_message, entry in zip(expected_messages, self.test_package.get_history()):
-            self.assertIn(expected_date, entry.get('timestamp'))
-            self.assertEqual(expected_message, entry.get('message'))
+        for expected_message, entry in zip(
+            expected_messages, self.test_package.get_history()
+        ):
+            self.assertIn(expected_date, entry.get("timestamp"))
+            self.assertEqual(expected_message, entry.get("message"))
 
 
 class PackageModelTestCase(TransactionTestCase):
     def setUp(self):
-        self.created_package_download_url = 'https://created-example.com'
-        self.inserted_package_download_url = 'https://inserted-example.com'
+        self.created_package_download_url = "https://created-example.com"
+        self.inserted_package_download_url = "https://inserted-example.com"
 
         self.created_package_data = {
-            'download_url': self.created_package_download_url,
-            'type': 'generic',
-            'namespace': 'generic',
-            'name': 'Foo',
-            'version': '12.34',
+            "download_url": self.created_package_download_url,
+            "type": "generic",
+            "namespace": "generic",
+            "name": "Foo",
+            "version": "12.34",
         }
         self.inserted_package_data = {
-            'download_url': self.inserted_package_download_url,
-            'type': 'generic',
-            'namespace': 'generic',
-            'name': 'Bar',
-            'version': '12.34',
+            "download_url": self.inserted_package_download_url,
+            "type": "generic",
+            "namespace": "generic",
+            "name": "Bar",
+            "version": "12.34",
         }
 
-        self.created_package = Package.objects.create(
-            **self.created_package_data)
-        self.inserted_package = Package.objects.insert(
-            **self.inserted_package_data)
+        self.created_package = Package.objects.create(**self.created_package_data)
+        self.inserted_package = Package.objects.insert(**self.inserted_package_data)
 
     def test_package_download_url_is_unique(self):
-        self.assertIsNone(Package.objects.insert(
-            download_url=self.created_package_download_url))
-        self.assertIsNone(Package.objects.insert(
-            download_url=self.inserted_package_download_url))
+        self.assertIsNone(
+            Package.objects.insert(download_url=self.created_package_download_url)
+        )
+        self.assertIsNone(
+            Package.objects.insert(download_url=self.inserted_package_download_url)
+        )
 
     def test_packagedb_package_model_history_field(self):
-        self.created_package.append_to_history('test-message')
+        self.created_package.append_to_history("test-message")
 
         for entry in self.created_package.get_history():
-            self.assertEqual('test-message', entry.get('message'))
+            self.assertEqual("test-message", entry.get("message"))
 
     def test_packagedb_package_model_get_all_versions(self):
         p1 = Package.objects.create(
-            download_url='http://a.a', type='generic', name='name', version='1.0')
+            download_url="http://a.a", type="generic", name="name", version="1.0"
+        )
         p2 = Package.objects.create(
-            download_url='http://b.b', type='generic', name='name', version='2.0')
+            download_url="http://b.b", type="generic", name="name", version="2.0"
+        )
         p3 = Package.objects.create(
-            download_url='http://c.c', type='generic', name='name', version='3.0')
-        p4 = Package.objects.create(download_url='http://d.d', type='generic', namespace='space', name='name',
-                                    version='4.0')
+            download_url="http://c.c", type="generic", name="name", version="3.0"
+        )
+        p4 = Package.objects.create(
+            download_url="http://d.d",
+            type="generic",
+            namespace="space",
+            name="name",
+            version="4.0",
+        )
 
         self.assertEqual([p1, p2, p3], list(p1.get_all_versions()))
         self.assertEqual([p1, p2, p3], list(p2.get_all_versions()))
@@ -147,13 +155,17 @@
 
     def test_packagedb_package_model_get_latest_version(self):
         p1 = Package.objects.create(
-            download_url='http://a.a', name='name', version='1.0')
+            download_url="http://a.a", name="name", version="1.0"
+        )
         p2 = Package.objects.create(
-            download_url='http://b.b', name='name', version='2.0')
+            download_url="http://b.b", name="name", version="2.0"
+        )
         p3 = Package.objects.create(
-            download_url='http://c.c', name='name', version='3.0')
-        p4 = Package.objects.create(download_url='http://d.d', namespace='space', name='name',
-                                    version='4.0')
+            download_url="http://c.c", name="name", version="3.0"
+        )
+        p4 = Package.objects.create(
+            download_url="http://d.d", namespace="space", name="name", version="4.0"
+        )
 
         self.assertEqual(p3, p1.get_latest_version())
         self.assertEqual(p3, p2.get_latest_version())
@@ -162,54 +174,51 @@
 
     def test_packagedb_package_model_update_fields(self):
         p1 = Package.objects.create(
-            download_url='http://a.a', name='name', version='1.0')
+            download_url="http://a.a", name="name", version="1.0"
+        )
         self.assertFalse(p1.history)
-        self.assertEqual('', p1.namespace)
+        self.assertEqual("", p1.namespace)
         self.assertEqual(None, p1.homepage_url)
         package, updated_fields = p1.update_fields(
-            namespace='test', homepage_url='https://example.com')
+            namespace="test", homepage_url="https://example.com"
+        )
         self.assertEqual(
-            sorted(updated_fields),
-            sorted(['homepage_url', 'history', 'namespace'])
+            sorted(updated_fields), sorted(["homepage_url", "history", "namespace"])
        )
-        self.assertEqual('test', p1.namespace)
-        self.assertEqual('https://example.com', p1.homepage_url)
+        self.assertEqual("test", p1.namespace)
+        self.assertEqual("https://example.com", p1.homepage_url)
         self.assertEqual(1, len(p1.history))
         expected_history_entry = {
-            'message': 'Package field values have been updated.',
-            'data': {
-                'updated_fields':
-                [
+            "message": "Package field values have been updated.",
+            "data": {
+                "updated_fields": [
+                    {"field": "namespace", "old_value": "", "new_value": "test"},
                     {
-                        'field': 'namespace',
-                        'old_value': '',
-                        'new_value': 'test'
+                        "field": "homepage_url",
+                        "old_value": None,
+                        "new_value": "https://example.com",
                     },
-                    {
-                        'field': 'homepage_url',
-                        'old_value': None,
-                        'new_value': 'https://example.com'
-                    }
                 ]
-            }
+            },
         }
         history_entry = p1.history[0]
-        history_entry.pop('timestamp')
+        history_entry.pop("timestamp")
         self.assertEqual(expected_history_entry, history_entry)
 
     def test_packagedb_package_model_update_fields_special_cases(self):
         p1 = Package.objects.create(
-            download_url='http://a.a', name='name', version='1.0')
+            download_url="http://a.a", name="name", version="1.0"
+        )
 
         # Test dates
         date_fields = [
-            'created_date',
-            'last_indexed_date',
-            'release_date',
+            "created_date",
+            "last_indexed_date",
+            "release_date",
         ]
         for field in date_fields:
             value = getattr(p1, field)
             self.assertEqual(None, value)
-        timestamp_str = '2017-03-25T14:39:00+00:00'
+        timestamp_str = "2017-03-25T14:39:00+00:00"
         package, updated_fields = p1.update_fields(
             **{field: timestamp_str for field in date_fields}
         )
@@ -217,32 +226,25 @@
         for field in date_fields:
             value = getattr(package, field)
             self.assertEqual(timestamp, value)
-        self.assertEqual(
-            sorted(updated_fields),
-            sorted(date_fields + ['history'])
-        )
+        self.assertEqual(sorted(updated_fields), sorted(date_fields + ["history"]))
 
         # Test qualifiers
-        self.assertEqual('', p1.qualifiers)
+        self.assertEqual("", p1.qualifiers)
         dict_qualifiers1 = {
-            'classifier': 'sources',
-            'type': 'war',
+            "classifier": "sources",
+            "type": "war",
         }
-        string_qualifiers1 = 'classifier=sources&type=war'
+        string_qualifiers1 = "classifier=sources&type=war"
         package, updated_fields = p1.update_fields(qualifiers=dict_qualifiers1)
         self.assertEqual(
-            sorted(['qualifiers', 'history']),
+            sorted(["qualifiers", "history"]),
             sorted(updated_fields),
         )
+        self.assertEqual(string_qualifiers1, p1.qualifiers)
+        string_qualifiers2 = "classifier=somethingelse"
+        package, updated_fields = p1.update_fields(qualifiers=string_qualifiers2)
         self.assertEqual(
-            string_qualifiers1,
-            p1.qualifiers
-        )
-        string_qualifiers2 = 'classifier=somethingelse'
-        package, updated_fields = p1.update_fields(
-            qualifiers=string_qualifiers2)
-        self.assertEqual(
-            sorted(['qualifiers', 'history']),
+            sorted(["qualifiers", "history"]),
             sorted(updated_fields),
         )
         self.assertEqual(
@@ -251,74 +253,77 @@
         )
         expected_history = [
             {
-                'message': 'Package field values have been updated.',
-                'data': {
-                    'updated_fields': [
+                "message": "Package field values have been updated.",
+                "data": {
+                    "updated_fields": [
                         {
-                            'field': 'created_date',
-                            'old_value': 'None',
-                            'new_value': '2017-03-25 14:39:00+00:00'
-                        }, {
-                            'field': 'last_indexed_date',
-                            'old_value': 'None',
-                            'new_value': '2017-03-25 14:39:00+00:00'
-                        }, {
-                            'field': 'release_date',
-                            'old_value': 'None',
-                            'new_value': '2017-03-25 14:39:00+00:00'
-                        }
+                            "field": "created_date",
+                            "old_value": "None",
+                            "new_value": "2017-03-25 14:39:00+00:00",
+                        },
+                        {
+                            "field": "last_indexed_date",
+                            "old_value": "None",
+                            "new_value": "2017-03-25 14:39:00+00:00",
+                        },
+                        {
+                            "field": "release_date",
+                            "old_value": "None",
+                            "new_value": "2017-03-25 14:39:00+00:00",
+                        },
                     ]
-                }
+                },
             },
             {
-                'message': 'Package field values have been updated.',
-                'data': {
-                    'updated_fields': [
+                "message": "Package field values have been updated.",
+                "data": {
+                    "updated_fields": [
                         {
-                            'field': 'qualifiers',
-                            'old_value': '',
-                            'new_value': 'classifier=sources&type=war'
+                            "field": "qualifiers",
+                            "old_value": "",
+                            "new_value": "classifier=sources&type=war",
                         }
                     ]
-                }
+                },
             },
             {
-                'message': 'Package field values have been updated.',
-                'data': {
-                    'updated_fields': [
+                "message": "Package field values have been updated.",
+                "data": {
+                    "updated_fields": [
                         {
-                            'field': 'qualifiers',
-                            'old_value': 'classifier=sources&type=war',
-                            'new_value': 'classifier=somethingelse'
+                            "field": "qualifiers",
+                            "old_value": "classifier=sources&type=war",
+                            "new_value": "classifier=somethingelse",
                         }
                     ]
-                }
-            }
+                },
+            },
         ]
         # remove timestamp before comparison
         history = []
         for entry in p1.history:
-            entry.pop('timestamp')
+            entry.pop("timestamp")
             history.append(entry)
         self.assertEqual(expected_history, history)
 
     def test_packagedb_package_model_update_fields_related_models(self):
         p1 = Package.objects.create(
-            download_url='http://a.a', name='name', version='1.0')
-        path = 'asdf'
+            download_url="http://a.a", name="name", version="1.0"
+        )
+        path = "asdf"
         resources = [Resource(package=p1, path=path)]
         _, updated_fields = p1.update_fields(resources=resources)
-        self.assertEqual(
-            sorted(['resources', 'history']),
-            sorted(updated_fields)
+        self.assertEqual(sorted(["resources", "history"]), sorted(updated_fields))
+        expected_message = (
+            "Replaced 0 existing entries of field 'resources' with 1 new entries."
         )
-        expected_message = "Replaced 0 existing entries of field 'resources' with 1 new entries."
         self.assertEqual(1, len(p1.history))
-        history_message = p1.history[0]['message']
+        history_message = p1.history[0]["message"]
         self.assertEqual(expected_message, history_message)
 
         p2 = Package.objects.create(
-            download_url='http://b.b', name='example', version='1.0')
+            download_url="http://b.b", name="example", version="1.0"
+        )
         resources = [
             {
                 "path": "example.jar",
@@ -350,115 +355,113 @@
                 "holders": [],
                 "authors": [],
                 "package_data": [],
-                "for_packages": [
-
-                ],
+                "for_packages": [],
                 "emails": [],
                 "urls": [],
-                "extra_data": {}
+                "extra_data": {},
             }
         ]
         _, updated_fields = p2.update_fields(resources=resources)
-        self.assertEqual(
-            sorted(['resources', 'history']),
-            sorted(updated_fields)
+        self.assertEqual(sorted(["resources", "history"]), sorted(updated_fields))
+        expected_message = (
+            "Replaced 0 existing entries of field 'resources' with 1 new entries."
        )
-        expected_message = "Replaced 0 existing entries of field 'resources' with 1 new entries."
         self.assertEqual(1, len(p2.history))
-        history_message = p2.history[0]['message']
+        history_message = p2.history[0]["message"]
         self.assertEqual(expected_message, history_message)
 
         p3 = Package.objects.create(
-            download_url='http://foo', name='foo', version='1.0')
+            download_url="http://foo", name="foo", version="1.0"
+        )
         parties = [
             dict(
-                type='admin',
-                role='admin',
-                name='foo',
-                email='foo@foo.com',
-                url='foo.com',
+                type="admin",
+                role="admin",
+                name="foo",
+                email="foo@foo.com",
+                url="foo.com",
             )
         ]
         _, updated_fields = p3.update_fields(parties=parties)
-        self.assertEqual(
-            sorted(['parties', 'history']),
-            sorted(updated_fields)
+        self.assertEqual(sorted(["parties", "history"]), sorted(updated_fields))
+        expected_message = (
+            "Replaced 0 existing entries of field 'parties' with 1 new entries."
         )
-        expected_message = "Replaced 0 existing entries of field 'parties' with 1 new entries."
         self.assertEqual(1, len(p3.history))
-        history_message = p3.history[0]['message']
+        history_message = p3.history[0]["message"]
         self.assertEqual(expected_message, history_message)
 
         p4 = Package.objects.create(
-            download_url='http://bar', name='bar', version='1.0')
+            download_url="http://bar", name="bar", version="1.0"
+        )
         parties = [
             Party(
                 package=p4,
-                type='admin',
-                role='admin',
-                name='bar',
-                email='bar@bar.com',
-                url='foo.com',
+                type="admin",
+                role="admin",
+                name="bar",
+                email="bar@bar.com",
+                url="foo.com",
             )
         ]
         _, updated_fields = p4.update_fields(parties=parties)
-        self.assertEqual(
-            sorted(['parties', 'history']),
-            sorted(updated_fields)
+        self.assertEqual(sorted(["parties", "history"]), sorted(updated_fields))
+        expected_message = (
+            "Replaced 0 existing entries of field 'parties' with 1 new entries."
         )
-        expected_message = "Replaced 0 existing entries of field 'parties' with 1 new entries."
         self.assertEqual(1, len(p4.history))
-        history_message = p4.history[0]['message']
+        history_message = p4.history[0]["message"]
         self.assertEqual(expected_message, history_message)
 
         p5 = Package.objects.create(
-            download_url='http://baz', name='baz', version='1.0')
+            download_url="http://baz", name="baz", version="1.0"
+        )
         dependencies = [
             dict(
-                purl='pkg:baz_dep@1.0',
-                extracted_requirement='>1',
-                scope='runtime',
+                purl="pkg:baz_dep@1.0",
+                extracted_requirement=">1",
+                scope="runtime",
                 is_runtime=True,
                 is_optional=False,
                 is_resolved=True,
             )
         ]
         _, updated_fields = p5.update_fields(dependencies=dependencies)
-        self.assertEqual(
-            sorted(['dependencies', 'history']),
-            sorted(updated_fields)
+        self.assertEqual(sorted(["dependencies", "history"]), sorted(updated_fields))
+        expected_message = (
+            "Replaced 0 existing entries of field 'dependencies' with 1 new entries."
         )
-        expected_message = "Replaced 0 existing entries of field 'dependencies' with 1 new entries."
         self.assertEqual(1, len(p5.history))
-        history_message = p5.history[0]['message']
+        history_message = p5.history[0]["message"]
         self.assertEqual(expected_message, history_message)
 
         p6 = Package.objects.create(
-            download_url='http://qux', name='qux', version='1.0')
+            download_url="http://qux", name="qux", version="1.0"
+        )
         dependencies = [
             DependentPackage(
                 package=p6,
-                purl='pkg:qux_dep@1.0',
-                extracted_requirement='>1',
-                scope='runtime',
+                purl="pkg:qux_dep@1.0",
+                extracted_requirement=">1",
+                scope="runtime",
                 is_runtime=True,
                 is_optional=False,
                 is_resolved=True,
             )
         ]
         _, updated_fields = p6.update_fields(dependencies=dependencies)
-        self.assertEqual(
-            sorted(['dependencies', 'history']),
-            sorted(updated_fields)
+        self.assertEqual(sorted(["dependencies", "history"]), sorted(updated_fields))
+        expected_message = (
+            "Replaced 0 existing entries of field 'dependencies' with 1 new entries."
         )
-        expected_message = "Replaced 0 existing entries of field 'dependencies' with 1 new entries."
         self.assertEqual(1, len(p6.history))
-        history_message = p6.history[0]['message']
+        history_message = p6.history[0]["message"]
         self.assertEqual(expected_message, history_message)
 
     def test_packagedb_package_model_update_fields_exceptions(self):
         p1 = Package.objects.create(
-            download_url='http://a.a', name='name', version='1.0')
+            download_url="http://a.a", name="name", version="1.0"
+        )
 
         with self.assertRaises(AttributeError):
             p1.update_fields(asdf=123)
@@ -524,20 +527,21 @@ def test_package_watch_reschedule_on_modification(self, mock_create_new_job):
         self.package_watch1.watch_interval = 1
         self.package_watch1.save()
 
-        self.assertEqual("reschedule_id_new_interval",
-                         self.package_watch1.schedule_work_id)
+        self.assertEqual(
+            "reschedule_id_new_interval", self.package_watch1.schedule_work_id
+        )
 
         self.package_watch1.is_active = False
         self.package_watch1.save()
         self.assertEqual(None, self.package_watch1.schedule_work_id)
 
     def test_get_or_none(self):
-        Package.objects.create(download_url='http://a.ab',
-                               name='name', version='1.0', type="foo")
-        package = Package.objects.filter(
-            download_url="http://a.ab"
-        ).get_or_none()
+        Package.objects.create(
+            download_url="http://a.ab", name="name", version="1.0", type="foo"
+        )
+        package = Package.objects.filter(download_url="http://a.ab").get_or_none()
         assert package
 
-        assert Package.objects.filter(
-            download_url="http://a.ab-foobar"
-        ).get_or_none() == None
+        assert (
+            Package.objects.filter(download_url="http://a.ab-foobar").get_or_none()
+            is None
+        )
diff --git a/packagedb/tests/test_package_managers.py b/packagedb/tests/test_package_managers.py
index 90e99d23..5c672177 100644
--- a/packagedb/tests/test_package_managers.py
+++ b/packagedb/tests/test_package_managers.py
@@ -12,9 +12,9 @@
 from datetime import datetime
 from functools import partial
 from unittest import mock
+
 from django.test import TestCase
 
-import pytest
 from dateutil.tz import tzlocal
 from packageurl import PackageURL
 
@@ -34,84 +34,144 @@ dt_local = partial(datetime, tzinfo=tzlocal())
-class TestPackageManagers(TestCase):
+class TestPackageManagers(TestCase):
     def test_trim_go_url_path(self):
-        assert GoproxyVersionAPI.trim_go_url_path("https://pkg.go.dev/https://github.com/xx/a/b") == "github.com/xx/a"
-        assert GoproxyVersionAPI.trim_go_url_path("https://github.com/xx/a/b") == "github.com/xx/a"
-
+        assert (
+            GoproxyVersionAPI.trim_go_url_path(
+                "https://pkg.go.dev/https://github.com/xx/a/b"
+            )
+            == "github.com/xx/a"
+        )
+        assert (
+            GoproxyVersionAPI.trim_go_url_path("https://github.com/xx/a/b")
+            == "github.com/xx/a"
+        )
 
     def test_nuget_extract_version(self):
-        with open(os.path.join(TEST_DATA, "nuget-data.json"), "r") as f:
+        with open(os.path.join(TEST_DATA, "nuget-data.json")) as f:
             response = json.load(f)
 
         results = list(NugetVersionAPI().extract_versions(response))
         expected = [
-            PackageVersion(value="2.1.0", release_date=dt_local(2011, 1, 22, 13, 34, 8, 550000)),
-            PackageVersion(value="3.0.0", release_date=dt_local(2011, 11, 24, 0, 26, 2, 527000)),
-            PackageVersion(value="3.0.3", release_date=dt_local(2011, 11, 27, 13, 50, 2, 63000)),
-            PackageVersion(value="3.0.4", release_date=dt_local(2011, 12, 12, 10, 18, 33, 380000)),
-            PackageVersion(value="3.0.5", release_date=dt_local(2011, 12, 12, 12, 0, 25, 947000)),
-            PackageVersion(value="3.0.6", release_date=dt_local(2012, 1, 2, 21, 10, 43, 403000)),
-            PackageVersion(value="3.4.0", release_date=dt_local(2013, 10, 20, 13, 32, 30, 837000)),
-            PackageVersion(value="3.4.1", release_date=dt_local(2014, 1, 17, 9, 17, 43, 680000)),
-            PackageVersion(value="3.5.0-beta2", release_date=dt_local(2015, 1, 1, 14, 9, 28, 710000)),
-            PackageVersion(value="3.5.0-beta3", release_date=dt_local(2015, 1, 6, 17, 39, 25, 147000)),
-            PackageVersion(value="3.5.0", release_date=dt_local(2015, 1, 14, 2, 1, 58, 853000)),
-            PackageVersion(value="3.5.1", release_date=dt_local(2015, 1, 23, 1, 5, 44, 447000)),
+            PackageVersion(
+                value="2.1.0", release_date=dt_local(2011, 1, 22, 13, 34, 8, 550000)
+            ),
+            PackageVersion(
+                value="3.0.0", release_date=dt_local(2011, 11, 24, 0, 26, 2, 527000)
+            ),
+            PackageVersion(
+                value="3.0.3", release_date=dt_local(2011, 11, 27, 13, 50, 2, 63000)
+            ),
+            PackageVersion(
+                value="3.0.4", release_date=dt_local(2011, 12, 12, 10, 18, 33, 380000)
+            ),
+            PackageVersion(
+                value="3.0.5", release_date=dt_local(2011, 12, 12, 12, 0, 25, 947000)
+            ),
+            PackageVersion(
+                value="3.0.6", release_date=dt_local(2012, 1, 2, 21, 10, 43, 403000)
+            ),
+            PackageVersion(
+                value="3.4.0", release_date=dt_local(2013, 10, 20, 13, 32, 30, 837000)
+            ),
+            PackageVersion(
+                value="3.4.1", release_date=dt_local(2014, 1, 17, 9, 17, 43, 680000)
+            ),
+            PackageVersion(
+                value="3.5.0-beta2",
+                release_date=dt_local(2015, 1, 1, 14, 9, 28, 710000),
+            ),
+            PackageVersion(
+                value="3.5.0-beta3",
+                release_date=dt_local(2015, 1, 6, 17, 39, 25, 147000),
+            ),
+            PackageVersion(
+                value="3.5.0", release_date=dt_local(2015, 1, 14, 2, 1, 58, 853000)
+            ),
+            PackageVersion(
+                value="3.5.1", release_date=dt_local(2015, 1, 23, 1, 5, 44, 447000)
+            ),
         ]
         assert results == expected
 
-
     def test_nuget_extract_version_with_illformed_data(self):
         test_data = {"items": [{"items": [{"catalogEntry": {}}]}]}
         results = list(NugetVersionAPI.extract_versions(test_data))
         assert results == []
 
-
     @mock.patch("packagedb.package_managers.get_response")
     def test_pypi_fetch_data(self, mock_response):
         pypi_api = PypiVersionAPI()
-        with open(os.path.join(TEST_DATA, "pypi.json"), "r") as f:
+        with open(os.path.join(TEST_DATA, "pypi.json")) as f:
             mock_response.return_value = json.load(f)
 
         results = list(pypi_api.fetch("django"))
         expected = [
-            PackageVersion(value="1.1.3", release_date=dt_local(2010, 12, 23, 5, 14, 23, 509436)),
-            PackageVersion(value="1.1.4", release_date=dt_local(2011, 2, 9, 4, 13, 7, 75)),
-            PackageVersion(value="1.10", release_date=dt_local(2016, 8, 1, 18, 32, 16, 280614)),
-            PackageVersion(value="1.10.1", release_date=dt_local(2016, 9, 1, 23, 18, 18, 672706)),
-            PackageVersion(value="1.10.2", release_date=dt_local(2016, 10, 1, 20, 5, 31, 330942)),
-            PackageVersion(value="1.10.3", release_date=dt_local(2016, 11, 1, 13, 57, 16, 55061)),
-            PackageVersion(value="1.10.4", release_date=dt_local(2016, 12, 1, 23, 46, 50, 215935)),
-            PackageVersion(value="1.10.5", release_date=dt_local(2017, 1, 4, 19, 23, 0, 596664)),
-            PackageVersion(value="1.10.6", release_date=dt_local(2017, 3, 1, 13, 37, 40, 243134)),
-            PackageVersion(value="1.10.7", release_date=dt_local(2017, 4, 4, 14, 27, 54, 235551)),
-            PackageVersion(value="1.10.8", release_date=dt_local(2017, 9, 5, 15, 31, 58, 221021)),
-            PackageVersion(value="1.10a1", release_date=dt_local(2016, 5, 20, 12, 24, 59, 952686)),
-            PackageVersion(value="1.10b1", release_date=dt_local(2016, 6, 22, 1, 15, 17, 267637)),
-            PackageVersion(value="1.10rc1", release_date=dt_local(2016, 7, 18, 18, 5, 5, 503584)),
+            PackageVersion(
+                value="1.1.3", release_date=dt_local(2010, 12, 23, 5, 14, 23, 509436)
+            ),
+            PackageVersion(
+                value="1.1.4", release_date=dt_local(2011, 2, 9, 4, 13, 7, 75)
+            ),
+            PackageVersion(
+                value="1.10", release_date=dt_local(2016, 8, 1, 18, 32, 16, 280614)
+            ),
+            PackageVersion(
+                value="1.10.1", release_date=dt_local(2016, 9, 1, 23, 18, 18, 672706)
+            ),
+            PackageVersion(
+                value="1.10.2", release_date=dt_local(2016, 10, 1, 20, 5, 31, 330942)
+            ),
+            PackageVersion(
+                value="1.10.3", release_date=dt_local(2016, 11, 1, 13, 57, 16, 55061)
+            ),
+            PackageVersion(
+                value="1.10.4", release_date=dt_local(2016, 12, 1, 23, 46, 50, 215935)
+            ),
+            PackageVersion(
+                value="1.10.5", release_date=dt_local(2017, 1, 4, 19, 23, 0, 596664)
+            ),
+            PackageVersion(
+                value="1.10.6", release_date=dt_local(2017, 3, 1, 13, 37, 40, 243134)
+            ),
+            PackageVersion(
+                value="1.10.7", release_date=dt_local(2017, 4, 4, 14, 27, 54, 235551)
+            ),
+            PackageVersion(
+                value="1.10.8", release_date=dt_local(2017, 9, 5, 15, 31, 58, 221021)
+            ),
+            PackageVersion(
+                value="1.10a1", release_date=dt_local(2016, 5, 20, 12, 24, 59, 952686)
+            ),
+            PackageVersion(
+                value="1.10b1", release_date=dt_local(2016, 6, 22, 1, 15, 17, 267637)
+            ),
+            PackageVersion(
+                value="1.10rc1", release_date=dt_local(2016, 7, 18, 18, 5, 5, 503584)
+            ),
         ]
         assert results == expected
 
-
     @mock.patch("packagedb.package_managers.get_response")
     def test_pypi_fetch_with_no_release(self, mock_response):
         mock_response.return_value = {"info": {}}
         results = list(PypiVersionAPI().fetch("django"))
         assert results == []
 
-
     @mock.patch("packagedb.package_managers.get_response")
-    def test_ruby_fetch_with_no_release(self,mock_response):
-
+    def test_ruby_fetch_with_no_release(self, mock_response):
         with open(os.path.join(TEST_DATA, "gem.json")) as f:
             mock_response.return_value = json.load(f)
 
         results = list(RubyVersionAPI().fetch("rails"))
 
         expected = [
-            PackageVersion(value="7.0.2.3", release_date=dt_local(2022, 3, 8, 17, 50, 52, 496000)),
-            PackageVersion(value="7.0.2.2", release_date=dt_local(2022, 2, 11, 19, 44, 19, 17000)),
+            PackageVersion(
+                value="7.0.2.3", release_date=dt_local(2022, 3, 8, 17, 50, 52, 496000)
+            ),
+            PackageVersion(
+                value="7.0.2.2", release_date=dt_local(2022, 2, 11, 19, 44, 19, 17000)
+            ),
         ]
         assert results == expected
@@ -124,7 +184,6 @@ def test_get_version_fetcher(self):
 
 
 class TestComposerVersionAPI(TestCase):
-
     expected_versions = [
         PackageVersion(value="10.0.0", release_date=dt_local(2019, 7, 23, 7, 6, 3)),
         PackageVersion(value="10.1.0", release_date=dt_local(2019, 10, 1, 8, 18, 18)),
@@ -198,7 +257,9 @@ def test_extract_versions(self):
         with open(os.path.join(TEST_DATA, "composer.json")) as f:
             mock_response = json.load(f)
 
-        results = list(ComposerVersionAPI().extract_versions(mock_response, "typo3/cms-core"))
+        results = list(
+            ComposerVersionAPI().extract_versions(mock_response, "typo3/cms-core")
+        )
         assert results == self.expected_versions
 
     @mock.patch("packagedb.package_managers.get_response")
@@ -218,7 +279,11 @@ def test_extract_versions(self):
             mock_response = ET.parse(f)
 
         results = list(MavenVersionAPI().extract_versions(mock_response))
-        expected = [PackageVersion("1.2.2"), PackageVersion("1.2.3"), PackageVersion("1.3.0")]
+        expected = [
+            PackageVersion("1.2.2"),
+            PackageVersion("1.2.3"),
+            PackageVersion("1.3.0"),
+        ]
         assert results == expected
 
     def test_artifact_url(self):
@@ -228,7 +293,9 @@
         url1 = MavenVersionAPI.artifact_url(eg_comps1)
         url2 = MavenVersionAPI.artifact_url(eg_comps2)
 
-        assert url1 == "https://repo1.maven.org/maven2/org/apache/kafka/maven-metadata.xml"
+        assert (
+            url1 == "https://repo1.maven.org/maven2/org/apache/kafka/maven-metadata.xml"
+        )
         assert (
             url2
             == "https://repo1.maven.org/maven2/apple/msft/windows/mac/oss/exfat-ntfs/maven-metadata.xml"
@@ -259,15 +326,20 @@ def test_fetch(self, mock_response):
 
 
 class TestGoproxyVersionAPI(TestCase):
     def test_trim_go_url_path(self):
-
         url1 = "https://pkg.go.dev/github.com/containous/traefik/v2"
-        assert GoproxyVersionAPI.trim_go_url_path(url1) == "github.com/containous/traefik"
+        assert (
+            GoproxyVersionAPI.trim_go_url_path(url1) == "github.com/containous/traefik"
+        )
 
         url2 = "github.com/FerretDB/FerretDB/cmd/ferretdb"
-        assert GoproxyVersionAPI.trim_go_url_path(url2) == "github.com/FerretDB/FerretDB"
+        assert (
+            GoproxyVersionAPI.trim_go_url_path(url2) == "github.com/FerretDB/FerretDB"
+        )
 
         url3 = GoproxyVersionAPI.trim_go_url_path(url2)
-        assert GoproxyVersionAPI.trim_go_url_path(url3) == "github.com/FerretDB/FerretDB"
+        assert (
+            GoproxyVersionAPI.trim_go_url_path(url3) == "github.com/FerretDB/FerretDB"
+        )
 
     def test_escape_path(self):
         path = "github.com/FerretDB/FerretDB"
@@ -276,7 +348,10 @@
 
     @mock.patch("packagedb.package_managers.get_response")
     def test_fetch_version_info(self, mock_response):
-        mock_response.return_value = {"Version": "v0.0.5", "Time": "2022-01-04T13:54:01Z"}
+        mock_response.return_value = {
+            "Version": "v0.0.5",
+            "Time": "2022-01-04T13:54:01Z",
+        }
         result = GoproxyVersionAPI.fetch_version_info(
             "v0.0.5",
             "github.com/!ferret!d!b/!ferret!d!b",
@@ -303,11 +378,21 @@ def test_fetch(self, mock_fetcher):
 
         results = list(GoproxyVersionAPI().fetch("github.com/FerretDB/FerretDB"))
         expected = [
-            PackageVersion(value="v0.0.1", release_date=dt_local(2021, 11, 2, 6, 56, 38)),
-            PackageVersion(value="v0.0.5", release_date=dt_local(2021, 11, 13, 21, 36, 37)),
-            PackageVersion(value="v0.0.3", release_date=dt_local(2021, 11, 19, 20, 31, 22)),
-            PackageVersion(value="v0.0.4", release_date=dt_local(2021, 12, 1, 19, 2, 44)),
-            PackageVersion(value="v0.0.2", release_date=dt_local(2022, 1, 4, 13, 54, 1)),
+            PackageVersion(
+                value="v0.0.1", release_date=dt_local(2021, 11, 2, 6, 56, 38)
+            ),
+            PackageVersion(
+                value="v0.0.5", release_date=dt_local(2021, 11, 13, 21, 36, 37)
+            ),
+            PackageVersion(
+                value="v0.0.3", release_date=dt_local(2021, 11, 19, 20, 31, 22)
+            ),
+            PackageVersion(
+                value="v0.0.4", release_date=dt_local(2021, 12, 1, 19, 2, 44)
+            ),
+            PackageVersion(
+                value="v0.0.2", release_date=dt_local(2022, 1, 4, 13, 54, 1)
+            ),
         ]
         assert results == expected
@@ -323,22 +408,49 @@ def test_fetch_with_responses_are_none(self, mock_fetcher):
 
 
 class TestNugetVersionAPI(TestCase):
     expected_versions = [
-        PackageVersion(value="0.23.0", release_date=dt_local(2018, 1, 17, 9, 32, 59, 283000)),
-        PackageVersion(value="0.24.0", release_date=dt_local(2018, 3, 30, 7, 25, 18, 393000)),
-        PackageVersion(value="1.0.0", release_date=dt_local(2018, 9, 13, 8, 16, 0, 420000)),
-        PackageVersion(value="1.0.1", release_date=dt_local(2020, 1, 17, 15, 31, 41, 857000)),
-        PackageVersion(value="1.0.2", release_date=dt_local(2020, 4, 21, 12, 24, 53, 877000)),
         PackageVersion(
-            value="2.0.0-preview01", release_date=dt_local(2018, 1, 9, 17, 12, 20, 440000)
+            value="0.23.0", release_date=dt_local(2018, 1, 17, 9, 32, 59, 283000)
+        ),
+        PackageVersion(
+            value="0.24.0", release_date=dt_local(2018, 3, 30, 7, 25, 18, 393000)
+        ),
+        PackageVersion(
+            value="1.0.0", release_date=dt_local(2018, 9, 13, 8, 16, 0, 420000)
+        ),
+        PackageVersion(
+            value="1.0.1", release_date=dt_local(2020, 1, 17, 15, 31, 41, 857000)
+        ),
+        PackageVersion(
+            value="1.0.2", release_date=dt_local(2020, 4, 21, 12, 24, 53, 877000)
+        ),
+        PackageVersion(
+            value="2.0.0-preview01",
+            release_date=dt_local(2018, 1, 9, 17, 12, 20, 440000),
+        ),
+        PackageVersion(
+            value="2.0.0", release_date=dt_local(2018, 9, 27, 13, 33, 15, 370000)
+        ),
+        PackageVersion(
+            value="2.1.0", release_date=dt_local(2018, 10, 16, 6, 59, 44, 680000)
+        ),
+        PackageVersion(
+            value="2.2.0", release_date=dt_local(2018, 11, 23, 8, 13, 8, 3000)
+        ),
+        PackageVersion(
+            value="2.3.0", release_date=dt_local(2019, 6, 27, 14, 27, 31, 613000)
+        ),
+        PackageVersion(
+            value="2.4.0", release_date=dt_local(2020, 1, 17, 15, 11, 5, 810000)
+        ),
+        PackageVersion(
+            value="2.5.0", release_date=dt_local(2020, 3, 24, 14, 22, 39, 960000)
+        ),
+        PackageVersion(
+            value="2.6.0", release_date=dt_local(2020, 3, 27, 11, 6, 27, 500000)
+        ),
+        PackageVersion(
+            value="2.7.0", release_date=dt_local(2020, 4, 21, 12, 27, 36, 427000)
         ),
-        PackageVersion(value="2.0.0", release_date=dt_local(2018, 9, 27, 13, 33, 15, 370000)),
-        PackageVersion(value="2.1.0", release_date=dt_local(2018, 10, 16, 6, 59, 44, 680000)),
-        PackageVersion(value="2.2.0", release_date=dt_local(2018, 11, 23, 8, 13, 8, 3000)),
-        PackageVersion(value="2.3.0", release_date=dt_local(2019, 6, 27, 14, 27, 31, 613000)),
-        PackageVersion(value="2.4.0", release_date=dt_local(2020, 1, 17, 15, 11, 5, 810000)),
-        PackageVersion(value="2.5.0", release_date=dt_local(2020, 3, 24, 14, 22, 39, 960000)),
-        PackageVersion(value="2.6.0", release_date=dt_local(2020, 3, 27, 11, 6, 27, 500000)),
-        PackageVersion(value="2.7.0", release_date=dt_local(2020, 4, 21, 12, 27, 36, 427000)),
     ]
 
     def test_extract_versions(self):
diff --git a/packagedb/tests/test_schedules.py b/packagedb/tests/test_schedules.py
index c2c5ae02..2965fa60 100644
--- a/packagedb/tests/test_schedules.py
+++ b/packagedb/tests/test_schedules.py
@@ -39,7 +39,5 @@ def test_get_next_execution():
 
     with patch("datetime.datetime", wraps=datetime.datetime) as dt:
         dt.now.return_value = time_now
-        assert expected1 == get_next_execution(
-            watch_interval_days1, last_watch_date1)
-        assert expected2 == get_next_execution(
-            watch_interval_days2, last_watch_date2)
+        assert expected1 == get_next_execution(watch_interval_days1, last_watch_date1)
+        assert expected2 == get_next_execution(watch_interval_days2, last_watch_date2)
diff --git a/packagedb/tests/test_tasks.py b/packagedb/tests/test_tasks.py
index 851a36e6..6677e2f1 100644
--- a/packagedb/tests/test_tasks.py
+++ b/packagedb/tests/test_tasks.py
@@ -10,6 +10,7 @@
 from unittest.mock import patch
 
 from django.test import TestCase
+
 from fetchcode.package_versions import PackageVersion
 
 from minecode.models import PriorityResourceURI
diff --git a/packagedb/tests/test_throttling.py b/packagedb/tests/test_throttling.py
index f21a26d8..d17557fe 100644
--- a/packagedb/tests/test_throttling.py
+++ b/packagedb/tests/test_throttling.py
@@ -7,22 +7,23 @@
 # See https://aboutcode.org for more information about nexB OSS projects.
 #
# -from rest_framework.test import APIClient -from rest_framework.test import APITestCase from unittest.mock import patch from django.contrib.auth.models import User +from rest_framework.test import APIClient +from rest_framework.test import APITestCase + -@patch('rest_framework.throttling.UserRateThrottle.get_rate', lambda x: '20/day') -@patch('rest_framework.throttling.AnonRateThrottle.get_rate', lambda x: '10/day') +@patch("rest_framework.throttling.UserRateThrottle.get_rate", lambda x: "20/day") +@patch("rest_framework.throttling.AnonRateThrottle.get_rate", lambda x: "10/day") class ThrottleApiTests(APITestCase): def setUp(self): # create a basic user self.user = User.objects.create_user( username="username", email="e@mail.com", - password="secret" + password="secret", # NOQA ) self.auth = f"Token {self.user.auth_token.key}" self.csrf_client = APIClient(enforce_csrf_checks=True) @@ -32,8 +33,8 @@ def setUp(self): self.staff_user = User.objects.create_user( username="staff_username", email="staff_e@mail.com", - password="secret", - is_staff=True + password="secret", # NOQA + is_staff=True, ) self.staff_auth = f"Token {self.staff_user.auth_token.key}" self.staff_csrf_client = APIClient(enforce_csrf_checks=True) @@ -43,36 +44,36 @@ def setUp(self): def test_package_endpoint_throttling(self): for i in range(0, 20): - response = self.csrf_client.get('/api/packages/') + response = self.csrf_client.get("/api/packages/") self.assertEqual(response.status_code, 200) - response = self.staff_csrf_client.get('/api/packages/') + response = self.staff_csrf_client.get("/api/packages/") self.assertEqual(response.status_code, 200) - response = self.csrf_client.get('/api/packages/') + response = self.csrf_client.get("/api/packages/") # 429 - too many requests for basic user self.assertEqual(response.status_code, 429) - response = self.staff_csrf_client.get('/api/packages/', format='json') + response = self.staff_csrf_client.get("/api/packages/", format="json") # 200 - staff user can access API unlimited times self.assertEqual(response.status_code, 200) # A anonymous user can only access /packages endpoint 10 times a day for i in range(0, 10): - response = self.csrf_client_anon.get('/api/packages/') + response = self.csrf_client_anon.get("/api/packages/") self.assertEqual(response.status_code, 200) - response = self.csrf_client_anon.get('/api/packages/') + response = self.csrf_client_anon.get("/api/packages/") # 429 - too many requests for anon user self.assertEqual(response.status_code, 429) self.assertEqual( - response.data.get('message'), - 'Your request has been throttled. Please contact support@nexb.com', + response.data.get("message"), + "Your request has been throttled. Please contact support@nexb.com", ) - response = self.csrf_client_anon.get('/api/resources/') + response = self.csrf_client_anon.get("/api/resources/") # 429 - too many requests for anon user self.assertEqual(response.status_code, 429) self.assertEqual( - response.data.get('message'), - 'Your request has been throttled. Please contact support@nexb.com', + response.data.get("message"), + "Your request has been throttled. 
Please contact support@nexb.com", ) diff --git a/packagedb/tests/test_views.py b/packagedb/tests/test_views.py index 227e3885..2a0ae9b7 100644 --- a/packagedb/tests/test_views.py +++ b/packagedb/tests/test_views.py @@ -12,10 +12,10 @@ class TestViews(TestCase): def test_robots_txt(self): - response = self.client.get('/robots.txt') + response = self.client.get("/robots.txt") assert response.status_code == 200 - assert response['content-type'] == 'text/plain' - assert response.content == b'User-agent: *\nDisallow: *\n' + assert response["content-type"] == "text/plain" + assert response.content == b"User-agent: *\nDisallow: *\n" response = self.client.post("/robots.txt") assert response.status_code == 405 diff --git a/packagedb/tests/testfiles/api/twill-core-0.12.0.json b/packagedb/tests/testfiles/api/twill-core-0.12.0.json index 0c1c60fa..39b13fc5 100644 --- a/packagedb/tests/testfiles/api/twill-core-0.12.0.json +++ b/packagedb/tests/testfiles/api/twill-core-0.12.0.json @@ -1,184 +1,370 @@ -{ - "filename":"twill-core-0.12.0.jar", - "package_content":"binary", - "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0", - "type":"maven", - "namespace":"org.apache.twill", - "name":"twill-core", - "version":"0.12.0", - "qualifiers":"", - "subpath":"", - "primary_language":"Java", - "description":"Apache Twill core library", - "release_date":null, - "parties":[ - { - "type":"organization", - "role":"owner", - "name":"The Apache Software Foundation", - "email":null, - "url":"http://www.apache.org/" - } - ], - "keywords":[], - "homepage_url":"http://www.apache.org/", - "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar", - "bug_tracking_url":null, - "code_view_url":null, - "vcs_url":null, - "repository_homepage_url":null, - "repository_download_url":null, - "api_data_url":null, - "size":null, - "md5":null, - "sha1":"252cc5e60690d611a9981d1b3fabeb0d3a7e8a28", - "sha256":null, - "sha512":null, - "copyright":null, - "holder":null, - "declared_license_expression":"apache-2.0", - "declared_license_expression_spdx":"Apache-2.0", - "license_detections":[], - "other_license_expression":null, - "other_license_expression_spdx":null, - "other_license_detections":[], - "extracted_license_statement":null, - "notice_text":null, - "source_packages":[ - "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources" - ], - "extra_data":{}, - "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?uuid=fixed-uid-done-for-testing-5642512d1758", - "datasource_id":null, - "file_references":[], - "dependencies":[ - { - "purl":"pkg:maven/org.apache.twill/twill-api@0.12.0", - "extracted_requirement":"0.12.0", - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":true - }, - { - "purl":"pkg:maven/org.apache.twill/twill-zookeeper@0.12.0", - "extracted_requirement":"0.12.0", - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":true - }, - { - "purl":"pkg:maven/org.apache.twill/twill-discovery-core@0.12.0", - "extracted_requirement":"0.12.0", - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":true - }, - { - "purl":"pkg:maven/com.google.guava/guava", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/com.google.code.gson/gson", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/io.netty/netty", - 
"extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.xerial.snappy/snappy-java", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.ow2.asm/asm-all", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.slf4j/slf4j-api", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/ch.qos.logback/logback-core", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/ch.qos.logback/logback-classic", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.apache.kafka/kafka_2.10", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/net.sf.jopt-simple/jopt-simple", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/junit/junit", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.unitils/unitils-core", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - }, - { - "purl":"pkg:maven/org.apache.commons/commons-compress", - "extracted_requirement":null, - "scope":"compile", - "is_runtime":false, - "is_optional":true, - "is_resolved":false - } - ] -} \ No newline at end of file +[ + { + "filename":"twill-core-0.12.0-sources.jar", + "package_content":"source_archive", + "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources", + "type":"maven", + "namespace":"org.apache.twill", + "name":"twill-core", + "version":"0.12.0", + "qualifiers":"classifier=sources", + "subpath":"", + "primary_language":"Java", + "description":"Apache Twill core library", + "release_date":null, + "parties":[ + { + "type":"organization", + "role":"owner", + "name":"The Apache Software Foundation", + "email":null, + "url":"http://www.apache.org/" + } + ], + "keywords":[], + "homepage_url":"http://www.apache.org/", + "download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0-sources.jar", + "bug_tracking_url":null, + "code_view_url":null, + "vcs_url":null, + "repository_homepage_url":null, + "repository_download_url":null, + "api_data_url":null, + "size":null, + "md5":null, + "sha1":"dfbe61539b44213f389ff7d9a7745173d114b6df", + "sha256":null, + "sha512":null, + "copyright":null, + "holder":null, + "declared_license_expression":"apache-2.0", + "declared_license_expression_spdx":"Apache-2.0", + "license_detections":[], + "other_license_expression":null, + "other_license_expression_spdx":null, + "other_license_detections":[], + "extracted_license_statement":null, + "notice_text":null, + "source_packages":[ + "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources" + ], + "extra_data":{}, + "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources&uuid=fixed-uid-done-for-testing-5642512d1758", + "datasource_id":null, + 
"file_references":[], + "dependencies":[ + { + "purl":"pkg:maven/org.apache.twill/twill-api@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/org.apache.twill/twill-zookeeper@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/org.apache.twill/twill-discovery-core@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/com.google.guava/guava", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/com.google.code.gson/gson", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/io.netty/netty", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.xerial.snappy/snappy-java", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.ow2.asm/asm-all", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.slf4j/slf4j-api", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/ch.qos.logback/logback-core", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/ch.qos.logback/logback-classic", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.apache.kafka/kafka_2.10", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/net.sf.jopt-simple/jopt-simple", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/junit/junit", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.unitils/unitils-core", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.apache.commons/commons-compress", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + } + ] + }, + { + "filename":"twill-core-0.12.0.jar", + "package_content":"binary", + "purl":"pkg:maven/org.apache.twill/twill-core@0.12.0", + "type":"maven", + "namespace":"org.apache.twill", + "name":"twill-core", + "version":"0.12.0", + "qualifiers":"", + "subpath":"", + "primary_language":"Java", + "description":"Apache Twill core library", + "release_date":null, + "parties":[ + { + "type":"organization", + "role":"owner", + "name":"The Apache Software Foundation", + "email":null, + "url":"http://www.apache.org/" + } + ], + "keywords":[], + "homepage_url":"http://www.apache.org/", + 
"download_url":"https://repo1.maven.org/maven2/org/apache/twill/twill-core/0.12.0/twill-core-0.12.0.jar", + "bug_tracking_url":null, + "code_view_url":null, + "vcs_url":null, + "repository_homepage_url":null, + "repository_download_url":null, + "api_data_url":null, + "size":null, + "md5":null, + "sha1":"252cc5e60690d611a9981d1b3fabeb0d3a7e8a28", + "sha256":null, + "sha512":null, + "copyright":null, + "holder":null, + "declared_license_expression":"apache-2.0", + "declared_license_expression_spdx":"Apache-2.0", + "license_detections":[], + "other_license_expression":null, + "other_license_expression_spdx":null, + "other_license_detections":[], + "extracted_license_statement":null, + "notice_text":null, + "source_packages":[ + "pkg:maven/org.apache.twill/twill-core@0.12.0?classifier=sources" + ], + "extra_data":{}, + "package_uid":"pkg:maven/org.apache.twill/twill-core@0.12.0?uuid=fixed-uid-done-for-testing-5642512d1758", + "datasource_id":null, + "file_references":[], + "dependencies":[ + { + "purl":"pkg:maven/org.apache.twill/twill-api@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/org.apache.twill/twill-zookeeper@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/org.apache.twill/twill-discovery-core@0.12.0", + "extracted_requirement":"0.12.0", + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":true + }, + { + "purl":"pkg:maven/com.google.guava/guava", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/com.google.code.gson/gson", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/io.netty/netty", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.xerial.snappy/snappy-java", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.ow2.asm/asm-all", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.slf4j/slf4j-api", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/ch.qos.logback/logback-core", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/ch.qos.logback/logback-classic", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.apache.kafka/kafka_2.10", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/net.sf.jopt-simple/jopt-simple", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/junit/junit", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.unitils/unitils-core", + 
"extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + }, + { + "purl":"pkg:maven/org.apache.commons/commons-compress", + "extracted_requirement":null, + "scope":"compile", + "is_runtime":false, + "is_optional":true, + "is_resolved":false + } + ] + } +] \ No newline at end of file diff --git a/packagedb/throttling.py b/packagedb/throttling.py index 6f310043..ac1dee95 100644 --- a/packagedb/throttling.py +++ b/packagedb/throttling.py @@ -13,9 +13,7 @@ class StaffUserRateThrottle(UserRateThrottle): def allow_request(self, request, view): - """ - Do not apply throttling for superusers and admins. - """ + """Do not apply throttling for superusers and admins.""" if request.user.is_superuser or request.user.is_staff: return True @@ -23,10 +21,7 @@ def allow_request(self, request, view): def throttled_exception_handler(exception, context): - """ - Return this response whenever a request has been throttled - """ - + """Return this response whenever a request has been throttled""" response = exception_handler(exception, context) if isinstance(exception, Throttled): diff --git a/packagedb/to_purl.py b/packagedb/to_purl.py index 0fd03afa..5066b23a 100644 --- a/packagedb/to_purl.py +++ b/packagedb/to_purl.py @@ -21,8 +21,7 @@ @extend_schema( parameters=[ - OpenApiParameter("go_package", str, "query", - description="go import package"), + OpenApiParameter("go_package", str, "query", description="go import package"), ], responses={200: GoLangPurlResponseSerializer()}, ) @@ -59,7 +58,7 @@ def list(self, request): go_import = validated_data.get("go_package") try: purl = get_golang_purl(go_import) - except: + except Exception: return Response( {"errors": "`@` is not supported either in import or go.mod string"}, status=status.HTTP_400_BAD_REQUEST, diff --git a/purl2vcs/src/purl2vcs/find_source_repo.py b/purl2vcs/src/purl2vcs/find_source_repo.py index a3bd708c..e3608d3f 100644 --- a/purl2vcs/src/purl2vcs/find_source_repo.py +++ b/purl2vcs/src/purl2vcs/find_source_repo.py @@ -19,7 +19,7 @@ from scancode.api import get_urls as get_urls_from_location from minecode.model_utils import add_package_to_scan_queue -from minecode.visitors.maven import get_merged_ancestor_package_from_maven_package +from minecode.collectors.maven import get_merged_ancestor_package_from_maven_package from packagedb.models import Package, PackageContentType, PackageSet logger = logging.getLogger(__name__) @@ -147,7 +147,7 @@ def get_source_package_and_add_to_package_set(package): download_url = get_download_url(str(source_purl)) if not download_url: return - except: + except Exception: logger.error(f"Error getting download_url for {source_purl}") return diff --git a/purldb_project/__init__.py b/purldb_project/__init__.py index fd15ad83..8e16890b 100644 --- a/purldb_project/__init__.py +++ b/purldb_project/__init__.py @@ -14,8 +14,8 @@ def command_line(): - '''Command line entry point.''' + """Command line entry point.""" from django.core.management import execute_from_command_line - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'purldb_project.settings') + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_project.settings") execute_from_command_line(sys.argv) diff --git a/purldb_project/settings.py b/purldb_project/settings.py index ff8cf620..aacea674 100644 --- a/purldb_project/settings.py +++ b/purldb_project/settings.py @@ -11,6 +11,7 @@ from pathlib import Path import environ + from purldb_project import __version__ PURLDB_VERSION = __version__ @@ -32,15 +33,12 
@@ SECRET_KEY = env.str("SECRET_KEY") -ALLOWED_HOSTS = env.list("ALLOWED_HOSTS", default=[ - ".localhost", "127.0.0.1", "[::1]"]) +ALLOWED_HOSTS = env.list("ALLOWED_HOSTS", default=[".localhost", "127.0.0.1", "[::1]"]) # SECURITY WARNING: do not run with debug turned on in production DEBUG = env.bool("PURLDB_DEBUG", default=False) -PURLDB_REQUIRE_AUTHENTICATION = env.bool( - "PURLDB_REQUIRE_AUTHENTICATION", default=False -) +PURLDB_REQUIRE_AUTHENTICATION = env.bool("PURLDB_REQUIRE_AUTHENTICATION", default=False) # SECURITY WARNING: do not run with debug turned on in production DEBUG_TOOLBAR = env.bool("PURLDB_DEBUG_TOOLBAR", default=False) @@ -62,38 +60,38 @@ INSTALLED_APPS = ( # Local apps # Must come before Third-party apps for proper templates override - 'clearcode', - 'clearindex', - 'minecode', - 'matchcode', - 'packagedb', + "clearcode", + "clearindex", + "minecode", + "matchcode", + "packagedb", # Django built-in "django.contrib.auth", - 'django.contrib.contenttypes', - 'django.contrib.sessions', - 'django.contrib.messages', - 'django.contrib.staticfiles', - 'django.contrib.admin', + "django.contrib.contenttypes", + "django.contrib.sessions", + "django.contrib.messages", + "django.contrib.staticfiles", + "django.contrib.admin", "django.contrib.humanize", # Third-party apps - 'django_filters', - 'rest_framework', - 'drf_spectacular', - 'rest_framework.authtoken', - 'django_rq', + "django_filters", + "rest_framework", + "drf_spectacular", + "rest_framework.authtoken", + "django_rq", ) MIDDLEWARE = ( "django.middleware.security.SecurityMiddleware", - 'django.contrib.sessions.middleware.SessionMiddleware', - 'django.middleware.common.CommonMiddleware', - 'django.middleware.csrf.CsrfViewMiddleware', - 'django.contrib.auth.middleware.AuthenticationMiddleware', - 'django.contrib.messages.middleware.MessageMiddleware', - 'django.middleware.clickjacking.XFrameOptionsMiddleware', + "django.contrib.sessions.middleware.SessionMiddleware", + "django.middleware.common.CommonMiddleware", + "django.middleware.csrf.CsrfViewMiddleware", + "django.contrib.auth.middleware.AuthenticationMiddleware", + "django.contrib.messages.middleware.MessageMiddleware", + "django.middleware.clickjacking.XFrameOptionsMiddleware", ) -ROOT_URLCONF = 'purldb_project.urls' +ROOT_URLCONF = "purldb_project.urls" WSGI_APPLICATION = "purldb_project.wsgi.application" @@ -103,20 +101,18 @@ # API -DATA_UPLOAD_MAX_NUMBER_FIELDS = env.int( - "DATA_UPLOAD_MAX_NUMBER_FIELDS", default=2048 -) +DATA_UPLOAD_MAX_NUMBER_FIELDS = env.int("DATA_UPLOAD_MAX_NUMBER_FIELDS", default=2048) # Database DATABASES = { - 'default': { - 'ENGINE': env.str('PACKAGEDB_DB_ENGINE', 'django.db.backends.postgresql'), - 'HOST': env.str('PACKAGEDB_DB_HOST', 'localhost'), - 'NAME': env.str('PACKAGEDB_DB_NAME', 'packagedb'), - 'USER': env.str('PACKAGEDB_DB_USER', 'packagedb'), - 'PASSWORD': env.str('PACKAGEDB_DB_PASSWORD', 'packagedb'), - 'PORT': env.str('PACKAGEDB_DB_PORT', '5432'), - 'ATOMIC_REQUESTS': True, + "default": { + "ENGINE": env.str("PACKAGEDB_DB_ENGINE", "django.db.backends.postgresql"), + "HOST": env.str("PACKAGEDB_DB_HOST", "localhost"), + "NAME": env.str("PACKAGEDB_DB_NAME", "packagedb"), + "USER": env.str("PACKAGEDB_DB_USER", "packagedb"), + "PASSWORD": env.str("PACKAGEDB_DB_PASSWORD", "packagedb"), + "PORT": env.str("PACKAGEDB_DB_PORT", "5432"), + "ATOMIC_REQUESTS": True, } } @@ -126,15 +122,15 @@ TEMPLATES = [ { - 'BACKEND': 'django.template.backends.django.DjangoTemplates', + "BACKEND": "django.template.backends.django.DjangoTemplates", 
"DIRS": [str(PROJECT_DIR.joinpath("templates"))], "APP_DIRS": True, - 'OPTIONS': { + "OPTIONS": { "debug": DEBUG, - 'context_processors': [ - 'django.contrib.auth.context_processors.auth', - 'django.contrib.messages.context_processors.messages', - 'django.template.context_processors.request', + "context_processors": [ + "django.contrib.auth.context_processors.auth", + "django.contrib.messages.context_processors.messages", + "django.template.context_processors.request", "django.template.context_processors.static", ], }, @@ -174,8 +170,8 @@ # Cache CACHES = { - 'default': { - 'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', + "default": { + "BACKEND": "django.core.cache.backends.locmem.LocMemCache", "LOCATION": "default", } } @@ -229,42 +225,43 @@ # Static files (CSS, JavaScript, Images) -STATIC_URL = '/static/' +STATIC_URL = "/static/" -STATIC_ROOT = '/var/purldb/static/' +STATIC_ROOT = "/var/purldb/static/" STATICFILES_DIRS = [ - PROJECT_DIR / 'static', + PROJECT_DIR / "static", ] # Third-party apps # Django restframework -REST_FRAMEWORK_DEFAULT_THROTTLE_RATES = { - 'anon': '3600/hour', 'user': '10800/hour'} +REST_FRAMEWORK_DEFAULT_THROTTLE_RATES = {"anon": "3600/hour", "user": "10800/hour"} REST_FRAMEWORK = { - 'DEFAULT_AUTHENTICATION_CLASSES': ('rest_framework.authentication.TokenAuthentication',), - 'DEFAULT_PERMISSION_CLASSES': ('rest_framework.permissions.IsAuthenticated',), - 'DEFAULT_RENDERER_CLASSES': ( - 'rest_framework.renderers.JSONRenderer', - 'rest_framework.renderers.BrowsableAPIRenderer', - 'rest_framework.renderers.AdminRenderer', + "DEFAULT_AUTHENTICATION_CLASSES": ( + "rest_framework.authentication.TokenAuthentication", + ), + "DEFAULT_PERMISSION_CLASSES": ("rest_framework.permissions.IsAuthenticated",), + "DEFAULT_RENDERER_CLASSES": ( + "rest_framework.renderers.JSONRenderer", + "rest_framework.renderers.BrowsableAPIRenderer", + "rest_framework.renderers.AdminRenderer", ), - 'DEFAULT_FILTER_BACKENDS': ( - 'django_filters.rest_framework.DjangoFilterBackend', - 'rest_framework.filters.SearchFilter', + "DEFAULT_FILTER_BACKENDS": ( + "django_filters.rest_framework.DjangoFilterBackend", + "rest_framework.filters.SearchFilter", ), - 'DEFAULT_THROTTLE_CLASSES': [ - 'packagedb.throttling.StaffUserRateThrottle', - 'rest_framework.throttling.AnonRateThrottle', - 'rest_framework.throttling.UserRateThrottle', + "DEFAULT_THROTTLE_CLASSES": [ + "packagedb.throttling.StaffUserRateThrottle", + "rest_framework.throttling.AnonRateThrottle", + "rest_framework.throttling.UserRateThrottle", ], - 'DEFAULT_THROTTLE_RATES': REST_FRAMEWORK_DEFAULT_THROTTLE_RATES, - 'EXCEPTION_HANDLER': 'packagedb.throttling.throttled_exception_handler', - 'DEFAULT_PAGINATION_CLASS': 'packagedb.api_custom.PageSizePagination', - 'DEFAULT_SCHEMA_CLASS': 'drf_spectacular.openapi.AutoSchema', + "DEFAULT_THROTTLE_RATES": REST_FRAMEWORK_DEFAULT_THROTTLE_RATES, + "EXCEPTION_HANDLER": "packagedb.throttling.throttled_exception_handler", + "DEFAULT_PAGINATION_CLASS": "packagedb.api_custom.PageSizePagination", + "DEFAULT_SCHEMA_CLASS": "drf_spectacular.openapi.AutoSchema", # Limit the load on the Database returning a small number of records by default. 
https://github.com/aboutcode-org/vulnerablecode/issues/819 "PAGE_SIZE": 20, } @@ -302,18 +299,18 @@ # Active seeders: each active seeder class need to be added explicitly here ACTIVE_SEEDERS = [ - 'minecode.visitors.maven.MavenSeed', + "minecode.miners.maven.MavenSeed", ] SPECTACULAR_SETTINGS = { - 'TITLE': 'PurlDB API', - 'DESCRIPTION': 'Tools to create and expose a database of purls (Package URLs)', - 'VERSION': PURLDB_VERSION, - 'SERVE_INCLUDE_SCHEMA': False, + "TITLE": "PurlDB API", + "DESCRIPTION": "Tools to create and expose a database of purls (Package URLs)", + "VERSION": PURLDB_VERSION, + "SERVE_INCLUDE_SCHEMA": False, } RQ_QUEUES = { - 'default': { + "default": { "HOST": env.str("PURLDB_REDIS_HOST", default="localhost"), "PORT": env.str("PURLDB_REDIS_PORT", default="6379"), "PASSWORD": env.str("PURLDB_REDIS_PASSWORD", default=""), diff --git a/purldb_project/urls.py b/purldb_project/urls.py index 36e17371..c3e8fab5 100644 --- a/purldb_project/urls.py +++ b/purldb_project/urls.py @@ -11,6 +11,7 @@ from django.urls import path from django.views.generic import RedirectView from django.views.generic.base import TemplateView + from drf_spectacular.views import SpectacularAPIView from drf_spectacular.views import SpectacularSwaggerView from rest_framework import routers @@ -30,34 +31,40 @@ from packagedb.to_purl import api_to_purl_router api_router = routers.DefaultRouter() -api_router.register('packages', PackageViewSet) -api_router.register('update_packages', PackageUpdateSet, 'update_packages') -api_router.register('package_sets', PackageSetViewSet) -api_router.register('resources', ResourceViewSet) -api_router.register('validate', PurlValidateViewSet, 'validate') -api_router.register('collect', CollectViewSet, 'collect') -api_router.register('watch', PackageWatchViewSet) -api_router.register('scan_queue', ScannableURIViewSet) -api_router.register('approximate_directory_content_index', - ApproximateDirectoryContentIndexViewSet) -api_router.register('approximate_directory_structure_index', - ApproximateDirectoryStructureIndexViewSet) +api_router.register("packages", PackageViewSet) +api_router.register("update_packages", PackageUpdateSet, "update_packages") +api_router.register("package_sets", PackageSetViewSet) +api_router.register("resources", ResourceViewSet) +api_router.register("validate", PurlValidateViewSet, "validate") +api_router.register("collect", CollectViewSet, "collect") +api_router.register("watch", PackageWatchViewSet) +api_router.register("scan_queue", ScannableURIViewSet) +api_router.register( + "approximate_directory_content_index", ApproximateDirectoryContentIndexViewSet +) +api_router.register( + "approximate_directory_structure_index", ApproximateDirectoryStructureIndexViewSet +) urlpatterns = [ path( - 'robots.txt', - TemplateView.as_view(template_name='robots.txt', - content_type='text/plain'), + "robots.txt", + TemplateView.as_view(template_name="robots.txt", content_type="text/plain"), ), - path('api/', include((api_router.urls, 'api'))), - path('api/to_purl/', include((api_to_purl_router.urls, 'api_to'))), - path('api/from_purl/', include((api_from_purl_router.urls, 'api_from'))), + path("api/", include((api_router.urls, "api"))), + path("api/to_purl/", include((api_to_purl_router.urls, "api_to"))), + path("api/from_purl/", include((api_from_purl_router.urls, "api_from"))), path("", RedirectView.as_view(url="api/")), - path('api/schema/', SpectacularAPIView.as_view(), name='schema'), - path('api/docs/', SpectacularSwaggerView.as_view(url_name='schema'), - 
name='swagger-ui'),
+    path("api/schema/", SpectacularAPIView.as_view(), name="schema"),
+    path(
+        "api/docs/",
+        SpectacularSwaggerView.as_view(url_name="schema"),
+        name="swagger-ui",
+    ),
     path(
-        'api/scan_queue/index_package_scan//', index_package_scan, name='index_package_scan'
+        "api/scan_queue/index_package_scan//",
+        index_package_scan,
+        name="index_package_scan",
     ),
 ]
diff --git a/purldb_project/wsgi.py b/purldb_project/wsgi.py
index 485a66e1..bfb452a3 100644
--- a/purldb_project/wsgi.py
+++ b/purldb_project/wsgi.py
@@ -8,8 +8,8 @@
 #
 
 import os
-from django.core.wsgi import get_wsgi_application
+from django.core.wsgi import get_wsgi_application
 
 """
 WSGI config for purldb.
@@ -18,6 +18,6 @@
 """
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'purldb_project.settings')
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_project.settings")
 
 application = get_wsgi_application()
diff --git a/purldb_public_project/__init__.py b/purldb_public_project/__init__.py
index adb81bf9..a77c6343 100644
--- a/purldb_public_project/__init__.py
+++ b/purldb_public_project/__init__.py
@@ -12,9 +12,8 @@
 
 
 def command_line():
-    '''Command line entry point.'''
+    """Command line entry point."""
    from django.core.management import execute_from_command_line
 
-    os.environ.setdefault('DJANGO_SETTINGS_MODULE',
-                          'purldb_public_project.settings')
+    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_public_project.settings")
     execute_from_command_line(sys.argv)
diff --git a/purldb_public_project/settings.py b/purldb_public_project/settings.py
index 5e6d5bee..f990759e 100644
--- a/purldb_public_project/settings.py
+++ b/purldb_public_project/settings.py
@@ -9,7 +9,6 @@
 
 from purldb_project.settings import *
 
-ROOT_URLCONF = 'purldb_public_project.urls'
-
-WSGI_APPLICATION = 'purldb_public_project.wsgi.application'
+ROOT_URLCONF = "purldb_public_project.urls"
+WSGI_APPLICATION = "purldb_public_project.wsgi.application"
diff --git a/purldb_public_project/urls.py b/purldb_public_project/urls.py
index 14e5546a..fddb594b 100644
--- a/purldb_public_project/urls.py
+++ b/purldb_public_project/urls.py
@@ -12,30 +12,31 @@
 from django.views.generic import RedirectView
 from django.views.generic.base import TemplateView
 
+from drf_spectacular.views import SpectacularAPIView
+from drf_spectacular.views import SpectacularSwaggerView
 from rest_framework import routers
 
 from packagedb.api import PackagePublicViewSet
 from packagedb.api import PurlValidateViewSet
 from packagedb.api import ResourceViewSet
 
-from drf_spectacular.views import SpectacularAPIView
-from drf_spectacular.views import SpectacularSwaggerView
-
 api_router = routers.DefaultRouter()
-api_router.register('packages', PackagePublicViewSet)
-api_router.register('resources', ResourceViewSet)
-api_router.register('validate', PurlValidateViewSet, 'validate')
+api_router.register("packages", PackagePublicViewSet)
+api_router.register("resources", ResourceViewSet)
+api_router.register("validate", PurlValidateViewSet, "validate")
 
 urlpatterns = [
     path(
-        'robots.txt',
-        TemplateView.as_view(template_name='robots.txt',
-                             content_type='text/plain'),
+        "robots.txt",
+        TemplateView.as_view(template_name="robots.txt", content_type="text/plain"),
+    ),
+    path("api/", include((api_router.urls, "api"))),
+    path("", RedirectView.as_view(url="api/")),
+    path("api/schema/", SpectacularAPIView.as_view(), name="schema"),
+    path(
+        "api/docs/",
+        SpectacularSwaggerView.as_view(url_name="schema"),
+        name="swagger-ui",
     ),
-    path('api/', include((api_router.urls, 'api'))),
-    path('',
RedirectView.as_view(url='api/')),
-    path('api/schema/', SpectacularAPIView.as_view(), name='schema'),
-    path('api/docs/', SpectacularSwaggerView.as_view(url_name='schema'),
-         name='swagger-ui'),
 ]
diff --git a/purldb_public_project/wsgi.py b/purldb_public_project/wsgi.py
index f1448671..242c50a8 100644
--- a/purldb_public_project/wsgi.py
+++ b/purldb_public_project/wsgi.py
@@ -8,8 +8,8 @@
 #
 
 import os
-from django.core.wsgi import get_wsgi_application
+from django.core.wsgi import get_wsgi_application
 
 """
 WSGI config for purldb-public.
@@ -18,7 +18,6 @@
 """
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE',
-                      'purldb_public_project.settings')
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "purldb_public_project.settings")
 
 application = get_wsgi_application()
diff --git a/pyproject.toml b/pyproject.toml
index cde79074..81017f08 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,3 +50,62 @@ addopts = [
     "--strict-markers",
     "--doctest-modules"
 ]
+
+[tool.ruff]
+line-length = 88
+extend-exclude = ["migrations", "var"]
+target-version = "py310"
+
+[tool.ruff.lint]
+# Rules: https://docs.astral.sh/ruff/rules/
+select = [
+    "E",  # pycodestyle
+    "W",  # pycodestyle warnings
+    "D",  # pydocstyle
+    "F",  # Pyflakes
+    "UP", # pyupgrade
+    "S",  # flake8-bandit
+    "I",  # isort
+    "C9", # McCabe complexity
+]
+ignore = [
+    "D1",
+    "D203", # one-blank-line-before-class
+    "D205", # blank-line-after-summary
+    "D212", # multi-line-summary-first-line
+    "D400", # ends-in-period
+    "D415", # ends-in-punctuation
+    # TODO: we want to address these issues in the codebase, then get rid of
+    # the following ignores
+    "C901", # complex-structure
+    "E501", # line-too-long
+    "S101", # assert
+    "S103", # bad-file-permissions
+    "S113", # request-without-timeout
+    "S202", # tarfile-unsafe-members
+    "S314", # suspicious-xml-element-tree-usage
+    "S320", # suspicious-xmle-tree-usage
+    "S324", # hashlib-insecure-hash-function
+    "S506", # unsafe-yaml-load
+    "S602", # subprocess-popen-with-shell-equals-true
+]
+
+[tool.ruff.lint.isort]
+force-single-line = true
+sections = { django = ["django"] }
+section-order = [
+    "future",
+    "standard-library",
+    "django",
+    "third-party",
+    "first-party",
+    "local-folder",
+]
+
+[tool.ruff.lint.mccabe]
+max-complexity = 10
+
+[tool.ruff.lint.per-file-ignores]
+"**/testfiles/**.py" = ["F821"] # Ignore undefined names in test files
+"matchcode_project/settings.py" = ["F403", "F405"] # Ignore star imports and undefined names from star imports
+"purldb_public_project/settings.py" = ["F403", "F405"] # Ignore star imports and undefined names from star imports
diff --git a/setup.cfg b/setup.cfg
index 87505db8..898d1042 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -80,6 +80,7 @@ testing =
     black
     mock
     flot
+    ruff
 
 docs =
     Sphinx>=5.0.2
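Note (not part of the patch): a minimal, illustrative sketch of the import layout that the [tool.ruff.lint.isort] settings above enforce. force-single-line = true puts each imported name on its own line, and the custom "django" section orders imports as future, standard-library, django, third-party, first-party, local-folder. The module below is hypothetical; the imported packages (django, rest_framework, packageurl, packagedb) all appear elsewhere in this patch.

# Illustrative module only -- shows the ordering enforced by the isort rules ("I") selected above.
from __future__ import annotations  # "future" section comes first

# standard-library section
import os
import sys

# custom "django" section, between standard-library and third-party
from django.conf import settings
from django.test import TestCase

# third-party section; force-single-line = true keeps one import per line
from packageurl import PackageURL
from rest_framework import routers

# first-party section: this repository's own apps
from packagedb.models import Package

Running `ruff check --fix` (as the reworked `valid` Makefile target does) reorders imports into these groups, which is why many hunks in this patch only move import lines; see for example packagedb/tests/test_tasks.py and purldb_public_project/urls.py, where the django imports now sit in their own block ahead of drf_spectacular and rest_framework.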