diff --git a/Makefile b/Makefile index 64abdc54..6ebc900c 100644 --- a/Makefile +++ b/Makefile @@ -130,10 +130,9 @@ process_scans: test: @echo "-> Run the test suite" - ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit --ignore matchcode_pipeline --ignore matchcode_project --ignore purldb-toolkit --ignore packagedb/tests/test_throttling.py + ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode_pipeline --ignore matchcode_project --ignore purldb-toolkit --ignore packagedb/tests/test_throttling.py ${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb_project.settings ${PYTHON_EXE} -m pytest -vvs packagedb/tests/test_throttling.py ${ACTIVATE} DJANGO_SETTINGS_MODULE=matchcode_project.settings ${PYTHON_EXE} -m pytest -vvs matchcode_pipeline - ${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs matchcode-toolkit --ignore matchcode-toolkit/src/matchcode_toolkit/pipelines ${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs purldb-toolkit/ shell: diff --git a/configure b/configure index 5c3359b2..ea0619c0 100755 --- a/configure +++ b/configure @@ -30,9 +30,9 @@ CLI_ARGS=$1 CUSTOM_PACKAGES="" # Requirement arguments passed to pip and used by default or with --dev. -REQUIREMENTS="$CUSTOM_PACKAGES --editable matchcode-toolkit/ --editable purldb-toolkit/ --editable . --constraint requirements.txt" -DEV_REQUIREMENTS="$CUSTOM_PACKAGES --editable matchcode-toolkit/ --editable purldb-toolkit/[testing] --editable .[testing] --constraint requirements.txt --constraint requirements-dev.txt" -DOCS_REQUIREMENTS="$CUSTOM_PACKAGES --editable matchcode-toolkit/ --editable purldb-toolkit/ --editable .[docs] --constraint requirements.txt" +REQUIREMENTS="$CUSTOM_PACKAGES --editable purldb-toolkit/ --editable . --constraint requirements.txt" +DEV_REQUIREMENTS="$CUSTOM_PACKAGES --editable purldb-toolkit/[testing] --editable .[testing] --constraint requirements.txt --constraint requirements-dev.txt" +DOCS_REQUIREMENTS="$CUSTOM_PACKAGES --editable purldb-toolkit/ --editable .[docs] --constraint requirements.txt" # where we create a virtualenv VIRTUALENV_DIR=venv diff --git a/matchcode-toolkit/AUTHORS.rst b/matchcode-toolkit/AUTHORS.rst deleted file mode 100644 index 0fd6530e..00000000 --- a/matchcode-toolkit/AUTHORS.rst +++ /dev/null @@ -1,3 +0,0 @@ -The following organizations or individuals have contributed to this repo: - -- Jono Yang diff --git a/matchcode-toolkit/CHANGELOG.rst b/matchcode-toolkit/CHANGELOG.rst deleted file mode 100644 index ab830eeb..00000000 --- a/matchcode-toolkit/CHANGELOG.rst +++ /dev/null @@ -1,43 +0,0 @@ -Changelog -========= - -v3.0.0 ------- - -*2024-02-23* -- Update ``ScanAndFingerprintPackage`` pipeline to reflect the renaming of the ``ScanPackage`` pipeline to ``ScanSinglePackage`` in scancode.io - -v2.0.1 ------- - -*2023-12-19* -- Update ``ScanAndFingerprintPackage`` pipeline with updates from the upstream ``ScanPackage`` pipeline from scancode.io - -v2.0.0 ------- - -*2023-12-18* -- Remove ``ScanAndFingerprintPackage`` pipeline from matchcode-toolkit's entry points. (https://github.com/nexB/purldb/issues/263) - -v1.1.3 ------- - -*2023-08-31* -- Do not fingerprint empty directories. -*2023-08-31* -- Track fingerprints to ignore in ``matchcode_toolkit.fingerprinting.IGNORED_DIRECTORY_FINGERPRINTS``. - -v1.1.2 ------- - -*2023-08-02* -- Update ``scan_and_fingerprint_package`` pipeline to use new directory fingerprinting functions from scancode.io. - -v1.1.1 ------- - -*2023-06-29* -- Do not include empty files when computing directory fingerprints. - -v1.1.0 ------- - -*2023-06-22* -- Rename ``compute_directory_fingerprints`` to ``compute_codebase_directory_fingerprints`` and create a new version of ``compute_directory_fingerprints`` that works on Resource objects instead of codebases. - -v1.0.0 ------- - -*2023-06-05* -- Initial release. diff --git a/matchcode-toolkit/MANIFEST.in b/matchcode-toolkit/MANIFEST.in deleted file mode 100644 index ef3721e8..00000000 --- a/matchcode-toolkit/MANIFEST.in +++ /dev/null @@ -1,15 +0,0 @@ -graft src - -include *.LICENSE -include NOTICE -include *.ABOUT -include *.toml -include *.yml -include *.rst -include setup.* -include configure* -include requirements* -include .git* - -global-exclude *.py[co] __pycache__ *.*~ - diff --git a/matchcode-toolkit/NOTICE b/matchcode-toolkit/NOTICE deleted file mode 100644 index 65936b2b..00000000 --- a/matchcode-toolkit/NOTICE +++ /dev/null @@ -1,19 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. -# ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# diff --git a/matchcode-toolkit/README.rst b/matchcode-toolkit/README.rst index 488831ee..36795bbd 100644 --- a/matchcode-toolkit/README.rst +++ b/matchcode-toolkit/README.rst @@ -1,73 +1,3 @@ MatchCode toolkit ================= -MatchCode toolkit is a Python library that provides the directory fingerprinting -functionality for `ScanCode toolkit `_ -and `ScanCode.io `_ by implementing the -HaloHash algorithm and using it in ScanCode toolkit and ScanCode.io plugins and -pipelines. - - -Installation ------------- - -MatchCode toolkit must be installed in the same environment as ScanCode toolkit -or ScanCode.io. - -From PyPI: -:: - - pip install matchcode-toolkit - -A checkout of this repo can also be installed into an environment using pip's -``--editable`` option, -:: - - # Activate the virtual environment you want to install MatchCode-toolkit into, - # change directories to the ``matchcode-toolkit`` directory - pip install --editable . - -or built into a wheel and then installed: -:: - - pip install flot - flot --wheel --sdist # The built wheel will be in the dist/ directory - pip install matchcode_toolkit-*-py3-none-any.whl - - -Usage ------ - -MatchCode toolkit provides the ``--fingerprint`` option for ScanCode toolkit. -This is a post-scan plugin that adds the fields -``directory_content_fingerprint`` and ``directory_structure_fingerprint`` to -Resources and computes those values for directories. -:: - - scancode --info --fingerprint --json-pp - - -MatchCode toolkit provides the ``scan_and_fingerprint_package`` pipeline for -ScanCode.io. This is the same as the ``scan_single_package`` pipeline, but has the -added step of computing fingerprints for directories. - - -License -------- - -SPDX-License-Identifier: Apache-2.0 - -The ScanCode.io software is licensed under the Apache License version 2.0. -Data generated with ScanCode.io is provided as-is without warranties. -ScanCode is a trademark of nexB Inc. - -You may not use this software except in compliance with the License. -You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software distributed -under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -CONDITIONS OF ANY KIND, either express or implied. See the License for the -specific language governing permissions and limitations under the License. - -Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -OR CONDITIONS OF ANY KIND, either express or implied. No content created from -ScanCode.io should be considered or used as legal advice. Consult an Attorney -for any legal advice. +MatchCode toolkit has moved to its own repo at https://github.com/nexB/matchcode-toolkit diff --git a/matchcode-toolkit/apache-2.0.LICENSE b/matchcode-toolkit/apache-2.0.LICENSE deleted file mode 100644 index 261eeb9e..00000000 --- a/matchcode-toolkit/apache-2.0.LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/matchcode-toolkit/pyproject.toml b/matchcode-toolkit/pyproject.toml deleted file mode 100644 index fe2e41b4..00000000 --- a/matchcode-toolkit/pyproject.toml +++ /dev/null @@ -1,165 +0,0 @@ -[build-system] -requires = [ "flot>=0.7.0" ] -build-backend = "flot.buildapi" - -[project] -name = "matchcode-toolkit" -version = "3.0.0" -description = "matchcode-toolkit" -readme = "README.rst" -license = { text = "Apache-2.0" } -requires-python = ">=3.7" - -authors = [ - { name = "nexB. Inc. and others", email = "info@aboutcode.org" }, -] - -keywords = [ - "matchcode", - "ScanCode.io", - "open source", -] - -classifiers = [ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3 :: Only", - "Topic :: Software Development", - "Topic :: Utilities", -] - -dependencies = [ - "bitarray", - "commoncode", - "plugincode", -] - -urls = { Homepage = "https://github.com/nexB/purldb/tree/main/matchcode-toolkit" } - - -[project.optional-dependencies] -docs = [ - "doc8 >= 0.8.1", - "Sphinx == 5.1.0", - "sphinx-autobuild", - "sphinx-copybutton", - "sphinx-reredirects >= 0.1.2", - "sphinx-rtd-dark-mode>=1.3.0", - "sphinx_rtd_theme >= 0.5.1", -] - -testing = [ - "aboutcode-toolkit >= 7.0.2", - "black", - "isort", - "pycodestyle >= 2.8.0", - "pytest >= 6, != 7.0.0", - "pytest-rerunfailures", - "pytest-xdist >= 2", - "twine", - "vendorize >= 0.2.0", - "bump-my-version", -] - - -[project.entry-points."scancode_post_scan"] -fingerprint = "matchcode_toolkit.plugin_fingerprint:Fingerprint" - - -[tool.bumpversion] -current_version = "3.0.0" -allow_dirty = true - -files = [ - { filename = "pyproject.toml" }, -] - - -[tool.flot] -includes = [ - "src/**/*", -] -excludes = [ - # Python compiled files - "**/*.py[cod]", - "**/*.egg-info", - # Various junk and temp files - "**/.DS_Store", - "**/*~", - "**/.*.sw[po]", - "**/.ve", - "**/*.bak", - "**/.ipynb_checkpoints", -] - -metadata_files = ["*.LICENSE", "NOTICE", ] -editable_paths = ["src", "tests"] -wheel_path_prefixes_to_strip = ["src"] -sdist_extra_includes = [ - "etc/**/*", - "docs/**/*", - ".github/**/*", - "tests/**/*", - ".gitattributes", - ".gitignore", - "thirdparty/**/*", - "*.LICENSE", - "NOTICE", - "*.ABOUT", - "*.toml", - "*.yml", - "*.rst", - "*.py", -] - -[tool.isort] -force_single_line = "True" -line_length = 88 -sections = "FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER" -skip = "doc,venv,tmp,thirdparty,build,dist" - - -[tool.pytest.ini_options] -norecursedirs = [ - ".git", - "bin", - "dist", - "build", - "_build", - "dist", - "etc", - "local", - "ci", - "docs", - "man", - "share", - "samples", - ".cache", - ".settings", - "Include", - "include", - "Lib", - "lib", - "lib64", - "Lib64", - "Scripts", - "thirdparty", - "tmp", - "venv", - "tests/data", - ".eggs", - "src/*/data", - "tests/*/data" -] - -python_files = "*.py" - -python_classes = "Test" -python_functions = "test" - -addopts = [ - "-rfExXw", - "--strict-markers", - "--doctest-modules" -] diff --git a/matchcode-toolkit/src/matchcode_toolkit/__init__.py b/matchcode-toolkit/src/matchcode_toolkit/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py deleted file mode 100644 index 69217b45..00000000 --- a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py +++ /dev/null @@ -1,160 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import binascii - -from matchcode_toolkit.halohash import BitAverageHaloHash - - -# A collection of directory fingerprints that we want to avoid -IGNORED_DIRECTORY_FINGERPRINTS = [ - # This is both the directory content and directory structure fingerprint for - # an empty directory. - '0000000000000000000000000000000000000000', -] - - -def _create_directory_fingerprint(inputs): - """ - Return a 128-bit BitAverageHaloHash fingerprint in hex from `inputs` - """ - inputs = [i.encode('utf-8') for i in inputs if i] - bah128 = BitAverageHaloHash(inputs, size_in_bits=128).hexdigest() - inputs_count = len(inputs) - inputs_count_hex_str = '%08x' % inputs_count - bah128 = bah128.decode('utf-8') - directory_fingerprint = inputs_count_hex_str + bah128 - return directory_fingerprint - - -def create_content_fingerprint(resources): - """ - Collect SHA1 strings from a list of Resources (`resources`) and create a - directory fingerprint from them - """ - features = [r.sha1 for r in resources if r.sha1] - return _create_directory_fingerprint(features) - - -def _get_resource_subpath(resource, top): - """ - Return the subpath of `resource` relative to `top` from `codebase` - - For example: - - top.path = 'foo/bar/' - resource.path = 'foo/bar/baz.c' - - The subpath returned would be 'baz.c' - """ - _, _, subpath = resource.path.partition(top.path) - subpath = subpath.lstrip('/') - return subpath - - -def create_structure_fingerprint(directory, children): - """ - Collect the subpaths of children Resources of Resource `directory` and - create a fingerprint from them - """ - features = [] - for child in children: - if not child.path: - continue - child_subpath = _get_resource_subpath(child, directory) - if not child.size: - rounded_child_size = 0 - else: - rounded_child_size = int(child.size / 10) * 10 - path_feature = str(rounded_child_size) + child_subpath - features.append(path_feature) - return _create_directory_fingerprint(features) - - -def _compute_directory_fingerprints(directory, codebase): - """ - Compute fingerprints for `directory` from `codebase` - """ - # We do not want to add empty files to our fingerprint - children = [r for r in directory.walk(codebase) if r.is_file and r.size] - if len(children) <= 1: - return - - directory_content_fingerprint = create_content_fingerprint(children) - if hasattr(directory, 'directory_content_fingerprint'): - directory.directory_content_fingerprint = directory_content_fingerprint - else: - directory.extra_data['directory_content'] = directory_content_fingerprint - - directory_structure_fingerprint = create_structure_fingerprint(directory, children) - if hasattr(directory, 'directory_structure_fingerprint'): - directory.directory_structure_fingerprint = directory_structure_fingerprint - else: - directory.extra_data['directory_structure'] = directory_structure_fingerprint - - directory.save(codebase) - return directory - - -def compute_directory_fingerprints(directory, codebase): - """ - Recursivly compute fingerprints for `directory` from `codebase` - """ - for resource in directory.walk(codebase, topdown=False): - if resource.is_file: - continue - _ = _compute_directory_fingerprints(resource, codebase) - return directory - - -def compute_codebase_directory_fingerprints(codebase): - """ - Compute fingerprints for directories from `codebase` - """ - for resource in codebase.walk(topdown=False): - if resource.is_file or not resource.path: - continue - _ = _compute_directory_fingerprints(resource, codebase) - return codebase - - -def split_fingerprint(directory_fingerprint): - """ - Given a string `directory_fingerprint`, return the indexed elements count as - an integer and the bah128 fingerprint string - """ - indexed_elements_count_hash = directory_fingerprint[0:8] - indexed_elements_count = int(indexed_elements_count_hash, 16) - bah128 = directory_fingerprint[8:] - return indexed_elements_count, bah128 - - -def hexstring_to_binarray(hex_string): - """ - Convert a hex string to binary form, then store in a bytearray - """ - return bytearray(binascii.unhexlify(hex_string)) - - -def create_halohash_chunks(bah128): - """ - Given a 128-bit bah128 hash string, split it into 4 chunks and return those - chunks as bytearrays - """ - chunk1 = bah128[0:8] - chunk2 = bah128[8:16] - chunk3 = bah128[16:24] - chunk4 = bah128[24:32] - - chunk1 = hexstring_to_binarray(chunk1) - chunk2 = hexstring_to_binarray(chunk2) - chunk3 = hexstring_to_binarray(chunk3) - chunk4 = hexstring_to_binarray(chunk4) - - return chunk1, chunk2, chunk3, chunk4 diff --git a/matchcode-toolkit/src/matchcode_toolkit/halohash.py b/matchcode-toolkit/src/matchcode_toolkit/halohash.py deleted file mode 100644 index 0eb1e1a1..00000000 --- a/matchcode-toolkit/src/matchcode_toolkit/halohash.py +++ /dev/null @@ -1,390 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import binascii - -from bitarray import bitarray -from bitarray.util import count_xor - -from commoncode import codec -from commoncode import hash as commoncode_hash - -""" -Halo is a family of hash functions that have the un-common property that mostly -similar -- but not identical -- inputs will hash to very similar outputs. This -type of hash function is sometimes called a locality-sensitive hash function, -because it is sensitive to the locality of the data being hashed. - -The purpose of these hashes is to quickly compare a large number of elements -that are likely to be similar to find candidates and then compute a more -comprehensive similarity only on the candidates. This includes goals such as -identifying near-duplicates of things or to group very similar things together -(a.k.a. clustering), as well as to detect similarities between inputs or perform -quick comparisons under a certain threshold. - -For a traditional 'good' hash function, small changes in the input will yield -very different hash outputs (through diffusion and avalanche effect). For -instance, cryptographic hashes such as SHA1 or MD5 behave this way. If you hash -two bit strings with a SHA1 function and there is only one bit of difference -between these two strings then the resulting hashes will be rather different. On -average, each time one bit is added to the input, good hash functions have half -of the output bits switched from 0 to 1. - -A Halo hash instead hashes similar inputs to the same hash or to a hash that -differs only by a few bits. The similarity between two hashes becomes an -approximation of the similarity between the two original inputs. This simalirity -is computed using the hamming distance or number of non-matching bits between -two hashes outputs bit straings. This hamming distance is roughly proportional -to the similarity between the two original inputs and can be used to estimate -the similarity of inputs without having access to these full input. - -The Halo name is a play on what one of the hashing function does: a halo is like -a fuzzy, halo'ish representation of the input. - -The bit average function ressembles Charikar's algorithm by using each bits in an -array of hashes but does not use a TF/IDF resulting in a simpler procedure. -""" - - -class BitAverageHaloHash(object): - """ - A bit matrix averaging hash. - - The high level processing sketch looks like this: - For an input of: - ['this' ,'is', 'a', 'rose', 'great']: - - * we first hash each list item to get something like - [4, 15, 2, 12, 12] (for instance with a very short hash function of 4 bits output) - - or as bits this would be something like this: - - ['0011', - '1110', - '0010', - '1100', - '1100'] - - * we sum up each bit positions/columns together: - ['0011', - '1110', - '0010', - '1100', - '1100'] - ------- - 3331 - - or stated otherwise: pos1=3, pos2=3, pos3=3, pos4=1 - - * The mean value for a column is number of hashes/2 (2 because we use bits). - Here mean = 5 hashes/2 = 2.5 - - * We compare the sum of each position with the mean and yield a bit: - if pos sum > mean yield 1 else yield 0 - position1 = 3 > mean = 2.5 , then bit=1 - position2 = 3 > mean = 2.5 , then bit=1 - position3 = 3 > mean = 2.5 , then bit=1 - position4 = 1 < mean = 2.5 , then bit=0 - - * We build a hash by concatenating the resulting bits: - pos 1 + pos2 + pos3 + pos4 = '1110' - - In general, this hash seems to show a lower accuracy and higher sensitivity - with small string and small inputs variations than the bucket average hash. - But it works better on shorter inputs. - - Some usage examples: - - >>> z = b'''The value specified for size must be at - ... least as large as for the smallest bit vector possible for intVal'''.split() - >>> a = BitAverageHaloHash(z, size_in_bits=256) - >>> len(a.digest()) - 32 - >>> z = b'''The value specified for size must be no - ... more larger than the smallest bit vector possible for intVal'''.split() - >>> b = BitAverageHaloHash(z, size_in_bits=256) - >>> a.distance(b) - 57 - >>> b.distance(a) - 57 - >>> a = BitAverageHaloHash(size_in_bits=160) - >>> z = [a.update(x) for x in b'''The value specified for size must be at - ... least as large as for the smallest bit vector possible for intVal'''.split()] - >>> assert a.hexdigest() == b'2c10223104c43470e10b1157e6415b2f730057d0' - >>> b = BitAverageHaloHash(size_in_bits=160) - >>> z = [b.update(x) for x in b'''The value specified for size must be no - ... more larger than the smallest bit vector possible for intVal'''.split()] - >>> assert b.hexdigest() == b'2c912433c4c624e0b03b34576641df8fe00017d0' - >>> a.distance(b) - 29 - >>> a = BitAverageHaloHash(size_in_bits=128) - >>> z =[a.update(x) for x in b'''The value specified for size must be at - ... least as large as for the smallest bit vector possible for intVal'''.split()] - >>> assert a.hexdigest() == b'028b1699c0c5310cd1b566a893d12f10' - >>> b = BitAverageHaloHash(size_in_bits=128) - >>> z = [b.update(x) for x in b'''The value specified for size must be no - ... more larger than the smallest bit vector possible for intVal'''.split()] - >>> assert b.hexdigest() == b'0002969060d5b344d1b7602cd9e127b0' - >>> a.distance(b) - 27 - >>> a = BitAverageHaloHash(size_in_bits=64) - >>> z = [a.update(x) for x in b'''The value specified for size must be at - ... least as large as for the smallest bit vector possible for intVal'''.split()] - >>> assert a.hexdigest() == b'028b1699c0c5310c' - >>> b = BitAverageHaloHash(size_in_bits=64) - >>> z = [b.update(x) for x in b'''The value specified for size must be no - ... more larger than the smallest bit vector possible for intVal'''.split()] - >>> assert b.hexdigest() == b'0002969060d5b344' - >>> a.distance(b) - 14 - >>> a = BitAverageHaloHash(size_in_bits=32) - >>> z = [a.update(x) for x in b'''The value specified for size must be at - ... least as large as for the smallest bit vector possible for intVal'''.split()] - >>> b = BitAverageHaloHash(size_in_bits=32) - >>> z = [b.update(x) for x in b'''The value specified for size must be at - ... least as large as for the smallest bit vector possible by intVal'''.split()] - >>> a.distance(b) - 5 - >>> a = BitAverageHaloHash(size_in_bits=512) - >>> z = [a.update(x) for x in b'''The value specified for size must be at - ... least as large as for the smallest bit vector possible for intVal'''.split()] - >>> b = BitAverageHaloHash(size_in_bits=512) - >>> z = [b.update(x) for x in b'''The value specified for size must be at - ... least as large as for the smallest bit vector possible by intVal'''.split()] - >>> a.distance(b) - 46 - """ - - # TODO: Keep state, keep 1 position per column - - # TODO: create method to aggregate multiple BitAverageHaloHashes together - # TODO: refactor this, don't keep all hashes - # TODO: keep only a list of columns - def __init__(self, msg=None, size_in_bits=128): - self.size_in_bits = size_in_bits - self.columns = [0] * size_in_bits - - # TODO: pick one hash module instead of selecting from multiple hash modules - self.hashmodule = lambda x: x - try: - # TODO: pick one hash algorithm - self.hashmodule = commoncode_hash.get_hasher(size_in_bits) - except: - raise Exception('No available hash module for the requested ' - 'hash size in bits: %(size_in_bits)d' % locals()) - self.update(msg) - - @property - def digest_size(self): - return self.size_in_bits // 8 - - def update(self, msg): - """ - Append a bytestring or sequence of bytestrings to the hash. - """ - if not msg: - return - if isinstance(msg, (list, tuple,)): - for m in msg: - self.__hashup(m) - else: - self.__hashup(msg) - - def __hashup(self, msg): - assert isinstance(msg, bytes) - hsh = self.hashmodule(msg).digest() - bits = bitarray_from_bytes(hsh) - normalized = (-1 if v else 1 for v in bits) - for i, column in enumerate(normalized): - self.columns[i] += column - - def hexdigest(self): - """ - Return the hex-encoded hash value. - """ - return binascii.hexlify(self.digest()) - - def b64digest(self): - """ - Return a base64 "url safe"-encoded string representing this hash. - """ - return codec.b64encode(self.digest()) - - def digest(self): - """ - Return a binary string representing this hash. - """ - flattened = [1 if col > 0 else 0 for col in self.columns] - bits = bitarray(flattened) - return bits.tobytes() - - def distance(self, other): - """ - Return the bit Hamming distance between this hash and another hash. - """ - return int(count_xor(self.hash(), other.hash())) - - def hash(self): - return bitarray_from_bytes(self.digest()) - - @classmethod - def combine(cls, hashes): - """ - Return a BitAverageHaloHash by summing and averaging the columns of the - BitAverageHaloHashes in `hashes` together, putting the resulting - columns into a new BitAverageHaloHash and returning it - """ - size_in_bits = hashes[0].size_in_bits - for h in hashes: - assert isinstance(hash, cls), 'all hashes should be a BitAverageHaloHash, not {}'.format(type(h)) - assert h.size_in_bits == size_in_bits - - all_columns = [h.columns for h in hashes] - b = cls() - b.columns = [sum(col) for col in zip(*all_columns)] - return b - - -def bitarray_from_bytes(b): - """ - Return a bitarray built from a byte string b. - """ - a = bitarray() - a.frombytes(b) - return a - - -def byte_hamming_distance(b1, b2): - b1 = binascii.unhexlify(b1) - b2 = binascii.unhexlify(b2) - b1 = bitarray_from_bytes(b1) - b2 = bitarray_from_bytes(b2) - return hamming_distance(b1, b2) - - -def hamming_distance(bv1, bv2): - """ - Return the Hamming distance between `bv1` and `bv2` bitvectors as the - number of equal bits for all positions. (e.g. the count of bits set to one - in an XOR between two bit strings.) - - `bv1` and `bv2` must both be either hash-like Halohash instances (with a - hash() function) or bit array instances (that can be manipulated as-is). - - See http://en.wikipedia.org/wiki/Hamming_distance - - For example: - - >>> b1 = bitarray('0001010111100001111') - >>> b2 = bitarray('0001010111100001111') - >>> hamming_distance(b1, b2) - 0 - >>> b1 = bitarray('11110000') - >>> b2 = bitarray('00001111') - >>> hamming_distance(b1, b2) - 8 - >>> b1 = bitarray('11110000') - >>> b2 = bitarray('00110011') - >>> hamming_distance(b1, b2) - 4 - """ - return int(count_xor(bv1, bv2)) - - -def slices(s, size): - """ - Given a sequence s, return a sequence of non-overlapping slices of `size`. - Raise an AssertionError if the sequence length is not a multiple of `size`. - - For example: - >>> slices([1, 2, 3, 4, 5, 6], 2) - [(1, 2), (3, 4), (5, 6)] - >>> slices([1, 2, 3, 4, 5, 6], 3) - [(1, 2, 3), (4, 5, 6)] - >>> try: - ... slices([1, 2, 3, 4, 5, 6], 4) - ... except AssertionError: - ... pass - """ - length = len(s) - assert length % size == 0, 'Invalid slice size: len(%(s)r) is not a multiple of %(size)r' % locals() - # TODO: time alternative - # return [s[index:index + size] for index in range(0, length, size)] - chunks = [iter(s)] * size - return list(zip(*chunks)) - - -def common_chunks_from_hexdigest(h1, h2, chunk_bytes_length=4): - """ - Compute the number of common chunks of byte length `chunk_bytes_length` between two - strings h1 and h2, each representing a BitAverageHaloHash hexdigest value. - - For example: - - >>> a = '1f22c2c871cd70521211b138cd76fc04' - >>> b = '1f22c2c871cd7852121bbd38c576bc84' - >>> common_chunks_from_hexdigest(a, b, 32) - 1 - - Note: `a` and `b` start with the same 8 characters, where the next groups - of 8 have a few characters off - - >>> byte_hamming_distance(a, b) - 8 - """ - h1 = bitarray_from_bytes(bytes(binascii.unhexlify(h1))) - h2 = bitarray_from_bytes(bytes(binascii.unhexlify(h2))) - h1_slices = slices(h1, chunk_bytes_length) - h2_slices = slices(h2, chunk_bytes_length) - commons = (1 for h1s, h2s in zip(h1_slices, h2_slices) if h1s == h2s) - return sum(commons) - - -def common_chunks(h1, h2, chunk_bytes_length=4): - """ - Compute the number of common chunks of byte length `chunk_bytes_length` between to - hashes h1 and h2 using the digest. - - Note that chunks that are all set to zeroes are matched too: they are be - significant such as empty buckets of bucket hashes. - - For example: - - >>> m1 = b'The value specified for size must be at least as large'.split() - >>> m2 = b'The value specific for size must be at least as large'.split() - >>> a = BitAverageHaloHash(msg=m1, size_in_bits=256) - >>> b = BitAverageHaloHash(msg=m2, size_in_bits=256) - >>> common_chunks(a, b, 2) - 1 - >>> byte_hamming_distance(a.hexdigest(), b.hexdigest()) - 32 - """ - h1_slices = slices(h1.digest(), chunk_bytes_length) - h2_slices = slices(h2.digest(), chunk_bytes_length) - commons = (1 for h1s, h2s in zip(h1_slices, h2_slices) if h1s == h2s) - return sum(commons) - - -def bit_to_num(bits): - """ - Return an int (or long) for a bit array. - - For example: - TODO: test - """ - return int(bits.to01(), 2) - - -# TODO: add test! -def decode_vector(b64_str): - """ - Return a bit array from an encoded string representation. - """ - decoded = codec.urlsafe_b64decode(b64_str) - return bitarray_from_bytes(decoded) diff --git a/matchcode-toolkit/src/matchcode_toolkit/pipelines/__init__.py b/matchcode-toolkit/src/matchcode_toolkit/pipelines/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py b/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py deleted file mode 100644 index 4b9787bc..00000000 --- a/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_package.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# -# http://nexb.com and https://github.com/nexB/scancode.io -# The ScanCode.io software is licensed under the Apache License version 2.0. -# Data generated with ScanCode.io is provided as-is without warranties. -# ScanCode is a trademark of nexB Inc. -# -# You may not use this software except in compliance with the License. -# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software distributed -# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -# CONDITIONS OF ANY KIND, either express or implied. See the License for the -# specific language governing permissions and limitations under the License. -# -# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES -# OR CONDITIONS OF ANY KIND, either express or implied. No content created from -# ScanCode.io should be considered or used as legal advice. Consult an Attorney -# for any legal advice. -# -# ScanCode.io is a free software code scanning tool from nexB Inc. and others. -# Visit https://github.com/nexB/scancode.io for support and download. - -from scanpipe.pipelines.scan_single_package import ScanSinglePackage -from scanpipe.pipes import matchcode - - -class ScanAndFingerprintPackage(ScanSinglePackage): - """ - Scan a single package file or package archive with ScanCode-toolkit, then - calculate the directory fingerprints of the codebase. - - The output is a summary of the scan results in JSON format. - """ - - @classmethod - def steps(cls): - return ( - cls.get_package_input, - cls.collect_input_information, - cls.extract_input_to_codebase_directory, - cls.run_scan, - cls.load_inventory_from_toolkit_scan, - cls.fingerprint_codebase, - cls.make_summary_from_scan_results, - ) - - scancode_options = [ - "--copyright", - "--email", - "--info", - "--license", - "--license-text", - "--package", - "--url", - "--classify", - "--summary", - ] - - def fingerprint_codebase(self): - """ - Compute directory fingerprints for matching purposes - """ - matchcode.fingerprint_codebase_directories(self.project) diff --git a/matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py b/matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py deleted file mode 100644 index d55e4bb7..00000000 --- a/matchcode-toolkit/src/matchcode_toolkit/plugin_fingerprint.py +++ /dev/null @@ -1,44 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# ScanCode is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/scancode-toolkit for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import attr - -from commoncode.cliutils import PluggableCommandLineOption -from commoncode.cliutils import POST_SCAN_GROUP -from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints -from plugincode.post_scan import post_scan_impl -from plugincode.post_scan import PostScanPlugin - - -@post_scan_impl -class Fingerprint(PostScanPlugin): - resource_attributes = dict( - directory_content_fingerprint=attr.ib(default=None, repr=False), - directory_structure_fingerprint=attr.ib(default=None, repr=False), - ) - sort_order = 6 - - options = [ - PluggableCommandLineOption( - ( - '--fingerprint', - ), - is_flag=True, - default=False, - help='Compute directory fingerprints that are used for matching', - help_group=POST_SCAN_GROUP, - sort_order=20, - ) - ] - - def is_enabled(self, fingerprint, **kwargs): - return fingerprint - - def process_codebase(self, codebase, **kwargs): - codebase = compute_codebase_directory_fingerprints(codebase) diff --git a/matchcode-toolkit/tests/test_fingerprinting.py b/matchcode-toolkit/tests/test_fingerprinting.py deleted file mode 100644 index b1f9b920..00000000 --- a/matchcode-toolkit/tests/test_fingerprinting.py +++ /dev/null @@ -1,126 +0,0 @@ -# -# Copyright (c) nexB Inc. and others. All rights reserved. -# purldb is a trademark of nexB Inc. -# SPDX-License-Identifier: Apache-2.0 -# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. -# See https://github.com/nexB/purldb for support or download. -# See https://aboutcode.org for more information about nexB OSS projects. -# - -import os - -from commoncode.resource import VirtualCodebase -from commoncode.testcase import FileBasedTesting - -from matchcode_toolkit.fingerprinting import _create_directory_fingerprint -from matchcode_toolkit.fingerprinting import _get_resource_subpath -from matchcode_toolkit.fingerprinting import compute_codebase_directory_fingerprints -from matchcode_toolkit.fingerprinting import create_content_fingerprint -from matchcode_toolkit.fingerprinting import create_halohash_chunks -from matchcode_toolkit.fingerprinting import create_structure_fingerprint -from matchcode_toolkit.fingerprinting import split_fingerprint - - -class Resource(): - def __init__(self, path='', size=0, sha1=''): - self.path = path - self.size = size - self.sha1 = sha1 - - -class TestFingerprintingFunctions(FileBasedTesting): - test_data_dir = os.path.join(os.path.dirname(__file__), 'testfiles/fingerprinting') - - def test__create_directory_fingerprint(self): - test_input = [ - 'package', - 'package/readme.txt', - 'package/index.js', - 'package/package.json', - ] - directory_fingerprint = _create_directory_fingerprint(test_input) - expected_directory_fingerprint = '0000000410d24471969646cb5402032288493126' - self.assertEqual(expected_directory_fingerprint, directory_fingerprint) - indexed_elements_count, _ = split_fingerprint(directory_fingerprint) - self.assertEqual(len(test_input), indexed_elements_count) - - def test_split_fingerprint(self): - directory_fingerprint = '0000000410d24471969646cb5402032288493126' - indexed_elements_count, bah128 = split_fingerprint(directory_fingerprint) - - expected_indexed_elements_count = 4 - self.assertEqual(expected_indexed_elements_count, indexed_elements_count) - - expected_bah128 = '10d24471969646cb5402032288493126' - self.assertEqual(expected_bah128, bah128) - - def test_create_content_fingerprint(self): - test_resources = [ - Resource(sha1='d4e4abbe8e2a8169d6a52907152c2c80ec884745'), - Resource(sha1='0c94f137f6e0536db8cb2622a9dc84253b91b90c'), - Resource(sha1='10cab45fe6f353b47b587a576c1077a96ce348f5'), - Resource(sha1='134f2b052b6e5f56b631be2eded70f89d44cf381'), - ] - fingerprint = create_content_fingerprint(test_resources) - expected_fingerprint = '00000004005b88c2800f0044044781ae05680419' - self.assertEqual(expected_fingerprint, fingerprint) - - def test__get_resource_subpath(self): - test_resource = Resource(path='foo/bar/baz/qux.c') - test_top_resource = Resource(path='foo/bar/') - subpath = _get_resource_subpath(test_resource, test_top_resource) - expected_subpath = 'baz/qux.c' - self.assertEqual(expected_subpath, subpath) - - def test_create_structure_fingerprint(self): - test_top_resource = Resource(path='package') - test_child_resources = [ - Resource(path='package/readme.txt', size=771), - Resource(path='package/index.js', size=608), - Resource(path='package/package.json', size=677), - ] - fingerprint = create_structure_fingerprint(test_top_resource, test_child_resources) - expected_fingerprint = '00000003ce72f4308a1bc1afb0fb47ed590b5c53' - self.assertEqual(expected_fingerprint, fingerprint) - - def test_create_halohash_chunks(self): - test_bah128 = 'ce72f4308a1bc1afb0fb47ed590b5c53' - chunk1, chunk2, chunk3, chunk4 = create_halohash_chunks(test_bah128) - expected_chunk1 = bytearray(b'\xcer\xf40') - expected_chunk2 = bytearray(b'\x8a\x1b\xc1\xaf') - expected_chunk3 = bytearray(b'\xb0\xfbG\xed') - expected_chunk4 = bytearray(b'Y\x0b\\S') - self.assertEqual(chunk1, expected_chunk1) - self.assertEqual(chunk2, expected_chunk2) - self.assertEqual(chunk3, expected_chunk3) - self.assertEqual(chunk4, expected_chunk4) - - def test_compute_codebase_directory_fingerprints(self): - scan_loc = self.get_test_loc('abbrev-1.0.3-i.json') - vc = VirtualCodebase(location=scan_loc) - vc = compute_codebase_directory_fingerprints(vc) - directory_content = vc.root.extra_data['directory_content'] - directory_structure = vc.root.extra_data['directory_structure'] - expected_directory_content = '0000000346ce04751a3c98f00086f16a91d9790b' - expected_directory_structure = '000000034f9bf110673bdf06197cd514a799a66c' - self.assertEqual(expected_directory_content, directory_content) - self.assertEqual(expected_directory_structure, directory_structure) - - def test_do_not_compute_fingerprint_for_empty_dirs(self): - scan_loc = self.get_test_loc('test.json') - vc = VirtualCodebase(location=scan_loc) - vc = compute_codebase_directory_fingerprints(vc) - directory_content = vc.root.extra_data['directory_content'] - directory_structure = vc.root.extra_data['directory_structure'] - expected_directory_content = '000000032a5fa8d01922536b53e8fc6e3d43766f' - expected_directory_structure = '000000030a399ce2b947a6f611821965a4fcc577' - self.assertEqual(expected_directory_content, directory_content) - self.assertEqual(expected_directory_structure, directory_structure) - # These directories should not have fingerprints generated or stored in - # extra_data - empty_dir_1 = vc.get_resource('test/test') - empty_dir_2 = vc.get_resource('test/test/test2') - self.assertEqual({}, empty_dir_1.extra_data) - self.assertEqual({}, empty_dir_1.extra_data) - self.assertEqual({}, empty_dir_2.extra_data) - self.assertEqual({}, empty_dir_2.extra_data) diff --git a/matchcode-toolkit/tests/testfiles/fingerprinting/abbrev-1.0.3-i.json b/matchcode-toolkit/tests/testfiles/fingerprinting/abbrev-1.0.3-i.json deleted file mode 100644 index 2d418ac9..00000000 --- a/matchcode-toolkit/tests/testfiles/fingerprinting/abbrev-1.0.3-i.json +++ /dev/null @@ -1,161 +0,0 @@ -{ - "headers": [ - { - "tool_name": "scancode-toolkit", - "tool_version": "31.2.2", - "options": { - "input": [ - "package" - ], - "--info": true, - "--json-pp": "./abbrev-1.0.3.tgz-i.json" - }, - "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", - "start_timestamp": "2022-12-02T011500.811761", - "end_timestamp": "2022-12-02T011501.090542", - "output_format_version": "2.0.0", - "duration": 0.2787973880767822, - "message": null, - "errors": [], - "warnings": [], - "extra_data": { - "system_environment": { - "operating_system": "linux", - "cpu_architecture": "64", - "platform": "Linux-5.15.39-3-pve-x86_64-with-glibc2.35", - "platform_version": "#2 SMP PVE 5.15.39-3 (Wed, 27 Jul 2022 13:45:39 +0200)", - "python_version": "3.10.5 (main, Jul 30 2022, 06:09:26) [GCC 9.4.0]" - }, - "spdx_license_list_version": "3.17", - "files_count": 3 - } - } - ], - "files": [ - { - "path": "package", - "type": "directory", - "name": "package", - "base_name": "package", - "extension": "", - "size": 0, - "date": null, - "sha1": null, - "md5": null, - "sha256": null, - "mime_type": null, - "file_type": null, - "programming_language": null, - "is_binary": false, - "is_text": false, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 3, - "dirs_count": 1, - "size_count": 3358, - "scan_errors": [] - }, - { - "path": "package/package.json", - "type": "file", - "name": "package.json", - "base_name": "package", - "extension": ".json", - "size": 277, - "date": "2011-03-24", - "sha1": "d61dc2c98ab10bf909b99f60e7bf584a7f7ead8c", - "md5": "8468753cba56d0075f6532a657ee5821", - "sha256": "5ab100bf0eb08adb175db170a1254d14e0be705ff1b563e5acddd3c8d03faee1", - "mime_type": "application/json", - "file_type": "JSON data", - "programming_language": null, - "is_binary": false, - "is_text": true, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 0, - "dirs_count": 0, - "size_count": 0, - "scan_errors": [] - }, - { - "path": "package/README.md", - "type": "file", - "name": "README.md", - "base_name": "README", - "extension": ".md", - "size": 499, - "date": "2011-03-24", - "sha1": "c520bc857ec612ed88e13d794c47882d5aed3286", - "md5": "96b93093abdfdfef1ef8a3e2d5ca7f71", - "sha256": "2581765d44e15c58a2b88ad7bc9cc5c9ee029b4b5013c06dc45d9e94e8cb2ba4", - "mime_type": "text/plain", - "file_type": "ASCII text", - "programming_language": null, - "is_binary": false, - "is_text": true, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 0, - "dirs_count": 0, - "size_count": 0, - "scan_errors": [] - }, - { - "path": "package/lib", - "type": "directory", - "name": "lib", - "base_name": "lib", - "extension": "", - "size": 0, - "date": null, - "sha1": null, - "md5": null, - "sha256": null, - "mime_type": null, - "file_type": null, - "programming_language": null, - "is_binary": false, - "is_text": false, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 1, - "dirs_count": 0, - "size_count": 2582, - "scan_errors": [] - }, - { - "path": "package/lib/abbrev.js", - "type": "file", - "name": "abbrev.js", - "base_name": "abbrev", - "extension": ".js", - "size": 2582, - "date": "2011-03-24", - "sha1": "055ec01ac8b111bc948e498d87d9dc47f5e5acaa", - "md5": "06aebeadc85e52f4b8bf88eab6cd8b6c", - "sha256": "efd2c9b755dc4b2df3231222b5b6a63b7a1343472dfbc8807c5f15e1d28a0c75", - "mime_type": "text/plain", - "file_type": "ASCII text", - "programming_language": "JavaScript", - "is_binary": false, - "is_text": true, - "is_archive": false, - "is_media": false, - "is_source": true, - "is_script": false, - "files_count": 0, - "dirs_count": 0, - "size_count": 0, - "scan_errors": [] - } - ] -} \ No newline at end of file diff --git a/matchcode-toolkit/tests/testfiles/fingerprinting/test.json b/matchcode-toolkit/tests/testfiles/fingerprinting/test.json deleted file mode 100644 index 5f6a9930..00000000 --- a/matchcode-toolkit/tests/testfiles/fingerprinting/test.json +++ /dev/null @@ -1,211 +0,0 @@ -{ - "headers": [ - { - "tool_name": "scancode-toolkit", - "tool_version": "32.0.6", - "options": { - "input": [ - "/home/jono/Desktop/test" - ], - "--info": true, - "--json-pp": "/home/jono/test.json" - }, - "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", - "start_timestamp": "2023-08-31T215533.874398", - "end_timestamp": "2023-08-31T215533.977407", - "output_format_version": "3.0.0", - "duration": 0.10302162170410156, - "message": null, - "errors": [], - "warnings": [], - "extra_data": { - "system_environment": { - "operating_system": "linux", - "cpu_architecture": "64", - "platform": "Linux-5.4.0-150-generic-x86_64-with-glibc2.27", - "platform_version": "#167~18.04.1-Ubuntu SMP Wed May 24 00:51:42 UTC 2023", - "python_version": "3.10.8 (main, Nov 20 2022, 18:43:48) [GCC 7.5.0]" - }, - "spdx_license_list_version": "3.21", - "files_count": 3 - } - } - ], - "files": [ - { - "path": "test", - "type": "directory", - "name": "test", - "base_name": "test", - "extension": "", - "size": 0, - "date": null, - "sha1": null, - "md5": null, - "sha256": null, - "mime_type": null, - "file_type": null, - "programming_language": null, - "is_binary": false, - "is_text": false, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 3, - "dirs_count": 3, - "size_count": 55, - "scan_errors": [] - }, - { - "path": "test/package.json", - "type": "file", - "name": "package.json", - "base_name": "package", - "extension": ".json", - "size": 3, - "date": "2023-08-31", - "sha1": "f10e2821bbbea527ea02200352313bc059445190", - "md5": "7815696ecbf1c96e6894b779456d330e", - "sha256": "688787d8ff144c502c7f5cffaafe2cc588d86079f9de88304c26b0cb99ce91c6", - "mime_type": "text/plain", - "file_type": "ASCII text, with no line terminators", - "programming_language": null, - "is_binary": false, - "is_text": true, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 0, - "dirs_count": 0, - "size_count": 0, - "scan_errors": [] - }, - { - "path": "test/src", - "type": "directory", - "name": "src", - "base_name": "src", - "extension": "", - "size": 0, - "date": null, - "sha1": null, - "md5": null, - "sha256": null, - "mime_type": null, - "file_type": null, - "programming_language": null, - "is_binary": false, - "is_text": false, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 2, - "dirs_count": 0, - "size_count": 52, - "scan_errors": [] - }, - { - "path": "test/src/bar.txt", - "type": "file", - "name": "bar.txt", - "base_name": "bar", - "extension": ".txt", - "size": 3, - "date": "2023-08-31", - "sha1": "62cdb7020ff920e5aa642c3d4066950dd1f01f4d", - "md5": "37b51d194a7513e45b56f6524f2d51f2", - "sha256": "fcde2b2edba56bf408601fb721fe9b5c338d10ee429ea04fae5511b68fbf8fb9", - "mime_type": "text/plain", - "file_type": "ASCII text, with no line terminators", - "programming_language": null, - "is_binary": false, - "is_text": true, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 0, - "dirs_count": 0, - "size_count": 0, - "scan_errors": [] - }, - { - "path": "test/src/foo.js", - "type": "file", - "name": "foo.js", - "base_name": "foo", - "extension": ".js", - "size": 49, - "date": "2023-07-26", - "sha1": "fef9e8e1746b8f2175b500c57a9c6d250623885b", - "md5": "54149367c4c4523241c945701eee1a02", - "sha256": "8ab7888ffceb5004ff3d14417c71d7b56812e04d0ac86545e1592208c6d56d04", - "mime_type": "text/plain", - "file_type": "ASCII text", - "programming_language": "JavaScript", - "is_binary": false, - "is_text": true, - "is_archive": false, - "is_media": false, - "is_source": true, - "is_script": false, - "files_count": 0, - "dirs_count": 0, - "size_count": 0, - "scan_errors": [] - }, - { - "path": "test/test", - "type": "directory", - "name": "test", - "base_name": "test", - "extension": "", - "size": 0, - "date": null, - "sha1": null, - "md5": null, - "sha256": null, - "mime_type": null, - "file_type": null, - "programming_language": null, - "is_binary": false, - "is_text": false, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 0, - "dirs_count": 1, - "size_count": 0, - "scan_errors": [] - }, - { - "path": "test/test/test2", - "type": "directory", - "name": "test2", - "base_name": "test2", - "extension": "", - "size": 0, - "date": null, - "sha1": null, - "md5": null, - "sha256": null, - "mime_type": null, - "file_type": null, - "programming_language": null, - "is_binary": false, - "is_text": false, - "is_archive": false, - "is_media": false, - "is_source": false, - "is_script": false, - "files_count": 0, - "dirs_count": 0, - "size_count": 0, - "scan_errors": [] - } - ] -} \ No newline at end of file