diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2e1e556c..13dbb88e 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,11 @@ Next Release - Add `/api/from_purl/purl2git` endpoint to get a git repo for a purl. - Add `/api/to_purl/go` endpoint to get a purl from a golang import string or a package string in go.mod. +- Support indexing of PURLs listed in https://github.com/nexB/purldb/issues/326, + https://github.com/nexB/purldb/issues/327, https://github.com/nexB/purldb/issues/328, + https://github.com/nexB/purldb/issues/329 and https://github.com/nexB/purldb/issues/356. +- Support ``addon_pipelines`` for symbol and string collection in ``/api/collect`` endpoint. https://github.com/nexB/purldb/pull/393 +- Store ``source_symbols`` and ``source_strings`` in ``extra_data`` field. https://github.com/nexB/purldb/pull/351 v4.0.0 diff --git a/docs/source/how-to-guides/index.rst b/docs/source/how-to-guides/index.rst index 064a46fd..47d33636 100644 --- a/docs/source/how-to-guides/index.rst +++ b/docs/source/how-to-guides/index.rst @@ -11,4 +11,4 @@ thourgh specifica use cases: :maxdepth: 2 matchcode - purl2sym + tutorial_symbol_and_string_collection diff --git a/docs/source/how-to-guides/purl2sym.rst b/docs/source/how-to-guides/purl2sym.rst deleted file mode 100644 index f7bbf2f6..00000000 --- a/docs/source/how-to-guides/purl2sym.rst +++ /dev/null @@ -1,3 +0,0 @@ -How To get symbols from a PURL/package -====================================== - diff --git a/docs/source/how-to-guides/tutorial_symbol_and_string_collection.rst b/docs/source/how-to-guides/tutorial_symbol_and_string_collection.rst new file mode 100644 index 00000000..661798bf --- /dev/null +++ b/docs/source/how-to-guides/tutorial_symbol_and_string_collection.rst @@ -0,0 +1,388 @@ +.. 
_tutorial_symbol_and_string_collection: + +How To get symbols and strings from a PURL/package +================================================== + +In this tutorial we'll introduce the different addon pipelines that can be used for +collecting symbols and strings from codebase resources. + +.. note:: + This tutorial assumes that you have a working installation of PurlDB. + If you don't, please refer to the `installation <../purldb/overview.html#installation>`_ page. + + +Throughout this tutorial we will use ``pkg:github/llvm/llvm-project@10.0.0`` and will show +the symbols and strings for the `llvm-project/clang/lib/Basic/Targets/BPF.cpp <https://github.com/llvm/llvm-project/blob/llvmorg-10.0.0/clang/lib/Basic/Targets/BPF.cpp>`_ +resource. + +.. raw:: html + + <details>
+ <summary>BPF.cpp</summary> +
+ +.. code-block:: cpp + + //===--- BPF.cpp - Implement BPF target feature support -------------------===// + // + // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + // See https://llvm.org/LICENSE.txt for license information. + // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + // + //===----------------------------------------------------------------------===// + // + // This file implements BPF TargetInfo objects. + // + //===----------------------------------------------------------------------===// + + #include "BPF.h" + #include "Targets.h" + #include "clang/Basic/MacroBuilder.h" + #include "clang/Basic/TargetBuiltins.h" + #include "llvm/ADT/StringRef.h" + + using namespace clang; + using namespace clang::targets; + + const Builtin::Info BPFTargetInfo::BuiltinInfo[] = { + #define BUILTIN(ID, TYPE, ATTRS) \ + {#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr}, + #include "clang/Basic/BuiltinsBPF.def" + }; + + void BPFTargetInfo::getTargetDefines(const LangOptions &Opts, + MacroBuilder &Builder) const { + Builder.defineMacro("__bpf__"); + Builder.defineMacro("__BPF__"); + } + + static constexpr llvm::StringLiteral ValidCPUNames[] = {"generic", "v1", "v2", + "v3", "probe"}; + + bool BPFTargetInfo::isValidCPUName(StringRef Name) const { + return llvm::find(ValidCPUNames, Name) != std::end(ValidCPUNames); + } + + void BPFTargetInfo::fillValidCPUList(SmallVectorImpl<StringRef> &Values) const { + Values.append(std::begin(ValidCPUNames), std::end(ValidCPUNames)); + } + + ArrayRef<Builtin::Info> BPFTargetInfo::getTargetBuiltins() const { + return llvm::makeArrayRef(BuiltinInfo, clang::BPF::LastTSBuiltin - + Builtin::FirstTSBuiltin); + } + +.. raw:: html + + </details>
+
+ + +Ctags Symbols +------------- + +- Send GET request to PurlDB with:: + + /api/collect/?purl=pkg:github/llvm/llvm-project@10.0.0&addon_pipelines=collect_symbols_ctags + +.. warning:: + The ``collect_symbols_ctags`` pipeline requires ``universal-ctags``. + +- Once the indexing has completed visit ``/api/resources/?purl=pkg:github/llvm/llvm-project@10.0.0`` + to get the ``source_symbols`` for resources. + +.. code-block:: json + :caption: Ctags symbol for ``clang/lib/Basic/Targets/BPF.cpp`` in ``extra_data`` field + :emphasize-lines: 35-41 + + { + "package": "http://127.0.0.1:8001/api/packages/", + "purl": "pkg:github/llvm/llvm-project@10.0.0", + "path": "llvm-project-llvmorg-10.0.0.tar.gz-extract/llvm-project-llvmorg-10.0.0/clang/lib/Basic/Targets/BPF.cpp", + "type": "file", + "name": "BPF.cpp", + "extension": ".cpp", + "size": 1788, + "md5": "382b406d1023d12cd8f28106043774ee", + "sha1": "366146c8228c4e2cd46c47618fa3211ce48d96e2", + "sha256": "d7609c502c7d462dcee1b631a80eb765ad7d10597991d88c3d4cd2ae0370eeba", + "sha512": null, + "git_sha1": null, + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C++", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_key_file": false, + "detected_license_expression": "", + "detected_license_expression_spdx": "", + "license_detections": [], + "license_clues": [], + "percentage_of_license_text": null, + "copyrights": [], + "holders": [], + "authors": [], + "package_data": [], + "emails": [], + "urls": [], + "extra_data": { + "source_symbols": [ + "BUILTIN", + "BuiltinInfo", + "ValidCPUNames", + "fillValidCPUList", + "getTargetBuiltins", + "getTargetDefines", + "isValidCPUName" + ] + } + } + + +Xgettext Strings +---------------- + +- Send GET request to PurlDB with:: + + /api/collect/?purl=pkg:github/llvm/llvm-project@10.0.0&addon_pipelines=collect_strings_gettext + +.. warning:: + The ``collect_strings_gettext`` pipeline requires ``gettext``. 
+ +- Once the indexing has completed visit ``/api/resources/?purl=pkg:github/llvm/llvm-project@10.0.0`` + to get the ``source_strings`` for resources. + +.. code-block:: json + :caption: Xgettext strings for ``clang/lib/Basic/Targets/BPF.cpp`` in ``extra_data`` field + :emphasize-lines: 35-41 + + { + "package": "http://127.0.0.1:8001/api/packages/", + "purl": "pkg:github/llvm/llvm-project@10.0.0", + "path": "llvm-project-llvmorg-10.0.0.tar.gz-extract/llvm-project-llvmorg-10.0.0/clang/lib/Basic/Targets/BPF.cpp", + "type": "file", + "name": "BPF.cpp", + "extension": ".cpp", + "size": 1788, + "md5": "382b406d1023d12cd8f28106043774ee", + "sha1": "366146c8228c4e2cd46c47618fa3211ce48d96e2", + "sha256": "d7609c502c7d462dcee1b631a80eb765ad7d10597991d88c3d4cd2ae0370eeba", + "sha512": null, + "git_sha1": null, + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C++", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_key_file": false, + "detected_license_expression": "", + "detected_license_expression_spdx": "", + "license_detections": [], + "license_clues": [], + "percentage_of_license_text": null, + "copyrights": [], + "holders": [], + "authors": [], + "package_data": [], + "emails": [], + "urls": [], + "extra_data": { + "source_strings": [ + "__bpf__", + "__BPF__", + "generic", + "v", + "v", + "v", + "probe" + ] + } + } + +Tree-Sitter Symbols and Strings +------------------------------- + +- Send GET request to PurlDB with:: + + /api/collect/?purl=pkg:github/llvm/llvm-project@10.0.0&addon_pipelines=collect_symbols_tree_sitter + +- Once the indexing has completed visit ``/api/resources/?purl=pkg:github/llvm/llvm-project@10.0.0`` + to get the ``source_symbols`` and ``source_strings`` for resources. + +.. 
code-block:: json + :caption: Tree-Sitter symbols and strings for ``clang/lib/Basic/Targets/BPF.cpp`` in ``extra_data`` field + :emphasize-lines: 35-69, 72-84 + + { + "package": "http://127.0.0.1:8001/api/packages/", + "purl": "pkg:github/llvm/llvm-project@10.0.0", + "path": "llvm-project-llvmorg-10.0.0.tar.gz-extract/llvm-project-llvmorg-10.0.0/clang/lib/Basic/Targets/BPF.cpp", + "type": "file", + "name": "BPF.cpp", + "extension": ".cpp", + "size": 1788, + "md5": "382b406d1023d12cd8f28106043774ee", + "sha1": "366146c8228c4e2cd46c47618fa3211ce48d96e2", + "sha256": "d7609c502c7d462dcee1b631a80eb765ad7d10597991d88c3d4cd2ae0370eeba", + "sha512": null, + "git_sha1": null, + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C++", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_key_file": false, + "detected_license_expression": "", + "detected_license_expression_spdx": "", + "license_detections": [], + "license_clues": [], + "percentage_of_license_text": null, + "copyrights": [], + "holders": [], + "authors": [], + "package_data": [], + "emails": [], + "urls": [], + "extra_data": { + "source_symbols": [ + "clang", + "targets", + "BuiltinInfo", + "BUILTIN", + "ID", + "TYPE", + "ATTRS", + "TYPE", + "ATTRS", + "ALL_LANGUAGES", + "getTargetDefines", + "Opts", + "Builder", + "Builder", + "Builder", + "ValidCPUNames", + "isValidCPUName", + "Name", + "find", + "ValidCPUNames", + "Name", + "end", + "ValidCPUNames", + "fillValidCPUList", + "Values", + "Values", + "begin", + "ValidCPUNames", + "end", + "ValidCPUNames", + "getTargetBuiltins", + "makeArrayRef", + "BuiltinInfo", + "LastTSBuiltin", + "FirstTSBuiltin" + ], + "source_strings": [ + "BPF.h", + "Targets.h", + "clang/Basic/MacroBuilder.h", + "clang/Basic/TargetBuiltins.h", + "llvm/ADT/StringRef.h", + "clang/Basic/BuiltinsBPF.def", + "__bpf__", + "__BPF__", + "generic", + "v1", + "v2", + "v3", + "probe" + ] + } + } + +Pygments Symbols and 
Strings +------------------------------- + +- Send GET request to PurlDB with:: + + /api/collect/?purl=pkg:github/llvm/llvm-project@10.0.0&addon_pipelines=collect_symbols_pygments + +- Once the indexing has completed visit ``/api/resources/?purl=pkg:github/llvm/llvm-project@10.0.0`` + to get the ``source_symbols`` and ``source_strings`` for resources. + + +.. code-block:: json + :caption: Pygments symbols and strings for ``clang/lib/Basic/Targets/BPF.cpp`` in ``extra_data`` field + :emphasize-lines: 35-40, 43-63 + + { + "package": "http://127.0.0.1:8001/api/packages/", + "purl": "pkg:github/llvm/llvm-project@10.0.0", + "path": "llvm-project-llvmorg-10.0.0.tar.gz-extract/llvm-project-llvmorg-10.0.0/clang/lib/Basic/Targets/BPF.cpp", + "type": "file", + "name": "BPF.cpp", + "extension": ".cpp", + "size": 1788, + "md5": "382b406d1023d12cd8f28106043774ee", + "sha1": "366146c8228c4e2cd46c47618fa3211ce48d96e2", + "sha256": "d7609c502c7d462dcee1b631a80eb765ad7d10597991d88c3d4cd2ae0370eeba", + "sha512": null, + "git_sha1": null, + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C++", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_key_file": false, + "detected_license_expression": "", + "detected_license_expression_spdx": "", + "license_detections": [], + "license_clues": [], + "percentage_of_license_text": null, + "copyrights": [], + "holders": [], + "authors": [], + "package_data": [], + "emails": [], + "urls": [], + "extra_data": { + "source_symbols": [ + "clang", + "clang", + "targets", + "BPFTargetInfo::getTargetDefines", + "BPFTargetInfo::isValidCPUName", + "BPFTargetInfo::fillValidCPUList" + ], + "source_strings": [ + "\"", + "__bpf__", + "\"", + "\"", + "__BPF__", + "\"", + "\"", + "generic", + "\"", + "\"", + "v1", + "\"", + "\"", + "v2", + "\"", + "\"", + "v3", + "\"", + "\"", + "probe", + "\"" + ] + } + } diff --git a/docs/source/purldb/index.rst b/docs/source/purldb/index.rst 
index b07e0373..21a31890 100644 --- a/docs/source/purldb/index.rst +++ b/docs/source/purldb/index.rst @@ -5,4 +5,4 @@ PURLdb :maxdepth: 2 overview - purl2sym + symbol_and_string_collection diff --git a/docs/source/purldb/purl2sym.rst b/docs/source/purldb/purl2sym.rst deleted file mode 100644 index c4cac435..00000000 --- a/docs/source/purldb/purl2sym.rst +++ /dev/null @@ -1,78 +0,0 @@ -.. _purl2sym: - -Purl2Sym -============ - -Purl2Sym collects the core package metadata along with symbols and strings -from source code and stores them in the ``extra_data`` field of the resource. - -How it works ------------- - -When PurlDB receives an index request for a PURL via the ``/api/collect`` -endpoint, it fetches the archive download_url and creates a package for -the PURL with relevant metadata. Thereafter, a scan job is scheduled which -downloads the archive of the PURL and runs the `scan_single_package `_ -package pipeline. Thereafter, the scan job also runs the two addon pipelines: -`collect_symbols `_ -and `collect_source_strings `_ -for symbol and string collection respectively. Upon completion of the scan -job, the package is updated with resource data along with the ``source_symbols`` -and ``source_strings`` in the ``extra_data`` field of resources. - -source-inspector ------------------- - -source-inspector is a set of utilities to inspect and analyze source -code and collect interesting data using various tools such as code symbols and strings. -This is also a ScanCode-toolkit plugin. - -Requirements -~~~~~~~~~~~~~ - -This utility is designed to work on Linux and POSIX OS with these utilities: - -- xgettext that comes with GNU gettext. -- universal ctags, version 5.9 or higher, built with JSON support. - -On Debian systems run this:: - - sudo apt-get install universal-ctags gettext - -On MacOS systems run this:: - - brew install universal-ctags gettext - -To get started: -~~~~~~~~~~~~~~~~ - -1. Clone this repo - -2. 
Run:: - - ./configure --dev - source venv/bin/activate - -3. Run tests with:: - - pytest -vvs - -4. Run a basic scan to collect symbols and display as YAML on screen:: - - scancode --source-symbol tests/data/symbols_ctags/test3.cpp --yaml - - -5. Run a basic scan to collect strings and display as YAML on screen:: - - scancode --source-string tests/data/symbols_ctags/test3.cpp --yaml - - -Pipeline in scancode.io -------------------------- - -There is a ``collect_symbols`` pipeline in scancode.io to get symbols -using the ``source-inspector`` library for codebases. - -See the `pipeline `_ for more details. - -This is also available in the standard scancode.io pipelines used to scan packages -in purldb and symbols are stored in the ``extra_data`` field for all scanned resources -available in purldb. diff --git a/docs/source/purldb/symbol_and_string_collection.rst b/docs/source/purldb/symbol_and_string_collection.rst new file mode 100644 index 00000000..dc674fa4 --- /dev/null +++ b/docs/source/purldb/symbol_and_string_collection.rst @@ -0,0 +1,34 @@ +.. _symbol_and_string_collection: + +Symbol and String Collection +============================ + +The package indexing endpoint now also supports the symbol and string collection +pipelines and stores their results in the ``extra_data`` field of the resource. + +How it works +------------ + +When PurlDB receives an index request for a PURL via the ``/api/collect`` +endpoint along with the symbol/string addon_pipeline, it fetches the archive +download_url and creates a package for the PURL with relevant metadata. +Thereafter, a scan job is scheduled which downloads the archive of the PURL +and runs the `scan_single_package <https://scancodeio.readthedocs.io/en/latest/built-in-pipelines.html>`_ +package pipeline. The scan job also runs the requested addon_pipelines. 
+Upon completion of the scan job, the package is updated with resource data along +with the ``source_symbols`` and ``source_strings`` in the ``extra_data`` field of +resources.
+ +Currently PurlDB supports these addon_pipelines for symbol/string collection. + +- ``collect_symbols_ctags`` +- ``collect_strings_gettext`` +- ``collect_symbols_tree_sitter`` +- ``collect_symbols_pygments`` + +See the detailed tutorial on :ref:`tutorial_symbol_and_string_collection` in PurlDB. + +.. line-block:: + + To use these pipelines on ScanCode.io, refer to `Symbol and String Collection <https://scancodeio.readthedocs.io/en/latest/tutorial_web_ui_symbol_and_string_collection.html>`_. + For more details on these plugins refer to `source-inspector <https://github.com/nexB/source-inspector>`_. diff --git a/minecode/model_utils.py b/minecode/model_utils.py index d43aa816..882d4cb8 100644 --- a/minecode/model_utils.py +++ b/minecode/model_utils.py @@ -35,10 +35,10 @@ # These are the list of supported addon pipelines to run when we scan a Package for # indexing. SUPPORTED_ADDON_PIPELINES = ( - 'collect_pygments_symbols', - 'collect_source_strings', - 'collect_symbols', - 'collect_tree_sitter_symbols', + 'collect_strings_gettext', + 'collect_symbols_ctags', + 'collect_symbols_pygments', + 'collect_symbols_tree_sitter', 'inspect_elf_binaries', ) diff --git a/packagedb/api.py b/packagedb/api.py index 1fcb9046..12c5f4f6 100644 --- a/packagedb/api.py +++ b/packagedb/api.py @@ -679,7 +679,7 @@ class CollectViewSet(viewsets.ViewSet): **Example:** - /api/collect/?purl=pkg:npm/foo@1.2.3&addon_pipelines=collect_symbols&addon_pipelines=inspect_elf_binaries + /api/collect/?purl=pkg:npm/foo@1.2.3&addon_pipelines=collect_symbols_ctags&addon_pipelines=inspect_elf_binaries **Note:** Use `Index packages` for bulk indexing/reindexing of packages.
@@ -773,7 +773,7 @@ def index_packages(self, request, *args, **kwargs): "purl": "pkg:npm/less@1.0.32", "vers": null, "source_purl": None, - "addon_pipelines": ['collect_symbols'] + "addon_pipelines": ['collect_symbols_ctags'] }, { "purl": "pkg:npm/less", @@ -785,7 +785,7 @@ def index_packages(self, request, *args, **kwargs): "purl": "pkg:npm/foobar", "vers": null, "source_purl": None, - "addon_pipelines": ['inspect_elf_binaries', 'collect_symbols'] + "addon_pipelines": ['inspect_elf_binaries', 'collect_symbols_ctags'] } ] "reindex": true,