Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): parse licenses from dist-info folder #4724

Merged
merged 20 commits into from
Jan 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 139 additions & 33 deletions pkg/fanal/analyzer/language/python/packaging/packaging.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,43 @@ import (
"archive/zip"
"bytes"
"context"
"errors"
"io"
"io/fs"
"os"
"path"
"path/filepath"
"strings"

"github.com/samber/lo"
"golang.org/x/xerrors"

dio "github.com/aquasecurity/go-dep-parser/pkg/io"
"github.com/aquasecurity/go-dep-parser/pkg/python/packaging"
godeptypes "github.com/aquasecurity/go-dep-parser/pkg/types"
"github.com/aquasecurity/trivy/pkg/fanal/analyzer"
"github.com/aquasecurity/trivy/pkg/fanal/analyzer/language"
"github.com/aquasecurity/trivy/pkg/fanal/types"
"github.com/aquasecurity/trivy/pkg/licensing"
"github.com/aquasecurity/trivy/pkg/log"
"github.com/aquasecurity/trivy/pkg/utils/fsutils"
)

// init registers the Python packaging analyzer as a post-analyzer for
// analyzer.TypePythonPkg, using newPackagingAnalyzer as its constructor.
//
// The earlier RegisterAnalyzer(&packagingAnalyzer{}) call is dropped: it is the
// removed side of this change and would register a zero-value analyzer (no
// parser, no confidence level) alongside the properly constructed one.
func init() {
	analyzer.RegisterPostAnalyzer(analyzer.TypePythonPkg, newPackagingAnalyzer)
}

const version = 1

// newPackagingAnalyzer builds the packaging post-analyzer. It wires in a fresh
// packaging parser and copies the license classifier confidence level from the
// analyzer options. The returned error is always nil.
func newPackagingAnalyzer(opt analyzer.AnalyzerOptions) (analyzer.PostAnalyzer, error) {
	a := &packagingAnalyzer{}
	a.pkgParser = packaging.NewParser()
	a.licenseClassifierConfidenceLevel = opt.LicenseScannerOption.ClassifierConfidenceLevel
	return a, nil
}

var (
requiredFiles = []string{
eggFiles = []string{
// .egg format
// https://setuptools.readthedocs.io/en/latest/deprecated/python_eggs.html#eggs-and-their-formats
".egg", // zip format
Expand All @@ -34,35 +50,125 @@ var (
// https://setuptools.readthedocs.io/en/latest/deprecated/python_eggs.html#eggs-and-their-formats
".egg-info",
".egg-info/PKG-INFO",

// wheel
".dist-info/METADATA",
}
)

type packagingAnalyzer struct{}
type packagingAnalyzer struct {
pkgParser godeptypes.Parser
licenseClassifierConfidenceLevel float64
}

// PostAnalyze analyzes egg and wheel files.
func (a packagingAnalyzer) PostAnalyze(_ context.Context, input analyzer.PostAnalysisInput) (*analyzer.AnalysisResult, error) {

// Analyze analyzes egg and wheel files.
func (a packagingAnalyzer) Analyze(_ context.Context, input analyzer.AnalysisInput) (*analyzer.AnalysisResult, error) {
r := input.Content
var apps []types.Application

required := func(path string, _ fs.DirEntry) bool {
return filepath.Base(path) == "METADATA" || isEggFile(path)
}

err := fsutils.WalkDir(input.FS, ".", required, func(path string, d fs.DirEntry, r io.Reader) error {
rsa, ok := r.(dio.ReadSeekerAt)
if !ok {
return xerrors.New("invalid reader")
}

// .egg file is zip format and PKG-INFO needs to be extracted from the zip file.
if strings.HasSuffix(input.FilePath, ".egg") {
pkginfoInZip, err := a.analyzeEggZip(input.Content, input.Info.Size())
// .egg file is zip format and PKG-INFO needs to be extracted from the zip file.
if strings.HasSuffix(path, ".egg") {
info, err := d.Info()
if err != nil {
return xerrors.Errorf("egg file error: %w", err)
}
pkginfoInZip, err := a.analyzeEggZip(rsa, info.Size())
if err != nil {
return xerrors.Errorf("egg analysis error: %w", err)
}

// Egg archive may not contain required files, then we will get nil. Skip this archives
if pkginfoInZip == nil {
return nil
}
rsa = pkginfoInZip
}

app, err := a.parse(path, rsa, input.Options.FileChecksum)
if err != nil {
return nil, xerrors.Errorf("egg analysis error: %w", err)
return xerrors.Errorf("parse error: %w", err)
} else if app == nil {
return nil
}

if err := a.fillAdditionalData(input.FS, app); err != nil {
log.Logger.Warnf("Unable to collect additional info: %s", err)
}

// Egg archive may not contain required files, then we will get nil. Skip this archives
if pkginfoInZip == nil {
return nil, nil
apps = append(apps, *app)
return nil
})

if err != nil {
return nil, xerrors.Errorf("python package walk error: %w", err)
}
return &analyzer.AnalysisResult{
Applications: apps,
}, nil
}

func (a packagingAnalyzer) fillAdditionalData(fsys fs.FS, app *types.Application) error {
for i, lib := range app.Libraries {
var licenses []string
for _, lic := range lib.Licenses {
// Parser adds `file://` prefix to filepath from `License-File` field
// We need to read this file to find licenses
// Otherwise, this is the name of the license
if !strings.HasPrefix(lic, "file://") {
DmitriyLewen marked this conversation as resolved.
Show resolved Hide resolved
licenses = append(licenses, lic)
continue
}
licenseFilePath := path.Base(strings.TrimPrefix(lic, "file://"))

findings, err := classifyLicense(app.FilePath, licenseFilePath, a.licenseClassifierConfidenceLevel, fsys)
if err != nil {
return err
} else if len(findings) == 0 {
continue
}

// License found
foundLicenses := lo.Map(findings, func(finding types.LicenseFinding, _ int) string {
return finding.Name
})
licenses = append(licenses, foundLicenses...)
}
app.Libraries[i].Licenses = licenses
}

r = pkginfoInZip
return nil
}

// classifyLicense opens the file licPath located next to dir (that is, in
// path.Dir(dir)) inside fsys and runs the license classifier over it with the
// given confidence level.
//
// It returns nil findings and a nil error when the file does not exist or the
// classifier returns no result. Any other open failure, or a classifier
// failure, is returned as a wrapped error.
func classifyLicense(dir, licPath string, classifierConfidenceLevel float64, fsys fs.FS) (types.LicenseFindings, error) {
	// Note that fs.FS is always slashed regardless of the platform,
	// and path.Join should be used rather than filepath.Join.
	f, err := fsys.Open(path.Join(path.Dir(dir), licPath))
	if errors.Is(err, fs.ErrNotExist) {
		return nil, nil
	} else if err != nil {
		return nil, xerrors.Errorf("file open error: %w", err)
	}
	defer f.Close()

	l, err := licensing.Classify(licPath, f, classifierConfidenceLevel)
	if err != nil {
		return nil, xerrors.Errorf("license classify error: %w", err)
	} else if l == nil {
		return nil, nil
	}

	return l.Findings, nil
}

// parse hands r to language.ParsePackage with the analyzer's configured parser
// and returns the Python package application it produces for filePath.
func (a packagingAnalyzer) parse(filePath string, r dio.ReadSeekerAt, checksum bool) (*types.Application, error) {
	app, err := language.ParsePackage(types.PythonPkg, filePath, r, a.pkgParser, checksum)
	return app, err
}

func (a packagingAnalyzer) analyzeEggZip(r io.ReaderAt, size int64) (dio.ReadSeekerAt, error) {
Expand All @@ -71,17 +177,16 @@ func (a packagingAnalyzer) analyzeEggZip(r io.ReaderAt, size int64) (dio.ReadSee
return nil, xerrors.Errorf("zip reader error: %w", err)
}

for _, file := range zr.File {
if !a.Required(file.Name, nil) {
continue
}

return a.open(file)
found, ok := lo.Find(zr.File, func(f *zip.File) bool {
return isEggFile(f.Name)
})
if !ok {
return nil, nil
}

return nil, nil
return a.open(found)
}

// open reads the file content in the zip archive to make it seekable.
func (a packagingAnalyzer) open(file *zip.File) (dio.ReadSeekerAt, error) {
f, err := file.Open()
if err != nil {
Expand All @@ -98,12 +203,13 @@ func (a packagingAnalyzer) open(file *zip.File) (dio.ReadSeekerAt, error) {
}

// Required reports whether filePath should be handed to this analyzer: any
// path containing ".dist-info", or any path matched by isEggFile.
//
// The previous suffix loop over requiredFiles is dropped. That slice was
// renamed to eggFiles, and the loop's trailing `return false` made this check
// unreachable.
func (a packagingAnalyzer) Required(filePath string, _ os.FileInfo) bool {
	return strings.Contains(filePath, ".dist-info") || isEggFile(filePath)
}

func isEggFile(filePath string) bool {
return lo.SomeBy(eggFiles, func(fileName string) bool {
return strings.HasSuffix(filePath, fileName)
})
}

func (a packagingAnalyzer) Type() analyzer.Type {
Expand Down
Loading