In [65]:
import os, sys
import json
from tqdm import tqdm

In [2]:
# Root folder that is scanned recursively for advisory JSON files.
# NOTE(review): '{advisory_path}' is a literal placeholder -- this is a plain
# string (no f-prefix and no .format call is visible), so it has to be replaced
# with the real directory before the notebook is run.
github_advisory_path = '{advisory_path}/advisories/github-reviewed/'
# Directory that receives the generated *_reports.json files.
output_folder = '.'

In [6]:
# Accumulator filled by extract_vulns(); reset whenever this cell is re-run.
vuln_paths = []
def extract_vulns(cur_folder):
    """Recursively collect the paths of all ``.json`` files under ``cur_folder``.

    Matching paths are appended to the module-level ``vuln_paths`` list;
    nothing is returned.  Within each folder the JSON files are recorded
    first and the sub-folders are visited afterwards, both in sorted name
    order, so the resulting list is identical on every run
    (``os.listdir`` by itself yields entries in arbitrary order).

    Args:
        cur_folder: Directory to scan.

    Raises:
        OSError: If ``cur_folder`` or one of its sub-folders cannot be listed.
    """
    sub_paths = [os.path.join(cur_folder, name)
                 for name in sorted(os.listdir(cur_folder))]
    folder_paths = [path for path in sub_paths if os.path.isdir(path)]
    # Only regular files count: a directory whose name happens to end in
    # '.json' is recursed into below instead of being recorded as a report.
    json_paths = [path for path in sub_paths
                  if path.endswith('.json') and os.path.isfile(path)]

    # Extend in place rather than rebuilding the whole list for every folder.
    vuln_paths.extend(json_paths)
    for path in folder_paths:
        extract_vulns(path)
# Fill vuln_paths with every .json path found below github_advisory_path.
extract_vulns(github_advisory_path)

In [8]:
# Parse every collected advisory file into a dict, keeping vuln_paths order.
vuln_reports = []
for path in tqdm(vuln_paths):
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # platform's locale encoding, which can fail or mis-decode non-ASCII
    # advisory text on systems whose default is not UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        report = json.load(f)
    vuln_reports.append(report)

100%|██████████| 15614/15614 [00:01<00:00, 15466.73it/s]


In [9]:
# Write all parsed reports to <output_folder>/vuln_reports.json as one JSON array.
vuln_report_path = os.path.join(output_folder, 'vuln_reports.json')
with open(vuln_report_path, 'w') as f:
    json.dump(vuln_reports, f)

In [41]:
def get_ecosystem(vuln_report):
    """Return the single package ecosystem a vulnerability report affects.

    Args:
        vuln_report: Parsed advisory dict.  Its ``'affected'`` value, when
            present, is a list of entries that each carry
            ``entry['package']['ecosystem']``.

    Returns:
        ``'multiple'`` when the affected entries name more than one distinct
        ecosystem, ``'unknown'`` when there are no affected entries (empty
        list, missing key or ``None``), otherwise the ecosystem string.

    Raises:
        KeyError: If an affected entry lacks ``'package'`` or ``'ecosystem'``.
    """
    # A report without an 'affected' list is handled like one with an empty
    # list, so it lands in the existing 'unknown' bucket instead of raising.
    affecteds = vuln_report.get('affected') or []
    ecosystems = {entry['package']['ecosystem'] for entry in affecteds}
    if not ecosystems:
        return 'unknown'
    if len(ecosystems) > 1:
        return 'multiple'
    # Exactly one distinct ecosystem remains.
    return next(iter(ecosystems))
    
def get_libraries(vuln_report):
    """Return the distinct package names a vulnerability report affects.

    Args:
        vuln_report: Parsed advisory dict.  Its ``'affected'`` value, when
            present, is a list of entries that each carry
            ``entry['package']['name']``.

    Returns:
        A list of the unique package names in sorted order; empty when the
        report has no affected entries (empty list, missing key or ``None``).

    Raises:
        KeyError: If an affected entry lacks ``'package'`` or ``'name'``.
    """
    affecteds = vuln_report.get('affected') or []
    # sorted() instead of list(): iterating a set of strings yields an order
    # that can differ between interpreter runs, which would make anything
    # derived from this list irreproducible.
    return sorted({entry['package']['name'] for entry in affecteds})

In [34]:
# Bucket the reports by ecosystem in one pass; reports whose ecosystem is not
# one of the four keys below (including 'multiple' / 'unknown') are left out.
reports_by_ecosystem = {'Maven': [], 'npm': [], 'PyPI': [], 'Go': []}
for report in vuln_reports:
    bucket = reports_by_ecosystem.get(get_ecosystem(report))
    if bucket is not None:
        bucket.append(report)

maven_reports = reports_by_ecosystem['Maven']
npm_reports = reports_by_ecosystem['npm']
pypi_reports = reports_by_ecosystem['PyPI']
go_reports = reports_by_ecosystem['Go']
# Concatenation order: Maven, npm, PyPI, Go.
combined_reports = maven_reports + npm_reports + pypi_reports + go_reports

print(
    'maven', len(maven_reports),
    'npm', len(npm_reports),
    'pypi', len(pypi_reports),
    'go', len(go_reports),
)

maven 4308 npm 3193 pypi 2237 go 1351


In [84]:
# Total number of npm reports and the sizes of a 3/5 : 1/5 : 1/5 partition of it.
npm_total = len(npm_reports)
npm_cut_3_5 = int(npm_total * 3/5)
npm_cut_4_5 = int(npm_total * 4/5)
npm_total, npm_cut_3_5, npm_cut_4_5 - npm_cut_3_5, npm_total - npm_cut_4_5

(3193, 1915, 639, 639)

In [91]:
# Number of distinct library names across all Go reports.
len({lib for vuln in go_reports for lib in get_libraries(vuln)})

601

In [85]:
# Total number of PyPI reports and the sizes of a 3/5 : 1/5 : 1/5 partition of it.
pypi_total = len(pypi_reports)
pypi_cut_3_5 = int(pypi_total * 3/5)
pypi_cut_4_5 = int(pypi_total * 4/5)
pypi_total, pypi_cut_3_5, pypi_cut_4_5 - pypi_cut_3_5, pypi_total - pypi_cut_4_5

(2237, 1342, 447, 448)

In [86]:
# Total number of Go reports and the sizes of a 3/5 : 1/5 : 1/5 partition of it.
go_total = len(go_reports)
go_cut_3_5 = int(go_total * 3/5)
go_cut_4_5 = int(go_total * 4/5)
go_total, go_cut_3_5, go_cut_4_5 - go_cut_3_5, go_total - go_cut_4_5

(1351, 810, 270, 271)

In [35]:
# Output locations, one JSON file per ecosystem plus the combined set.
maven_report_path = os.path.join(output_folder, 'maven_reports.json')
npm_report_path = os.path.join(output_folder, 'npm_reports.json')
pypi_report_path = os.path.join(output_folder, 'pypi_reports.json')
go_report_path = os.path.join(output_folder, 'go_reports.json')
combined_report_path = os.path.join(output_folder, 'combined_reports.json')

# Dump each report list to its file, in the order listed here.
for target_path, target_reports in (
    (maven_report_path, maven_reports),
    (npm_report_path, npm_reports),
    (pypi_report_path, pypi_reports),
    (go_report_path, go_reports),
    (combined_report_path, combined_reports),
):
    with open(target_path, 'w') as f:
        json.dump(target_reports, f)

In [73]:
# Keep only the reports whose single ecosystem is 'crates.io'.
rust_reports = []
for report in vuln_reports:
    if get_ecosystem(report) == 'crates.io':
        rust_reports.append(report)
print(len(rust_reports))

681


In [74]:
# Write the crates.io reports to <output_folder>/rust_reports.json.
rust_report_path = os.path.join(output_folder, 'rust_reports.json')
with open(rust_report_path, 'w') as f:
    json.dump(rust_reports, f)

In [75]:
# Flatten the per-report CWE id lists into one list (duplicates are kept).
rust_cwe = []
for report in rust_reports:
    rust_cwe.extend(report['database_specific']['cwe_ids'])

In [77]:
import pandas as pd

In [78]:
# Wrap the flat rust_cwe list in a DataFrame (one row per list entry) so the
# values can be tallied with pandas.
df = pd.DataFrame(rust_cwe)

In [None]:
# Distinct rows of df, in the order value_counts() returns them
# (most frequent first by default).
df.value_counts().index