Before we make any requests, we need to load our API key.

In [1]:
# Access the API key from a local file.
# The context manager closes the file handle once the key has been read, and
# strip() removes the trailing newline most editors append (a newline is not
# accepted inside an HTTP header value).
with open('/Users/andreweden/Desktop/big_data/project/nvd_key.txt') as key_file:
    api_key = key_file.read().strip()

# The NVD 2.0 API reads the key from the "apiKey" request header
headers = {"apiKey": api_key}

Now we can use the API key to request some data from the REST JSON API, using `requests.get`. To familiarize ourselves with this process, let's just start with one item. Let's also store the total number of results for later.

In [2]:
import requests

# Request a single CVE record from the NVD REST API
base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=1&startIndex=0"
# The timeout keeps the cell from hanging indefinitely if the server never answers
response = requests.get(base_url, headers=headers, timeout=10)

# Check for successful response
if response.status_code == 200:
    data = response.json()
    print(data)

    # Store the total number of results for later
    total_results = data.get('totalResults', 0)

# Otherwise print error message
else:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

{'resultsPerPage': 1, 'startIndex': 0, 'totalResults': 230171, 'format': 'NVD_CVE', 'version': '2.0', 'timestamp': '2023-11-08T22:24:21.543', 'vulnerabilities': [{'cve': {'id': 'CVE-1999-0095', 'sourceIdentifier': 'cve@mitre.org', 'published': '1988-10-01T04:00:00.000', 'lastModified': '2019-06-11T20:29:00.263', 'vulnStatus': 'Modified', 'descriptions': [{'lang': 'en', 'value': 'The debug command in Sendmail is enabled, allowing attackers to execute commands as root.'}, {'lang': 'es', 'value': 'El comando de depuración de Sendmail está activado, permitiendo a atacantes ejecutar comandos como root.'}], 'metrics': {'cvssMetricV2': [{'source': 'nvd@nist.gov', 'type': 'Primary', 'cvssData': {'version': '2.0', 'vectorString': 'AV:N/AC:L/Au:N/C:C/I:C/A:C', 'accessVector': 'NETWORK', 'accessComplexity': 'LOW', 'authentication': 'NONE', 'confidentialityImpact': 'COMPLETE', 'integrityImpact': 'COMPLETE', 'availabilityImpact': 'COMPLETE', 'baseScore': 10.0}, 'baseSeverity': 'HIGH', 'exploitabili

Now let's write a function to extract the data we are interested in. This will take in data from the `'cve'`, `'metrics'`, and `'weaknesses'` categories above, and append it as one dictionary per CVE to a list called `all_data`.

In [3]:
def store_cve_data(all_data, cve_data, metrics_data, weakness_data):
    """Flatten one CVE record into a row dictionary and append it to ``all_data``.

    Args:
        all_data: List collecting one dictionary per CVE; mutated in place.
        cve_data: The ``'cve'`` dictionary of a vulnerability entry.
        metrics_data: Dictionary holding the CVSS fields read below
            (``baseScore``, ``baseSeverity``, ``accessVector``, ...).
        weakness_data: One entry of the CVE's ``'weaknesses'`` list.

    Returns:
        None. Any field that is missing is stored as an empty string.
    """
    def pick_value(entries, preferred_lang=None):
        """Return the 'value' of the preferred-language entry, else of the first entry, else ''."""
        # A missing key and an empty list are treated alike, so [0] can never raise IndexError
        if not entries:
            return ''
        if preferred_lang is not None:
            for entry in entries:
                if entry.get('lang') == preferred_lang:
                    return entry.get('value', '')
        return entries[0].get('value', '')

    data_dict = {
        # Store the 'cve' data
        'CVE_ID': cve_data.get('id', ''),
        'Published': cve_data.get('published', ''),
        'Last_Modified': cve_data.get('lastModified', ''),
        'Vulnerability_Status': cve_data.get('vulnStatus', ''),
        # Prefer the English description; fall back to the first one listed
        'Description': pick_value(cve_data.get('descriptions'), preferred_lang='en'),

        # Store the 'metrics' data
        'Base_Score': metrics_data.get('baseScore', ''),
        'Base_Severity': metrics_data.get('baseSeverity', ''),
        'Access_Vector': metrics_data.get('accessVector', ''),
        'Access_Complexity': metrics_data.get('accessComplexity', ''),
        'Authentication': metrics_data.get('authentication', ''),
        'Confidentiality_Impact': metrics_data.get('confidentialityImpact', ''),
        'Integrity_Impact': metrics_data.get('integrityImpact', ''),
        'Availability_Impact': metrics_data.get('availabilityImpact', ''),

        # Store the 'weakness' data
        'Weakness_Description': pick_value(weakness_data.get('description')),
    }
    all_data.append(data_dict)

Now we can test this on the first 20 results. Once we have all the data in the `all_data` list, we want to convert the data to a pandas dataframe.

In [5]:
import pandas as pd
import requests

# Request the first 20 CVE records from the NVD REST API
base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=20&startIndex=0"
# The timeout keeps the cell from hanging indefinitely if the server never answers
response = requests.get(base_url, headers=headers, timeout=10)

# Check for successful response
if response.status_code == 200:
    data = response.json()

    # Get the vulnerabilities from the data response
    vulnerabilities = data.get('vulnerabilities', [])

    # Declare list to store dictionary data
    all_data = []

    # Iterating through each vulnerability
    for vulnerability in vulnerabilities:
        # Get the 'cve', 'metrics', and 'weakness' data
        cve_data = vulnerability.get('cve', {})

        # `or [{}]` covers both a missing key and an empty list, so [0] cannot raise IndexError
        v2_metric = (cve_data.get('metrics', {}).get('cvssMetricV2') or [{}])[0]

        # In a v2 metric 'baseSeverity' sits next to 'cvssData' rather than inside it,
        # so copy it into the dictionary handed to store_cve_data
        metrics_data = dict(v2_metric.get('cvssData', {}))
        metrics_data.setdefault('baseSeverity', v2_metric.get('baseSeverity', ''))

        weakness_data = (cve_data.get('weaknesses') or [{}])[0]

        # Call the function to store the data in a dictionary
        store_cve_data(all_data, cve_data, metrics_data, weakness_data)

    # Creating a dataframe from the list of dictionaries
    df = pd.DataFrame(all_data)

    # Print the dataframe
    print(df)

# Otherwise print error message
else:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

           CVE_ID                Published            Last_Modified  \
0   CVE-1999-0095  1988-10-01T04:00:00.000  2019-06-11T20:29:00.263   
1   CVE-1999-0082  1988-11-11T05:00:00.000  2008-09-09T12:33:40.853   
2   CVE-1999-1471  1989-01-01T05:00:00.000  2008-09-05T20:19:36.257   
3   CVE-1999-1122  1989-07-26T04:00:00.000  2018-05-03T01:29:04.817   
4   CVE-1999-1467  1989-10-26T04:00:00.000  2017-12-19T02:29:08.393   
5   CVE-1999-1506  1990-01-29T05:00:00.000  2008-09-05T20:19:41.257   
6   CVE-1999-0084  1990-05-01T04:00:00.000  2017-10-10T01:29:00.387   
7   CVE-2000-0388  1990-05-09T04:00:00.000  2008-09-10T19:04:33.930   
8   CVE-1999-0209  1990-08-14T04:00:00.000  2008-09-09T12:34:01.117   
9   CVE-1999-1198  1990-10-03T04:00:00.000  2008-09-05T20:18:57.260   
10  CVE-1999-1391  1990-10-03T04:00:00.000  2008-09-05T20:19:24.600   
11  CVE-1999-1392  1990-10-03T04:00:00.000  2008-09-05T20:19:24.740   
12  CVE-1999-1057  1990-10-25T04:00:00.000  2008-09-05T20:18:37.230   
13  CV

Now let's make a request without limiting the results per page to get the `results_per_page` and the `total_results`.

In [13]:
# No resultsPerPage parameter is sent, so the response reports the page size the server uses.
# (Plain string: the URL has no placeholders, so the f-prefix was unnecessary.)
base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0/?startIndex=0"
# The timeout keeps the cell from hanging indefinitely if the server never answers
response = requests.get(base_url, headers=headers, timeout=10)

# Check for successful response
if response.status_code == 200:
    data = response.json()

    # Get the number of results per page
    results_per_page = data.get('resultsPerPage', 0)
    print(f"Results Per Page: {results_per_page}")

    # Get the total number of results
    total_results = data.get('totalResults', 0)
    print(f"Total Results: {total_results}")

# Otherwise print error message
else:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

Results Per Page: 2000
Total Results: 230171


Finally, we can update our code to iterate over all the pages. Let's also make sure to include a request session to properly handle any HTTP connection issues.

In [17]:
import time
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Declare starting index
start_index = 0

# Declare list to store dictionary data
all_data = []

print(f"Results per page {results_per_page}")

# Set up a session with a retry strategy
session = requests.Session()
# Declare the retry strategy
retry_strategy = Retry(
    total=3,                                     # Max number of retries per request
    status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes which should be retried
    allowed_methods=["GET"],                     # Only allow GET to be retried
    backoff_factor=1                             # Scales the growing delay between retries
)
# Create and mount an HTTP adapter from the retry strategy
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)

# Iterate through all the results
while start_index < total_results:
    # TESTING
    print(f"Start Index - {start_index}")

    # Request a response from the URL
    base_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0/?startIndex={start_index}"
    try:
        response = session.get(base_url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as error:
        # Timeouts and exhausted retries raise instead of returning a response;
        # stop here so the pages collected so far still reach the dataframe
        print(f"Failed to retrieve data. Error: {error}")
        break

    # Stop on anything other than a successful response
    if response.status_code != 200:
        print(f"Failed to retrieve data. Status Code: {response.status_code}")
        break

    data = response.json()

    # Get the vulnerabilities from the data response
    vulnerabilities = data.get('vulnerabilities', [])

    # An empty page would never advance start_index, so stop instead of looping forever
    if not vulnerabilities:
        print(f"No vulnerabilities returned at start index {start_index}; stopping.")
        break

    # Iterate through each vulnerability
    for vulnerability in vulnerabilities:
        # Get the 'cve', 'metrics', and 'weakness' data
        cve_data = vulnerability.get('cve', {})

        # `or [{}]` covers both a missing key and an empty list, so [0] cannot raise IndexError
        v2_metric = (cve_data.get('metrics', {}).get('cvssMetricV2') or [{}])[0]

        # In a v2 metric 'baseSeverity' sits next to 'cvssData' rather than inside it,
        # so copy it into the dictionary handed to store_cve_data
        metrics_data = dict(v2_metric.get('cvssData', {}))
        metrics_data.setdefault('baseSeverity', v2_metric.get('baseSeverity', ''))

        weakness_data = (cve_data.get('weaknesses') or [{}])[0]

        # Call the function to store the data in a dictionary
        store_cve_data(all_data, cve_data, metrics_data, weakness_data)

    # Advance by the number of records actually received on this page
    start_index += len(vulnerabilities)

    # Sleep for 6 seconds between requests (skipped once the last page is in)
    if start_index < total_results:
        time.sleep(6)

# Release the pooled connections held by the session
session.close()

# Creating a dataframe from the list of dictionaries
df = pd.DataFrame(all_data)

# Print the dataframe
print(df)

Results per page 2000
Start Index - 0
Start Index - 2000
Start Index - 4000
Start Index - 6000
Start Index - 8000
Start Index - 10000
Start Index - 12000
Start Index - 14000
Start Index - 16000
Start Index - 18000
Start Index - 20000
Start Index - 22000
Start Index - 24000
Start Index - 26000
Start Index - 28000
Start Index - 30000
Start Index - 32000
Start Index - 34000
Start Index - 36000
Start Index - 38000
Start Index - 40000
Start Index - 42000
Start Index - 44000
Start Index - 46000
Start Index - 48000
Start Index - 50000
Start Index - 52000
Start Index - 54000
Start Index - 56000
Start Index - 58000
Start Index - 60000
Start Index - 62000
Start Index - 64000
Start Index - 66000
Start Index - 68000
Start Index - 70000
Start Index - 72000
Start Index - 74000
Start Index - 76000
Start Index - 78000
Start Index - 80000
Start Index - 82000
Start Index - 84000
Start Index - 86000
Start Index - 88000
Start Index - 90000
Start Index - 92000
Start Index - 94000
Start Index - 96000
Start 

In [18]:
# Write the dataframe to disk as CSV, leaving out the row index column
df.to_csv(path_or_buf='output.csv', index=False)