# Final Project Data Gathering - Andrew Eden

## Getting Started with the API

Before we make any requests, we need to initialize our api key. 

In [32]:
# Access the api key from a local file; the context manager closes the handle
with open('/Users/andreweden/Desktop/big_data/project/big_data_group_project/nvd_key.txt') as key_file:
    # Strip surrounding whitespace so a trailing newline in the file does not
    # end up inside the HTTP header value
    api_key = key_file.read().strip()

# The NVD API 2.0 reads the key from the "apiKey" request header
headers = {"apiKey": api_key}

Now we can use the API key to request some data from the REST JSON API, using `requests.get`. To familiarize ourselves with this process, let's just start with one item. Let's also store the total number of results for later. 

In [6]:
import requests

# Ask the NVD CVE endpoint for exactly one record, starting at index 0
base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=1&startIndex=0"
response = requests.get(base_url, headers=headers)

# Anything other than HTTP 200 is reported as a failure
if response.status_code != 200:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

# On success, decode the JSON body and show it
else:
    data = response.json()
    print(data)

{'resultsPerPage': 1, 'startIndex': 0, 'totalResults': 231469, 'format': 'NVD_CVE', 'version': '2.0', 'timestamp': '2023-11-24T16:59:32.860', 'vulnerabilities': [{'cve': {'id': 'CVE-1999-0095', 'sourceIdentifier': 'cve@mitre.org', 'published': '1988-10-01T04:00:00.000', 'lastModified': '2019-06-11T20:29:00.263', 'vulnStatus': 'Modified', 'descriptions': [{'lang': 'en', 'value': 'The debug command in Sendmail is enabled, allowing attackers to execute commands as root.'}, {'lang': 'es', 'value': 'El comando de depuración de Sendmail está activado, permitiendo a atacantes ejecutar comandos como root.'}], 'metrics': {'cvssMetricV2': [{'source': 'nvd@nist.gov', 'type': 'Primary', 'cvssData': {'version': '2.0', 'vectorString': 'AV:N/AC:L/Au:N/C:C/I:C/A:C', 'accessVector': 'NETWORK', 'accessComplexity': 'LOW', 'authentication': 'NONE', 'confidentialityImpact': 'COMPLETE', 'integrityImpact': 'COMPLETE', 'availabilityImpact': 'COMPLETE', 'baseScore': 10.0}, 'baseSeverity': 'HIGH', 'exploitabili

Now let's write a function called `store_cve_data` to extract the data we are interested in. This will take in data from the `'cve'`, `'metrics'`, and `'weakness'` categories above, all of which will be stored in a dictionary called `all_data`. We also need to create a function `extract_metrics_info` that deals with the nuances of extracting from the metrics category, given that the metric fields have changed over time with the CVSS version. This function will dynamically extract characteristics from the primary metric, ignoring secondary metrics for convenience. 

In [7]:
def _first_present(*candidates):
    """Return the first candidate that is neither None nor '', else ''."""
    for candidate in candidates:
        # Explicit checks (not truthiness) so a legitimate 0.0 score survives
        if candidate is not None and candidate != '':
            return candidate
    return ''


def extract_metrics_info(metrics_data):
    """Extract the characteristics of the first Primary CVSS metric.

    Args:
        metrics_data: Mapping of metric type name to a list of metric
            dictionaries (the 'metrics' value of a CVE entry). None or an
            empty mapping yields all-blank fields.

    Returns:
        A dictionary with the keys 'baseSeverity', 'baseScore',
        'accessVector', 'accessComplexity', 'authentication',
        'confidentialityImpact', 'integrityImpact' and
        'availabilityImpact'. Every value is '' when no metric of type
        'Primary' is found.
    """
    data = {
        'baseSeverity': '',
        'baseScore': '',
        'accessVector': '',
        'accessComplexity': '',
        'authentication': '',
        'confidentialityImpact': '',
        'integrityImpact': '',
        'availabilityImpact': ''
    }

    for metrics_list in (metrics_data or {}).values():
        for metric_info in metrics_list or []:
            # Only extract the primary metric
            if metric_info.get('type') != 'Primary':
                continue

            cvss_data = metric_info.get('cvssData') or {}

            # Fields that may live on the metric itself or inside cvssData
            for field in ('baseSeverity', 'baseScore', 'confidentialityImpact',
                          'integrityImpact', 'availabilityImpact'):
                data[field] = _first_present(metric_info.get(field), cvss_data.get(field))

            # Fields whose key name differs between metric layouts
            data['authentication'] = _first_present(
                cvss_data.get('authentication'), cvss_data.get('privilegesRequired'))
            data['accessVector'] = _first_present(
                cvss_data.get('accessVector'), cvss_data.get('attackVector'))
            data['accessComplexity'] = _first_present(
                cvss_data.get('accessComplexity'), cvss_data.get('attackComplexity'))

            # Return (not break) so that Primary entries listed under later
            # metric types cannot overwrite the one just extracted
            return data

    return data

In [8]:
def store_cve_data(cve_data, metrics_data, weakness_data, description_text):
    """Flatten one CVE entry into a single row dictionary.

    Args:
        cve_data: The 'cve' dictionary of a vulnerability. Its 'id',
            'published', 'lastModified' and 'vulnStatus' keys are read,
            each defaulting to '' when absent.
        metrics_data: The entry's metrics mapping, passed on to
            `extract_metrics_info`.
        weakness_data: Stored unchanged under 'Weakness'.
        description_text: Stored unchanged under 'Description'.

    Returns:
        A dictionary keyed by output column name, in this order: the four
        CVE columns, 'Description', the eight metric columns, 'Weakness'.
    """
    # Pull the metric characteristics first
    metrics = extract_metrics_info(metrics_data)

    # (output column, source key) pairs; tuple order fixes the column order
    cve_columns = (
        ('CVE_ID', 'id'),
        ('Published', 'published'),
        ('Last_Modified', 'lastModified'),
        ('Vulnerability_Status', 'vulnStatus'),
    )
    metric_columns = (
        ('Base_Score', 'baseScore'),
        ('Base_Severity', 'baseSeverity'),
        ('Access_Vector', 'accessVector'),
        ('Access_Complexity', 'accessComplexity'),
        ('Authentication', 'authentication'),
        ('Confidentiality_Impact', 'confidentialityImpact'),
        ('Integrity_Impact', 'integrityImpact'),
        ('Availability_Impact', 'availabilityImpact'),
    )

    # The 'cve' fields, blank when missing
    row = {column: cve_data.get(field, '') for column, field in cve_columns}
    row['Description'] = description_text

    # The 'metrics' fields
    row.update((column, metrics[field]) for column, field in metric_columns)

    # The 'weakness' data
    row['Weakness'] = weakness_data

    return row

## Converting a small portion to Pandas 

Let's now create a function that splits the requested data before calling `store_cve_data`. 

In [9]:
def extract_data(data):
    """Convert a decoded API response into a dataframe, one row per CVE.

    Args:
        data: Decoded JSON response; its 'vulnerabilities' list is read
            (a missing key is treated as an empty list).

    Returns:
        A pandas DataFrame built from the dictionaries produced by
        `store_cve_data`, one per vulnerability.
    """
    # Declare list to store dictionary data
    extracted_data = []

    # Iterate through each vulnerability
    for vulnerability in data.get('vulnerabilities', []):
        # Get the 'cve', 'metrics' data
        cve_data = vulnerability.get('cve', {})
        metrics_data = cve_data.get('metrics', {})

        # Collect the first description value of every weakness. A missing,
        # empty or value-less description yields '' instead of raising
        # IndexError/KeyError.
        weaknesses = []
        for weakness in cve_data.get('weaknesses', []):
            weakness_descriptions = weakness.get('description') or [{}]
            weaknesses.append(weakness_descriptions[0].get('value', ''))

        # Extract only the first English description; `or ''` guards
        # against an explicit null value, which has no .replace()
        descriptions = cve_data.get('descriptions', [])
        english_description = next(
            (desc.get('value') or '' for desc in descriptions if desc.get('lang') == 'en'),
            ''
        )
        # Remove carriage returns and newlines
        english_description = english_description.replace('\r', '').replace('\n', '')

        # Store the flattened row for this vulnerability
        extracted_data.append(
            store_cve_data(cve_data, metrics_data, weaknesses, english_description)
        )

    # Creating a dataframe from the list of dictionaries
    return pd.DataFrame(extracted_data)

Now we can test this on the first 20 results. Once we have all the data in the `all_data` dictionary, we want to convert the data to a pandas dataframe.

In [18]:
import pandas as pd
import requests

# Fetch the first 20 CVE records from the NVD endpoint
base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0/?resultsPerPage=20&startIndex=0"
response = requests.get(base_url, headers=headers)

# Report any non-200 status and do nothing else
if response.status_code != 200:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

# Success: parse, tabulate, display, and persist the sample
else:
    # Decode the JSON payload
    data = response.json()

    # Flatten the payload into a dataframe
    df = extract_data(data)

    # Show the dataframe
    print(df)

    # Write the sample out as CSV, leaving out the index column
    df.to_csv('20test.csv', index=False)

           CVE_ID                Published            Last_Modified  \
0   CVE-1999-0095  1988-10-01T04:00:00.000  2019-06-11T20:29:00.263   
1   CVE-1999-0082  1988-11-11T05:00:00.000  2008-09-09T12:33:40.853   
2   CVE-1999-1471  1989-01-01T05:00:00.000  2008-09-05T20:19:36.257   
3   CVE-1999-1122  1989-07-26T04:00:00.000  2018-05-03T01:29:04.817   
4   CVE-1999-1467  1989-10-26T04:00:00.000  2017-12-19T02:29:08.393   
5   CVE-1999-1506  1990-01-29T05:00:00.000  2008-09-05T20:19:41.257   
6   CVE-1999-0084  1990-05-01T04:00:00.000  2017-10-10T01:29:00.387   
7   CVE-2000-0388  1990-05-09T04:00:00.000  2008-09-10T19:04:33.930   
8   CVE-1999-0209  1990-08-14T04:00:00.000  2008-09-09T12:34:01.117   
9   CVE-1999-1198  1990-10-03T04:00:00.000  2008-09-05T20:18:57.260   
10  CVE-1999-1391  1990-10-03T04:00:00.000  2008-09-05T20:19:24.600   
11  CVE-1999-1392  1990-10-03T04:00:00.000  2008-09-05T20:19:24.740   
12  CVE-1999-1057  1990-10-25T04:00:00.000  2008-09-05T20:18:37.230   
13  CV

### Testing a problematic entry

In [20]:
# Look up a single CVE by its ID (this request sends no headers)
cve_id = 'CVE-2023-26153'
base_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0?cveId={cve_id}"
response = requests.get(base_url)

# Report a non-200 status; otherwise dump the decoded JSON
if response.status_code != 200:
    print(f"Failed to retrieve data for CVE ID: {cve_id}. Status Code: {response.status_code}")
else:
    data = response.json()
    print(data)

{'resultsPerPage': 1, 'startIndex': 0, 'totalResults': 1, 'format': 'NVD_CVE', 'version': '2.0', 'timestamp': '2023-11-24T17:02:52.443', 'vulnerabilities': [{'cve': {'id': 'CVE-2023-26153', 'sourceIdentifier': 'report@snyk.io', 'published': '2023-10-06T05:15:52.803', 'lastModified': '2023-11-07T04:09:29.387', 'vulnStatus': 'Modified', 'descriptions': [{'lang': 'en', 'value': "Versions of the package geokit-rails before 2.5.0 are vulnerable to Command Injection due to unsafe deserialisation of YAML within the 'geo_location' cookie. This issue can be exploited remotely via a malicious cookie value.\r\r**Note:**\r\r An attacker can use this vulnerability to execute commands on the host system."}, {'lang': 'es', 'value': "Las versiones del paquete geokit-rails anteriores a la 2.5.0 son vulnerables a la inyección de comandos debido a una deserialización insegura de YAML dentro de la cookie 'geo_location'. Este problema se puede explotar de forma remota mediante un valor de cookie malicioso.

In [22]:
# Fetch one early CVE record by ID
cve_id = 'CVE-1999-0095'
base_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0?cveId={cve_id}"
response = requests.get(base_url, headers=headers)

# A non-200 status is reported and nothing else happens
if response.status_code != 200:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

# Otherwise decode the body, flatten it, and show the result
else:
    # Decode the JSON payload
    data = response.json()

    # Flatten the payload into a dataframe
    df = extract_data(data)

    # Show the dataframe
    print(df)

          CVE_ID                Published            Last_Modified  \
0  CVE-1999-0095  1988-10-01T04:00:00.000  2019-06-11T20:29:00.263   

  Vulnerability_Status                                        Description  \
0             Modified  The debug command in Sendmail is enabled, allo...   

   Base_Score Base_Severity Access_Vector Access_Complexity Authentication  \
0        10.0          HIGH       NETWORK               LOW           NONE   

  Confidentiality_Impact Integrity_Impact Availability_Impact         Weakness  
0               COMPLETE         COMPLETE            COMPLETE  [NVD-CWE-Other]  


In [24]:
# Fetch the problematic record by ID, this time sending the headers
cve_id = 'CVE-2023-26153'
base_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0?cveId={cve_id}"
response = requests.get(base_url, headers=headers)

# A non-200 status is reported and nothing else happens
if response.status_code != 200:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

# Otherwise decode the body, flatten it, and show the result
else:
    # Decode the JSON payload
    data = response.json()

    # Flatten the payload into a dataframe
    df = extract_data(data)

    # Show the dataframe
    print(df)

           CVE_ID                Published            Last_Modified  \
0  CVE-2023-26153  2023-10-06T05:15:52.803  2023-11-07T04:09:29.387   

  Vulnerability_Status                                        Description  \
0             Modified  Versions of the package geokit-rails before 2....   

   Base_Score Base_Severity Access_Vector Access_Complexity Authentication  \
0         9.8      CRITICAL       NETWORK               LOW           NONE   

  Confidentiality_Impact Integrity_Impact Availability_Impact  \
0                   HIGH             HIGH                HIGH   

            Weakness  
0  [CWE-502, CWE-78]  


In [25]:
# Fetch a third record by ID
cve_id = 'CVE-2007-4543'
base_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0?cveId={cve_id}"
response = requests.get(base_url, headers=headers)

# A non-200 status is reported and nothing else happens
if response.status_code != 200:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

# Otherwise decode the body, flatten it, and show the result
else:
    # Decode the JSON payload
    data = response.json()

    # Flatten the payload into a dataframe
    df = extract_data(data)

    # Show the dataframe
    print(df)

          CVE_ID                Published            Last_Modified  \
0  CVE-2007-4543  2007-08-27T21:17:00.000  2018-10-15T21:36:01.777   

  Vulnerability_Status                                        Description  \
0             Modified  Cross-site scripting (XSS) vulnerability in en...   

   Base_Score Base_Severity Access_Vector Access_Complexity Authentication  \
0         4.3        MEDIUM       NETWORK            MEDIUM           NONE   

  Confidentiality_Impact Integrity_Impact Availability_Impact  Weakness  
0                   NONE          PARTIAL                NONE  [CWE-79]  


## Requesting and Storing the Final Data

Now let's make a request without limiting the results per page to get the `results_per_page` and the `total_results`.

In [26]:
# Request the first page without a resultsPerPage parameter
base_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0/?startIndex=0"
response = requests.get(base_url, headers=headers)

# A non-200 status is reported; the two counters are then left unset
if response.status_code != 200:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

# On success, read the paging counters out of the response
else:
    data = response.json()

    # Page size reported by the response (0 if the field is absent)
    results_per_page = data.get('resultsPerPage', 0)
    print(f"Results Per Page: {results_per_page}")

    # Total number of records reported by the response (0 if absent)
    total_results = data.get('totalResults', 0)
    print(f"Total Results: {total_results}")

Results Per Page: 2000
Total Results: 231469


Finally, we can update our code to iterate over all the pages. Let's also make sure to include a request session to properly handle any HTTP connection issues. Let's also implement parallel processing. 

# Basic Form

In [35]:
import time
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Declare starting index
start_index = 0

# Declare list to store the per-page dataframes
all_data = []

print(f"Results per page {results_per_page}")

# A non-positive page size would never advance start_index (endless loop)
if results_per_page <= 0:
    raise ValueError(f"results_per_page must be positive, got {results_per_page}")

# Set up a session with a retry strategy
session = requests.Session()
# Declare the retry strategy
retry_strategy = Retry(
    total=3,                                     # Max number of retries per request
    status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes which should be retried
    allowed_methods=["GET"],                     # Only allow GET to be retried
    backoff_factor=1                             # Multiplier for the growing delay between retries
)
# Create and mount an HTTP adapter from the retry strategy
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)

# Iterate through all the results
while start_index < total_results:
    # TESTING
    print(f"Start Index - {start_index}")

    # Request a response from the URL
    base_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0/?startIndex={start_index}"
    try:
        response = session.get(base_url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        # Stop paging but fall through to the save step, so the pages
        # already downloaded are not lost
        print(f"Request failed: {e}")
        break

    # Sleep for 6 seconds between requests
    time.sleep(6)

    # Check for a successful response
    if response.status_code == 200:
        data = response.json()

        # Get the dataframe for current indexed data
        indexed_data = extract_data(data)

        # Append the data here
        all_data.append(indexed_data)

        # Increase the start index
        start_index += results_per_page

    # Otherwise print an error message
    else:
        print(f"Failed to retrieve data. Status Code: {response.status_code}")
        break

# pd.concat raises ValueError on an empty list, so only build and save the
# dataframe when at least one page was retrieved
if all_data:
    # Creating a single dataframe from the list of per-page dataframes
    df = pd.concat(all_data, ignore_index=True)

    # Print the dataframe
    print(df)

    # Save the DataFrame to a CSV file
    df.to_csv('output.csv', index=False)
else:
    print("No data retrieved; nothing to save.")

Results per page 2000
Start Index - 0


KeyboardInterrupt: 

# NEW GARBAGE

In [None]:
import time
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Declare starting index
start_index = 0

# Declare list to store the per-page dataframes
all_data = []

# Give up once this many requests in a row have failed, instead of
# retrying the same page forever
max_consecutive_failures = 5
consecutive_failures = 0

print(f"Results per page {results_per_page}")

# Set up a session with a retry strategy
session = requests.Session()
# Declare the retry strategy
retry_strategy = Retry(
    total=5,                                          # Max number of retries per request
    status_forcelist=[429, 500, 502, 503, 504, 403],  # HTTP status codes which should be retried
    allowed_methods=["GET"],                          # Only allow GET to be retried
    backoff_factor=1,                                 # Multiplier for the growing delay between retries
    connect=5,                                        # Max number of retries for establishing connection
    read=5                                            # Max number of retries for reading response
)
# Create and mount an HTTP adapter from the retry strategy
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)

# Iterate through all the results
while start_index < total_results:
    # TESTING
    print(f"Start Index - {start_index}")

    # Request a response from the URL
    base_url = f"https://services.nvd.nist.gov/rest/json/cves/2.0/?startIndex={start_index}"

    try:
        response = session.get(base_url, headers=headers, timeout=30)

        # A non-200 status ends the run
        if response.status_code != 200:
            print(f"Failed to retrieve data. Status Code: {response.status_code}")
            break

        data = response.json()

    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")

        # Count the failure; the same start_index is retried on the next
        # pass unless the limit has been reached
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            print(f"Giving up after {consecutive_failures} consecutive failed requests.")
            break

    else:
        # Runs only when the try block finished without exception or break
        consecutive_failures = 0

        # Get the dataframe for current indexed data
        indexed_data = extract_data(data)

        # Append the data here
        all_data.append(indexed_data)

        # Increase the start index
        start_index += results_per_page

    # Sleep for 6 seconds between requests, after failures as well, so a
    # failing request is not retried immediately
    time.sleep(6)

# pd.concat raises ValueError on an empty list, so only build and save the
# dataframe when at least one page was retrieved
if all_data:
    # Creating a single dataframe from the list of per-page dataframes
    df = pd.concat(all_data, ignore_index=True)

    # Print the dataframe
    print(df)

    # Save the DataFrame to a CSV file
    df.to_csv('output.csv', index=False)
else:
    print("No data retrieved; nothing to save.")

Results per page 2000
Start Index - 0
Start Index - 2000
Start Index - 4000
Start Index - 6000
Start Index - 8000
Start Index - 10000
Start Index - 12000
Start Index - 14000
Start Index - 16000
Start Index - 18000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=18000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 18000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=18000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 18000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=18000 (Caused by ConnectTimeoutError(<urllib3.connection.HTT

Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=36000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 36000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=36000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 36000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=36000 (Caused by ResponseError('too many 503 error responses'))
Start Index - 36000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=36000 (Caused by ReadTimeoutError("HTTPSConnection

Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=58000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 58000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=58000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 58000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=58000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 58000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cv

Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=74000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 74000
Start Index - 76000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=76000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 76000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded with url: /rest/json/cves/2.0/?startIndex=76000 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Read timed out. (read timeout=30)"))
Start Index - 76000
Request failed: HTTPSConnectionPool(host='services.nvd.nist.gov', port=443): Max retries exceeded wit

## Augmenting the Data with CVE

We can use other databases like the CVE database to augment our data. As before, we can request the JSON data for a vulnerability using an API request. As an example we'll use a randomly selected ID, `CVE-2023-22334`. 

In [None]:
# Build the record URL for one example CVE ID and request it
cve_id = "CVE-2023-22334"
api_url = "https://cveawg.mitre.org/api/cve/{}".format(cve_id)
response = requests.get(url=api_url)

In [None]:
# Anything other than HTTP 200 is reported as a failure
if response.status_code != 200:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

# On success, decode the JSON body and show it
else:
    data = response.json()
    print(data)

Next, we can modify the code above to extract certain characteristics from the page. 

In [None]:
# Report a non-200 status and skip the extraction
if response.status_code != 200:
    print(f"Failed to retrieve data. Status Code: {response.status_code}")

else:
    data = response.json()

    # Record-level metadata
    state = data['cveMetadata']['state']
    date_reserved = data['cveMetadata']['dateReserved']

    # One vendor, product and first-listed version per affected entry
    affected_entries = data['containers']['cna']['affected']
    vendors = [entry['vendor'] for entry in affected_entries]
    products = [entry['product'] for entry in affected_entries]
    versions = [entry['versions'][0]['version'] for entry in affected_entries]

    # Show what was extracted, joining the lists with commas
    print(f"State: {state}")
    print(f"Date Reserved: {date_reserved}")
    print(f"Vendors: {', '.join(vendors)}")
    print(f"Products: {', '.join(products)}")
    print(f"Versions: {', '.join(versions)}")

Next, we read the previously created csv file as a pandas dataframe. 

In [None]:
import pandas as pd

# Read 'output.csv' from the current working directory into a DataFrame
df = pd.read_csv('output.csv')

# Display the first few rows of the DataFrame as a quick sanity check
print(df.head())

Finally, we can modify the code to iterate through each CVE ID in our output file and add all the information from the JSON API to the pandas dataframe.

In [None]:
def fetch_info(cve_id):
    """Fetch state, reservation date and affected products for one CVE.

    Requests `https://cveawg.mitre.org/api/cve/<cve_id>` and reads the
    'cveMetadata' and 'containers' -> 'cna' -> 'affected' sections of the
    JSON response.

    Args:
        cve_id: The CVE identifier to look up, e.g. 'CVE-2023-22334'.

    Returns:
        A pandas Series with the keys 'CVE_ID', 'State', 'Date_Reserved',
        'Vendors', 'Products' and 'Versions'. The last three are
        comma-separated strings, or None when the record lists no affected
        entries. Every field except 'CVE_ID' is None when the request
        fails, returns a non-200 status, or the body is not valid JSON.
    """
    # Row returned unchanged whenever the record cannot be retrieved
    record = {
        'CVE_ID': cve_id,
        'State': None,
        'Date_Reserved': None,
        'Vendors': None,
        'Products': None,
        'Versions': None
    }

    api_url = f"https://cveawg.mitre.org/api/cve/{cve_id}"
    try:
        # The timeout keeps one stalled connection from blocking a worker
        response = requests.get(api_url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve data for CVE_ID: {cve_id}. Error: {e}")
        return pd.Series(record)

    if response.status_code != 200:
        print(f"Failed to retrieve data for CVE_ID: {cve_id}. Status Code: {response.status_code}")
        return pd.Series(record)

    try:
        data = response.json()
    except ValueError as e:
        # JSON decoding errors are ValueError subclasses
        print(f"Failed to retrieve data for CVE_ID: {cve_id}. Error: {e}")
        return pd.Series(record)

    print(cve_id)

    metadata = data.get('cveMetadata', {})
    record['State'] = metadata.get('state')
    record['Date_Reserved'] = metadata.get('dateReserved')

    affected_data = data.get('containers', {}).get('cna', {}).get('affected', [])
    if affected_data:
        # Skip entries missing a field: str.join raises TypeError on None
        record['Vendors'] = ', '.join(
            item['vendor'] for item in affected_data if item.get('vendor')
        )
        record['Products'] = ', '.join(
            item['product'] for item in affected_data if item.get('product')
        )
        # Only the first listed version of each affected entry is kept
        record['Versions'] = ', '.join(
            item['versions'][0]['version']
            for item in affected_data
            if item.get('versions') and item['versions'][0].get('version')
        )

    return pd.Series(record)

Let's now iterate through each CVE ID and run the function above. To speed up the process, we have also implemented multithreading.

In [None]:
import requests
import concurrent.futures
import pandas as pd

# List of CVE IDs
cve_ids = df['CVE_ID'].tolist()

results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    # Map each submitted future back to the CVE ID it is fetching
    futures = {executor.submit(fetch_info, cve_id): cve_id for cve_id in cve_ids}

    # Collect results in completion order (not submission order)
    for future in concurrent.futures.as_completed(futures):
        cve_id = futures[future]
        try:
            data = future.result()
        except Exception as e:
            # future.result() re-raises whatever the worker raised; record
            # a blank row for this ID rather than aborting the whole run
            # and losing the results gathered so far
            print(f"Failed to process CVE_ID: {cve_id}. Error: {e}")
            data = pd.Series({
                'CVE_ID': cve_id,
                'State': None,
                'Date_Reserved': None,
                'Vendors': None,
                'Products': None,
                'Versions': None
            })
        results.append(data)

# Convert the results to a DataFrame
new_df = pd.DataFrame(results)

In [None]:
# Apply the function to each row of the DataFrame
#new_data = df['CVE_ID'].apply(fetch_info)

In [None]:
# Write new_df to 'extra_data.csv'; index=False leaves the row index out of the file
new_df.to_csv('extra_data.csv', index=False)

In [None]:
# Join the original rows with the newly fetched columns on the shared 'CVE_ID' key
augmented_df = pd.merge(left=df, right=new_df, on='CVE_ID')

# Preview the first rows of the combined DataFrame
print(augmented_df.head())

In [None]:
# Write augmented_df to 'augmented_data.csv'; index=False leaves the row index out of the file
augmented_df.to_csv('augmented_data.csv', index=False)