# Web Scraping CVE Details and Updating a CSV File with Python

This notebook scrapes CVE details (CVSS scores, EPSS scores, vulnerability categories, and reference links) from cvedetails.com and writes them back into a CSV file. The code uses a thread pool to process multiple CVEs concurrently for efficiency.

---

## **1. Import Required Libraries**


In [None]:
%%time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

In [None]:
%%time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Function to get BeautifulSoup object for a given CVE ID
def get_soup(cve_id, retries=3, delay=5):
    """Download and parse the cvedetails.com page for one CVE.

    Args:
        cve_id: CVE identifier; it is interpolated into the page URL.
        retries: Maximum number of GET attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        A ``BeautifulSoup`` document built from the first HTTP 200 response,
        or ``None`` when every attempt failed (non-200 status or a
        ``requests`` exception).
    """
    url = f"https://www.cvedetails.com/cve/{cve_id}/"
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')
            print(f"Failed to fetch {cve_id}: Status {response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} for {cve_id} failed: {e}")
        # Back off only when another attempt follows; sleeping after the last
        # failure would just keep the calling worker thread idle.
        if attempt < retries - 1:
            time.sleep(delay)
    return None

# Function to scrape CVSS-related data
def scrape_cvss_data(soup):
    """Collect every CVSS score row found in a parsed CVE page.

    A table row is treated as a score row when it contains a
    ``<div class="cvssbox">``. Its ``<td>`` cells are then read by position:
    1 = severity, 2 = vector, 3 = exploitability score, 4 = impact score,
    5 = score source.

    Args:
        soup: Parsed page exposing ``find_all('tr')``.

    Returns:
        A dict of parallel lists (one entry per score row) keyed by
        'CVSS Score', 'Severity', 'Vector', 'Exploitability Score',
        'Impact Score' and 'Score Source'. All values are stripped strings.
    """
    cvss_dict = {
        'CVSS Score': [],
        'Severity': [],
        'Vector': [],
        'Exploitability Score': [],
        'Impact Score': [],
        'Score Source': []
    }

    for row in soup.find_all('tr'):
        score_div = row.find('div', class_='cvssbox')
        if not score_div:
            continue

        # Look the cells up once instead of re-querying the row per column.
        cells = row.find_all('td')
        if len(cells) < 6:
            # A score box in a row without the expected six columns cannot be
            # read positionally; skip it rather than raise IndexError and lose
            # every other value scraped for this CVE.
            continue

        cvss_dict['CVSS Score'].append(score_div.text.strip())
        cvss_dict['Severity'].append(cells[1].text.strip())
        cvss_dict['Vector'].append(cells[2].text.strip())
        cvss_dict['Exploitability Score'].append(cells[3].text.strip())
        cvss_dict['Impact Score'].append(cells[4].text.strip())
        cvss_dict['Score Source'].append(cells[5].text.strip())

    return cvss_dict

# Function to scrape EPSS score from the page
def scrape_epss_score(soup):
    """Return the stripped text of the page's EPSS ``<span>``, or ``None``.

    The span is located by CSS class; any of 'epss-score', 'epssbox' or
    'score_4' is accepted.
    """
    accepted_classes = ['epss-score', 'epssbox', 'score_4']
    span = soup.find('span', class_=accepted_classes)
    return span.text.strip() if span else None

def update_vulnerability_category(soup):
    """Return the page's CWE entries as one comma-separated string.

    The entries are the link texts of the ``list-group`` items that follow
    the ``<h2 id="cvedH2CWEs">`` heading. Items without a link are ignored.

    Returns:
        The joined link texts (an empty string when no item has a link), or
        ``None`` when the heading or its list is not found.
    """
    heading = soup.find('h2', id='cvedH2CWEs')
    if not heading:
        return None

    listing = heading.find_next('ul', class_='list-group')
    if not listing:
        return None

    names = []
    for entry in listing.find_all('li', class_='list-group-item'):
        anchor = entry.find('a')
        if anchor:
            names.append(anchor.text.strip())
    return ', '.join(names)

def update_refrence_links(soup):
    """Return the page's reference URLs as one comma-separated string.

    The URLs are the ``href`` values of the first link in each
    ``list-group`` item that follows the ``<h2 id="cvedH2References">``
    heading. Items without a link, and links without an ``href`` attribute,
    are skipped.

    Returns:
        The joined URLs (an empty string when none are found), or ``None``
        when the heading or its list is not present.
    """
    heading = soup.find('h2', id='cvedH2References')
    if not heading:
        return None

    link_list = heading.find_next('ul', class_='list-group')
    if not link_list:
        return None

    hrefs = []
    for entry in link_list.find_all('li', class_='list-group-item'):
        anchor = entry.find('a')
        if not anchor:
            continue
        # .get() instead of ['href']: an anchor without an href must not
        # raise KeyError and throw away every other reference of this CVE.
        href = anchor.get('href')
        if href is not None:
            hrefs.append(href)

    # Join the links with a comma and space so the result is a single string
    return ', '.join(hrefs)

# Function to process each CVE and fetch its details
def process_cve(cve_id):
    """Fetch one CVE page and gather everything scraped from it.

    Returns:
        ``None`` when the page could not be fetched; otherwise a dict with
        the keys 'CVE_ID', 'EPSS_Score', 'vulnerability_category',
        'CVSS_Scores', 'Severities', 'Vectors', 'Exploitability_Scores',
        'Impact_Scores', 'Score_Sources' and 'references'. The CVSS-derived
        entries are the lists produced by ``scrape_cvss_data``.
    """
    print(f"Processing CVE ID: {cve_id}")

    soup = get_soup(cve_id)
    if not soup:
        return None

    cvss = scrape_cvss_data(soup)
    epss = scrape_epss_score(soup)
    category = update_vulnerability_category(soup)
    references = update_refrence_links(soup)

    # (key in the returned record, key in the scraped CVSS dict)
    cvss_fields = (
        ('CVSS_Scores', 'CVSS Score'),
        ('Severities', 'Severity'),
        ('Vectors', 'Vector'),
        ('Exploitability_Scores', 'Exploitability Score'),
        ('Impact_Scores', 'Impact Score'),
        ('Score_Sources', 'Score Source'),
    )

    record = {
        'CVE_ID': cve_id,
        'EPSS_Score': epss,
        'vulnerability_category': category,
    }
    for record_key, cvss_key in cvss_fields:
        record[record_key] = cvss[cvss_key]
    record['references'] = references
    return record

# Function to update the CVE data in the provided CSV file
def update_cve_data(file_path, max_workers=3, update_interval=10):
    """Scrape the incomplete rows of a CSV and save the results back to it.

    Args:
        file_path: CSV to read; it is overwritten in place after each batch.
        max_workers: Size of the thread pool running ``process_cve``.
        update_interval: Number of successful results collected before the
            DataFrame is updated and written back to ``file_path``.
    """
    data = pd.read_csv(file_path)

    # A row needs scraping when its CVSS score, EPSS score or references
    # value is missing.
    tracked = data[['CVSS_Score', 'EPSS_Score','references']]
    rows_to_update = data[tracked.isnull().any(axis=1)]

    pending = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which DataFrame row each future belongs to, for error
        # reporting.
        future_to_index = {}
        for row_index, row in rows_to_update.iterrows():
            future_to_index[executor.submit(process_cve, row['CVE_ID'])] = row_index

        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                result = future.result()
                if result:
                    pending.append(result)
                    print(f"Fetched data for CVE_ID {result['CVE_ID']}: CVSS Scores {result['CVSS_Scores']}, EPSS {result['EPSS_Score']}, Problem Type {result['vulnerability_category']},References {result['references']}")

                # Keep collecting until a full batch is available.
                if len(pending) < update_interval:
                    continue

                update_batch(data, pending)
                pending.clear()  # Start the next batch from scratch
                data.to_csv(file_path, index=False)
                print(f"Batch update completed for {update_interval} CVE IDs.")
            except Exception as e:
                print(f"Error processing CVE ID at index {index}: {e}")

    # Flush whatever is left over after the last full batch.
    if pending:
        update_batch(data, pending)
        data.to_csv(file_path, index=False)
        print("Final batch update completed for remaining CVE IDs.")

# Function to update DataFrame with scraped results
def update_batch(data, results):
    """Write scraped values into ``data`` in place.

    Args:
        data: DataFrame with a 'CVE_ID' column; modified in place.
        results: Iterable of dicts with the keys 'CVE_ID', 'EPSS_Score',
            'vulnerability_category', 'references' and the list-valued
            'CVSS_Scores', 'Severities', 'Vectors', 'Exploitability_Scores',
            'Impact_Scores', 'Score_Sources'. Each list is stored as one
            ', '-joined string.

    Every row whose 'CVE_ID' equals the result's ID is updated, so duplicate
    rows do not stay empty. A result whose ID is absent from ``data`` is
    skipped.
    """
    for result in results:
        matches = data['CVE_ID'] == result['CVE_ID']
        if not matches.any():
            continue

        values = {
            'CVSS_Score': ', '.join(result['CVSS_Scores']),
            'EPSS_Score': result['EPSS_Score'],
            'Severity': ', '.join(result['Severities']),
            'Vector': ', '.join(result['Vectors']),
            'Exploitability Score': ', '.join(result['Exploitability_Scores']),
            'Impact Score': ', '.join(result['Impact_Scores']),
            'Score Source': ', '.join(result['Score_Sources']),
            'Problem Type': result['vulnerability_category'],
            'references': result['references'],
        }
        # Boolean-mask assignment reaches every matching row, not only the
        # first one.
        for column, value in values.items():
            data.loc[matches, column] = value
# Replace with your CSV file path
# NOTE: running this cell starts the scrape immediately; update_cve_data
# writes its results back to this same path, overwriting the file.
file_path = "/home/ankit/Desktop/projects/CFTs/data/filtered_cve_data2.csv"
update_cve_data(file_path, max_workers=3, update_interval=10)


Processing CVE ID: CVE-2009-2429Processing CVE ID: CVE-2009-1958

Processing CVE ID: CVE-2009-1561
Processing CVE ID: CVE-2009-1934
Fetched data for CVE_ID CVE-2009-2429: CVSS Scores ['4.6'], EPSS 0.04%, Problem Type CWE-255,References https://exchange.xforce.ibmcloud.com/vulnerabilities/49338, http://archives.neohapsis.com/archives/fulldisclosure/2009-03/0314.html
Processing CVE ID: CVE-2009-1152
Fetched data for CVE_ID CVE-2009-1561: CVSS Scores ['6.8'], EPSS 1.11%, Problem Type CWE-352 Cross-Site Request Forgery (CSRF),References http://www.vupen.com/english/advisories/2009/1172, http://packetstormsecurity.org/0904-exploits/linksysadmin-passwd.txt, http://www.falandodeseguranca.com/?p=17, http://www.securityfocus.com/bid/34616, http://archives.neohapsis.com/archives/bugtraq/2009-04/0198.html
Processing CVE ID: CVE-2009-1474
Fetched data for CVE_ID CVE-2009-1958: CVSS Scores ['5.0'], EPSS 6.54%, Problem Type CWE-399,References http://lists.opensuse.org/opensuse-security-announce/2009

In [1]:
import pandas as pd
# Load the CVE dataset and display it as the cell output.
# NOTE(review): this reads 'filtered_cve_data.csv', while the scraping cell
# above updates 'filtered_cve_data2.csv' — confirm which file is intended.
data = pd.read_csv('filtered_cve_data.csv')
data

Unnamed: 0,CVE_ID,Description,Device,Product,Vendor,Version,Firmware,Patch Availability,Reserved Date,Published Date,Update Date,Problem Type,EPSS_Score,CVSS_Score,Severity,Vector,Exploitability Score,Impact Score,Score Source,references
0,CVE-2009-3564,puppetmasterd in puppet 0.24.6 does not reset ...,Switch,,,,,,2009-10-05T00:00:00,2009-10-06T17:22:00,2024-08-07T06:31:10.575Z,CWE-264,0.04%,4.7,MEDIUM,AV:L/AC:M/Au:N/C:C/I:N/A:N,3.4,6.9,NIST,"https://puppet.com/security/cve/cve-2009-3564,..."
1,CVE-2009-3341,Buffer overflow on the Linksys WRT54GL wireles...,Router,,,,,,2009-09-24T00:00:00Z,2009-09-24T16:00:00Z,2024-09-17T03:03:10.465Z,CWE-119 Improper Restriction of Operations wit...,3.66%,10.0,HIGH,AV:N/AC:L/Au:N/C:C/I:C/A:C,10.0,10.0,NIST,"http://www.securitytracker.com/id?1022827, htt..."
2,CVE-2009-3962,The management interface on the 2wire Gateway ...,Router,,,,,,2009-11-17T00:00:00,2009-11-17T18:00:00,2024-08-07T06:45:50.747Z,CWE-20 Improper Input Validation,1.75%,7.8,HIGH,AV:N/AC:L/Au:N/C:N/I:N/A:C,10.0,6.9,NIST,http://www.securityfocus.com/archive/1/507587/...
3,CVE-2009-3828,The web interface for Everfocus EDR1600 DVR al...,NVR,,,,,,2009-10-30T00:00:00,2009-10-30T19:00:00,2024-08-07T06:38:30.497Z,CWE-287 Improper Authentication,1.86%,5.0,MEDIUM,AV:N/AC:L/Au:N/C:P/I:N/A:N,10.0,2.9,NIST,http://www.securityfocus.com/archive/1/507373/...
4,CVE-2009-3322,The Siemens Gigaset SE361 WLAN router allows r...,Router,,,,,,2009-09-23T00:00:00,2009-09-23T10:00:00,2024-08-07T06:22:24.435Z,,13.58%,7.8,HIGH,AV:N/AC:L/Au:N/C:N/I:N/A:C,10.0,6.9,NIST,http://www.securityfocus.com/archive/1/506414/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7398,CVE-2024-23910,Cross-site request forgery (CSRF) vulnerabilit...,Router,WRC-1167GS2-B,"ELECOM CO.,LTD.",v1.67 and earlier,,,2024-02-15T01:25:06.163Z,2024-02-28T23:07:02.324Z,2024-09-09T06:36:00.982Z,,0.04%,8.8,HIGH,CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:U/C:H/I:H/A:H,2.8,5.9,134c704f-9b21-4f2e-91b3-4a467353bcc0,https://www.elecom.co.jp/news/security/2024022...
7399,CVE-2024-23727,The YI Smart Kami Vision com.kamivision.yismar...,Camera,,,,,,2024-01-21T00:00:00,2024-03-28T00:00:00,2024-08-27T19:12:26.981Z,CWE-94 Improper Control of Generation of Code ...,0.04%,8.4,HIGH,CVSS:3.1/AV:L/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H,2.5,5.9,134c704f-9b21-4f2e-91b3-4a467353bcc0,https://github.com/actuator/yi/blob/main/com.k...
7400,CVE-2024-23842,Improper Input Validation in Hitron Systems DV...,NVR,DVR LGUVR-16H,Hitron Systems DVR,1.02,,,2024-01-23T04:53:48.120Z,2024-01-23T04:56:41.242Z,2024-10-22T03:55:45.527Z,"CWE-20 Improper Input Validation, CWE-798 Use ...",0.05%,"7.5, 7.4","HIGH, HIGH","CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H, ...","3.9, 2.8","3.6, 4.0","NIST, KrCERT/CC",http://www.hitron.co.kr/firmware/
7401,CVE-2024-23614,A buffer overflow vulnerability exists in Syma...,Router,Messaging Gateway,Symantec,0,['Linux'],,2024-01-18T21:37:15.392Z,2024-01-25T23:32:21.154Z,2024-09-05T18:28:07.954Z,CWE-119 Improper Restriction of Operations wit...,0.21%,"9.4, 10.0, 9.8","HIGH, CRITICAL, CRITICAL","AV:N/AC:L/Au:N/C:C/I:C/A:N, CVSS:3.1/AV:N/AC:L...","10.0, 3.9, 3.9","9.2, 5.8, 5.9","Exodus Intelligence, Exodus Intelligence, NIST",https://blog.exodusintel.com/2024/01/25/symant...


In [2]:
# Display-only: fillna returns a new DataFrame and the result is not assigned,
# so `data` itself keeps its missing values.
data.fillna('No Data Found')

Unnamed: 0,CVE_ID,Description,Device,Product,Vendor,Version,Firmware,Patch Availability,Reserved Date,Published Date,Update Date,Problem Type,EPSS_Score,CVSS_Score,Severity,Vector,Exploitability Score,Impact Score,Score Source,references
0,CVE-2009-3564,puppetmasterd in puppet 0.24.6 does not reset ...,Switch,No Data Found,No Data Found,No Data Found,No Data Found,No Data Found,2009-10-05T00:00:00,2009-10-06T17:22:00,2024-08-07T06:31:10.575Z,CWE-264,0.04%,4.7,MEDIUM,AV:L/AC:M/Au:N/C:C/I:N/A:N,3.4,6.9,NIST,"https://puppet.com/security/cve/cve-2009-3564,..."
1,CVE-2009-3341,Buffer overflow on the Linksys WRT54GL wireles...,Router,No Data Found,No Data Found,No Data Found,No Data Found,No Data Found,2009-09-24T00:00:00Z,2009-09-24T16:00:00Z,2024-09-17T03:03:10.465Z,CWE-119 Improper Restriction of Operations wit...,3.66%,10.0,HIGH,AV:N/AC:L/Au:N/C:C/I:C/A:C,10.0,10.0,NIST,"http://www.securitytracker.com/id?1022827, htt..."
2,CVE-2009-3962,The management interface on the 2wire Gateway ...,Router,No Data Found,No Data Found,No Data Found,No Data Found,No Data Found,2009-11-17T00:00:00,2009-11-17T18:00:00,2024-08-07T06:45:50.747Z,CWE-20 Improper Input Validation,1.75%,7.8,HIGH,AV:N/AC:L/Au:N/C:N/I:N/A:C,10.0,6.9,NIST,http://www.securityfocus.com/archive/1/507587/...
3,CVE-2009-3828,The web interface for Everfocus EDR1600 DVR al...,NVR,No Data Found,No Data Found,No Data Found,No Data Found,No Data Found,2009-10-30T00:00:00,2009-10-30T19:00:00,2024-08-07T06:38:30.497Z,CWE-287 Improper Authentication,1.86%,5.0,MEDIUM,AV:N/AC:L/Au:N/C:P/I:N/A:N,10.0,2.9,NIST,http://www.securityfocus.com/archive/1/507373/...
4,CVE-2009-3322,The Siemens Gigaset SE361 WLAN router allows r...,Router,No Data Found,No Data Found,No Data Found,No Data Found,No Data Found,2009-09-23T00:00:00,2009-09-23T10:00:00,2024-08-07T06:22:24.435Z,No Data Found,13.58%,7.8,HIGH,AV:N/AC:L/Au:N/C:N/I:N/A:C,10.0,6.9,NIST,http://www.securityfocus.com/archive/1/506414/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7398,CVE-2024-23910,Cross-site request forgery (CSRF) vulnerabilit...,Router,WRC-1167GS2-B,"ELECOM CO.,LTD.",v1.67 and earlier,No Data Found,No Data Found,2024-02-15T01:25:06.163Z,2024-02-28T23:07:02.324Z,2024-09-09T06:36:00.982Z,No Data Found,0.04%,8.8,HIGH,CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:U/C:H/I:H/A:H,2.8,5.9,134c704f-9b21-4f2e-91b3-4a467353bcc0,https://www.elecom.co.jp/news/security/2024022...
7399,CVE-2024-23727,The YI Smart Kami Vision com.kamivision.yismar...,Camera,No Data Found,No Data Found,No Data Found,No Data Found,No Data Found,2024-01-21T00:00:00,2024-03-28T00:00:00,2024-08-27T19:12:26.981Z,CWE-94 Improper Control of Generation of Code ...,0.04%,8.4,HIGH,CVSS:3.1/AV:L/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H,2.5,5.9,134c704f-9b21-4f2e-91b3-4a467353bcc0,https://github.com/actuator/yi/blob/main/com.k...
7400,CVE-2024-23842,Improper Input Validation in Hitron Systems DV...,NVR,DVR LGUVR-16H,Hitron Systems DVR,1.02,No Data Found,No Data Found,2024-01-23T04:53:48.120Z,2024-01-23T04:56:41.242Z,2024-10-22T03:55:45.527Z,"CWE-20 Improper Input Validation, CWE-798 Use ...",0.05%,"7.5, 7.4","HIGH, HIGH","CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H, ...","3.9, 2.8","3.6, 4.0","NIST, KrCERT/CC",http://www.hitron.co.kr/firmware/
7401,CVE-2024-23614,A buffer overflow vulnerability exists in Syma...,Router,Messaging Gateway,Symantec,0,['Linux'],No Data Found,2024-01-18T21:37:15.392Z,2024-01-25T23:32:21.154Z,2024-09-05T18:28:07.954Z,CWE-119 Improper Restriction of Operations wit...,0.21%,"9.4, 10.0, 9.8","HIGH, CRITICAL, CRITICAL","AV:N/AC:L/Au:N/C:C/I:C/A:N, CVSS:3.1/AV:N/AC:L...","10.0, 3.9, 3.9","9.2, 5.8, 5.9","Exodus Intelligence, Exodus Intelligence, NIST",https://blog.exodusintel.com/2024/01/25/symant...


In [3]:
# Print each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7403 entries, 0 to 7402
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CVE_ID                7403 non-null   object
 1   Description           7403 non-null   object
 2   Device                7403 non-null   object
 3   Product               3838 non-null   object
 4   Vendor                3311 non-null   object
 5   Version               3342 non-null   object
 6   Firmware              141 non-null    object
 7   Patch Availability    8 non-null      object
 8   Reserved Date         7384 non-null   object
 9   Published Date        7384 non-null   object
 10  Update Date           7403 non-null   object
 11  Problem Type          6121 non-null   object
 12  EPSS_Score            7381 non-null   object
 13  CVSS_Score            7293 non-null   object
 14  Severity              7295 non-null   object
 15  Vector                7295 non-null   