In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('./data/filtered_cve_data.csv')  
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7403 entries, 0 to 7402
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   CVE_ID                7403 non-null   object
 1   Description           7403 non-null   object
 2   Device                7403 non-null   object
 3   Product               3838 non-null   object
 4   Vendor                3311 non-null   object
 5   Version               3342 non-null   object
 6   Firmware              141 non-null    object
 7   Patch Availability    8 non-null      object
 8   Reserved Date         7384 non-null   object
 9   Published Date        7384 non-null   object
 10  Update Date           7403 non-null   object
 11  Problem Type          6123 non-null   object
 12  EPSS_Score            7381 non-null   object
 13  CVSS_Score            7295 non-null   object
 14  Severity              7297 non-null   object
 15  Vector                7297 non-null   

In [3]:
import pandas as pd

# Read the CSV file into a DataFrame
# data = pd.read_csv("your_csv_file.csv")

# Fill missing values with 'No Info Available'
data.fillna("No Info Available", inplace=True)

# Format date fields into consistent 'YYYY-MM-DD' format
date_fields = ['Reserved Date', 'Published Date', 'Update Date']
for field in date_fields:
    data[field] = pd.to_datetime(data[field], errors='coerce').dt.date
    data[field].fillna("No Info Available", inplace=True)

# Normalize score-related fields
def normalize_scores(row):
    """Parse and align scores with their respective vectors and sources."""
    scores = str(row['CVSS_Score']).split(',')
    exploitability_scores = str(row['Exploitability Score']).split(',')
    impact_scores = str(row['Impact Score']).split(',')
    vectors = str(row['Vector']).split(',')
    score_sources = str(row['Score Source']).split(',')
    severity = str(row['Severity']).split(',')

    # Normalize lengths by padding missing values
    max_length = max(len(scores), len(exploitability_scores), len(impact_scores), len(vectors), len(score_sources))
    scores = (scores + ["No Info Available"] * (max_length - len(scores)))[:max_length]
    severity = (severity + ["No Info Available"] * (max_length - len(severity)))[:max_length]
    exploitability_scores = (exploitability_scores + ["No Info Available"] * (max_length - len(exploitability_scores)))[:max_length]
    impact_scores = (impact_scores + ["No Info Available"] * (max_length - len(impact_scores)))[:max_length]
    vectors = (vectors + ["No Info Available"] * (max_length - len(vectors)))[:max_length]
    score_sources = (score_sources + ["No Info Available"] * (max_length - len(score_sources)))[:max_length]

    return {
        "Scores": scores,
        "Severity": severity,
        "Exploitability Scores": exploitability_scores,
        "Impact Scores": impact_scores,
        "Vectors": vectors,
        "Score Sources": score_sources,
    }

# Apply normalization to score-related fields
normalized_scores = data.apply(normalize_scores, axis=1)

# Convert normalized scores back into separate columns
data['CVSS Scores'] = normalized_scores.apply(lambda x: x['Scores'])
data['Severity '] = normalized_scores.apply(lambda x: x['Severity'])
data['Exploitability Scores'] = normalized_scores.apply(lambda x: x['Exploitability Scores'])
data['Impact Scores'] = normalized_scores.apply(lambda x: x['Impact Scores'])
data['Vectors'] = normalized_scores.apply(lambda x: x['Vectors'])
data['Score Sources'] = normalized_scores.apply(lambda x: x['Score Sources'])

# Drop old redundant columns if necessary
data.drop(columns=['CVSS_Score', 'Exploitability Score', 'Impact Score', 'Vector', 'Score Source','Severity'], inplace=True)

# Save the normalized data back to a CSV
data.to_csv("normalized_cve_data.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[field].fillna("No Info Available", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[field].fillna("No Info Available", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

In [5]:
import pandas as pd
data_after_scrapping = pd.read_csv('./data/normalized_cve_data.csv')
data_after_scrapping 

Unnamed: 0,CVE_ID,Description,Device,Product,Vendor,Version,Firmware,Patch Availability,Reserved Date,Published Date,Update Date,Problem Type,EPSS_Score,references,CVSS Scores,Severity,Exploitability Scores,Impact Scores,Vectors,Score Sources
0,CVE-2009-3564,puppetmasterd in puppet 0.24.6 does not reset ...,Switch,No Info Available,No Info Available,No Info Available,No Info Available,No Info Available,2009-10-05,2009-10-06,2024-08-07,CWE-264,0.04%,"https://puppet.com/security/cve/cve-2009-3564,...",['4.7'],['MEDIUM'],['3.4'],['6.9'],['AV:L/AC:M/Au:N/C:C/I:N/A:N'],['NIST']
1,CVE-2009-3341,Buffer overflow on the Linksys WRT54GL wireles...,Router,No Info Available,No Info Available,No Info Available,No Info Available,No Info Available,No Info Available,No Info Available,2024-09-17,CWE-119 Improper Restriction of Operations wit...,3.66%,"http://www.securitytracker.com/id?1022827, htt...",['10.0'],['HIGH'],['10.0'],['10.0'],['AV:N/AC:L/Au:N/C:C/I:C/A:C'],['NIST']
2,CVE-2009-3962,The management interface on the 2wire Gateway ...,Router,No Info Available,No Info Available,No Info Available,No Info Available,No Info Available,2009-11-17,2009-11-17,2024-08-07,CWE-20 Improper Input Validation,1.75%,http://www.securityfocus.com/archive/1/507587/...,['7.8'],['HIGH'],['10.0'],['6.9'],['AV:N/AC:L/Au:N/C:N/I:N/A:C'],['NIST']
3,CVE-2009-3828,The web interface for Everfocus EDR1600 DVR al...,NVR,No Info Available,No Info Available,No Info Available,No Info Available,No Info Available,2009-10-30,2009-10-30,2024-08-07,CWE-287 Improper Authentication,1.86%,http://www.securityfocus.com/archive/1/507373/...,['5.0'],['MEDIUM'],['10.0'],['2.9'],['AV:N/AC:L/Au:N/C:P/I:N/A:N'],['NIST']
4,CVE-2009-3322,The Siemens Gigaset SE361 WLAN router allows r...,Router,No Info Available,No Info Available,No Info Available,No Info Available,No Info Available,2009-09-23,2009-09-23,2024-08-07,No Info Available,13.58%,http://www.securityfocus.com/archive/1/506414/...,['7.8'],['HIGH'],['10.0'],['6.9'],['AV:N/AC:L/Au:N/C:N/I:N/A:C'],['NIST']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7398,CVE-2024-23910,Cross-site request forgery (CSRF) vulnerabilit...,Router,WRC-1167GS2-B,"ELECOM CO.,LTD.",v1.67 and earlier,No Info Available,No Info Available,No Info Available,No Info Available,2024-09-09,No Info Available,0.04%,https://www.elecom.co.jp/news/security/2024022...,['8.8'],['HIGH'],['2.8'],['5.9'],['CVSS:3.1/AV:N/AC:L/PR:N/UI:R/S:U/C:H/I:H/A:H'],['134c704f-9b21-4f2e-91b3-4a467353bcc0']
7399,CVE-2024-23727,The YI Smart Kami Vision com.kamivision.yismar...,Camera,No Info Available,No Info Available,No Info Available,No Info Available,No Info Available,2024-01-21,2024-03-28,2024-08-27,CWE-94 Improper Control of Generation of Code ...,0.04%,https://github.com/actuator/yi/blob/main/com.k...,['8.4'],['HIGH'],['2.5'],['5.9'],['CVSS:3.1/AV:L/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H'],['134c704f-9b21-4f2e-91b3-4a467353bcc0']
7400,CVE-2024-23842,Improper Input Validation in Hitron Systems DV...,NVR,DVR LGUVR-16H,Hitron Systems DVR,1.02,No Info Available,No Info Available,No Info Available,No Info Available,2024-10-22,"CWE-20 Improper Input Validation, CWE-798 Use ...",0.05%,http://www.hitron.co.kr/firmware/,"['7.5', ' 7.4']","['HIGH', ' HIGH']","['3.9', ' 2.8']","['3.6', ' 4.0']",['CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H...,"['NIST', ' KrCERT/CC']"
7401,CVE-2024-23614,A buffer overflow vulnerability exists in Syma...,Router,Messaging Gateway,Symantec,0,['Linux'],No Info Available,No Info Available,No Info Available,2024-09-05,CWE-119 Improper Restriction of Operations wit...,0.21%,https://blog.exodusintel.com/2024/01/25/symant...,"['9.4', ' 10.0', ' 9.8']","['HIGH', ' CRITICAL', ' CRITICAL']","['10.0', ' 3.9', ' 3.9']","['9.2', ' 5.8', ' 5.9']","['AV:N/AC:L/Au:N/C:C/I:C/A:N', ' CVSS:3.1/AV:N...","['Exodus Intelligence', ' Exodus Intelligence'..."
