In [None]:
import os
import pandas as pd
from tqdm import tqdm
import spacy

# Shared spaCy pipeline; a later cell uses it to tokenize descriptions.
nlp = spacy.load("en_core_web_sm")

# Local CSV cache of the MITRE CVE list; later cells read and rewrite this file.
data = "updatedCveDB.csv"

# Rows written per to_csv call, so the progress bar advances while the file is
# actually being written instead of jumping from 0 to 100% at the end.
WRITE_CHUNK_ROWS = 50_000

if not os.path.exists(data):
    print("[INFO] Downloading data from MITRE...")
    url = "https://cve.mitre.org/data/downloads/allitems.csv"
    # skiprows=2 drops the first two lines of the download; dtype=str keeps
    # every column as text so no value is reinterpreted as a number.
    df = pd.read_csv(url, skiprows=2, encoding='latin-1', dtype=str)
    total_rows = len(df)

    # Write to a temporary file and rename it into place only once complete.
    # Otherwise an interrupted run leaves a truncated file that the exists()
    # check above would treat as a finished cache on every later run.
    tmp_path = data + ".part"
    try:
        with tqdm(total=total_rows, desc="Processing data") as pbar:
            # Header first (this also covers an empty frame), then row chunks.
            df.iloc[:0].to_csv(tmp_path, index=False)
            for start in range(0, total_rows, WRITE_CHUNK_ROWS):
                chunk = df.iloc[start:start + WRITE_CHUNK_ROWS]
                chunk.to_csv(tmp_path, mode="a", header=False, index=False)
                pbar.update(len(chunk))
        os.replace(tmp_path, data)
    finally:
        # Only present if the write failed part-way; never leave it behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [None]:
# Load the cached data, keeping only the first three columns.
# dtype=str matches how the file was originally parsed and guarantees the
# .str accessor below operates on text.
# NOTE(review): assumes 'Description' is one of those three columns — confirm.
df = pd.read_csv(data, usecols=[0, 1, 2], dtype=str)

# Remove rows with a missing description, then rows whose description starts
# with '**' (disclaimer entries).
df = df.dropna(subset=['Description'])
df = df[~df['Description'].str.startswith('**')]

# Save the processed data back over the cache file (same path as `data`).
df.to_csv(data, index=False)
print(f"[INFO] Data saved to {data}")

# Make another csv with up to 100 random rows for testing.
# Capping at len(df) avoids the ValueError DataFrame.sample raises when asked
# for more rows than exist; the fixed seed makes sample.csv reproducible
# across Restart & Run All.
sample_size = min(100, len(df))
df.sample(n=sample_size, random_state=42).to_csv('sample.csv', index=False)

In [None]:
# Tokenize the sample descriptions
print("[INFO] Tokenizing sample descriptions...")
sample = pd.read_csv('sample.csv')

# Only token texts are needed, so run just the tokenizer via nlp.make_doc
# instead of the full pipeline that nlp(text) would execute for every row.
# NOTE(review): yields the same tokens as nlp(text) as long as no pipeline
# component merges or splits tokens — confirm if custom components are added.
sample['tokens'] = sample['Description'].apply(
    lambda text: [token.text for token in nlp.make_doc(text)]
)

# Overwrites sample.csv with the added 'tokens' column.
sample.to_csv('sample.csv', index=False)
print("[INFO] Sample data saved to sample.csv")

print(sample.head())