In [None]:
import os
import pandas as pd
import warnings
from tqdm import tqdm
from transformers import pipeline

# Suppress UserWarning messages raised from the tqdm module so they do not
# clutter the cell output.
warnings.filterwarnings('ignore', category=UserWarning, module='tqdm')

# Path of the local CSV cache. The download below runs only when this file is missing.
data = "updatedCveDB.csv"

if not os.path.exists(data):
    print("[INFO] Downloading data from MITRE...")
    url = "https://cve.mitre.org/data/downloads/allitems.csv"
    # skiprows=2 drops the first two lines of the file before the header row;
    # dtype=str keeps every column as text.
    df = pd.read_csv(url, skiprows=2, encoding='latin-1', dtype=str)
    total_rows = len(df)

    # Write to a temporary file and rename it at the end. The existence check
    # above treats any file at `data` as a complete cache, so an interrupted
    # write must never leave a truncated file under that name.
    partial_path = data + ".part"
    chunk_rows = 50_000
    try:
        with tqdm(total=total_rows, desc="Processing data") as pbar:
            # Header first (also covers the zero-row case), then the rows in
            # chunks so the progress bar reflects work actually completed.
            df.iloc[:0].to_csv(partial_path, index=False)
            for start in range(0, total_rows, chunk_rows):
                chunk = df.iloc[start:start + chunk_rows]
                chunk.to_csv(partial_path, mode="a", header=False, index=False)
                pbar.update(len(chunk))
        os.replace(partial_path, data)
    finally:
        # Only present if the write failed before the rename.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [None]:
# Load the cached CSV, keeping only the two columns used downstream
df = pd.read_csv(data, usecols=["Name", "Description"])

# Remove rows with a missing description, then rows whose description starts
# with '**' (disclaimer entries)
df = df.dropna(subset=['Description'])
df = df[~df['Description'].str.startswith('**')]

# Save the processed data back over the cache file, so later runs load the
# already-filtered rows
df.to_csv(data, index=False)
print(f"[INFO] Data saved to {data}")

# Make another csv with a random sample of rows for testing.
# The sample size is capped at the number of available rows (DataFrame.sample
# raises ValueError when asked for more rows than exist without replacement),
# and the seed is fixed so re-running the notebook yields the same sample.
SAMPLE_SIZE = 200
SAMPLE_SEED = 42
df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED).to_csv('sample.csv', index=False)

df1 = pd.read_csv('sample.csv')

In [None]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_sm")

# Read in the sampled CSV file
data = pd.read_csv("sample.csv")

# Guard against empty cells, which pandas reads back as NaN and spaCy cannot tokenize.
descriptions = data["Description"].dropna().astype(str)

# Only the token texts are needed, so run just the tokenizer (batched via
# Tokenizer.pipe) rather than the whole pipeline one document at a time, and
# keep the result instead of discarding it after printing.
tokenized_descriptions = [
    [token.text for token in doc]
    for doc in nlp.tokenizer.pipe(descriptions)
]

# Print a summary and a short preview instead of one line per token for the
# whole sample, which floods the notebook output.
PREVIEW_COUNT = 3
total_tokens = sum(len(tokens) for tokens in tokenized_descriptions)
print(f"[INFO] Tokenized {len(tokenized_descriptions)} descriptions into {total_tokens} tokens")
for tokens in tokenized_descriptions[:PREVIEW_COUNT]:
    print(tokens)

In [None]:
import torch
# torch.tensor() accepts numeric data (scalars, lists, NumPy arrays), not
# strings, so the token texts must be encoded as numbers first. The previous
# code also passed only `token`, the loop variable left over from the
# tokenization cell, i.e. the last token of the last description.
#
# Here every token of every description is mapped to an integer id taken from
# a sorted vocabulary, so the ids are deterministic for a given sample.
# Relies on `nlp` and `data` defined in the tokenization cell.
token_texts = [
    token.text
    for doc in nlp.tokenizer.pipe(data["Description"].dropna().astype(str))
    for token in doc
]
token_to_id = {text: idx for idx, text in enumerate(sorted(set(token_texts)))}

# Ids are categorical indices, so they are stored as integers (torch.long)
# rather than float32; call dataTensor.float() if a float tensor is required.
dataTensor = torch.tensor([token_to_id[text] for text in token_texts], dtype=torch.long)

print(dataTensor.shape, dataTensor.dtype)
print(dataTensor[:10])
