In [1]:
import os
import pandas as pd
import warnings
from tqdm import tqdm

# Filter out warning
warnings.filterwarnings('ignore', category=UserWarning, module='tqdm')

data = "updatedCveDB.csv"

if not os.path.exists(data):
    print(f"[INFO] Downloading data from MITRE...")
    url = "https://cve.mitre.org/data/downloads/allitems.csv"
    df = pd.read_csv(url, skiprows=2, encoding='latin-1', dtype=str)
    total_rows = len(df)
    with tqdm(total=total_rows, desc="Processing data") as pbar:
        df.to_csv(data, index=False)
        pbar.update(total_rows)

In [2]:
# Load processed data
df = pd.read_csv(data, usecols=["Name", "Description"])

# Remove rows where description starts with disclaimers, remove rows with missing description
df = df.dropna(subset=['Description'])
df = df[~df['Description'].str.startswith('**')]

# Save the processed data
df.to_csv('updatedCveDB.csv', index=False)
print(f"[INFO] Data saved to updatedCveDB.csv")

# Make another csv with 10,000 random rows for testing
df.sample(10000).to_csv('sample.csv', index=False)

df1 = pd.read_csv('sample.csv')

[INFO] Data saved to updatedCveDB.csv


In [3]:
import pandas as pd
import spacy

nlp = spacy.load("en_core_web_md")

# Read in your CSV file
data = pd.read_csv("sample.csv")

desc_data = data["Description"]

# Iterate over each description and tokenize it
for description in desc_data:
    # Tokenize the description
    doc = nlp(description)

    # Print out the tokens
    for token in doc:
        print(token.text)


Multiple
SQL
injection
vulnerabilities
in
Ganesha
Digital
Library
(
GDL
)
4.2
allow
remote
attackers
to
execute
arbitrary
SQL
commands
via
the
i
d
parameter
in
(
1
)
download.php
or
(
2
)
main.php
.
The
Debian
xscreensaver
5.42+dfsg1
-
1
package
for
XScreenSaver
has
cap_net_raw
enabled
for
the
/usr
/
libexec
/
xscreensaver
/
sonar
file
,
which
allows
local
users
to
gain
privileges
because
this
is
arguably
incompatible
with
the
design
of
the
Mesa
3D
Graphics
library
dependency
.
Integer
overflow
in
the
substr_compare
function
in
PHP
5.2.1
and
earlier
allows
context
-
dependent
attackers
to
read
sensitive
memory
via
a
large
value
in
the
length
argument
,
a
different
vulnerability
than
CVE-2006
-
1991
.
Denial
-
of
-
service
in
NodeBB
<
=
v2.8.10
allows
unauthenticated
attackers
to
trigger
a
crash
,
when
invoking
`
eventName.startsWith
(
)
`
or
`
eventName.toString
(
)
`
,
while
processing
Socket
.
IO
messages
via
crafted
Socket
.
IO
messages
containing
array
or
object
type
for
the
event


# We need to find out the average token count for each CVE description!

Making a dateframe, including the number of characters and number of sentences:

In [42]:
import pandas as pd
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

# Initialize empty lists
charCount = []
sentenceCount = []

for cve in desc_data:
    charCount.append(len(cve))
    doc = nlp(cve)
    num_sentences = len(list(doc.sents))
    sentenceCount.append(num_sentences)

# Create a dictionary with the collected data
data = {"charCount": charCount, "tokenCount": sentenceCount}

# Create a DataFrame from the token data
token_df = pd.DataFrame(data)

print(data)

token_df.describe().round(2)

{'charCount': [190, 273, 220, 289, 140, 359, 63, 215, 184, 258, 270, 251, 421, 55, 281, 200, 308, 769, 135, 277, 978, 212, 173, 90, 152, 493, 238, 105, 406, 306, 225, 255, 266, 450, 431, 491, 146, 183, 182, 220, 351, 241, 236, 222, 270, 384, 135, 263, 396, 301, 291, 152, 192, 110, 260, 72, 563, 149, 680, 149, 231, 316, 168, 239, 461, 238, 159, 65, 380, 166, 137, 425, 446, 391, 195, 271, 161, 294, 207, 208, 289, 142, 262, 280, 259, 197, 130, 421, 572, 163, 187, 177, 893, 84, 540, 168, 125, 205, 510, 152, 334, 613, 160, 171, 109, 235, 277, 239, 280, 507, 172, 181, 1259, 428, 259, 287, 482, 122, 263, 140, 277, 412, 140, 262, 317, 293, 249, 707, 227, 464, 277, 1183, 227, 130, 426, 191, 265, 508, 81, 91, 466, 99, 195, 178, 424, 156, 352, 132, 177, 180, 91, 160, 265, 90, 150, 282, 326, 317, 149, 138, 206, 517, 293, 207, 949, 236, 265, 78, 373, 214, 70, 355, 169, 407, 278, 315, 405, 298, 570, 217, 211, 102, 368, 265, 348, 322, 791, 228, 409, 49, 215, 360, 889, 312, 361, 238, 305, 160, 223, 30

Unnamed: 0,charCount,tokenCount
count,10000.0,10000.0
mean,285.71,1.94
std,203.38,1.68
min,30.0,1.0
25%,173.0,1.0
50%,238.0,1.0
75%,330.0,2.0
max,3862.0,31.0


In [39]:
random.sample(list(desc_data), k=3)


['The Qualcomm buspm driver in Android before 2016-05-01 on Nexus 5X, 6, and 6P devices allows attackers to gain privileges via a crafted application, aka internal bug 26494907.',
 'Absolute path traversal vulnerability in controlcenter.php in FlatnuX CMS 2011 08.09.2 allows remote administrators to read arbitrary files via a full pathname in the dir parameter in a contents/Files action.',
 'Format string vulnerability in the log function in Georgia SoftWorks SSH2 Server (GSW_SSHD) 7.01.0003 and earlier allows remote attackers to execute arbitrary code via format string specifiers in the username field, as demonstrated by a certain LoginPassword message.']

In [None]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)