This software is free software released under the "GNU General Public License v3.0".

Copyright (c) 2022 Yuning-Jiang - yuning.jiang17@gmail.com

In [15]:
import pandas as pd, numpy as np
import config
import re 
from os import listdir

In [16]:
# Build one combined historical dataset from the per-year NVD report files.
def generate_CWECAPEC_CombinedFile():
    """Load the yearly ``NVD_<year>_CWE_CAPEClabel.csv`` files into one frame.

    The number of years to read is the number of entries found in
    ``config.report_CWECAPEC_path`` minus one, counted upwards from 2002.
    NOTE(review): the reason one entry is excluded is not visible here -
    presumably a non-report entry lives in that directory; confirm.

    Returns:
        pandas.DataFrame: the yearly frames concatenated in year order with a
        fresh consecutive index.
    """
    first_year = 2002
    year_count = len(listdir(config.report_CWECAPEC_path)) - 1
    yearly_frames = [
        # The path is joined by plain concatenation, so the configured
        # directory must already end with a path separator.
        pd.read_csv(config.report_CWECAPEC_path + 'NVD_' + str(year) + '_CWE_CAPEClabel.csv')
        for year in range(first_year, first_year + year_count)
    ]
    return pd.concat(yearly_frames, ignore_index=True)

df = generate_CWECAPEC_CombinedFile()
# Summarise the combined frame: column dtypes first, then its (rows, columns) shape.
for heading, detail in (
    ("Schema:\n\n", df.dtypes),
    ("Number of vulnerability reports,columns=", df.shape),
):
    print(heading, detail)

Schema:

 CVEID                object
CWEID                object
VulnerabilityType    object
AttackPattern        object
Report               object
dtype: object
Number of vulnerability reports,columns= (160093, 5)


In [17]:
# Discard every report whose text contains 'REJECT' (vulnerabilities rejected by NVD analysts).
df = df.loc[lambda frame: ~frame['Report'].str.contains('REJECT')]
# Print the schema and the (rows, columns) shape again after filtering.
for heading, detail in (
    ("Schema:\n\n", df.dtypes),
    ("Number of vulnerability reports,columns=", df.shape),
):
    print(heading, detail)

Schema:

 CVEID                object
CWEID                object
VulnerabilityType    object
AttackPattern        object
Report               object
dtype: object
Number of vulnerability reports,columns= (150528, 5)


In [18]:
# Frequency of each distinct CWEID value; a whole label list such as
# "['CWE-611', 'CWE-311']" is counted as one value, not per individual CWE.
df['CWEID'].value_counts()

['NVD-CWE-Other']         26562
['NVD-CWE-noinfo']        16465
['CWE-79']                15308
[]                        13263
['CWE-119']               11050
                          ...  
['CWE-611', 'CWE-311']        1
['CWE-706', 'CWE-863']        1
['CWE-119', 'CWE-917']        1
['CWE-190', 'CWE-704']        1
['CWE-77', 'CWE-116']         1
Name: CWEID, Length: 835, dtype: int64

In [21]:
# Number of CWE labels per report, derived as (commas in the CWEID text) + 1.
# An entry without any comma - including an empty "[]" - therefore counts as 1.
df['cweIDamount'] = df["CWEID"].apply(lambda labels: str(labels).count(",") + 1)

In [23]:
# Index labels of the reports whose single CWE entry is a placeholder
# ('Missing Data', 'NVD-CWE-Other' or 'NVD-CWE-noinfo').
# The count comparison must be parenthesised: `&` binds tighter than `==`, so
# `contains & amount == 1` is evaluated as `(contains & amount) == 1`, which
# also selects placeholder rows with any other odd label count (3, 5, ...).
indexNames1 = df[df['CWEID'].str.contains('Missing Data') & (df['cweIDamount'] == 1)].index
indexNames2 = df[df['CWEID'].str.contains('NVD-CWE-Other') & (df['cweIDamount'] == 1)].index
indexNames3 = df[df['CWEID'].str.contains('NVD-CWE-noinfo') & (df['cweIDamount'] == 1)].index

In [24]:
# Remove the rows selected in indexNames1..3 from df in place, one index set
# at a time, then show how many reports remain.
for rows_to_remove in (indexNames1, indexNames2, indexNames3):
    df.drop(rows_to_remove, inplace=True)
len(df)

107497

In [25]:
# Keep only reports whose CWEID value is longer than 9 characters. This removes
# the empty "[]" entries; "['CWE-79']" (10 characters) is kept.
# NOTE(review): a single one-digit ID such as "['CWE-1']" is exactly 9
# characters long and would be removed as well - confirm that is acceptable.
df = df.loc[lambda frame: frame['CWEID'].map(len) > 9]

In [26]:
# Number of reports for each CWE label count (values of the cweIDamount column).
df['cweIDamount'].value_counts()

1    91931
2     2192
3       96
4       11
5        4
Name: cweIDamount, dtype: int64

In [27]:
# Peek at two randomly drawn reports whose cweIDamount is 2 (unseeded, so the
# rows shown differ from run to run).
tempdf = df[df['cweIDamount'].eq(2)]
tempdf.sample(n=2)

Unnamed: 0,CVEID,CWEID,VulnerabilityType,AttackPattern,Report,cweIDamount
146153,CVE-2020-14402,"['CWE-125', 'CWE-670']","['Out-of-bounds Read', 'Always-Incorrect Contr...",['CAPEC-540'],An issue was discovered in LibVNCServer before...,2
155141,CVE-2020-3898,"['CWE-20', 'CWE-787']","['Improper Input Validation', 'Out-of-bounds W...","['CAPEC-10', 'CAPEC-101', 'CAPEC-104', 'CAPEC-...",A memory corruption issue was addressed with i...,2


In [28]:
# Peek at two randomly drawn reports whose cweIDamount is 3 (unseeded, so the
# rows shown differ from run to run).
tempdf = df[df['cweIDamount'].eq(3)]
tempdf.sample(n=2)

Unnamed: 0,CVEID,CWEID,VulnerabilityType,AttackPattern,Report,cweIDamount
106537,CVE-2017-7778,"['CWE-119', 'CWE-125', 'CWE-787']",['Improper Restriction of Operations within th...,"['CAPEC-10', 'CAPEC-100', 'CAPEC-123', 'CAPEC-...",A number of security vulnerabilities in the Gr...,3
135679,CVE-2019-2304,"['CWE-20', 'CWE-787', 'CWE-190']","['Improper Input Validation', 'Out-of-bounds W...","['CAPEC-10', 'CAPEC-101', 'CAPEC-104', 'CAPEC-...",Integer overflow to buffer overflow due to lac...,3


In [29]:
# Peek at two randomly drawn reports whose cweIDamount is 4 (unseeded, so the
# rows shown differ from run to run).
tempdf = df[df['cweIDamount'].eq(4)]
tempdf.sample(n=2)

Unnamed: 0,CVEID,CWEID,VulnerabilityType,AttackPattern,Report,cweIDamount
134105,CVE-2019-19307,"['CWE-125', 'CWE-787', 'CWE-190', 'CWE-835']","['Out-of-bounds Read', 'Out-of-bounds Write', ...","['CAPEC-540', 'CAPEC-92']",An integer overflow in parse_mqtt in mongoose....,4
144161,CVE-2020-11901,"['CWE-125', 'CWE-787', 'CWE-330', 'CWE-131']","['Out-of-bounds Read', 'Out-of-bounds Write', ...","['CAPEC-540', 'CAPEC-112', 'CAPEC-485', 'CAPEC...",The Treck TCP/IP stack before 6.0.1.66 allows ...,4


In [31]:
# Tally how often each CWE-ID occurs across the NVD vulnerability reports.
import ast
from collections import Counter


def _parse_cwe_labels(raw_labels):
    """Turn one CWEID cell (text such as "['CWE-20', 'CWE-787']") into a list of IDs."""
    # literal_eval accepts only Python literals, so text that originates from
    # the report files cannot execute code the way eval() would.
    parsed = ast.literal_eval(str(raw_labels))
    # A bare quoted label parses to a single string; wrap it so the caller
    # always receives a list.
    return [parsed] if isinstance(parsed, str) else list(parsed)


cweIDcount = []
# Collect the IDs grouped by label count, smallest count first, so the list is
# ordered single-label reports, then two-label reports, and so on. Iterating
# over the counts actually present means reports with any number of labels are
# included rather than only a fixed set of counts.
for label_count in sorted(df['cweIDamount'].unique()):
    for raw_labels in df.loc[df['cweIDamount'] == label_count, 'CWEID'].values:
        cweIDcount.extend(_parse_cwe_labels(raw_labels))

# Total number of collected IDs, followed by the IDs ranked by frequency.
len(cweIDcount), Counter(cweIDcount).most_common()

(96667,
 [('CWE-79', 15410),
  ('CWE-119', 11263),
  ('CWE-20', 8075),
  ('CWE-89', 6477),
  ('CWE-200', 6428),
  ('CWE-787', 4252),
  ('CWE-22', 3741),
  ('CWE-125', 3367),
  ('CWE-352', 3035),
  ('CWE-94', 2573),
  ('CWE-287', 2194),
  ('CWE-416', 2103),
  ('CWE-78', 1675),
  ('CWE-190', 1533),
  ('CWE-476', 1339),
  ('CWE-284', 989),
  ('CWE-362', 907),
  ('CWE-434', 794),
  ('CWE-400', 759),
  ('CWE-269', 741),
  ('CWE-59', 739),
  ('CWE-732', 712),
  ('CWE-120', 692),
  ('CWE-862', 609),
  ('CWE-798', 599),
  ('CWE-611', 591),
  ('CWE-295', 587),
  ('CWE-522', 544),
  ('CWE-502', 537),
  ('CWE-77', 532),
  ('CWE-306', 527),
  ('CWE-863', 484),
  ('CWE-74', 456),
  ('CWE-601', 432),
  ('CWE-276', 400),
  ('CWE-918', 386),
  ('CWE-772', 351),
  ('CWE-426', 351),
  ('CWE-835', 350),
  ('CWE-532', 303),
  ('CWE-319', 302),
  ('CWE-427', 262),
  ('CWE-401', 253),
  ('CWE-415', 252),
  ('CWE-134', 248),
  ('CWE-770', 243),
  ('CWE-327', 217),
  ('CWE-755', 199),
  ('CWE-326', 193),
  ('

In [33]:
# Distinct CWE-IDs used by NVD analysts when assigning weakness categories,
# kept in order of first appearance in cweIDcount (dict keys preserve insertion order).
nvdcweID = [*dict.fromkeys(cweIDcount)]
len(nvdcweID), nvdcweID

(227,
 ['CWE-20',
  'CWE-119',
  'CWE-327',
  'CWE-252',
  'CWE-200',
  'CWE-120',
  'CWE-287',
  'CWE-384',
  'CWE-94',
  'CWE-59',
  'CWE-362',
  'CWE-665',
  'CWE-798',
  'CWE-209',
  'CWE-79',
  'CWE-22',
  'CWE-434',
  'CWE-401',
  'CWE-400',
  'CWE-284',
  'CWE-668',
  'CWE-770',
  'CWE-532',
  'CWE-269',
  'CWE-294',
  'CWE-669',
  'CWE-134',
  'CWE-787',
  'CWE-89',
  'CWE-78',
  'CWE-352',
  'CWE-190',
  'CWE-415',
  'CWE-772',
  'CWE-476',
  'CWE-369',
  'CWE-326',
  'CWE-601',
  'CWE-835',
  'CWE-193',
  'CWE-319',
  'CWE-77',
  'CWE-74',
  'CWE-306',
  'CWE-295',
  'CWE-191',
  'CWE-93',
  'CWE-502',
  'CWE-863',
  'CWE-113',
  'CWE-732',
  'CWE-918',
  'CWE-331',
  'CWE-416',
  'CWE-1188',
  'CWE-338',
  'CWE-426',
  'CWE-91',
  'CWE-312',
  'CWE-613',
  'CWE-129',
  'CWE-909',
  'CWE-116',
  'CWE-640',
  'CWE-755',
  'CWE-916',
  'CWE-307',
  'CWE-824',
  'CWE-704',
  'CWE-611',
  'CWE-843',
  'CWE-681',
  'CWE-311',
  'CWE-649',
  'CWE-617',
  'CWE-494',
  'CWE-330',
  '