-
Notifications
You must be signed in to change notification settings - Fork 55
/
scrape_all_the_cve.py
240 lines (205 loc) · 9.63 KB
/
scrape_all_the_cve.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#coding=utf8
import datetime
import json
import ssl
import sys
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup
ssl._create_default_https_context = ssl._create_unverified_context
# Global Variables
# Running totals updated by log_data() / find_code_link().
vulnCount = 0
codeLinkCount = 0
# Volatile extra data storage [JSON-ish records], kept open for the whole
# process lifetime. NOTE(review): never explicitly closed/flushed — consider
# a context manager or atexit hook so buffered records survive a crash.
data_log = open("CVE-Scraper_all.dat", "w+")
# Error log writer.
# NOTE(review): assumes ./Logs/ already exists — open() raises otherwise.
error_log = open("./Logs/main_log_all.log", "a+")
def log_data(CVEID, CVEPage, CWEID, knownExploits, vulnClassification,
             publishDate, updateDate, score, accessGained, attackOrigin,
             complexity, authenticationRequired, confidentiality, integrity,
             availability, summary, codeLink):
    """Append one vulnerability record to data_log and echo it to stdout.

    All arguments are string-like values scraped by record_cve_data().
    Increments the module-level vulnCount counter as a side effect.
    """
    global vulnCount
    print("Logging cell data...")
    vulnCount += 1
    print("VULNERABILITIES FOUND: " + str(vulnCount))
    # Base record; the console echo below omits Summary/codeLink,
    # matching the original behaviour.
    record = {
        "CVE ID": CVEID,
        "CVE Page": CVEPage,
        "CWE ID": CWEID,
        "Known Exploits": knownExploits,
        "Vulnerability Classification": vulnClassification,
        "Publish Date": publishDate,
        "Update Date": updateDate,
        "Score": str(score),
        "Access Gained": accessGained,
        "Attack Origin": attackOrigin,
        "Complexity": complexity,
        "Authentication Required": authenticationRequired,
        "Confidentiality": confidentiality,
        "Integrity": integrity,
        "Availability": availability,
    }
    # Fixed: "Summmary" key typo from the original hand-built string.
    full = dict(record, Summary=summary, codeLink=codeLink)
    # json.dumps escapes embedded quotes/newlines/backslashes, which the
    # previous string concatenation did not (it emitted malformed records
    # whenever a summary contained a double quote).
    data_log.write(json.dumps(full, indent="\t") + "\n\n")
    print(json.dumps(record, indent="\t") + "\n")
def find_code_link(CVEPage):
    """Fetch a CVE detail page and return its GitHub commit links, concatenated.

    Scans the references table (id 'vulnrefstable') for hrefs containing both
    "github.com" and "commit". Returns "" when the page has no such table, no
    matching links, or the fetch/parse fails. Increments the module-level
    codeLinkCount once per matching link.
    """
    global codeLinkCount
    try:
        req = Request(CVEPage, headers={'User-Agent': 'Mozilla/5.0'})
        cveSoup = BeautifulSoup(urlopen(req).read(), 'html.parser')
        linkStr = ""
        referTable = cveSoup.find('table', {'id': 'vulnrefstable'}, class_='listtable')
        for cell in referTable.findAll('td', class_="r_average"):
            link = cell.find('a')['href']
            # Alternate filters used for other products:
            #   chrome:  "stable-channel-update" in link and "chrome" in link
            #   android: "android.googlesource.com" in link
            if "github.com" in link and "commit" in link:
                codeLinkCount += 1
                linkStr += link  # reuse the href already extracted above
                print("codeLinkCount:" + str(codeLinkCount))
        return linkStr
    except Exception:
        # Was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt still
        # propagate. Missing table, bad HTML, or network failure -> "".
        return ""
# Log errors to the file specified by error_log
def log_message(msg):
    """Append msg to the error log, prefixed with the current timestamp."""
    stamp = datetime.datetime.now()
    error_log.write(f"{stamp}:\t{msg}\n")
# Does the heavy lifting of breaking down the CVE tables
# Does the heavy lifting of breaking down the CVE tables
def record_cve_data(pageURL):
    """Scrape one cvedetails search-results page and log every CVE row.

    Each CVE occupies two sibling rows in the results table: the attribute
    row (class "srrowns") and a long-summary cell (class "cvesummarylong");
    zip() pairs them up. For every pair the per-column fields are extracted,
    a source-code link is resolved via find_code_link(), and the whole record
    is handed to log_data().
    """
    log_message("scrape extracting from: " + pageURL + "\n")
    pageSoup = BeautifulSoup(urlopen(Request(pageURL,
                                             headers={'User-Agent': 'Mozilla/5.0'})).read(),
                             'html.parser')
    pageTable = pageSoup.find('table', class_="searchresults sortable")
    for row, summarys in zip(pageTable.findAll('tr', class_="srrowns"),
                             pageTable.findAll('td', class_="cvesummarylong")):
        print(row)
        # Defaults in case a column is missing from this row.
        CVEID = "NULL"
        CVEPage = "NULL"
        CWEID = "NULL"
        knownExploits = "NULL"
        vulnClassification = "NULL"
        publishDate = "NULL"
        updateDate = "NULL"
        score = "NULL"
        accessGained = "NULL"
        attackOrigin = "NULL"
        complexity = "NULL"
        authenticationRequired = "NULL"
        confidentiality = "NULL"
        availability = "NULL"
        integrity = "NULL"
        summary = "NULL"
        # enumerate() replaces the original manual index counter; the column
        # checks are mutually exclusive, so the if-chain is now elif.
        for index, cell in enumerate(row.findAll('td')):
            print("<" + str(cell.next) + ">")
            if index == 1:
                CVEPage = ("https://www.cvedetails.com" +
                           cell.find('a')['href'])
                CVEID = cell.find('a').next
            elif index == 2:
                try:
                    CWEID = "CWE-" + str(cell.find('a').next).strip("\r\n\t")
                except Exception:
                    # No <a> inside the CWE cell; fall back to the raw text.
                    CWEID = str(cell.next).strip("\r\n\t")
            elif index == 3:
                knownExploits = str(cell.next).strip("\r\n\t")
            elif index == 4:
                vulnClassification = str(cell.next).strip("\r\n\t")
            elif index == 5:
                publishDate = str(cell.next).strip("\r\n\t")
            elif index == 6:
                updateDate = str(cell.next).strip("\r\n\t")
            elif index == 7:
                score = cell.find('div').next
            elif index == 8:
                accessGained = str(cell.next).strip("\r\n\t")
            elif index == 9:
                attackOrigin = str(cell.next).strip("\r\n\t")
            elif index == 10:
                complexity = str(cell.next).strip("\r\n\t")
            elif index == 11:
                authenticationRequired = str(cell.next).strip("\r\n\t")
            elif index == 12:
                confidentiality = str(cell.next).strip("\r\n\t")
            elif index == 13:
                integrity = str(cell.next).strip("\r\n\t")
            elif index == 14:
                availability = str(cell.next).strip("\r\n\t")
            print("---")
        summary = str(summarys.next).strip("\r\n\t")
        # List all values gained from this row
        print("\n\n")
        print("===")
        print("CVE ID:\t\t\t\t" + CVEID)
        print("CVE Page:\t\t\t" + CVEPage)
        print("CWE ID:\t\t\t\t" + CWEID)
        print("Number of Exploits:\t\t\t" + knownExploits)
        print("Vulnerability Classification:\t" + vulnClassification)
        print("Publish Date:\t\t\t" + publishDate)
        print("Update Date:\t\t\t" + updateDate)
        print("CVSS Score:\t\t\t" + score)
        print("Access Gained:\t\t\t" + accessGained)
        print("Attack Origin:\t\t\t" + attackOrigin)
        print("Complexity:\t\t\t" + complexity)
        print("Authentication Required:\t" + authenticationRequired)
        print("Confidentiality:\t\t" + confidentiality)
        print("Integrity:\t\t\t" + integrity)
        print("Availability:\t\t\t" + availability)
        print("Summary:\t\t\t" + summary)
        print("===\n\n")
        codeLink = find_code_link(CVEPage)
        log_data(CVEID, CVEPage, CWEID, knownExploits, vulnClassification,
                 publishDate, updateDate, score, accessGained, attackOrigin,
                 complexity, authenticationRequired, confidentiality, integrity,
                 availability, summary, codeLink)
def scrape_cve_data():
    """Walk cvedetails.com's browse-by-date index and scrape every result page.

    Discovers one link per year from the index table, then for each year
    follows every pagination link and hands it to record_cve_data().
    """
    # Alternate single-product roots kept for reference:
    # Android: "https://www.cvedetails.com/product/19997/Google-Android.html?vendor_id=1224"
    # Chrome:  "https://www.cvedetails.com/product/15031/Google-Chrome.html?vendor_id=1224"
    pageURL = "https://www.cvedetails.com/browse-by-date.php"
    log_message("Scrape starting up... root page: " + pageURL)
    request = Request(pageURL, headers={'User-Agent': 'Mozilla/5.0'})
    catalogSoup = BeautifulSoup(urlopen(request).read(), 'html.parser')
    # Gather every per-year link out of the index table's header cells.
    yearlyReports = []
    for header in catalogSoup.find('table', class_='stats').findAll('th'):
        for anchor in header.findAll('a', href=True):
            yearURL = "https://www.cvedetails.com" + anchor['href']
            print("Found year at: " + yearURL + "\n")
            yearlyReports.append(yearURL)
    print("\n === Years discovered. Grabbing pages for each year ===\n\n")
    # Visit each year, expand its pagination, and dissect every page.
    for yearURL in yearlyReports:
        yearRequest = Request(yearURL, headers={'User-Agent': 'Mozilla/5.0'})
        yearTableSoup = BeautifulSoup(urlopen(yearRequest).read(), 'html.parser')
        pageIndex = yearTableSoup.find('div', {'id': 'pagingb'}, class_='paging')
        for page in pageIndex.findAll('a', href=True):
            record_cve_data("https://www.cvedetails.com" + page['href'])
###############################################################################
# MAIN
###############################################################################
def main(argv):
    """Entry point: print the startup banner, then run the full scrape."""
    for banner_line in ("\n==== CVE-Scraper ====", "==== Main.py ====\n"):
        print(banner_line)
    print("PYTHON VERSION:\t\t" + sys.version)
    log_message("CVE-Scraper Starting up...")
    scrape_cve_data()
    log_message("Scrape complete")
if __name__ == '__main__':
    main(sys.argv[1:])