site-scan.rb
#!/usr/bin/env ruby
# Scans Apache project homepages and captures text|urls for common links
# Gathers data that can be used to check for policy compliance:
# https://www.apache.org/foundation/marks/pmcs#navigation
# http://www.apache.org/events/README.txt
# See Also: lib/whimsy/sitestandards.rb
#
# Makes no value judgements. Simply extracts raw data for offline analysis.
$LOAD_PATH.unshift '/srv/whimsy/lib'
require 'net/http'
require 'nokogiri'
require 'json'
require 'whimsy/asf'
require 'whimsy/cache'
require 'whimsy/sitestandards'
require_relative 'asf-site-check'
$stdout.sync = true
# Normalize spaces in text runs
def squash(text)
return text.scrub.gsub(/[[:space:]]+/, ' ').strip
end
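# Illustrative example (not in the original script): squash collapses any run of
# whitespace, including newlines, into a single space and trims the result, e.g.
#   squash("  The  Apache\n   Software Foundation  ") #=> "The Apache Software Foundation"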
# Get text from a node; use the parent if the text does not appear to be complete
# This is used when scanning for links whose text may be split across
# an image or other related nodes on a page
def getText(txt, node, match=/Apache Software Foundation/i)
parent = nil # debugging flag: set when the parent's text had to be fetched
if txt !~ match # have we got all the text?
if node.parent.name == 'a' # e.g. whimsical; such parents don't have extra text.
newnode = node.parent.parent
else
newnode = node.parent
end
# ensure <br> is treated as a separator when extracting the combined text
newnode.css('br').each { |br| br.replace(' ') }
txt = squash(newnode.text)
parent = true
end
return txt, parent
end
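# Illustrative sketch (hypothetical markup, not from the original script): given
#   <footer>Copyright 2024 <br> The Apache Software Foundation</footer>
# a text node holding only "Copyright 2024" does not match the default pattern,
# so getText climbs to the parent <footer>, treats the <br> as a space, and
# returns ["Copyright 2024 The Apache Software Foundation", true]; if the node's
# own text already matches, [txt, nil] is returned unchanged.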
# helper for multiple events
# TODO should we show them all?
def save_events(data, value)
prev = data[:events]
if prev and prev != value
puts "Events: already have '#{prev}', not storing '#{value}'"
else
data[:events] = value
end
end
# Extract link text, skipping screen-reader-only text (assumed to be in a span whose class ends with 'sr-only')
def get_link_text(anode)
bits = []
anode.traverse do |node|
if node.name == 'text'
bits << node.text unless node.parent.name == 'span' and node.parent.attribute('class')&.value&.end_with? 'sr-only'
end
end
bits.join(' ')
end
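# Illustrative sketch (hypothetical markup, not from the original script): for
#   <a href="/security"><span class="icon sr-only">shield icon</span>Security</a>
# get_link_text returns "Security", because text inside a span whose class ends
# with 'sr-only' is skipped.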
# Parse an Apache project website and return text|urls that match our checks
# @return Hash of symbols: text|url found from a check made
# @see SiteStandards for definitions of what we should scan for (in general)
def parse(id, site, name, podling=false)
show_anyway = Time.now.gmtime.strftime('%H') == '08' # show suppressed errors once a day
data = {}
# force https to avoid issue with cache (sites should use https anyway)
site.sub!(%r{^http:},'https:')
SiteStandards::COMMON_CHECKS.each_key do |k|
data[k.to_sym] = nil
end
data[:display_name] = name
data[:uri] = site
uri = URI.parse(site)
begin
Socket.getaddrinfo(uri.host, uri.scheme)
rescue SocketError => se
data[:errors] = se.message
return data
end
begin
uri, response, status = $cache.get(site.to_s)
rescue IOError => ioe
data[:errors] = ioe.message
return data
end
puts "#{id} #{uri} #{status}"
# Bail and return if getting the site returns an error code
if response.respond_to? :code and response.code =~ /^[45]/
data[:errors] = "cache.get(#{site}) error code #{response.code}"
return data
end
doc = Nokogiri::HTML(response)
if $saveparse
file = File.join('/tmp',"site-scan_#{$$}.txt")
File.write(file, doc.to_s)
$stderr.puts "Wrote parsed input to #{file}"
end
data[:uri] = uri.to_s
subpages = Hash.new
# FIRST: scan each link's a_href to see if we need to capture it
# also capture script src for events, and some page refs for podlings
doc.traverse do |a|
if a.name == 'script'
a_src = a['src'].to_s.strip
if a_src =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
save_events data, uri + a_src
end
end
next unless a.name == 'a'
# Normalize the text and href for our capture purposes
a_href = a['href'].to_s.strip
a_text = get_link_text(a) # Not down-cased yet
$stderr.puts "#{a_text.inspect} #{a_href}" if $verbose
# Check the href urls for some patterns
if a_href =~ SiteStandards::COMMON_CHECKS['foundation'][SiteStandards::CHECK_CAPTURE]
img = a.at('img')
if img
# use the title (hover text) in preference to the source
data[:foundation] = img['title'] ? squash(img['title']) : uri + img['src'].strip
else
data[:foundation] = squash(a_text)
end
end
if a_href =~ SiteStandards::COMMON_CHECKS['events'][SiteStandards::CHECK_CAPTURE]
# Hack to ignore hidden links on main site
save_events data, uri + a_href unless a['class'] == 'visible-home' and uri.path != '/'
end
# Check the a_text strings for other patterns
a_text = a_text.downcase.strip # needs to be downcased here
# Note this is an unusual case: license requires both the link text and the href to match
if (a_text =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_TEXT]) and
(a_href =~ SiteStandards::COMMON_CHECKS['license'][SiteStandards::CHECK_CAPTURE])
begin
data[:license] = uri + a_href
rescue StandardError
data[:license] = a_href
end
end
%w(thanks security sponsorship privacy).each do |check|
if a_text =~ SiteStandards::COMMON_CHECKS[check][SiteStandards::CHECK_CAPTURE]
begin
data[check.to_sym] = uri + a_href
rescue StandardError
data[check.to_sym] = a_href
end
end
end
unless a_href =~ %r{^(#|mailto:)}
begin
if a_href =~ %r{^https?://} # no need to rebase this
site2 = URI.parse(a_href.gsub(' ','%20').gsub('|', '%7C')) # needs to be a URI
else
site2 = URI.join(site,a_href.gsub(' ','%20').gsub('|', '%7C')) # HACK
end
if site2.host == uri.host and site2.path.size > 2
subpages[site2.to_s] = a
end
rescue StandardError => e
if show_anyway or !a_href.include?('fineract.gateway.scarf.sh/{version}') # reported, but not yet fixed, so suppress noise
$stderr.puts "#{id}: Bad a_href #{a_href} #{e}"
end
end
end
end
# SECOND: scan each text node to match and capture
doc.traverse do |node|
next unless node.is_a?(Nokogiri::XML::Text)
txt = squash(node.text)
# capture trademarks if not already found; allow override when the phrase explicitly mentions Apache trademarks
if (txt =~ SiteStandards::COMMON_CHECKS['trademarks'][SiteStandards::CHECK_CAPTURE] and not data[:trademarks]) or
txt =~ /are trademarks of [Tt]he Apache Software/
t, p = getText(txt, node)
# drop leading text up to and including any 'Copyright ... Foundation' sentence
data[:trademarks] = t.sub(/^.*?Copyright .+? Foundation[.]?/, '').strip
data[:tradeparent] = p if p
end
if txt =~ SiteStandards::COMMON_CHECKS['copyright'][SiteStandards::CHECK_CAPTURE]
t, p = getText(txt, node)
# keep just the 'Copyright ... Foundation' sentence (or the © form), dropping surrounding text
data[:copyright] = t.sub(/^.*?((Copyright|©) .+? Foundation[.]?).*/, '\1').strip
data[:copyparent] = p if p
end
# Note we also check for the incubator disclaimer (regardless of tlp|podling)
if txt =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
t, _p = getText(txt, node, / is an effort undergoing/)
data[:disclaimer] = t
end
end
# Brief scan of initial sub-pages to look for disclaimers and downloads
hasdisclaimer = 0
nodisclaimer = []
subpages.each do |subpage, anchor|
if podling
begin
uri, response, status = $cache.get(subpage)
if uri&.to_s == subpage or uri&.to_s == subpage + '/'
puts "#{id} #{uri} #{status}"
else
puts "#{id} #{subpage} => #{uri} #{status}"
end
unless status == 'error'
if response =~ SiteStandards::PODLING_CHECKS['disclaimer'][SiteStandards::CHECK_CAPTURE]
hasdisclaimer += 1
else
nodisclaimer << subpage
end
else
unless %w(nlpcraft).include? id # reported, but unresponsive, so suppress noise
$stderr.puts "#{id} #{subpage} => #{uri} #{status} '#{anchor.text.strip}'"
end
end
rescue URI::InvalidURIError
# ignore
end
end
end
if nodisclaimer.size > 0
data[:disclaimers] = [hasdisclaimer, nodisclaimer]
end
# Show potential download pages
data[:downloads] = subpages.select{|k,_v| k =~ %r{download|release|install|dlcdn\.apache\.org|dyn/closer}i}
# THIRD: see if an image has been uploaded
data[:image] = ASF::SiteImage.find(id)
# Check for resource loading from non-ASF domains
if $skipresourcecheck
data[:resources] = 'Not checked'
else
cmd = ['node', '/srv/whimsy/tools/scan-page.js', site]
out, err, status = exec_with_timeout(cmd, 60)
if status
ext_urls = out.split("\n").reject {|x| ASFDOMAIN.asfhost? x}.tally
resources = ext_urls.values.sum
data[:resources] = "Found #{resources} external resources: #{ext_urls}"
else
data[:resources] = err
end
end
# TODO: does not find js references such as:
# ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
return data
end
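# Illustrative only (hypothetical values, not from the original script): the hash
# returned by parse roughly looks like
#   { display_name: 'Whimsy', uri: 'https://whimsy.apache.org/',
#     foundation: 'The Apache Software Foundation', events: '<captured URL>',
#     license: '<captured URL>', trademarks: '<captured text>',
#     copyright: '<captured text>', downloads: { '<url>' => <anchor>, ... },
#     image: <result of ASF::SiteImage.find>,
#     resources: 'Found 3 external resources: ...' }
# plus :errors when the site could not be fetched, and :disclaimers for podlings
# with sub-pages lacking the incubator disclaimer.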
require 'timeout'
# the node script appears to stall sometimes, so apply a timeout
def exec_with_timeout(cmd, timeout)
begin
# stdout, stderr pipes
rout, wout = IO.pipe
rerr, werr = IO.pipe
stdout, stderr = nil
status = false
pid = Process.spawn(*cmd, pgroup: true, :out => wout, :err => werr)
Timeout.timeout(timeout) do
Process.waitpid(pid)
status = $?.success?
# close write ends so we can read from them
wout.close
werr.close
stdout = rout.readlines.join
stderr = rerr.readlines.join
unless status
$stderr.puts "WARN: #{Time.now} failed scanning #{cmd} #{pid} #{stderr}"
stderr = 'Scanning failed'
end
end
rescue Timeout::Error
# Try to determine why the kill does not tidy the chrome processes
# Also whether a kill was actually issued!
puts "WARN: timeout scanning #{cmd[-1]} #{pid}"
$stderr.puts "WARN: #{Time.now} timeout scanning #{cmd[-1]} #{pid}"
stderr = 'Timeout'
ret = ''
# Try to show process tree
cmd = "ps -lfg #{$$}"
begin
$stderr.puts "WARN: #{Time.now} #{cmd}:"
$stderr.puts `#{cmd}`
reaper = Process.detach(pid) # ensure the process is reaped
# kill -pid responds with EINVAL - invalid argument
$stderr.puts "WARN: #{Time.now} about to kill -15 #{pid}"
ret = Process.kill(-15, pid) # SIGTERM
$stderr.puts "WARN: #{Time.now} sent kill -15 #{pid} ret=#{ret}"
thrd = reaper.join 30 # allow some time for process to exit
if thrd # original process has finished
$stderr.puts "WARN: #{Time.now} process completed #{thrd.value}"
else # not yet finished, try a stronger kill
$stderr.puts "WARN: #{Time.now} about to kill -9 #{pid}"
ret = Process.kill(-9, pid) # SIGKILL
$stderr.puts "WARN: #{Time.now} sent kill -9 #{pid} ret=#{ret}"
thrd = reaper.join 5 # allow some time for process to exit
if thrd
$stderr.puts "WARN: #{Time.now} process completed #{thrd.value}"
else
$stderr.puts "ERROR: #{Time.now} failed to kill -9 #{pid}"
end
end
rescue StandardError => e
$stderr.puts "WARN: #{Time.now} ret=#{ret} exception: #{e}"
end
$stderr.puts "WARN: #{Time.now} #{cmd}:"
$stderr.puts `#{cmd}`
ensure
wout.close unless wout.closed?
werr.close unless werr.closed?
# dispose the read ends of the pipes
rout.close
rerr.close
end
return stdout, stderr, status
end
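# Typical use (mirrors the call in parse above; illustrative only):
#   out, err, ok = exec_with_timeout(['node', '/srv/whimsy/tools/scan-page.js', site], 60)
# ok is true only when the child exits successfully within the timeout;
# on timeout out is nil, err is 'Timeout', and an attempt is made to kill the
# process group.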
#########################################################################
# Main execution begins here
results = {}
podlings = {}
$cache = Cache.new(dir: ENV['SITE_SCAN_CACHE'] || 'site-scan')
$verbose = ARGV.delete '--verbose'
$saveparse = ARGV.delete '--saveparse'
$skipresourcecheck = ARGV.delete '--noresource'
sites_checked = 0
sites_failed = 0
opts = ARGV.select {|arg| arg.start_with? '-'}
unless opts.empty?
raise "Unexpected options: #{opts} (valid: --verbose, --saveparse, --noresource)"
end
puts "Started: #{Time.now}" # must agree with site-scan monitor
# USAGE:
# site-scan.rb https://whimsical.apache.org [Whimsy] [whimsy-scan.json] - to scan one project
# site-scan.rb [project-output.json] [podlings-output.json] [projname podlingname ...]
# If additional projname|podlingname are provided, only scans those sites
if ARGV.first =~ /^https?:\/\/\w/
# Scan a single URL provided by user
podling = ARGV.delete('--podling')
site = ARGV.shift.dup # needs to be unfrozen
name = ARGV.shift || site[/\/(\w[^.]*)/, 1].capitalize
output_projects = ARGV.shift
results[name] = parse(name, site, name, podling)
else
# Gather output filenames (if any) and scan various projects
if ARGV.first =~ %r{[./]} # have we a file name?
output_projects = ARGV.shift
if ARGV.first =~ %r{[./]} # have we another file name?
output_podlings = ARGV.shift
else
output_podlings = nil
end
else
output_projects = nil
end
# Scan committees, including non-pmcs
ASF::Committee.load_committee_info
committees = (ASF::Committee.pmcs + ASF::Committee.nonpmcs).uniq
committees.sort_by {|committee| committee.name}.each do |committee|
next unless committee.site
# if more parameters are specified, parse only those names
if ARGV.length > 0
next unless ARGV.include? committee.name
end
results[committee.name] = parse(committee.name, committee.site, committee.display_name)
results[committee.name]['nonpmc'] = committee.nonpmc?
sites_checked += 1
sites_failed += 1 unless results[committee.name][:resources].start_with? 'Found'
# Don't keep checking unnecessarily
$skipresourcecheck = (sites_failed > 10 or (sites_failed > 3 and sites_failed == sites_checked))
end
# Scan podlings that have a website
ASF::Podling.list.sort_by(&:name).each do |podling|
if podling.status == 'current' and podling.podlingStatus[:website]
# if more parameters are specified, parse only those names
if ARGV.length > 0
next unless ARGV.include? podling.name
end
podlings[podling.name] = parse(podling.name, podling.podlingStatus[:website], podling.display_name, true)
end
end
end
# Output all results
if output_projects
File.write(output_projects, JSON.pretty_generate(results))
else
puts JSON.pretty_generate(results)
end
if output_podlings
File.write(output_podlings, JSON.pretty_generate(podlings))
else
puts JSON.pretty_generate(podlings)
end
puts "Ended: #{Time.now}" # must agree with site-scan monitor