-
-
Notifications
You must be signed in to change notification settings - Fork 83
/
importer.rb
199 lines (167 loc) · 4.92 KB
/
importer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# frozen_string_literal: true
# Utility functions for parsing and manipulating public-suffix domain lists
# Only used in development and not loaded by default
require 'yaml'
require 'open-uri'
require 'resolv'
require 'logger'
require 'swot'
require_relative '../gman'
require_relative './domain_list'
class Gman
  # Development-only helper that normalizes a batch of candidate domains,
  # filters out the ones that fail validation, and appends the survivors to
  # the current domain list.
  class Importer
    attr_accessor :domain_list

    # Known false positives from vendored lists
    BLACKLIST = %w[
      business.centurytel.net
      chesnee.net
      citlink.net
      egovlink.com
      emainehosting.com
      fantasyspringsresort.com
      frontiernet.net
      hartford-hwp.com
      homepages.sover.net
      htc.net
      koasekabenaki.org
      kstrom.net
      laworkforce.net
      mississippistateparks.reserveamerica.com
      mylocalgov.com
      myweb.cebridge.net
      ncstars.org
      neagrelations.org
      qis.net
      rootsweb.com
      showcase.netins.net
      valuworld.com
      wctc.net
      webconnections.net
      webpages.charter.net
    ].freeze

    # Rejection reason => pattern. A domain matching any of these patterns is
    # rejected with the corresponding reason (see #ensure_regex).
    REGEX_CHECKS = {
      'home. regex' => /^home\./,
      'user. regex' => /^users?\./,
      'sites. regex' => /^sites?\./,
      'weebly' => /weebly\.com$/,
      'wordpress' => /wordpress\.com$/,
      'govoffice' => /govoffice\d?\.com$/,
      'homestead' => /homestead\.com$/,
      'wix.com' => /wix\.com$/,
      'blogspot.com' => /blogspot\.com$/,
      'tripod.com' => /tripod\.com$/,
      'squarespace.com' => /squarespace\.com$/,
      'github.io' => /github\.io$/,
      'tumblr' => /tumblr\.com$/,
      'locality' => Gman::Locality::REGEX,
      'french edu' => /^ac-.*?\.fr/
    }.freeze

    # domains - raw data handed to DomainList.new(data: ...) as the list of
    #           candidate domains to import.
    def initialize(domains)
      @domain_list = DomainList.new(data: domains)
    end

    # Memoized logger writing to standard output.
    def logger
      @logger ||= Logger.new($stdout)
    end

    # Returns the canonical form of a domain: run through Gman's own string
    # conversion, lowercased, stripped of surrounding whitespace, with a
    # leading "www." label and a trailing slash removed.
    def normalize_domain(domain)
      domain = Gman.new(domain).to_s
      # The dot is escaped and the pattern anchored so that only a literal
      # leading "www." is removed; an unescaped dot would also eat the first
      # character of names such as "wwwfoo.gov".
      domain.to_s.downcase.strip.sub(/\Awww\./, '').sub(%r{/\z}, '')
    end

    # Runs the duplicate, validity, and DNS checks against a domain.
    #
    # options - Hash of flags:
    #           :skip_dupe    - when truthy, skip the duplicate check
    #           :skip_resolve - when truthy, skip the DNS resolution check
    #
    # Returns true when every enabled check passes, false otherwise.
    def valid_domain?(domain, options = {})
      return false if !options[:skip_dupe] && !ensure_not_dupe(domain)
      return false unless ensure_valid(domain)
      return false if !options[:skip_resolve] && !ensure_resolves(domain)

      true
    end

    # if RECONCILING=true, return the reason,
    # rather than a bool and silence log output
    def reject(domain, reason)
      return reason if ENV['RECONCILING']

      logger.info "👎 `#{domain}`: #{reason}"
      false
    end

    # Memoized handle on the current domain list (DomainList.current).
    def current
      @current ||= DomainList.current
    end

    # Normalizes and validates the candidate domains, appends the survivors
    # to the current list, and writes it, logging counts along the way.
    #
    # options - forwarded to #valid_domain? (see there for supported keys).
    def import(options = {})
      # Use the memoized list so the logged count refers to the same object
      # that is later extended and written.
      logger.info "Current: #{current.count} domains"
      logger.info "Adding: #{domain_list.count} domains"

      normalize_domains!
      ensure_validity!(options)
      add_to_current

      logger.info "New: #{current.count} domains"
    end

    # Memoized DNS resolver pointed at fixed public nameservers.
    def resolver
      @resolver ||= Resolv::DNS.new(nameserver: ['1.1.1.1', '8.8.8.8'])
    end

    # Verifies that the given domain resolves: it is accepted when it has an
    # address record or, failing that, an NS or an MX record.
    #
    # Returns a truthy value when a record is found, false otherwise.
    def domain_resolves?(domain)
      domain = Addressable::URI.new(host: domain).normalize.host
      return true if ip?(domain)

      returns_record?(domain, 'NS') || returns_record?(domain, 'MX')
    end

    private

    # Returns true when the domain matches none of REGEX_CHECKS; otherwise
    # returns the result of #reject for the first matching pattern.
    def ensure_regex(domain)
      REGEX_CHECKS.each do |msg, regex|
        return reject(domain, msg) if domain&.match?(regex)
      end

      true
    end

    # Applies the static checks: blank, blacklisted, not a valid public-suffix
    # name, academic (per Swot), and finally the REGEX_CHECKS patterns.
    #
    # Returns true when the domain passes, otherwise false or the result of
    # #reject.
    def ensure_valid(domain)
      # nil is treated like an empty string, matching the nil-safe handling
      # in #ensure_regex.
      return false if domain.nil? || domain.empty?

      if BLACKLIST.include?(domain)
        reject(domain, 'blacklist')
      elsif !PublicSuffix.valid?("foo.#{domain}")
        reject(domain, 'invalid')
      elsif Swot.is_academic?(domain)
        reject(domain, 'academic')
      else
        ensure_regex(domain)
      end
    end

    # Returns true when the domain resolves, otherwise the result of #reject.
    def ensure_resolves(domain)
      return reject(domain, 'unresolvable') unless domain_resolves?(domain)

      true
    end

    # Returns true when the domain is neither already listed nor covered by a
    # listed parent domain; otherwise the result of #reject with a reason
    # distinguishing the two cases.
    def ensure_not_dupe(domain)
      return true unless dupe?(domain)

      if current.domains.include?(domain)
        reject(domain, 'duplicate')
      else
        parent = current.parent_domain(domain)
        reject(domain, "subdomain of #{parent}")
      end
    end

    # Truthy when the current list already contains the domain or reports a
    # parent domain for it.
    def dupe?(domain)
      current.domains.include?(domain) || current.parent_domain(domain)
    end

    # Normalizes every candidate domain in place and drops duplicates within
    # each group.
    def normalize_domains!
      domain_list.to_h.each_value do |domains|
        domains.map! { |domain| normalize_domain(domain) }
        domains.uniq!
      end
    end

    # Removes, in place, every candidate domain that fails #valid_domain?.
    def ensure_validity!(options = {})
      domain_list.data.each_value do |domains|
        domains.select! { |domain| valid_domain?(domain, options) }
      end
    end

    # Appends each group's candidate domains to the matching group of the
    # current list (creating the group when missing), then writes the list.
    def add_to_current
      domain_list.data.each do |group, domains|
        current.data[group] ||= []
        current.data[group].concat domains
      end
      current.write
    end

    # Returns the resolved address for the domain, or false when the lookup
    # raises Resolv::ResolvError.
    def ip?(domain)
      resolver.getaddress(domain)
    rescue Resolv::ResolvError
      false
    end

    # Looks up a DNS record of the given type for the domain.
    #
    # type - String name of a class under Resolv::DNS::Resource::IN,
    #        e.g. 'NS' or 'MX'.
    #
    # Returns the resource when found, false when the lookup raises
    # Resolv::ResolvError.
    def returns_record?(domain, type)
      record_class = Object.const_get "Resolv::DNS::Resource::IN::#{type}"
      resolver.getresource(domain, record_class)
    rescue Resolv::ResolvError
      false
    end
  end
end