/
entropy_filter.go
56 lines (44 loc) · 1.27 KB
/
entropy_filter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
package pre_crawl_filters
import (
"strings"
"go.uber.org/zap"
"github.com/anthony-ozdemir/zfse/internal/common"
"github.com/anthony-ozdemir/zfse/internal/config"
)
// EntropyFilter is a pre-crawl filter that discards domains whose name is made
// up of an unusually high proportion of distinct characters.
type EntropyFilter struct {
// Config

// discardRatio is the threshold read from the "discard_ratio" option. A domain
// is discarded when its unique-to-total character ratio is greater than or
// equal to this value.
discardRatio float64
}
// Initialize loads the filter's settings from the supplied task handler
// options.
//
// It looks up the "discard_ratio" entry in config.FloatOptions and stores it on
// the filter. When the entry is absent, a message is logged at Fatal level
// through the global zap logger (which, with zap's default behaviour, ends the
// process). The returned error is always nil.
func (e *EntropyFilter) Initialize(config config.TaskHandlerOptions) error {
	ratio, found := config.FloatOptions["discard_ratio"]
	if !found {
		zap.L().Fatal("Unable to find discard_ratio config option.")
	}

	e.discardRatio = ratio

	return nil
}
// Input decides whether a domain is kept, based on how varied the characters
// of its name are.
//
// Only the part of inProperties.DomainName that precedes the last "." is
// inspected. The ratio of distinct characters to total characters is computed
// over runes, and the domain is discarded (nil is returned) when that ratio is
// greater than or equal to the configured discardRatio. Otherwise inProperties
// is returned unchanged.
//
// A nil inProperties yields nil. A domain name that contains no ".", or has
// nothing before its last ".", offers no characters to measure and is passed
// through unchanged.
func (e *EntropyFilter) Input(inProperties *common.DomainProperties) *common.DomainProperties {
	if inProperties == nil {
		return nil
	}

	// Drop the final label (everything from the last dot onwards).
	name := inProperties.DomainName
	lastDot := strings.LastIndex(name, ".")
	if lastDot <= 0 {
		// Nothing to measure; returning here avoids a 0/0 (NaN) ratio.
		return inProperties
	}
	measured := name[:lastDot]

	// Count distinct and total characters in the same unit (runes). Using
	// len(measured) for the total would count bytes and understate the ratio
	// for names containing multi-byte characters.
	seen := make(map[rune]struct{})
	total := 0
	for _, r := range measured {
		seen[r] = struct{}{}
		total++
	}

	ratio := float64(len(seen)) / float64(total)
	if ratio >= e.discardRatio {
		return nil
	}
	return inProperties
}
// GetType reports the type identifier string of this filter.
func (e *EntropyFilter) GetType() string {
	const filterType = "builtin.discard_high_entropy"
	return filterType
}