Refactor to support streaming

ankane committed Mar 9, 2019
1 parent 2c0c1cc commit d37ea3e
Showing 1 changed file with 63 additions and 58 deletions.
121 changes: 63 additions & 58 deletions internal/helpers.go
@@ -19,9 +19,9 @@ import (
     "github.com/aws/aws-sdk-go/aws"
     "github.com/aws/aws-sdk-go/aws/session"
     "github.com/aws/aws-sdk-go/service/s3"
+    "github.com/deckarep/golang-set"
     "github.com/fatih/color"
     "github.com/h2non/filetype"
-    "github.com/deckarep/golang-set"
 )

 type nameRule struct {
@@ -119,87 +119,92 @@ var space = regexp.MustCompile(`\s+`)
 var urlPassword = regexp.MustCompile(`((\/\/|%2F%2F)\S+(:|%3A))\S+(@|%40)`)

 func findMatches(colIdentifier string, values []string, onlyValues bool) []ruleMatch {
-    matchList := []ruleMatch{}
-
-    count := len(values)
-
-    if count > 0 {
-        for _, rule := range regexRules {
-            matchedData := []string{}
-
-            for _, v := range values {
-                if rule.Regex.MatchString(v) {
-                    matchedData = append(matchedData, v)
-                }
-            }
-
-            if rule.Name == "email" {
-                // filter out false positives with URL credentials
-                newMatchedData := matchedData
-                matchedData = []string{}
-                for _, v := range newMatchedData {
-                    // replace urls and check for email match again
-                    v2 := urlPassword.ReplaceAllString(v, "[FILTERED]")
-                    if rule.Regex.MatchString(v2) {
-                        matchedData = append(matchedData, v)
-                    }
-                }
-            }
-
-            // TODO filter out masked IPs (end with .0)
-
-            if len(matchedData) > 0 {
-                confidence := "low"
-                if rule.Name == "email" || float64(len(matchedData))/float64(count) > 0.5 {
-                    confidence = "high"
-                }
-
-                if onlyValues {
-                    var matchedValues []string
-                    for _, v := range matchedData {
-                        v3 := rule.Regex.FindAllString(v, -1)
-                        matchedValues = append(matchedValues, v3...)
-                    }
-                    matchedData = matchedValues
-                }
-
-                matchList = append(matchList, ruleMatch{RuleName: rule.Name, DisplayName: rule.DisplayName, Confidence: confidence, Identifier: colIdentifier, MatchedData: matchedData})
-            }
-        }
-
-        // find names
-        matchedData := []string{}
-
-        for _, v := range values {
-            tokens := tokenizer.Split(strings.ToLower(v), -1)
-            if anyMatches(tokens) {
-                matchedData = append(matchedData, v)
-            }
-        }
-
-        if len(matchedData) > 0 {
-            confidence := "low"
-            if float64(len(matchedData))/float64(count) > 0.1 && len(unique(matchedData)) >= 10 {
-                confidence = "high"
-            }
-
-            if onlyValues {
-                var matchedValues []string
-                for _, v := range matchedData {
-                    tokens := tokenizer.Split(strings.ToLower(v), -1)
-                    for _, v2 := range tokens {
-                        if lastNamesSet.Contains(v2) {
-                            matchedValues = append(matchedValues, v2)
-                        }
-                    }
-                }
-                matchedData = matchedValues
-            }
-
-            matchList = append(matchList, ruleMatch{RuleName: "last_name", DisplayName: "last names", Confidence: confidence, Identifier: colIdentifier, MatchedData: matchedData})
-        }
-    }
+    // build matches
+    matchedDatas := make([][]string, len(regexRules)+1)
+    nameIndex := len(regexRules)
+
+    for _, v := range values {
+        for i, rule := range regexRules {
+            if rule.Regex.MatchString(v) {
+                matchedDatas[i] = append(matchedDatas[i], v)
+            }
+        }
+
+        tokens := tokenizer.Split(strings.ToLower(v), -1)
+        if anyMatches(tokens) {
+            matchedDatas[nameIndex] = append(matchedDatas[nameIndex], v)
+        }
+    }
+
+    count := len(values)
+
+    return checkMatches(colIdentifier, matchedDatas, count, onlyValues)
+}
+
+func checkMatches(colIdentifier string, matchedDatas [][]string, count int, onlyValues bool) []ruleMatch {
+    matchList := []ruleMatch{}
+
+    for i, rule := range regexRules {
+        matchedData := matchedDatas[i]
+
+        if rule.Name == "email" {
+            // filter out false positives with URL credentials
+            newMatchedData := matchedData
+            matchedData = []string{}
+            for _, v := range newMatchedData {
+                // replace urls and check for email match again
+                v2 := urlPassword.ReplaceAllString(v, "[FILTERED]")
+                if rule.Regex.MatchString(v2) {
+                    matchedData = append(matchedData, v)
+                }
+            }
+        }
+
+        if len(matchedData) > 0 {
+            confidence := "low"
+            if rule.Name == "email" || float64(len(matchedData))/float64(count) > 0.5 {
+                confidence = "high"
+            }
+
+            if onlyValues {
+                var matchedValues []string
+                for _, v := range matchedData {
+                    v3 := rule.Regex.FindAllString(v, -1)
+                    matchedValues = append(matchedValues, v3...)
+                }
+                matchedData = matchedValues
+            }
+
+            matchList = append(matchList, ruleMatch{RuleName: rule.Name, DisplayName: rule.DisplayName, Confidence: confidence, Identifier: colIdentifier, MatchedData: matchedData})
+        }
+    }
+
+    // find names
+    nameIndex := len(regexRules)
+    matchedData := matchedDatas[nameIndex]
+
+    if len(matchedData) > 0 {
+        confidence := "low"
+        if float64(len(matchedData))/float64(count) > 0.1 && len(unique(matchedData)) >= 10 {
+            confidence = "high"
+        }
+
+        if onlyValues {
+            var matchedValues []string
+            for _, v := range matchedData {
+                tokens := tokenizer.Split(strings.ToLower(v), -1)
+                for _, v2 := range tokens {
+                    if lastNamesSet.Contains(v2) {
+                        matchedValues = append(matchedValues, v2)
+                    }
+                }
+            }
+            matchedData = matchedValues
+        }
+
+        matchList = append(matchList, ruleMatch{RuleName: "last_name", DisplayName: "last names", Confidence: confidence, Identifier: colIdentifier, MatchedData: matchedData})
+    }

     return matchList
 }
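The point of the refactor: findMatches no longer scores matches itself. It only buckets matching values into matchedDatas (one slice per regex rule, plus a trailing slot for name-token hits) and hands the buckets to the new checkMatches, which applies the confidence thresholds and builds the ruleMatch list. Because the buckets can be filled incrementally and the thresholds only need the final bucket sizes plus the total value count, a caller can scan values chunk by chunk instead of holding a whole column in memory. The streaming caller itself is not part of this diff; the following is a minimal, self-contained sketch of the same two-phase idea, with simplified stand-ins (Rule, scan, score, and the chunks variable are illustrative, not pdscan's API):

// Two-phase sketch: scan buckets matching values per rule and can be called
// once per chunk; score runs once at the end, mirroring the
// findMatches/checkMatches split introduced by this commit.
package main

import (
    "fmt"
    "regexp"
)

type Rule struct {
    Name  string
    Regex *regexp.Regexp
}

var rules = []Rule{
    {Name: "email", Regex: regexp.MustCompile(`[\w.+-]+@[\w-]+\.[\w.-]+`)},
    {Name: "phone", Regex: regexp.MustCompile(`\d{3}-\d{3}-\d{4}`)},
}

// scan appends matching values to per-rule buckets; rows never need to be
// held in memory all at once.
func scan(buckets [][]string, values []string) {
    for _, v := range values {
        for i, rule := range rules {
            if rule.Regex.MatchString(v) {
                buckets[i] = append(buckets[i], v)
            }
        }
    }
}

// score inspects the buckets once scanning is finished.
func score(buckets [][]string, count int) {
    for i, rule := range rules {
        if len(buckets[i]) == 0 {
            continue
        }
        confidence := "low"
        if float64(len(buckets[i]))/float64(count) > 0.5 {
            confidence = "high"
        }
        fmt.Printf("%s: %d matches (%s confidence)\n", rule.Name, len(buckets[i]), confidence)
    }
}

func main() {
    buckets := make([][]string, len(rules))
    count := 0

    // chunks stands in for rows streamed from a file or database cursor
    chunks := [][]string{
        {"jane@example.org", "hello world"},
        {"555-123-4567", "bob@example.com"},
    }
    for _, chunk := range chunks {
        scan(buckets, chunk)
        count += len(chunk)
    }

    score(buckets, count)
}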


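For the email rule, checkMatches keeps the existing URL-credential screen: before counting a value as an email match, it masks anything shaped like //user:password@ with urlPassword and re-tests, so connection strings are not reported as email addresses. A small standalone illustration follows; the email pattern is an assumed stand-in (pdscan's real email rule is defined elsewhere in helpers.go), only urlPassword is copied from the hunk above:

package main

import (
    "fmt"
    "regexp"
)

// urlPassword is copied from helpers.go above; email is a simplified
// stand-in pattern, not pdscan's actual rule.
var urlPassword = regexp.MustCompile(`((\/\/|%2F%2F)\S+(:|%3A))\S+(@|%40)`)
var email = regexp.MustCompile(`[\w.+-]+@[\w-]+\.[\w.-]+`)

func main() {
    v := "postgres://deploy:hunter2@db.internal/app"

    // The raw connection string looks like it contains an email address.
    fmt.Println(email.MatchString(v)) // true

    // After masking the //user:password@ portion, it no longer does,
    // so the value is dropped from the email matches.
    v2 := urlPassword.ReplaceAllString(v, "[FILTERED]")
    fmt.Println(v2)                    // postgres:[FILTERED]db.internal/app
    fmt.Println(email.MatchString(v2)) // false
}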
