
Commit

Add censored-words feature.
andybalholm committed Apr 8, 2017
1 parent ca335e2 commit 9cb5630
Showing 7 changed files with 147 additions and 20 deletions.
acl.go: 1 addition & 1 deletion
@@ -241,7 +241,7 @@ func (a *ACLDefinitions) load(filename string) error {
 				}
 			}
 
-		case "allow", "block", "block-invisible", "disable-proxy-headers", "hash-image", "ignore-category", "phrase-scan", "require-auth", "ssl-bump":
+		case "allow", "block", "block-invisible", "censor-words", "disable-proxy-headers", "hash-image", "ignore-category", "phrase-scan", "require-auth", "ssl-bump":
 			r := ACLActionRule{Action: action}
 		argLoop:
 			for _, a := range args {
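With censor-words registered as an ACL action, an acl file can request it the same way it would allow or block. A hypothetical entry (the kids ACL name is invented for illustration; the action-plus-arguments shape follows the parsing loop above):

    censor-words kids

Pages matched by such a rule keep their layout but have the configured words stripped from their text.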
censor.go: 113 additions & 0 deletions (new file)
@@ -0,0 +1,113 @@
package main

import (
    "fmt"
    "io"
    "os"
    "strings"
    "unicode"

    "golang.org/x/net/html"
)

func (c *config) readCensoredWordsFile(filename string) error {
    if c.CensoredWords == nil {
        c.CensoredWords = make(map[string]bool)
    }

    f, err := os.Open(filename)
    if err != nil {
        return fmt.Errorf("could not open %s: %s", filename, err)
    }
    defer f.Close()
    cr := newConfigReader(f)

    for {
        line, err := cr.ReadLine()
        if err == io.EOF {
            break
        }
        if err != nil {
            return err
        }

        c.CensoredWords[strings.ToLower(line)] = true
    }

    return nil
}

// censor returns s, with all words from censored removed.
func censor(s string, censored map[string]bool) string {
    inWord := false
    changed := false
    copied := 0
    var wordStart int
    var result []byte

    for i, c := range s {
        wordChar := unicode.IsLetter(c) || unicode.IsMark(c) || c == '\'' || c == '-'
        switch {
        case wordChar && !inWord:
            wordStart = i
            inWord = true

        case inWord && !wordChar:
            word := strings.ToLower(s[wordStart:i])
            if censored[word] {
                result = append(result, s[copied:wordStart]...)
                copied = i
                changed = true
                // Skip a space before or after, but not both.
                if c == ' ' {
                    copied++
                } else if len(result) > 0 && result[len(result)-1] == ' ' {
                    result = result[:len(result)-1]
                }
            }
            inWord = false
        }
    }

    if inWord {
        word := strings.ToLower(s[wordStart:len(s)])
        if censored[word] {
            result = append(result, s[copied:wordStart]...)
            copied = len(s)
            changed = true
        }
    }

    if changed {
        result = append(result, s[copied:]...)
        return string(result)
    }

    return s
}

// censorHTML removes censored words from the text content of n and its
// children.
func censorHTML(n *html.Node, censored map[string]bool) (changed bool) {
    for c := n.FirstChild; c != nil; c = c.NextSibling {
        switch c.Type {
        case html.TextNode:
            newText := censor(c.Data, censored)
            if newText != c.Data {
                c.Data = newText
                changed = true
            }
        case html.ElementNode:
            switch c.Data {
            case "script", "style", "code", "pre":
                // Don't censor code.
            default:
                if censorHTML(c, censored) {
                    changed = true
                }
            }
        }
    }

    return changed
}
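To make the behavior of the two new functions concrete, here is a minimal, hypothetical sketch (not part of the commit): the test strings are invented, and it assumes censor and censorHTML are in scope, so it will not build inside Redwood itself, which defines its own main.

package main

import (
    "fmt"
    "os"
    "strings"

    "golang.org/x/net/html"
)

func main() {
    censored := map[string]bool{"darn": true}

    // A censored word is removed along with one adjacent space,
    // so the rest of the sentence closes up: prints "Well, it!".
    fmt.Println(censor("Well, darn it!", censored))

    // In HTML, only text nodes are rewritten; script, style, code,
    // and pre elements are left alone.
    doc, _ := html.Parse(strings.NewReader("<p>darn right</p><code>darn</code>"))
    if censorHTML(doc, censored) {
        // Prints <p>right</p><code>darn</code> inside the usual
        // html/head/body wrapper that html.Parse adds.
        html.Render(os.Stdout, doc)
    }
}

Note that apostrophes and hyphens count as word characters in censor, so a phrase like "darn-well" is only removed if the whole hyphenated form appears in the word list.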
classify.go: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ func handleClassification(w http.ResponseWriter, r *http.Request) {
 	_, cs, _ := charset.DetermineEncoding(content, contentType)
 	var doc *html.Node
 	if strings.Contains(contentType, "html") {
-		modified = conf.pruneContent(req.URL, &content, cs, acls, &doc)
+		modified = conf.pruneContent(req.URL, &content, cs, &doc)
 	}
 
 	conf.scanContent(content, contentType, cs, tally)
config.go: 5 additions & 2 deletions
@@ -72,8 +72,10 @@ type config struct {
 	FilteredPruning      map[rule][]filteredPruningRule
 	PruneMatcher         *URLMatcher
 	FilteredPruneMatcher *URLMatcher
-	QueryChanges         map[rule]url.Values
-	QueryMatcher         *URLMatcher
+	CensoredWords        map[string]bool
+
+	QueryChanges map[rule]url.Values
+	QueryMatcher *URLMatcher
 
 	CertFile string
 	KeyFile  string
@@ -132,6 +134,7 @@ func loadConfiguration() (*config, error) {
 	c.newActiveFlag("blockpage", "", "path to template for block page", c.loadBlockPage)
 	c.newActiveFlag("c", "/etc/redwood/redwood.conf", "configuration file path", c.readConfigFile)
 	c.newActiveFlag("categories", "/etc/redwood/categories", "path to configuration files for categories", c.loadCategories)
+	c.newActiveFlag("censored-words", "", "file of words to remove from pages", c.readCensoredWordsFile)
 	c.flags.DurationVar(&c.CertCache.TTL, "cert-cache-ttl", time.Hour, "how long to cache generated TLS certificates")
 	c.flags.StringVar(&c.CGIBin, "cgi-bin", "", "path to CGI files for built-in web server")
 	c.flags.DurationVar(&c.CloseIdleConnections, "close-idle-connections", time.Minute, "how often to close idle HTTP connections")
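Turning the feature on takes one line in the main configuration plus the word list itself. A sketch with invented paths (Redwood config files hold one flag-value pair per line, and readCensoredWordsFile lowercases each line it reads):

    censored-words /etc/redwood/censored-words

where /etc/redwood/censored-words lists one word per line:

    darn
    heck

Each entry should be a single word: censor compares one word at a time, so a multi-word line can never match. Lookups are case-insensitive, since both the file entries and the scanned words are lowercased before the map lookup.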
proxy.go: 21 additions & 3 deletions
@@ -429,7 +429,7 @@ func (h proxyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		}
 	}
 
-	modified = conf.pruneContent(r.URL, &content, cs, acls, &doc)
+	modified = conf.pruneContent(r.URL, &content, cs, &doc)
 	if modified {
 		cs = "utf-8"
 	}
@@ -438,8 +438,26 @@ func (h proxyHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	conf.scanContent(content, contentType, cs, tally)
 
 	if strings.Contains(contentType, "html") {
-		if conf.doFilteredPruning(r.URL, &content, cs, acls, &doc) {
-			modified = true
-		}
+		modifiedAfterScan := conf.doFilteredPruning(r.URL, content, cs, acls, &doc)
+
+		censorRule, _ := conf.ChooseACLCategoryAction(acls, categories, "censor-words")
+		if censorRule.Action == "censor-words" {
+			if doc == nil {
+				doc, _ = parseHTML(content, cs)
+			}
+			if censorHTML(doc, conf.CensoredWords) {
+				modifiedAfterScan = true
+			}
+		}
+
+		if modifiedAfterScan {
+			b := new(bytes.Buffer)
+			if err := html.Render(b, doc); err != nil {
+				log.Printf("Error rendering modified content from %s: %v", r.URL, err)
+			} else {
+				content = b.Bytes()
+				modified = true
+			}
+		}
 		if modified {
 			resp.Header.Set("Content-Type", "text/html; charset=utf-8")
prune.go: 5 additions & 12 deletions
@@ -157,7 +157,7 @@ var headSelector = cascadia.MustCompile("head")
 // re-renders the HTML. It returns true if the content was changed. The content
 // may be pre-parsed and passed in as tree; the final parse tree will be stored
 // in tree.
-func (c *config) pruneContent(URL *url.URL, content *[]byte, cs string, acls map[string]bool, tree **html.Node) bool {
+func (c *config) pruneContent(URL *url.URL, content *[]byte, cs string, tree **html.Node) bool {
 	URLMatches := c.PruneMatcher.MatchingRules(URL)
 	if len(URLMatches) == 0 {
 		return false
@@ -186,7 +186,8 @@ func (c *config) pruneContent(URL *url.URL, content *[]byte, cs string, acls map
 		return false
 	}
 
-	// Mark the new content as having a charset of UTF-8.
+	// Remove any meta tag that indicated the charset, since it will be
+	// re-rendered as UTF-8.
 	prune(*tree, metaCharsetSelector, toDelete)
 
 	// Actually delete the nodes that are to be removed.
@@ -224,14 +225,14 @@ func (c *config) pruneContent(URL *url.URL, content *[]byte, cs string, acls map
 	return true
 }
 
-func (c *config) doFilteredPruning(URL *url.URL, content *[]byte, cs string, acls map[string]bool, tree **html.Node) bool {
+func (c *config) doFilteredPruning(URL *url.URL, content []byte, cs string, acls map[string]bool, tree **html.Node) bool {
 	URLMatches := c.FilteredPruneMatcher.MatchingRules(URL)
 	if len(URLMatches) == 0 {
 		return false
 	}
 
 	if *tree == nil {
-		doc, err := parseHTML(*content, cs)
+		doc, err := parseHTML(content, cs)
 		if err != nil {
 			log.Printf("Error parsing html from %s: %s", URL, err)
 			return false
@@ -259,14 +260,6 @@ func (c *config) doFilteredPruning(URL *url.URL, content *[]byte, cs string, acl
 		n.Parent.RemoveChild(n)
 	}
 
-	b := new(bytes.Buffer)
-	err := html.Render(b, *tree)
-	if err != nil {
-		log.Printf("Error rendering modified content from %s: %s", URL, err)
-		return false
-	}
-
-	*content = b.Bytes()
 	return true
 }

testmode.go: 1 addition & 1 deletion
@@ -138,7 +138,7 @@ func runURLTest(u string) {
 	modified := false
 	_, cs, _ := charset.DetermineEncoding(content, resp.Header.Get("Content-Type"))
 	if strings.Contains(contentType, "html") {
-		modified = conf.pruneContent(URL, &content, cs, acls, &doc)
+		modified = conf.pruneContent(URL, &content, cs, &doc)
 	}
 	if modified {
 		cs = "utf-8"
