From ba2dd1f3e7955c904a1acb8c46342e685ebd7922 Mon Sep 17 00:00:00 2001
From: Stephen Hardisty <shardisty@etsy.com>
Date: Thu, 1 Sep 2011 10:58:42 -0400
Subject: [PATCH] added new filter, put count and remove methods into text
 (what a class!!)

---
 lib/despamilator/filter/html_tags.rb         | 10 ++----
 lib/despamilator/filter/ip_address_url.rb    |  2 +-
 lib/despamilator/filter/mixed_case.rb        | 11 ++----
 lib/despamilator/filter/obfuscated_urls.rb   |  2 +-
 lib/despamilator/filter/prices.rb            |  2 +-
 lib/despamilator/filter/shouting.rb          |  2 +-
 lib/despamilator/filter/spammy_tlds.rb       |  2 +-
 lib/despamilator/filter/urls.rb              |  2 +-
 lib/despamilator/filter/weird_punctuation.rb | 38 ++++++++++++++++++++
 lib/despamilator/subject/text.rb             | 14 ++++++--
 spec/filters/script_tag_spec.rb              |  5 ++-
 spec/filters/weird_punctuation_spec.rb       | 18 ++++++++++
 spec/subject_text_spec.rb                    | 12 +++++++
 13 files changed, 92 insertions(+), 28 deletions(-)
 create mode 100644 lib/despamilator/filter/weird_punctuation.rb
 create mode 100644 spec/filters/weird_punctuation_spec.rb

diff --git a/lib/despamilator/filter/html_tags.rb b/lib/despamilator/filter/html_tags.rb
index c8d2962..a3f74cc 100644
--- a/lib/despamilator/filter/html_tags.rb
+++ b/lib/despamilator/filter/html_tags.rb
@@ -6,8 +6,8 @@ def parse subject
       text = subject.text.downcase
 
       html_tags.each do |tag|
-        opening_elements = number_of_matches_for(text, /<\s*#{tag}\W/)
-        closing_elements = number_of_matches_for(text, /\W#{tag}\s*\/>/)
+        opening_elements = text.count(/<\s*#{tag}\W/)
+        closing_elements = text.count(/\W#{tag}\s*\/>/)
 
         if opening_elements > 0 or closing_elements > 0
           safest_element_count = opening_elements > closing_elements ? opening_elements : closing_elements
@@ -124,12 +124,6 @@ def html_tags
 
     end
 
-    private
-
-    def number_of_matches_for text, regexp
-      text.scan(regexp).length
-    end
-
   end
 
 end
diff --git a/lib/despamilator/filter/ip_address_url.rb b/lib/despamilator/filter/ip_address_url.rb
index 0801476..a5b2cc8 100644
--- a/lib/despamilator/filter/ip_address_url.rb
+++ b/lib/despamilator/filter/ip_address_url.rb
@@ -15,7 +15,7 @@ def description
     def parse subject
       subject.register_match!({
           :score => 0.5, :filter => self
-      }) if subject.text.downcase.scan(/http:\/\/\d+\.\d+\.\d+\.\d+/).length > 0
+      }) if subject.text.downcase.count(/http:\/\/\d+\.\d+\.\d+\.\d+/) > 0
     end
 
   end
diff --git a/lib/despamilator/filter/mixed_case.rb b/lib/despamilator/filter/mixed_case.rb
index 2850edd..54e3a2c 100644
--- a/lib/despamilator/filter/mixed_case.rb
+++ b/lib/despamilator/filter/mixed_case.rb
@@ -11,18 +11,11 @@ def description
 
     def parse subject
       text = subject.text.without_uris
-      count = count_and_strip(text, /[a-z][A-Z]/)
-      count += count_and_strip(text, /[a-z][A-Z][a-z]/)
+      count = text.remove_and_count!(/[a-z][A-Z]/)
+      count += text.remove_and_count!(/[a-z][A-Z][a-z]/)
       subject.register_match!({:score => 0.1 * count, :filter => self}) if count > 0
     end
 
-    private
-
-    def count_and_strip text, regexp
-      count = text.scan(regexp).length
-      text.gsub!(regexp, '')
-      count
-    end
   end
 
 end
\ No newline at end of file
diff --git a/lib/despamilator/filter/obfuscated_urls.rb b/lib/despamilator/filter/obfuscated_urls.rb
index ca9c9e1..33b5949 100644
--- a/lib/despamilator/filter/obfuscated_urls.rb
+++ b/lib/despamilator/filter/obfuscated_urls.rb
@@ -21,7 +21,7 @@ def parse subject
     private
 
     def find_space_separated_parts text
-      text.scan(/www\s+\w+\s+com/).length
+      text.count(/www\s+\w+\s+com/)
     end
 
     def find_space_separated_characters text
diff --git a/lib/despamilator/filter/prices.rb b/lib/despamilator/filter/prices.rb
index 2f48065..b2447de 100644
--- a/lib/despamilator/filter/prices.rb
+++ b/lib/despamilator/filter/prices.rb
@@ -10,7 +10,7 @@ def description
     end
 
     def parse subject
-      price_count = subject.text.scan(/\$\s*\d+/).length
+      price_count = subject.text.count(/\$\s*\d+/)
       subject.register_match!({:score => 0.075 * price_count, :filter => self}) if price_count > 0
     end
 
diff --git a/lib/despamilator/filter/shouting.rb b/lib/despamilator/filter/shouting.rb
index 29eec60..26794c8 100644
--- a/lib/despamilator/filter/shouting.rb
+++ b/lib/despamilator/filter/shouting.rb
@@ -19,7 +19,7 @@ def parse subject
       return if text.length < 20
 
       uppercased = text.scan(/[A-Z][A-Z]+/).join.length
-      lowercased = text.scan(/[a-z]/).length
+      lowercased = text.count(/[a-z]/)
 
       if uppercased > 0
         subject.register_match!({
diff --git a/lib/despamilator/filter/spammy_tlds.rb b/lib/despamilator/filter/spammy_tlds.rb
index 191b8b5..4aab495 100644
--- a/lib/despamilator/filter/spammy_tlds.rb
+++ b/lib/despamilator/filter/spammy_tlds.rb
@@ -13,7 +13,7 @@ def description
     end
 
     def parse subject
-      matches = subject.text.scan(/\w{5,}\.(info|biz)\b/).length
+      matches = subject.text.count(/\w{5,}\.(info|biz)\b/)
       subject.register_match!({:score => 0.05 * matches, :filter => self}) if matches > 0
     end
 
diff --git a/lib/despamilator/filter/urls.rb b/lib/despamilator/filter/urls.rb
index 511e304..01bcba1 100644
--- a/lib/despamilator/filter/urls.rb
+++ b/lib/despamilator/filter/urls.rb
@@ -14,7 +14,7 @@ def description
 
     def parse subject
       text = subject.text.downcase.gsub(/http:\/\/\d+\.\d+\.\d+\.\d+/, '')
-      matches = text.scan(/http:\/\//).length
+      matches = text.count(/https?:\/\//)
       1.upto(matches > 2 ? 2 : matches) do
         subject.register_match!({:score => 0.4, :filter => self})
       end
diff --git a/lib/despamilator/filter/weird_punctuation.rb b/lib/despamilator/filter/weird_punctuation.rb
new file mode 100644
index 0000000..a82fe92
--- /dev/null
+++ b/lib/despamilator/filter/weird_punctuation.rb
@@ -0,0 +1,38 @@
+require 'despamilator/filter'
+
+module DespamilatorFilter
+
+  class WeirdPunctuation < Despamilator::Filter
+
+    def name
+      'Weird Punctuation'
+    end
+
+    def description
+      'Detects unusual use of punctuation.'
+    end
+
+    def parse subject
+      text = subject.text.without_uris
+      text.gsub!(/\w&\w/, '')
+      matches = text.remove_and_count!(/(?:\W|\s|^)(#{punctuation})/)
+      matches += text.remove_and_count!(/(#{punctuation})(#{punctuation})/)
+      matches += text.remove_and_count!(/(#{punctuation})$/)
+      matches += text.remove_and_count!(/(?:\W|\s|^)\d+(#{punctuation})/)
+
+      subject.register_match!({:score => 0.015 * matches, :filter => self}) if matches > 0
+    end
+
+    private
+
+    # Regexp alternation source ("a|b|c") of every escaped punctuation character,
+    # interpolated into the patterns used by #parse; memoized per filter instance.
+    def punctuation
+      @punctuation ||= %w{~ ` ! @ # $ % ^ & * _ - + = , / ? | \\ : ; ' "}.map do |punctuation_character|
+        Regexp.escape(punctuation_character)
+      end.join('|')
+    end
+
+  end
+
+end
diff --git a/lib/despamilator/subject/text.rb b/lib/despamilator/subject/text.rb
index 34cc598..450c98f 100644
--- a/lib/despamilator/subject/text.rb
+++ b/lib/despamilator/subject/text.rb
@@ -10,11 +10,21 @@ def initialize text
       end
 
       def without_uris
-        self.gsub(URI.regexp(['http', 'https', 'mailto', 'ftp']), '')
+        gsub(URI.regexp(['http', 'https', 'mailto', 'ftp']), '')
       end
 
       def words
-        self.split(/\W+/)
+        split(/\W+/)
+      end
+
+      def count pattern
+        scan(pattern).flatten.compact.length
+      end
+
+      def remove_and_count! pattern
+        # Tally the matches before stripping them in place; the tally is what
+        # gets returned, since gsub! yields nil when nothing was replaced.
+        count(pattern).tap { gsub!(pattern, '') }
+      end
 
     end
diff --git a/spec/filters/script_tag_spec.rb b/spec/filters/script_tag_spec.rb
index 05ef945..ff4568f 100644
--- a/spec/filters/script_tag_spec.rb
+++ b/spec/filters/script_tag_spec.rb
@@ -8,12 +8,11 @@
   a_single_match_of('<script>', should_score: 1)
   a_multiple_match_of('<script></script> <script></script>', should_score: 1)
 
-  describe "detecting various script tags" do
+  context "detecting various script tags" do
     ['<script type="whatever">', '<script></script>', '</script>', '<script>', "<script\n>"].each do |script_tag|
       [script_tag.upcase, script_tag.downcase].each do |script_tag|
         it "should detect '#{script_tag}' of a script tag" do
-          dspam = Despamilator.new(script_tag)
-          dspam.score.should == 1
+          parsing(script_tag).should have_score(1)
         end
       end
     end
diff --git a/spec/filters/weird_punctuation_spec.rb b/spec/filters/weird_punctuation_spec.rb
new file mode 100644
index 0000000..d2899ce
--- /dev/null
+++ b/spec/filters/weird_punctuation_spec.rb
@@ -0,0 +1,18 @@
+describe DespamilatorFilter::WeirdPunctuation do
+
+  the_name_should_be 'Weird Punctuation'
+  the_description_should_be 'Detects unusual use of punctuation.'
+
+  despamilator_should_apply_the_filter_for('^this^')
+
+  a_single_match_of('&gt', should_score: 0.015)
+  a_multiple_match_of('%D :-D &gt;:-[ 123, l 89.', should_score: 0.075)
+
+  it 'should ignore weird punctuation in urls' do
+    parsing('http://www.blah.com?x=1&y=z').should have_score(0)
+  end
+
+  it 'should ignore ampersands surrounded by letters' do
+    parsing('j&r').should have_score(0)
+  end
+end
\ No newline at end of file
diff --git a/spec/subject_text_spec.rb b/spec/subject_text_spec.rb
index 5694ba0..2bda723 100644
--- a/spec/subject_text_spec.rb
+++ b/spec/subject_text_spec.rb
@@ -20,4 +20,16 @@
     ).words.should == %w{hello there you rule}
   end
 
+  it 'should count the matches for a regular expression' do
+    Despamilator::Subject::Text.new(
+        'yXyXy'
+    ).count(/X/).should == 2
+  end
+
+  it 'should remove and count the matches for a regular expression' do
+    text = Despamilator::Subject::Text.new('yXyXy').dup
+    text.remove_and_count!(/X/).should == 2
+    text.should == 'yyy'
+  end
+
 end
\ No newline at end of file