From ba2dd1f3e7955c904a1acb8c46342e685ebd7922 Mon Sep 17 00:00:00 2001 From: Stephen Hardisty Date: Thu, 1 Sep 2011 10:58:42 -0400 Subject: [PATCH] added new filter, put count and remove methods into text (what a class!!) --- lib/despamilator/filter/html_tags.rb | 10 ++---- lib/despamilator/filter/ip_address_url.rb | 2 +- lib/despamilator/filter/mixed_case.rb | 11 ++---- lib/despamilator/filter/obfuscated_urls.rb | 2 +- lib/despamilator/filter/prices.rb | 2 +- lib/despamilator/filter/shouting.rb | 2 +- lib/despamilator/filter/spammy_tlds.rb | 2 +- lib/despamilator/filter/urls.rb | 2 +- lib/despamilator/filter/weird_punctuation.rb | 38 ++++++++++++++++++++ lib/despamilator/subject/text.rb | 14 ++++++-- spec/filters/script_tag_spec.rb | 5 ++- spec/filters/weird_punctuation_spec.rb | 18 ++++++++++ spec/subject_text_spec.rb | 12 +++++++ 13 files changed, 92 insertions(+), 28 deletions(-) create mode 100644 lib/despamilator/filter/weird_punctuation.rb create mode 100644 spec/filters/weird_punctuation_spec.rb diff --git a/lib/despamilator/filter/html_tags.rb b/lib/despamilator/filter/html_tags.rb index c8d2962..a3f74cc 100644 --- a/lib/despamilator/filter/html_tags.rb +++ b/lib/despamilator/filter/html_tags.rb @@ -6,8 +6,8 @@ def parse subject text = subject.text.downcase html_tags.each do |tag| - opening_elements = number_of_matches_for(text, /<\s*#{tag}\W/) - closing_elements = number_of_matches_for(text, /\W#{tag}\s*\/>/) + opening_elements = text.count(/<\s*#{tag}\W/) + closing_elements = text.count(/\W#{tag}\s*\/>/) if opening_elements > 0 or closing_elements > 0 safest_element_count = opening_elements > closing_elements ? opening_elements : closing_elements @@ -124,12 +124,6 @@ def html_tags end - private - - def number_of_matches_for text, regexp - text.scan(regexp).length - end - end end diff --git a/lib/despamilator/filter/ip_address_url.rb b/lib/despamilator/filter/ip_address_url.rb index 0801476..a5b2cc8 100644 --- a/lib/despamilator/filter/ip_address_url.rb +++ b/lib/despamilator/filter/ip_address_url.rb @@ -15,7 +15,7 @@ def description def parse subject subject.register_match!({ :score => 0.5, :filter => self - }) if subject.text.downcase.scan(/http:\/\/\d+\.\d+\.\d+\.\d+/).length > 0 + }) if subject.text.downcase.count(/http:\/\/\d+\.\d+\.\d+\.\d+/) > 0 end end diff --git a/lib/despamilator/filter/mixed_case.rb b/lib/despamilator/filter/mixed_case.rb index 2850edd..54e3a2c 100644 --- a/lib/despamilator/filter/mixed_case.rb +++ b/lib/despamilator/filter/mixed_case.rb @@ -11,18 +11,11 @@ def description def parse subject text = subject.text.without_uris - count = count_and_strip(text, /[a-z][A-Z]/) - count += count_and_strip(text, /[a-z][A-Z][a-z]/) + count = text.remove_and_count!(/[a-z][A-Z]/) + count += text.remove_and_count!(/[a-z][A-Z][a-z]/) subject.register_match!({:score => 0.1 * count, :filter => self}) if count > 0 end - private - - def count_and_strip text, regexp - count = text.scan(regexp).length - text.gsub!(regexp, '') - count - end end end \ No newline at end of file diff --git a/lib/despamilator/filter/obfuscated_urls.rb b/lib/despamilator/filter/obfuscated_urls.rb index ca9c9e1..33b5949 100644 --- a/lib/despamilator/filter/obfuscated_urls.rb +++ b/lib/despamilator/filter/obfuscated_urls.rb @@ -21,7 +21,7 @@ def parse subject private def find_space_separated_parts text - text.scan(/www\s+\w+\s+com/).length + text.count(/www\s+\w+\s+com/) end def find_space_separated_characters text diff --git a/lib/despamilator/filter/prices.rb b/lib/despamilator/filter/prices.rb index 2f48065..b2447de 100644 --- a/lib/despamilator/filter/prices.rb +++ b/lib/despamilator/filter/prices.rb @@ -10,7 +10,7 @@ def description end def parse subject - price_count = subject.text.scan(/\$\s*\d+/).length + price_count = subject.text.count(/\$\s*\d+/) subject.register_match!({:score => 0.075 * price_count, :filter => self}) if price_count > 0 end diff --git a/lib/despamilator/filter/shouting.rb b/lib/despamilator/filter/shouting.rb index 29eec60..26794c8 100644 --- a/lib/despamilator/filter/shouting.rb +++ b/lib/despamilator/filter/shouting.rb @@ -19,7 +19,7 @@ def parse subject return if text.length < 20 uppercased = text.scan(/[A-Z][A-Z]+/).join.length - lowercased = text.scan(/[a-z]/).length + lowercased = text.count(/[a-z]/) if uppercased > 0 subject.register_match!({ diff --git a/lib/despamilator/filter/spammy_tlds.rb b/lib/despamilator/filter/spammy_tlds.rb index 191b8b5..4aab495 100644 --- a/lib/despamilator/filter/spammy_tlds.rb +++ b/lib/despamilator/filter/spammy_tlds.rb @@ -13,7 +13,7 @@ def description end def parse subject - matches = subject.text.scan(/\w{5,}\.(info|biz)\b/).length + matches = subject.text.count(/\w{5,}\.(info|biz)\b/) subject.register_match!({:score => 0.05 * matches, :filter => self}) if matches > 0 end diff --git a/lib/despamilator/filter/urls.rb b/lib/despamilator/filter/urls.rb index 511e304..01bcba1 100644 --- a/lib/despamilator/filter/urls.rb +++ b/lib/despamilator/filter/urls.rb @@ -14,7 +14,7 @@ def description def parse subject text = subject.text.downcase.gsub(/http:\/\/\d+\.\d+\.\d+\.\d+/, '') - matches = text.scan(/http:\/\//).length + matches = text.count(/https?:\/\//) 1.upto(matches > 2 ? 2 : matches) do subject.register_match!({:score => 0.4, :filter => self}) end diff --git a/lib/despamilator/filter/weird_punctuation.rb b/lib/despamilator/filter/weird_punctuation.rb new file mode 100644 index 0000000..a82fe92 --- /dev/null +++ b/lib/despamilator/filter/weird_punctuation.rb @@ -0,0 +1,38 @@ +require 'despamilator/filter' + +module DespamilatorFilter + + class WeirdPunctuation < Despamilator::Filter + + def name + 'Weird Punctuation' + end + + def description + 'Detects unusual use of punctuation.' + end + + def parse subject + text = subject.text.without_uris + text.gsub!(/\w&\w/, '') + matches = text.remove_and_count!(/(?:\W|\s|^)(#{punctuation})/) + matches += text.remove_and_count!(/(#{punctuation})(#{punctuation})/) + matches += text.remove_and_count!(/(#{punctuation})$/) + matches += text.remove_and_count!(/(?:\W|\s|^)\d+(#{punctuation})/) + + subject.register_match!({:score => 0.015 * matches, :filter => self}) if matches > 0 + end + + private + + def punctuation + @punctuation ||= %w{~ ` ! @ # $ % ^ & * _ - + = , / ? | \\ : ; ' "}.map do |punctuation_character| + Regexp.escape(punctuation_character) + end.join('|') + + @punctuation + end + + end + +end diff --git a/lib/despamilator/subject/text.rb b/lib/despamilator/subject/text.rb index 34cc598..450c98f 100644 --- a/lib/despamilator/subject/text.rb +++ b/lib/despamilator/subject/text.rb @@ -10,11 +10,21 @@ def initialize text end def without_uris - self.gsub(URI.regexp(['http', 'https', 'mailto', 'ftp']), '') + gsub(URI.regexp(['http', 'https', 'mailto', 'ftp']), '') end def words - self.split(/\W+/) + split(/\W+/) + end + + def count pattern + scan(pattern).flatten.compact.length + end + + def remove_and_count! pattern + count = count(pattern) + gsub!(pattern, '') + count end end diff --git a/spec/filters/script_tag_spec.rb b/spec/filters/script_tag_spec.rb index 05ef945..ff4568f 100644 --- a/spec/filters/script_tag_spec.rb +++ b/spec/filters/script_tag_spec.rb @@ -8,12 +8,11 @@ a_single_match_of(' ', should_score: 1) - describe "detecting various script tags" do + context "detecting various script tags" do ['', '', '