Skip to content

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
...
  • 4 commits
  • 6 files changed
  • 0 commit comments
  • 2 contributors
View
21 lib/stuff-classifier/base.rb
@@ -1,3 +1,6 @@
+# encoding: utf-8
+require "lingua/stemmer"
+
class StuffClassifier::Base
include StuffClassifier::Tokenizer
attr_reader :name
@@ -6,6 +9,14 @@ def initialize(name, opts={})
@stemming = opts.key?(:stemming) ? opts[:stemming] : true
purge_state = opts[:purge_state]
+ if opts[:language]
+ @language=opts[:language]
+ else
+ @language="en"
+ end
+
+ @stemmer = Lingua::Stemmer.new(:language => @language)
+
@name = name
@wcount = {}
@ccount = {}
@@ -39,6 +50,11 @@ def cat_count(category)
@ccount[category] ? @ccount[category].to_f : 0.0
end
+ def total_in_cat(category)
+ # this has to be optimized
+ @wcount.find_all{|k,v| v.member? category}.map{|k,v| v[category]}.inject(0){|a,b| a+b}
+ end
+
# Adds up every value stored in @ccount and returns the total as a Float.
# An empty @ccount yields 0.0.
def total_count
  running_total = 0
  @ccount.each_value { |count| running_total += count }
  running_total.to_f
end
@@ -53,8 +69,9 @@ def train(category, text)
end
def word_prob(word, cat)
- return 0.0 if cat_count(cat) == 0
- word_count(word, cat) / cat_count(cat)
+ total_words_in_cat = total_in_cat(cat)
+ return 0.0 if total_words_in_cat == 0
+ word_count(word, cat).to_f / total_words_in_cat
end
def word_weighted_average(word, cat, opts={})
View
1 lib/stuff-classifier/bayes.rb
@@ -1,3 +1,4 @@
+# encoding: utf-8
class StuffClassifier::Bayes < StuffClassifier::Base
# http://en.wikipedia.org/wiki/Naive_Bayes_classifier
View
24 lib/stuff-classifier/stop_words.rb
@@ -1,6 +1,8 @@
+# encoding: utf-8
require 'set'
-StuffClassifier::STOP_WORDS = Set.new [
+StuffClassifier::STOP_WORDS = {
+ "en" => Set.new([
'a', 'about', 'above', 'across', 'after', 'afterwards',
'again', 'against', 'all', 'almost', 'alone', 'along',
'already', 'also', 'although', 'always', 'am', 'among',
@@ -54,4 +56,22 @@
'whoever', 'whole', 'whom', 'whose', 'why', 'will',
'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
'yourself', 'yourselves'
-]
+]),
+"fr" => Set.new(
+["au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et", "eux",
+ "il", "je", "la", "le", "leur", "lui", "ma", "mais", "me", "même", "mes", "moi", "mon",
+ "ne", "nos", "notre", "nous", "on", "ou", "par", "pas", "pour", "qu", "que", "qui", "sa",
+ "se", "ses", "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre",
+ "vous", "c", "d", "j", "l", "à", "m", "n", "s", "t", "y", "été", "étée", "étées",
+ "étés", "étant", "suis", "es", "est", "sommes", "êtes", "sont", "serai", "seras",
+ "sera", "serons", "serez", "seront", "serais", "serait", "serions", "seriez", "seraient",
+ "étais", "était", "étions", "étiez", "étaient", "fus", "fut", "fûmes", "fûtes",
+ "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses", "fût",
+ "fussions", "fussiez", "fussent", "ayant", "eu", "eue", "eues", "eus", "ai", "as",
+ "avons", "avez", "ont", "aurai", "auras", "aura", "aurons", "aurez", "auront", "aurais",
+ "aurait", "aurions", "auriez", "auraient", "avais", "avait", "avions", "aviez", "avaient",
+ "eut", "eûmes", "eûtes", "eurent", "aie", "aies", "ait", "ayons", "ayez", "aient", "eusse",
+ "eusses", "eût", "eussions", "eussiez", "eussent", "ceci", "celà ", "cet", "cette", "ici",
+ "ils", "les", "leurs", "quel", "quels", "quelle", "quelles", "sans", "soi"
+ ])
+}
View
14 lib/stuff-classifier/tokenizer.rb
@@ -1,4 +1,4 @@
-require 'fast_stemmer'
+# encoding: utf-8
module StuffClassifier::Tokenizer
attr_writer :stemming
@@ -8,7 +8,7 @@ def ignore_words=(value)
end
def ignore_words
- @ignore_words || StuffClassifier::STOP_WORDS
+ @ignore_words || StuffClassifier::STOP_WORDS[@language]
end
def stemming?
@@ -20,15 +20,13 @@ def each_word(string)
return if string == ''
words = []
-
- cnt = string.gsub(/['`]/, '')
- cnt.split("\n").each do |line|
- line_cnt = line.gsub(/[^a-zA-Z]+/, ' ')
- line_cnt.split(/\s+/).each do |w|
+
+ string.split("\n").each do |line|
+ line.gsub(/\p{Word}+/).each do |w|
next if w == '' || ignore_words.member?(w.downcase)
if stemming?
- w = w.stem.downcase
+ w = @stemmer.stem(w).downcase
next if ignore_words.member?(w)
else
w = w.downcase
View
2 stuff-classifier.gemspec
@@ -16,7 +16,7 @@ Gem::Specification.new do |s|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
s.require_paths = ["lib"]
- s.add_runtime_dependency "fast-stemmer", ">= 1.0"
+ s.add_runtime_dependency "ruby-stemmer"
s.add_runtime_dependency "sqlite3"
s.add_runtime_dependency "sequel"
# Msgpack has a habit of changing the format, so I'm setting its
View
15 test/test_001_tokenizer.rb
@@ -1,12 +1,9 @@
-require 'helper'
+require 'helper.rb'
class Test001Tokenizer < TestBase
before do
- tokenizer_cls = Class.new do
- include StuffClassifier::Tokenizer
- end
+ @tokenizer = StuffClassifier::Bayes.new("TEST")
- @tokenizer = tokenizer_cls.new
end
def test_simple_tokens
@@ -17,8 +14,8 @@ def test_simple_tokens
def test_with_stemming
@tokenizer.stemming = true
assert_equal(
- ["lot", "dog", "lot", "cat", "inform", "highwai"],
- @tokenizer.each_word('Lots of dogs, lots of cats! This is the information highway')
+ ["lot", "dog", "lot", "cat", "realli" ,"inform", "highway" ],
+ @tokenizer.each_word('Lots of dogs, lots of cats! This really is the information highway')
)
end
@@ -27,12 +24,12 @@ def test_complicated_tokens
accomplish. There is a class TestEval2, you can do test_eval2 =
TestEval2.new afterwards. And: class A ... end always yields nil, so
your output is ok I guess ;-)")
-
+
should_return = [
"really", "want", "accomplish", "class",
"testeval", "test", "eval", "testeval", "new", "class", "end",
"yields", "nil", "output", "ok", "guess"]
-
+
assert_equal should_return, words
end

No commit comments for this range

Something went wrong with that request. Please try again.