Skip to content
This repository has been archived by the owner on Jan 17, 2018. It is now read-only.

Commit

Permalink
adding multilanguage support & support for non-ASCII charset
Browse files Browse the repository at this point in the history
  • Loading branch information
Oliviergg committed Apr 19, 2012
1 parent 14cd64a commit ec57929
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 10 deletions.
11 changes: 11 additions & 0 deletions lib/stuff-classifier/base.rb
@@ -1,3 +1,6 @@
# encoding: utf-8
require "lingua/stemmer"

class StuffClassifier::Base
include StuffClassifier::Tokenizer
attr_reader :name
Expand All @@ -6,6 +9,14 @@ def initialize(name, opts={})
@stemming = opts.key?(:stemming) ? opts[:stemming] : true
purge_state = opts[:purge_state]

if opts[:language]
@language=opts[:language]
else
@language="en"
end

@stemmer = Lingua::Stemmer.new(:language => @language)

@name = name
@wcount = {}
@ccount = {}
Expand Down
1 change: 1 addition & 0 deletions lib/stuff-classifier/bayes.rb
@@ -1,3 +1,4 @@
# encoding: utf-8

class StuffClassifier::Bayes < StuffClassifier::Base
# http://en.wikipedia.org/wiki/Naive_Bayes_classifier
Expand Down
24 changes: 22 additions & 2 deletions lib/stuff-classifier/stop_words.rb
@@ -1,6 +1,8 @@
# encoding: utf-8
require 'set'

StuffClassifier::STOP_WORDS = Set.new [
StuffClassifier::STOP_WORDS = {
"en" => Set.new([
'a', 'about', 'above', 'across', 'after', 'afterwards',
'again', 'against', 'all', 'almost', 'alone', 'along',
'already', 'also', 'although', 'always', 'am', 'among',
Expand Down Expand Up @@ -54,4 +56,22 @@
'whoever', 'whole', 'whom', 'whose', 'why', 'will',
'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
'yourself', 'yourselves'
]
]),
"fr" => Set.new(
["au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", "elle", "en", "et", "eux",
"il", "je", "la", "le", "leur", "lui", "ma", "mais", "me", "même", "mes", "moi", "mon",
"ne", "nos", "notre", "nous", "on", "ou", "par", "pas", "pour", "qu", "que", "qui", "sa",
"se", "ses", "son", "sur", "ta", "te", "tes", "toi", "ton", "tu", "un", "une", "vos", "votre",
"vous", "c", "d", "j", "l", "à", "m", "n", "s", "t", "y", "été", "étée", "étées",
"étés", "étant", "suis", "es", "est", "sommes", "êtes", "sont", "serai", "seras",
"sera", "serons", "serez", "seront", "serais", "serait", "serions", "seriez", "seraient",
"étais", "était", "étions", "étiez", "étaient", "fus", "fut", "fûmes", "fûtes",
"furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", "fusses", "fût",
"fussions", "fussiez", "fussent", "ayant", "eu", "eue", "eues", "eus", "ai", "as",
"avons", "avez", "ont", "aurai", "auras", "aura", "aurons", "aurez", "auront", "aurais",
"aurait", "aurions", "auriez", "auraient", "avais", "avait", "avions", "aviez", "avaient",
"eut", "eûmes", "eûtes", "eurent", "aie", "aies", "ait", "ayons", "ayez", "aient", "eusse",
"eusses", "eût", "eussions", "eussiez", "eussent", "ceci", "celà ", "cet", "cette", "ici",
"ils", "les", "leurs", "quel", "quels", "quelle", "quelles", "sans", "soi"
])
}
14 changes: 6 additions & 8 deletions lib/stuff-classifier/tokenizer.rb
@@ -1,4 +1,4 @@
require 'fast_stemmer'
# encoding: utf-8

module StuffClassifier::Tokenizer
attr_writer :stemming
Expand All @@ -8,7 +8,7 @@ def ignore_words=(value)
end

def ignore_words
@ignore_words || StuffClassifier::STOP_WORDS
@ignore_words || StuffClassifier::STOP_WORDS[@language]
end

def stemming?
Expand All @@ -20,15 +20,13 @@ def each_word(string)
return if string == ''

words = []

cnt = string.gsub(/['`]/, '')
cnt.split("\n").each do |line|
line_cnt = line.gsub(/[^a-zA-Z]+/, ' ')
line_cnt.split(/\s+/).each do |w|

string.split("\n").each do |line|
line.gsub(/\p{Word}+/).each do |w|
next if w == '' || ignore_words.member?(w.downcase)

if stemming?
w = w.stem.downcase
w = @stemmer.stem(w).downcase
next if ignore_words.member?(w)
else
w = w.downcase
Expand Down

0 comments on commit ec57929

Please sign in to comment.