Skip to content

Commit

Permalink
almost done with first implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
Brian Muller committed Nov 29, 2010
1 parent 24ba779 commit bfd83b3
Show file tree
Hide file tree
Showing 8 changed files with 827 additions and 7 deletions.
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion README.rdoc
Expand Up @@ -30,4 +30,7 @@ Then, install ankusa:

# This will return a Hash with classes as keys and
# membership probabilities as values
puts c.classes "This is some spammy text"
puts c.classifications "This is some spammy text"

# get a list of all classes
puts c.classes
2 changes: 1 addition & 1 deletion ankusa.gemspec
Expand Up @@ -14,5 +14,5 @@ Gem::Specification.new do |s|
s.require_paths = ["lib"]
s.rubygems_version = "1.3.5"
s.add_dependency('hbaserb', '>= 0.0.1')
s.add_dependency('stemmer', '>= 1.0.1')
s.add_dependency('fast-stemmer', '>= 1.0.0')
end
2 changes: 2 additions & 0 deletions lib/ankusa.rb
@@ -1,2 +1,4 @@
$:.unshift File.dirname(__FILE__)
require 'ankusa/classifier'
require 'ankusa/hasher'

99 changes: 94 additions & 5 deletions lib/ankusa/classifier.rb
@@ -1,23 +1,112 @@
require 'stemmer'

module Ankusa

class Classifier
def initialize(hbase_client)
attr_reader :classnames

# Sets up a classifier backed by the given HBase client.
#
# hbase_client        - object used to look up and create the backing tables
# frequency_tablename - name of the per-word frequency table
# summary_tablename   - name of the per-class summary table
#
# Missing tables are created through init_tables, after which the class
# names returned by refresh_classnames are cached in @classnames.
def initialize(hbase_client, frequency_tablename="ankusa_word_frequencies", summary_tablename="ankusa_summary")
  @hbase = hbase_client
  @ftablename, @stablename = frequency_tablename, summary_tablename
  init_tables
  @classnames = refresh_classnames
end

# Records +text+ as one training document for class +klass+.
#
# klass - class identifier; used as the summary-table row key and, via to_s,
#         as the column qualifier in the frequency table
# text  - raw document text, tokenised by TextHash
#
# Every word's count is added to the frequency table, the class's word and
# document totals are bumped, and the class is remembered in @classnames.
def train(klass, text)
  th = TextHash.new(text)
  column = "classes:#{klass}"
  th.each { |word, count|
    freq_table.atomic_increment word, column, count
  }
  summary_table.atomic_increment klass, "totals:wordcount", th.word_count
  summary_table.atomic_increment klass, "totals:doccount"

  # refresh_classnames stores names as symbols; normalise here as well so a
  # String +klass+ is not tracked as a second, distinct class.
  name = klass.to_s.intern
  @classnames << name unless @classnames.include? name
end

# Reverses a previous training of +text+ under class +klass+: each word's
# count is subtracted from the frequency table, and the class's word and
# document totals are decremented accordingly.
def untrain(klass, text)
  hash = TextHash.new(text)
  hash.each_pair do |term, occurrences|
    freq_table.atomic_increment term, "classes:#{klass.to_s}", -occurrences
  end
  removed_words = -hash.word_count
  summary_table.atomic_increment klass, "totals:wordcount", removed_words
  summary_table.atomic_increment klass, "totals:doccount", -1
end

# Returns the most probable class for +text+, or nil when no classes are
# known.
#
# classifications yields one score per class where larger means more
# likely, so the winner is the entry with the maximum score (an ascending
# sort followed by +first+ would pick the least likely class instead).
def classify(text)
  best = classifications(text).max_by { |_klass, score| score }
  best && best.first
end

# Scores +text+ against every known class.
#
# Returns a Hash of class name => log score: the sum, over the words of
# +text+, of count * log(word count in class / total words in class), plus
# the log of the class prior (class doc count / total doc count).
#
# NOTE(review): get_counts is not defined in the visible source (see the
# todo below); it is assumed to return a Hash of class name => count for the
# given word - confirm once implemented.
def classifications(text)
  classes = {}
  result = {}
  @classnames.each { |k|
    classes[k] = NBClass.new k, summary_table, freq_table
    result[k] = 0
  }

  TextHash.new(text).each { |word, count|
    probs = get_counts(word)
    @classnames.each { |k|
      # a word occurring +count+ times contributes its log-likelihood
      # +count+ times
      result[k] += count * Math.log(probs[k] / classes[k].word_count)
    }
  }

  # doc_count_total scans the summary table, so compute it once up front
  total_docs = doc_count_total
  @classnames.each { |k|
    result[k] += Math.log(classes[k].doc_count / total_docs)
  }

  # todo
  # normalize logs to make probs
  # implement get_counts

  result
end

# Collects the class names currently stored in the summary table: scans the
# "totals" column family and interns the +row+ value of each yielded entry.
def refresh_classnames
  names = []
  summary_table.create_scanner("", "totals") do |entry|
    names.push(entry.row.intern)
  end
  names
end

# Deletes the frequency table and then the summary table, and clears the
# memoised table handles so the next access looks them up again.
def drop_tables
  freq_table.delete
  summary_table.delete
  @stable = @ftable = nil
end

# Wipes all training data: drops both tables, recreates them, and reloads
# the cached class names so @classnames no longer lists classes whose rows
# were just deleted.
def reset
  drop_tables
  init_tables
  @classnames = refresh_classnames
end

# Sums the "totals:doccount" column over every row yielded by a scan of the
# summary table and returns that sum (0 when nothing is yielded).
def doc_count_total
  sum = 0
  summary_table.create_scanner("", "totals:doccount") do |entry|
    sum += entry.columns["totals:doccount"].to_i64
  end
  sum
end

def classes(text)
protected
# Creates whichever backing table does not exist yet: the frequency table
# (column families "classes" and "total") first, then the summary table
# (column family "totals").
def init_tables
  @hbase.create_table(@ftablename, "classes", "total") unless @hbase.has_table?(@ftablename)
  @hbase.create_table(@stablename, "totals") unless @hbase.has_table?(@stablename)
end

# Memoised handle to the summary table; fetched from the HBase client on
# first use and reused until @stable is cleared.
def summary_table
  return @stable if @stable
  @stable = @hbase.get_table(@stablename)
end

# Memoised handle to the word-frequency table; fetched from the HBase client
# on first use and reused until @ftable is cleared.
def freq_table
  return @ftable if @ftable
  @ftable = @hbase.get_table(@ftablename)
end
end

Expand Down
33 changes: 33 additions & 0 deletions lib/ankusa/hasher.rb
@@ -0,0 +1,33 @@
require 'fast_stemmer'
require 'ankusa/stopwords'

module Ankusa

# A Hash of word (as a Symbol) => number of occurrences, built from free
# text. Words listed in Ankusa::STOPWORDS are ignored. Missing keys read as 0.
class TextHash < Hash
  # Total number of counted (non-stopword) words, repeats included.
  attr_reader :word_count

  # text - optional String to tokenise and count immediately
  def initialize(text=nil)
    super 0
    @word_count = 0
    add_text(text) unless text.nil?
  end

  # Tokenises +text+ and counts each resulting word. Returns self.
  def add_text(text)
    # replace dashes with spaces, then drop everything that is not a word
    # character, whitespace or an apostrophe, then split on whitespace.
    # Apostrophes are kept until add_word so that contractions ("don't",
    # "it's") can still be matched against the stopword list.
    words = text.tr('-', ' ').gsub(/[^\w\s']/, "").split
    words.each { |word| add_word word }
    self
  end

  # Counts a single word unless it is a stopword. The word is downcased and
  # stripped of apostrophes before being stored.
  def add_word(word)
    word = word.downcase
    return if Ankusa::STOPWORDS.include? word

    # strip apostrophes only after the stopword check
    word = word.delete("'")
    return if word.empty?

    @word_count += 1
    key = word.intern
    store key, fetch(key, 0) + 1
  end
end

end
15 changes: 15 additions & 0 deletions lib/ankusa/nbclass.rb
@@ -0,0 +1,15 @@
module Ankusa

# Per-class totals read from the summary table when the object is built.
class NBClass
  # doc_count / word_count - the class's "totals:doccount" and
  # "totals:wordcount" values, converted to floats
  attr_reader :doc_count, :word_count

  # name          - row key of the class in the summary table
  # summary_table - table queried for the two totals
  # freq_table    - stored for later use; not read here
  def initialize(name, summary_table, freq_table)
    @name, @summary_table, @freq_table = name, summary_table, freq_table
    @word_count = read_total("totals:wordcount")
    @doc_count = read_total("totals:doccount")
  end

  private

  # Reads the first value returned for +column+ on this class's row and
  # converts it via to_i64 and to_f.
  def read_total(column)
    @summary_table.get(@name, column).first.to_i64.to_f
  end
end

end
4 changes: 4 additions & 0 deletions lib/ankusa/stopwords.rb
@@ -0,0 +1,4 @@
module Ankusa
  # Words ignored when counting terms.
  # These are taken from MySQL - http://dev.mysql.com/tech-resources/articles/full-text-revealed.html
  # The list is frozen so the shared constant cannot be modified by accident,
  # and each word appears exactly once.
  STOPWORDS = %w[
    a able about above according accordingly across actually after afterwards again against ain't all allow
    allows almost alone along already also although always am among amongst an and another any anybody anyhow
    anyone anything anyway anyways anywhere apart appear appreciate appropriate are aren't around as aside ask
    asking associated at available away awfully
    be became because become becomes becoming been before beforehand behind being believe below beside besides
    best better between beyond both brief but by
    c'mon c's came can can't cannot cant cause causes certain certainly changes clearly co com come comes
    concerning consequently consider considering contain containing contains corresponding could couldn't
    course currently
    definitely described despite did didn't different do does doesn't doing don't done down downwards during
    each edu eg eight either else elsewhere enough entirely especially et etc even ever every everybody
    everyone everything everywhere ex exactly example except
    far few fifth first five followed following follows for former formerly forth four from further furthermore
    get gets getting given gives go goes going gone got gotten greetings
    had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here here's
    hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit however
    i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates inner
    insofar instead into inward is isn't it it'd it'll it's its itself
    just
    keep keeps kept know knows known
    last lately later latter latterly least less lest let let's like liked likely little look looking looks ltd
    mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my myself
    name namely nd near nearly necessary need needs neither never nevertheless new next nine no nobody non
    none noone nor normally not nothing novel now nowhere
    obviously of off often oh ok okay old on once one ones only onto or other others otherwise ought our ours
    ourselves out outside over overall own
    particular particularly per perhaps placed please plus possible presumably probably provides
    que quite qv
    rather rd re really reasonably regarding regardless regards relatively respectively right
    said same saw say saying says second secondly see seeing seem seemed seeming seems seen self selves
    sensible sent serious seriously seven several shall she should shouldn't since six so some somebody
    somehow someone something sometime sometimes somewhat somewhere soon sorry specified specify specifying
    still sub such sup sure
    t's take taken tell tends th than thank thanks thanx that that's thats the their theirs them themselves
    then thence there there's thereafter thereby therefore therein theres thereupon these they they'd they'll
    they're they've think third this thorough thoroughly those though three through throughout thru thus to
    together too took toward towards tried tries truly try trying twice two
    un under unfortunately unless unlikely until unto up upon us use used useful uses using usually
    value various very via viz vs
    want wants was wasn't way we we'd we'll we're we've welcome well went were weren't what what's whatever
    when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which
    while whither who who's whoever whole whom whose why will willing wish with within without won't wonder
    would wouldn't
    yes yet you you'd you'll you're you've your yours yourself yourselves
    zero
  ].freeze
end

0 comments on commit bfd83b3

Please sign in to comment.