#!/usr/bin/ruby
require 'rubygems'
require 'open-uri'
# The following code is taken from the Ruby Porter Stemming Algorithm implementation
# found at: http://tartarus.org/~martin/PorterStemmer/
#
# Porter stemmer in Ruby, packaged as a mixin.
#
# This is the Porter stemming algorithm, ported to Ruby from the
# version coded up in Perl. It's easy to follow against the rules
# in the original paper in:
#
# Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
# no. 3, pp 130-137,
#
# See also http://www.tartarus.org/~martin/PorterStemmer
#
# Send comments to raypereda@hotmail.com
#
# The including class must respond to +dup+ and +to_str+. All patterns are
# written for lower-case ASCII letters; other characters are simply treated
# as consonants by the character classes below.
#
module Stemmable
  # Step 2 rewrites: suffix => replacement (applied when the stem has m>0).
  STEP_2_LIST = {
    'ational' => 'ate', 'tional' => 'tion', 'enci' => 'ence', 'anci' => 'ance',
    'izer' => 'ize', 'bli' => 'ble',
    'alli' => 'al', 'entli' => 'ent', 'eli' => 'e', 'ousli' => 'ous',
    'ization' => 'ize', 'ation' => 'ate',
    'ator' => 'ate', 'alism' => 'al', 'iveness' => 'ive', 'fulness' => 'ful',
    'ousness' => 'ous', 'aliti' => 'al',
    'iviti' => 'ive', 'biliti' => 'ble', 'logi' => 'log'
  }.freeze

  # Step 3 rewrites: suffix => replacement (applied when the stem has m>0).
  STEP_3_LIST = {
    'icate' => 'ic', 'ative' => '', 'alize' => 'al', 'iciti' => 'ic',
    'ical' => 'ic', 'ful' => '', 'ness' => ''
  }.freeze

  # Matches any step 2 suffix at the end of the word. Built from the hash keys
  # so the table and the pattern cannot drift apart. Because the pattern is
  # anchored at the end, the leftmost (i.e. longest) suffix wins regardless of
  # the order of the alternatives.
  SUFFIX_1_REGEXP = /(#{STEP_2_LIST.keys.join('|')})$/

  # Matches any step 3 suffix at the end of the word (same construction).
  STEP_3_REGEXP = /(#{STEP_3_LIST.keys.join('|')})$/

  # Suffixes that step 4 strips outright when the stem has m>1.
  SUFFIX_2_REGEXP = /(
    al | ance | ence | er | ic | able | ible | ant | ement |
    ment | ent | ou | ism | ate | iti | ous | ive | ize)$/x

  C = '[^aeiou]'.freeze              # consonant
  V = '[aeiouy]'.freeze              # vowel
  CC = "#{C}(?>[^aeiouy]*)".freeze   # consonant sequence
  VV = "#{V}(?>[aeiou]*)".freeze     # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/              # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/     # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/    # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/           # vowel in stem

  # A whole stem of the shape consonant-sequence, vowel, single consonant
  # other than w, x or y. Used in step 1b (restore a final "e") and in
  # step 5 (keep a final "e").
  CVC_STEM = /^#{CC}#{V}[^aeiouwxy]$/

  # Returns the Porter stem of the receiver as a new String.
  #
  # The receiver is never modified: all work happens on a copy. Words shorter
  # than three characters are returned (as a copy) unchanged.
  def stem_porter
    # make a copy of the given object and convert it to a string.
    w = dup.to_str
    return w if w.length < 3

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = 'Y' if w.start_with?('y')

    # Step 1a: plurals ("sses" -> "ss", "ies" -> "i", "xs" -> "x").
    m = /(ss|i)es$/.match(w) || /([^s])s$/.match(w)
    w = m.pre_match + m[1] if m

    # Step 1b: "eed" -> "ee" when m>0; otherwise strip "ed"/"ing" when the
    # remaining stem contains a vowel, then tidy up the new ending.
    if (m = /eed$/.match(w))
      w.chop! if m.pre_match =~ MGR0
    elsif (m = /(ed|ing)$/.match(w))
      stem = m.pre_match
      if stem =~ VOWEL_IN_STEM
        w = stem
        case w
        when /(at|bl|iz)$/ then w << 'e'
        when /([^aeiouylsz])\1$/ then w.chop! # undouble the final consonant
        when CVC_STEM then w << 'e'
        end
      end
    end

    # Step 1c: final "y" -> "i" when the stem contains a vowel.
    if (m = /y$/.match(w))
      stem = m.pre_match
      w = stem + 'i' if stem =~ VOWEL_IN_STEM
    end

    # Step 2
    if (m = SUFFIX_1_REGEXP.match(w))
      stem = m.pre_match
      w = stem + STEP_2_LIST[m[1]] if stem =~ MGR0
    end

    # Step 3
    if (m = STEP_3_REGEXP.match(w))
      stem = m.pre_match
      w = stem + STEP_3_LIST[m[1]] if stem =~ MGR0
    end

    # Step 4: "ion" is only tried when no other step 4 suffix is present, and
    # only after "s" or "t" (which stays part of the stem).
    if (m = SUFFIX_2_REGEXP.match(w))
      stem = m.pre_match
      w = stem if stem =~ MGR1
    elsif (m = /(s|t)(ion)$/.match(w))
      stem = m.pre_match + m[1]
      w = stem if stem =~ MGR1
    end

    # Step 5: drop a final "e" (m>1, or m=1 unless the stem is CVC-shaped),
    # then reduce a final "ll" to "l" when m>1.
    if (m = /e$/.match(w))
      stem = m.pre_match
      w = stem if stem =~ MGR1 || (stem =~ MEQ1 && stem !~ CVC_STEM)
    end
    w.chop! if w =~ /ll$/ && w =~ MGR1

    # and turn initial Y back to y
    w[0] = 'y' if w.start_with?('Y')
    w
  end

  #
  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  #
  alias stem stem_porter
end
# Mix the Stemmable module into String so that every string responds to
# stem_porter and its alias stem.
String.send(:include, Stemmable)
# End of Porter stemmer source.