82 changes: 32 additions & 50 deletions libindic/stemmer/__init__.py
Expand Up @@ -21,7 +21,6 @@
# santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in

import codecs
import os
import normalizer

Expand All @@ -41,19 +40,22 @@ def __init__(self, verbose=False):
os.path.dirname(__file__), 'data/rootwords.txt'))
self.dictionary = self.dictionary_file.readlines()
self.dictionary_file.close()
self.dictionary = [x.strip().decode('utf-8')
for x in self.dictionary]
try:
self.dictionary = [x.strip().decode('utf-8')
for x in self.dictionary]
except:
self.dictionary = [x.strip() for x in self.dictionary]

def singleencode(self, word):
    '''
    Normalize word to single encoding.

    Malayalam chillu letters can be written either as a three code point
    sequence (consonant + virama U+0D4D + zero width joiner U+200D) or as
    a single atomic chillu code point (U+0D7A to U+0D7F). Every sequence
    form found in ``word`` is replaced by its atomic form so that later
    string comparisons see only one spelling.

    ``word`` is expected to be a text (unicode) string. The normalized
    string is returned; text without any chillu sequence comes back
    unchanged.
    '''
    # The u'' prefix makes these real code points on both Python 2 and
    # Python 3. An unprefixed or double-escaped literal would be plain
    # backslash text and never match anything in the input.
    chillu_forms = (
        (u'\u0d15\u0d4d\u200d', u'\u0d7f'),  # KA  -> chillu K
        (u'\u0d23\u0d4d\u200d', u'\u0d7a'),  # NNA -> chillu NN
        (u'\u0d28\u0d4d\u200d', u'\u0d7b'),  # NA  -> chillu N
        (u'\u0d30\u0d4d\u200d', u'\u0d7c'),  # RA  -> chillu RR
        (u'\u0d32\u0d4d\u200d', u'\u0d7d'),  # LA  -> chillu L
        (u'\u0d33\u0d4d\u200d', u'\u0d7e'),  # LLA -> chillu LL
    )
    for sequence, atomic in chillu_forms:
        word = word.replace(sequence, atomic)
    return word
Expand All @@ -76,8 +78,6 @@ def stem(self, text):
word = words[word_iter]
word = self.trim(word)
word = word.strip('!,.?:')
word_length = len(word)
suffix_pos_itr = 2
try:
result = self.trim(word).decode('utf-8')
except:
Expand Down Expand Up @@ -107,11 +107,11 @@ def stem(self, text):
suffix = result[counter:] # Right to left suffix stripping
if suffix in self.rulesDict:
if self.verbose:
print(
print((
"\t Satisfying rule found : ",
suffix,
" = ",
self.rulesDict[suffix])
self.rulesDict[suffix]))
result = result[:counter] + self.rulesDict[suffix]
# A satisfying rule found, continue stemming.
found = True
Expand All @@ -127,46 +127,28 @@ def stem(self, text):

def LoadRules(self):
    '''
    Read the stemming rules file and return it as a dictionary.

    The file named by ``self.rules_file`` holds one
    ``suffix = replacement`` rule per line. Either side may be wrapped
    in single or double quotes, and everything from a ``#`` to the end
    of the line is treated as a comment. Both sides are passed through
    ``self.singleencode`` before being stored.

    Blank lines, comment-only lines, lines that do not contain exactly
    one ``=`` and lines with an empty left-hand side are skipped.

    Returns a dict mapping each suffix to its replacement. The
    replacement may be an empty string.
    '''
    # io.open decodes to text the same way on Python 2 and Python 3, so
    # no per-item decode()/except fallback is needed.
    import io

    rules_dict = {}
    # errors='ignore' keeps loading best-effort: undecodable bytes are
    # dropped instead of aborting the whole rules file.
    with io.open(self.rules_file, encoding='utf-8',
                 errors='ignore') as rules_file:
        for raw_line in rules_file:
            # Drop any trailing comment, then surrounding whitespace.
            line = raw_line.split('#')[0].strip()
            # A rule has exactly one separator; this also rejects
            # blank and comment-only lines.
            if line.count('=') != 1:
                continue
            lhs, rhs = [side.strip().strip('"').strip("'")
                        for side in line.split('=')]
            if not lhs:
                # An empty suffix cannot identify anything to strip.
                continue
            rules_dict[self.singleencode(lhs)] = self.singleencode(rhs)
    return rules_dict

def trim(self, word):
Expand Down
9 changes: 6 additions & 3 deletions libindic/stemmer/tests/tests_stemmer.py
Expand Up @@ -2,15 +2,15 @@
# -*- coding: utf-8 -*-

from testtools import TestCase
from libindic.stemmer import Malayalam as Stemmer
from libindic import stemmer
import collections


class MalayalamStemmerTest(TestCase):

def setUp(self):
    """Create the Malayalam stemmer used by each test, with printing off."""
    super(MalayalamStemmerTest, self).setUp()
    # Construct the stemmer once, through the module import; building a
    # second instance only to overwrite it is wasted work.
    self.stemmer = stemmer.Malayalam()
    # Tests print their expected/obtained values only when this is True.
    self.verbosity = False

def test_accusative(self):
Expand Down Expand Up @@ -43,7 +43,10 @@ def test_accusative(self):
word = self.stemmer.singleencode(word)
obtained = self.stemmer.stem(word)[word]
if self.verbosity:
print(expected, obtained)
print("\t", expected)
print("\t", obtained)
print("\t", type(expected))
print("\t", type(obtained))
assert obtained == expected

def test_conjuctive(self):
Expand Down