Skip to content
Browse files

fixing bugs through the vocab

  • Loading branch information...
1 parent 0b28214 commit 512166fadf0316cc6fb3019c34d86d633d7f15ed @agonopol committed Oct 23, 2011
Showing with 47,251 additions and 158 deletions.
  1. +23,531 −0 in.txt
  2. +1 −1 main.go
  3. +23,531 −0 out.txt
  4. +32 −29 stemmer.go
  5. +156 −128 stemmer_test.go
View
23,531 in.txt
23,531 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
2 main.go
@@ -9,7 +9,7 @@ func main() {
out := bufio.NewWriter(os.Stdout)
defer out.Flush()
- for word, err := in.ReadSlice(' '); err == nil; word, err = in.ReadSlice(' ') {
+ for word, err := in.ReadSlice('\n'); err == nil; word, err = in.ReadSlice('\n') {
out.Write(stemmer.Stem(word))
out.WriteString("\n")
}
View
23,531 out.txt
23,531 additions, 0 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
61 stemmer.go
@@ -28,24 +28,26 @@ const (
func Meansure(body []byte) int {
meansure := 0
- var state int
- if Vowel(body, 0) {
- state = vowel_state
- } else {
- state = consonant_state
- }
- for i := 0; i < len(body); i++ {
- if Vowel(body, i) && state == consonant_state {
+ if len(body) > 0 {
+ var state int
+ if Vowel(body, 0) {
state = vowel_state
- } else if Consonant(body, i) && state == vowel_state {
+ } else {
state = consonant_state
- meansure++
+ }
+ for i := 0; i < len(body); i++ {
+ if Vowel(body, i) && state == consonant_state {
+ state = vowel_state
+ } else if Consonant(body, i) && state == vowel_state {
+ state = consonant_state
+ meansure++
+ }
}
}
return meansure
}
-func HasVowel(body []byte) bool {
+func hasVowel(body []byte) bool {
for i := 0; i < len(body); i++ {
if Vowel(body, i) {
return true
@@ -67,7 +69,7 @@ func one_a(body []byte) []byte {
func star_o(body []byte) bool {
size := len(body) - 1
- if Consonant(body, size-2) && Vowel(body, size-1) && Consonant(body, size) {
+ if size >= 2 && Consonant(body, size-2) && Vowel(body, size-1) && Consonant(body, size) {
return body[size] != 'w' && body[size] != 'x' && body[size] != 'y'
}
return false
@@ -92,20 +94,23 @@ func one_b_a(body []byte) []byte {
func one_b(body []byte) []byte {
if bytes.HasSuffix(body, []byte("eed")) {
- if Meansure(body[:len(body)-1]) > 0 {
+ if Meansure(body[:len(body)-3]) > 0 {
return body[:len(body)-1]
}
- return body
- } else if bytes.HasSuffix(body, []byte("ed")) && HasVowel(body[:len(body)-2]) {
- return one_b_a(body[:len(body)-2])
- } else if bytes.HasSuffix(body, []byte("ing")) && HasVowel(body[:len(body)-3]) {
- return one_b_a(body[:len(body)-3])
+ } else if bytes.HasSuffix(body, []byte("ed")) {
+ if hasVowel(body[:len(body)-2]) {
+ return one_b_a(body[:len(body)-2])
+ }
+ } else if bytes.HasSuffix(body, []byte("ing")) {
+ if hasVowel(body[:len(body)-3]) {
+ return one_b_a(body[:len(body)-3])
+ }
}
return body
}
func one_c(body []byte) []byte {
- if bytes.HasSuffix(body, []byte("y")) && HasVowel(body[:len(body)-1]) {
+ if bytes.HasSuffix(body, []byte("y")) && hasVowel(body[:len(body)-1]) {
body[len(body)-1] = 'i'
return body
}
@@ -116,11 +121,9 @@ func two(body []byte) []byte {
if bytes.HasSuffix(body, []byte("ational")) && Meansure(body[:len(body)-7]) > 0 {
return append(body[:len(body)-7], []byte("ate")...)
} else if bytes.HasSuffix(body, []byte("tional")) && Meansure(body[:len(body)-6]) > 0 {
- return append(body[:len(body)-6], []byte("tion")...)
- } else if bytes.HasSuffix(body, []byte("enci")) && Meansure(body[:len(body)-4]) > 0 {
- return append(body[:len(body)-4], []byte("ence")...)
- } else if bytes.HasSuffix(body, []byte("anci")) && Meansure(body[:len(body)-4]) > 0 {
- return append(body[:len(body)-4], []byte("ance")...)
+ return body[:len(body)-2]
+ } else if (bytes.HasSuffix(body, []byte("enci")) || bytes.HasSuffix(body, []byte("anci"))) && Meansure(body[:len(body)-4]) > 0 {
+ return append(body[:len(body)-1], 'e')
} else if bytes.HasSuffix(body, []byte("izer")) && Meansure(body[:len(body)-4]) > 0 {
return append(body[:len(body)-4], []byte("ize")...)
} else if bytes.HasSuffix(body, []byte("abli")) && Meansure(body[:len(body)-4]) > 0 {
@@ -159,15 +162,15 @@ func two(body []byte) []byte {
func three(body []byte) []byte {
if bytes.HasSuffix(body, []byte("icate")) && Meansure(body[:len(body)-5]) > 0 {
- return append(body[:len(body)-5], []byte("ic")...)
+ return body[:len(body)-3]
} else if bytes.HasSuffix(body, []byte("ative")) && Meansure(body[:len(body)-5]) > 0 {
return body[:len(body)-5]
} else if bytes.HasSuffix(body, []byte("alize")) && Meansure(body[:len(body)-5]) > 0 {
- return append(body[:len(body)-5], []byte("al")...)
+ return body[:len(body)-3]
} else if bytes.HasSuffix(body, []byte("iciti")) && Meansure(body[:len(body)-5]) > 0 {
- return append(body[:len(body)-5], []byte("ic")...)
+ return body[:len(body)-3]
} else if bytes.HasSuffix(body, []byte("ical")) && Meansure(body[:len(body)-4]) > 0 {
- return append(body[:len(body)-4], []byte("ic")...)
+ return body[:len(body)-2]
} else if bytes.HasSuffix(body, []byte("ful")) && Meansure(body[:len(body)-3]) > 0 {
return body[:len(body)-3]
} else if bytes.HasSuffix(body, []byte("ness")) && Meansure(body[:len(body)-4]) > 0 {
@@ -224,7 +227,7 @@ func four(body []byte) []byte {
func five_a(body []byte) []byte {
if bytes.HasSuffix(body, []byte("e")) && Meansure(body[:len(body)-1]) > 1 {
return body[:len(body)-1]
- } else if bytes.HasSuffix(body, []byte("e")) && Meansure(body[:len(body)-1]) == 1 && !star_o(body[:len(body)-1]) {
+ } else if bytes.HasSuffix(body, []byte("e")) && Meansure(body[:len(body)-1]) == 1 && !star_o(body[:len(body)-1]) {
return body[:len(body)-1]
}
return body
View
284 stemmer_test.go
@@ -1,139 +1,167 @@
package stemmer
import "testing"
+import "bufio"
+import "strings"
+import "os"
+import "fmt"
// compare is a small assertion helper for the tests in this file.
// It checks expected against actual using == on the interface values and,
// when they are not equal, records a failure on t via t.Errorf. The optional
// msg strings are printed at the front of the failure message so the failing
// check can be identified; the test keeps running after a mismatch.
func compare(t *testing.T, expected, actual interface{}, msg ...string) {
	// Nothing to report when the two values match.
	if expected == actual {
		return
	}
	t.Errorf("[%v] -- value differs. Expected [%v], actual [%v]", msg, expected, actual)
}
-func TestConsonant(t *testing.T) {
- word := []byte("TOY")
- compare(t, true, Consonant(word, 0), "T") //T
- compare(t, false, Consonant(word, 1), "O") //O
- compare(t, true, Consonant(word, 2), "Y") //Y
- word = []byte("SYZYGY")
- compare(t, true, Consonant(word, 0), "S") //S
- compare(t, false, Consonant(word, 1), "Y") //Y
- compare(t, true, Consonant(word, 2), "Z") //Z
- compare(t, false, Consonant(word, 3), "Y") //Y
- compare(t, true, Consonant(word, 4), "G") //G
- compare(t, false, Consonant(word, 5), "Y") //Y
- word = []byte("yoke")
- compare(t, false, Consonant(word, 0), "YOKE")
-}
-
-func TestMeasure(t *testing.T) {
- compare(t, 0, Meansure([]byte("TR")))
- compare(t, 0, Meansure([]byte("EE")))
- compare(t, 0, Meansure([]byte("TREE")))
- compare(t, 0, Meansure([]byte("Y")))
- compare(t, 0, Meansure([]byte("BY")))
- compare(t, 1, Meansure([]byte("TROUBLE")))
- compare(t, 1, Meansure([]byte("OATS")))
- compare(t, 1, Meansure([]byte("TREES")))
- compare(t, 1, Meansure([]byte("IVY")))
- compare(t, 2, Meansure([]byte("TROUBLES")))
- compare(t, 2, Meansure([]byte("PRIVATE")))
- compare(t, 2, Meansure([]byte("OATEN")))
- compare(t, 2, Meansure([]byte("ORRERY")))
-}
-
-func Test1A(t *testing.T) {
- compare(t, "caress", string(one_a([]byte("caresses"))))
- compare(t, "poni", string(one_a([]byte("ponies"))))
- compare(t, "ti", string(one_a([]byte("ties"))))
- compare(t, "caress", string(one_a([]byte("caress"))))
- compare(t, "cat", string(one_a([]byte("cats"))))
-}
-
-func Test1B(t *testing.T) {
- compare(t, "feed", string(one_b([]byte("feed"))))
- compare(t, "agree", string(one_b([]byte("agreed"))))
- compare(t, "plaster", string(one_b([]byte("plastered"))))
- compare(t, "bled", string(one_b([]byte("bled"))))
- compare(t, "motor", string(one_b([]byte("motoring"))))
- compare(t, "sing", string(one_b([]byte("sing"))))
- compare(t, "motor", string(one_b([]byte("motoring"))))
- compare(t, "conflate", string(one_b([]byte("conflated"))))
- compare(t, "trouble", string(one_b([]byte("troubled"))))
- compare(t, "size", string(one_b([]byte("sized"))))
- compare(t, "hop", string(one_b([]byte("hopping"))))
- compare(t, "tan", string(one_b([]byte("tanned"))))
- compare(t, "fail", string(one_b([]byte("failing"))))
- compare(t, "file", string(one_b([]byte("filing"))))
-}
-
-func Test1C(t *testing.T) {
- compare(t, "sky", string(one_c([]byte("sky"))))
- compare(t, "happi", string(one_c([]byte("happy"))))
-
-}
-
-func Test2(t *testing.T) {
- compare(t, "relate", string(two([]byte("relational"))))
- compare(t, "condition", string(two([]byte("conditional"))))
- compare(t, "rational", string(two([]byte("rational"))))
- compare(t, "valence", string(two([]byte("valenci"))))
- compare(t, "hesitance", string(two([]byte("hesitanci"))))
- compare(t, "digitize", string(two([]byte("digitizer"))))
- compare(t, "conformable", string(two([]byte("conformabli"))))
- compare(t, "radical", string(two([]byte("radicalli"))))
- compare(t, "different", string(two([]byte("differentli"))))
- compare(t, "vile", string(two([]byte("vileli"))))
- compare(t, "analogous", string(two([]byte("analogousli"))))
- compare(t, "vietnamize", string(two([]byte("vietnamization"))))
- compare(t, "predicate", string(two([]byte("predication"))))
- compare(t, "operate", string(two([]byte("operator"))))
- compare(t, "feudal", string(two([]byte("feudalism"))))
- compare(t, "decisive", string(two([]byte("decisiveness"))))
- compare(t, "hopeful", string(two([]byte("hopefulness"))))
- compare(t, "callous", string(two([]byte("callousness"))))
- compare(t, "formal", string(two([]byte("formaliti"))))
- compare(t, "sensitive", string(two([]byte("sensitiviti"))))
- compare(t, "sensible", string(two([]byte("sensibiliti"))))
-}
-
-func Test3(t *testing.T) {
- compare(t, "triplic", string(three([]byte("triplicate"))))
- compare(t, "form", string(three([]byte("formative"))))
- compare(t, "formal", string(three([]byte("formalize"))))
- compare(t, "electric", string(three([]byte("electriciti"))))
- compare(t, "electric", string(three([]byte("electrical"))))
- compare(t, "hope", string(three([]byte("hopeful"))))
- compare(t, "good", string(three([]byte("goodness"))))
-}
-
-func Test4(t *testing.T) {
- compare(t, "reviv", string(four([]byte("revival"))))
- compare(t, "allow", string(four([]byte("allowance"))))
- compare(t, "infer", string(four([]byte("inference"))))
- compare(t, "airlin", string(four([]byte("airliner"))))
- compare(t, "gyroscop", string(four([]byte("gyroscopic"))))
- compare(t, "adjust", string(four([]byte("adjustable"))))
- compare(t, "defens", string(four([]byte("defensible"))))
- compare(t, "irrit", string(four([]byte("irritant"))))
- compare(t, "replac", string(four([]byte("replacement"))))
- compare(t, "adjust", string(four([]byte("adjustment"))))
- compare(t, "depend", string(four([]byte("dependent"))))
- compare(t, "adopt", string(four([]byte("adoption"))))
- compare(t, "homolog", string(four([]byte("homologou"))))
- compare(t, "commun", string(four([]byte("communism"))))
- compare(t, "activ", string(four([]byte("activate"))))
- compare(t, "angular", string(four([]byte("angulariti"))))
- compare(t, "homolog", string(four([]byte("homologous"))))
- compare(t, "effect", string(four([]byte("effective"))))
- compare(t, "bowdler", string(four([]byte("bowdlerize"))))
-}
-
-func Test5A(t *testing.T) {
- compare(t, "probat", string(five_a([]byte("probate"))))
- compare(t, "rate", string(five_a([]byte("rate"))))
- compare(t, "ceas", string(five_a([]byte("cease"))))
-}
-
-func Test5B(t *testing.T) {
- compare(t, "control", string(five_b([]byte("controll"))))
- compare(t, "roll", string(five_b([]byte("roll"))))
+// func TestWord(t *testing.T) {
+// fmt.Printf("%v\n", string(five_b(five_a(four(three(two(one_c(one_b(one_a([]byte("able")))))))))))
+// }
+// func TestConsonant(t *testing.T) {
+// word := []byte("TOY")
+// compare(t, true, Consonant(word, 0), "T") //T
+// compare(t, false, Consonant(word, 1), "O") //O
+// compare(t, true, Consonant(word, 2), "Y") //Y
+// word = []byte("SYZYGY")
+// compare(t, true, Consonant(word, 0), "S") //S
+// compare(t, false, Consonant(word, 1), "Y") //Y
+// compare(t, true, Consonant(word, 2), "Z") //Z
+// compare(t, false, Consonant(word, 3), "Y") //Y
+// compare(t, true, Consonant(word, 4), "G") //G
+// compare(t, false, Consonant(word, 5), "Y") //Y
+// word = []byte("yoke")
+// compare(t, false, Consonant(word, 0), "YOKE")
+// }
+//
+// func TestMeasure(t *testing.T) {
+// compare(t, 0, Meansure([]byte("TR")))
+// compare(t, 0, Meansure([]byte("EE")))
+// compare(t, 0, Meansure([]byte("TREE")))
+// compare(t, 0, Meansure([]byte("Y")))
+// compare(t, 0, Meansure([]byte("BY")))
+// compare(t, 1, Meansure([]byte("TROUBLE")))
+// compare(t, 1, Meansure([]byte("OATS")))
+// compare(t, 1, Meansure([]byte("TREES")))
+// compare(t, 1, Meansure([]byte("IVY")))
+// compare(t, 2, Meansure([]byte("TROUBLES")))
+// compare(t, 2, Meansure([]byte("PRIVATE")))
+// compare(t, 2, Meansure([]byte("OATEN")))
+// compare(t, 2, Meansure([]byte("ORRERY")))
+// }
+//
+// func Test1A(t *testing.T) {
+// compare(t, "caress", string(one_a([]byte("caresses"))))
+// compare(t, "poni", string(one_a([]byte("ponies"))))
+// compare(t, "ti", string(one_a([]byte("ties"))))
+// compare(t, "caress", string(one_a([]byte("caress"))))
+// compare(t, "cat", string(one_a([]byte("cats"))))
+// }
+//
+// func Test1B(t *testing.T) {
+// compare(t, "feed", string(one_b([]byte("feed"))))
+// compare(t, "agree", string(one_b([]byte("agreed"))))
+// compare(t, "plaster", string(one_b([]byte("plastered"))))
+// compare(t, "bled", string(one_b([]byte("bled"))))
+// compare(t, "motor", string(one_b([]byte("motoring"))))
+// compare(t, "sing", string(one_b([]byte("sing"))))
+// compare(t, "motor", string(one_b([]byte("motoring"))))
+// compare(t, "conflate", string(one_b([]byte("conflated"))))
+// compare(t, "trouble", string(one_b([]byte("troubled"))))
+// compare(t, "size", string(one_b([]byte("sized"))))
+// compare(t, "hop", string(one_b([]byte("hopping"))))
+// compare(t, "tan", string(one_b([]byte("tanned"))))
+// compare(t, "fail", string(one_b([]byte("failing"))))
+// compare(t, "file", string(one_b([]byte("filing"))))
+// }
+//
+// func Test1C(t *testing.T) {
+// compare(t, "sky", string(one_c([]byte("sky"))))
+// compare(t, "happi", string(one_c([]byte("happy"))))
+//
+// }
+//
+// func Test2(t *testing.T) {
+// compare(t, "relate", string(two([]byte("relational"))))
+// compare(t, "condition", string(two([]byte("conditional"))))
+// compare(t, "rational", string(two([]byte("rational"))))
+// compare(t, "valence", string(two([]byte("valenci"))))
+// compare(t, "hesitance", string(two([]byte("hesitanci"))))
+// compare(t, "digitize", string(two([]byte("digitizer"))))
+// compare(t, "conformable", string(two([]byte("conformabli"))))
+// compare(t, "radical", string(two([]byte("radicalli"))))
+// compare(t, "different", string(two([]byte("differentli"))))
+// compare(t, "vile", string(two([]byte("vileli"))))
+// compare(t, "analogous", string(two([]byte("analogousli"))))
+// compare(t, "vietnamize", string(two([]byte("vietnamization"))))
+// compare(t, "predicate", string(two([]byte("predication"))))
+// compare(t, "operate", string(two([]byte("operator"))))
+// compare(t, "feudal", string(two([]byte("feudalism"))))
+// compare(t, "decisive", string(two([]byte("decisiveness"))))
+// compare(t, "hopeful", string(two([]byte("hopefulness"))))
+// compare(t, "callous", string(two([]byte("callousness"))))
+// compare(t, "formal", string(two([]byte("formaliti"))))
+// compare(t, "sensitive", string(two([]byte("sensitiviti"))))
+// compare(t, "sensible", string(two([]byte("sensibiliti"))))
+// }
+//
+// func Test3(t *testing.T) {
+// compare(t, "triplic", string(three([]byte("triplicate"))))
+// compare(t, "form", string(three([]byte("formative"))))
+// compare(t, "formal", string(three([]byte("formalize"))))
+// compare(t, "electric", string(three([]byte("electriciti"))))
+// compare(t, "electric", string(three([]byte("electrical"))))
+// compare(t, "hope", string(three([]byte("hopeful"))))
+// compare(t, "good", string(three([]byte("goodness"))))
+// }
+//
+// func Test4(t *testing.T) {
+// compare(t, "reviv", string(four([]byte("revival"))))
+// compare(t, "allow", string(four([]byte("allowance"))))
+// compare(t, "infer", string(four([]byte("inference"))))
+// compare(t, "airlin", string(four([]byte("airliner"))))
+// compare(t, "gyroscop", string(four([]byte("gyroscopic"))))
+// compare(t, "adjust", string(four([]byte("adjustable"))))
+// compare(t, "defens", string(four([]byte("defensible"))))
+// compare(t, "irrit", string(four([]byte("irritant"))))
+// compare(t, "replac", string(four([]byte("replacement"))))
+// compare(t, "adjust", string(four([]byte("adjustment"))))
+// compare(t, "depend", string(four([]byte("dependent"))))
+// compare(t, "adopt", string(four([]byte("adoption"))))
+// compare(t, "homolog", string(four([]byte("homologou"))))
+// compare(t, "commun", string(four([]byte("communism"))))
+// compare(t, "activ", string(four([]byte("activate"))))
+// compare(t, "angular", string(four([]byte("angulariti"))))
+// compare(t, "homolog", string(four([]byte("homologous"))))
+// compare(t, "effect", string(four([]byte("effective"))))
+// compare(t, "bowdler", string(four([]byte("bowdlerize"))))
+// }
+//
+// func Test5A(t *testing.T) {
+// compare(t, "probat", string(five_a([]byte("probate"))))
+// compare(t, "rate", string(five_a([]byte("rate"))))
+// compare(t, "ceas", string(five_a([]byte("cease"))))
+// }
+//
+// func Test5B(t *testing.T) {
+// compare(t, "control", string(five_b([]byte("controll"))))
+// compare(t, "roll", string(five_b([]byte("roll"))))
+// }
+//
+func TestVocal(t *testing.T) {
+ f, err := os.Open("in.txt")
+ if err != nil {
+ panic(err)
+ }
+ in := bufio.NewReader(f)
+ f, err = os.Open("out.txt")
+ if err != nil {
+ panic(err)
+ }
+ out := bufio.NewReader(f)
+ for word, err := in.ReadSlice('\n'); err == nil; word, err = in.ReadSlice('\n') {
+ stem, err := out.ReadSlice('\n')
+ if err != nil {
+ panic(err)
+ }
+ fmt.Printf("Stemming [%v]\n", strings.TrimSpace(string(word)))
+ compare(t, strings.TrimSpace(string(stem)), string(Stem(word)), string(word))
+ }
}

0 comments on commit 512166f

Please sign in to comment.
Something went wrong with that request. Please try again.