Initial code commit

aalto-speech · Aug 20, 2017 · 06f4678 · 06f4678
1 parent f983ea0
commit 06f4678
Show file tree

Hide file tree

Showing 6 changed files with 320 additions and 2 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Peter Smit
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -1,3 +1,22 @@
-# subword-kaldi
+# Create a subword Lexicon FST for Kaldi
+
+This is the code belonging to the paper [Improved subword modeling for WFST-based speech recognition](https://research.aalto.fi/en/publications/improved-subword-modeling-for-wfstbased-speech-recognition(ed43f22c-f5bd-45ad-99a7-628f82f2283c).html).
+
+
+For each subword marking style (word boundary marker, left-right marked, left-marked, right-marked) a separate script exists in `local/` that can create an L.fst.
+
+The standard way to use these scripts is:
+
+    extra=3
+    utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt --num-extra-phone-disambig-syms $extra data/subword_dict "<UNK>" data/subword_lang/local data/subword_lang
+
+    dir=data/subword_lang
+    tmpdir=data/subword_lang/local
+
+    # Overwrite L_disambig.fst
+    local/make_lfst_wb.py $(tail -n$extra $dir/phones/disambig.txt) < $tmpdir/lexiconp_disambig.txt | fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt --keep_isymbols=false --keep_osymbols=false | fstaddselfloops  $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | fstarcsort --sort_type=olabel > $dir/L_disambig.fst
+
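+For the other scripts (`make_lfst_l.py`, `make_lfst_r.py` and `make_lfst_lr.py`) the number of extra disambiguation symbols can be reduced to 1, because each of them takes a single disambiguation symbol as its argument.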
+
+
 
-The code accompanying the interspeech paper will be published before interspeech
diff --git a/local/make_lfst_l.py b/local/make_lfst_l.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+import sys
+import math
+
# Fixed states of the lexicon FST.  print_word() adds the word arcs between
# them; the connecting arcs are printed by the script code further down.
# 0, start state; pronunciations that end a word lead here
# 1, pronunciations that begin a word leave from here; reached from 0
#    directly or through SIL and state 4; the only final state
# 2, pronunciations that do not end a word lead here
# 3, pronunciations that do not begin a word leave from here; reached
#    from 2 through a disambiguation symbol
# 4, sits between the SIL arc and the disambiguation symbol leading to 1
#
# Id of the next unused state; print_word() takes intermediate states from it.
next_state = 5


def print_word(word, phones, start, end, from_state, to_state):
    """Print the arcs for one pronunciation of ``word`` to stdout.

    The pronunciation becomes a chain of arcs from ``from_state`` to
    ``to_state``, one tab-separated line ``src dst ilabel olabel`` per arc.
    The first arc outputs ``word`` and every later arc outputs ``<eps>``.
    Intermediate states are taken from the global ``next_state`` counter.

    Args:
        word: output label of the pronunciation.
        phones: phone symbols, optionally followed by disambiguation
            symbols (anything starting with ``#``).  Whatever follows the
            first ``_`` of a phone is dropped and replaced by a fresh
            position suffix.
        start: True if the first phone begins a word (suffix ``_B``).
        end: True if the last phone ends a word (suffix ``_E``).  A single
            phone that both begins and ends a word gets ``_S``; every other
            phone gets ``_I``.
        from_state: source state of the first arc.
        to_state: destination state of the last arc.

    Raises:
        ValueError: if a disambiguation symbol precedes a phone, or if no
            phone is left once the disambiguation symbols are removed.
    """
    global next_state

    symbols = list(phones)

    # Disambiguation symbols are only allowed as a suffix.  Split them off
    # so that they do not receive a position label.
    n_phones = len(symbols)
    while n_phones > 0 and symbols[n_phones - 1].startswith("#"):
        n_phones -= 1
    disambig = symbols[n_phones:]
    bare = symbols[:n_phones]

    # Explicit exceptions instead of asserts: these checks must also hold
    # when Python runs with -O.
    if any(p.startswith("#") for p in bare):
        raise ValueError(
            "disambiguation symbol inside the pronunciation of {!r}: {}".format(
                word, " ".join(symbols)))
    if not bare:
        raise ValueError(
            "pronunciation of {!r} contains no phones".format(word))

    labels = ["I"] * n_phones
    if start:
        labels[0] = "B"
    if end:
        labels[-1] = "E"
    if n_phones == 1 and start and end:
        labels[0] = "S"

    ilabels = [
        "{}_{}".format(p.split("_")[0], label) for p, label in zip(bare, labels)
    ] + disambig

    cur_state = from_state
    for symbol in ilabels[:-1]:
        print("{}\t{}\t{}\t{}".format(cur_state, next_state, symbol, word))
        cur_state = next_state
        next_state += 1
        # Only the first arc of the chain carries the word label.
        word = "<eps>"

    print("{}\t{}\t{}\t{}".format(cur_state, to_state, ilabels[-1], word))
+
def _lexicon_entries(lines):
    """Yield ``(word, phones)`` for every non-blank lexicon line.

    Each line is ``<word> <prob> <phone> [<phone> ...]``.  The probability
    column is read but not used.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    for lineno, line in enumerate(lines, 1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 3:
            raise ValueError(
                "line {}: expected '<word> <prob> <phone> ...', got {!r}".format(
                    lineno, line.rstrip("\n")))
        yield fields[0], fields[2:]


def main():
    """Write the left-marked lexicon FST in text form to stdout.

    ``sys.argv[1]`` is the disambiguation symbol used on the connecting
    arcs; the lexicon is read from stdin.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} <disambig-symbol> < lexiconp_disambig.txt".format(
            sys.argv[0]))
    disambig_symbol = sys.argv[1]

    # Two ways from state 0 to state 1, each with cost -log(0.5): through
    # SIL (followed by the disambiguation symbol) or directly.
    half_cost = -math.log(0.5)
    print("{}\t{}\t{}\t{}\t{}".format(0, 4, "SIL", "<eps>", half_cost))
    print("{}\t{}\t{}\t{}".format(4, 1, disambig_symbol, "<eps>"))
    print("{}\t{}\t{}\t{}\t{}".format(0, 1, "<eps>", "<eps>", half_cost))

    print("{}\t{}\t{}\t{}".format(2, 3, disambig_symbol, "<eps>"))

    for word, phones in _lexicon_entries(sys.stdin):
        # A leading "|" or "+" marks a subword that continues a word, so it
        # leaves from state 3 instead of state 1.
        if word.startswith(("|", "+")):
            start, si = False, 3
        else:
            start, si = True, 1

        # Every entry is emitted once as the end of a word ...
        print_word(word, phones, start, True, si, 0)
        if word == "<UNK>":
            continue
        # ... and, except for <UNK>, once more as a unit that is continued.
        print_word(word, phones, start, False, si, 2)

    # State 1 is the final state, with weight 0.
    print("{}\t0".format(1))


if __name__ == "__main__":
    main()
diff --git a/local/make_lfst_lr.py b/local/make_lfst_lr.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+import sys
+import math
+
# Fixed states of the lexicon FST.  print_word() adds the word arcs between
# them; the connecting arcs are printed by the script code further down.
# 0, start state; pronunciations that end a word lead here
# 1, pronunciations that begin a word leave from here; reached from 0
#    directly or through SIL and state 4; the only final state
# 2, pronunciations that do not end a word lead here
# 3, pronunciations that do not begin a word leave from here; reached
#    from 2 through a disambiguation symbol
# 4, sits between the SIL arc and the disambiguation symbol leading to 1
#
# Id of the next unused state; print_word() takes intermediate states from it.
next_state = 5


def print_word(word, phones, start, end, from_state, to_state):
    """Print the arcs for one pronunciation of ``word`` to stdout.

    The pronunciation becomes a chain of arcs from ``from_state`` to
    ``to_state``, one tab-separated line ``src dst ilabel olabel`` per arc.
    The first arc outputs ``word`` and every later arc outputs ``<eps>``.
    Intermediate states are taken from the global ``next_state`` counter.

    Args:
        word: output label of the pronunciation.
        phones: phone symbols, optionally followed by disambiguation
            symbols (anything starting with ``#``).  Whatever follows the
            first ``_`` of a phone is dropped and replaced by a fresh
            position suffix.
        start: True if the first phone begins a word (suffix ``_B``).
        end: True if the last phone ends a word (suffix ``_E``).  A single
            phone that both begins and ends a word gets ``_S``; every other
            phone gets ``_I``.
        from_state: source state of the first arc.
        to_state: destination state of the last arc.

    Raises:
        ValueError: if a disambiguation symbol precedes a phone, or if no
            phone is left once the disambiguation symbols are removed.
    """
    global next_state

    symbols = list(phones)

    # Disambiguation symbols are only allowed as a suffix.  Split them off
    # so that they do not receive a position label.
    n_phones = len(symbols)
    while n_phones > 0 and symbols[n_phones - 1].startswith("#"):
        n_phones -= 1
    disambig = symbols[n_phones:]
    bare = symbols[:n_phones]

    # Explicit exceptions instead of asserts: these checks must also hold
    # when Python runs with -O.
    if any(p.startswith("#") for p in bare):
        raise ValueError(
            "disambiguation symbol inside the pronunciation of {!r}: {}".format(
                word, " ".join(symbols)))
    if not bare:
        raise ValueError(
            "pronunciation of {!r} contains no phones".format(word))

    labels = ["I"] * n_phones
    if start:
        labels[0] = "B"
    if end:
        labels[-1] = "E"
    if n_phones == 1 and start and end:
        labels[0] = "S"

    ilabels = [
        "{}_{}".format(p.split("_")[0], label) for p, label in zip(bare, labels)
    ] + disambig

    cur_state = from_state
    for symbol in ilabels[:-1]:
        print("{}\t{}\t{}\t{}".format(cur_state, next_state, symbol, word))
        cur_state = next_state
        next_state += 1
        # Only the first arc of the chain carries the word label.
        word = "<eps>"

    print("{}\t{}\t{}\t{}".format(cur_state, to_state, ilabels[-1], word))
+
def _lexicon_entries(lines):
    """Yield ``(word, phones)`` for every non-blank lexicon line.

    Each line is ``<word> <prob> <phone> [<phone> ...]``.  The probability
    column is read but not used.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    for lineno, line in enumerate(lines, 1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 3:
            raise ValueError(
                "line {}: expected '<word> <prob> <phone> ...', got {!r}".format(
                    lineno, line.rstrip("\n")))
        yield fields[0], fields[2:]


def main():
    """Write the left-right-marked lexicon FST in text form to stdout.

    ``sys.argv[1]`` is the disambiguation symbol used on the connecting
    arcs; the lexicon is read from stdin.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} <disambig-symbol> < lexiconp_disambig.txt".format(
            sys.argv[0]))
    disambig_symbol = sys.argv[1]

    # Two ways from state 0 to state 1, each with cost -log(0.5): through
    # SIL (followed by the disambiguation symbol) or directly.
    half_cost = -math.log(0.5)
    print("{}\t{}\t{}\t{}\t{}".format(0, 4, "SIL", "<eps>", half_cost))
    print("{}\t{}\t{}\t{}".format(4, 1, disambig_symbol, "<eps>"))
    print("{}\t{}\t{}\t{}\t{}".format(0, 1, "<eps>", "<eps>", half_cost))

    print("{}\t{}\t{}\t{}".format(2, 3, disambig_symbol, "<eps>"))

    for word, phones in _lexicon_entries(sys.stdin):
        # A leading "+" means the subword continues a word (leave from 3).
        if word.startswith("+"):
            start, si = False, 3
        else:
            start, si = True, 1
        # A trailing "+" means the word goes on after this subword (go to 2).
        if word.endswith("+"):
            end, ei = False, 2
        else:
            end, ei = True, 0

        print_word(word, phones, start, end, si, ei)

    # State 1 is the final state, with weight 0.
    print("{}\t0".format(1))


if __name__ == "__main__":
    main()
diff --git a/local/make_lfst_r.py b/local/make_lfst_r.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+import sys
+import math
+
# Fixed states of the lexicon FST.  print_word() adds the word arcs between
# them; the connecting arcs are printed by the script code further down.
# 0, start state; pronunciations that end a word lead here
# 1, pronunciations that begin a word leave from here; reached from 0
#    directly or through SIL and state 4; the only final state
# 2, pronunciations that do not end a word lead here
# 3, pronunciations that do not begin a word leave from here; reached
#    from 2 through a disambiguation symbol
# 4, sits between the SIL arc and the disambiguation symbol leading to 1
#
# Id of the next unused state; print_word() takes intermediate states from it.
next_state = 5


def print_word(word, phones, start, end, from_state, to_state):
    """Print the arcs for one pronunciation of ``word`` to stdout.

    The pronunciation becomes a chain of arcs from ``from_state`` to
    ``to_state``, one tab-separated line ``src dst ilabel olabel`` per arc.
    The first arc outputs ``word`` and every later arc outputs ``<eps>``.
    Intermediate states are taken from the global ``next_state`` counter.

    Args:
        word: output label of the pronunciation.
        phones: phone symbols, optionally followed by disambiguation
            symbols (anything starting with ``#``).  Whatever follows the
            first ``_`` of a phone is dropped and replaced by a fresh
            position suffix.
        start: True if the first phone begins a word (suffix ``_B``).
        end: True if the last phone ends a word (suffix ``_E``).  A single
            phone that both begins and ends a word gets ``_S``; every other
            phone gets ``_I``.
        from_state: source state of the first arc.
        to_state: destination state of the last arc.

    Raises:
        ValueError: if a disambiguation symbol precedes a phone, or if no
            phone is left once the disambiguation symbols are removed.
    """
    global next_state

    symbols = list(phones)

    # Disambiguation symbols are only allowed as a suffix.  Split them off
    # so that they do not receive a position label.
    n_phones = len(symbols)
    while n_phones > 0 and symbols[n_phones - 1].startswith("#"):
        n_phones -= 1
    disambig = symbols[n_phones:]
    bare = symbols[:n_phones]

    # Explicit exceptions instead of asserts: these checks must also hold
    # when Python runs with -O.
    if any(p.startswith("#") for p in bare):
        raise ValueError(
            "disambiguation symbol inside the pronunciation of {!r}: {}".format(
                word, " ".join(symbols)))
    if not bare:
        raise ValueError(
            "pronunciation of {!r} contains no phones".format(word))

    labels = ["I"] * n_phones
    if start:
        labels[0] = "B"
    if end:
        labels[-1] = "E"
    if n_phones == 1 and start and end:
        labels[0] = "S"

    ilabels = [
        "{}_{}".format(p.split("_")[0], label) for p, label in zip(bare, labels)
    ] + disambig

    cur_state = from_state
    for symbol in ilabels[:-1]:
        print("{}\t{}\t{}\t{}".format(cur_state, next_state, symbol, word))
        cur_state = next_state
        next_state += 1
        # Only the first arc of the chain carries the word label.
        word = "<eps>"

    print("{}\t{}\t{}\t{}".format(cur_state, to_state, ilabels[-1], word))
+
def _lexicon_entries(lines):
    """Yield ``(word, phones)`` for every non-blank lexicon line.

    Each line is ``<word> <prob> <phone> [<phone> ...]``.  The probability
    column is read but not used.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    for lineno, line in enumerate(lines, 1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 3:
            raise ValueError(
                "line {}: expected '<word> <prob> <phone> ...', got {!r}".format(
                    lineno, line.rstrip("\n")))
        yield fields[0], fields[2:]


def main():
    """Write the right-marked lexicon FST in text form to stdout.

    ``sys.argv[1]`` is the disambiguation symbol used on the connecting
    arcs; the lexicon is read from stdin.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} <disambig-symbol> < lexiconp_disambig.txt".format(
            sys.argv[0]))
    disambig_symbol = sys.argv[1]

    # Two ways from state 0 to state 1, each with cost -log(0.5): through
    # SIL (followed by the disambiguation symbol) or directly.
    half_cost = -math.log(0.5)
    print("{}\t{}\t{}\t{}\t{}".format(0, 4, "SIL", "<eps>", half_cost))
    print("{}\t{}\t{}\t{}".format(4, 1, disambig_symbol, "<eps>"))
    print("{}\t{}\t{}\t{}\t{}".format(0, 1, "<eps>", "<eps>", half_cost))

    print("{}\t{}\t{}\t{}".format(2, 3, disambig_symbol, "<eps>"))

    for word, phones in _lexicon_entries(sys.stdin):
        # A trailing "|" or "+" marks a subword that is continued, so it
        # leads to state 2 instead of state 0.
        if word.endswith(("|", "+")):
            end, ei = False, 2
        else:
            end, ei = True, 0

        # Every entry is emitted once as the beginning of a word ...
        print_word(word, phones, True, end, 1, ei)
        if word == "<UNK>":
            continue
        # ... and, except for <UNK>, once more as a continuation.
        print_word(word, phones, False, end, 3, ei)

    # State 1 is the final state, with weight 0.
    print("{}\t0".format(1))


if __name__ == "__main__":
    main()
diff --git a/local/make_lfst_wb.py b/local/make_lfst_wb.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+import sys
+import math
+
# Fixed states of the lexicon FST.  print_word() adds the word arcs between
# them; the connecting arcs are printed by the script code further down.
# 0, start state; pronunciations that end a word lead here
# 1, pronunciations that begin a word leave from here; reached from 0
#    through an arc that outputs <w> (either a disambiguation symbol, or
#    SIL followed by state 4); the only final state
# 2, pronunciations that do not end a word lead here
# 3, pronunciations that do not begin a word leave from here; reached
#    from 2 through a disambiguation symbol
# 4, sits between the SIL arc and the disambiguation symbol leading to 1
#
# Id of the next unused state; print_word() takes intermediate states from it.
next_state = 5


def print_word(word, phones, start, end, from_state, to_state):
    """Print the arcs for one pronunciation of ``word`` to stdout.

    The pronunciation becomes a chain of arcs from ``from_state`` to
    ``to_state``, one tab-separated line ``src dst ilabel olabel`` per arc.
    The first arc outputs ``word`` and every later arc outputs ``<eps>``.
    Intermediate states are taken from the global ``next_state`` counter.

    Args:
        word: output label of the pronunciation.
        phones: phone symbols, optionally followed by disambiguation
            symbols (anything starting with ``#``).  Whatever follows the
            first ``_`` of a phone is dropped and replaced by a fresh
            position suffix.
        start: True if the first phone begins a word (suffix ``_B``).
        end: True if the last phone ends a word (suffix ``_E``).  A single
            phone that both begins and ends a word gets ``_S``; every other
            phone gets ``_I``.
        from_state: source state of the first arc.
        to_state: destination state of the last arc.

    Raises:
        ValueError: if a disambiguation symbol precedes a phone, or if no
            phone is left once the disambiguation symbols are removed.
    """
    global next_state

    symbols = list(phones)

    # Disambiguation symbols are only allowed as a suffix.  Split them off
    # so that they do not receive a position label.
    n_phones = len(symbols)
    while n_phones > 0 and symbols[n_phones - 1].startswith("#"):
        n_phones -= 1
    disambig = symbols[n_phones:]
    bare = symbols[:n_phones]

    # Explicit exceptions instead of asserts: these checks must also hold
    # when Python runs with -O.
    if any(p.startswith("#") for p in bare):
        raise ValueError(
            "disambiguation symbol inside the pronunciation of {!r}: {}".format(
                word, " ".join(symbols)))
    if not bare:
        raise ValueError(
            "pronunciation of {!r} contains no phones".format(word))

    labels = ["I"] * n_phones
    if start:
        labels[0] = "B"
    if end:
        labels[-1] = "E"
    if n_phones == 1 and start and end:
        labels[0] = "S"

    ilabels = [
        "{}_{}".format(p.split("_")[0], label) for p, label in zip(bare, labels)
    ] + disambig

    cur_state = from_state
    for symbol in ilabels[:-1]:
        print("{}\t{}\t{}\t{}".format(cur_state, next_state, symbol, word))
        cur_state = next_state
        next_state += 1
        # Only the first arc of the chain carries the word label.
        word = "<eps>"

    print("{}\t{}\t{}\t{}".format(cur_state, to_state, ilabels[-1], word))
+
def _lexicon_entries(lines):
    """Yield ``(word, phones)`` for every non-blank lexicon line.

    Each line is ``<word> <prob> <phone> [<phone> ...]``.  The probability
    column is read but not used.

    Raises:
        ValueError: if a non-blank line has fewer than three fields.
    """
    for lineno, line in enumerate(lines, 1):
        fields = line.split()
        if not fields:
            continue
        if len(fields) < 3:
            raise ValueError(
                "line {}: expected '<word> <prob> <phone> ...', got {!r}".format(
                    lineno, line.rstrip("\n")))
        yield fields[0], fields[2:]


def main():
    """Write the word-boundary-marker lexicon FST in text form to stdout.

    ``sys.argv[1:4]`` are three disambiguation symbols used on the
    connecting arcs; the lexicon is read from stdin.
    """
    if len(sys.argv) < 4:
        sys.exit(
            "usage: {} <disambig1> <disambig2> <disambig3> "
            "< lexiconp_disambig.txt".format(sys.argv[0]))
    disambig_symbol, disambig_symbol2, disambig_symbol3 = sys.argv[1:4]

    # Two ways from state 0 to state 1, each outputting <w> with cost
    # -log(0.5): through SIL (followed by the first disambiguation symbol)
    # or through the third disambiguation symbol alone.
    half_cost = -math.log(0.5)
    print("{}\t{}\t{}\t{}\t{}".format(0, 4, "SIL", "<w>", half_cost))
    print("{}\t{}\t{}\t{}".format(4, 1, disambig_symbol, "<eps>"))
    print("{}\t{}\t{}\t{}\t{}".format(0, 1, disambig_symbol3, "<w>", half_cost))

    print("{}\t{}\t{}\t{}".format(2, 3, disambig_symbol2, "<eps>"))

    for word, phones in _lexicon_entries(sys.stdin):
        # <w> is produced by the connecting arcs above, not by a lexicon arc.
        if word == "<w>":
            continue
        print_word(word, phones, True, True, 1, 0)
        # <UNK> only exists as a complete word.
        if word == "<UNK>":
            continue
        # Units carry no position marker here, so every other entry is also
        # emitted in the three remaining start/end combinations.
        print_word(word, phones, False, True, 3, 0)
        print_word(word, phones, True, False, 1, 2)
        print_word(word, phones, False, False, 3, 2)

    # State 1 is the final state, with weight 0.
    print("{}\t0".format(1))


if __name__ == "__main__":
    main()