From 06f4678fbc64ea94c3d5ca502631188ca421a092 Mon Sep 17 00:00:00 2001 From: Peter Smit Date: Sun, 20 Aug 2017 08:28:54 +0200 Subject: [PATCH] Initial code commit --- LICENSE | 21 +++++++++++++ README.md | 23 ++++++++++++-- local/make_lfst_l.py | 70 +++++++++++++++++++++++++++++++++++++++++++ local/make_lfst_lr.py | 70 +++++++++++++++++++++++++++++++++++++++++++ local/make_lfst_r.py | 69 ++++++++++++++++++++++++++++++++++++++++++ local/make_lfst_wb.py | 69 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 320 insertions(+), 2 deletions(-) create mode 100644 LICENSE create mode 100755 local/make_lfst_l.py create mode 100755 local/make_lfst_lr.py create mode 100755 local/make_lfst_r.py create mode 100755 local/make_lfst_wb.py diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..54e1b9d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 Peter Smit + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index 4c87d99..9a71fd4 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,22 @@ -# subword-kaldi +# Create a subword Lexicon FST for Kaldi + +This is the code belonging to the paper [Improved subword modeling for WFST-based speech recognition](https://research.aalto.fi/en/publications/improved-subword-modeling-for-wfstbased-speech-recognition(ed43f22c-f5bd-45ad-99a7-628f82f2283c).html). + + +For each subword marking style (word boundary marker, left-right marked, left-marked, right-marked) a separate script exists in `local/` that can create an L.fst. + +The standard way to use these scripts is: + + extra=3 + utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt --num-extra-phone-disambig-syms $extra data/subword_dict "" data/subword_lang/local data/subword_lang + + dir=data/subword_lang + tmpdir=data/subword_lang/local + + # Overwrite L_disambig.fst + local/make_lfst_wb.py $(tail -n$extra $dir/phones/disambig.txt) < $tmpdir/lexiconp_disambig.txt | fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt --keep_isymbols=false --keep_osymbols=false | fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | fstarcsort --sort_type=olabel > $dir/L_disambig.fst + +For the other scripts (l/r/lr-marked) the number of extra disambiguation symbols can be reduced to 1. + + -The code accompanying the interspeech paper will be published before interspeech diff --git a/local/make_lfst_l.py b/local/make_lfst_l.py new file mode 100755 index 0000000..376ca4b --- /dev/null +++ b/local/make_lfst_l.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +import sys +import math + +# Three base states. 
+# 0, start-state, all arcs from _E phones +# 1, all arcs to _B phones (connected with 1 through ) +# 2, all arcs from and to _I phones +next_state=5 + +def print_word(word, phones, start, end, from_state, to_state): + global next_state + cur_state = from_state + + phones = list(phones) + + disambig = [] + while len(phones) > 0 and phones[-1].startswith("#"): + disambig.insert(0,phones[-1]) + phones = phones[:-1] + + #make sure no disambig phones were hiding somewhere else in the sequence + assert not any(p.startswith("#") for p in phones) + + phones = [p.split('_')[0] for p in phones] + labels = ["I"] * len(phones) + + if start: + labels[0] = "B" + if end: + labels[-1] = "E" + if len(phones) == 1 and start and end: + labels[0] = "S" + + phones = ["{}_{}".format(p,l) for p,l in zip(phones, labels)] + disambig + + assert len(phones) > 0 + + while len(phones) > 1: + print("{}\t{}\t{}\t{}".format(cur_state,next_state,phones[0],word)) + cur_state = next_state + next_state += 1 + word = "" + phones = phones[1:] + + print("{}\t{}\t{}\t{}".format(cur_state,to_state,phones[0],word)) + +disambig_symbol = sys.argv[1] +print("{}\t{}\t{}\t{}\t{}".format(0,4,"SIL","", -math.log(0.5))) +print("{}\t{}\t{}\t{}".format(4,1,disambig_symbol ,"")) +print("{}\t{}\t{}\t{}\t{}".format(0,1,"","", -math.log(0.5))) + +print("{}\t{}\t{}\t{}".format(2,3,disambig_symbol ,"")) + + +for line in sys.stdin: + word, prob, phones = line.strip().split(None, 2) + phones = phones.split() + + assert len(phones) > 0 + start,si = True,1 + if word.startswith("|") or word.startswith("+"): + start,si = False,3 + + + print_word(word, phones, start, True, si, 0) + if word == "": continue + print_word(word, phones, start, False, si, 2) + +print("{}\t0".format(1)) diff --git a/local/make_lfst_lr.py b/local/make_lfst_lr.py new file mode 100755 index 0000000..3ba3ff0 --- /dev/null +++ b/local/make_lfst_lr.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +import sys +import math + +# Three base states. 
+# 0, start-state, all arcs from _E phones +# 1, all arcs to _B phones (connected with 1 through ) +# 2, all arcs from and to _I phones +next_state=5 + +def print_word(word, phones, start, end, from_state, to_state): + global next_state + cur_state = from_state + + phones = list(phones) + + disambig = [] + while len(phones) > 0 and phones[-1].startswith("#"): + disambig.insert(0,phones[-1]) + phones = phones[:-1] + + #make sure no disambig phones were hiding somewhere else in the sequence + assert not any(p.startswith("#") for p in phones) + + phones = [p.split('_')[0] for p in phones] + labels = ["I"] * len(phones) + + if start: + labels[0] = "B" + if end: + labels[-1] = "E" + if len(phones) == 1 and start and end: + labels[0] = "S" + + phones = ["{}_{}".format(p,l) for p,l in zip(phones, labels)] + disambig + + assert len(phones) > 0 + + while len(phones) > 1: + print("{}\t{}\t{}\t{}".format(cur_state,next_state,phones[0],word)) + cur_state = next_state + next_state += 1 + word = "" + phones = phones[1:] + + print("{}\t{}\t{}\t{}".format(cur_state,to_state,phones[0],word)) + +disambig_symbol = sys.argv[1] +print("{}\t{}\t{}\t{}\t{}".format(0,4,"SIL","", -math.log(0.5))) +print("{}\t{}\t{}\t{}".format(4,1,disambig_symbol ,"")) +print("{}\t{}\t{}\t{}\t{}".format(0,1,"","", -math.log(0.5))) + +print("{}\t{}\t{}\t{}".format(2,3,disambig_symbol ,"")) + +for line in sys.stdin: + word, prob, phones = line.strip().split(None, 2) + phones = phones.split() + + assert len(phones) > 0 + start,si = True,1 + end,ei = True,0 + + if word.startswith("+"): + start,si = False,3 + if word.endswith("+"): + end,ei = False,2 + + print_word(word, phones, start, end, si, ei) + +print("{}\t0".format(1)) diff --git a/local/make_lfst_r.py b/local/make_lfst_r.py new file mode 100755 index 0000000..55bd73a --- /dev/null +++ b/local/make_lfst_r.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +import sys +import math + +# Three base states. 
+# 0, start-state, all arcs from _E phones +# 1, all arcs to _B phones (connected with 1 through ) +# 2, all arcs from and to _I phones +next_state=5 + +def print_word(word, phones, start, end, from_state, to_state): + global next_state + cur_state = from_state + + phones = list(phones) + + disambig = [] + while len(phones) > 0 and phones[-1].startswith("#"): + disambig.insert(0,phones[-1]) + phones = phones[:-1] + + #make sure no disambig phones were hiding somewhere else in the sequence + assert not any(p.startswith("#") for p in phones) + + phones = [p.split('_')[0] for p in phones] + labels = ["I"] * len(phones) + + if start: + labels[0] = "B" + if end: + labels[-1] = "E" + if len(phones) == 1 and start and end: + labels[0] = "S" + + phones = ["{}_{}".format(p,l) for p,l in zip(phones, labels)] + disambig + + assert len(phones) > 0 + + while len(phones) > 1: + print("{}\t{}\t{}\t{}".format(cur_state,next_state,phones[0],word)) + cur_state = next_state + next_state += 1 + word = "" + phones = phones[1:] + + print("{}\t{}\t{}\t{}".format(cur_state,to_state,phones[0],word)) + +disambig_symbol = sys.argv[1] +print("{}\t{}\t{}\t{}\t{}".format(0,4,"SIL","", -math.log(0.5))) +print("{}\t{}\t{}\t{}".format(4,1,disambig_symbol ,"")) +print("{}\t{}\t{}\t{}\t{}".format(0,1,"","", -math.log(0.5))) + +print("{}\t{}\t{}\t{}".format(2,3,disambig_symbol ,"")) + + +for line in sys.stdin: + word, prob, phones = line.strip().split(None, 2) + phones = phones.split() + + assert len(phones) > 0 + end,ei = True,0 + if word.endswith("|") or word.endswith("+"): + end,ei = False, 2 + + print_word(word, phones, True, end, 1, ei) + if word == "": continue + print_word(word, phones, False, end, 3, ei) + +print("{}\t0".format(1)) diff --git a/local/make_lfst_wb.py b/local/make_lfst_wb.py new file mode 100755 index 0000000..58bba61 --- /dev/null +++ b/local/make_lfst_wb.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +import sys +import math + +# Three base states. 
+# 0, start-state, all arcs from _E phones +# 1, all arcs to _B phones (connected with 1 through ) +# 2, all arcs from and to _I phones +next_state=5 + +def print_word(word, phones, start, end, from_state, to_state): + global next_state + cur_state = from_state + + phones = list(phones) + + disambig = [] + while len(phones) > 0 and phones[-1].startswith("#"): + disambig.insert(0,phones[-1]) + phones = phones[:-1] + + #make sure no disambig phones were hiding somewhere else in the sequence + assert not any(p.startswith("#") for p in phones) + + phones = [p.split('_')[0] for p in phones] + labels = ["I"] * len(phones) + + if start: + labels[0] = "B" + if end: + labels[-1] = "E" + if len(phones) == 1 and start and end: + labels[0] = "S" + + phones = ["{}_{}".format(p,l) for p,l in zip(phones, labels)] + disambig + + assert len(phones) > 0 + + while len(phones) > 1: + print("{}\t{}\t{}\t{}".format(cur_state,next_state,phones[0],word)) + cur_state = next_state + next_state += 1 + word = "" + phones = phones[1:] + + print("{}\t{}\t{}\t{}".format(cur_state,to_state,phones[0],word)) + +disambig_symbol = sys.argv[1] +disambig_symbol2 = sys.argv[2] +disambig_symbol3 = sys.argv[3] +print("{}\t{}\t{}\t{}\t{}".format(0,4,"SIL","", -math.log(0.5))) +print("{}\t{}\t{}\t{}".format(4,1,disambig_symbol ,"")) +print("{}\t{}\t{}\t{}\t{}".format(0,1,disambig_symbol3,"", -math.log(0.5))) + +print("{}\t{}\t{}\t{}".format(2,3,disambig_symbol2,"")) + +for line in sys.stdin: + word, prob, phones = line.strip().split(None, 2) + phones = phones.split() + + assert len(phones) > 0 + if word == "": continue + print_word(word, phones, True, True, 1, 0) + if word == "": continue + print_word(word, phones, False, True, 3, 0) + print_word(word, phones, True, False, 1, 2) + print_word(word, phones, False, False, 3, 2) + +print("{}\t0".format(1))