Merge branch 'release-2.0.4'

aalto-speech · Feb 15, 2018 · fccb6c5
2 parents a51746f + eaade39
commit fccb6c5
Show file tree

Hide file tree

Showing 10 changed files with 117 additions and 135 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2012-2017, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
+Copyright (c) 2012-2018, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -48,7 +48,7 @@
 
 # General information about the project.
 project = u'Morfessor'
-copyright = u'2017, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos'
+copyright = u'2018, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -57,7 +57,7 @@
 # The short X.Y version.
 version = '2.0'
 # The full version, including alpha/beta/rc tags.
-release = '2.0.3'
+release = '2.0.4'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/source/license.rst b/docs/source/license.rst
@@ -1,6 +1,6 @@
 License
 =======
-Copyright (c) 2012-2017, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
+Copyright (c) 2012-2018, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without

diff --git a/morfessor/__init__.py b/morfessor/__init__.py
@@ -10,12 +10,10 @@
            'BaselineModel', 'main', 'get_default_argparser', 'main_evaluation',
            'get_evaluation_argparser']
 
-__version__ = '2.0.3'
+__version__ = '2.0.4'
 __author__ = 'Sami Virpioja, Peter Smit, Stig-Arne Grönroos'
 __author_email__ = "morpho@aalto.fi"
 
-show_progress_bar = True
-
 _logger = logging.getLogger(__name__)
 
 
@@ -26,7 +24,8 @@ def get_version():
 # so that the package global names are available to the modules
 # when they are imported.
 
-from .baseline import BaselineModel, FixedCorpusWeight, AnnotationCorpusWeight, NumMorphCorpusWeight, MorphLengthCorpusWeight
+from .baseline import BaselineModel, FixedCorpusWeight, AnnotationCorpusWeight, \
+    NumMorphCorpusWeight, MorphLengthCorpusWeight
 from .cmd import main, get_default_argparser, main_evaluation, \
     get_evaluation_argparser
 from .exception import MorfessorException, ArgumentException

diff --git a/morfessor/baseline.py b/morfessor/baseline.py
@@ -87,6 +87,9 @@ def __init__(self, forcesplit_list=None, corpusweight=None,
         else:
             self.nosplit_re = re.compile(nosplit_re, re.UNICODE)
 
+        # Used only for (semi-)supervised learning
+        self.annotations = None
+
     def set_corpus_weight_updater(self, corpus_weight):
         if corpus_weight is None:
             self._corpus_weight_updater = FixedCorpusWeight(1.0)
@@ -156,7 +159,7 @@ def _set_compound_analysis(self, compound, parts, ptype='rbranch'):
             self._modify_construction_count(compound, count)
         elif ptype == 'flat':
             rcount, count = self._remove(compound)
-            splitloc = self._segmentation_to_splitloc(parts)
+            splitloc = self.segmentation_to_splitloc(parts)
             self._analyses[compound] = ConstrNode(rcount, count, splitloc)
             for constr in parts:
                 self._modify_construction_count(constr, count)
@@ -193,7 +196,7 @@ def _update_annotation_choices(self):
         # and add missing compounds also to the unannotated data
         constructions = collections.Counter()
         for compound, alternatives in self.annotations.items():
-            if not compound in self._analyses:
+            if compound not in self._analyses:
                 self._add_compound(compound, 1)
 
             analysis, cost = self._best_analysis(alternatives)
@@ -232,7 +235,7 @@ def _force_split(self, compound):
         clen = len(compound)
         j = 0
         parts = []
-        for i in range(1, clen):
+        for i in range(0, clen):
             if compound[i] in self.forcesplit_list:
                 if len(compound[j:i]) > 0:
                     parts.append(compound[j:i])
@@ -413,7 +416,7 @@ def _epoch_update(self, epoch_num):
         return forced_epochs
 
     @staticmethod
-    def _segmentation_to_splitloc(constructions):
+    def segmentation_to_splitloc(constructions):
         """Return a list of split locations for a segmented compound."""
         splitloc = []
         i = 0
@@ -559,7 +562,7 @@ def train_batch(self, algorithm='recursive', algorithm_params=(),
         """Train the model in batch fashion.
 
         The model is trained with the data already loaded into the model (by
-        using an existing model or calling one of the load\_ methods).
+        using an existing model or calling one of the load_ methods).
 
         In each iteration (epoch) all compounds in the training data are
         optimized once, in a random order. If applicable, corpus weight,
@@ -580,11 +583,11 @@ def train_batch(self, algorithm='recursive', algorithm_params=(),
         forced_epochs = max(1, self._epoch_update(epochs))
         newcost = self.get_cost()
         compounds = list(self.get_compounds())
-        _logger.info("Compounds in training data: %s types / %s tokens" %
-                     (len(compounds), self._corpus_coding.boundaries))
+        _logger.info("Compounds in training data: %s types / %s tokens",
+                     len(compounds), self._corpus_coding.boundaries)
 
         _logger.info("Starting batch training")
-        _logger.info("Epochs: %s\tCost: %s" % (epochs, newcost))
+        _logger.info("Epochs: %s\tCost: %s", epochs, newcost)
 
         while True:
             # One epoch
@@ -598,16 +601,15 @@ def train_batch(self, algorithm='recursive', algorithm_params=(),
                 else:
                     raise MorfessorException("unknown algorithm '%s'" %
                                              algorithm)
-                _logger.debug("#%s -> %s" %
-                              (w, _constructions_to_str(segments)))
+                _logger.debug("#%s -> %s", w, _constructions_to_str(segments))
             epochs += 1
 
-            _logger.debug("Cost before epoch update: %s" % self.get_cost())
+            _logger.debug("Cost before epoch update: %s", self.get_cost())
             forced_epochs = max(forced_epochs, self._epoch_update(epochs))
             oldcost = newcost
             newcost = self.get_cost()
 
-            _logger.info("Epochs: %s\tCost: %s" % (epochs, newcost))
+            _logger.info("Epochs: %s\tCost: %s", epochs, newcost)
             if (forced_epochs == 0 and
                     newcost >= oldcost - finish_threshold *
                     self._corpus_coding.boundaries):
@@ -665,7 +667,7 @@ def train_online(self, data, count_modifier=None, epoch_interval=10000,
         while more_tokens:
             self._epoch_update(epochs)
             newcost = self.get_cost()
-            _logger.info("Tokens processed: %s\tCost: %s" % (i, newcost))
+            _logger.info("Tokens processed: %s\tCost: %s", i, newcost)
 
             for _ in _progress(range(epoch_interval)):
                 try:
@@ -679,7 +681,7 @@ def train_online(self, data, count_modifier=None, epoch_interval=10000,
                     continue
 
                 if count_modifier is not None:
-                    if not w in counts:
+                    if w not in counts:
                         c = 0
                         counts[w] = 1
                         addc = 1
@@ -701,8 +703,7 @@ def train_online(self, data, count_modifier=None, epoch_interval=10000,
                 else:
                     raise MorfessorException("unknown algorithm '%s'" %
                                              algorithm)
-                _logger.debug("#%s: %s -> %s" %
-                              (i, w, _constructions_to_str(segments)))
+                _logger.debug("#%s: %s -> %s", i, w, _constructions_to_str(segments))
                 i += 1
 
             epochs += 1
@@ -712,7 +713,7 @@ def train_online(self, data, count_modifier=None, epoch_interval=10000,
 
         self._epoch_update(epochs)
         newcost = self.get_cost()
-        _logger.info("Tokens processed: %s\tCost: %s" % (i, newcost))
+        _logger.info("Tokens processed: %s\tCost: %s", i, newcost)
         return epochs, newcost
 
     def viterbi_segment(self, compound, addcount=1.0, maxlen=30):
@@ -777,8 +778,7 @@ def viterbi_segment(self, compound, addcount=1.0, maxlen=30):
                                    math.log(self._lexicon_coding.boundaries
                                             + addcount))
                                   - (self._lexicon_coding.boundaries
-                                     * math.log(self._lexicon_coding.boundaries
-                                                ))
+                                     * math.log(self._lexicon_coding.boundaries))
                                   + self._lexicon_coding.get_codelength(
                                       construction))
                                  / self._corpus_coding.weight)
@@ -973,9 +973,7 @@ def make_segment_only(self):
         doing so would throw an exception.
 
         """
-        self._num_compounds = len(self.get_compounds())
         self._segment_only = True
-
         self._analyses = {k: v for (k, v) in self._analyses.items()
                           if not v.splitloc}
 
@@ -994,7 +992,7 @@ def move_direction(cls, model, direction, epoch):
             else:
                 weight *= 1.0 / (1 + 2.0 / epoch)
             model.set_corpus_coding_weight(weight)
-            _logger.info("Corpus weight set to {}".format(weight))
+            _logger.info("Corpus weight set to %s", weight)
             return True
         return False
 
@@ -1039,12 +1037,12 @@ def _boundary_recall(cls, prediction, reference):
             best = -1
             for ref in ref_list:
                 # list of internal boundary positions
-                ref_b = set(BaselineModel._segmentation_to_splitloc(ref))
+                ref_b = set(BaselineModel.segmentation_to_splitloc(ref))
                 if len(ref_b) == 0:
                     best = 1.0
                     break
                 for pre in pre_list:
-                    pre_b = set(BaselineModel._segmentation_to_splitloc(pre))
+                    pre_b = set(BaselineModel.segmentation_to_splitloc(pre))
                     r = len(ref_b.intersection(pre_b)) / float(len(ref_b))
                     if r > best:
                         best = r
@@ -1077,10 +1075,8 @@ def _estimate_segmentation_dir(self, segments, annotations):
         undersegmentation, and 0 if no changes are required.
 
         """
-        pre, rec, f = self._bpr_evaluation([[x] for x in segments],
-                                           annotations)
-        _logger.info("Boundary evaluation: precision %.4f; recall %.4f" %
-                     (pre, rec))
+        pre, rec, f = self._bpr_evaluation([[x] for x in segments], annotations)
+        _logger.info("Boundary evaluation: precision %.4f; recall %.4f", pre, rec)
         if abs(pre - rec) < self.threshold:
             return 0
         elif rec > pre:
@@ -1099,7 +1095,7 @@ def update(self, model, epoch):
             return False
         cur_length = self.calc_morph_length(model)
 
-        _logger.info("Current morph-length: {}".format(cur_length))
+        _logger.info("Current morph-length: %s", cur_length)
 
         if (abs(self.morph_length - cur_length) / self.morph_length >
                 self.threshold):
@@ -1133,7 +1129,7 @@ def update(self, model, epoch):
             return False
         cur_morph_types = model._lexicon_coding.boundaries
 
-        _logger.info("Number of morph types: {}".format(cur_morph_types))
+        _logger.info("Number of morph types: %s", cur_morph_types)
 
 
         if (abs(self.num_morph_types - cur_morph_types) / self.num_morph_types
@@ -1345,8 +1341,7 @@ def update_weight(self):
         self.weight = (self.corpus_coding.weight *
                        float(self.corpus_coding.boundaries) / self.boundaries)
         if self.weight != old:
-            _logger.info("Corpus weight of annotated data set to %s"
-                         % self.weight)
+            _logger.info("Corpus weight of annotated data set to %s", self.weight)
 
     def get_cost(self):
         """Return the cost of the Annotation Corpus."""