Skip to content

Commit

Permalink
Merge branch 'release-2.0.4'
Browse files: browse the repository at this point in the history
  • Loading branch information
psmit committed Feb 15, 2018
2 parents (a51746f + eaade39); commit fccb6c5
Show file tree
Hide file tree
Showing 10 changed files with 117 additions and 135 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Copyright (c) 2012-2017, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
Copyright (c) 2012-2018, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

# General information about the project.
project = u'Morfessor'
copyright = u'2017, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos'
copyright = u'2018, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
Expand All @@ -57,7 +57,7 @@
# The short X.Y version.
version = '2.0'
# The full version, including alpha/beta/rc tags.
release = '2.0.3'
release = '2.0.4'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion docs/source/license.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
License
=======
Copyright (c) 2012-2017, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
Copyright (c) 2012-2018, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down
7 changes: 3 additions & 4 deletions morfessor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,10 @@
'BaselineModel', 'main', 'get_default_argparser', 'main_evaluation',
'get_evaluation_argparser']

__version__ = '2.0.3'
__version__ = '2.0.4'
__author__ = 'Sami Virpioja, Peter Smit, Stig-Arne Grönroos'
__author_email__ = "morpho@aalto.fi"

show_progress_bar = True

_logger = logging.getLogger(__name__)


Expand All @@ -26,7 +24,8 @@ def get_version():
# so that the package global names are available to the modules
# when they are imported.

from .baseline import BaselineModel, FixedCorpusWeight, AnnotationCorpusWeight, NumMorphCorpusWeight, MorphLengthCorpusWeight
from .baseline import BaselineModel, FixedCorpusWeight, AnnotationCorpusWeight, \
NumMorphCorpusWeight, MorphLengthCorpusWeight
from .cmd import main, get_default_argparser, main_evaluation, \
get_evaluation_argparser
from .exception import MorfessorException, ArgumentException
Expand Down
59 changes: 27 additions & 32 deletions morfessor/baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ def __init__(self, forcesplit_list=None, corpusweight=None,
else:
self.nosplit_re = re.compile(nosplit_re, re.UNICODE)

# Used only for (semi-)supervised learning
self.annotations = None

def set_corpus_weight_updater(self, corpus_weight):
if corpus_weight is None:
self._corpus_weight_updater = FixedCorpusWeight(1.0)
Expand Down Expand Up @@ -156,7 +159,7 @@ def _set_compound_analysis(self, compound, parts, ptype='rbranch'):
self._modify_construction_count(compound, count)
elif ptype == 'flat':
rcount, count = self._remove(compound)
splitloc = self._segmentation_to_splitloc(parts)
splitloc = self.segmentation_to_splitloc(parts)
self._analyses[compound] = ConstrNode(rcount, count, splitloc)
for constr in parts:
self._modify_construction_count(constr, count)
Expand Down Expand Up @@ -193,7 +196,7 @@ def _update_annotation_choices(self):
# and add missing compounds also to the unannotated data
constructions = collections.Counter()
for compound, alternatives in self.annotations.items():
if not compound in self._analyses:
if compound not in self._analyses:
self._add_compound(compound, 1)

analysis, cost = self._best_analysis(alternatives)
Expand Down Expand Up @@ -232,7 +235,7 @@ def _force_split(self, compound):
clen = len(compound)
j = 0
parts = []
for i in range(1, clen):
for i in range(0, clen):
if compound[i] in self.forcesplit_list:
if len(compound[j:i]) > 0:
parts.append(compound[j:i])
Expand Down Expand Up @@ -413,7 +416,7 @@ def _epoch_update(self, epoch_num):
return forced_epochs

@staticmethod
def _segmentation_to_splitloc(constructions):
def segmentation_to_splitloc(constructions):
"""Return a list of split locations for a segmented compound."""
splitloc = []
i = 0
Expand Down Expand Up @@ -559,7 +562,7 @@ def train_batch(self, algorithm='recursive', algorithm_params=(),
"""Train the model in batch fashion.
The model is trained with the data already loaded into the model (by
using an existing model or calling one of the load\_ methods).
using an existing model or calling one of the load_ methods).
In each iteration (epoch) all compounds in the training data are
optimized once, in a random order. If applicable, corpus weight,
Expand All @@ -580,11 +583,11 @@ def train_batch(self, algorithm='recursive', algorithm_params=(),
forced_epochs = max(1, self._epoch_update(epochs))
newcost = self.get_cost()
compounds = list(self.get_compounds())
_logger.info("Compounds in training data: %s types / %s tokens" %
(len(compounds), self._corpus_coding.boundaries))
_logger.info("Compounds in training data: %s types / %s tokens",
len(compounds), self._corpus_coding.boundaries)

_logger.info("Starting batch training")
_logger.info("Epochs: %s\tCost: %s" % (epochs, newcost))
_logger.info("Epochs: %s\tCost: %s", epochs, newcost)

while True:
# One epoch
Expand All @@ -598,16 +601,15 @@ def train_batch(self, algorithm='recursive', algorithm_params=(),
else:
raise MorfessorException("unknown algorithm '%s'" %
algorithm)
_logger.debug("#%s -> %s" %
(w, _constructions_to_str(segments)))
_logger.debug("#%s -> %s", w, _constructions_to_str(segments))
epochs += 1

_logger.debug("Cost before epoch update: %s" % self.get_cost())
_logger.debug("Cost before epoch update: %s", self.get_cost())
forced_epochs = max(forced_epochs, self._epoch_update(epochs))
oldcost = newcost
newcost = self.get_cost()

_logger.info("Epochs: %s\tCost: %s" % (epochs, newcost))
_logger.info("Epochs: %s\tCost: %s", epochs, newcost)
if (forced_epochs == 0 and
newcost >= oldcost - finish_threshold *
self._corpus_coding.boundaries):
Expand Down Expand Up @@ -665,7 +667,7 @@ def train_online(self, data, count_modifier=None, epoch_interval=10000,
while more_tokens:
self._epoch_update(epochs)
newcost = self.get_cost()
_logger.info("Tokens processed: %s\tCost: %s" % (i, newcost))
_logger.info("Tokens processed: %s\tCost: %s", i, newcost)

for _ in _progress(range(epoch_interval)):
try:
Expand All @@ -679,7 +681,7 @@ def train_online(self, data, count_modifier=None, epoch_interval=10000,
continue

if count_modifier is not None:
if not w in counts:
if w not in counts:
c = 0
counts[w] = 1
addc = 1
Expand All @@ -701,8 +703,7 @@ def train_online(self, data, count_modifier=None, epoch_interval=10000,
else:
raise MorfessorException("unknown algorithm '%s'" %
algorithm)
_logger.debug("#%s: %s -> %s" %
(i, w, _constructions_to_str(segments)))
_logger.debug("#%s: %s -> %s", i, w, _constructions_to_str(segments))
i += 1

epochs += 1
Expand All @@ -712,7 +713,7 @@ def train_online(self, data, count_modifier=None, epoch_interval=10000,

self._epoch_update(epochs)
newcost = self.get_cost()
_logger.info("Tokens processed: %s\tCost: %s" % (i, newcost))
_logger.info("Tokens processed: %s\tCost: %s", i, newcost)
return epochs, newcost

def viterbi_segment(self, compound, addcount=1.0, maxlen=30):
Expand Down Expand Up @@ -777,8 +778,7 @@ def viterbi_segment(self, compound, addcount=1.0, maxlen=30):
math.log(self._lexicon_coding.boundaries
+ addcount))
- (self._lexicon_coding.boundaries
* math.log(self._lexicon_coding.boundaries
))
* math.log(self._lexicon_coding.boundaries))
+ self._lexicon_coding.get_codelength(
construction))
/ self._corpus_coding.weight)
Expand Down Expand Up @@ -973,9 +973,7 @@ def make_segment_only(self):
doing so would throw an exception.
"""
self._num_compounds = len(self.get_compounds())
self._segment_only = True

self._analyses = {k: v for (k, v) in self._analyses.items()
if not v.splitloc}

Expand All @@ -994,7 +992,7 @@ def move_direction(cls, model, direction, epoch):
else:
weight *= 1.0 / (1 + 2.0 / epoch)
model.set_corpus_coding_weight(weight)
_logger.info("Corpus weight set to {}".format(weight))
_logger.info("Corpus weight set to %s", weight)
return True
return False

Expand Down Expand Up @@ -1039,12 +1037,12 @@ def _boundary_recall(cls, prediction, reference):
best = -1
for ref in ref_list:
# list of internal boundary positions
ref_b = set(BaselineModel._segmentation_to_splitloc(ref))
ref_b = set(BaselineModel.segmentation_to_splitloc(ref))
if len(ref_b) == 0:
best = 1.0
break
for pre in pre_list:
pre_b = set(BaselineModel._segmentation_to_splitloc(pre))
pre_b = set(BaselineModel.segmentation_to_splitloc(pre))
r = len(ref_b.intersection(pre_b)) / float(len(ref_b))
if r > best:
best = r
Expand Down Expand Up @@ -1077,10 +1075,8 @@ def _estimate_segmentation_dir(self, segments, annotations):
undersegmentation, and 0 if no changes are required.
"""
pre, rec, f = self._bpr_evaluation([[x] for x in segments],
annotations)
_logger.info("Boundary evaluation: precision %.4f; recall %.4f" %
(pre, rec))
pre, rec, f = self._bpr_evaluation([[x] for x in segments], annotations)
_logger.info("Boundary evaluation: precision %.4f; recall %.4f", pre, rec)
if abs(pre - rec) < self.threshold:
return 0
elif rec > pre:
Expand All @@ -1099,7 +1095,7 @@ def update(self, model, epoch):
return False
cur_length = self.calc_morph_length(model)

_logger.info("Current morph-length: {}".format(cur_length))
_logger.info("Current morph-length: %s", cur_length)

if (abs(self.morph_length - cur_length) / self.morph_length >
self.threshold):
Expand Down Expand Up @@ -1133,7 +1129,7 @@ def update(self, model, epoch):
return False
cur_morph_types = model._lexicon_coding.boundaries

_logger.info("Number of morph types: {}".format(cur_morph_types))
_logger.info("Number of morph types: %s", cur_morph_types)


if (abs(self.num_morph_types - cur_morph_types) / self.num_morph_types
Expand Down Expand Up @@ -1345,8 +1341,7 @@ def update_weight(self):
self.weight = (self.corpus_coding.weight *
float(self.corpus_coding.boundaries) / self.boundaries)
if self.weight != old:
_logger.info("Corpus weight of annotated data set to %s"
% self.weight)
_logger.info("Corpus weight of annotated data set to %s", self.weight)

def get_cost(self):
"""Return the cost of the Annotation Corpus."""
Expand Down

0 comments on commit fccb6c5

Please sign in to comment.