Skip to content

Commit

Permalink
OPENNLP-1092: Fix pos model serialization bug
Browse files Browse the repository at this point in the history
  • Loading branch information
kottmann committed Jun 28, 2017
1 parent e515ff4 commit 08e163c
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 1 deletion.
Expand Up @@ -35,14 +35,16 @@
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ByteArraySerializer;
import opennlp.tools.util.model.POSModelSerializer;
import opennlp.tools.util.model.SerializableArtifact;

/**
* The {@link POSModel} is the model used
* by a learnable {@link POSTagger}.
*
* @see POSTaggerME
*/
public final class POSModel extends BaseModel {
public final class POSModel extends BaseModel implements SerializableArtifact {

private static final String COMPONENT_NAME = "POSTaggerME";
static final String POS_MODEL_ENTRY_NAME = "pos.model";
Expand Down Expand Up @@ -178,4 +180,9 @@ public Dictionary getNgramDictionary() {
return getFactory().getDictionary();
return null;
}

/**
 * Names the serializer class associated with this model.
 *
 * @return the {@link POSModelSerializer} class object
 */
@Override
public Class<POSModelSerializer> getArtifactSerializerClass() {
  final Class<POSModelSerializer> serializerType = POSModelSerializer.class;
  return serializerType;
}
}
@@ -0,0 +1,104 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.namefind;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;
import java.util.stream.Collectors;

import org.junit.Assert;
import org.junit.Test;

import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerMETest;
import opennlp.tools.util.MockInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelType;

/**
 * Checks that a {@link TokenNameFinderModel} trained with a {@link POSModel}
 * as an external feature generator resource can be written out.
 */
public class TokenNameFinderModelTest {

  /**
   * Trains a POS model, stores it in a temporary resources folder, trains a
   * name finder whose feature generator descriptor refers to that POS model,
   * and finally serializes the resulting name finder model to a temporary file.
   *
   * @throws IOException if reading the descriptor or writing any of the
   *     temporary files fails
   */
  @Test
  public void testNERWithPOSModel() throws IOException {

    // create a resources folder
    Path resourcesFolder = Files.createTempDirectory("resources").toAbsolutePath();

    // save a POS model there; serialize(File) is handed the target file directly,
    // so no separate output stream is opened (the previous one was never used or closed)
    POSModel posModel = POSTaggerMETest.trainPOSModel(ModelType.MAXENT);
    File posModelFile = new File(resourcesFolder.toFile(), "pos-model.bin");

    posModel.serialize(posModelFile);

    Assert.assertTrue(posModelFile.exists());

    // load feature generator xml; fail with a clear message instead of an NPE
    // when the descriptor is missing from the test classpath
    InputStream fgInputStream = this.getClass().getResourceAsStream("ner-pos-features.xml");
    Assert.assertNotNull("ner-pos-features.xml not found on the classpath", fgInputStream);

    String featureGeneratorString;
    try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(fgInputStream, StandardCharsets.UTF_8))) {
      featureGeneratorString = reader.lines().collect(Collectors.joining("\n"));
    }

    // decode and encode with the same explicit charset so the bytes do not
    // depend on the platform default
    byte[] featureGeneratorBytes = featureGeneratorString.getBytes(StandardCharsets.UTF_8);

    // create a featuregenerator file
    Path featureGenerator = Files.createTempFile("ner-featuregen", ".xml");
    Files.write(featureGenerator, featureGeneratorBytes);

    Map<String, Object> resources;
    try {
      resources = TokenNameFinderTrainerTool.loadResources(resourcesFolder.toFile(),
          featureGenerator.toAbsolutePath().toFile());
    }
    catch (IOException e) {
      throw new TerminateToolException(-1, e.getMessage(), e);
    }

    // train a name finder
    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
        new PlainTextByLineStream(new MockInputStreamFactory(
            new File("opennlp/tools/namefind/voa1.train")), "UTF-8"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
    params.put(TrainingParameters.CUTOFF_PARAM, 1);

    TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
        params, TokenNameFinderFactory.create(null,
            featureGeneratorBytes, resources, new BioCodec()));

    // write the trained model; the stream is closed even if serialization throws
    File model = File.createTempFile("nermodel", ".bin");
    try (FileOutputStream modelOut = new FileOutputStream(model)) {
      nameFinderModel.serialize(modelOut);
    }

    Assert.assertTrue(model.exists());
  }
}
@@ -0,0 +1,36 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<!--
  Feature generator descriptor loaded by TokenNameFinderModelTest as
  "ner-pos-features.xml". The tokenpos element names "pos-model.bin" as its
  model, which is the file name the test stores its POS model under.
-->
<generators>
  <cache>
    <generators>
      <window prevLength="2" nextLength="2">
        <tokenclass/>
      </window>
      <window prevLength="2" nextLength="2">
        <token/>
      </window>
      <window prevLength="2" nextLength="2">
        <tokenpos model="pos-model.bin"/>
      </window>
      <definition/>
      <prevmap/>
      <bigram/>
      <sentence begin="true" end="false"/>
    </generators>
  </cache>
</generators>

0 comments on commit 08e163c

Please sign in to comment.