diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/RTGTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/RTGTranslator.java new file mode 100644 index 0000000000..ef366e25a1 --- /dev/null +++ b/tika-translate/src/main/java/org/apache/tika/language/translate/RTGTranslator.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.tika.language.translate; + +import com.fasterxml.jackson.jaxrs.json.JacksonJsonProvider; +import org.apache.cxf.jaxrs.client.WebClient; +import org.apache.tika.exception.TikaException; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; +import org.json.simple.parser.ParseException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.ws.rs.core.MediaType; +import javax.ws.rs.core.Response; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Properties; + + +/** + *

This translator is designed to work with an RTG translation + * server reachable over TCP/IP, specifically the + * + * REST-based RTG server.

+ * The Docker image is available at: + * https://hub.docker.com/repository/docker/tgowda/rtg-model
+ *
+ * {@code
+ * # without GPU
+ *   docker run --rm -i -p 6060:6060 tgowda/rtg-model:500toEng-v1
+ * # Or, with GPU device 0
+ *   docker run --rm -i -p 6060:6060 --gpus '"device=0"' tgowda/rtg-model:500toEng-v1
+ * }
+ * 
+ * + *

If you were to interact with the server via curl, a request + * would look as follows:

+ * + *
+ * {@code
+ * curl --data "source=Comment allez-vous?" \
+ *      --data "source=Bonne journée" \
+ *      http://localhost:6060/translate
+ * }
+ * 
+ * + * RTG requires input to be pre-formatted into sentences, one per line, + * so this translation implementation takes care of that. + */ +public class RTGTranslator extends AbstractTranslator { + + public static final String RTG_TRANSLATE_URL_BASE = "http://localhost:6060"; + public static final String RTG_PROPS = "translator.rtg.properties"; + private static final Logger LOG = LoggerFactory.getLogger(RTGTranslator.class); + private WebClient client; + private boolean isAvailable = false; + + public RTGTranslator() { + String rtgBaseUrl = RTG_TRANSLATE_URL_BASE; + Properties config = new Properties(); + try (InputStream stream = getClass().getClassLoader().getResourceAsStream(RTG_PROPS)){ + if (stream != null){ + config.load(stream); + } + rtgBaseUrl = config.getProperty("rtg.base.url", rtgBaseUrl); + } catch (IOException e) { + LOG.warn(e.getMessage(), e); + } + LOG.info("RTG base URL: " + rtgBaseUrl); + List providers = new ArrayList<>(); + providers.add(new JacksonJsonProvider()); + try { + this.client = WebClient.create(rtgBaseUrl, providers); + this.isAvailable = client.head().getStatus() == 200; + } catch (Exception e){ + LOG.warn(e.getMessage(), e); + isAvailable = false; + } + + } + @Override + public String translate(String text, String sourceLanguage, String targetLanguage) + throws TikaException, IOException { + return this.translate(text); + } + + @Override + public String translate(String text, String targetLanguage) + throws TikaException, IOException { + return this.translate(text); + } + + public String translate(String text) throws TikaException, IOException { + if (!this.isAvailable) { + return text; + } + Map> input = new HashMap<>(); + input.put("source", Arrays.asList(text.split("(?<=(? 
sentences = (List) obj.get("translation"); + String output = String.join("\n", sentences); + return output; + } catch (ParseException e){ + throw new IOException(e.getMessage(), e); + } + } + + @Override + public boolean isAvailable() { + return this.isAvailable; + } +} diff --git a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator index f3dcad4f74..154beca40d 100644 --- a/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator +++ b/tika-translate/src/main/resources/META-INF/services/org.apache.tika.language.translate.Translator @@ -18,3 +18,4 @@ org.apache.tika.language.translate.GoogleTranslator org.apache.tika.language.translate.Lingo24Translator org.apache.tika.language.translate.CachedTranslator org.apache.tika.language.translate.JoshuaNetworkTranslator +org.apache.tika.language.translate.RTGTranslator diff --git a/tika-translate/src/test/java/org/apache/tika/language/translate/RTGTranslatorTest.java b/tika-translate/src/test/java/org/apache/tika/language/translate/RTGTranslatorTest.java new file mode 100644 index 0000000000..0c18e0eac2 --- /dev/null +++ b/tika-translate/src/test/java/org/apache/tika/language/translate/RTGTranslatorTest.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.language.translate; + +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.fail; + + +/** + * Test harness for the {@link RTGTranslator}. + * + */ +public class RTGTranslatorTest { + + private RTGTranslator translator; + + + @Before + public void setUp() { + translator = new RTGTranslator(); + } + + @Test + public void testSimpleTranslate() { + String source = "hola señor"; + String expected = "hello sir"; + + String result = null; + if (translator.isAvailable()) { + try { + result = translator.translate(source); + assertNotNull(result); + assertEquals("Result: [" + result + + "]: not equal to expected: [" + expected + "]", + expected, result.toLowerCase()); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + } + +}