From 0e0ce6d47f3291c74fd5d5f083d3d162d8b2abe5 Mon Sep 17 00:00:00 2001
From: "ReEvApp - Re-Evolution Applications, LLC"
Date: Sat, 9 Apr 2016 13:27:05 -0400
Subject: [PATCH] fix for TIKA-1943 contributed by Mark Duske

Includes support for Yandex Translate API
---
 .../language/translate/YandexTranslator.java  | 286 ++++++++++++++++++
 1 file changed, 286 insertions(+)
 create mode 100644 tika-translate/src/main/java/org/apache/tika/language/translate/YandexTranslator.java

diff --git a/tika-translate/src/main/java/org/apache/tika/language/translate/YandexTranslator.java b/tika-translate/src/main/java/org/apache/tika/language/translate/YandexTranslator.java
new file mode 100644
index 00000000000..ad40b8c7495
--- /dev/null
+++ b/tika-translate/src/main/java/org/apache/tika/language/translate/YandexTranslator.java
@@ -0,0 +1,286 @@
+package org.apache.tika.language.translate;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.Response;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Properties;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.translate.Translator;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * A REST client for the Yandex Translate API.
+ * <p>
+ * You can sign up for free access online on the API Key form and set your
+ * application's user key in the {@code translator.yandex.properties} file,
+ * or supply it at runtime through {@link #setApiKey(String)}. Until a real
+ * key is configured, {@link #isAvailable()} returns {@code false} and
+ * {@link #translate(String, String, String)} returns its input unchanged.
+ */
+public class YandexTranslator implements Translator {
+
+    private static final Logger LOG = Logger.getLogger(YandexTranslator.class.getName());
+
+    /**
+     * Yandex Translate API service end-point URL
+     */
+    private static final String YANDEX_TRANSLATE_URL_BASE = "https://translate.yandex.net/api/v1.5/tr.json/translate";
+
+    /**
+     * Classpath resource, resolved relative to this class, holding the configuration
+     */
+    private static final String CONFIG_RESOURCE = "translator.yandex.properties";
+
+    /**
+     * Placeholder user key; a real user key must be provided before Yandex can
+     * successfully serve translation requests
+     */
+    private static final String DEFAULT_KEY = "dummy-key";
+
+    /**
+     * Text format used when none is configured
+     */
+    private static final String DEFAULT_FORMAT = "plain";
+
+    /**
+     * Upper bound on how much of an unrecognized response is echoed in an error message
+     */
+    private static final int MAX_ECHOED_RESPONSE_LENGTH = 100;
+
+    /**
+     * Shared JSON parser; it is only ever used for reading, never reconfigured
+     */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    /**
+     * Identifies the client of the request, used for authentication
+     */
+    private String apiKey;
+
+    /**
+     * The Yandex Translate API can handle text in plain and/or html format, the default
+     * format is plain
+     */
+    private String format = DEFAULT_FORMAT;
+
+    /**
+     * Creates a translator configured from the {@code translator.yandex.properties}
+     * resource found next to this class. When that resource is missing or unreadable
+     * no key is set, so the translator reports itself as unavailable until
+     * {@link #setApiKey(String)} is called.
+     */
+    public YandexTranslator() {
+        // A missing resource yields null, which try-with-resources tolerates.
+        try (InputStream stream = YandexTranslator.class.getResourceAsStream(CONFIG_RESOURCE)) {
+            if (stream == null) {
+                LOG.warning("Resource " + CONFIG_RESOURCE + " not found, Yandex translation is unavailable");
+                return;
+            }
+            Properties config = new Properties();
+            config.load(stream);
+            this.apiKey = config.getProperty("translator.api-key");
+            // Fall back to the default rather than overwriting it with null.
+            this.format = config.getProperty("translator.text.format", DEFAULT_FORMAT);
+        } catch (IOException | IllegalArgumentException e) {
+            // Best effort: a broken configuration must not prevent instantiation.
+            LOG.log(Level.WARNING, "Unable to read " + CONFIG_RESOURCE + ", Yandex translation is unavailable", e);
+        }
+    }
+
+    /**
+     * Translates text between two languages through the Yandex Translate API.
+     *
+     * @param text           the text to translate
+     * @param sourceLanguage code of the language of {@code text}, or {@code null}
+     *                       to let the service detect it
+     * @param targetLanguage code of the language to translate into
+     * @return the translated text, or {@code text} unchanged when this translator
+     *         is not available
+     * @throws TikaException if the service reports an error or answers with
+     *                       something that is not the expected JSON
+     * @throws IOException   if the response cannot be read
+     */
+    @Override
+    public String translate(String text, String sourceLanguage,
+                            String targetLanguage) throws TikaException, IOException {
+        if (!this.isAvailable()) {
+            return text;
+        }
+
+        // A bare target code asks the service to identify the source language itself.
+        String langCode = sourceLanguage == null
+                ? targetLanguage
+                : sourceLanguage + '-' + targetLanguage;
+
+        //TODO Add support for text over 10k characters
+        WebClient client = WebClient.create(YANDEX_TRANSLATE_URL_BASE)
+                .accept(MediaType.APPLICATION_JSON)
+                .query("key", this.apiKey)
+                .query("lang", langCode)
+                .query("text", text);
+        if (this.format != null && !this.format.isEmpty()) {
+            // Forward the configured text format (plain/html) so the setting takes effect.
+            client = client.query("format", this.format);
+        }
+
+        String responseText = readBody(client.get());
+
+        JsonNode jsonResp;
+        try {
+            jsonResp = MAPPER.readTree(responseText);
+        } catch (JsonParseException e) {
+            throw new TikaException("Error requesting translation from '" + sourceLanguage + "' to '" + targetLanguage + "', JSON response from Yandex is not well formatted: " + responseText, e);
+        }
+
+        // readTree may hand back null for an empty body, so guard before searching it.
+        JsonNode code = jsonResp == null ? null : jsonResp.findValue("code");
+        if (code == null) {
+            throw new TikaException("Return message not recognized: " + abbreviate(responseText));
+        }
+        if (!"200".equals(code.asText())) {
+            throw new TikaException(errorMessage(jsonResp, code));
+        }
+
+        // A successful reply carries the translations as a JSON array.
+        JsonNode translations = jsonResp.findValue("text");
+        if (translations == null || !translations.has(0)) {
+            throw new TikaException("Return message not recognized: " + abbreviate(responseText));
+        }
+        return translations.get(0).asText();
+    }
+
+    /**
+     * Reads the whole response entity as UTF-8 text and closes the underlying stream.
+     * Line terminators are dropped, which is harmless for a JSON document.
+     */
+    private static String readBody(Response response) throws TikaException, IOException {
+        Object entity = response.getEntity();
+        if (!(entity instanceof InputStream)) {
+            throw new TikaException("Yandex Translate API returned no readable response body");
+        }
+        StringBuilder body = new StringBuilder();
+        try (BufferedReader reader = new BufferedReader(
+                new InputStreamReader((InputStream) entity, UTF_8))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                body.append(line);
+            }
+        }
+        return body.toString();
+    }
+
+    /**
+     * Extracts the error description of a failed reply, accepting the message either
+     * as a plain JSON string or as the first element of an array.
+     */
+    private static String errorMessage(JsonNode jsonResp, JsonNode code) {
+        JsonNode message = jsonResp.findValue("message");
+        if (message != null && message.has(0)) {
+            message = message.get(0);
+        }
+        if (message == null) {
+            return "Yandex Translate API request failed with code " + code.asText();
+        }
+        return message.asText();
+    }
+
+    /**
+     * Shortens a response so it can be quoted in an exception message.
+     */
+    private static String abbreviate(String responseText) {
+        return responseText.substring(0, Math.min(responseText.length(), MAX_ECHOED_RESPONSE_LENGTH));
+    }
+
+    /**
+     * Get the API Key in use for client authentication
+     * @return API Key
+     */
+    public String getApiKey() {
+        return apiKey;
+    }
+
+    /**
+     * Set the API Key for client authentication
+     * @param apiKey API Key
+     */
+    public void setApiKey(String apiKey) {
+        this.apiKey = apiKey;
+    }
+
+    /**
+     * Retrieve the current text format setting.
+     * The Yandex Translate API can handle text in plain and/or html format, the default
+     * format is plain
+     * @return the text format currently set (plain or html)
+     */
+    public String getFormat() {
+        return format;
+    }
+
+    /**
+     * Set the text format to use (plain/html)
+     * @param format Text format setting, either plain or html
+     */
+    public void setFormat(String format) {
+        this.format = format;
+    }
+
+    /**
+     * Translates text into the target language, letting the service detect
+     * the source language.
+     *
+     * @param text           the text to translate
+     * @param targetLanguage code of the language to translate into
+     * @return the translated text, or {@code text} unchanged when this translator
+     *         is not available
+     * @throws TikaException if the service reports an error or answers with
+     *                       something that is not the expected JSON
+     * @throws IOException   if the response cannot be read
+     */
+    @Override
+    public String translate(String text, String targetLanguage)
+            throws TikaException, IOException {
+        return this.translate(text, null, targetLanguage);
+    }
+
+    /**
+     * Tells whether a user key other than the placeholder has been configured.
+     * No request is made, so the key is not validated against the service.
+     *
+     * @return {@code true} if a user key is set and it is not the placeholder
+     */
+    @Override
+    public boolean isAvailable() {
+        return this.apiKey != null && !this.apiKey.equals(DEFAULT_KEY);
+    }
+
+}