diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index e709bbc78b7..113d858717a 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -25,7 +25,7 @@
org.apache.tika
tika-parent
- 1.13-SNAPSHOT
+ 1.14-SNAPSHOT
../tika-parent/pom.xml
@@ -35,20 +35,21 @@
http://tika.apache.org/
- 3.14
+ 3.15-beta1
1.10
-
+
1.5
0.7.2
0.8
- 2.0.0
- 1.8.11
+ 2.0.2
+ 1.8.12
4.5.5
3.0.3
0.6
1.54
+ 1.3
@@ -84,7 +85,7 @@
com.healthmarketscience.jackcess
jackcess
- 2.1.3
+ 2.1.4
com.healthmarketscience.jackcess
@@ -107,9 +108,9 @@
- net.sourceforge.jmatio
+ org.tallison
jmatio
- 1.0
+ 1.2
org.apache.james
@@ -219,6 +220,12 @@
com.rometools
rome
1.5.1
+
+
+ org.jdom
+ jdom
+
+
org.gagravarr
@@ -250,7 +257,21 @@
cxf-rt-rs-client
${cxf.version}
-
+
+
+ org.apache.commons
+ commons-exec
+ ${commonsexec.version}
+ compile
+
+
+
+
+ org.apache.commons
+ commons-lang3
+ 3.4
+
@@ -395,6 +416,13 @@
2.7.1
+
+
+ org.htmlparser
+ htmlparser
+ 1.6
+
+
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TextToTagRatio.java b/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TextToTagRatio.java
new file mode 100755
index 00000000000..f8b846ad51a
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TextToTagRatio.java
@@ -0,0 +1,238 @@
+package org.apache.tika.parser.tagratio;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+
+import org.apache.tika.exception.TikaException;
+import org.htmlparser.Parser;
+import org.htmlparser.util.NodeList;
+import org.htmlparser.util.ParserException;
+import org.json.JSONArray;
+import org.json.simple.JSONObject;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.xml.sax.SAXException;
+
+class TTR {
+
+ // Per-line text-to-tag ratios; (re)allocated by getTagRatioOfFile and returned from it.
+ private double[] linkTagList;
+ // Output directory. It is string-concatenated with "/" and the file name when the
+ // JSON result is written; TextToTagRatio.main assigns a String path to it.
+ public Object outputDirPath;
+
+
+ /**
+  * Computes a text-to-tag ratio for every line of the given markup and writes a JSON
+  * summary to {@code outputDirPath + "/" + <file name> + ".json"}.
+  *
+  * <p>The JSON object holds the file name, the absolute path, the average ratio of the
+  * lines that reached the threshold, the concatenated text of those lines and the
+  * collected meta tags. Progress is also echoed to standard output.
+  *
+  * @param html     markup to analyse
+  * @param filepath path of the source file; its last segment names the output file
+  * @param dirName  currently unused
+  * @return the per-line ratios; if parsing or I/O fails before the array is rebuilt,
+  *         whatever a previous call left behind (possibly {@code null})
+  */
+ @SuppressWarnings("unchecked")
+ public double[] getTagRatioOfFile(String html,String filepath, String dirName) {
+
+     // NOTE(review): these four patterns look truncated (presumably they once removed
+     // comment/script/style markup). As written they only match the empty string and
+     // leave html unchanged - confirm against the original source before relying on them.
+     html = html.replaceAll("(?s)", "");
+     html = html.replaceAll("(?s).*?", "");
+     html = html.replaceAll("(?s).*?", "");
+     html = html.replaceAll("(?s).*?", "");
+
+     try {
+         // Last path segment is used as the output file name.
+         Path sourcePath = Paths.get(filepath);
+         String filename = sourcePath.getFileName().toString();
+
+         System.out.println("Output File :"+outputDirPath+"/"+filename);
+
+         Parser p = new Parser(html);
+         NodeList nl = p.parse(null);
+         // Serialise once; both passes below read the same text.
+         String parsedHtml = nl.toHtml();
+
+         // First pass: count lines so the ratio array can be sized.
+         int numLines = 0;
+         try (BufferedReader counter = new BufferedReader(new StringReader(parsedHtml))) {
+             while (counter.readLine() != null) {
+                 numLines++;
+             }
+         }
+
+         linkTagList = new double[numLines];
+         HashMap metaTagsMap = new HashMap();
+         double threshold = 10;
+         StringBuilder content = new StringBuilder();
+         double tagRatio = 0.0;
+         int count = 0;
+
+         // Second pass: score each non-blank line; blank lines keep the default 0.0.
+         try (BufferedReader br = new BufferedReader(new StringReader(parsedHtml))) {
+             for (int i = 0; i < linkTagList.length; i++) {
+                 String line = br.readLine();
+                 if (line == null) {
+                     break;
+                 }
+                 line = line.trim();
+                 if (line.isEmpty()) {
+                     continue;
+                 }
+                 linkTagList[i] = computeTextToTagRatio(line);
+                 //Extract meta tags
+                 populateMetaTags(line, metaTagsMap);
+
+                 if (linkTagList[i] >= threshold) {
+                     System.out.println("Tag Ratio : "+linkTagList[i]);
+                     System.out.println(line);
+
+                     content.append(line);
+                     content.append("\n");
+                     tagRatio += linkTagList[i];
+                     count++;
+                 }
+             }
+         }
+
+         //Create and write JSON
+         JSONObject obj = new JSONObject();
+         obj.put("fileName", filename);
+         obj.put("absoluteFilePath", filepath);
+         // Guard the 0/0 case (no line reached the threshold) instead of emitting NaN.
+         obj.put("avgTagRatio", count == 0 ? 0.0 : (tagRatio/count));
+         obj.put("content", content.toString());
+
+         JSONArray array = new JSONArray();
+         array.put(metaTagsMap);
+         obj.put("meta-tags", array);
+
+         // Open the output only once there is something to write, and always close it.
+         File outFile = new File(outputDirPath+"/"+filename+".json");
+         try (BufferedWriter bw = new BufferedWriter(new FileWriter(outFile))) {
+             bw.write(obj.toJSONString());
+         }
+     } catch (IOException | ParserException e) {
+         // Best effort: report and fall through to return the current array.
+         e.printStackTrace();
+     }
+     return linkTagList;
+ }
+
+
+ // NOTE(review): this span appears damaged. The third line below holds an unterminated
+ // string literal followed by the tail of a for-loop header, so the rest of
+ // populateMetaTags and the signature of the routine that follows seem to be missing.
+ // The remaining statements return a double and use `tag` and `text` without visible
+ // declarations; presumably they belong to computeTextToTagRatio(String), which
+ // getTagRatioOfFile calls - TODO restore from the original source.
+ private void populateMetaTags(String line, HashMap metaTagsMap) {
+
+ if(line.startsWith("= 0 && i < line.length(); i++) {
+ // A '<' counts as one tag and the scan jumps to the next '>' (stopping if none).
+ if (line.charAt(i) == '<') { //start tag
+ tag++;
+ i = line.indexOf('>', i);
+ if (i == -1) {
+ break;
+ }
+ // A '>' seen before any tag was counted discards the text counted so far.
+ } else if (tag == 0 && line.charAt(i) == '>') { //close tag
+ text = 0;
+ tag++;
+ } else { //just text
+ text++;
+ }
+
+ }
+ // Lines with no tags are treated as having one tag, which avoids dividing by zero.
+ if (tag == 0) {
+ tag = 1;
+ }
+ if(text != 0){
+ System.out.println("\nLine : "+line+"\nTag : "+tag+"\n"+"Text : "+text);
+ }
+ return (double) text / (double) tag;
+ }
+
+
+
+ /**
+  * Returns the given markup with every line terminator normalized to {@code "\n"} and a
+  * trailing {@code "\n"} after the last line. The result is also echoed to standard output.
+  *
+  * @param xhtmlOutput markup to normalize; must not be {@code null}
+  * @return the normalized text, or {@code null} if reading fails
+  */
+ public String readXhtmlData(String xhtmlOutput) {
+
+     StringBuilder lines = new StringBuilder();
+
+     // Read the characters directly: encoding to bytes and decoding again with the
+     // platform default charset can corrupt text that charset cannot represent.
+     try (BufferedReader br = new BufferedReader(new StringReader(xhtmlOutput))) {
+         String line;
+         while ((line = br.readLine()) != null) {
+             lines.append(line);
+             lines.append("\n");
+         }
+     } catch (IOException e) {
+         e.printStackTrace();
+         return null;
+     }
+
+     String normalized = lines.toString();
+     System.out.println(normalized);
+     return normalized;
+ }
+
+}
+
+/**
+ * Command-line driver: converts every regular file directly inside an input directory to
+ * XHTML via {@link TikaUtil} and hands the result to {@code TTR} for tag-ratio analysis.
+ */
+public class TextToTagRatio {
+
+    private static String inputDirPath;
+    private static String outputDirPath;
+
+    /**
+     * Processes each regular file in the input directory; sub-directories are skipped.
+     *
+     * @param args {@code args[0]} is the input directory, {@code args[1]} the output directory
+     * @throws IllegalArgumentException if fewer than two arguments are given or the input
+     *         path cannot be listed as a directory
+     */
+    public static void main(String[] args) {
+        if (args == null || args.length < 2) {
+            throw new IllegalArgumentException("Usage: TextToTagRatio <inputDir> <outputDir>");
+        }
+        TextToTagRatio.inputDirPath = args[0];
+        TextToTagRatio.outputDirPath = args[1];
+
+        TTR ttr = new TTR();
+        TikaUtil tikaUtil = new TikaUtil();
+        ttr.outputDirPath = outputDirPath;
+
+        //Fetch a polar data file, get xhtml from tika and write to output dir
+        File root = new File(inputDirPath);
+        File[] listDir = root.listFiles();
+        if (listDir == null) {
+            // listFiles() returns null when the path is not a directory or cannot be read.
+            throw new IllegalArgumentException("Not a readable directory: " + inputDirPath);
+        }
+
+        for (File d : listDir) {
+            if (!d.isFile()) {
+                continue;
+            }
+
+            //Get File Path
+            String fileAbsolutePath = d.getAbsoluteFile().toString();
+            try {
+                //Get xhtml for file from tika
+                String xhtmlOutput = tikaUtil.parseToHTML(fileAbsolutePath);
+
+                //Get Tag Ratio data for File
+                String output = ttr.readXhtmlData(xhtmlOutput);
+                if (output == null) {
+                    // readXhtmlData signals a read failure with null; skip this file.
+                    continue;
+                }
+
+                ttr.getTagRatioOfFile(output,fileAbsolutePath, d.getName());
+
+            } catch (IOException | SAXException | TikaException e) {
+                // Keep going with the remaining files, but do not lose the cause.
+                System.out.println("Tika Exception occurred for file: "+ fileAbsolutePath);
+                e.printStackTrace();
+            }
+        }
+    }
+
+}
+
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TikaUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TikaUtil.java
new file mode 100755
index 00000000000..f915e888ab0
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TikaUtil.java
@@ -0,0 +1,34 @@
+package org.apache.tika.parser.tagratio;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Tika utility that runs the {@link AutoDetectParser} over a file.
+ */
+public class TikaUtil {
+
+    /**
+     * Parses the file at {@code filePath} with Tika's auto-detecting parser and returns
+     * the string form of the {@link ToXMLContentHandler} that received the output. The
+     * metadata collected during parsing is discarded.
+     *
+     * @param filePath path of the file to parse
+     * @return the handler's serialized content
+     * @throws IOException   if the file cannot be opened or read
+     * @throws SAXException  if the content handler fails
+     * @throws TikaException if the document cannot be parsed
+     */
+    public String parseToHTML(String filePath) throws IOException, SAXException, TikaException {
+        ContentHandler handler = new ToXMLContentHandler();
+        AutoDetectParser parser = new AutoDetectParser(); //Invoke Tika's AutoDetect Parser
+        Metadata metadata = new Metadata();
+
+        // Open the stream in the resource header so it is closed on every exit path and
+        // is never left open if setting up the parser fails.
+        try (InputStream stream = new FileInputStream(new File(filePath))) {
+            parser.parse(stream, handler, metadata);
+            return handler.toString();
+        }
+    }
+
+}