From 5db572a124f34aec5ba416e9c9c5f27f48b5358f Mon Sep 17 00:00:00 2001 From: AravindRam Date: Thu, 11 Aug 2016 15:40:12 -0700 Subject: [PATCH] added TagRatio Parser --- tika-parsers/pom.xml | 46 +++- .../tika/parser/tagratio/TextToTagRatio.java | 238 ++++++++++++++++++ .../apache/tika/parser/tagratio/TikaUtil.java | 34 +++ 3 files changed, 309 insertions(+), 9 deletions(-) create mode 100755 tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TextToTagRatio.java create mode 100755 tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TikaUtil.java diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml index e709bbc78b7..113d858717a 100644 --- a/tika-parsers/pom.xml +++ b/tika-parsers/pom.xml @@ -25,7 +25,7 @@ org.apache.tika tika-parent - 1.13-SNAPSHOT + 1.14-SNAPSHOT ../tika-parent/pom.xml @@ -35,20 +35,21 @@ http://tika.apache.org/ - 3.14 + 3.15-beta1 1.10 - + 1.5 0.7.2 0.8 - 2.0.0 - 1.8.11 + 2.0.2 + 1.8.12 4.5.5 3.0.3 0.6 1.54 + 1.3 @@ -84,7 +85,7 @@ com.healthmarketscience.jackcess jackcess - 2.1.3 + 2.1.4 com.healthmarketscience.jackcess @@ -107,9 +108,9 @@ - net.sourceforge.jmatio + org.tallison jmatio - 1.0 + 1.2 org.apache.james @@ -219,6 +220,12 @@ com.rometools rome 1.5.1 + + + org.jdom + jdom + + org.gagravarr @@ -250,7 +257,21 @@ cxf-rt-rs-client ${cxf.version} - + + + org.apache.commons + commons-exec + ${commonsexec.version} + compile + + + + + org.apache.commons + commons-lang3 + 3.4 + @@ -395,6 +416,13 @@ 2.7.1 + + + org.htmlparser + htmlparser + 1.6 + + diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TextToTagRatio.java b/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TextToTagRatio.java new file mode 100755 index 00000000000..f8b846ad51a --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TextToTagRatio.java @@ -0,0 +1,238 @@ +package org.apache.tika.parser.tagratio; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.StringReader; +import java.net.MalformedURLException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; + +import org.apache.tika.exception.TikaException; +import org.htmlparser.Parser; +import org.htmlparser.util.NodeList; +import org.htmlparser.util.ParserException; +import org.json.JSONArray; +import org.json.simple.JSONObject; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.xml.sax.SAXException; + +class TTR { + + private double[] linkTagList; + public Object outputDirPath; + + + @SuppressWarnings("unchecked") + public double[] getTagRatioOfFile(String html,String filepath, String dirName) { + + html = html.replaceAll("(?s)", ""); + html = html.replaceAll("(?s).*?", ""); + html = html.replaceAll("(?s).*?", ""); + html = html.replaceAll("(?s).*?", ""); + + try{ + // To get the filename from absolute path + Path p1 = Paths.get(filepath); + String filename = p1.getFileName().toString(); + + System.out.println("Output File :"+outputDirPath+"/"+filename); + + //Create file in output directory + FileWriter fw = new FileWriter(new File(outputDirPath+"/"+filename+".json")); + BufferedWriter bw = new BufferedWriter(fw); + + Parser p = new Parser(html); + NodeList nl = p.parse(null); + + BufferedReader br = new BufferedReader(new StringReader(nl.toHtml())); + int numLines = 0; + while (br.readLine() != null) { + numLines++; + } + br.close(); + + linkTagList = new double[numLines]; + HashMap metaTagsMap = new HashMap(); + String line; + double threshold = 10; + StringBuffer sb = new StringBuffer(); + double tagRatio = 0.0; + int count = 0; + br = new BufferedReader(new StringReader(nl.toHtml())); + for (int i = 0; i < linkTagList.length; i++) { + line = br.readLine(); + line = line.trim(); + if (line.equals("")) { + continue; + } + linkTagList[i] = computeTextToTagRatio(line); + //Extract meta tags + populateMetaTags(line, metaTagsMap); + + if(linkTagList[i] != 0 && linkTagList[i] >= threshold){ + System.out.println("Tag Ratio : "+linkTagList[i]); + System.out.println(line); + + sb.append(line); + sb.append("\n"); + tagRatio += linkTagList[i]; + count++; + + } + } + + //Create an write JSON + JSONObject obj = new JSONObject(); + obj.put("fileName", filename); + obj.put("absoluteFilePath", filepath); + obj.put("avgTagRatio", (tagRatio/count)); + obj.put("content", sb.toString()); + + JSONArray array = new JSONArray(); + array.put(metaTagsMap); + obj.put("meta-tags", array); + + bw.write(obj.toJSONString()); + + br.close(); + bw.close(); + }catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (ParserException e) { + e.printStackTrace(); + return linkTagList; + } + return linkTagList; + } + + + private void populateMetaTags(String line, HashMap metaTagsMap) { + + if(line.startsWith("= 0 && i < line.length(); i++) { + if (line.charAt(i) == '<') { //start tag + tag++; + i = line.indexOf('>', i); + if (i == -1) { + break; + } + } else if (tag == 0 && line.charAt(i) == '>') { //close tag + text = 0; + tag++; + } else { //just text + text++; + } + + } + if (tag == 0) { + tag = 1; + } + if(text != 0){ + System.out.println("\nLine : "+line+"\nTag : "+tag+"\n"+"Text : "+text); + } + return (double) text / (double) tag; + } + + + + public String readXhtmlData(String xhtmlOutput) { + + // convert String into InputStream + InputStream is = new ByteArrayInputStream(xhtmlOutput.getBytes()); + + // read it with BufferedReader + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + + String line = null; + StringBuffer lines = new StringBuffer(); + try { + while ((line = br.readLine()) != null) { + lines.append(line); + lines.append("\n"); + } + System.out.println(lines.toString()); + return lines.toString(); + + } catch (IOException e) { + e.printStackTrace(); + } + + return null; + } + +} + +public class TextToTagRatio { + + private static String inputDirPath; + private static String outputDirPath; + + public static void main(String[] args) { + TTR ttr = new TTR(); + TikaUtil tikaUtil = new TikaUtil(); + TextToTagRatio.inputDirPath = args[0]; + TextToTagRatio.outputDirPath = args[1]; + ttr.outputDirPath = outputDirPath; + + //Fetch a polar data file, get xhtml from tika and write to output dir + File root = new File(inputDirPath); + File[] listDir = root.listFiles(); + + for (File d : listDir) { + if(d.isFile()) { + + //Get File Path + String fileAbsolutePath = d.getAbsoluteFile().toString(); + try { + //Get xhtml for file from tika + String xhtmlOutput = tikaUtil.parseToHTML(fileAbsolutePath); + + //Get Tag Ratio data for File + String output = ttr.readXhtmlData(xhtmlOutput); + + ttr.getTagRatioOfFile(output,fileAbsolutePath, d.getName()); + + } catch (IOException | SAXException | TikaException e) { + System.out.println("Tika Exception occurred for file: "+ fileAbsolutePath); + } + + } + + } + + + } + + +} + diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TikaUtil.java b/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TikaUtil.java new file mode 100755 index 00000000000..f915e888ab0 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/tagratio/TikaUtil.java @@ -0,0 +1,34 @@ +package org.apache.tika.parser.tagratio; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.sax.ToXMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +//Tika Utility to invoke Tika AutoDetect Parser + +public class TikaUtil { + + //Parse the contents of the file as input to an intermediate XHTML file + public String parseToHTML(String filePath) throws IOException, SAXException, TikaException { + ContentHandler handler = new ToXMLContentHandler(); + + InputStream inputStream = new FileInputStream(new File(filePath)); + + AutoDetectParser parser = new AutoDetectParser(); //Invoke Tika's AutoDetect Parser + Metadata metadata = new Metadata(); + + try (InputStream stream = inputStream) { + parser.parse(stream, handler, metadata); + return handler.toString(); + } + } + +}