Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions tika-parsers/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
<version>1.13-SNAPSHOT</version>
<version>1.14-SNAPSHOT</version>
<relativePath>../tika-parent/pom.xml</relativePath>
</parent>

Expand All @@ -35,20 +35,21 @@
<url>http://tika.apache.org/</url>

<properties>
<poi.version>3.14</poi.version>
<poi.version>3.15-beta1</poi.version>
<!-- NOTE: sync codec version with POI -->
<codec.version>1.10</codec.version>
<!-- NOTE: sync tukaani version with commons-compress -->
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
<tukaani.version>1.5</tukaani.version>
<mime4j.version>0.7.2</mime4j.version>
<vorbis.version>0.8</vorbis.version>
<pdfbox.version>2.0.0</pdfbox.version>
<jempbox.version>1.8.11</jempbox.version>
<pdfbox.version>2.0.2</pdfbox.version>
<jempbox.version>1.8.12</jempbox.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
<cxf.version>3.0.3</cxf.version>
<sis.version>0.6</sis.version>
<!-- used by POI, PDFBox and Jackcess ...try to sync -->
<bouncycastle.version>1.54</bouncycastle.version>
<commonsexec.version>1.3</commonsexec.version>
</properties>

<dependencies>
Expand Down Expand Up @@ -84,7 +85,7 @@
<dependency>
<groupId>com.healthmarketscience.jackcess</groupId>
<artifactId>jackcess</artifactId>
<version>2.1.3</version>
<version>2.1.4</version>
</dependency>
<dependency>
<groupId>com.healthmarketscience.jackcess</groupId>
Expand All @@ -107,9 +108,9 @@

<!-- Upstream parser libraries -->
<dependency>
<groupId>net.sourceforge.jmatio</groupId>
<groupId>org.tallison</groupId>
<artifactId>jmatio</artifactId>
<version>1.0</version>
<version>1.2</version>
</dependency>
<dependency>
<groupId>org.apache.james</groupId>
Expand Down Expand Up @@ -219,6 +220,12 @@
<groupId>com.rometools</groupId>
<artifactId>rome</artifactId>
<version>1.5.1</version>
<exclusions>
<exclusion>
<groupId>org.jdom</groupId>
<artifactId>jdom</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.gagravarr</groupId>
Expand Down Expand Up @@ -250,7 +257,21 @@
<artifactId>cxf-rt-rs-client</artifactId>
<version>${cxf.version}</version>
</dependency>

<!-- TIKA-2021: Tesseract OCR Parser dependencies,
used for executing image processing script -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-exec</artifactId>
<version>${commonsexec.version}</version>
<scope>compile</scope>
</dependency>

<!-- TIKA-2021: Tesseract OCR Parser tests -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>

<!-- Provided dependencies -->
<dependency>
Expand Down Expand Up @@ -395,6 +416,13 @@
<version>2.7.1</version>
</dependency>

<!--HTML Parser for TagRatio Parser-->
<dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>1.6</version>
</dependency>

</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
package org.apache.tika.parser.tagratio;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;

import org.apache.tika.exception.TikaException;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.json.JSONArray;
import org.json.simple.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.xml.sax.SAXException;

/**
 * Computes a per-line text-to-tag ratio for an HTML document and writes the lines whose
 * ratio reaches {@link #CONTENT_THRESHOLD}, together with the document's
 * {@code <meta>} name/content pairs, to a JSON file under {@link #outputDirPath}.
 */
class TTR {

    /** Minimum text-to-tag ratio a line must reach to be kept as content. */
    private static final double CONTENT_THRESHOLD = 10;

    /**
     * Directory that receives the generated {@code <filename>.json} files.
     *
     * <p>The value is only ever used through string concatenation, so any object whose
     * {@code toString()} yields a directory path works. If it is left unset the output
     * path starts with the literal text {@code null}.
     */
    public Object outputDirPath;

    /**
     * Computes the text-to-tag ratio of every line of {@code html} and writes a JSON
     * summary to {@code outputDirPath + "/" + <file name of filepath> + ".json"}.
     *
     * <p>The JSON object holds {@code fileName}, {@code absoluteFilePath},
     * {@code avgTagRatio} (mean ratio of the kept lines), {@code content} (the kept lines,
     * newline-terminated) and {@code meta-tags}. The file is written as UTF-8 and is only
     * created once the document has been parsed, so a parse failure leaves no empty file.
     *
     * <p>Parse and I/O failures are printed to standard error and not rethrown.
     *
     * @param html     markup to analyse; comments, scripts and styles are removed first
     * @param filepath path of the source document; its last element names the output file
     *                 and the full value is recorded in the JSON
     * @param dirName  currently unused
     * @return one ratio per line of the re-serialised markup (blank lines are 0); an empty
     *         array if the markup could not be parsed. Ratios computed before a write
     *         failure are still returned.
     */
    public double[] getTagRatioOfFile(String html, String filepath, String dirName) {
        String cleanedHtml = stripNonContent(html);

        // Only the last path element names the output file.
        String filename = Paths.get(filepath).getFileName().toString();
        System.out.println("Output File :" + outputDirPath + "/" + filename);

        // Local rather than a field, so a failed call can never hand back the ratios
        // of a previously processed file.
        double[] ratios = new double[0];
        try {
            Parser parser = new Parser(cleanedHtml);
            NodeList nodes = parser.parse(null);
            // Serialise once; the text is scanned twice (count, then score).
            String parsedHtml = nodes.toHtml();

            ratios = new double[countLines(parsedHtml)];
            HashMap<String, String> metaTagsMap = new HashMap<String, String>();
            StringBuilder content = new StringBuilder();
            double ratioSum = 0.0;
            int contentLines = 0;

            try (BufferedReader reader = new BufferedReader(new StringReader(parsedHtml))) {
                String rawLine;
                for (int i = 0; (rawLine = reader.readLine()) != null; i++) {
                    String line = rawLine.trim();
                    if (line.isEmpty()) {
                        continue; // blank lines keep the default ratio of 0
                    }
                    ratios[i] = computeTextToTagRatio(line);
                    populateMetaTags(line, metaTagsMap);

                    if (ratios[i] >= CONTENT_THRESHOLD) {
                        System.out.println("Tag Ratio : " + ratios[i]);
                        System.out.println(line);

                        content.append(line).append("\n");
                        ratioSum += ratios[i];
                        contentLines++;
                    }
                }
            }

            // 0.0 / 0 is NaN when no line reached the threshold; how NaN is rendered is
            // left to JSONObject.toJSONString().
            writeJson(filename, filepath, ratioSum / contentLines, content.toString(), metaTagsMap);
        } catch (ParserException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return ratios;
    }

    /** Removes HTML comments, script elements and style elements, ignoring tag case. */
    private static String stripNonContent(String html) {
        return html.replaceAll("(?s)<!--.*?-->", "")
                .replaceAll("(?is)<script.*?>.*?</script>", "")
                .replaceAll("(?is)<style.*?>.*?</style>", "");
    }

    /** Counts the lines of {@code text} exactly as {@link BufferedReader#readLine()} splits them. */
    private static int countLines(String text) throws IOException {
        int count = 0;
        try (BufferedReader reader = new BufferedReader(new StringReader(text))) {
            while (reader.readLine() != null) {
                count++;
            }
        }
        return count;
    }

    /** Builds the JSON summary and writes it, UTF-8 encoded, to the output directory. */
    @SuppressWarnings("unchecked") // JSONObject.put(...) is an unchecked call
    private void writeJson(String filename, String filepath, double avgTagRatio, String content,
            HashMap<String, String> metaTagsMap) throws IOException {
        //Create and write JSON
        JSONObject obj = new JSONObject();
        obj.put("fileName", filename);
        obj.put("absoluteFilePath", filepath);
        obj.put("avgTagRatio", avgTagRatio);
        obj.put("content", content);

        JSONArray array = new JSONArray();
        array.put(metaTagsMap);
        obj.put("meta-tags", array);

        File outputFile = new File(outputDirPath + "/" + filename + ".json");
        // Explicit UTF-8 instead of the platform default charset; try-with-resources
        // closes the file even when the write fails.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {
            writer.write(obj.toJSONString());
        }
    }

    /**
     * Records the {@code name}/{@code content} attributes of a line that starts with
     * {@code "<meta "}; other lines are ignored.
     *
     * @param line        trimmed line of markup
     * @param metaTagsMap receives {@code name -> content}; a later tag with the same name
     *                    overwrites an earlier one
     */
    private void populateMetaTags(String line, HashMap<String, String> metaTagsMap) {
        if (!line.startsWith("<meta ")) {
            return;
        }
        Element tag = Jsoup.parse(line).select("meta").first();
        if (tag == null) {
            return; // the fragment did not yield a meta element (e.g. truncated tag)
        }
        metaTagsMap.put(tag.attr("name"), tag.attr("content"));
    }

    /**
     * Calculates the text-to-tag ratio of one line.
     *
     * <p>Scanning left to right: a {@code '<'} counts one tag and skips to the next
     * {@code '>'} (scanning stops if there is none); a {@code '>'} met before any tag was
     * counted is taken as the end of a tag opened on an earlier line, so it counts one tag
     * and discards the text counted so far; every other character counts as text.
     *
     * @param line line of markup to score
     * @return number of text characters divided by the number of tags, where a line
     *         without tags is treated as having one
     */
    private double computeTextToTagRatio(String line) {
        int tag = 0;
        int text = 0;

        for (int i = 0; i < line.length(); i++) {
            char c = line.charAt(i);
            if (c == '<') { //start tag
                tag++;
                i = line.indexOf('>', i);
                if (i == -1) {
                    break; // unterminated tag: nothing after it is text
                }
            } else if (tag == 0 && c == '>') { //close tag
                text = 0;
                tag++;
            } else { //just text
                text++;
            }
        }
        if (tag == 0) {
            tag = 1; // avoid dividing by zero on tag-free lines
        }
        if (text != 0) {
            System.out.println("\nLine : " + line + "\nTag : " + tag + "\n" + "Text : " + text);
        }
        return (double) text / (double) tag;
    }

    /**
     * Re-emits {@code xhtmlOutput} line by line with every line terminated by a single
     * {@code '\n'}, prints the result to standard output and returns it.
     *
     * <p>The text is read straight from the string, so no character is lost to a
     * charset round trip.
     *
     * @param xhtmlOutput markup to normalise
     * @return the normalised markup, or {@code null} if reading failed
     */
    public String readXhtmlData(String xhtmlOutput) {
        StringBuilder lines = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new StringReader(xhtmlOutput))) {
            String line;
            while ((line = br.readLine()) != null) {
                lines.append(line);
                lines.append("\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
        String normalized = lines.toString();
        System.out.println(normalized);
        return normalized;
    }

}

/**
 * Command-line driver: for every regular file directly inside the input directory, asks
 * {@link TikaUtil} for its XHTML and hands that to {@link TTR}, which writes one JSON
 * file per input into the output directory.
 *
 * <p>Usage: {@code TextToTagRatio <inputDir> <outputDir>}
 */
public class TextToTagRatio {

    private static String inputDirPath;
    private static String outputDirPath;

    /**
     * Runs the tag-ratio extraction over a directory.
     *
     * @param args {@code args[0]} is the input directory, {@code args[1]} the output
     *             directory
     * @throws IllegalArgumentException if fewer than two arguments are given or the input
     *                                  path cannot be listed as a directory
     */
    public static void main(String[] args) {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("Usage: TextToTagRatio <inputDir> <outputDir>");
        }

        TTR ttr = new TTR();
        TikaUtil tikaUtil = new TikaUtil();
        TextToTagRatio.inputDirPath = args[0];
        TextToTagRatio.outputDirPath = args[1];
        ttr.outputDirPath = outputDirPath;

        File root = new File(inputDirPath);
        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] listDir = root.listFiles();
        if (listDir == null) {
            throw new IllegalArgumentException("Not a readable directory: " + inputDirPath);
        }

        for (File d : listDir) {
            if (!d.isFile()) {
                continue; // sub-directories are not descended into
            }

            //Get File Path
            String fileAbsolutePath = d.getAbsoluteFile().toString();
            try {
                //Get xhtml for file from tika
                String xhtmlOutput = tikaUtil.parseToHTML(fileAbsolutePath);

                //Get Tag Ratio data for File
                String output = ttr.readXhtmlData(xhtmlOutput);

                ttr.getTagRatioOfFile(output, fileAbsolutePath, d.getName());

            } catch (IOException | SAXException | TikaException e) {
                // A single unreadable file must not stop the batch, but keep the cause visible.
                System.out.println("Tika Exception occurred for file: " + fileAbsolutePath);
                e.printStackTrace();
            }
        }
    }

}

Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.apache.tika.parser.tagratio;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.ToXMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Tika utility that runs Tika's {@link AutoDetectParser} over a file.
 */
public class TikaUtil {

    /**
     * Parses the file at {@code filePath} with an {@link AutoDetectParser} feeding a
     * {@link ToXMLContentHandler}, and returns that handler's string form (the
     * intermediate XHTML used by the tag-ratio code).
     *
     * @param filePath path of the file to parse
     * @return the content handler's output as a string
     * @throws IOException   if the file cannot be opened or read
     * @throws SAXException  if the parser reports a SAX error
     * @throws TikaException if the parser reports a Tika error
     */
    public String parseToHTML(String filePath) throws IOException, SAXException, TikaException {
        ContentHandler handler = new ToXMLContentHandler();
        AutoDetectParser parser = new AutoDetectParser(); //Invoke Tika's AutoDetect Parser
        Metadata metadata = new Metadata();

        // Open the stream in the resource header, after everything else is constructed,
        // so it is always closed and cannot leak if a constructor above throws.
        try (InputStream stream = new FileInputStream(new File(filePath))) {
            parser.parse(stream, handler, metadata);
        }
        return handler.toString();
    }

}