Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

initial version of interface to Apache Tika

  • Loading branch information...
commit 9138769c0c804608130f6f84c860671c12383c80 0 parents
@alexott authored
Showing with 62 additions and 0 deletions.
  1. +5 −0 .gitignore
  2. +15 −0 README
  3. +8 −0 project.clj
  4. +34 −0 src/tika.clj
5 .gitignore
@@ -0,0 +1,5 @@
+pom-generated.xml
+Manifest.txt
+tika.jar
+lib/*.jar
+*~
15 README
@@ -0,0 +1,15 @@
+# tika
+
+A Clojure interface to the Apache Tika project.
+
+## Usage
+
+FIXME: write
+
+## Installation
+
+FIXME: write
+
+## License
+
+FIXME: write
8 project.clj
@@ -0,0 +1,8 @@
+(defproject tika-clj "1.0.0-SNAPSHOT" ; Leiningen project definition: artifact name and version
+ :description "Interface to Apache Tika"
+ :dependencies [[org.clojure/clojure "1.1.0"] ; regular (non-dev) dependencies
+ [org.clojure/clojure-contrib "1.1.0"]
+ [org.apache.tika/tika-parsers "0.6"] ; provides the org.apache.tika classes imported by src/tika.clj
+ ]
+ :dev-dependencies [[leiningen/lein-swank "1.2.0-SNAPSHOT"]]) ; development-only tooling
+
34 src/tika.clj
@@ -0,0 +1,34 @@
+;; Clojure Interface to Apache Tika library
+
+(ns tika
+ (:import (java.io InputStream File FileInputStream))
+ (:import (org.apache.tika.parser Parser AutoDetectParser ParseContext))
+ (:import (org.apache.tika.metadata Metadata))
+ (:import (org.apache.tika.sax BodyContentHandler))
+ )
+
+(defn conv-metadata
+  "Converts Tika Metadata to a map of lower-cased keyword -> seq of values (locale-independent keys)."
+  [#^Metadata mdata]
+  (let [key-of (fn [#^String n] (keyword (.toLowerCase n java.util.Locale/ENGLISH)))]
+    (into {} (for [#^String n (.names mdata)] [(key-of n) (seq (.getValues mdata n))]))))
+
+(defn parse-stream
+  "Parses a Tika-supported stream and returns a map {:metadata m :text s}.
+  The stream is always closed, even when the parser throws."
+  [#^InputStream ifile]
+  (let [parser   (new AutoDetectParser)
+        context  (new ParseContext)
+        metadata (new Metadata)
+        handler  (new BodyContentHandler)]
+    (.set context Parser parser)
+    (try
+      (.parse parser ifile handler metadata context)
+      (finally (.close ifile)))
+    {:metadata (conv-metadata metadata)
+     :text     (.toString handler)}))
+
+(defn parse-file
+  "Parses a Tika-supported file; the file stream is closed even when parsing fails."
+  [#^File file]
+  (with-open [in (new FileInputStream file)] (parse-stream in)))
Please sign in to comment.
Something went wrong with that request. Please try again.