Skip to content
This repository has been archived by the owner on May 24, 2024. It is now read-only.

Commit

Permalink
Implemented tika-based extraction so that users can process html, pdf…
Browse files Browse the repository at this point in the history
… and txt files.
  • Loading branch information
dweiss committed Aug 8, 2014
1 parent a831a26 commit 92a3714
Show file tree
Hide file tree
Showing 5 changed files with 171 additions and 177 deletions.
140 changes: 71 additions & 69 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

<modelVersion>4.0.0</modelVersion>

Expand Down Expand Up @@ -48,7 +49,7 @@
<name>Stanisław Osiński</name>
<email>stanislaw.osinski@carrotsearch.com</email>
</developer>

<developer>
<id>dawid.weiss</id>
<name>Dawid Weiss</name>
Expand All @@ -59,14 +60,14 @@
<!-- Global properties. -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.6</maven.compiler.source>
<maven.compiler.target>1.6</maven.compiler.target>
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
</properties>


<!-- Dependencies. -->
<dependencies>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand All @@ -77,7 +78,12 @@
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.1.0</version>
<version>4.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>4.6.0</version>
</dependency>
<dependency>
<groupId>args4j</groupId>
Expand All @@ -87,7 +93,19 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>11.0.2</version>
<version>17.0</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.5</version>
<type>jar</type>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.5</version>
<type>jar</type>
</dependency>
</dependencies>

Expand All @@ -111,77 +129,61 @@
</plugins>
</pluginManagement>


<!-- Configure maven release plugin to make use of the profiles. -->
<plugins>
<!-- Configure assembly of ZIP and TAR.GZ bundles (stand-alone distributions). -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<useReleaseProfile>true</useReleaseProfile>
<releaseProfiles>release,maven-release</releaseProfiles>
<pushChanges>false</pushChanges>
<scmCommentPrefix>Release: </scmCommentPrefix>
<descriptors>
<descriptor>src/main/assembly/bin.xml</descriptor>
</descriptors>
<attach>false</attach>
<appendAssemblyId>false</appendAssemblyId>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>


<!-- Reports -->
<reporting>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-report-plugin</artifactId>
<version>2.7.1</version>
<artifactId>maven-jar-plugin</artifactId>
<configuration>
<archive>
<manifest>
<classpathPrefix>lib/</classpathPrefix>
<addClasspath>true</addClasspath>
<mainClass>org.carrot2.folder2index.Folder2IndexApp</mainClass>
</manifest>
</archive>
</configuration>
</plugin>
</plugins>
</reporting>


<profiles>
<!-- No tests, no additional checks. -->
<profile>
<id>fastinstall</id>
<properties>
<maven.test.skip>true</maven.test.skip>
</properties>
<build>
<defaultGoal>install</defaultGoal>
</build>
</profile>

<!-- Release mode. -->
<profile>
<id>release</id>

<build>
<defaultGoal>install</defaultGoal>

<plugins>
<!-- Configure assembly of ZIP and TAR.GZ bundles (stand-alone distributions). -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>

<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>2.1</version>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<descriptors>
<descriptor>src/main/assembly/bin.xml</descriptor>
</descriptors>
<attach>false</attach>
<appendAssemblyId>false</appendAssemblyId>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
<includeScope>runtime</includeScope>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</profile>
</profiles>
</execution>
</executions>
</plugin>

</plugins>
</build>
</project>
28 changes: 3 additions & 25 deletions src/main/assembly/bin.xml
Original file line number Diff line number Diff line change
Expand Up @@ -9,41 +9,19 @@
</formats>

<fileSets>
<fileSet>
<directory></directory>
<outputDirectory>/</outputDirectory>
<includes>
<include>*.LICENSE</include>
<include>README.rdoc</include>
<include>test/</include>
</includes>
</fileSet>
<fileSet>
<directory>${project.build.directory}</directory>
<outputDirectory>/lib</outputDirectory>
<outputDirectory></outputDirectory>
<includes>
<include>*.jar</include>
</includes>
</fileSet>
</fileSets>
<files>
<file>
<source>src/main/bin/folder2index.sh</source>
<outputDirectory>/</outputDirectory>
<fileMode>755</fileMode>
<lineEnding>unix</lineEnding>
</file>
<file>
<source>src/main/bin/folder2index.cmd</source>
<outputDirectory>/</outputDirectory>
<lineEnding>dos</lineEnding>
</file>
</files>

<dependencySets>
<dependencySet>
<outputDirectory>/lib</outputDirectory>
<useTransitiveDependencies>false</useTransitiveDependencies>
<useTransitiveDependencies>true</useTransitiveDependencies>
</dependencySet>
</dependencySets>
</assembly>
</assembly>
26 changes: 0 additions & 26 deletions src/main/bin/folder2index.cmd

This file was deleted.

8 changes: 0 additions & 8 deletions src/main/bin/folder2index.sh

This file was deleted.

Loading

0 comments on commit 92a3714

Please sign in to comment.