Permalink
Browse files

Hackathon (#227)

* Make CreateIndex runnable as a function call.

* Implement batch mode via the CLI.
- Run multiple queries simultaneously, outputting the results to separate table files.
- Index arbitrary sentence corpora as opposed to the existing pre-processed set.
- Persist new corpus index between multiple executions.

* Reformat based on latest SBT plugin.

* Fix cleanup of non-empty temporary directories.

* Remove reference to driver's available cores and replace with an arbitrary value.
  • Loading branch information...
ColinArenz committed Aug 26, 2016
1 parent f5a7d52 commit 1ac236f50e608a1f276a858608021dec0e345207
Showing with 580 additions and 96 deletions.
  1. +4 −0 build.sbt
  2. +19 −1 project/Dependencies.scala
  3. +1 −2 project/plugins.sbt
  4. +5 −5 src/main/resources/application.conf
  5. +2 −2 src/main/resources/logback.xml
  6. +2 −4 src/main/scala/org/allenai/ike/BlackLabResult.scala
  7. +0 −2 src/main/scala/org/allenai/ike/GroupedBlackLabResult.scala
  8. +339 −0 src/main/scala/org/allenai/ike/IkeBatchSearch.scala
  9. +53 −0 src/main/scala/org/allenai/ike/IkeKryoRegistrator.scala
  10. +17 −0 src/main/scala/org/allenai/ike/Interval.scala
  11. +1 −0 src/main/scala/org/allenai/ike/JsonSerialization.scala
  12. +57 −3 src/main/scala/org/allenai/ike/SearchApp.scala
  13. +10 −4 src/main/scala/org/allenai/ike/SearchResultGrouper.scala
  14. +42 −41 src/main/scala/org/allenai/ike/index/CreateIndex.scala
  15. +10 −11 src/main/scala/org/allenai/ike/ml/HitAnalyzer.scala
  16. +1 −1 src/main/scala/org/allenai/ike/ml/QueryGeneralizer.scala
  17. +3 −3 src/main/scala/org/allenai/ike/ml/QuerySuggester.scala
  18. +2 −4 src/main/scala/org/allenai/ike/ml/compoundop/EvaluatedOp.scala
  19. +2 −2 src/main/scala/org/allenai/ike/ml/compoundop/OpConjunctionOfDisjunctions.scala
  20. +1 −1 src/main/scala/org/allenai/ike/ml/queryop/GeneralizingOpGenerator.scala
  21. +1 −1 src/main/scala/org/allenai/ike/ml/queryop/SimilarPhraseMatchTracker.scala
  22. +1 −1 src/main/scala/org/allenai/ike/ml/queryop/SpecifyingOpGenerator.scala
  23. +1 −2 src/main/scala/org/allenai/ike/ml/subsample/Sampler.scala
  24. +2 −2 src/main/scala/org/allenai/ike/ml/subsample/SpanQueryFilterByCaptureGroups.scala
  25. +1 −1 src/main/scala/org/allenai/ike/ml/subsample/SpanQueryStartAt.scala
  26. +2 −2 src/main/scala/org/allenai/ike/ml/subsample/SpansFilterByCaptureGroups.scala
  27. +1 −1 src/main/scala/org/allenai/ike/ml/subsample/SpansStartAt.scala
View
@@ -29,6 +29,8 @@ libraryDependencies ++= Seq(
allenAiCommon,
allenAiTestkit,
allenAiDatastore,
hadoopModule("hadoop-aws"),
hadoopModule("hadoop-mapreduce"),
nlpstackModule("tokenize") exclude("org.allenai", "datastore_2.11"),
nlpstackModule("postag") exclude("org.allenai", "datastore_2.11"),
nlpstackModule("chunk") exclude("org.allenai", "datastore_2.11"),
@@ -39,6 +41,7 @@ libraryDependencies ++= Seq(
lucene("highlighter"),
lucene("queries"),
lucene("queryparser"),
sparkModule("core"),
"com.typesafe.slick" %% "slick" % "2.1.0",
"com.github.tminglei" %% "slick-pg" % "0.8.2",
"com.typesafe.play" %% "play-json" % "2.3.8",
@@ -64,6 +67,7 @@ conflictManager := ConflictManager.default
dependencyOverrides ++= Set(
"org.allenai.common" %% "common-core" % "1.0.13",
sprayJson,
"com.fasterxml.jackson.core" % "jackson-databind" % "2.4.4",
"org.scala-lang.modules" %% "scala-parser-combinators" % "1.0.3",
"org.scala-lang.modules" %% "scala-xml" % "1.0.2",
"commons-codec" % "commons-codec" % "1.6",
View
@@ -4,10 +4,28 @@ import sbt._
import sbt.Keys._
/** Dependency coordinates and helper builders shared by the build definition.
  *
  * Each `*Module` helper takes the artifact suffix/id and returns the module
  * pinned to the version used by this project, with the exclusions listed
  * on the helper already applied.
  */
object Dependencies extends CoreDependencies {
  // Datastore client. Defined exactly once: a second, exclusion-free definition
  // of the same name would not compile and would drop the exclusion below.
  val allenAiDatastore = "org.allenai.datastore" %% "datastore" % "1.0.7" excludeAll (
    // This conflicts with aws-java-sdk 1.7.4 in hadoop.
    ExclusionRule(organization = "com.amazonaws", name = "aws-java-sdk-s3")
  )

  /** Builds the `org.apache.hadoop` artifact `id` at version 2.7.2, excluding
    * Guava, `javax.servlet` and the `slf4j-log4j12` binding.
    */
  def hadoopModule(id: String) = "org.apache.hadoop" % id % "2.7.2" excludeAll (
    ExclusionRule(organization = "com.google.guava"),
    ExclusionRule(organization = "javax.servlet"),
    ExclusionRule(organization = "org.slf4j", name = "slf4j-log4j12")
  )

  val luceneGroup = "org.apache.lucene"
  val luceneVersion = "4.2.1"

  /** Builds the `lucene-<part>` artifact at [[luceneVersion]]. */
  def lucene(part: String) = luceneGroup % s"lucene-${part}" % luceneVersion

  val nlpstackVersion = "1.10"

  /** Builds the cross-versioned `nlpstack-<id>` artifact at [[nlpstackVersion]]. */
  def nlpstackModule(id: String) = "org.allenai.nlpstack" %% s"nlpstack-${id}" % nlpstackVersion

  /** Builds the cross-versioned `spark-<id>` artifact at version 1.6.1, excluding
    * Guava, `org.apache.commons`, `org.codehaus.jackson` and the `slf4j-log4j12` binding.
    */
  def sparkModule(id: String) = "org.apache.spark" %% s"spark-$id" % "1.6.1" excludeAll (
    ExclusionRule(organization = "com.google.guava"),
    ExclusionRule(organization = "org.apache.commons"),
    ExclusionRule(organization = "org.codehaus.jackson"),
    ExclusionRule(organization = "org.slf4j", name = "slf4j-log4j12")
  )
}
View
@@ -1,2 +1 @@
addSbtPlugin("org.allenai.plugins" % "allenai-sbt-plugins" % "2015.06.05-0")
addSbtPlugin("org.allenai.plugins" % "allenai-sbt-plugins" % "1.3.0")
@@ -33,19 +33,19 @@ IkeToolWebapp = {
// location = file
// description = "My Blacklab-indexed corpus"
// path = /path/to/index-directory
//}
//}
]
}
Tablestore = {
db = {
// NOTE: Before running locally, uncomment the below lines and specify values for keys `url` to point to your PostgreSQL JDBC
// NOTE: Before running locally, uncomment the below lines and specify values for keys `url` to point to your PostgreSQL JDBC
// link, `user` and `password`.
// AI2 Internal users: use the database in the `test` deploy environment and the same username. You can get the password
// from this file in the ops-keystore in S3.
// url =
// user =
// password =
url = "jdbc:postgresql://"${POSTGRES_DB}":5432/okcorpus"
user = "okcorpus"
password = ${POSTGRES_PASSWORD}
}
}
@@ -21,13 +21,13 @@
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>%date{ISO8601} %-5level %logger{36} - %msg%n</pattern>
<pattern>%date{ISO8601} [%thread] %-5level %logger{36} - %msg%n</pattern>
</encoder>
</appender>
<appender name="SYSLOG-TLS" class="com.papertrailapp.logback.Syslog4jAppender">
<layout class="ch.qos.logback.classic.PatternLayout">
<pattern>%-5level %logger{35}: %m%n%xEx</pattern>
<pattern>%-5level [%thread] %logger{35}: %m%n%xEx</pattern>
</layout>
<syslogConfig class="org.productivity.java.syslog4j.impl.net.tcp.ssl.SSLTCPNetSyslogConfig">
@@ -1,7 +1,5 @@
package org.allenai.ike
import org.allenai.common.immutable.Interval
import org.allenai.blacklab.search.{ Hit, Hits, Kwic, Span }
import scala.collection.JavaConverters._
@@ -31,7 +29,7 @@ case object BlackLabResult {
} yield data
}
def toInterval(span: Span): Interval = Interval.open(span.start, span.end)
def toInterval(span: Span): Interval = Interval(span.start, span.end)
def captureGroups(hits: Hits, hit: Hit, shift: Int): Map[String, Option[Interval]] = {
val names = hits.getCapturedGroupNames.asScala
@@ -56,7 +54,7 @@ case object BlackLabResult {
): Option[BlackLabResult] = {
val kwic = hits.getKwic(hit, kwicSize)
val data = wordData(hits, kwic)
val offset = Interval.open(kwic.getHitStart, kwic.getHitEnd)
val offset = Interval(kwic.getHitStart, kwic.getHitEnd)
// TODO: https://github.com/allenai/okcorpus/issues/30
if (hits.hasCapturedGroups) {
val shift = hit.start - kwic.getHitStart
@@ -1,7 +1,5 @@
package org.allenai.ike
import org.allenai.common.immutable.Interval
/** A [[BlackLabResult]] paired with the sequence of [[Interval]] keys attached to it.
  *
  * @param keys the intervals associated with this result
  * @param result the search result the keys belong to
  */
case class KeyedBlackLabResult(
  keys: Seq[Interval],
  result: BlackLabResult
)
case class GroupedBlackLabResult(
Oops, something went wrong.

0 comments on commit 1ac236f

Please sign in to comment.