/
CoreNLPFeatureExtractor.scala
45 lines (41 loc) · 1.47 KB
/
CoreNLPFeatureExtractor.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
package keystoneml.nodes.nlp
import edu.arizona.sista.processors.Processor
import edu.arizona.sista.processors.fastnlp.FastNLPProcessor
import org.apache.spark.rdd.RDD
import keystoneml.workflow.Transformer
/**
* Transformer that uses CoreNLP to (in order):
* - Tokenize document
* - Lemmatize tokens
* - Replace entities w/ their type (e.g. "Jon" => "NAME", "Paris" => "PLACE")
* - Return n-grams for the above (respecting sentence boundaries)
* Note: Much slower than just using [[Tokenizer]] followed by [[NGramsFeaturizer]]
*
* @param orders The size of the n-grams to output
*/
case class CoreNLPFeatureExtractor(orders: Seq[Int]) extends Transformer[String, Seq[String]] {
  // FastNLPProcessor is not serializable; @transient lazy ensures it is
  // constructed fresh on each executor rather than shipped with the closure.
  @transient lazy val proc = new FastNLPProcessor()

  // Compiled once instead of on every normalize() call.
  // NOTE(review): the '+' inside the character class is a literal plus sign,
  // so '+' characters survive normalization. This looks like a typo for
  // "[^a-zA-Z0-9\\s]+"; pattern kept byte-identical to preserve behavior.
  private val nonAlphaNumeric = "[^a-zA-Z0-9\\s+]".r

  /**
   * Tokenizes, POS-tags, lemmatizes, and NER-tags the input document, replaces
   * each recognized-entity token with its entity label, then emits every n-gram
   * (one size per entry in `orders`) as a space-joined string, respecting
   * sentence boundaries.
   *
   * @param in Raw document text.
   * @return All n-grams of the requested sizes across all sentences.
   */
  override def apply(in: String): Seq[String] = {
    val doc = proc.mkDocument(in)
    proc.tagPartsOfSpeech(doc)
    proc.lemmatize(doc)
    proc.recognizeNamedEntities(doc)
    // NOTE(review): clear() appears to drop only the raw text to save memory;
    // the sentence annotations read below must persist — confirm against the
    // sista processors Document API.
    doc.clear()

    // One token array per sentence: entity label where NER fired ("O" means
    // no entity), otherwise the normalized lemma.
    // NOTE(review): assumes entities/lemmas Options are populated by the
    // annotator calls above; .get would throw otherwise.
    val sentences = doc.sentences.map { s =>
      Array.tabulate(s.words.length) { i =>
        val entity = s.entities.get(i)
        if (entity != "O") entity else normalize(s.lemmas.get(i))
      }
    }

    // Same ordering as the original nested map/flatMap chains:
    // for each n-gram size, for each sentence, each window in order.
    for {
      n <- orders.toList
      sentence <- sentences
      gram <- sentence.sliding(n)
    } yield gram.mkString(" ")
  }

  /** Lowercases `s` and strips characters outside the whitelist above. */
  def normalize(s: String): String = nonAlphaNumeric.replaceAllIn(s, "").toLowerCase
}