# Import Dependencies

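First, we need to add the required dependencies to the notebook kernel: `spark-solr` for reading from Solr, and Stanford CoreNLP (including its English models) for named-entity recognition: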

In [None]:
%AddDeps com.lucidworks.spark spark-solr 3.6.0 --transitive
%AddDeps edu.stanford.nlp stanford-corenlp 3.9.2 --transitive
%AddDeps edu.stanford.nlp stanford-corenlp 3.9.2 --classifier models-english

# Extract Named Entities

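Next, we extract the named entities from the `contents` field of every Solr document that matches the query. The output is a single file (`part-00000`) in the output directory (`out` by default), containing one entity per line.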

In [None]:
import com.lucidworks.spark.rdd.SelectSolrRDD
import edu.stanford.nlp.pipeline.{CoreDocument, StanfordCoreNLP}
import org.apache.hadoop.fs.{FileSystem, Path}
import java.util.Properties
import collection.JavaConversions._

// Solr's ZooKeeper URL
val SOLR = "localhost:9983"

// The Solr collection
val INDEX = "test"

// The Solr query
val QUERY = "*:*"

// The number of partitions
val PARTITIONS = 8

// Filter for entity type (PERSON, ORGANIZATION, LOCATION, DATE, etc.)
val ENTITY_TYPE = "DATE"

// Output directory
val OUT_DIR = "out"

// Recursively delete the output directory left over from a previous run, so
// that the save at the end of this cell starts from a clean location.
// The FileSystem is resolved from the path itself rather than taken from the
// default filesystem, so OUT_DIR may also be a fully qualified URI on a
// different scheme (e.g. file:// or s3a://) without triggering a "Wrong FS"
// error. For a plain relative path such as "out" the behaviour is unchanged.
new Path(OUT_DIR).getFileSystem(sc.hadoopConfiguration).delete(new Path(OUT_DIR), true)

// Named-entity extraction job:
//   1. read the documents matching QUERY from the Solr collection INDEX,
//   2. run Stanford CoreNLP NER over the "contents" field of every document,
//   3. keep the mentions whose type equals ENTITY_TYPE ("*" keeps all types),
//   4. write the result to OUT_DIR as a single text file.
//
// NOTE: `rdd` is bound to the result of saveAsTextFile (Unit), not to an RDD;
// the name is kept only so that existing references to it keep compiling.
val rdd = new SelectSolrRDD(SOLR, INDEX, sc)
    .rows(1000)
    .query(QUERY)
    .mapPartitions(docs => {

        // Explicit Java -> Scala collection conversions (.asScala) instead of
        // the deprecated implicit JavaConversions. Imported locally so this
        // block does not depend on any change to the import cell.
        import scala.collection.JavaConverters._

        val props = new Properties()
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner")
        props.setProperty("ner.applyFineGrained", "false")
        props.setProperty("ner.useSUTime", "false")

        // The pipeline is created once per partition and shared by all
        // documents of that partition (presumably because constructing it,
        // i.e. loading the models, is expensive).
        val pipeline = new StanfordCoreNLP(props)

        docs.flatMap(doc => doc.get("contents") match {
            case text: String =>
                val coreDoc = new CoreDocument(text)
                pipeline.annotate(coreDoc)

                // Guard against a null mention list before converting.
                val mentions = Option(coreDoc.entityMentions())
                    .map(_.asScala.toList)
                    .getOrElse(Nil)

                if (ENTITY_TYPE == "*") mentions
                else mentions.filter(cem => cem.entityType() == ENTITY_TYPE)

            // The field is missing (null) or is not a plain String, e.g. a
            // multi-valued field: skip the document instead of failing the
            // whole job with a NullPointerException / ClassCastException.
            case _ => Nil
        })
    })
    // Merge everything into one partition so that exactly one part file
    // (part-00000) is produced.
    .coalesce(1)
    .saveAsTextFile(OUT_DIR)