In [1]:
import os
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [2]:
# Spark session for this notebook: local master using all cores ("local[*]"),
# with the Spark NLP artifact requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("ner")
    .master("local[*]")
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "2G")
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.6.2")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

In [3]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [4]:
import time

# Stage 1: reads the "text" column and writes the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: reads "document" and writes "sentence".
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: reads "document" (not "sentence") and writes "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Stage 4: matcher configured with entities.txt from the current working
# directory. The output column name "entites" (sic) is kept as-is because the
# finisher below and later cells reference it.
extractor = (
    TextMatcher()
    .setEntities("file:///" + os.getcwd() + "/entities.txt")
    .setInputCols(["token", "sentence"])
    .setOutputCol("entites")
)

# Stage 5: finisher over the matcher output, with keys excluded and
# annotation cleaning enabled.
finisher = (
    Finisher()
    .setInputCols(["entites"])
    .setIncludeKeys(False)
    .setCleanAnnotations(True)
)

# Assemble the stages in execution order.
pipeline = Pipeline(
    stages=[documentAssembler, sentenceDetector, tokenizer, extractor, finisher]
)


In [5]:
# Load the input data to be annotated, capped at 1000 rows.
# NOTE(review): os.getcwd() is concatenated directly with "../../../data/..."
# (no separator in between) — confirm the resulting URI resolves to the
# intended directory before changing it.
data = (
    spark.read
    .parquet("file:///" + os.getcwd() + "../../../data/sentiment.parquet")
    .limit(1000)
)
data.cache()
data.count()  # presumably here to trigger evaluation of the cached frame
data.show(20)

+------+---------+--------------------+
|itemid|sentiment|                text|
+------+---------+--------------------+
|393940|        1|@Natasja_Cupcake ...|
|393941|        1|@Natasja_Cupcake ...|
|393942|        0|@Natasja_Cupcake ...|
|393943|        0|@Natasja_Cupcake ...|
|393944|        1|@Natasja_Cupcake ...|
|393945|        1|@renegade37918  I...|
|393946|        0|@renegadejk529 i ...|
|393947|        1|@RenegadeScribe O...|
|393948|        0|@RenegadeSOA513 ....|
|393949|        1|@RenegadeSOA513 J...|
|393950|        0|@RenegadeSOA513 L...|
|393951|        1|@RenegadEuphoriX ...|
|393952|        1|@RenegadeVyper DO...|
|393953|        1|@Renegal Nah, it ...|
|393954|        1|@Renegat Ñ?ÑƒÐ¿Ðµ...|
|393955|        1|@reneilim don't f...|
|393956|        1|@renelannte mouse...|
|393957|        0|@renemonney Jam W...|
|393958|        0|@renemonster i wa...|
|393959|        1|  @renems enviei rs |
+------+---------+--------------------+
only showing top 20 rows



In [6]:
# Fit the pipeline on the loaded sample; the prints mark the start and end of
# the fit call in the cell output.
print("Start fitting")
model = pipeline.fit(data)
print("Fitting is ended")

Start fitting
Fitting is ended


In [7]:
# Apply the fitted pipeline to the same rows and preview the result; the
# finisher output appears as the "finished_entites" column.
extracted = model.transform(data)
extracted.show()

+------+---------+--------------------+----------------+
|itemid|sentiment|                text|finished_entites|
+------+---------+--------------------+----------------+
|393940|        1|@Natasja_Cupcake ...|                |
|393941|        1|@Natasja_Cupcake ...|                |
|393942|        0|@Natasja_Cupcake ...|                |
|393943|        0|@Natasja_Cupcake ...|                |
|393944|        1|@Natasja_Cupcake ...|                |
|393945|        1|@renegade37918  I...|                |
|393946|        0|@renegadejk529 i ...|                |
|393947|        1|@RenegadeScribe O...|                |
|393948|        0|@RenegadeSOA513 ....|                |
|393949|        1|@RenegadeSOA513 J...|                |
|393950|        0|@RenegadeSOA513 L...|                |
|393951|        1|@RenegadEuphoriX ...|                |
|393952|        1|@RenegadeVyper DO...|                |
|393953|        1|@Renegal Nah, it ...|                |
|393954|        1|@Renegat Ñ?Ñƒ

In [8]:
# Project only the finisher output column. Without .show() the cell displays
# just the DataFrame repr (column name and type), not the rows.
extracted.select("finished_entites")

DataFrame[finished_entites: string]

In [9]:
# Persist both the unfitted pipeline definition and the fitted model,
# overwriting any earlier save at the same relative paths.
pipeline.write().overwrite().save("./extractor_pipeline")
model.write().overwrite().save("./extractor_model")

In [10]:
from pyspark.ml import PipelineModel, Pipeline

# Round-trip check: reload both saved artifacts from disk. The reloaded
# Pipeline is not kept; the call only exercises loading.
Pipeline.load("./extractor_pipeline")
sameModel = PipelineModel.load("./extractor_model")

# The reloaded model is applied to the same input frame as the original.
sameModel.transform(data).show()

+------+---------+--------------------+----------------+
|itemid|sentiment|                text|finished_entites|
+------+---------+--------------------+----------------+
|393940|        1|@Natasja_Cupcake ...|                |
|393941|        1|@Natasja_Cupcake ...|                |
|393942|        0|@Natasja_Cupcake ...|                |
|393943|        0|@Natasja_Cupcake ...|                |
|393944|        1|@Natasja_Cupcake ...|                |
|393945|        1|@renegade37918  I...|                |
|393946|        0|@renegadejk529 i ...|                |
|393947|        1|@RenegadeScribe O...|                |
|393948|        0|@RenegadeSOA513 ....|                |
|393949|        1|@RenegadeSOA513 J...|                |
|393950|        0|@RenegadeSOA513 L...|                |
|393951|        1|@RenegadEuphoriX ...|                |
|393952|        1|@RenegadeVyper DO...|                |
|393953|        1|@Renegal Nah, it ...|                |
|393954|        1|@Renegat Ñ?Ñƒ