Necessary imports

In [1]:
import sys
# Extend the module search path two directories above the working directory
# (presumably so a local checkout of the library can be imported — TODO confirm).
sys.path.append('../../')

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

from pathlib import Path

# urlretrieve is imported from a different module depending on the Python
# major version.
if sys.version_info[0] < 3:
    from urllib import urlretrieve
else:
    from urllib.request import urlretrieve


Create a SparkSession, or reuse the existing one when already running inside pyspark

In [2]:
# Build the Spark session for this notebook, or reuse one that already exists
# (getOrCreate). The Spark NLP package is requested through
# spark.jars.packages; the remaining options set the driver memory, the
# maximum result size and the Kryo serializer buffer limit.
spark = (
    SparkSession.builder
    .appName("assertion-status")
    .master("local[2]")
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "2G")
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.6.2")
    .config("spark.kryoserializer.buffer.max", "500m")
    .getOrCreate()
)

In [3]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

We specify the embeddings file and download it if it is not already present locally. AssertionLogReg only uses external embeddings for training.

In [4]:
import time


embeddingsFile = './PubMed-shuffle-win-2.bin'
embeddingsUrl = 'https://s3.amazonaws.com/auxdata.johnsnowlabs.com/PubMed-shuffle-win-2.bin'

# Download the embeddings only when they are not already cached locally; this
# may take a couple of minutes. The transfer is written to a temporary ".part"
# file and renamed once it has finished, so an interrupted run never leaves a
# truncated file that later runs would mistake for the complete embeddings.
if not Path(embeddingsFile).is_file():
    partialFile = Path(embeddingsFile + '.part')
    urlretrieve(embeddingsUrl, str(partialFile))
    partialFile.replace(embeddingsFile)

# Pipeline stages. Each stage reads the columns named in its input setter and
# writes the column named in setOutputCol. The chains are wrapped in
# parentheses instead of backslash continuations so that no statement depends
# on the blank line that follows it.

# Wraps the raw "sentence" column into the "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("sentence")
    .setOutputCol("document")
)

# Produces "token" from "document".
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained part-of-speech model; its "pos" output feeds the NER stage.
pretrained_pos = (
    PerceptronModel().pretrained()
    .setInputCols(["document", "token"])
    .setOutputCol("pos")
)

# Pretrained CRF-based NER model.
pretrained_ner = (
    NerCrfModel().pretrained()
    .setInputCols(["document", "token", "pos"])
    .setOutputCol("ner")
)

# Converts the "ner" output into the "nerconverter" column consumed by the
# assertion stage and the finisher.
ner_converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("nerconverter")
)

# Trainable assertion-status stage: learns from the "label" column.
# NOTE(review): setBefore/setAfter presumably size the context window around
# each target, and setEmbeddingsSource presumably takes (path, dimensions,
# format code) — confirm against the Spark NLP documentation.
assertion = (
    AssertionLogRegApproach()
    .setLabelCol("label")
    .setInputCols(["document", "nerconverter"])
    .setOutputCol("assertion")
    .setBefore(11)
    .setAfter(13)
    .setEmbeddingsSource(embeddingsFile, 200, 3)
)

# Finishes the NER chunks and assertion results; the prediction cell reads them
# back as "finished_nerconverter" and "finished_assertion".
finisher = (
    Finisher()
    .setInputCols(["nerconverter", "assertion"])
    .setIncludeKeys(False)
)

# Stages are listed in dependency order: every stage consumes columns that an
# earlier stage produced.
pipeline_stages = [
    documentAssembler,
    tokenizer,
    pretrained_pos,
    pretrained_ner,
    ner_converter,
    assertion,
    finisher,
]
pipeline = Pipeline(stages=pipeline_stages)


This example is somewhat artificial because we are using a standard NER model; we therefore create a dataset containing person names, just to make sure the assertion status annotator uses Person as its TargetNerLabel.

In [5]:
# Input data to be annotated: one (sentence, assertion label) pair per row.
labelled_sentences = [
    ["hello Peter how are you? And how is Robert Douglas?", "Affirmed"],
    ["master of yards", "Negated"],
    ["what is this", "Negated"],
    ["Michael Jordan is a good person", "Negated"],
    ["Robert Deniro is not a good person", "Negated"],
    ["Learn from Carlos Rodriguez", "Negated"],
    ["Who is Jhon here?", "Negated"],
    ["my friend Lucas", "Affirmed"],
]
data = spark.createDataFrame(labelled_sentences).toDF("sentence", "label")
data.show()

+--------------------+--------+
|            sentence|   label|
+--------------------+--------+
|hello Peter how a...|Affirmed|
|     master of yards| Negated|
|        what is this| Negated|
|Michael Jordan is...| Negated|
|Robert Deniro is ...| Negated|
|Learn from Carlos...| Negated|
|   Who is Jhon here?| Negated|
|     my friend Lucas|Affirmed|
+--------------------+--------+



We fit the pipeline on the data

In [6]:
# Fit the whole pipeline on the labelled data and report the wall-clock
# duration of the fit in seconds.
fit_started = time.time()
print("Start fitting")
model = pipeline.fit(data)
print("Fitting is ended")
fit_seconds = time.time() - fit_started
print(fit_seconds)

Start fitting
Fitting is ended
292.7379539012909


Prediction with finisher output

In [7]:
# Apply the fitted pipeline to the data collapsed into a single partition.
result = model.transform(data.repartition(1))

# The timer starts after transform() returns, so the reported seconds cover
# only the select(...).show() call below.
show_started = time.time()
finished_view = result.select("sentence", "finished_nerconverter", "finished_assertion")
finished_view.show()
print(time.time() - show_started)

+--------------------+---------------------+------------------+
|            sentence|finished_nerconverter|finished_assertion|
+--------------------+---------------------+------------------+
|hello Peter how a...| Peter@Robert Douglas| Affirmed@Affirmed|
|     my friend Lucas|                Lucas|          Affirmed|
|   Who is Jhon here?|             Who@Jhon|   Negated@Negated|
|Learn from Carlos...|     Carlos Rodriguez|           Negated|
|     master of yards|                     |                NA|
|Michael Jordan is...|       Michael Jordan|           Negated|
|        what is this|                     |                NA|
|Robert Deniro is ...|        Robert Deniro|           Negated|
+--------------------+---------------------+------------------+

11.243267297744751
