In [1]:
import json
import os

from google.colab import files

# Ask the user for the license JSON through Colab's upload widget. The widget
# returns a mapping keyed by uploaded file name; it is kept under its own name
# so `license_keys` only ever refers to the parsed license content.
uploaded_files = files.upload()
if not uploaded_files:
    raise ValueError("No license file was uploaded; upload the license JSON and re-run this cell.")

# Only the first uploaded file is read.
license_path = next(iter(uploaded_files))
with open(license_path) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as notebook-level variables.
# globals() is the documented, writable namespace; at cell top level it is the
# same dict the original locals() call returned.
globals().update(license_keys)

# Adding license key-value pairs to environment variables
os.environ.update(license_keys)

Saving spark_nlp_for_healthcare_spark_ocr_5493.json to spark_nlp_for_healthcare_spark_ocr_5493.json


In [2]:
# This is only to setup PySpark and Spark NLP on Colab
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jsl_colab_setup.sh

--2022-06-21 10:40:22--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jsl_colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2128 (2.1K) [text/plain]
Saving to: ‘jsl_colab_setup.sh’


2022-06-21 10:40:22 (28.9 MB/s) - ‘jsl_colab_setup.sh’ saved [2128/2128]



In [3]:
# Run the downloaded Colab setup script.
# The script accepts -p to choose the pyspark version; no flag is passed here,
# so its default (3.1.1) is used.
!bash jsl_colab_setup.sh

setup Colab for PySpark 3.1.1 and Spark NLP 3.4.4
Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Get:7 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Ign:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:9 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic I

In [4]:
# Installing pyspark and spark-nlp.
# $PUBLIC_VERSION is presumably one of the license JSON keys exported by the
# first cell -- TODO confirm the key is present in the license file.
# NOTE(review): pyspark is pinned to 3.1.2 here, while the setup script's log
# reports "PySpark 3.1.1" -- confirm which version is intended.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare from the licensed package index.
# $JSL_VERSION and $SECRET are presumably license JSON keys as well; the
# secret is part of the index URL, so avoid sharing this cell's verbose output.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 65 kB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 95 kB 1.9 MB/s 
[K     |████████████████████████████████| 66 kB 4.3 MB/s 
[?25h

In [5]:
import json
import os

import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel

import warnings

# Suppress every Python warning category for the rest of the notebook.
warnings.filterwarnings('ignore')

# Spark settings handed to sparknlp_jsl.start() below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

print(f"Spark NLP Version : {sparknlp.version()}")
print(f"Spark NLP_JSL Version : {sparknlp_jsl.version()}")

# Start the licensed session with the SECRET read from the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Bare expression: show the session summary as the cell output.
spark

Spark NLP Version : 3.4.4
Spark NLP_JSL Version : 3.5.3


In [6]:
# if you want to start the session with custom params as in start function above
from pyspark.sql import SparkSession

def start(SECRET):
    """Build a local Spark session configured for licensed Spark NLP.

    Args:
        SECRET: License secret; it is inserted into the URL of the
            spark-nlp-jsl jar that Spark is told to load.

    Returns:
        Whatever ``getOrCreate()`` returns for the configured builder.

    Note:
        Reads the notebook-level names ``PUBLIC_VERSION`` and ``JSL_VERSION``,
        which must already be defined when this function is called.
    """
    session_builder = SparkSession.builder.appName("Spark NLP Licensed")
    session_builder = session_builder.master("local[*]")
    session_builder = session_builder.config("spark.driver.memory", "16G")
    session_builder = session_builder.config(
        "spark.serializer", "org.apache.spark.serializer.KryoSerializer"
    )
    session_builder = session_builder.config("spark.kryoserializer.buffer.max", "2000M")
    # Open-source Spark NLP is resolved as a package coordinate...
    session_builder = session_builder.config(
        "spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:" + PUBLIC_VERSION
    )
    # ...while the licensed jar is fetched from the secret-specific URL.
    session_builder = session_builder.config(
        "spark.jars",
        "https://pypi.johnsnowlabs.com/" + SECRET + "/spark-nlp-jsl-" + JSL_VERSION + ".jar",
    )

    return session_builder.getOrCreate()


In [7]:
from sparknlp.pretrained import PretrainedPipeline
# Load the pretrained "ner_model_finder" pipeline (English, clinical/models).
finder_pipeline = PretrainedPipeline(
    "ner_model_finder",
    "en",
    "clinical/models",
)

ner_model_finder download started this may take some time.
Approx size to download 148.6 MB
[OK!]


In [54]:
# Turns the raw "text" column into a "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained healthcare sentence splitter: document -> sentence.
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer splits words in a relevant format for NLP
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on PubMED dataset
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model trained on i2b2 (sampled from MIMIC) dataset
clinical_ner = (
    NerDLModel.pretrained("deidentify_dl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Groups the token-level "ner" tags into "ner_chunk" annotations.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

pipeline_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
]
nlpPipeline = Pipeline(stages=pipeline_stages)

# The pipeline is fitted on a single empty-string row.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)


sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
deidentify_dl download started this may take some time.
Approximate size to download 14.1 MB
[OK!]


In [55]:
# Pretrained "deidentify_rb" de-identification stage, run in 'mask' mode.
# It reads the sentence, token and ner_chunk columns and writes 'deidentified'.
deidentification = (
    DeIdentificationModel.pretrained("deidentify_rb", "en", "clinical/models")
    .setInputCols(['sentence', 'token', 'ner_chunk'])
    .setOutputCol('deidentified')
    .setMode('mask')
)

deidentify_rb download started this may take some time.
Approximate size to download 3.8 KB
[OK!]


In [10]:
# Pretrained "deidentify_large" de-identification stage, run in 'obfuscate' mode.
# NOTE(review): the variable name looks like a typo for "obfuscation"; it is
# kept unchanged because cells outside this view may reference it.
osfuscation = (
    DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models")
    .setInputCols(['sentence', 'token', 'ner_chunk'])
    .setOutputCol('deidentified')
    .setMode('obfuscate')
)

deidentify_large download started this may take some time.
Approximate size to download 188.1 KB
[OK!]


In [50]:
# Same stages as the NER pipeline, with the de-identification stage appended.
nlpPipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, word_embeddings, clinical_ner, ner_converter, deidentification])

# Sample inputs tried with this pipeline. Previously each one was assigned to
# `text` in turn, so only the last survived; they are kept together here and
# the one that is actually run is selected explicitly.
sample_texts = [
    '''A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street''',
    "MENTAL HEALTH DIAGNOSES AND RELEVANT MEDICAL CONDITIONS: Chronic post-traumatic stress disorder (SCT 313182004) Specifiers:Moderate Insomnia due to other Mental Disorder (ICD-10-CM F51.05) Depressive D/O,Unsp - Major depressive disorder, single episode, unspecified (ICD-10-CM F32.9) SIGNIFICANT PSYCHOSOCIAL AND CONTEXTUAL FACTORS: Exposure to war, Familial Problems,History of Abuse (victim),Marital/Relationship Discord, Lack of Social Support",
    "MENTAL HEALTH DIAGNOSES AND RELEVANT MEDICAL CONDITIONS:   Chronic post-traumatic stress disorder (SCT 313182004)  Specifiers:Moderate  Insomnia due to other Mental Disorder (ICD-10-CM F51.05)  Depressive D/O,Unsp - Major depressive disorder, single episode,   unspecified (ICD-10-CM F32.9)  SIGNIFICANT PSYCHOSOCIAL AND CONTEXTUAL FACTORS:   Exposure to war, Familial Problems,History of Abuse (victim),Marital/Relationship Discord, Lack of Social Support",
    "Ajay khanna  has a fever with high cold . His social security number is AAA-GG-SSSS . he is 20 aged. unique Id12222 is 122002. stress disorder (SCT 313182004) . date is 25 may 2019 . email id ajaykhanan123ak@gmail.com . Url is https://www.wikihow.com/Find-a-Fax-Number",
]
text = sample_texts[-1]

# Build the one-row input once and reuse it for both fit and transform.
text_df = spark.createDataFrame([[text]]).toDF("text")

model = nlpPipeline.fit(text_df)
output = model.transform(text_df)
# deid_text = deidentification.transform(result)

In [51]:
# Print the de-identified column in full (no cell truncation).
output.select("deidentified").show(n=20, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|deidentified                                                                                                                                                                                                  

In [None]:
#  Downloading sample datasets.
! wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/mt_samples.csv

In [None]:
import pandas as pd

# mt_samples_df = spark.read.csv("mt_samples.csv", header=True)

from google.colab import drive
# Mount Google Drive inside the Colab VM at the path below.
DRIVE_MOUNT_POINT = '/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# List the current working directory (bare `ls` relies on IPython automagic).
ls

In [None]:
# Location of the sample CSV inside the Colab VM.
data_file = "/content/mt_samples.csv"

# Read with Spark's default CSV options, so columns come back as _c0, _c1, ...
# NOTE(review): if the file starts with a header row, that row is loaded as
# data here -- confirm whether header=True is wanted.
data_df = spark.read.csv(path=data_file)

In [None]:
# The pipeline's DocumentAssembler reads a column named "text", so give the
# first CSV column that name.
data_df = data_df.withColumnRenamed(
    "_c0",
    "text",
)

In [None]:
# Preview the loaded rows with the default limits (20 rows, truncated cells).
data_df.show(n=20, truncate=True)

In [None]:
# Fit the current `nlpPipeline` on the CSV rows...
model = nlpPipeline.fit(data_df)
# ...and run the fitted pipeline over those same rows.
output = model.transform(data_df)

In [None]:
# Print the de-identified text for the CSV rows in full.
output.select("deidentified").show(n=20, truncate=False)

In [None]:
# Print the original text column in full, for comparison.
output.select("text").show(n=20, truncate=False)

# New model: ner_deidentify_dl

In [56]:
# Stage definitions for the pipeline; each one is configured with explicit
# input/output column names so the stages chain together.

# raw "text" -> "document"
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence" (pretrained healthcare sentence detector)
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer splits words in a relevant format for NLP
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on PubMED dataset
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model trained on i2b2 (sampled from MIMIC) dataset
clinical_ner = (
    NerDLModel.pretrained("deidentify_dl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# "ner" tags -> "ner_chunk" annotations
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# nlpPipeline = Pipeline(stages=[
#         documentAssembler,
#         sentenceDetector,
#         tokenizer,
#         word_embeddings,
#         clinical_ner,
#         ner_converter])


# empty_data = spark.createDataFrame([[""]]).toDF("text")

# model = nlpPipeline.fit(empty_data)


sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
deidentify_dl download started this may take some time.
Approximate size to download 14.1 MB
[OK!]


In [82]:
from pyspark.ml import Pipeline,PipelineModel
# NER stage backed by the pretrained "ner_deidentify_dl" model.
# Its third input must be the column the embeddings stage writes, which is
# "embeddings" (the previous "word_embeddings" named a column no stage creates).
# The stage gets its own name so `model` only ever means the fitted pipeline.
deid_ner = MedicalNerModel.pretrained("ner_deidentify_dl", "en", "clinical/models") \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner")

nlp_pipeline = Pipeline(stages=[documentAssembler, sentenceDetector, tokenizer, word_embeddings, deid_ner, ner_converter])

# Fit the NEW pipeline. Previously the older `nlpPipeline` was fitted here, so
# the ner_deidentify_dl stage never contributed to `result`.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlp_pipeline.fit(empty_data)
light_pipeline = LightPipeline(model)

# Sample inputs tried with this model; the last entry is the one that is run.
sample_inputs = [
    '''A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 month years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street''',
    "Ajay khanna  has a fever with high cold . His social security number is AAA-GG-SSSS . he is 20 aged. unique Id12222 is 122002. stress disorder (SCT 313182004) . date is 25 may 2019 . email id ajaykhanan123ak@gmail.com . Url is https://www.wikihow.com/Find-a-Fax-Number",
    "date is 25 may 2019 . email id ajaykhanan123ak@gmail.com . Url is https://www.wikihow.com/Find-a-Fax-Number",
]
# A one-element list: wrapped in another list below, it becomes one row with a
# single "text" column.
input_text = [sample_inputs[-1]]
result = model.transform(spark.createDataFrame([input_text], ["text"]))


ner_deidentify_dl download started this may take some time.
[OK!]


In [66]:
# Show which columns the fitted pipeline added to the input frame.
result.columns

['text', 'document', 'sentence', 'token', 'embeddings', 'ner', 'ner_chunk']

In [83]:
# Show the input text next to the entity chunks found in it, untruncated.
columns_to_show = ["text", "ner_chunk"]
result.select(columns_to_show).show(truncate=False)

+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+
|text                                                                                                       |ner_chunk                                                               |
+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+
|date is 25 may 2019 . email id ajaykhanan123ak@gmail.com . Url is https://www.wikihow.com/Find-a-Fax-Number|[{chunk, 15, 18, 2019, {entity -> DATE, sentence -> 0, chunk -> 0}, []}]|
+-----------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------+

