<a href="https://colab.research.google.com/github/akouaouchissam/FML/blob/master/take_home_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json

from google.colab import files

# Ask the user to upload the Spark NLP for Healthcare license JSON.
# files.upload() returns a dict keyed by the uploaded file names; keep it under
# its own name so `license_keys` only ever refers to the parsed license dict.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on an empty upload.
    raise ValueError("No license file was uploaded.")

# Only the first uploaded file is read; any additional uploads are ignored.
license_path = next(iter(uploaded))
with open(license_path) as f:
    license_keys = json.load(f)

Saving spark_nlp_for_healthcare.json to spark_nlp_for_healthcare.json


In [None]:
# Show the 'JSL_VERSION' entry of the loaded license file.
license_keys['JSL_VERSION']

'3.0.2'

In [None]:
# Show the 'PUBLIC_VERSION' entry of the loaded license file.
license_keys['PUBLIC_VERSION']

'3.0.2'

In [None]:
%%capture
for k,v in license_keys.items(): 
    %set_env $k=$v

!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jsl_colab_setup.sh
!bash jsl_colab_setup.sh

! pip install spark-nlp-display

In [None]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

# Spark configuration overrides handed to the licensed session.
# NOTE(review): 16G driver memory may exceed what the runtime offers -- confirm.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the Spark NLP for Healthcare session using the secret from the license file.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Report the versions of both libraries that were actually loaded.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

Spark NLP Version : 3.0.2
Spark NLP_JSL Version : 3.0.2


In [None]:
import pandas as pd
# Load the notes file, keep only the free-text column, and rename it to the
# lower-case "text" that the DocumentAssembler input column expects.
df = (
    pd.read_csv('mimic_100_pats.csv', sep=',')
    .loc[:, ['TEXT']]
    .rename(columns={"TEXT": "text"})
)

In [None]:
# Positional split: the last 10 rows are held out, all earlier rows go to train.
train_pdf, test_pdf = df.iloc[:-10], df.iloc[-10:]

# Convert both pandas slices into Spark DataFrames.
train_df = spark.createDataFrame(train_pdf)
test_df = spark.createDataFrame(test_pdf)

In [None]:
# Preview the held-out notes (long text values are truncated in the printout).
test_df.show()

+--------------------+
|                text|
+--------------------+
|NICU NURSING ADMI...|
|Neonatology
DOL...|
|Nursing NICU Note...|
|NPN 0700-[**2051*...|
|NPN 1900-0700
...|
|Neonatology
Doi...|
|Neonatology NP No...|
|Nursing NICU Note...|
|Neonatology Atten...|
|Neonatology [** 6...|
+--------------------+



# NER Pipeline

In [None]:
# Wrap the raw "text" column into the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Sentence detector: pretrained healthcare model, document -> sentence.
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer: splits each sentence into word tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings for every token.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model: tags each token using the sentence, token and embedding columns.
clinical_ner = (
    MedicalNerModel.pretrained("ner_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Converter: turns the per-token tags in "ner" into the "ner_chunk" column.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Stage order matters: every stage consumes columns produced by earlier ones.
pipeline_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter,
]
nlpPipeline = Pipeline(stages=pipeline_stages)

# Fit on the training notes to obtain the PipelineModel used for inference below.
model = nlpPipeline.fit(train_df)


sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 363.9 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_clinical download started this may take some time.
Approximate size to download 13.9 MB
[OK!]


In [None]:
# List the stages of the fitted pipeline model, in execution order.
model.stages

[DocumentAssembler_bf6b9fe6f383,
 SentenceDetectorDLModel_d2546f0acfe2,
 REGEX_TOKENIZER_2ab5a7003e66,
 WORD_EMBEDDINGS_MODEL_9004b1d00302,
 MedicalNerModel_cd5ce67b529f,
 NerConverter_eddce64798e2]

In [None]:
# Show the tags the NER model was trained with.
clinical_ner.getClasses()

['O',
 'B-TREATMENT',
 'I-TREATMENT',
 'B-PROBLEM',
 'I-PROBLEM',
 'B-TEST',
 'I-TEST']

In [None]:
# Dump the parameters of the NER stage (batch size, classes, input/output columns, ...).
clinical_ner.extractParamMap()

{Param(parent='MedicalNerModel_cd5ce67b529f', name='batchSize', doc='Size of every batch'): 64,
 Param(parent='MedicalNerModel_cd5ce67b529f', name='classes', doc='get the tags used to trained this MedicalNerModel'): ['O',
  'B-TREATMENT',
  'I-TREATMENT',
  'B-PROBLEM',
  'I-PROBLEM',
  'B-TEST',
  'I-TEST'],
 Param(parent='MedicalNerModel_cd5ce67b529f', name='includeConfidence', doc='whether to include confidence scores in annotation metadata'): True,
 Param(parent='MedicalNerModel_cd5ce67b529f', name='inputCols', doc='previous annotations columns, if renamed'): ['sentence',
  'token',
  'embeddings'],
 Param(parent='MedicalNerModel_cd5ce67b529f', name='lazyAnnotator', doc='Whether this AnnotatorModel acts as lazy in RecursivePipelines'): False,
 Param(parent='MedicalNerModel_cd5ce67b529f', name='outputCol', doc='output annotation column. can be left default.'): 'ner',
 Param(parent='MedicalNerModel_cd5ce67b529f', name='storageRef', doc='unique reference name for identification'): 'cl

In [None]:
# Show the storage reference name of the NER model.
# NOTE(review): presumably this has to match the embeddings stage it was trained with -- confirm.
clinical_ner.getStorageRef()

'clinical'

In [None]:
# Confirm the input schema: the pipeline reads a single string column named "text".
test_df.printSchema()

root
 |-- text: string (nullable = true)



In [None]:
# Apply the fitted pipeline to the held-out notes; `result` carries the original
# text plus one annotation column per pipeline stage.
result = model.transform(test_df)

In [None]:
# Preview the annotated output (text, document, sentence, token, embeddings, ner, ner_chunk).
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|NICU NURSING ADMI...|[{document, 0, 13...|[{document, 0, 25...|[{token, 0, 3, NI...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 233, 255...|
|Neonatology
DOL...|[{document, 0, 66...|[{document, 0, 13...|[{token, 0, 10, N...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 42, 44, ...|
|Nursing NICU Note...|[{document, 0, 15...|[{document, 0, 19...|[{token, 0, 6, Nu...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 78, 81, ...|
|NPN 0700-[**2051*...|[{document, 0, 80...|[{document, 0, 21...|[{token, 0, 2, NP...|[{word_embeddings...|

In [None]:
# Show each note's tokens next to the tags predicted for them.
# Both nested fields are called `result`, so selecting them directly yields two
# identically named columns; alias them so the printed table is unambiguous.
result.selectExpr("token.result AS tokens", "ner.result AS ner_tags").show(truncate=100)

+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                              result|                                                                                              result|
+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|[NICU, NURSING, ADMIT, NOTE, THIS, IS, A, FULL, TERM, MALE, [, **, **, ], TO, A, 31, YR, OLD, G2,...|[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...|
|[Neonatology, DOL, #13, ,, CGA, 42, weeks, ., CVR, :, Continues, in, NC, 100-175, cc, oxygen, ,, ...|[O, O, O, O, O, O, O, O, B-TEST, O, O, O, B-TREATMENT, O, O, B-TREATMENT, O, O, B-

In [None]:
# Fetch the raw chunk annotations (text span, entity label, confidence) of up to
# 10 notes as Row objects on the driver.
result.select('ner_chunk').limit(10).collect()

[Row(ner_chunk=[Row(annotatorType='chunk', begin=233, end=255, result="SEVERAL DOWN'S FEATURES", metadata={'sentence': '4', 'chunk': '0', 'entity': 'PROBLEM', 'confidence': '0.56346667'}, embeddings=[]), Row(annotatorType='chunk', begin=419, end=424, result='O2 SAT', metadata={'sentence': '7', 'chunk': '1', 'entity': 'TEST', 'confidence': '0.76240003'}, embeddings=[]), Row(annotatorType='chunk', begin=454, end=469, result='NASAL CANNULA O2', metadata={'sentence': '8', 'chunk': '2', 'entity': 'PROBLEM', 'confidence': '0.48476663'}, embeddings=[]), Row(annotatorType='chunk', begin=518, end=520, result='EKG', metadata={'sentence': '10', 'chunk': '3', 'entity': 'TEST', 'confidence': '0.9942'}, embeddings=[]), Row(annotatorType='chunk', begin=569, end=571, result='ABG', metadata={'sentence': '12', 'chunk': '4', 'entity': 'TEST', 'confidence': '0.9201'}, embeddings=[]), Row(annotatorType='chunk', begin=694, end=699, result='O2 SAT', metadata={'sentence': '14', 'chunk': '5', 'entity': 'TEST',

In [None]:
from pyspark.sql import functions as F

# Pair every chunk's text with its metadata map, then emit one row per chunk.
zipped_chunks = F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')
exploded_chunks = result.select(F.explode(zipped_chunks).alias("cols"))

# The zipped struct fields are addressed as '0' (chunk text) and '1' (metadata map).
exploded_chunks.select(
    F.expr("cols['0']").alias("chunk"),
    F.expr("cols['1']['entity']").alias("ner_label"),
).show(truncate=False)

+------------------------+---------+
|chunk                   |ner_label|
+------------------------+---------+
|SEVERAL DOWN'S FEATURES |PROBLEM  |
|O2 SAT                  |TEST     |
|NASAL CANNULA O2        |PROBLEM  |
|EKG                     |TEST     |
|ABG                     |TEST     |
|O2 SAT                  |TEST     |
|DUCTAL                  |TEST     |
|DUCTAL                  |TEST     |
|NASAL CANNULA O2        |TREATMENT|
|MURMUR AUDIBLE          |PROBLEM  |
|HR                      |TEST     |
|CBC                     |TEST     |
|BLD CX AND KAROTYPE SENT|PROBLEM  |
|D/S WAS                 |TREATMENT|
|IV PLACED               |TREATMENT|
|STARTED IVF             |TREATMENT|
|D10W                    |TREATMENT|
|60CC/K/D                |TREATMENT|
|WBC WAS                 |TEST     |
|POLY                    |TEST     |
+------------------------+---------+
only showing top 20 rows



In [None]:
from sparknlp_display import NerVisualizer

# Render the recognised entities of the first annotated note.
# first() fetches a single row; collect()[0] would first pull every annotated
# row (including the embeddings column) to the driver only to discard all but one.
NerVisualizer().display(
    result = result.first(),
    label_col = 'ner_chunk',
    document_col = 'document'
)