<a href="https://colab.research.google.com/github/bunyamin-polat/Spark-NLP-NER-Model-with-NCBI-disease/blob/main/NER_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab Setup

In [None]:
! pip install -q pyspark==3.1.2 spark-nlp

! pip install -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 69 kB/s 
[K     |████████████████████████████████| 140 kB 63.6 MB/s 
[K     |████████████████████████████████| 198 kB 18.5 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 95 kB 2.4 MB/s 
[K     |████████████████████████████████| 66 kB 4.2 MB/s 
[?25h

In [None]:
import sparknlp

# Create the Spark session through Spark NLP.
# NOTE(review): gpu = True asks for Spark NLP's GPU setup — confirm the runtime actually has a GPU.
spark = sparknlp.start(gpu = True) 

# NOTE(review): these star imports supply the annotator and pipeline class names used by later
# cells (e.g. WordEmbeddingsModel, NerDLApproach, DocumentAssembler), so this cell must run first.
from sparknlp.base import *
from sparknlp.annotator import *
import pyspark.sql.functions as F
from sparknlp.training import CoNLL

# Record the library versions used by this run.
print("Spark NLP version", sparknlp.version())

print("Apache Spark version:", spark.version)

# Bare last expression: shows the session object as the cell output.
spark

Spark NLP version 3.4.0
Apache Spark version: 3.1.2


# CoNLL Train & Test Data Import

In [None]:
# Download the NCBI disease test and train/dev CoNLL files into the working directory.
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/NCBI_disease_official_test.conll
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/NCBI_disease_official_train_dev.conll

In [None]:
# Keep the raw CoNLL text under its own name. `train_data` is the Spark DataFrame built by
# CoNLL().readDataset; binding the file text to that same name meant that re-running this cell
# after the DataFrame cell replaced the DataFrame with a str and broke every later cell.
with open("NCBI_disease_official_train_dev.conll", encoding="utf-8") as f:
  train_conll_text = f.read()

In [None]:
# Parse the train/dev CoNLL file into a Spark DataFrame, then preview its first three rows.
train_data = CoNLL().readDataset(spark, "NCBI_disease_official_train_dev.conll")

train_data.show(n=3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Identification of...|[{document, 0, 89...|[{document, 0, 89...|[{token, 0, 13, I...|[{pos, 0, 13, NN,...|[{named_entity, 0...|
|The adenomatous p...|[{document, 0, 21...|[{document, 0, 21...|[{token, 0, 2, Th...|[{pos, 0, 2, NN, ...|[{named_entity, 0...|
|Complex formation...|[{document, 0, 63...|[{document, 0, 63...|[{token, 0, 6, Co...|[{pos, 0, 6, NN, ...|[{named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [None]:
# Row count of the training DataFrame.
train_data.count()

6347

In [None]:
# Flatten the per-row token / POS / label arrays into one row per token and preview them.
(
    train_data
    .select(F.explode(F.arrays_zip("token.result", "pos.result", "label.result")).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("pos"),
        F.expr("cols['2']").alias("ner_label"),
    )
    .show(truncate=50)
)

+--------------+---+---------+
|         token|pos|ner_label|
+--------------+---+---------+
|Identification| NN|        O|
|            of| NN|        O|
|          APC2| NN|        O|
|             ,| NN|        O|
|             a| NN|        O|
|     homologue| NN|        O|
|            of| NN|        O|
|           the| NN|        O|
|   adenomatous| NN|B-Disease|
|     polyposis| NN|I-Disease|
|          coli| NN|I-Disease|
|        tumour| NN|I-Disease|
|    suppressor| NN|        O|
|             .| NN|        O|
|           The| NN|        O|
|   adenomatous| NN|B-Disease|
|     polyposis| NN|I-Disease|
|          coli| NN|I-Disease|
|             (| NN|I-Disease|
|           APC| NN|I-Disease|
+--------------+---+---------+
only showing top 20 rows



In [None]:
# Count how many tokens carry each ground-truth label in the training set, most frequent first.
(
    train_data
    .select(F.explode(F.arrays_zip("token.result", "label.result")).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    .groupBy("ground_truth")
    .count()
    .orderBy("count", ascending=False)
    .show(n=100, truncate=False)
)

+------------+------+
|ground_truth|count |
+------------+------+
|O           |146544|
|I-Disease   |7205  |
|B-Disease   |5921  |
+------------+------+



In [None]:
# Keep the raw CoNLL text under its own name. `test_data` is the Spark DataFrame built by
# CoNLL().readDataset; binding the file text to that same name meant that re-running this cell
# after the DataFrame cell replaced the DataFrame with a str and broke every later cell.
with open("NCBI_disease_official_test.conll", encoding="utf-8") as f:
  test_conll_text = f.read()

In [None]:
# Parse the test CoNLL file into a Spark DataFrame, then preview its first three rows.
test_data = CoNLL().readDataset(spark, "NCBI_disease_official_test.conll")

test_data.show(n=3)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Clustering of mis...|[{document, 0, 10...|[{document, 0, 10...|[{token, 0, 9, Cl...|[{pos, 0, 9, NN, ...|[{named_entity, 0...|
|Ataxia - telangie...|[{document, 0, 13...|[{document, 0, 13...|[{token, 0, 5, At...|[{pos, 0, 5, NN, ...|[{named_entity, 0...|
|The risk of cance...|[{document, 0, 15...|[{document, 0, 15...|[{token, 0, 2, Th...|[{pos, 0, 2, NN, ...|[{named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [None]:
# Row count of the test DataFrame.
test_data.count()

940

In [None]:
# Count how many tokens carry each ground-truth label in the test set, most frequent first.
(
    test_data
    .select(F.explode(F.arrays_zip("token.result", "label.result")).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
    )
    .groupBy("ground_truth")
    .count()
    .orderBy("count", ascending=False)
    .show(n=100, truncate=False)
)

+------------+-----+
|ground_truth|count|
+------------+-----+
|O           |22450|
|I-Disease   |1087 |
|B-Disease   |960  |
+------------+-----+



# NerDL Model with GloVe 100d Embeddings (`glove_100d`)

In [None]:
# Name the pretrained embeddings explicitly instead of relying on the library default, so the
# embedding model the tagger is trained against cannot change silently with a library upgrade.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NerDL trainer: consumes the sentence, token and embedding columns and learns from the CoNLL
# `label` column; predictions are written to `ner`.
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "embeddings"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    .setMaxEpochs(14)
    .setLr(0.003)
    .setDropout(0.5)
    .setBatchSize(10)
    .setRandomSeed(0)
    .setValidationSplit(0.2)         # hold out 20% of the training rows for per-epoch validation
    .setVerbose(1)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)       # the log is inspected from ~/annotator_logs after training
    .setIncludeConfidence(True)
    .setEnableMemoryOptimizer(True)
)

# The CoNLL reader already provides the sentence and token columns, so the training pipeline only
# needs the embeddings stage followed by the tagger.
ner_pipeline = Pipeline(stages=[
      glove_embeddings,
      nerTagger
])


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
%%time

# Train the NER model on the training DataFrame (the recorded run took about 58 minutes wall time).
ner_model = ner_pipeline.fit(train_data)

CPU times: user 20.1 s, sys: 2.23 s, total: 22.3 s
Wall time: 58min 37s


In [None]:
# List the training logs, newest first.
!cd ~/annotator_logs/ && ls -lt

total 8
-rw-r--r-- 1 root root 7525 Jan 20 12:17 NerDLApproach_130a4a6af546.log


In [None]:
!cat ~/annotator_logs/NerDLApproach_130a4a6af546.log

Name of the selected graph: ner-dl/blstm_10_100_128_120.pb
Training started - total epochs: 14 - lr: 0.003 - batch size: 10 - labels: 3 - chars: 84 - training examples: 5074


Epoch 1/14 started, lr: 0.003, dataset size: 5074


Epoch 1/14 - 235.55s - loss: 1793.1072 - batches: 508
Quality on validation dataset (20.0%), validation examples = 1014
time to finish evaluation: 20.79s
label	 tp	 fp	 fn	 prec	 rec	 f1
I-Disease	 1281	 277	 219	 0.822208	 0.854	 0.83780247
B-Disease	 1017	 157	 216	 0.8662692	 0.82481754	 0.8450353
tp: 2298 fp: 434 fn: 435 labels: 2
Macro-average	 prec: 0.8442386, rec: 0.83940876, f1: 0.8418167
Micro-average	 prec: 0.841142, rec: 0.84083426, f1: 0.8409881


Epoch 2/14 started, lr: 0.0029850747, dataset size: 5074


Epoch 2/14 - 235.53s - loss: 705.3677 - batches: 508
Quality on validation dataset (20.0%), validation examples = 1014
time to finish evaluation: 20.13s
label	 tp	 fp	 fn	 prec	 rec	 f1
I-Disease	 1315	 98	 185	 0.93064404	 0.87666667	 0.9028493
B-D

### Test Evaluation

In [None]:
# Run the fitted pipeline over the test split; the tagger writes its predictions to the `ner` column.
predictions = ner_model.transform(test_data)

In [None]:
from sklearn.metrics import classification_report

# One row per token: the token text, its gold label and the predicted label, collected to pandas.
preds_df = (
    predictions
    .select(F.explode(F.arrays_zip("token.result", "label.result", "ner.result")).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction"),
    )
    .toPandas()
)

# Token-level precision / recall / F1 on the test split.
print(classification_report(y_true=preds_df["ground_truth"], y_pred=preds_df["prediction"]))

              precision    recall  f1-score   support

   B-Disease       0.86      0.85      0.85       960
   I-Disease       0.80      0.89      0.84      1087
           O       0.99      0.99      0.99     22450

    accuracy                           0.98     24497
   macro avg       0.88      0.91      0.90     24497
weighted avg       0.98      0.98      0.98     24497



### Train Evaluation

In [None]:
# Score the model on the data it was trained on, for comparison with the test metrics.
predictions_train = ner_model.transform(train_data)

# One row per token: the token text, its gold label and the predicted label, collected to pandas.
preds_df = (
    predictions_train
    .select(F.explode(F.arrays_zip("token.result", "label.result", "ner.result")).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction"),
    )
    .toPandas()
)

print(classification_report(y_true=preds_df["ground_truth"], y_pred=preds_df["prediction"]))

              precision    recall  f1-score   support

   B-Disease       0.96      0.96      0.96      5921
   I-Disease       0.96      0.97      0.96      7205
           O       1.00      1.00      1.00    146544

    accuracy                           0.99    159670
   macro avg       0.97      0.98      0.97    159670
weighted avg       0.99      0.99      0.99    159670



### Saving Trained Model

In [None]:
# Inspect the fitted pipeline's stages (embeddings model first, trained NerDL model second).
ner_model.stages

[WORD_EMBEDDINGS_MODEL_48cffc8b9a76, NerDLModel_369ae72918d4]

In [None]:
# Persist only the trained NerDL stage (index 1 of the fitted pipeline); an existing directory
# with the same name is overwritten.
ner_model.stages[1].write().overwrite().save("NER_glove_100d_e14_b10")

# Prediction Pipeline

In [None]:
# Raw text -> document annotation.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Document -> sentences.
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Sentences -> tokens.
token = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Same embeddings as in training, named explicitly rather than relying on the library default.
# They read the `sentence` column, exactly as the training pipeline does: the NER stage below
# consumes sentence-level input, and feeding embeddings from `document` would make the stages
# disagree about sentence boundaries on multi-sentence text.
glove_embeddings = (
    WordEmbeddingsModel.pretrained("glove_100d", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# The NerDL model trained and saved earlier in this notebook.
loaded_ner_model = (
    NerDLModel.load("NER_glove_100d_e14_b10")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Merge B-/I- token tags into entity chunks, using the same sentence column as the NER stage.
converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_span")
)

ner_prediction_pipeline = Pipeline(stages = [
      document,
      sentence,
      token,
      glove_embeddings,
      loaded_ner_model,
      converter
  ])

# Fit on a single empty-text row: this only turns the Pipeline into a PipelineModel, no training
# data is involved.
empty_data = spark.createDataFrame([['']]).toDF("text")

prediction_model = ner_prediction_pipeline.fit(empty_data)


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Sample clinical sentence used to try out the prediction pipeline.
text = '''
A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting .
'''
# Wrap the string in a one-row DataFrame with the `text` column the pipeline expects.
sample_data = spark.createDataFrame([[text]]).toDF("text")

sample_data.show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                                text|
+----------------------------------------------------------------------------------------------------+
|
A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior...|
+----------------------------------------------------------------------------------------------------+



In [None]:
import pyspark.sql.functions as F

preds = prediction_model.transform(sample_data)

# Build the chunk/entity DataFrame first and display it in a separate statement:
# DataFrame.show() returns None, so chaining it onto the assignment left `result_df` set to None.
result_df = preds.select(F.explode(F.arrays_zip("ner_span.result","ner_span.metadata")).alias("entities")) \
                .select(F.expr("entities['0']").alias("chunk"),
                        F.expr("entities['1'].entity").alias("entity"))

result_df.show(truncate=False)


+-----------------------------+-------+
|chunk                        |entity |
+-----------------------------+-------+
|gestational diabetes mellitus|Disease|
|diabetes mellitus            |Disease|
|T2DM                         |Disease|
|HTG-induced pancreatitis     |Disease|
|acute hepatitis              |Disease|
|obesity                      |Disease|
|polyuria                     |Disease|
|polydipsia                   |Disease|
|poor appetite                |Disease|
|vomiting                     |Disease|
+-----------------------------+-------+



In [None]:
from sparknlp.base import LightPipeline

# Wrap the fitted pipeline so it can annotate a plain Python string directly.
light_model = LightPipeline(prediction_model)

result = light_model.annotate(text)

# Pair every token with its predicted tag.
[(tok, tag) for tok, tag in zip(result["token"], result["ner"])]

[('A', 'O'),
 ('28-year-old', 'O'),
 ('female', 'O'),
 ('with', 'O'),
 ('a', 'O'),
 ('history', 'O'),
 ('of', 'O'),
 ('gestational', 'B-Disease'),
 ('diabetes', 'I-Disease'),
 ('mellitus', 'I-Disease'),
 ('diagnosed', 'O'),
 ('eight', 'O'),
 ('years', 'O'),
 ('prior', 'O'),
 ('to', 'O'),
 ('presentation', 'O'),
 ('and', 'O'),
 ('subsequent', 'O'),
 ('type', 'O'),
 ('two', 'O'),
 ('diabetes', 'B-Disease'),
 ('mellitus', 'I-Disease'),
 ('(', 'O'),
 ('T2DM', 'B-Disease'),
 ('),', 'O'),
 ('one', 'O'),
 ('prior', 'O'),
 ('episode', 'O'),
 ('of', 'O'),
 ('HTG-induced', 'B-Disease'),
 ('pancreatitis', 'I-Disease'),
 ('three', 'O'),
 ('years', 'O'),
 ('prior', 'O'),
 ('to', 'O'),
 ('presentation', 'O'),
 (',', 'O'),
 ('associated', 'O'),
 ('with', 'O'),
 ('an', 'O'),
 ('acute', 'B-Disease'),
 ('hepatitis', 'I-Disease'),
 (',', 'O'),
 ('and', 'O'),
 ('obesity', 'B-Disease'),
 ('with', 'O'),
 ('a', 'O'),
 ('body', 'O'),
 ('mass', 'O'),
 ('index', 'O'),
 ('(', 'O'),
 ('BMI', 'O'),
 (')', 'O'),
 (

In [None]:
import pandas as pd

# fullAnnotate keeps the annotation objects, so character offsets and metadata are available.
result = light_model.fullAnnotate(text)

# One row per token: sentence id, token text, character span and predicted tag.
ner_df = pd.DataFrame(
    data=[
        (int(tok.metadata['sentence']), tok.result, tok.begin, tok.end, tag.result)
        for tok, tag in zip(result[0]["token"], result[0]["ner"])
    ],
    columns=['sent_id', 'token', 'start', 'end', 'ner'],
)
ner_df.head(10)

Unnamed: 0,sent_id,token,start,end,ner
0,0,A,1,1,O
1,0,28-year-old,3,13,O
2,0,female,15,20,O
3,0,with,22,25,O
4,0,a,27,27,O
5,0,history,29,35,O
6,0,of,37,38,O
7,0,gestational,40,50,B-Disease
8,0,diabetes,52,59,I-Disease
9,0,mellitus,61,68,I-Disease


## Highlight Entities

In [None]:
# fullAnnotate returns a list; take its first element and list the annotation columns it holds.
ann_text = light_model.fullAnnotate(text)[0]
ann_text.keys()

dict_keys(['document', 'ner_span', 'token', 'ner', 'embeddings', 'sentence'])

In [None]:
from sparknlp_display import NerVisualizer

visualiser = NerVisualizer()

# Render the annotated text, taking entity chunks from `ner_span` and the text from `document`.
visualiser.display(ann_text, label_col='ner_span', document_col='document')

# Streamlit

In [None]:
# Install the web-app framework and the ngrok wrapper used to expose the app.
# NOTE(review): streamlit is unpinned, so a newer release may behave differently — consider pinning.
!pip install streamlit
!pip install pyngrok==4.1.1

In [None]:
%%writefile ner_model.py

import streamlit as st
import pandas as pd
import base64
import os

import sparknlp
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp.base import *

from sparknlp_display import NerVisualizer
from sparknlp.base import LightPipeline

# Start the Spark session used by this Streamlit script.
spark = sparknlp.start(gpu = True) 


# HTML container that the viz function wraps around the rendered entity markup.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""

# Sidebar: logo plus a model picker that offers a single model.
st.sidebar.image('https://nlp.johnsnowlabs.com/assets/images/logo.png', use_column_width=True)
st.sidebar.header('Choose the pretrained model')
# NOTE(review): select_model is not read anywhere else in the visible script.
select_model = st.sidebar.selectbox("",["ner_model_glove_100d"])

st.title("Spark NLP NER Model Playground")

# Sample clinical texts offered in the text selector below.
text1 = """The patient is a 78-year-old gentleman with no substantial past medical history except for diabetes. He denies any comorbid complications of the diabetes including kidney disease, heart disease, stroke, vision loss, or neuropathy. At this time, he has been admitted for anemia with hemoglobin of 7.1 and requiring transfusion. He reports that he has no signs or symptom of bleeding and had a blood transfusion approximately two months ago and actually several weeks before that blood transfusion, he had a transfusion for anemia. He has been placed on B12, oral iron, and Procrit. At this time, we are asked to evaluate him for further causes and treatment for his anemia. He denies any constitutional complaints except for fatigue, malaise, and some dyspnea. He has no adenopathy that he reports. No fevers, night sweats, bone pain, rash, arthralgias, or myalgias."""
text2 = """The patient is a 61-year-old woman who presents with a history of biopsy-proven basal cell carcinoma, right and left cheek. She had no prior history of skin cancer. She is status post bilateral cosmetic breast augmentation many years ago and the records are not available for this procedure. She has noted progressive hardening and distortion of the implant. She desires to have the implants removed, capsulectomy and replacement of implants. She would like to go slightly smaller than her current size as she has ptosis going with a smaller implant combined with capsulectomy will result in worsening of her ptosis. She may require a lift. She is not consenting to lift due to the surgical scars."""
text3 = """The patient is a 39-year-old woman returns for followup management of type 1 diabetes mellitus. Her last visit was approximately 4 months ago. Since that time, the patient states her health had been good and her glycemic control had been good, however, within the past 2 weeks she had a pump malfunction, had to get a new pump and was not certain of her pump settings and has been having some difficulty with glycemic control over the past 2 weeks. She is not reporting any severe hypoglycemic events, but is having some difficulty with hyperglycemia both fasting and postprandial. She is not reporting polyuria, polydipsia or polyphagia. She is not exercising at this point and has a diet that is rather typical of woman with twins and a young single child as well. She is working on a full-time basis and so eats on the run a lot, probably eats more than she should and not making the best choices, little time for physical activity. She is keeping up with all her other appointments and has recently had a good eye examination. She had lab work done at her previous visit and this revealed persistent hyperlipidemic state with a LDL of 144."""
text4 = """A 28-year-old female with a history of gestational diabetes mellitus diagnosed eight years prior to presentation and subsequent type two diabetes mellitus ( T2DM ), one prior episode of HTG-induced pancreatitis three years prior to presentation , associated with an acute hepatitis , and obesity with a body mass index ( BMI ) of 33.5 kg/m2 , presented with a one-week history of polyuria , polydipsia , poor appetite , and vomiting ."""
text5 = """Nature and course of the diagnosis has been discussed with the patient. Based on her presentation without any history of obvious fall or trauma and past history of malignant melanoma, this appears to be a pathological fracture of the left proximal hip. At the present time, I would recommend obtaining a bone scan and repeat x-rays, which will include AP pelvis, femur, hip including knee. She denies any pain elsewhere. She does have a past history of back pain and sciatica, but at the present time, this appears to be a metastatic bone lesion with pathological fracture. I have discussed the case with Dr.X and recommended oncology consultation. With the above fracture and presentation, she needs a left hip hemiarthroplasty versus calcar hemiarthroplasty, cemented type. Indication, risk, and benefits of left hip hemiarthroplasty has been discussed with the patient, which includes, but not limited to bleeding, infection, nerve injury, blood vessel injury, dislocation early and late, persistent pain, leg length dicrepancy, myositis ossificans, intraoperative fracture, prosthetic fracture, need for conversion to total hip replacement surgery, revision surgery, pulmonary embolism, risk of anesthesia, need for blood transfusion, and cardiac arrest. She understands above and is willing to undergo further procedure. The goal and the functional outcome have been explained. Further plan will be discussed with her once we obtain the bone scan and the radiographic studies. We will also await for the oncology feedback and clearance."""

# Let the user pick which of the sample texts to annotate.
sample_text = st.selectbox("",[text1, text2, text3,text4,text5])

@st.cache(hash_funcs={"_thread.RLock": lambda _: None},allow_output_mutation=True, suppress_st_warning=True)
def model_pipeline():
    """Build the individual Spark NLP stages of the NER prediction pipeline.

    The trained NerDL model directory is resolved in this order:

    1. the ``NER_MODEL_PATH`` environment variable, when it is set and non-empty;
    2. ``NER_glove_100d_e14_b10`` relative to the working directory, when that directory exists;
    3. ``/content/drive/MyDrive/NER_glove_100d_e14_b10``, the previously hard-coded location.

    Returns:
        dict: stage name -> configured annotator, with the keys "documentAssembler",
        "sentenceDetector", "tokenizer", "gloveEmbeddings", "nerModel" and "nerConverter".
    """
    # The Drive path used to be the only option, which fails unless Drive is mounted and the
    # model was copied there by hand; prefer an explicit override or a local model directory.
    model_path = os.environ.get("NER_MODEL_PATH")
    if not model_path:
        local_path = "NER_glove_100d_e14_b10"
        drive_path = "/content/drive/MyDrive/NER_glove_100d_e14_b10"
        model_path = local_path if os.path.isdir(local_path) else drive_path

    documentAssembler = DocumentAssembler()\
          .setInputCol("text")\
          .setOutputCol("document")

    sentenceDetector = SentenceDetector()\
          .setInputCols(['document'])\
          .setOutputCol('sentence')

    tokenizer = Tokenizer()\
          .setInputCols(['sentence'])\
          .setOutputCol('token')

    # Embeddings are named explicitly and read the `sentence` column, like the NER stage that
    # consumes them; the sample texts are multi-sentence, so embeddings built from `document`
    # would not share the NER stage's sentence boundaries.
    gloveEmbeddings = WordEmbeddingsModel.pretrained("glove_100d", "en")\
          .setInputCols(["sentence", "token"])\
          .setOutputCol("embeddings")

    nerModel = NerDLModel.load(model_path)\
          .setInputCols(["sentence", "token", "embeddings"])\
          .setOutputCol("ner")

    # Chunks are assembled against the same sentence column the tags were produced for.
    nerConverter = NerConverter()\
          .setInputCols(["sentence", "token", "ner"])\
          .setOutputCol("ner_chunk")

    pipeline_dict = {
          "documentAssembler":documentAssembler,
          "sentenceDetector":sentenceDetector,
          "tokenizer":tokenizer,
          "gloveEmbeddings":gloveEmbeddings,
          "nerModel":nerModel,
          "nerConverter":nerConverter
    }
    return pipeline_dict

# Build the pipeline stages once at script level; load_pipeline reads them from this dict.
model_dict = model_pipeline()

# @st.cache(hash_funcs={"_thread.RLock": lambda _: None},allow_output_mutation=True, suppress_st_warning=True)
def load_pipeline():
    """Assemble the stages held in ``model_dict`` into a fitted pipeline model.

    Returns:
        The model produced by fitting the six-stage Pipeline on a one-row DataFrame whose
        ``text`` column is an empty string.
    """
    stage_order = (
        "documentAssembler",
        "sentenceDetector",
        "tokenizer",
        "gloveEmbeddings",
        "nerModel",
        "nerConverter",
    )
    nlp_pipeline = Pipeline(stages=[model_dict[stage_name] for stage_name in stage_order])

    # Fitting needs some DataFrame with a `text` column; a single empty string is used.
    blank_frame = spark.createDataFrame([['']]).toDF("text")
    return nlp_pipeline.fit(blank_frame)


# Fitted pipeline model that is passed to get_entities at the end of the script.
ner_model = load_pipeline()

def viz(annotated_text, chunk_col):
  """Render the NER visualisation for ``annotated_text`` inside the Streamlit page.

  Args:
      annotated_text: annotation result handed to ``NerVisualizer.display``.
      chunk_col: name of the chunk column, passed as the second positional argument.

  The generated HTML is split just after its ``</style>`` tag: the slice from ``<style>`` up to
  that point is emitted through ``st.markdown`` and the remainder is wrapped in ``HTML_WRAPPER``.
  """
  style_open, style_close = '<style>', '</style>'
  html = NerVisualizer().display(annotated_text, chunk_col, return_html=True)

  css_start = html.find(style_open)
  css_end = html.find(style_close) + len(style_close)

  stylesheet = html[css_start:css_end]
  markup = html[css_end:]

  st.markdown(stylesheet, unsafe_allow_html=True)
  st.write(HTML_WRAPPER.format(markup), unsafe_allow_html=True)


def get_entities(ner_pipeline, text):
    """Annotate ``text``, render its entities in the Streamlit page and return them as a table.

    Args:
        ner_pipeline: fitted pipeline wrapped in a ``LightPipeline``; its output must contain a
            "ner_chunk" entry.
        text: the string to annotate.

    Returns:
        pandas.DataFrame with the columns "chunks" and "entities", duplicate rows dropped.
    """
    annotations = LightPipeline(ner_pipeline).fullAnnotate(text)[0]

    st.write('')
    st.subheader('Entities')

    ner_chunks = annotations["ner_chunk"]
    df = pd.DataFrame({
        'chunks': [chunk.result for chunk in ner_chunks],
        'entities': [chunk.metadata['entity'] for chunk in ner_chunks],
    }).drop_duplicates()

    # Highlighted text first, then the same entities as a table.
    viz(annotations, "ner_chunk")

    st.subheader('Dataframe')
    st.table(df)

    return df


# Script entry point: annotate the selected sample text and render the results.
entities_df  = get_entities (ner_model, sample_text)



Overwriting ner_model.py


In [None]:
!ngrok authtoken 23n4cKNe5Gt6LrsuDtmAC8u6uHb_2HLjNdbvo2U9M9hNsMkJ2

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
# Launch the Streamlit app in the background and discard its output so the cell returns at once.
!streamlit run ner_model.py &>/dev/null&

In [None]:
from pyngrok import ngrok

# Open an ngrok tunnel to port 8501 (Streamlit's default port) and show the public URL.
public_url = ngrok.connect(port='8501')
public_url

'http://295b-35-243-198-185.ngrok.io'