# **Detect entities in English text**

## 1. Colab Setup

In [1]:
# Install PySpark and Spark NLP
! pip install -q pyspark==3.1.2 spark-nlp

# Install Spark NLP Display lib
! pip install --upgrade -q spark-nlp-display

[K     |████████████████████████████████| 212.4 MB 69 kB/s 
[K     |████████████████████████████████| 142 kB 52.7 MB/s 
[K     |████████████████████████████████| 198 kB 66.2 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 95 kB 4.0 MB/s 
[K     |████████████████████████████████| 66 kB 5.0 MB/s 
[?25h

## 2. Start the Spark session

Import the dependencies and start the Spark session.

In [2]:
import json

import numpy as np
import pandas as pd

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# The wildcard imports stay after the pyspark imports and before
# `import sparknlp`, as in the original cell, so name shadowing is unchanged.
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp_display import NerVisualizer

# Start the Spark session through Spark NLP's helper. `spark` is used by the
# cells below to build the input DataFrames.
spark = sparknlp.start()

## 3. Select the DL model

In [8]:
# Name of the pretrained NER model to load.
# If you change the model, re-run all the cells below.
# Values handled by the pipeline cell:
#   "ner_dl", "onto_100"  -> paired with glove_100d word embeddings
#   "ner_dl_bert"         -> paired with bert_base_cased embeddings
MODEL_NAME = "ner_dl"

In [87]:
# Sample input: a list with one free-text paragraph. It becomes the 'text'
# column of the DataFrame fed to the pipeline. The text is kept verbatim
# (including its spelling) because it is the data the model is run on.
intro = [
    "This is Syed Azmat Ali Abedi.I am doing my bachelar form Fast National University Karachi.Right now I am doin internship in OpenAIMP company along with that I also do the Freelaning  in Fiver.I feild of intrest is Data Science, Machine Leanring and Deep Learning.",
]

## 4. Sample examples

## 5. Define Spark NLP pipeline

In [5]:
# Wrap the raw 'text' column into Spark NLP 'document' annotations.
documentAssembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

# Split each document into tokens.
tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

# The embeddings stage must match the embeddings the NER model was trained
# with, so it is chosen from MODEL_NAME.
if MODEL_NAME in ("ner_dl", "onto_100"):
    # ner_dl and onto_100 are trained with glove_100d.
    embeddings = WordEmbeddingsModel.pretrained('glove_100d') \
        .setInputCols(["document", 'token']) \
        .setOutputCol("embeddings")
elif MODEL_NAME == "ner_dl_bert":
    # The BERT model uses BERT embeddings.
    embeddings = BertEmbeddings.pretrained(name='bert_base_cased', lang='en') \
        .setInputCols(['document', 'token']) \
        .setOutputCol('embeddings')
else:
    # Fail fast. Without this branch an unknown model name would either hit a
    # NameError further down (fresh kernel) or silently reuse an `embeddings`
    # stage left over from a previous run of this cell (stale kernel state).
    raise ValueError(
        "Unsupported MODEL_NAME {!r}: expected one of 'ner_dl', 'onto_100', "
        "'ner_dl_bert'.".format(MODEL_NAME)
    )

# Pretrained NER tagger; consumes documents, tokens and the matching embeddings.
ner_model = NerDLModel.pretrained(MODEL_NAME, 'en') \
    .setInputCols(['document', 'token', 'embeddings']) \
    .setOutputCol('ner')

# Convert the per-token 'ner' output into 'ner_chunk' annotations, the column
# the visualizer is pointed at later in the notebook.
ner_converter = NerConverter() \
    .setInputCols(['document', 'token', 'ner']) \
    .setOutputCol('ner_chunk')

# Stage order follows the column dependencies declared above.
nlp_pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter
])

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]
ner_dl_bert download started this may take some time.
Approximate size to download 15.4 MB
[OK!]


## 6. Run the pipeline

In [None]:
# Fit the pipeline to obtain a PipelineModel. No training data is supplied
# here: the frame holds a single empty string and only provides the 'text'
# column the first stage reads.
# (NerVisualizer is imported with the other dependencies in the top import cell.)
empty_df = spark.createDataFrame([['']]).toDF('text')
pipeline_model = nlp_pipeline.fit(empty_df)

In [88]:
# Build the Spark DataFrame to annotate: one row per string in `intro`,
# stored in a single 'text' column (the column the pipeline reads).
df = spark.createDataFrame(
    pd.DataFrame({'text': intro})
)

In [85]:
# Run the fitted pipeline over the sample text. `result` carries the output
# columns configured by the pipeline stages ('document', 'token', 'embeddings',
# 'ner', 'ner_chunk') and is consumed by the visualization cell below.
result = pipeline_model.transform(df)

## 7. Visualize results

In [89]:

# Render the detected entities for the first (and, for the sample above, only)
# row of the annotated DataFrame.
NerVisualizer().display(
    result=result.collect()[0],
    label_col='ner_chunk',
    document_col='document',
)