## Advanced Analytics: NLP

- John Snow Labs : Docs & Pipelines | https://nlp.johnsnowlabs.com/docs/en/pipelines#recognize_entities_dl
- Example : https://index.scala-lang.org/johnsnowlabs/spark-nlp/spark-nlp/3.0.0?target=_2.12

In [None]:
# !pip install spark-nlp==3.0.0

In [1]:
import pandas as pd

# Widen pandas' column display so long question text is not truncated in previews.
# The fully-qualified option name is used on purpose: abbreviated names are only
# resolved by pattern matching and stop working if they ever become ambiguous.
pd.set_option('display.max_colwidth', 800)

## Create a spark context that includes a 3rd party jar for NLP

In [2]:
from pyspark.sql import SparkSession

# Build the local Spark session with the Spark NLP package on the classpath.
#  - spark.jars.packages pulls the Spark NLP 3.0.0 artifact (Scala 2.12 build).
#  - spark.serializer is switched to Kryo; without it the
#    spark.kryoserializer.buffer.max setting below has no effect.
#  - spark.driver.maxResultSize = 0 lifts the size cap on results sent to the driver.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[4]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.0.0")
    .getOrCreate()
)
spark

## Read the JSON data into one DataFrame (the same call also accepts a directory of files)

In [4]:
# Load the Jeopardy questions JSON into a DataFrame, then report row count and schema.
dataPath = "_data/JEOPARDY_QUESTIONS1.json"
df = spark.read.format("json").load(dataPath)
rowCount = df.count()
print(rowCount)
df.printSchema()

216930
root
 |-- air_date: string (nullable = true)
 |-- answer: string (nullable = true)
 |-- category: string (nullable = true)
 |-- question: string (nullable = true)
 |-- round: string (nullable = true)
 |-- show_number: string (nullable = true)
 |-- value: string (nullable = true)



In [6]:
# Column names are kept in variables because later cells reuse them.
question, answer = "question", "answer"

# Narrow the frame to the two text columns and preview the first rows.
df01 = df.select([question, answer])
df01.limit(5).toPandas()

Unnamed: 0,question,answer
0,"'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'",Copernicus
1,"'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'",Jim Thorpe
2,"'The city of Yuma in this state has a record average of 4,055 hours of sunshine each year'",Arizona
3,"'In 1963, live on ""The Art Linkletter Show"", this company served its billionth burger'",McDonald\'s
4,"'Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States'",John Adams


## Try to implement the equivalent of flatMap in DataFrames: count word occurrences

In [7]:
import pyspark.sql.functions as f

# DataFrame version of a flatMap word count:
#   1. split each question on runs of whitespace, giving an array column
#   2. explode the array so every element becomes its own row, named "word"
#   3. group identical words, count them, and sort by count descending
questionTokens = f.split(question, "\\s+")
dfWords = df.select(f.explode(questionTokens).alias("word"))
dfWordCount = dfWords.groupBy("word").count().orderBy(f.col("count").desc())
dfWordCount.limit(10).toPandas()

Unnamed: 0,word,count
0,the,146765
1,of,111318
2,this,94651
3,in,78480
4,a,76153
5,to,48533
6,&,44702
7,for,34556
8,is,33310
9,was,29059


## The goal was to find popular topics, but words like 'the', 'of' and 'this' are not really informative.

## Use the NLP library to do Part-of-Speech Tagging

In [12]:
# v1 
# from com.johnsnowlabs.nlp.pretrained.pipeline.en import BasicPipeline as bp
# v2 
# import com.johnsnowlabs.nlp.pretrained.PretrainedPipeline
# import com.johnsnowlabs.nlp.SparkNLP
# v3
import sparknlp
# ==========================================================
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline

In [13]:
# Rebind `spark` to the session returned by Spark NLP's start helper.
# NOTE(review): a SparkSession was already built earlier in this notebook with the
# Spark NLP package configured; presumably this call hands back that same session
# rather than creating a new one — confirm, and drop one of the two if redundant.
spark = sparknlp.start()

In [14]:
# Load the pretrained English 'explain_document_dl' pipeline. The recorded output
# shows it being downloaded on this run (reported size: about 169 MB).
dl = PretrainedPipeline('explain_document_dl', lang='en')

explain_document_dl download started this may take some time.
Approx size to download 169.3 MB
[OK!]


In [55]:
# FROM V1 IMPORTS ABOVE

# bp = PretrainedPipeline('recognize_entities_dl', lang='en')

recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [21]:
# Run the pretrained pipeline over the "question" column of df01.
# As the printed schema shows, the result exposes the input under the name `text`
# (alongside `answer`) and adds annotation columns such as `document` and `sentence`,
# each an array of annotation structs (annotatorType, begin, end, result, metadata, ...).
dfAnnotated = dl.annotate(df01, "question")
dfAnnotated.printSchema()

root
 |-- text: string (nullable = true)
 |-- answer: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- v

In [41]:
# FROM V1 IMPORTS ABOVE

# dfAnnotatedbp = bp.annotate(df01, "question")
# dfAnnotatedbp.printSchema()

root
 |-- text: string (nullable = true)
 |-- answer: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- v

## Deal with Map type to query subfields

In [29]:
# Preview the POS annotations per question: the original text next to the `metadata`
# and `result` sub-fields of the `pos` annotation column.
# Stored under its own name so it does not share `dfPos` with the exploded,
# one-row-per-annotation frame that a later cell assigns to that variable.
dfPosPreview = dfAnnotated.select("text", "pos.metadata", "pos.result")
dfPosPreview.limit(5).toPandas()

In [23]:
# Explode the `pos` annotation array so every annotation struct becomes its own row.
dfPos = dfAnnotated.select(f.explode("pos").alias("pos"))
dfPos.printSchema()

# Preview a handful of rows only. Converting the entire exploded frame to pandas
# pulls every annotation for every question onto the driver; the recorded run of
# the unbounded version of this cell ended in a KeyboardInterrupt.
dfPos.limit(10).toPandas()

root
 |-- pos: struct (nullable = true)
 |    |-- annotatorType: string (nullable = true)
 |    |-- begin: integer (nullable = false)
 |    |-- end: integer (nullable = false)
 |    |-- result: string (nullable = true)
 |    |-- metadata: map (nullable = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)
 |    |-- embeddings: array (nullable = true)
 |    |    |-- element: float (containsNull = false)



KeyboardInterrupt: 

In [24]:
# Keep only annotations tagged NNP or NNPS (the tags shown in the preview below
# belong to words such as 'Galileo', 'Yuma', 'Reds').
nnpFilter = "pos.result = 'NNP' or pos.result = 'NNPS'"
dfNNP = dfPos.filter(nnpFilter)

nnpPreview = dfNNP.limit(10)
nnpPreview.toPandas()

Unnamed: 0,pos
0,"(pos, 35, 41, NNP, {'word': 'Galileo'}, [])"
1,"(pos, 13, 20, NNP, {'word': 'Olympian'}, [])"
2,"(pos, 40, 47, NNP, {'word': 'Carlisle'}, [])"
3,"(pos, 56, 61, NNP, {'word': 'School'}, [])"
4,"(pos, 66, 68, NNP, {'word': 'MLB'}, [])"
5,"(pos, 87, 90, NNP, {'word': 'Reds'}, [])"
6,"(pos, 93, 98, NNP, {'word': 'Giants'}, [])"
7,"(pos, 102, 107, NNP, {'word': 'Braves'}, [])"
8,"(pos, 13, 16, NNP, {'word': 'Yuma'}, [])"
9,"(pos, 23, 25, NNP, {'word': 'Art'}, [])"


## Extract columns from a map in a col

In [28]:
# Flatten each annotation struct into two plain columns:
#   word - the 'word' entry of the annotation's metadata map
#   tag  - the annotation's result field
wordExpr = f.expr("pos.metadata['word'] as word")
tagExpr = f.expr("pos.result as tag")
dfWordTag = dfNNP.select(wordExpr, tagExpr)
dfWordTag.limit(10).toPandas()

KeyboardInterrupt: 

In [27]:
from pyspark.sql.functions import desc

# Count how often each extracted word occurs and print the most frequent ones first.
dfNnpCounts = dfWordTag.groupBy("word").count()
dfNnpCounts.orderBy(desc("count")).show()

KeyboardInterrupt: 