# SPARK NLP - FEED THE CHILD TRAFFICKING DF INTO A SPARK DF ...

In [None]:
import pandas as pd

In [None]:
# Load the article dataset from a CSV file in the working directory.
child_trafficking_rest_complete = pd.read_csv(
    filepath_or_buffer='child_trafficking_rest_complete.csv'
)

In [None]:
# Preview the first five rows (5 is the pandas default, spelled out here).
child_trafficking_rest_complete.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date,Title,Article_Link,Article,cleaned_article,cleaned2_article
0,0,2020-09-22 04:00:00,A look at Netflix's most controversial content,https://search.bangkokpost.com/track/visitAndR...,Netflix is facing backlash from viewers in sev...,Netflix is facing backlash from viewers in sev...,Netflix is facing backlash from viewers in sev...
1,1,2020-07-28 20:45:00,\n\t\t\t\t\t\t\t\t\t\t\tPolice search German a...,https://search.bangkokpost.com/track/visitAndR...,BERLIN - Police are searching an allotment plo...,BERLIN - Police are searching an allotment plo...,BERLIN Police are searching an allotment plot...
2,2,2020-07-23 10:45:00,\n\t\t\t\t\t\t\t\t\t\t\tDespite Twitter crackd...,https://search.bangkokpost.com/track/visitAndR...,WASHINGTON - Twitter's decision to crack down ...,WASHINGTON - Twitter's decision to crack down ...,WASHINGTON Twitters decision to crack down on...
3,3,2020-06-19 08:37:00,\n\t\t\t\t\t\t\t\t\t\t\tOnline child sex abuse...,https://search.bangkokpost.com/track/visitAndR...,Online child sex abuse cases in Thailand are s...,Online child sex abuse cases in Thailand are s...,Online child sex abuse cases in Thailand are s...
4,4,2020-06-18 00:45:00,\n\t\t\t\t\t\t\t\t\t\t\t'Concrete evidence' th...,https://search.bangkokpost.com/track/visitAndR...,BERLIN - German prosecutors said Wednesday the...,BERLIN - German prosecutors said Wednesday the...,BERLIN German prosecutors said Wednesday they...


In [None]:
# Keep only the article text column; the list-style selector yields a
# one-column DataFrame rather than a Series.
childTraff_4spark = child_trafficking_rest_complete[
    ['cleaned_article']
]

In [None]:
# Preview the first five rows of the one-column frame.
childTraff_4spark.head(n=5)

Unnamed: 0,cleaned_article
0,Netflix is facing backlash from viewers in sev...
1,BERLIN - Police are searching an allotment plo...
2,WASHINGTON - Twitter's decision to crack down ...
3,Online child sex abuse cases in Thailand are s...
4,BERLIN - German prosecutors said Wednesday the...


In [None]:
# Environment setup cell: installs a JDK with apt, exports JAVA_HOME/PATH for
# this process, then installs pinned pyspark and spark-nlp versions with pip.
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

# Point JAVA_HOME at the JDK path and put its bin/ directory first on PATH.
# NOTE(review): re-running this cell prepends the same entry to PATH again.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark and spark-nlp (versions pinned; spark-nlp is a release candidate)
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.6.3-rc1

openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)
[K     |████████████████████████████████| 215.7MB 71kB/s 
[K     |████████████████████████████████| 204kB 40.4MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 133kB 3.4MB/s 
[?25h

In [None]:
import sparknlp

# Start a Spark session through Spark NLP's helper.
# Optional start() params =>> gpu=False, spark23=False (start with spark 2.3)
spark = sparknlp.start()

# Report the library versions this notebook is running against.
nlp_version = sparknlp.version()
spark_version = spark.version
print("Spark NLP version", nlp_version)
print("Apache Spark version:", spark_version)

Spark NLP version 2.6.3-rc1
Apache Spark version: 2.4.4


In [None]:
# The Spark NLP stages below read their input from a column named "text".
childTraff_4spark = childTraff_4spark.rename(
    {"cleaned_article": "text"},
    axis="columns",
)

In [None]:
# Convert the pandas frame into a Spark DataFrame.
childTraff_spark_df = spark.createDataFrame(data=childTraff_4spark)

In [None]:
# Echo the Spark DataFrame; the output shows its schema summary (one string column, "text").
childTraff_spark_df

DataFrame[text: string]

In [None]:
# Shorter alias for the same Spark DataFrame; later cells use `spark_df`.
spark_df = childTraff_spark_df

In [None]:
from sparknlp.base import *

# Build a DocumentAssembler that reads "text" and writes a "document"
# annotation column, with cleanup mode "shrink".
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
    .setCleanupMode("shrink")
)

# Apply it on its own to inspect the annotation structure.
doc_df = documentAssembler.transform(spark_df)
doc_df.show(truncate=30)

+------------------------------+------------------------------+
|                          text|                      document|
+------------------------------+------------------------------+
|Netflix is facing backlash ...|[[document, 0, 2725, Netfli...|
|BERLIN - Police are searchi...|[[document, 0, 3395, BERLIN...|
|WASHINGTON - Twitter's deci...|[[document, 0, 4644, WASHIN...|
|Online child sex abuse case...|[[document, 0, 2767, Online...|
|BERLIN - German prosecutors...|[[document, 0, 3530, BERLIN...|
|BRUSSELS - Belgium reopened...|[[document, 0, 3064, BRUSSE...|
|LONDON - The family of miss...|[[document, 0, 4183, LONDON...|
|BERLIN - Police revealed We...|[[document, 0, 3783, BERLIN...|
|Had not the doctors interve...|[[document, 0, 7085, Had no...|
|With the enormous number of...|[[document, 0, 3800, With t...|
|A 36-year-old assistant vil...|[[document, 0, 1827, A 36-y...|
|WASHINGTON - The Saudi mili...|[[document, 0, 3561, WASHIN...|
|WASHINGTON - The Saudi mili...|[[docume

# NER USING SPARK NLP

In [None]:
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [None]:
from sparknlp.training import CoNLL
import pyspark.sql.functions as F

In [None]:
# --- NER pipeline built on pretrained BERT embeddings ---

# "text" column -> "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "token" annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Pretrained 'bert_base_cased' embeddings written to "embeddings".
bert_embeddings = (
    BertEmbeddings.pretrained('bert_base_cased')
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Pretrained 'ner_dl_bert' tagger; reads the embeddings and writes "ner".
onto_ner_bert = (
    NerDLModel.pretrained("ner_dl_bert", 'en')
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

# The return value is discarded here because this is not the cell's last expression.
onto_ner_bert.getStorageRef()

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        bert_embeddings,
        onto_ner_bert,
    ]
)

# Fit on a one-row frame holding an empty string to obtain a PipelineModel.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]
ner_dl_bert download started this may take some time.
Approximate size to download 15.4 MB
[OK!]


### Public NER (CoNLL 2003)

<p><strong>Named-entity recognition</strong> is a well-known technique in information extraction; it is also known as&nbsp;<strong>entity identification</strong>,&nbsp;<strong>entity chunking</strong>&nbsp;and&nbsp;<strong>entity extraction</strong>.&nbsp;Knowing the relevant tags for each article helps in automatically categorizing the articles into defined hierarchies and enables smooth content discovery.</p>

Entities

``` PERSON, LOCATION, ORGANIZATION, MISC ```

In [None]:
# Pretrained 'ner_dl' tagger: reads document/token/embeddings, writes "ner".
public_ner = (
    NerDLModel.pretrained("ner_dl", 'en')
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [None]:
# List the tag set of the loaded model (O plus B-/I- tags for ORG, LOC, PER and MISC, per the output below).
public_ner.getClasses()

['O', 'B-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC']

In [None]:
# --- NER pipeline built on GloVe embeddings and the public 'ner_dl' model ---

# "text" column -> "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "token" annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# ner_dl model is trained with glove_100d. So we use the same embeddings in the pipeline
glove_embeddings = (
    WordEmbeddingsModel.pretrained('glove_100d')
    .setInputCols(["document", 'token'])
    .setOutputCol("embeddings")
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        glove_embeddings,
        public_ner,
    ]
)

# Fit on a one-row frame holding an empty string to obtain a PipelineModel.
empty_df = spark.createDataFrame([['']]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Run the fitted pipeline over every article.
result = pipelineModel.transform(dataset=spark_df)

In [None]:
# Run the pipeline over the first 10 rows only.
result = pipelineModel.transform(spark_df.limit(10))

# Pair each token with its NER tag and emit one row per (token, tag) pair.
result_df = (
    result
    .select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"),
    )
)

result_df.show(50, truncate=100)



+-------------+---------+
|        token|ner_label|
+-------------+---------+
|      Netflix|        O|
|           is|        O|
|       facing|        O|
|     backlash|        O|
|         from|        O|
|      viewers|        O|
|           in|        O|
|      several|        O|
|    countries|        O|
|    following|        O|
|          the|        O|
|       recent|        O|
|      release|        O|
|           of|        O|
|award-winning|        O|
|         film|        O|
|       Cuties|    B-PER|
|            .|        O|
|     Directed|        O|
|           by|        O|
|     Maïmouna|    B-PER|
|     Doucouré|    I-PER|
|            ,|        O|
|          the|        O|
|         film|        O|
|          has|        O|
|         been|        O|
|   criticised|        O|
|          for|        O|
|  sexualising|        O|
|        young|        O|
|        girls|        O|
|            .|        O|
|         This|        O|
|           is|        O|
|          n

### Getting the NER chunks with NER Converter

In [None]:

# NerConverter reads document/token/ner and writes chunk annotations to "ner_chunk".
ner_converter = (
    NerConverter()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Same stages as the BERT pipeline, with the converter appended.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        bert_embeddings,
        onto_ner_bert,
        ner_converter,
    ]
)

# This pipeline is fitted on the article frame itself rather than on an empty frame.
pipelineModel = nlpPipeline.fit(spark_df)

# Full dataset; append .limit(10) to spark_df for a quick trial run.
result = pipelineModel.transform(spark_df)



In [None]:
# One row per extracted chunk: the chunk text plus the 'entity' value from its metadata.
(
    result
    .select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+-----------------+---------+
|chunk            |ner_label|
+-----------------+---------+
|Netflix          |ORG      |
|Cuties           |ORG      |
|Maïmouna Doucouré|PER      |
|Netflix          |ORG      |
|Netflix          |MISC     |
|Thailand         |LOC      |
|Cuties Cuties    |ORG      |
|Amy              |PER      |
|BERLIN           |ORG      |
|Police           |ORG      |
|German           |MISC     |
|Hanover          |LOC      |
|British          |MISC     |
|Madeleine McCann |PER      |
|Madeleine        |PER      |
|Portuguese       |MISC     |
|Praia da Luz     |LOC      |
|WASHINGTON       |LOC      |
|QAnon            |MISC     |
|US               |LOC      |
+-----------------+---------+
only showing top 20 rows



In [None]:
# Count how often each token-level NER tag was predicted, most frequent first.
#
# The previous version of this cell used typographic quotes (a SyntaxError) and
# read a `prediction` DataFrame with a `label` column. Neither is defined in the
# visible notebook, so this version reads the `token` and `ner` columns of `result`.
(
    result
    .select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("prediction"),
    )
    .groupBy('prediction')
    .count()
    .orderBy('count', ascending=False)
    .show(truncate=False)
)

SyntaxError: ignored

In [None]:
# Count extracted chunks per entity label, most frequent first.
#
# `result` has no top-level `chunk` / `ner_label` columns, so they are derived
# here by exploding `ner_chunk`. The method name is `orderBy` (it was misspelled
# `oderBy`).
(
    result
    .select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .groupBy('ner_label')
    .count()
    .orderBy('count', ascending=False)
    .show(truncate=False)
)

AnalysisException: ignored

In [None]:
# Finisher configured to read the "ner_chunk" annotation column.
finisher = (
    Finisher()
    .setInputCols("ner_chunk")
)

In [None]:
# Apply the Finisher to the pipeline output.
finished = finisher.transform(dataset=result)

In [None]:
# Print the first 20 rows with long cells truncated (the show() defaults, spelled out).
finished.show(n=20, truncate=True)

+--------------------+--------------------+
|                text|  finished_ner_chunk|
+--------------------+--------------------+
|Netflix is facing...|[Netflix, Cuties,...|
|BERLIN - Police a...|[BERLIN, Police, ...|
|WASHINGTON - Twit...|[WASHINGTON, QAno...|
|Online child sex ...|[Thailand, Intern...|
|BERLIN - German p...|[BERLIN, German, ...|
|BRUSSELS - Belgiu...|[Belgium, German,...|
|LONDON - The fami...|[LONDON, British,...|
|BERLIN - Police r...|[BERLIN, Police, ...|
|Had not the docto...|    [Nittha Wongwan]|
|With the enormous...|[Thailand, Protec...|
|A 36-year-old ass...|[US, CyberTipline...|
|WASHINGTON - The ...|[WASHINGTON, Saud...|
|WASHINGTON - The ...|[WASHINGTON, Saud...|
|CHIANG MAI: A man...|[CHIANG, Criminal...|
|Facebook Inc. is ...|[Facebook Inc, Ma...|
|With the enormous...|[Thailand, Protec...|
|With a great numb...|[Thailand, Childr...|
|SAN FRANCISCO: Th...|[SAN FRANCISCO, F...|
|SAN FRANCISCO - T...|[SAN FRANCISCO, F...|
|A man has been ar...|[Suphan Bu

In [None]:
# Collect the whole annotated Spark DataFrame into a pandas DataFrame.
# NOTE(review): this includes the embeddings column and may be memory-heavy — confirm it is needed.
exp = result.toPandas()

In [None]:
# Display the collected frame (pandas truncates the middle rows in the rendered output).
exp

Unnamed: 0,text,document,token,embeddings,ner,ner_chunk
0,Netflix is facing backlash from viewers in sev...,"[(document, 0, 2725, Netflix is facing backlas...","[(token, 0, 6, Netflix, {'sentence': '0'}, [])...","[(word_embeddings, 0, 6, Netflix, {'sentence':...","[(named_entity, 0, 6, I-ORG, {'word': 'Netflix...","[(chunk, 0, 6, Netflix, {'sentence': '0', 'chu..."
1,BERLIN - Police are searching an allotment plo...,"[(document, 0, 3395, BERLIN - Police are searc...","[(token, 0, 5, BERLIN, {'sentence': '0'}, []),...","[(word_embeddings, 0, 5, BERLIN, {'sentence': ...","[(named_entity, 0, 5, I-ORG, {'word': 'BERLIN'...","[(chunk, 0, 5, BERLIN, {'sentence': '0', 'chun..."
2,WASHINGTON - Twitter's decision to crack down ...,"[(document, 0, 4644, WASHINGTON - Twitter's de...","[(token, 0, 9, WASHINGTON, {'sentence': '0'}, ...","[(word_embeddings, 0, 9, WASHINGTON, {'sentenc...","[(named_entity, 0, 9, I-LOC, {'word': 'WASHING...","[(chunk, 0, 9, WASHINGTON, {'sentence': '0', '..."
3,Online child sex abuse cases in Thailand are s...,"[(document, 0, 2767, Online child sex abuse ca...","[(token, 0, 5, Online, {'sentence': '0'}, []),...","[(word_embeddings, 0, 5, Online, {'sentence': ...","[(named_entity, 0, 5, O, {'word': 'Online'}, [...","[(chunk, 32, 39, Thailand, {'sentence': '0', '..."
4,BERLIN - German prosecutors said Wednesday the...,"[(document, 0, 3530, BERLIN - German prosecuto...","[(token, 0, 5, BERLIN, {'sentence': '0'}, []),...","[(word_embeddings, 0, 5, BERLIN, {'sentence': ...","[(named_entity, 0, 5, I-ORG, {'word': 'BERLIN'...","[(chunk, 0, 5, BERLIN, {'sentence': '0', 'chun..."
...,...,...,...,...,...,...
226,MANILA - Philippine police on Tuesday vowed to...,"[(document, 0, 1435, MANILA - Philippine polic...","[(token, 0, 5, MANILA, {'sentence': '0'}, []),...","[(word_embeddings, 0, 5, MANILA, {'sentence': ...","[(named_entity, 0, 5, I-LOC, {'word': 'MANILA'...","[(chunk, 0, 5, MANILA, {'sentence': '0', 'chun..."
227,Philippine President Benigno Aquino on Wednesd...,"[(document, 0, 2702, Philippine President Beni...","[(token, 0, 9, Philippine, {'sentence': '0'}, ...","[(word_embeddings, 0, 9, Philippine, {'sentenc...","[(named_entity, 0, 9, O, {'word': 'Philippine'...","[(chunk, 21, 34, Benigno Aquino, {'sentence': ..."
228,The Philippines' top court ruled on Tuesday th...,"[(document, 0, 2956, The Philippines' top cour...","[(token, 0, 2, The, {'sentence': '0'}, []), (t...","[(word_embeddings, 0, 2, The, {'sentence': '0'...","[(named_entity, 0, 2, O, {'word': 'The'}, []),...","[(chunk, 4, 14, Philippines, {'sentence': '0',..."
229,Philippine authorities said Wednesday they had...,"[(document, 0, 2373, Philippine authorities sa...","[(token, 0, 9, Philippine, {'sentence': '0'}, ...","[(word_embeddings, 0, 9, Philippine, {'sentenc...","[(named_entity, 0, 9, I-LOC, {'word': 'Philipp...","[(chunk, 0, 9, Philippine, {'sentence': '0', '..."


In [None]:
# Inspect the chunk annotations column on its own.
exp["ner_chunk"]

0      [(chunk, 0, 6, Netflix, {'sentence': '0', 'chu...
1      [(chunk, 0, 5, BERLIN, {'sentence': '0', 'chun...
2      [(chunk, 0, 9, WASHINGTON, {'sentence': '0', '...
3      [(chunk, 32, 39, Thailand, {'sentence': '0', '...
4      [(chunk, 0, 5, BERLIN, {'sentence': '0', 'chun...
                             ...                        
226    [(chunk, 0, 5, MANILA, {'sentence': '0', 'chun...
227    [(chunk, 21, 34, Benigno Aquino, {'sentence': ...
228    [(chunk, 4, 14, Philippines, {'sentence': '0',...
229    [(chunk, 0, 9, Philippine, {'sentence': '0', '...
230    [(chunk, 12, 21, Philippine, {'sentence': '0',...
Name: ner_chunk, Length: 231, dtype: object

In [None]:
# Finisher over the "ner_chunk" annotation column, matching the earlier Finisher
# cell. The previous empty-string input column is not a column of `result`.
finisher = Finisher()\
    .setInputCols("ner_chunk")

In [None]:
# Collect the Finisher output (text plus finished_ner_chunk) into a pandas DataFrame.
exp2 = finished.toPandas()

In [None]:
# Display the collected Finisher output.
exp2

Unnamed: 0,text,finished_ner_chunk
0,Netflix is facing backlash from viewers in sev...,"[Netflix, Cuties, Maïmouna Doucouré, Netflix, ..."
1,BERLIN - Police are searching an allotment plo...,"[BERLIN, Police, German, Hanover, British, Mad..."
2,WASHINGTON - Twitter's decision to crack down ...,"[WASHINGTON, QAnon, US, Donald Trump's, Eric, ..."
3,Online child sex abuse cases in Thailand are s...,"[Thailand, Internet Crimes Against Children, T..."
4,BERLIN - German prosecutors said Wednesday the...,"[BERLIN, German, British, Madeleine McCann, Br..."
...,...,...
226,MANILA - Philippine police on Tuesday vowed to...,"[MANILA, Philippine, Internet, MANILA, Philipp..."
227,Philippine President Benigno Aquino on Wednesd...,"[Benigno Aquino, Internet, Philippine, Benigno..."
228,The Philippines' top court ruled on Tuesday th...,"[Philippines, Internet, Asia's, Filipino, Inte..."
229,Philippine authorities said Wednesday they had...,"[Philippine, National Bureau of Investigation,..."
