In [1]:
#Initialising Pyspark
import findspark
findspark.init()


In [2]:
!/mnt/miniconda/bin/pip install spark-nlp==4.2.1 --force
!/mnt/miniconda/bin/pip install sparknlp

Collecting spark-nlp==4.2.1
  Downloading spark_nlp-4.2.1-py2.py3-none-any.whl (643 kB)
[K     |████████████████████████████████| 643 kB 29.8 MB/s eta 0:00:01
[?25hInstalling collected packages: spark-nlp
  Attempting uninstall: spark-nlp
    Found existing installation: spark-nlp 4.3.2
    Uninstalling spark-nlp-4.3.2:
      Successfully uninstalled spark-nlp-4.3.2
Successfully installed spark-nlp-4.2.1
Collecting sparknlp
  Downloading sparknlp-1.0.0-py3-none-any.whl (1.4 kB)
Installing collected packages: sparknlp
Successfully installed sparknlp-1.0.0


In [3]:
#Importing required packages and modules
import pandas as pd
import numpy as np
import json
import sparknlp

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, size, to_date
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline


from sparknlp.annotator import NerConverter

In [4]:
spark = SparkSession.builder \
        .appName("SparkNLP") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.1") \
    .master('yarn') \
    .getOrCreate()

Ivy Default Cache set to: /home/hadoop/.ivy2/cache
The jars for the packages stored in: /home/hadoop/.ivy2/jars
:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-44d70caa-5a48-46c5-bbd5-ab76c9165fa3;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.2.1 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	foun

In [5]:
spark

In [6]:
#Reading in submissions data
df_sub = spark.read.parquet("s3a://ss4608-ppol-567/worldnews_clean/submissions")

                                                                                

In [11]:
#Obtaining required columns for ner
df_title = df_sub.select("title")

In [12]:
### Cleaning the text 

#### Intializing document assembler 
documentAssembler = DocumentAssembler().setInputCol("title").setOutputCol("document")

### Tokenizing the document 
tokenizer = Tokenizer() \
            .setInputCols("document") \
            .setOutputCol("token")

#### Removing English stop-words
stop_words = StopWordsCleaner.pretrained("stopwords_en", "en") \
                             .setInputCols(["token"]) \
                             .setOutputCol("sw_rem")



### Cleaning data to remove special characters and non-english words and converting to lower case
cleanUpPatterns = ["[^A-Za-z0-9 ]"]

normalizer = Normalizer() \
     .setInputCols(["sw_rem"]) \
     .setOutputCol("normalized") \
     .setLowercase(True) \
     .setCleanupPatterns(cleanUpPatterns) 

#### Applying Lemmatization
lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(["normalized"]) \
     .setOutputCol("clean")

### Synonym Matching 


### Transforming into human-readable form using finisher
finisher = Finisher() \
     .setInputCols(['clean']) 

stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[ | ]stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[ / ]Download done! Loading the resource.


[Stage 1:>                                                          (0 + 0) / 1]

[ \ ]

[Stage 1:>                                                          (0 + 1) / 1]

[ | ]

                                                                                

[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ — ]Download done! Loading the resource.
[ \ ]

                                                                                

[OK!]


In [13]:
#Trainbing Preprocessing pipeline
pre_pipeline = Pipeline(
      stages = [
          documentAssembler, 
          tokenizer,
          stop_words,
          normalizer, 
          lemmatizer, 
          finisher
      ])

In [14]:
df_sub_req = df_sub.select(["id", "title", "created_date", "date_clean", "Live_Thread", "War_Dummy"])
df_sub_preprocessed = pre_pipeline.fit(df_sub_req).transform(df_sub_req)


In [18]:
### Creating a new column for the sentence to apply sentence embeddings and implement goal specific processing. 
df_sub_preprocessed = df_sub_preprocessed.withColumn("string_form", F.concat_ws(" ", F.col("finished_clean")))

In [19]:
#### Intializing document assembler 
documentAssembler = DocumentAssembler().setInputCol("string_form").setOutputCol("document")

In [20]:

### Tokenizing the document 
tokenizer = Tokenizer() \
            .setInputCols("document") \
            .setOutputCol("token")

In [21]:
#Sentence Detection from Documents
sentence = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence") \
    .setCustomBounds(["\n\n"])

In [22]:
#Word-Embeddings
word_embedding = WordEmbeddingsModel.pretrained().setInputCols("sentence", "token").setOutputCol("bert")

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[ | ]glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[ — ]Download done! Loading the resource.
[ \ ]

[Stage 4:>                                                          (0 + 0) / 1]

[ | ]

[Stage 4:>                                                          (0 + 1) / 1]

[ / ]

                                                                                

[OK!]


In [23]:
#Named-entities tagging
nerTagger = NerDLModel.pretrained().setInputCols("sentence", "token", "bert").setOutputCol("ner")

ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[ | ]ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[ / ]Download done! Loading the resource.


[Stage 5:>                                                          (0 + 0) / 1]

[ \ ]

[Stage 5:>                                                          (0 + 1) / 1]

[ | ]

                                                                                

[ — ]

2023-04-05 16:06:51.300652: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-05 16:06:51.457331: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_de

[OK!]


In [24]:
#To convert Named-entities to human-readable form
ner_converter = NerConverter() \
    .setInputCols(["document", "token", "ner"]) \
    .setOutputCol("ner_chunk")

In [25]:
#Training Preprocessing pipeline
pre_pipeline = Pipeline(
      stages = [
          documentAssembler, 
          tokenizer,
          sentence,
          word_embedding, 
          nerTagger, 
          ner_converter
      ])

In [27]:
#Applying data to pipeline
data_ner = pre_pipeline.fit(df_sub_preprocessed).transform(df_sub_preprocessed)

In [28]:
data_ner.show(5)

[Stage 10:>                                                         (0 + 1) / 1]

+-------+--------------------+------------+----------+-----------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     id|               title|created_date|date_clean|Live_Thread|War_Dummy|      finished_clean|         string_form|            document|               token|            sentence|                bert|                 ner|           ner_chunk|
+-------+--------------------+------------+----------+-----------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| zuu0wk|"Ein irreparabler...|  12-25-2022|2022-12-25|      false|     true|[ein, irreparable...|ein irreparabler ...|[[document, 0, 72...|[[token, 0, 2, ei...|[[document, 0, 72...|[[word_embeddings...|[[named_entity, 0...|[[chunk, 30, 32, ...|
| tfxnq4|"Good luck 

                                                                                

In [29]:
#Saving final Named-Entity results
ner_final = data_ner.select(F.explode(F.arrays_zip(data_ner.ner_chunk.result, 
                                     data_ner.ner_chunk.metadata)).alias("cols")) \
      .select(F.expr("cols['0']").alias("chunk"),
              F.expr("cols['1']['entity']").alias("ner_label"))

In [36]:
ner_final.filter(F.col('chunk')=='russia').count()

                                                                                

0

In [30]:
#Finding the entities that appeared the most 
ner_final.groupby('chunk') \
         .agg(F.count("chunk").alias("Total_Occurences")) \
         .orderBy(F.col("Total_Occurences").desc()).show(12)   



+------------+----------------+
|       chunk|Total_Occurences|
+------------+----------------+
|      german|            1069|
|     germany|             829|
|     iranian|             465|
|      poland|             431|
|     zealand|             335|
|         abe|             331|
|       italy|             329|
|        modi|             283|
|      pelosi|             256|
|     italian|             231|
|zaporizhzhia|             209|
|     armenia|             142|
+------------+----------------+
only showing top 12 rows



                                                                                

In [31]:
#Filtering to entities that belong to 'Person' label
ner_person_df = ner_final.filter(F.col("ner_label") == "PER") \
             .groupby('chunk') \
             .agg(F.count("chunk").alias("Total_Occurences")) \
             .orderBy(F.col("Total_Occurences").desc()).limit(12).toPandas()  

                                                                                

In [33]:
ner_person_df

Unnamed: 0,chunk,Total_Occurences
0,abe,331
1,modi,283
2,pelosi,256
3,liz,122
4,iii,83
5,orban,74
6,amini,71
7,von der leyen,68
8,elon,64
9,dmitry,53


In [32]:
#Filtering to entities that belong to 'Location' label
ner_loc_df = ner_final.filter(F.col("ner_label") == "LOC") \
             .groupby('chunk') \
             .agg(F.count("chunk").alias("Total_Occurences")) \
             .orderBy(F.col("Total_Occurences").desc()).limit(12).toPandas()

                                                                                

In [34]:
ner_loc_df

Unnamed: 0,chunk,Total_Occurences
0,germany,829
1,poland,431
2,zealand,335
3,italy,329
4,armenia,142
5,zaporizhzhia,133
6,munich,45
7,mali,36
8,oman,25
9,malta,24


In [40]:
#Filtering to entities that belong to 'Organization' label
#ner_org_df = ner_final.filter(F.col("ner_label") == "ORG") \
#             .groupby('chunk') \
#             .agg(F.count("chunk").alias("Total_Occurences")) \
#             .orderBy(F.col("Total_Occurences").desc()).limit(12).toPandas()

In [121]:
#Checking results
ner_person_df

Unnamed: 0,chunk,Total_Occurences
0,Putin,5355
1,Biden,1957
2,Zelensky,930
3,Vladimir Putin,790
4,Boris Johnson,726
5,Zelenskyy,641
6,Russia's,372
7,Trump,312
8,Liz Truss,288
9,Bucha,270


In [122]:
#Checking results
ner_loc_df

Unnamed: 0,chunk,Total_Occurences
0,Ukraine,25326
1,Russia,16702
2,US,5656
3,China,4693
4,U.S,3725
5,India,2633
6,UK,2556
7,Iran,2430
8,Germany,1725
9,Israel,1593


In [38]:
#Checking results
#ner_org_df

In [124]:
#Renaming column names to more meaningful ones
ner_person_df = ner_person_df.rename(columns={'chunk' : 'Person Entity', 'Total_Occurences' : 'Total Occurences'}) 
ner_loc_df = ner_loc_df.rename(columns={'chunk' : 'Location Entity', 'Total_Occurences' : 'Total Occurences'}) 
ner_org_df = ner_org_df.rename(columns={'chunk' : 'Organisation Entity', 'Total_Occurences' : 'Total Occurences'}) 

In [41]:
#Checking results
ner_person_df

Unnamed: 0,chunk,Total_Occurences
0,abe,331
1,modi,283
2,pelosi,256
3,liz,122
4,iii,83
5,orban,74
6,amini,71
7,von der leyen,68
8,elon,64
9,dmitry,53


In [42]:
#Saving to csv
ner_person_df.to_csv('NER_person_preprocessed.csv', index=False)
ner_loc_df.to_csv('NER_location_preprocessed.csv', index=False)
#ner_org_df.to_csv('NER_organisation.csv', index=False)

In [44]:
spark.close

AttributeError: 'SparkSession' object has no attribute 'close'