In [1]:
#Initialising Pyspark
import findspark
findspark.init()


In [2]:
!/mnt/miniconda/bin/pip install spark-nlp==4.2.1 --force
!/mnt/miniconda/bin/pip install sparknlp

Collecting spark-nlp==4.2.1
  Downloading spark_nlp-4.2.1-py2.py3-none-any.whl (643 kB)
[K     |████████████████████████████████| 643 kB 33.9 MB/s eta 0:00:01
[?25hInstalling collected packages: spark-nlp
  Attempting uninstall: spark-nlp
    Found existing installation: spark-nlp 4.3.2
    Uninstalling spark-nlp-4.3.2:
      Successfully uninstalled spark-nlp-4.3.2
Successfully installed spark-nlp-4.2.1
Collecting sparknlp
  Downloading sparknlp-1.0.0-py3-none-any.whl (1.4 kB)
Installing collected packages: sparknlp
Successfully installed sparknlp-1.0.0


In [33]:
#Importing required packages and modules
import pandas as pd
import numpy as np
import json
import sparknlp

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, size, to_date
from pyspark.sql.types import StringType, ArrayType
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline


from pyspark.ml.clustering import LDA
import pyLDAvis

In [6]:
#pip install pyLDAvis


In [7]:
spark = SparkSession.builder \
        .appName("SparkNLP") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.2.1") \
    .master('yarn') \
    .getOrCreate()

Ivy Default Cache set to: /home/hadoop/.ivy2/cache
The jars for the packages stored in: /home/hadoop/.ivy2/jars
:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e790358e-7f04-42e1-94c3-67f8be707312;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.2.1 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	foun

In [8]:
spark


In [9]:
#Reading in submissions data
df_sub = spark.read.parquet("s3a://ss4608-ppol-567/worldnews_clean/submissions")

                                                                                

In [10]:
#Obtaining required columns for Topic Modelling
df_title = df_sub.select("id", "date_clean", "title")

In [11]:
### Cleaning the text 

#### Intializing document assembler 
documentAssembler = DocumentAssembler().setInputCol("title").setOutputCol("document")

### Tokenizing the document 
tokenizer = Tokenizer() \
            .setInputCols("document") \
            .setOutputCol("token")

#### Removing English stop-words
stop_words = StopWordsCleaner.pretrained("stopwords_en", "en") \
                             .setInputCols(["token"]) \
                             .setOutputCol("sw_rem")



### Cleaning data to remove special characters and non-english words and converting to lower case
cleanUpPatterns = ["[^A-Za-z0-9 ]"]

normalizer = Normalizer() \
     .setInputCols(["sw_rem"]) \
     .setOutputCol("normalized") \
     .setLowercase(True) \
     .setCleanupPatterns(cleanUpPatterns) 



#### Applying Lemmatization
lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(["normalized"]) \
     .setOutputCol("clean")


### Transforming into human-readable form using finisher
finisher = Finisher() \
     .setInputCols(['clean']) 

stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[ | ]stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
Download done! Loading the resource.
[ / ]

                                                                                

[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[ / ]



[ — ]

                                                                                

[OK!]


In [12]:
#Training Preprocessing pipeline
pre_pipeline = Pipeline(
      stages = [
          documentAssembler, 
          tokenizer,
          stop_words,
          normalizer, 
          lemmatizer, 
          finisher
      ])

#Applying data to pipeline
preprocessed_df = pre_pipeline.fit(df_title).transform(df_title)

In [13]:
#Resultant data
preprocessed_df.show(5)

[Stage 4:>                                                          (0 + 1) / 1]

+-------+----------+--------------------+--------------------+
|     id|date_clean|               title|      finished_clean|
+-------+----------+--------------------+--------------------+
| zuu0wk|2022-12-25|"Ein irreparabler...|[ein, irreparable...|
| tfxnq4|2022-03-17|"Good luck with t...|        [good, luck]|
| uaxrfp|2022-04-24|"No one betrayed ...|[betray, village,...|
| tghuvb|2022-03-17|"Russian warship,...|[russian, warship...|
|10imxsx|2023-01-22|#BBTitans: Yemi s...|[bbtitans, yemi, ...|
+-------+----------+--------------------+--------------------+
only showing top 5 rows



                                                                                

#### Topic Modeling with LDA

In [14]:
#TF-IDF Vectorization
tf = CountVectorizer(inputCol='finished_clean',
                     outputCol='tf_features')

#Fitting and transforming to get term-frequencies
tf_model = tf.fit(preprocessed_df)
tf_result = tf_model.transform(preprocessed_df)

                                                                                

In [15]:
#Inverse Document Frequency 

idf = IDF(inputCol='tf_features', 
          outputCol='tf_idf_features')

#Fitting and transforming to get inverse document frequency
idf_model = idf.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

                                                                                

In [16]:
#Check intermediate results
tfidf_result.show()

23/04/04 04:50:07 WARN DAGScheduler: Broadcasting large task binary with size 1532.0 KiB


+-------+----------+---------------------------------+--------------------+--------------------+--------------------+
|     id|date_clean|                            title|      finished_clean|         tf_features|     tf_idf_features|
+-------+----------+---------------------------------+--------------------+--------------------+--------------------+
| zuu0wk|2022-12-25|             "Ein irreparabler...|[ein, irreparable...|(55712,[2316,4085...|(55712,[2316,4085...|
| tfxnq4|2022-03-17|             "Good luck with t...|        [good, luck]|(55712,[725,5391]...|(55712,[725,5391]...|
| uaxrfp|2022-04-24|             "No one betrayed ...|[betray, village,...|(55712,[169,886,9...|(55712,[169,886,9...|
| tghuvb|2022-03-17|             "Russian warship,...|[russian, warship...|(55712,[1,665,327...|(55712,[1,665,327...|
|10imxsx|2023-01-22|             #BBTitans: Yemi s...|[bbtitans, yemi, ...|(55712,[176,270,1...|(55712,[176,270,1...|
| svups7|2022-02-18|             #Ukraine #Russia ...|[u

In [27]:
#These two variables are chosen based on trial and error with best final result 
num_topics = 7
max_iter = 20

In [18]:
#Fitting and transforming the LDA model
lda = LDA(k=num_topics, 
          maxIter=max_iter, 
          featuresCol='tf_idf_features')
lda_model = lda.fit(tfidf_result)
transformed = lda_model.transform(tfidf_result)

23/04/04 04:50:19 WARN DAGScheduler: Broadcasting large task binary with size 1537.2 KiB
23/04/04 04:50:26 WARN DAGScheduler: Broadcasting large task binary with size 1537.2 KiB
23/04/04 04:50:27 WARN DAGScheduler: Broadcasting large task binary with size 1554.1 KiB
23/04/04 04:50:27 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB
23/04/04 04:50:29 WARN DAGScheduler: Broadcasting large task binary with size 10.4 MiB
23/04/04 04:50:30 WARN DAGScheduler: Broadcasting large task binary with size 1554.8 KiB
23/04/04 04:50:30 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB
23/04/04 04:50:31 WARN DAGScheduler: Broadcasting large task binary with size 10.4 MiB
23/04/04 04:50:31 WARN DAGScheduler: Broadcasting large task binary with size 1554.8 KiB
23/04/04 04:50:31 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB
23/04/04 04:50:32 WARN DAGScheduler: Broadcasting large task binary with size 10.4 MiB
23/04/04 04:50:32 WARN DAGScheduler:

In [None]:
#This is to obtain all the vocabulary back in human form
vocab = tf_model.vocabulary

def get_words(token_list):
    return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, ArrayType(StringType()))

In [None]:
#printing results of LDA
num_top_words = 10
topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))


topics.select('topic', 'topicWords').show(truncate=100)

##### Using pyLDAvis for Visualization

In [19]:
def format_data_to_pyldavis(preprocessed_df, tf_model, transformed, lda_model):
    """
    This function takes in the raw data used in LDA, the tf fit with the data, the lda model, and the final transformed data.
    From these, it finds the required arguments for PyLDAvis and returns them. 
    
    """
    #Finding term frequency word count
    temp1 = preprocessed_df.select((explode(preprocessed_df.finished_clean)).alias("words")).groupby("words").count()
    word_counts = {r["words"]:r["count"] for r in temp1.collect()}
    #Adding them to vocabulary
    word_counts = [word_counts[w] for w in tf_model.vocabulary]

    #The required arguments for PyLDA vis
    data = {"topic_term_dists": np.array(lda_model.topicsMatrix().toArray()).T, 
            "doc_topic_dists": np.array([x.toArray() for x in transformed.select(["topicDistribution"]).toPandas()['topicDistribution']]),
            "doc_lengths": [r[0] for r in preprocessed_df.select(size(preprocessed_df.finished_clean)).collect()],
            "vocab": tf_model.vocabulary,
            "term_frequency": word_counts}
    #Return result
    return data

In [20]:
def filter_bad_docs(data):
    """
    This function takes the result of format_data_to_pyldavis as input, and removes documents that have 0 in all values of vectors.
    This is needed, else errors are produced while printing pyLDAvis
    """
    bad = 0
    doc_topic_dists_filter = []
    doc_lengths_filter = []

    for x,y in zip(data['doc_topic_dists'], data['doc_lengths']):
        if np.sum(x)==0:
            bad+=1
        elif np.sum(x) != 1:
            bad+=1
        elif np.isnan(x).any():
            bad+=1
        else:
            doc_topic_dists_filter.append(x)
            doc_lengths_filter.append(y)
    
    data['doc_topic_dists'] = doc_topic_dists_filter
    data['doc_lengths'] = doc_lengths_filter

In [58]:
#Function calls for pyLDAvis prepare
data = format_data_to_pyldavis(preprocessed_df, tf_model, transformed, lda_model)
filter_bad_docs(data)
py_lda_prepared_data = pyLDAvis.prepare(**data)

                                                                                

In [59]:
py_lda_prepared_data

PreparedData(topic_coordinates=              x         y  topics  cluster       Freq
topic                                                
3      0.104505  0.051197       1        1  24.205433
2      0.046219 -0.107132       2        1  21.577993
4     -0.072891 -0.081293       3        1  19.785617
0      0.045180  0.062763       4        1  18.178663
1     -0.123013  0.074464       5        1  16.252293, topic_info=          Term          Freq         Total Category  logprob  loglift
25       korea  1.310904e+06  1.310904e+06  Default  30.0000  30.0000
36       north  1.087890e+06  1.087890e+06  Default  29.0000  29.0000
27       drone  1.606799e+06  1.606799e+06  Default  28.0000  28.0000
28        call  1.586827e+06  1.586827e+06  Default  27.0000  27.0000
53      crimea  9.809990e+05  9.809990e+05  Default  26.0000  26.0000
..         ...           ...           ...      ...      ...      ...
152      price  3.248552e+05  4.796506e+05   Topic5   3.9408  11.5568
23       biden  4.3

In [60]:
type(py_lda_prepared_data)

pyLDAvis._prepare.PreparedData

In [22]:
#Printing data
pyLDAvis.display(py_lda_prepared_data)


In [29]:
preprocessed_df.show(5)

[Stage 79:>                                                         (0 + 1) / 1]

+-------+----------+--------------------+--------------------+
|     id|date_clean|               title|      finished_clean|
+-------+----------+--------------------+--------------------+
| zuu0wk|2022-12-25|"Ein irreparabler...|[ein, irreparable...|
| tfxnq4|2022-03-17|"Good luck with t...|        [good, luck]|
| uaxrfp|2022-04-24|"No one betrayed ...|[betray, village,...|
| tghuvb|2022-03-17|"Russian warship,...|[russian, warship...|
|10imxsx|2023-01-22|#BBTitans: Yemi s...|[bbtitans, yemi, ...|
+-------+----------+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [57]:
#Reading the date-event csv file of important events related to Russia-Ukraine conflict in 2022
#df_ext = pd.read_csv('../data/csv/War_Events.csv')
#df_ext

In [39]:
#2 week period since Phase-1 of clash
startDate = "2022-02-24"
endDate = "2022-03-10"
df_filter1 = preprocessed_df.filter(to_date("date_clean", "yyyy-MM-dd").between(startDate, endDate))

In [51]:
#2 week period since Phase-4 of clash
startDate = "2022-10-04"
endDate = "2022-10-18"
df_filter2 = preprocessed_df.filter(to_date("date_clean", "yyyy-MM-dd").between(startDate, endDate))

In [40]:
df_filter1.count()

                                                                                

21857

In [52]:
df_filter2.count()

                                                                                

7314

In [54]:
#Fitting and transforming to get term-frequencies
tf_model = tf.fit(df_filter1)
tf_result = tf_model.transform(df_filter1)

#Fitting and transforming to get inverse document frequency
idf_model = idf.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

#Fitting and transforming the LDA model
lda = LDA(k=5, 
           maxIter=10, 
           featuresCol='tf_idf_features')
lda_model = lda.fit(tfidf_result)
transformed = lda_model.transform(tfidf_result)

#This is to obtain all the vocabulary back in human form
vocab = tf_model.vocabulary

def get_words(token_list):
    return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, ArrayType(StringType()))
    
#Function calls for pyLDAvis prepare
data = format_data_to_pyldavis(df_filter1, tf_model, transformed, lda_model)
filter_bad_docs(data)
py_lda_prepared_data = pyLDAvis.prepare(**data)

#Printing data
pyLDAvis.display(py_lda_prepared_data)

23/04/04 05:32:30 WARN DAGScheduler: Broadcasting large task binary with size 1660.5 KiB
23/04/04 05:32:31 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/04/04 05:32:31 WARN DAGScheduler: Broadcasting large task binary with size 1660.5 KiB
23/04/04 05:32:31 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/04/04 05:32:32 WARN DAGScheduler: Broadcasting large task binary with size 1660.5 KiB
23/04/04 05:32:32 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/04/04 05:32:32 WARN DAGScheduler: Broadcasting large task binary with size 1660.5 KiB
23/04/04 05:32:32 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/04/04 05:32:32 WARN DAGScheduler: Broadcasting large task binary with size 1660.5 KiB
23/04/04 05:32:33 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/04/04 05:32:33 WARN DAGScheduler: Broadcasting large task binary with size 1660.5 KiB
23/04/04 05:32:33 WARN DAGScheduler:

In [55]:
#2 week period since Phase-4 of clash
startDate = "2022-10-04"
endDate = "2022-10-18"
df_filter2 = preprocessed_df.filter(to_date("date_clean", "yyyy-MM-dd").between(startDate, endDate))
df_filter2.count()

                                                                                

7314

In [56]:
#Fitting and transforming to get term-frequencies
tf_model = tf.fit(df_filter2)
tf_result = tf_model.transform(df_filter2)

#Fitting and transforming to get inverse document frequency
idf_model = idf.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

#Fitting and transforming the LDA model
lda = LDA(k=5, 
           maxIter=10, 
           featuresCol='tf_idf_features')
lda_model = lda.fit(tfidf_result)
transformed = lda_model.transform(tfidf_result)

#This is to obtain all the vocabulary back in human form
vocab = tf_model.vocabulary

def get_words(token_list):
    return [vocab[token_id] for token_id in token_list]

udf_to_words = F.udf(get_words, ArrayType(StringType()))
    
#Function calls for pyLDAvis prepare
data = format_data_to_pyldavis(df_filter2, tf_model, transformed, lda_model)
filter_bad_docs(data)
py_lda_prepared_data = pyLDAvis.prepare(**data)

#Printing data
pyLDAvis.display(py_lda_prepared_data)

23/04/04 05:33:18 WARN DAGScheduler: Broadcasting large task binary with size 1082.7 KiB
23/04/04 05:33:18 WARN DAGScheduler: Broadcasting large task binary with size 1453.7 KiB
23/04/04 05:33:18 WARN DAGScheduler: Broadcasting large task binary with size 1082.7 KiB
23/04/04 05:33:18 WARN DAGScheduler: Broadcasting large task binary with size 1453.7 KiB
23/04/04 05:33:19 WARN DAGScheduler: Broadcasting large task binary with size 1082.7 KiB
23/04/04 05:33:19 WARN DAGScheduler: Broadcasting large task binary with size 1453.7 KiB
23/04/04 05:33:19 WARN DAGScheduler: Broadcasting large task binary with size 1082.7 KiB
23/04/04 05:33:19 WARN DAGScheduler: Broadcasting large task binary with size 1453.7 KiB
23/04/04 05:33:19 WARN DAGScheduler: Broadcasting large task binary with size 1082.7 KiB
23/04/04 05:33:19 WARN DAGScheduler: Broadcasting large task binary with size 1453.7 KiB
23/04/04 05:33:19 WARN DAGScheduler: Broadcasting large task binary with size 1082.7 KiB
23/04/04 05:33:19 WAR

In [61]:
spark.stop()