# Cleaning for Google Trends

## 1. Load packages and data

In [6]:
# import packages
import os 

import re
from datetime import datetime

import pytz

import pandas as pd
import numpy as np

import ast

import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import array_contains
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, udf, lit

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

import emojis
from translate import Translator

import sparknlp


In [15]:
# import findspark
import findspark

# initialize findspark with spark directory

#ALWAYS HAVE TO BE CHANGED 
findspark.init("/Users/wouterdewitte/spark/")

# import pyspark
import pyspark
# create spark context
sc = pyspark.SparkContext()
# create spark session 
spark = pyspark.sql.SparkSession(sc)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=Spark NLP, master=local[*]) created by getOrCreate at /Users/wouterdewitte/opt/anaconda3/lib/python3.9/site-packages/sparknlp/__init__.py:164 

In [3]:
#set this path to your path, for some reason I have an error 
#reading in all the files
#path_json = ".././../data/Topic_vegan/*.json"

# use this if you want all the tweet files, but this is usually too large
#df_json = spark.read.json(path_json)

In [2]:
list_brands = ["healthyfood",
               "healthylifestyle",
               "vegan",
               "keto",
               "ketodiet",
               "ketolifestyle",
               "veganism",
               "vegetarian"]
from re import search



data_dir = ".././../data/Topic_vegan/"
tweet_files = [os.path.join(data_dir, obs) for obs in os.listdir(data_dir)]



files_brand = [file for file in tweet_files if (file.find(list_brands[2]) != -1)]
files_brand               
               
df_json = spark.read.option("multiline","true").json(files_brand)  
df_json.count()

                                                                                

22/11/28 16:04:57 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

3428559

In [3]:
# select interesting features
import pyspark.sql.functions as F

df = df_json.select(F.col("user.name"),
                    F.col("user.screen_name"),
                    F.col("created_at"), 
                    F.col("full_text"),
                    F.col("user.followers_count"),
                    F.col("retweet_count"),
                    F.col("favorite_count"),
                    F.col("entities.hashtags"))
df.limit(10).toPandas()

Unnamed: 0,name,screen_name,created_at,full_text,followers_count,retweet_count,favorite_count,hashtags
0,のり/Nori,nori_k_629,Mon Apr 04 10:09:55 +0000 2022,RT @ohmpawatt: เพื่อนๆคิดถึงผมมั้ยยย ถ้าคิดถึง...,139,3582,0,[]
1,alice,myn4meizalize,Mon Apr 04 10:09:54 +0000 2022,RT @mynameisnanon: คิดถึงกันป่าว ถ้าคิดถึงต้อง...,655,3837,0,[]
2,Karen Reed 🌸,kandk670,Mon Apr 04 10:09:54 +0000 2022,@trudiebakescake Organic coconut oil in a jar ...,711,0,0,[]
3,ハル):),patlnwza55,Mon Apr 04 10:09:52 +0000 2022,RT @ohmpawatt: เพื่อนๆคิดถึงผมมั้ยยย ถ้าคิดถึง...,236,3582,0,[]
4,alice,myn4meizalize,Mon Apr 04 10:09:52 +0000 2022,RT @ohmpawatt: เพื่อนๆคิดถึงผมมั้ยยย ถ้าคิดถึง...,655,3582,0,[]
5,ﾌｧﾙﾄﾞﾗっ子organic有機💙💻,organic_yusai,Mon Apr 04 10:09:52 +0000 2022,マジでピンチ助けて自転車ガガガガ,291,0,1,[]
6,のり/Nori,nori_k_629,Mon Apr 04 10:09:50 +0000 2022,RT @mynameisnanon: คิดถึงกันป่าว ถ้าคิดถึงต้อง...,139,3837,0,[]
7,｡◕‿◕｡𝑱𝒆 𝒕'𝒂𝒊𝒎𝒆 🐶🧡✨,MyFnlovely97,Mon Apr 04 10:09:50 +0000 2022,RT @ohmpawatt: เพื่อนๆคิดถึงผมมั้ยยย ถ้าคิดถึง...,245,3582,0,[]
8,Sang™,asan_gk,Mon Apr 04 10:09:50 +0000 2022,RT @NotechAna: Am I the only one who types in ...,2065,374,0,[]
9,Trysia ):)▪︎never let me go▪︎,Winnie_thephuu,Mon Apr 04 10:09:48 +0000 2022,RT @ohmpawatt: เพื่อนๆคิดถึงผมมั้ยยย ถ้าคิดถึง...,379,3582,0,[]


## 2. Preprocess Data

In [7]:
# https://developer.twitter.com/en/docs/twitter-ads-api/timezones
# function to convert Twitter date string format
def getDate(date):
    if date is not None:
        return str(datetime.strptime(date,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None

# UDF declaration
date_udf = F.udf(getDate, StringType())

# apply udf
df = df.withColumn('post_created_at', F.to_utc_timestamp(date_udf("created_at"), "UTC"))

In [8]:
#drop duplicates and retweets 
df = df.filter(~F.col("full_text").startswith("RT"))\
                        .drop_duplicates()
#sorting such when dropping later we only keep the most recent post 
df = df.sort("post_created_at", ascending=False)
#removing spam accounts 
df = df.drop_duplicates(["full_text", "screen_name"])

#df.printSchema()
#df.count() #1340938

In [9]:
# define function to count hashtags
def get_hashtags(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "#" in word:
            counter += 1
    return(counter) 

# define function to count mentions
def get_mentions(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "@" in word:
            counter += 1
    return(counter)

# define function to count exclamation marks
def get_exclamation_marks(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "!" in word:
            counter += 1
    return(counter)

# define function to count number of emojis used
import emojis
def emoji_counter(text):
    nr_emojis = emojis.count(text)
    return(nr_emojis)
# register functions as udf
get_hashtags_UDF = F.udf(get_hashtags, IntegerType())
get_mentions_UDF = F.udf(get_mentions, IntegerType())
get_exclamation_marks_UDF = F.udf(get_exclamation_marks, IntegerType())
emoji_counter_udf = F.udf(emoji_counter, IntegerType())


In [10]:
twitter_df = df.withColumn("emoji_count", emoji_counter_udf("full_text")) \
                            .withColumn("text_tokenized", F.split("full_text", " ")) \
                            .withColumn("num_words", F.size("text_tokenized")) \
                            .withColumn("num_hashtags", get_hashtags_UDF("text_tokenized")) \
                            .withColumn("num_mentions", get_mentions_UDF("text_tokenized")) \
                            .withColumn("num_exclamation_marks", get_exclamation_marks_UDF("text_tokenized"))
twitter_df.show()

[Stage 17:>                                                         (0 + 1) / 1]

+--------------------+---------------+--------------------+--------------------+---------------+-------------+--------------+--------------------+-------------------+-----------+--------------------+---------+------------+------------+---------------------+
|                name|    screen_name|          created_at|           full_text|followers_count|retweet_count|favorite_count|            hashtags|    post_created_at|emoji_count|      text_tokenized|num_words|num_hashtags|num_mentions|num_exclamation_marks|
+--------------------+---------------+--------------------+--------------------+---------------+-------------+--------------+--------------------+-------------------+-----------+--------------------+---------+------------+------------+---------------------+
| Follow the Vegans Ⓥ|  vegan_v_vegan|Sat May 14 00:55:...|!\n#vegan #GoVega...|           4285|            1|             6|[{[2, 8], vegan},...|2022-05-14 00:55:33|          0|[!\n#vegan, #GoVe...|        4|           3|    

                                                                                

In [11]:
# define function to clean text
def clean_text(string):
    
    # define numbers
    NUMBERS = '0123456789'
    PUNCT = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    
    # convert text to lower case
    cleaned_string = string.lower()
    
    # remove URLS
    cleaned_string = re.sub(r'http\S+', ' ', cleaned_string)
    
    # replace emojis by words
    cleaned_string = emojis.decode(cleaned_string)
    cleaned_string = cleaned_string.replace(":"," ").replace("_"," ")
    cleaned_string = ' '.join(cleaned_string.split())
    
    # remove numbers
    cleaned_string = "".join([char for char in cleaned_string if char not in NUMBERS])
    
    # remove punctuation
    cleaned_string = "".join([char for char in cleaned_string if char not in PUNCT])
    
    # remove words conisting of one character (or less)
    cleaned_string = ' '.join([w for w in cleaned_string.split() if len(w) > 1])

    #translate to English
    translator = Translator(to_lang="en")
    cleaned_string = translator.translate(cleaned_string)

    # return
    return(cleaned_string) 
clean_text_udf = F.udf(clean_text, StringType())


In [12]:
twitter_df = df.withColumn("cleaned_text", clean_text_udf(F.col("full_text")))

# Topic Modelling

https://github.com/maobedkova/TopicModelling_PySpark_SparkNLP/blob/master/Topic_Modelling_with_PySpark_and_Spark_NLP.ipynb

https://www.johnsnowlabs.com/spark-nlp/


In [13]:
import sparknlp 

spark = sparknlp.start(m1=True)

print("Spark NLP version")
sparknlp.version()
print("Apache Spark version")
spark.version

Spark NLP version
Apache Spark version


'3.3.1'

# Spark NLP pipeline

## Basic NLP pipeline

DocumentAssembler converts data into Spark NLP annotation format that can be used by Spark NLP annotators.

In [14]:
from sparknlp.base import DocumentAssembler
documentAssembler = DocumentAssembler() \
                .setInputCol("cleaned_text") \
                .setOutputCol('document')

Next, we tokenize the data with Tokenizer

In [15]:
from sparknlp.annotator import Tokenizer
tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('tokenized')

We clean out the data and lower it with Normalizer

In [16]:
from sparknlp.annotator import Normalizer
normalizer = Normalizer() \
     .setInputCols(['tokenized']) \
     .setOutputCol('normalized') \
     .setLowercase(True)

We are going to lemmatize our text with pretrained lemming model provided by Spark NLP. We can access this model with LemmatizerModel.

In [17]:
from sparknlp.annotator import LemmatizerModel
lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemmatized')

lemma_antbnc download started this may take some time.
Approximate size to download 907,6 KB
[ — ]lemma_antbnc download started this may take some time.
Approximate size to download 907,6 KB
Download done! Loading the resource.
[OK!]


Spark NLP doesn't provide stop word list, hence, we will use nltk package to download stop words for English.

In [18]:
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')

In [19]:
from sparknlp.annotator import StopWordsCleaner
stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemmatized']) \
     .setOutputCol('unigrams') \
     .setStopWords(eng_stopwords)

In addition to unigrams, it is good to use n-grams for topic modelling as well since they help to better refine topics. We can get n-grams with NGramGenerator in Spark NLP.

In [20]:
from sparknlp.annotator import NGramGenerator

ngrammer = NGramGenerator() \
    .setInputCols(['lemmatized']) \
    .setOutputCol('ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

We already have our basic NLP pipeline for topic modelling with all necessary steps. However, let's use POS tagger in order to improve our processed data for topic modelling even more with POS tagged data later. For this, we are going to use pretrained POS tagging model provided by Spark NLP. We can access the model with PerceptronModel.

In [21]:
from sparknlp.annotator import PerceptronModel
pos_tagger = PerceptronModel.pretrained('pos_anc') \
     .setInputCols(['document', 'lemmatized']) \
     .setOutputCol('pos')

pos_anc download started this may take some time.
Approximate size to download 3,9 MB
[ \ ]pos_anc download started this may take some time.
Approximate size to download 3,9 MB
Download done! Loading the resource.
[OK!]


Now we have everything in Spark NLP annotation format. To be able to process the data further, we need to tranform data with Finisher.

In [22]:
from sparknlp.base import Finisher
finisher = Finisher() \
     .setInputCols(['unigrams', 'ngrams', 'pos'])

Now we are ready to input everything into a pipeline. Pipeline functionality is accessible with PySpark.

In [23]:
from pyspark.ml import Pipeline
pipeline = Pipeline() \
     .setStages([documentAssembler,
                 tokenizer,
                 normalizer,
                 lemmatizer,
                 stopwords_cleaner,
                 pos_tagger,
                 ngrammer,
                 finisher])

In [24]:
review_text = twitter_df.select("cleaned_text")

processed_review = pipeline.fit(review_text).transform(review_text)

In [25]:
processed_review.limit(5).show()



+--------------------+--------------------+--------------------+--------------------+
|        cleaned_text|   finished_unigrams|     finished_ngrams|        finished_pos|
+--------------------+--------------------+--------------------+--------------------+
|this afternoon li...|[afternoon, lives...|[this, afternoon,...|[DT, NN, VBG, JJ,...|
|sweet potato amp ...|[sweet, potato, a...|[sweet, potato, a...|[JJ, NN, NN, NN, ...|
|creamy mushroom b...|[creamy, mushroom...|[creamy, mushroom...|[NN, NN, NN, NN, ...|
|polypieter veggie...|[polypieter, vegg...|[polypieter, vegg...|[NN, NN, VB, NN, ...|
|enriched with nat...|[enrich, natural,...|[enrich, with, na...|[NN, IN, JJ, NN, ...|
+--------------------+--------------------+--------------------+--------------------+



                                                                                

## Extended NLP pipeline

Up to now, we have our data in a form of unigrams that are lemmatized, with no stop words in there. I think it is a good idea to incorporate n-grams into our NLP pipeline. We obtained n-grams as one step of our pipeline but now n-grams are messy and have a lot of questionable combinations in there. To tackle this problem, let's filter out strange combinations of words in n-grams based on their POS tags. We can imagine a list of viable combinations like ADJ + NOUN so let's restrict our POS combinations in n-grams to this list. Plus, we can also exclude some POS tags from our unigrams to ensure that we don't use functional words for topic modelling (they can be partially covered by stop words but probably not fully).

Doing this POS-based filtering will significantly reduce the vocabulary size for topic modelling which will speed up the whole processing.

Let's start this processing. First, we need to join all our POS tags obtained previously.

In [26]:
from pyspark.sql import types as T

udf_join_arr = F.udf(lambda x: ' '.join(x), T.StringType())
processed_review  = processed_review.withColumn('finished_pos', udf_join_arr(F.col('finished_pos')))

Then we start another Spark NLP pipeline in order to get POS tag n-grams that correspond to word n-grams. We start with convertation into Spark NLP annotation format.

In [27]:
pos_documentAssembler = DocumentAssembler() \
     .setInputCol('finished_pos') \
     .setOutputCol('pos_document')

Then, we tokenize our POS tags.

In [28]:
pos_tokenizer = Tokenizer() \
     .setInputCols(['pos_document']) \
     .setOutputCol('pos')

And generate n-grams from them in the same way we did that for words.

In [29]:
pos_ngrammer = NGramGenerator() \
    .setInputCols(['pos']) \
    .setOutputCol('pos_ngrams') \
    .setN(3) \
    .setEnableCumulative(True) \
    .setDelimiter('_')

Lastly, we are ready to get POS tags ngrams with Finisher.

In [30]:
pos_finisher = Finisher() \
     .setInputCols(['pos', 'pos_ngrams'])

We create this new Spark NLP pipeline...

In [31]:
pos_pipeline = Pipeline() \
     .setStages([pos_documentAssembler,                  
                 pos_tokenizer,
                 pos_ngrammer,  
                 pos_finisher])

... and again fit it and transform the data.

In [32]:
processed_review = pos_pipeline.fit(processed_review).transform(processed_review)

Let's look what kind of data we have to operate with.

In [33]:
processed_review.columns

['cleaned_text',
 'finished_unigrams',
 'finished_ngrams',
 'finished_pos',
 'finished_pos_ngrams']

And these are our word n-grams with their corresponding pos n-grams.

In [34]:
processed_review.select('finished_ngrams', 'finished_pos_ngrams').limit(5).show()



+--------------------+--------------------+
|     finished_ngrams| finished_pos_ngrams|
+--------------------+--------------------+
|[this, afternoon,...|[DT, NN, VBG, JJ,...|
|[sweet, potato, a...|[JJ, NN, NN, NN, ...|
|[creamy, mushroom...|[NN, NN, NN, NN, ...|
|[polypieter, vegg...|[NN, NN, VB, NN, ...|
|[enrich, with, na...|[NN, IN, JJ, NN, ...|
+--------------------+--------------------+



                                                                                

Now we are ready to filter out not useful for topic modelling analysis POS tags from our data. Let's create the function that does it for unigrams first. We create the custom Python function and then transform it to PySpark UDF to be used on Spark dataframe.

In [35]:
def filter_pos(words, pos_tags):
    return [word for word, pos in zip(words, pos_tags) 
            if pos in ['JJ', 'NN', 'NNS', 'VB', 'VBP']]

udf_filter_pos = F.udf(filter_pos, T.ArrayType(T.StringType()))

Then, we apply this function on columns with unigrams and their POS tags to get filtered unigrams in a separate dataframe column.

In [36]:
processed_review = processed_review.withColumn('filtered_unigrams',
                                               udf_filter_pos(F.col('finished_unigrams'), 
                                                              F.col('finished_pos')))

That is how our filtered unigrams look like.

In [37]:
processed_review.select('filtered_unigrams').limit(5).show(truncate=90)



+------------------------------------------------------------------------------------------+
|                                                                         filtered_unigrams|
+------------------------------------------------------------------------------------------+
|[livestreaming, conversation, asefrid, book, govegan, livelikeagorilla, veganfortheanim...|
|                 [sweet, potato, amp, courgette, fritter, yoghurt, dip, vegan, plantbased]|
|                           [creamy, mushroom, bucatini, vegan, veganforlife, forevervegan]|
|[polypieter, veggie, al, super, vegan, quasi, onmogelijk, en, je, krijgt, onvermijdelij...|
|                             [enrich, origin, ingredient, certify, vegan, wink, fragrance]|
+------------------------------------------------------------------------------------------+



                                                                                

It is time to filter out improper POS combinations of n-grams. We create the custom function in the same manner as before. Since we deal with bi- and trigrams, we need to restrict tags for both.

In [38]:
def filter_pos_combs(words, pos_tags):
    return [word for word, pos in zip(words, pos_tags) 
            if (len(pos.split('_')) == 2 and \
                pos.split('_')[0] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                 pos.split('_')[1] in ['JJ', 'NN', 'NNS']) \
            or (len(pos.split('_')) == 3 and \
                pos.split('_')[0] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                 pos.split('_')[1] in ['JJ', 'NN', 'NNS', 'VB', 'VBP'] and \
                  pos.split('_')[2] in ['NN', 'NNS'])]
    
udf_filter_pos_combs = F.udf(filter_pos_combs, T.ArrayType(T.StringType()))

And we call the function on word and POS n-grams.

In [39]:
processed_review = processed_review.withColumn('filtered_ngrams',
                                               udf_filter_pos_combs(F.col('finished_ngrams'),
                                                                    F.col('finished_pos_ngrams')))

Below is what we get after filtering for n-grams.

In [40]:
processed_review.select('filtered_ngrams').limit(5).show(truncate=90)



+------------------------------------------------------------------------------------------+
|                                                                           filtered_ngrams|
+------------------------------------------------------------------------------------------+
|[relax_conversation, book_dontmesswithasilverback, dontmesswithasilverback_govegan, gov...|
|[sweet_potato, potato_amp, amp_courgette, courgette_fritter, mint_yoghurt, yoghurt_dip,...|
|[creamy_mushroom, mushroom_bucatini, bucatini_vegan, vegan_veganforlife, veganforlife_f...|
|[polypieter_veggie, be_al, al_super, super_vegan, be_quasi, quasi_onmogelijk, onmogelij...|
|[natural_origin, origin_ingredient, vegan_society, love_wink, wink_fragrance, fragrance...|
+------------------------------------------------------------------------------------------+



                                                                                

Now we have unigrams and n-grams stored in different columns in the dataframe. Let's combine them together.

In [41]:
from pyspark.sql.functions import concat

processed_review = processed_review.withColumn('final', 
                                               concat(F.col('filtered_unigrams'), 
                                                      F.col('filtered_ngrams')))

And this is our final look of the data.

In [42]:
processed_review.select('final').limit(5).show(truncate=90)



+------------------------------------------------------------------------------------------+
|                                                                                     final|
+------------------------------------------------------------------------------------------+
|[livestreaming, conversation, asefrid, book, govegan, livelikeagorilla, veganfortheanim...|
|[sweet, potato, amp, courgette, fritter, yoghurt, dip, vegan, plantbased, sweet_potato,...|
|[creamy, mushroom, bucatini, vegan, veganforlife, forevervegan, creamy_mushroom, mushro...|
|[polypieter, veggie, al, super, vegan, quasi, onmogelijk, en, je, krijgt, onvermijdelij...|
|[enrich, origin, ingredient, certify, vegan, wink, fragrance, natural_origin, origin_in...|
+------------------------------------------------------------------------------------------+



                                                                                

## Vectorization

Now we are set to vectorization of our data. First, we will proceed with TF (term frequency) vectorization with CountVectorizer in PySpark. We fit tf dictionary and then transform the data to vectors of counts.

In [43]:
from pyspark.ml.feature import CountVectorizer

tfizer = CountVectorizer(inputCol='final', outputCol='tf_features')
tf_model = tfizer.fit(processed_review)
tf_result = tf_model.transform(processed_review)

                                                                                

After we get TF results, we can account for words that are frequent for all the documents. We can use IDF (inverse document frequency) to lower score of such words.

In [44]:
from pyspark.ml.feature import IDF

idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
idf_model = idfizer.fit(tf_result)
tfidf_result = idf_model.transform(tf_result)

[Stage 134:>                                                        (0 + 8) / 9]

22/11/28 16:35:20 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB




22/11/28 16:46:58 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB


                                                                                

## LDA

Finally, we are ready to model topics in our data with LDA (Latent Dirichlet Allocation). To use the algorithm, we have to provide the number of topics we presume our data contains and the number of iterations for the LDA algorithm. Then, we initialize the model and train it.

In [45]:
from pyspark.ml.clustering import LDA

num_topics = 6
max_iter = 10

lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
lda_model = lda.fit(tfidf_result)



22/11/28 16:49:57 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 16:59:59 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:00 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:01 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:01 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/11/28 17:00:01 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS




22/11/28 17:00:08 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:09 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:09 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB




22/11/28 17:00:12 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:13 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:13 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


[Stage 189:>                                                        (0 + 8) / 9]

22/11/28 17:00:15 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:16 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:17 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB




22/11/28 17:00:19 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:19 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:20 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB




22/11/28 17:00:21 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:22 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:22 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB




22/11/28 17:00:24 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:24 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:24 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB




22/11/28 17:00:26 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:26 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:27 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB




22/11/28 17:00:28 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:28 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:28 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB




22/11/28 17:00:30 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

22/11/28 17:00:30 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB
22/11/28 17:00:30 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB




22/11/28 17:00:32 WARN DAGScheduler: Broadcasting large task binary with size 8.5 MiB


                                                                                

To be able to see words that characterize the defined topics, we need to convert word ids into actual words with the custom function. This function will again be converted to PySpark UDF to be used on our topic dataframe.

In [46]:
vocab = tf_model.vocabulary

def get_words(token_list):
     return [vocab[token_id] for token_id in token_list]
       
udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

Let's define the number of top words per topic we would like to see and extract the words with our function.

In [47]:
num_top_words = 10

topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
topics.select('topic', 'topicWords').show(truncate=90)

+-----+------------------------------------------------------------------------------------------+
|topic|                                                                                topicWords|
+-----+------------------------------------------------------------------------------------------+
|    0|                               [organic, keto, vegan, make, amp, rofl, joy, im, get, diet]|
|    1|[blue_diamond, small_blue_diamond, small_blue, small, blue, diamond, vegan, diamond_sma...|
|    2|                               [vegan, und, ich, die, soap, ist, das, nicht, der, organic]|
|    3|                             [que, vegano, en, organic, vegan, el, face, de, spice, heart]|
|    4|   [vegan, organic, vegano, healthyfood, food, healthy, healthylifestyle, eu, health, que]|
|    5|                [vegan, organic, go, go_vegan, tea, food, animal, keto, check, check_mark]|
+-----+------------------------------------------------------------------------------------------+



In [51]:
topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
organic
keto
vegan
make
amp
rofl
joy
im
get
diet
*************************
topic: 1
*************************
blue_diamond
small_blue_diamond
small_blue
small
blue
diamond
vegan
diamond_small
organic
keto
*************************
topic: 2
*************************
vegan
und
ich
die
soap
ist
das
nicht
der
organic
*************************
topic: 3
*************************
que
vegano
en
organic
vegan
el
face
de
spice
heart
*************************
topic: 4
*************************
vegan
organic
vegano
healthyfood
food
healthy
healthylifestyle
eu
health
que
*************************
topic: 5
*************************
vegan
organic
go
go_vegan
tea
food
animal
keto
check
check_mark
*************************


AttributeError: 'DataFrame' object has no attribute 'describeTopics'