# Reading in the files and imports

In [1]:
import pyspark.sql.functions as F
import pandas as pd 
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np 

import os 
import pickle
import re
from datetime import datetime
import requests
import pytz
import pandas as pd
import numpy as np
import ast


In [2]:

import pyspark
# create spark context
sc = pyspark.SparkContext()
# create spark session 
spark = pyspark.sql.SparkSession(sc)

path_json = ".././../data/Topic_vegan/*.json" 

df_json = spark.read.option("multiline","true").json(path_json)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/25 16:11:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

22/11/25 16:12:02 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [3]:
spark

## Selecting the correct columns, converting types and checking for doubles 

In [4]:
# twitter represents likes as hearts 
# the number of likes different topics receive over time 
# use favorites_count 

#plot aantal tweets over een topic over time 

df_json_sub = df_json.select(F.col("user.name"),
                                F.col("user.screen_name"),
                                F.col("created_at"), 
                                F.col("full_text"),
                                F.col("entities.hashtags"),
                                F.col("lang"),
                                F.col("favorite_count"),
                                F.col("user.followers_count"),
                                F.col("user.friends_count"),
                                F.col("user.favourites_count"),
                                F.col("entities.urls"),
                                F.col("entities.symbols"), 
                                )
#date --> time stamp variable 
def getDate(date):
    if date is not None:
        return str(datetime.strptime(date,'%a %b %d %H:%M:%S +0000 %Y').replace(tzinfo=pytz.UTC).strftime("%Y-%m-%d %H:%M:%S"))
    else:
        return None
date_udf = F.udf(getDate, StringType())
df_json_sub = df_json_sub.withColumn('post_created_at', F.to_utc_timestamp(date_udf("created_at"), "UTC"))

#extract year, month, day 

df_json_sub = df_json_sub.withColumn('year', year('post_created_at'))
df_json_sub = df_json_sub.withColumn('month', month('post_created_at'))
df_json_sub = df_json_sub.withColumn('day', dayofmonth('post_created_at')) #number from 1 to 31 

df_json_sub.count() #3428559 



                                                                                

3428559

In [5]:
#drop duplicates and retweets 
df_json_sub = df_json_sub.filter(~F.col("full_text").startswith("RT"))\
                        .drop_duplicates().cache()
#sorting such when dropping later we only keep the most recent post 
df_json_sub = df_json_sub.sort("post_created_at", ascending=False)
#removing spam accounts 
df_json_sub = df_json_sub.drop_duplicates(["full_text", "screen_name"])
                        
df_json_sub.printSchema()
#df_json_sub.count() #1340938 

root
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- favourites_count: long (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- display_url: string (nullable = true)
 |    |    |-- expanded_url: string (nullable = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |-- symbols: array (nullable = true)
 |   

# Feature Engineering 

In [6]:
# define function to count hashtags
def get_hashtags(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "#" in word:
            counter += 1
    return(counter) 

# define function to count mentions
def get_mentions(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "@" in word:
            counter += 1
    return(counter)

# define function to count exclamation marks
def get_exclamation_marks(tokenized_text):
    counter = 0
    for word in tokenized_text:
        if "!" in word:
            counter += 1
    return(counter)

# define function to count number of emojis used
import emojis
def emoji_counter(text):
    nr_emojis = emojis.count(text)
    return(nr_emojis)
# register functions as udf
get_hashtags_UDF = F.udf(get_hashtags, IntegerType())
get_mentions_UDF = F.udf(get_mentions, IntegerType())
get_exclamation_marks_UDF = F.udf(get_exclamation_marks, IntegerType())
emoji_counter_udf = F.udf(emoji_counter, IntegerType())


In [7]:
twitter_df = df_json_sub.withColumn("emoji_count", emoji_counter_udf("full_text")) \
                            .withColumn("text_tokenized", F.split("full_text", " ")) \
                            .withColumn("num_words", F.size("text_tokenized")) \
                            .withColumn("num_hashtags", get_hashtags_UDF("text_tokenized")) \
                            .withColumn("num_mentions", get_mentions_UDF("text_tokenized")) \
                            .withColumn("num_exclamation_marks", get_exclamation_marks_UDF("text_tokenized"))
twitter_df.printSchema()


root
 |-- name: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- full_text: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |-- lang: string (nullable = true)
 |-- favorite_count: long (nullable = true)
 |-- followers_count: long (nullable = true)
 |-- friends_count: long (nullable = true)
 |-- favourites_count: long (nullable = true)
 |-- urls: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- display_url: string (nullable = true)
 |    |    |-- expanded_url: string (nullable = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- url: string (nullable = true)
 |-- symbols: array (nullable = true)
 |   

In [8]:
# define function to clean text
def clean_text(string):
    
    # define numbers
    NUMBERS = '0123456789'
    PUNCT = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    
    # convert text to lower case
    cleaned_string = string.lower()
    
    # remove URLS
    cleaned_string = re.sub(r'http\S+', ' ', cleaned_string)
    
    # replace emojis by words
    cleaned_string = emoji.demojize(cleaned_string)
    cleaned_string = cleaned_string.replace(":"," ").replace("_"," ")
    cleaned_string = ' '.join(cleaned_string.split())
    
    # remove numbers
    cleaned_string = "".join([char for char in cleaned_string if char not in NUMBERS])
    
    # remove punctuation
    cleaned_string = "".join([char for char in cleaned_string if char not in PUNCT])
    
    # remove words conisting of one character (or less)
    cleaned_string = ' '.join([w for w in cleaned_string.split() if len(w) > 1])
    
    # return
    return(cleaned_string) 
clean_text_udf = F.udf(clean_text, StringType())


In [9]:
twitter_df = twitter_df.withColumn("cleaned_text", clean_text_udf(F.col("full_text")))

In [14]:
twitter_df.createOrReplaceTempView('twitter_df') 

In [17]:
spark.sql("select * from twitter_df").show(5)



22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_90 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:36 WARN BlockManager: Persisting block rdd_23_90 to disk instead.
22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_94 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:36 WARN BlockManager: Persisting block rdd_23_94 to disk instead.
22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_96 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_94 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_90 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:36 WARN BlockManager: Persisting block rdd_23_96 to disk instead.
22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_96 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_101 in memory! (computed 3.9 MiB s



22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_100 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:36 WARN BlockManager: Persisting block rdd_23_100 to disk instead.
22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_105 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:36 WARN BlockManager: Persisting block rdd_23_105 to disk instead.
22/11/25 16:14:36 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_100 in memory.
22/11/25 16:14:36 WARN MemoryStore: Not enough space to cache rdd_23_100 in memory! (computed 384.0 B so far)
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_106 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_106 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_105 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN MemoryStore: Failed to reserve initial memory 



22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_107 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_107 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_106 in memory! (computed 384.0 B so far)
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_107 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_108 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_108 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_110 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_110 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_110 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_112 in memory! (computed 



22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_117 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_117 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_115 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_121 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_121 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_119 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_119 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_120 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_120 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_119 in memory! (computed 3.8 MiB so far)
22/11/25 16:



22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_131 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_131 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_138 in memory.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_133 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_134 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_134 to disk instead.
22/11/25 16:14:37 WARN BlockManager: Persisting block rdd_23_133 to disk instead.
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_131 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_133 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:37 WARN MemoryStore: Not enough space to cache rdd_23_



22/11/25 16:14:38 WARN BlockManager: Persisting block rdd_23_145 to disk instead.
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_146 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN BlockManager: Persisting block rdd_23_146 to disk instead.
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_146 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_147 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:38 WARN BlockManager: Persisting block rdd_23_147 to disk instead.
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_148 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN BlockManager: Persisting block rdd_23_148 to disk instead.
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_151 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN BlockManager: Persisting block rdd_23_151 to disk instead.
22/11/25 16:14:38 WARN MemoryStore: Not 



22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_151 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_154 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN BlockManager: Persisting block rdd_23_154 to disk instead.
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_154 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_158 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN BlockManager: Persisting block rdd_23_158 to disk instead.
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_158 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_161 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:38 WARN BlockManager: Persisting block rdd_23_161 to disk instead.
22/11/25 16:14:38 WARN MemoryStore: Not enough space to cache rdd_23_162 in memory! (computed 



22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_167 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_168 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_169 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_169 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_170 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_170 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_170 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_172 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_172 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB f



22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_180 in memory! (computed 384.0 B so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_180 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_179 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_179 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_178 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_178 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_178 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_181 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_181 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_185 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_185 to disk instead.




22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_188 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_188 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_188 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_186 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_186 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Not enough space to cache rdd_23_191 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:39 WARN BlockManager: Persisting block rdd_23_191 to disk instead.
22/11/25 16:14:39 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_197 in memory.
22/11/25 16:14:39 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_196 in memory.
22/11/25 16:14:39 WARN MemoryStore: Not enou

                                                                                

22/11/25 16:14:40 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_4 in memory.
22/11/25 16:14:40 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_2 in memory.
22/11/25 16:14:40 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_0 in memory.
22/11/25 16:14:40 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_3 in memory.
22/11/25 16:14:40 WARN MemoryStore: Not enough space to cache rdd_23_4 in memory! (computed 384.0 B so far)
22/11/25 16:14:40 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_1 in memory.
22/11/25 16:14:40 WARN MemoryStore: Not enough space to cache rdd_23_5 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:40 WARN MemoryStore: Not enough space to cache rdd_23_6 in memory! (computed 3.9 MiB so far)
22/11/25 16

[Stage 8:==>                                                      (8 + 8) / 200]

22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_16 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_17 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_18 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_19 in memory.
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_19 in memory! (computed 384.0 B so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_20 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_21 in memory.
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_22 in memory.
22/11/25 16:14:41 WARN MemoryStore: Not enough space 

[Stage 8:=====>                                                  (20 + 8) / 200]

22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_24 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_26 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_25 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_27 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_28 in memory.
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_29 in memory.
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_28 in memory! (computed 384.0 B so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_29 in memory! (computed 384.0 B so far)
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold



22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_32 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_33 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_34 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_35 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_37 in memory.
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_38 in memory.
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_37 in memory! (computed 384.0 B so far)
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_39 in memory.
22/11/25 16:14:41 WARN MemoryStore: Not enough space 



22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_40 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_41 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_42 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_43 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_44 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_45 in memory.
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_45 in memory! (computed 384.0 B so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_46 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:41 WARN MemoryStore: Not enough space to cache rdd_23_47 in memory! (computed 3.9 MiB so



22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_55 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_56 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_57 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_58 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_59 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_60 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:42 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_23_61 in memory.
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_61 in memory! (computed 384.0 B so far)
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_62 in memory! (computed 3.9 MiB so



22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_63 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_64 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:42 WARN MemoryStore: Not enough space to cache rdd_23_68 in memory! (computed 3.8 MiB so far)




22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_90 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_94 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_96 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_100 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_105 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_106 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_107 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_110 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_113 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_114 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_115 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_119 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_121 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_123 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_124 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:43 WARN MemoryStore: Not enough space to cache rdd_23_125 in memory! (computed 3.8 MiB so far)




22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_131 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_133 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_134 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_139 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_140 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_146 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_147 in memory! (computed 3.8 MiB so far)
22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_151 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_154 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:44 WARN MemoryStore: Not enough space to cache rdd_23_158 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_164 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_167 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_168 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_170 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_172 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_178 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_188 in memory! (computed 3.9 MiB so far)
22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_192 in memory! (computed 3.9 MiB so far)




22/11/25 16:14:45 WARN MemoryStore: Not enough space to cache rdd_23_194 in memory! (computed 3.8 MiB so far)




22/11/25 16:14:52 ERROR PythonUDFRunner: Python worker exited unexpectedly (crashed)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 666, in main
    eval_type = read_int(infile)
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 595, in read_int
    raise EOFError
EOFError

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.ha

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 670, in main
    func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 507, in read_udfs
    udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 289, in read_single_udf
    f, return_type = read_command(pickleSer, infile)
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 85, in read_command
    command = serializer._read_with_length(file)
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 173, in _read_with_length
    return self.loads(obj)
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 471, in loads
    return cloudpickle.loads(obj, encoding=encoding)
  File "/opt/homebrew/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/cloudpickle/cloudpickle.py", line 679, in subimport
    __import__(name)
ModuleNotFoundError: No module named 'emojis'


In [18]:
import emojis