In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, desc, explode, lower, regexp_replace, split

In [2]:
# Obtain the Spark session that every later cell reads from
# (getOrCreate reuses a session if one is already running).
spark = (
    SparkSession.builder
    .appName("jehfuh")
    .getOrCreate()
)

24/10/10 17:02:18 WARN Utils: Your hostname, Nikhils-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.206 instead (on interface en0)
24/10/10 17:02:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/10 17:02:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the play as plain text: the resulting DataFrame has a single string
# column, `value`, with one line of the file per row.
hamlet_df = spark.read.text("hamlet.txt")

# Preview the first 20 lines (long lines are truncated in the display).
hamlet_df.show(n=20, truncate=True)

+--------------------+
|               value|
+--------------------+
|             HAMLET,|
|  PRINCE OF DENMARK.|
|              ACT I.|
|Scene I.—ELSINORE...|
|Francisco on his ...|
|                    |
|   Ber. Who's there?|
|                    |
|Fran. (R.) Nay, a...|
|                    |
|Ber. Long live th...|
|                    |
|               Fran.|
|           Bernardo?|
|                    |
|                Ber.|
|                 He.|
|                    |
|Fran. You come mo...|
|                    |
+--------------------+
only showing top 20 rows



In [4]:
# Lowercase each line and split it on single spaces.
# Note that the `word` column here holds a *list* of tokens per line, not a
# single word; the next step flattens it to one token per row.
# (`lower` and `split` come from the import cell at the top of the notebook.)
df_split = hamlet_df.select(
    split(lower(hamlet_df.value), ' ').alias("word")
)
df_split.show()

+--------------------+
|                word|
+--------------------+
|           [hamlet,]|
|[prince, of, denm...|
|           [act, i.]|
|[scene, i.—elsino...|
|[francisco, on, h...|
|                  []|
|[ber., who's, the...|
|                  []|
|[fran., (r.), nay...|
|                  []|
|[ber., long, live...|
|                  []|
|             [fran.]|
|         [bernardo?]|
|                  []|
|              [ber.]|
|               [he.]|
|                  []|
|[fran., you, come...|
|                  []|
+--------------------+
only showing top 20 rows



In [5]:
# Flatten the per-line token lists: explode() emits one output row for each
# element of the array column, so every row now carries a single token.
df_lower = df_split.select(
    explode(df_split["word"]).alias("word")
)
df_lower.show(n=20, truncate=True)

+------------+
|        word|
+------------+
|     hamlet,|
|      prince|
|          of|
|    denmark.|
|         act|
|          i.|
|       scene|
|i.—elsinore.|
|           a|
|    platform|
|      before|
|         the|
|     castle.|
|      night.|
|   francisco|
|          on|
|         his|
|       post.|
|       enter|
|          to|
+------------+
only showing top 20 rows



In [6]:
# Tokenise in one pass: lowercase each line, split on spaces, one row per token.
df_lower_split = hamlet_df.select(
    explode(split(lower(hamlet_df.value), ' ')).alias('word')
)

# Strip every character that is not a letter, digit or whitespace.
# withColumn('word', ...) already determines the column name, so no alias is
# needed on the expression.
# NOTE: punctuation that joins two words without a space is simply deleted,
# so e.g. "i.—elsinore." collapses into the single token "ielsinore".
df_stripped = df_lower_split.withColumn(
    'word',
    regexp_replace(df_lower_split.word, '[^a-zA-Z0-9\\s]', ''),
)

# Blank lines and punctuation-only tokens are reduced to empty strings by the
# steps above; drop them so they are never treated as a word.
df_cleaned = df_stripped.filter(df_stripped.word != '')
df_cleaned.show()

+---------+
|     word|
+---------+
|   hamlet|
|   prince|
|       of|
|  denmark|
|      act|
|        i|
|    scene|
|ielsinore|
|        a|
| platform|
|   before|
|      the|
|   castle|
|    night|
|francisco|
|       on|
|      his|
|     post|
|    enter|
|       to|
+---------+
only showing top 20 rows



In [7]:
import importlib
import assignment3.data_cleaner as dc

# Reload the module so that edits made to assignment3.data_cleaner after its
# first import are picked up without restarting the kernel.
importlib.reload(dc)

# Run the packaged cleaning step on the raw lines. Judging by the output it
# yields one lowercase, punctuation-free token per row in a `word` column;
# see the module itself for the exact rules.
hamlet_cleaned_df = dc.clean_dataset(hamlet_df)
hamlet_cleaned_df.show(n=20, truncate=True)

+---------+
|     word|
+---------+
|   hamlet|
|   prince|
|       of|
|  denmark|
|      act|
|        i|
|    scene|
|ielsinore|
|        a|
| platform|
|   before|
|      the|
|   castle|
|    night|
|francisco|
|       on|
|      his|
|     post|
|    enter|
|       to|
+---------+
only showing top 20 rows



In [8]:
# Tally how many rows carry each distinct word; the result has one row per
# word with its frequency in a `count` column.
word_count = (
    hamlet_cleaned_df
    .groupBy('word')
    .count()
)

In [9]:
# Rank words from most to least frequent and keep the 20 highest.
# Words with the same count are ordered alphabetically: sorting on `count`
# alone leaves the relative order of ties (and which tied word makes the
# cut-off) unspecified, so results could differ between runs.
top_words = word_count.orderBy(desc('count'), 'word').limit(20)

# Show the top 20 most frequent words with their counts;
# truncate=False prints each word in full.
top_words.show(truncate=False)

+----+-----+
|word|count|
+----+-----+
|the |1186 |
|and |753  |
|to  |740  |
|of  |692  |
|a   |534  |
|you |447  |
|in  |423  |
|i   |407  |
|my  |393  |
|it  |338  |
|is  |337  |
|that|304  |
|not |263  |
|ham |261  |
|his |254  |
|this|239  |
|with|229  |
|your|228  |
|for |213  |
|as  |207  |
+----+-----+



In [10]:
# Analysis finished: stop the Spark session that was created near the top of
# the notebook.
spark.stop()