In [18]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, count, col, explode, lower, regexp_extract

spark = (SparkSession
  .builder
  .appName("ShrekWordCounter")
  .getOrCreate())

In [5]:
shrek_script_df = (spark.read.format("text")
    .option("header", "false")
    .option("inferSchema", "true")
    .load("./shrek_script.txt")
    )

In [12]:
shrek_script_lines = shrek_script_df.select(split(col("value"), " ").alias("line"))
shrek_script_lines.show(10)

+--------------------+
|                line|
+--------------------+
|[{Man}, Once, upo...|
+--------------------+



In [15]:
shrek_script_words = shrek_script_lines.select(explode(col("line")).alias("word"))
shrek_script_words.show(15)

+-----------+
|       word|
+-----------+
|      {Man}|
|       Once|
|       upon|
|          a|
|       time|
|      there|
|        was|
|          a|
|     lovely|
|  princess.|
|        But|
|        she|
|        had|
|         an|
|enchantment|
+-----------+
only showing top 15 rows



In [17]:
shrek_script_words_lower = shrek_script_words.select(lower(col("word")).alias("word_lower"))
shrek_script_words_lower.show(15)

+-----------+
| word_lower|
+-----------+
|      {man}|
|       once|
|       upon|
|          a|
|       time|
|      there|
|        was|
|          a|
|     lovely|
|  princess.|
|        but|
|        she|
|        had|
|         an|
|enchantment|
+-----------+
only showing top 15 rows



In [22]:
shrek_script_words_clean = shrek_script_words_lower.select(
    regexp_extract(col("word_lower"), "[a-z]*", 0).alias("word")
)
shrek_script_words_clean.show(20)

+-----------+
|       word|
+-----------+
|           |
|       once|
|       upon|
|          a|
|       time|
|      there|
|        was|
|          a|
|     lovely|
|   princess|
|        but|
|        she|
|        had|
|         an|
|enchantment|
|       upon|
|        her|
|         of|
|          a|
|    fearful|
+-----------+
only showing top 20 rows



In [23]:
shrek_script_words_nonull = shrek_script_words_clean.where(col("word") != "")
shrek_script_words_nonull.show()

+-----------+
|       word|
+-----------+
|       once|
|       upon|
|          a|
|       time|
|      there|
|        was|
|          a|
|     lovely|
|   princess|
|        but|
|        she|
|        had|
|         an|
|enchantment|
|       upon|
|        her|
|         of|
|          a|
|    fearful|
|       sort|
+-----------+
only showing top 20 rows



In [30]:
shrek_script_groups = shrek_script_words_nonull.groupBy(col("word"))
shrek_script_results = shrek_script_groups.count()
# shrek_script_results.show()
shrek_script_results.orderBy("count", ascending=False).show(100)

+--------+-----+
|    word|count|
+--------+-----+
|       i|  406|
|     you|  406|
|     the|  230|
|       a|  225|
|      to|  168|
|      it|  160|
|    that|  135|
|      me|  134|
|     and|  120|
|      no|   90|
|      is|   87|
|      of|   87|
|      my|   86|
|     don|   77|
|    this|   77|
|    what|   76|
|      on|   75|
|      in|   73|
|    know|   70|
|      do|   63|
|     not|   62|
|    your|   61|
|     all|   60|
|   right|   59|
|    like|   59|
|     are|   58|
|    have|   57|
|   there|   56|
|    just|   56|
|     for|   55|
|     get|   51|
|      we|   50|
|   shrek|   49|
|     but|   46|
|     now|   46|
|     can|   45|
|      oh|   45|
|    love|   44|
|      be|   41|
|princess|   39|
|     was|   38|
|    look|   37|
|     one|   36|
|      go|   36|
|      so|   33|
|     her|   33|
|    with|   32|
|      if|   31|
|      he|   31|
|     out|   31|
|     way|   29|
|   about|   28|
|    here|   28|
|  really|   27|
|      uh|   27|
|      up|   2

In [29]:
shrek_script_results.coalesce(1).write.csv("./shrek_word_counts.csv")