# to do 

1) Read
2) Tokenization
3) Clean
4) Count
5) Answer: Return top 20 words that are repetead the most in the novel

# 1) Reading the dataset

In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BookReading').getOrCreate()
book = spark.read.text('dataset/pg1342.txt')

In [3]:
book

DataFrame[value: string]

In [4]:
book.printSchema()

root
 |-- value: string (nullable = true)



In [5]:
book.show()

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This ebook is for...|
|most other parts ...|
|whatsoever. You m...|
|of the Project Gu...|
|at www.gutenberg....|
|you will have to ...|
|before using this...|
|                    |
|Title: Pride and ...|
|                    |
| Author: Jane Austen|
|                    |
|Release date: Jun...|
|                M...|
|                    |
|   Language: English|
|                    |
|Credits: Chuck Gr...|
+--------------------+
only showing top 20 rows


In [6]:
book.show(10, truncate=50)

+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|The Project Gutenberg eBook of Pride and Prejudice|
|                                                  |
|This ebook is for the use of anyone anywhere in...|
|most other parts of the world at no cost and wi...|
|whatsoever. You may copy it, give it away or re...|
|of the Project Gutenberg License included with ...|
|at www.gutenberg.org. If you are not located in...|
|you will have to check the laws of the country ...|
|                          before using this eBook.|
|                                                  |
+--------------------------------------------------+
only showing top 10 rows


# 2) Tokenization

In [7]:
from pyspark.sql.functions import col, split
lines = book.select(
    split(col("value"), " ").alias("lines")
)

In [8]:
lines.printSchema()

root
 |-- lines: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [9]:
lines.show(5)

+--------------------+
|               lines|
+--------------------+
|[The, Project, Gu...|
|          [, , , , ]|
|[This, ebook, is,...|
|[most, other, par...|
|[whatsoever., You...|
+--------------------+
only showing top 5 rows


In [10]:
# exploding the list into the seperate seperate tokens
from pyspark.sql.functions import explode, col

words = lines.select(
    explode(col("lines")).alias("word")
)

words.show(5)

+---------+
|     word|
+---------+
|      The|
|  Project|
|Gutenberg|
|    eBook|
|       of|
+---------+
only showing top 5 rows


# 3) Cleaning

In [11]:
### Changing Case and removing punctuation
from pyspark.sql.functions import lower
word_lower = words.select(
    lower(col("word")).alias("word_lower")
)

In [12]:
word_lower.show()

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
| prejudice|
|          |
|          |
|          |
|          |
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
|        of|
+----------+
only showing top 20 rows


In [13]:
## we are using regular expression functionality to only get us the words that are present in our dataset
from pyspark.sql.functions import regexp_extract
word_clean = word_lower.select(
    regexp_extract(col("word_lower"), "[a-z]*", 0).alias("word")
)

word_clean.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|         |
|         |
|         |
|         |
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
+---------+
only showing top 20 rows


In [14]:
## fitering rows
words_nonull = word_clean.where(col("word") != "")
words_nonull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
|       in|
|      the|
|   united|
+---------+
only showing top 20 rows


# 4) Counting

In [15]:
groups = words_nonull.groupby(col("word"))
groups

GroupedData[grouping expressions: [word], value: [word: string], type: GroupBy]

In [16]:
results = words_nonull.groupby(col("word")).count()
results.show()

+------------+-----+
|        word|count|
+------------+-----+
|      online|    5|
|       those|   65|
|        some|  207|
|     insipid|    2|
|       still|   76|
|         art|    7|
|        hope|  126|
|        earl|    3|
|         few|   73|
|   destitute|    2|
|  palpitated|    1|
|   connected|   15|
|    cautious|    4|
|   imitation|    1|
|     solaced|    1|
|      poetry|    2|
|   arguments|    5|
|premeditated|    1|
|     elevate|    1|
|      doubts|    2|
+------------+-----+
only showing top 20 rows


# 5) Display the Answer

In [18]:
# Display the Answer (ex top 10 words used in the book)
results = results.orderBy("count", ascending=False)

In [19]:
results.show(10)

+----+-----+
|word|count|
+----+-----+
| the| 4803|
|  to| 4374|
|  of| 3951|
| and| 3685|
| her| 2254|
|   a| 2063|
|  in| 2024|
| was| 1870|
|   i| 1778|
| she| 1703|
+----+-----+
only showing top 10 rows
