In [3]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'

# %%
# Start a Spark session, or reuse the one already running in this process.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Disabled alternative ("stop being lazy"): build the session with the
# `spark.sql.repl.eagerEval.enabled` option set, which (going by its name)
# makes the REPL evaluate DataFrames eagerly. Deliberately left off.
# spark = (SparkSession.builder
#                      .config("spark.sql.repl.eagerEval.enabled", "True")
#                      .getOrCreate())

# %%
#spark.sparkContext

# %%
# Short aliases: `sc` is the session's SparkContext, `sqlContext` is the
# session object itself under another name.
sc = spark.sparkContext
sqlContext = spark

# %%
# Set the SparkContext log level to ERROR.
spark.sparkContext.setLogLevel("ERROR")

# Disabled exploration of the reader API, kept for reference only.
# It used to be wrapped in a bare triple-quoted string that contained cell
# markers; a percent-format editor splits cells on those markers, which leaves
# an unterminated string literal. Plain comments avoid that.
# (`obj?` / `obj??` is IPython-only help syntax, not valid plain Python.)
#
#   spark.read
#   dir(spark.read)
#
#   Note: spark.read.csv() === spark.read.format('csv').load()
#
#   spark.read?
#   spark.read??
#   print(spark.read.__doc__)
# %%
# Read the novel with the plain-text reader. The schema printed below shows a
# single string column named `value`.
BOOK_PATH = "pride-and-prejudice.txt"
book = spark.read.text(BOOK_PATH)

book

# %%
# Column names and types of the freshly loaded DataFrame.
book.printSchema()

# %%
# Default preview (the captured output reports the top 20 rows, values cut short).
book.show()

# %%
# Smaller preview: 10 rows, each value cut at 50 characters.
book.show(n=10, truncate=50)



root
 |-- value: string (nullable = true)

+--------------------+
|               value|
+--------------------+
| Pride and Prejudice|
|                    |
|      By Jane Austen|
|                    |
|           Chapter 1|
|                    |
|It is a truth uni...|
|                    |
|However little kn...|
|                    |
|“My dear Mr. Benn...|
|                    |
|Mr. Bennet replie...|
|                    |
|“But it is,” retu...|
|                    |
|Mr. Bennet made n...|
|                    |
|“Do you not want ...|
|                    |
+--------------------+
only showing top 20 rows

+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|                               Pride and Prejudice|
|                                                  |
|                                    By Jane Austen|
|                                                  |
|       

In [4]:
# %%
from pyspark.sql.functions import split

# Break every text row on single spaces and call the resulting array column "line".
line_tokens = split(book.value, " ").alias("line")
lines = book.select(line_tokens)

lines.show(n=5, truncate=50)

# %%

# select() hands back a DataFrame
book.select(book.value)

# %%
from pyspark.sql.functions import col

# Four spellings of the same column selection:
book.select(book.value)      # attribute access on the DataFrame
book.select(book["value"])   # item access on the DataFrame
book.select(col("value"))    # col() helper
book.select("value")         # bare column name



+-----------------------+
|                   line|
+-----------------------+
|[Pride, and, Prejudice]|
|                     []|
|     [By, Jane, Austen]|
|                     []|
|           [Chapter, 1]|
+-----------------------+
only showing top 5 rows



DataFrame[value: string]

In [14]:
# Preview the first 10 rows, allowing up to 100 characters per value.
book.select("value").show(n=10, truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                               value|
+----------------------------------------------------------------------------------------------------+
|                                                                                 Pride and Prejudice|
|                                                                                                    |
|                                                                                      By Jane Austen|
|                                                                                                    |
|                                                                                           Chapter 1|
|                                                                                                    |
|It is a truth universally acknowledged, that a single man in possession 

In [3]:
# %%
# `col` is imported here alongside `split` so this cell does not rely on an
# import made by a different cell.
from pyspark.sql.functions import col, split

# Without an alias the split column gets an auto-generated name
# (see the printSchema() output below).
lines = book.select(split(col("value"), " "))

lines

# %%
lines.printSchema()

# %%
lines.show(5)

# %%
# Same selection without keeping the result: generated column name again.
book.select(split(col("value"), " ")).printSchema()

# %%
# With .alias("line") the array column is called "line".
book.select(split(col("value"), " ").alias("line")).printSchema()

# %%
# Final definition used by later cells: array column named "line".
lines = book.select(split(book.value, " ").alias("line"))
lines.show(5)

# %%
# Disabled alternative: rename the generated column after the fact.
# The old name must match the generated name exactly, because
# withColumnRenamed() does nothing when the schema has no such column.
# This notebook's printSchema() output shows "split(value,  , -1)"; the name
# may differ on other Spark versions, so check printSchema() first.
# lines = book.select(split(book.value, " "))
# lines = lines.withColumnRenamed("split(value,  , -1)", "line")



root
 |-- split(value,  , -1): array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+
| split(value,  , -1)|
+--------------------+
|[Pride, and, Prej...|
|                  []|
|  [By, Jane, Austen]|
|                  []|
|        [Chapter, 1]|
+--------------------+
only showing top 5 rows

root
 |-- split(value,  , -1): array (nullable = true)
 |    |-- element: string (containsNull = false)

root
 |-- line: array (nullable = true)
 |    |-- element: string (containsNull = false)

+--------------------+
|                line|
+--------------------+
|[Pride, and, Prej...|
|                  []|
|  [By, Jane, Austen]|
|                  []|
|        [Chapter, 1]|
+--------------------+
only showing top 5 rows



In [16]:
# Preview the un-aliased split column: 10 rows, values cut at 50 characters.
book.select(split(col("value"), " ")).show(n=10, truncate=50)

+--------------------------------------------------+
|                               split(value,  , -1)|
+--------------------------------------------------+
|                           [Pride, and, Prejudice]|
|                                                []|
|                                [By, Jane, Austen]|
|                                                []|
|                                      [Chapter, 1]|
|                                                []|
|[It, is, a, truth, universally, acknowledged,, ...|
|                                                []|
|[However, little, known, the, feelings, or, vie...|
|                                                []|
+--------------------------------------------------+
only showing top 10 rows



In [18]:
# %%
# Every function this cell uses is imported up front, including `col`, so the
# cell does not rely on an import made by a different cell.
from pyspark.sql.functions import col, explode, lower, regexp_extract

# One row per element of the "line" array. Blank source lines show up as
# empty-string words (see the blank rows in the output below).
words = lines.select(explode(col("line")).alias("word"))

words.show(20)

# %%
words.printSchema()

# %%
# Lower-case every word.
words_lower = words.select(lower(col("word")).alias("word_lower"))

words_lower.show(100)

# %%
# Keep only the letters of each token: skip an optional run of non-word
# characters, then capture a run of a-z; capture group 2 is that letter run.
# regexp_extract() yields an empty string when the pattern does not match.
words_clean = words_lower.select(
    # regexp_extract(col("word_lower"), "[a-z]*", 0).alias("word")
    regexp_extract(col("word_lower"),  r"(\W+)?([a-z]+)", 2).alias("word")
)

words_clean.show(100)

# %%
# Drop the rows whose cleaned word is the empty string.
words_nonull = words_clean.where(col("word") != "")

words_nonull.show(100)


+-------------+
|         word|
+-------------+
|        Pride|
|          and|
|    Prejudice|
|             |
|           By|
|         Jane|
|       Austen|
|             |
|      Chapter|
|            1|
|             |
|           It|
|           is|
|            a|
|        truth|
|  universally|
|acknowledged,|
|         that|
|            a|
|       single|
+-------------+
only showing top 20 rows

root
 |-- word: string (nullable = false)

+--------------+
|    word_lower|
+--------------+
|         pride|
|           and|
|     prejudice|
|              |
|            by|
|          jane|
|        austen|
|              |
|       chapter|
|             1|
|              |
|            it|
|            is|
|             a|
|         truth|
|   universally|
| acknowledged,|
|          that|
|             a|
|        single|
|           man|
|            in|
|    possession|
|            of|
|             a|
|          good|
|      fortune,|
|          must|
|            be|
|  

In [19]:
# Preview of the exploded words: 10 rows, values cut at 50 characters.
lines.select(explode(col("line")).alias("word")).show(n=10, truncate=50)

+---------+
|     word|
+---------+
|    Pride|
|      and|
|Prejudice|
|         |
|       By|
|     Jane|
|   Austen|
|         |
|  Chapter|
|        1|
+---------+
only showing top 10 rows

