In [None]:
import os
# Find the latest version of spark 3.2  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.2'

spark_version = 'spark-3.<enter version>'

os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [16]:
# Create (or reuse) the SparkSession used by every later cell
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("tokenizing")
    .getOrCreate()
)

In [17]:
from pyspark.ml.feature import RegexTokenizer, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [18]:
# Fetch the sonnet CSV from the S3 bucket by registering it with the Spark context
from pyspark import SparkFiles
url ="https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.1/22-big-data/day_2/shakespeare.csv"
spark.sparkContext.addFile(url)

# Parse the fetched copy: comma-separated, first row is the header,
# and whitespace in front of each value is ignored
df = (
    spark.read
    .options(sep=",", header=True, ignoreLeadingWhiteSpace=True)
    .csv(SparkFiles.get("shakespeare.csv"))
)

# Preview the loaded DataFrame
df.show()

+--------------------+
|         Sonnet CXLV|
+--------------------+
|Those lips that L...|
|Breathed forth th...|
|To me that langui...|
|But when she saw ...|
|Straight in her h...|
|Chiding that tong...|
|Was used in givin...|
|And taught it thu...|
|I hate' she alter...|
|That follow'd it ...|
|Doth follow night...|
|From heaven to he...|
|I hate' from hate...|
|And saved my life...|
+--------------------+



In [19]:
# Configure a Tokenizer that reads the sonnet column and writes a "words" column
tokened = Tokenizer(
    inputCol="Sonnet CXLV",
    outputCol="words",
)

In [20]:
# Run the tokenizer over df to add the "words" column, then preview the result
tokenized = tokened.transform(dataset=df)
tokenized.show()

+--------------------+--------------------+
|         Sonnet CXLV|               words|
+--------------------+--------------------+
|Those lips that L...|[those, lips, tha...|
|Breathed forth th...|[breathed, forth,...|
|To me that langui...|[to, me, that, la...|
|But when she saw ...|[but, when, she, ...|
|Straight in her h...|[straight, in, he...|
|Chiding that tong...|[chiding, that, t...|
|Was used in givin...|[was, used, in, g...|
|And taught it thu...|[and, taught, it,...|
|I hate' she alter...|[i, hate', she, a...|
|That follow'd it ...|[that, follow'd, ...|
|Doth follow night...|[doth, follow, ni...|
|From heaven to he...|[from, heaven, to...|
|I hate' from hate...|[i, hate', from, ...|
|And saved my life...|[and, saved, my, ...|
+--------------------+--------------------+



In [21]:
# Create a Function to count vowels
def vowel_counter(words):
    """Count the vowels (a, e, i, o, u) across every word in ``words``.

    Matching is case-insensitive, so the count is the same whether or not the
    words were lower-cased beforehand.

    Args:
        words: An iterable of strings. ``None`` (e.g. a null array handed to
            the UDF) is treated as "no words", and ``None`` entries inside the
            iterable are skipped.

    Returns:
        int: Total number of vowel characters found; 0 for empty or null input.
    """
    # A null row reaches a Python UDF as None; report zero instead of raising.
    if words is None:
        return 0

    vowels = frozenset("aeiouAEIOU")
    return sum(
        1
        for word in words
        if word is not None
        for letter in word
        if letter in vowels
    )

In [22]:
# Wrap vowel_counter as a Spark UDF declared to return an integer
count_vowels = udf(f=vowel_counter, returnType=IntegerType())
count_vowels

<function __main__.vowel_counter>

In [31]:
# Add a "vowels" column computed by the udf and print every row untruncated
(
    tokenized
    .select("Sonnet CXLV", "words")
    .withColumn("vowels", count_vowels(col("words")))
    .show(truncate=False)
)

+-------------------------------------------+----------------------------------------------------+------+
|Sonnet CXLV                                |words                                               |vowels|
+-------------------------------------------+----------------------------------------------------+------+
|Those lips that Love's own hand did make   |[those, lips, that, love's, own, hand, did, make]   |11    |
|Breathed forth the sound that said 'I hate'|[breathed, forth, the, sound, that, said, 'i, hate']|13    |
|To me that languish'd for her sake;        |[to, me, that, languish'd, for, her, sake;]         |10    |
|But when she saw my woeful state,          |[but, when, she, saw, my, woeful, state,]           |9     |
|Straight in her heart did mercy come,      |[straight, in, her, heart, did, mercy, come,]       |10    |
|Chiding that tongue that ever sweet        |[chiding, that, tongue, that, ever, sweet]          |11    |
|Was used in giving gentle doom,            |[