In [27]:
# Load dependencies
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [28]:
# SparkSession builder: name the application 'udf', then create the session
# (or reuse one that already exists).
session_builder = SparkSession.builder.appName('udf')
spark = session_builder.getOrCreate()

In [29]:
# Evaluate the session object as the cell's last expression so the notebook
# displays its representation.
spark

In [30]:
# Build a small demo Spark DataFrame: one (id, text) row per nursery-rhyme line.
rhyme_rows = [
    (0, "Mary had a little lamb"),
    (1, "It's fleece was white as snow"),
    (2, "And everywhere Mary went"),
    (3, "The lamb was sure to go"),
]
rhyme_columns = ["id", "Nursery Rhyme"]
dataframe = spark.createDataFrame(rhyme_rows, rhyme_columns)
dataframe.show()

+---+--------------------+
| id|       Nursery Rhyme|
+---+--------------------+
|  0|Mary had a little...|
|  1|It's fleece was w...|
|  2|And everywhere Ma...|
|  3|The lamb was sure...|
+---+--------------------+



In [31]:
# Set up a Tokenizer that reads the "Nursery Rhyme" column and writes "words".
# It lowercases the input string and then splits it on whitespace.
tokenizer = Tokenizer(outputCol="words", inputCol="Nursery Rhyme")
tokenizer

Tokenizer_4b65913c1f76f1df7f02

In [32]:
# Apply the tokenizer to the DataFrame; the result keeps the original columns
# and adds a "words" column of type array<string>.
tokenized = tokenizer.transform(dataframe)
# Bare expression: displays the DataFrame's schema summary, not its rows.
tokenized

DataFrame[id: bigint, Nursery Rhyme: string, words: array<string>]

In [33]:
# Print the tokenized rows; with the default settings long cell values are
# shortened with "...".
tokenized.show()

+---+--------------------+--------------------+
| id|       Nursery Rhyme|               words|
+---+--------------------+--------------------+
|  0|Mary had a little...|[mary, had, a, li...|
|  1|It's fleece was w...|[it's, fleece, wa...|
|  2|And everywhere Ma...|[and, everywhere,...|
|  3|The lamb was sure...|[the, lamb, was, ...|
+---+--------------------+--------------------+



In [34]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    Args:
        word_list: A sized collection (for example the list of tokens from
            one row), or ``None``.

    Returns:
        ``len(word_list)``, or ``None`` when ``word_list`` is ``None``.
    """
    # When used as a Spark UDF, a null cell is passed in as None. Returning
    # None keeps the result null instead of failing with TypeError on len(None).
    if word_list is None:
        return None
    return len(word_list)

In [35]:
# Wrap word_list_length as a Spark user-defined function whose declared
# return type is an integer.
token_count_type = IntegerType()
count_tokens = udf(word_list_length, token_count_type)
count_tokens

<function __main__.word_list_length>

In [36]:
# Keep only the rhyme text and its tokens, append a per-row token count,
# and print every cell in full (no truncation).
(
    tokenized
    .select("Nursery Rhyme", "words")
    .withColumn("tokens", count_tokens(col("words")))
    .show(truncate=False)
)

+-----------------------------+------------------------------------+------+
|Nursery Rhyme                |words                               |tokens|
+-----------------------------+------------------------------------+------+
|Mary had a little lamb       |[mary, had, a, little, lamb]        |5     |
|It's fleece was white as snow|[it's, fleece, was, white, as, snow]|6     |
|And everywhere Mary went     |[and, everywhere, mary, went]       |4     |
|The lamb was sure to go      |[the, lamb, was, sure, to, go]      |6     |
+-----------------------------+------------------------------------+------+



In [39]:
# Print the tokenized DataFrame again (id, "Nursery Rhyme" and "words" columns)
# before stop-word removal is applied.
tokenized.show() 

+---+--------------------+--------------------+
| id|       Nursery Rhyme|               words|
+---+--------------------+--------------------+
|  0|Mary had a little...|[mary, had, a, li...|
|  1|It's fleece was w...|[it's, fleece, wa...|
|  2|And everywhere Ma...|[and, everywhere,...|
|  3|The lamb was sure...|[the, lamb, was, ...|
+---+--------------------+--------------------+



In [37]:
# Import StopWordsRemover, which removes common words such as "is", "at" and "for"
from pyspark.ml.feature import StopWordsRemover

In [38]:
# Configure a StopWordsRemover that reads the "words" column and writes its
# result to a new "words-filtered" column.
remover = StopWordsRemover(outputCol="words-filtered", inputCol="words")

In [41]:
# Run the stop-word remover over the tokenized rows; the result keeps the
# existing columns and adds "words-filtered" with the stop words dropped.
tokenizedR = remover.transform(tokenized)
# truncate=False prints each cell in full instead of shortening it with "...".
tokenizedR.show(truncate=False)

+---+-----------------------------+------------------------------------+------------------------+
|id |Nursery Rhyme                |words                               |words-filtered          |
+---+-----------------------------+------------------------------------+------------------------+
|0  |Mary had a little lamb       |[mary, had, a, little, lamb]        |[mary, little, lamb]    |
|1  |It's fleece was white as snow|[it's, fleece, was, white, as, snow]|[fleece, white, snow]   |
|2  |And everywhere Mary went     |[and, everywhere, mary, went]       |[everywhere, mary, went]|
|3  |The lamb was sure to go      |[the, lamb, was, sure, to, go]      |[lamb, sure, go]        |
+---+-----------------------------+------------------------------------+------------------------+



In [42]:
# Keep only the rhyme text and its stop-word-filtered tokens, append a per-row
# count of the remaining tokens, and print every cell in full (no truncation).
(
    tokenizedR
    .select("Nursery Rhyme", "words-filtered")
    .withColumn("tokens", count_tokens(col("words-filtered")))
    .show(truncate=False)
)

+-----------------------------+------------------------+------+
|Nursery Rhyme                |words-filtered          |tokens|
+-----------------------------+------------------------+------+
|Mary had a little lamb       |[mary, little, lamb]    |3     |
|It's fleece was white as snow|[fleece, white, snow]   |3     |
|And everywhere Mary went     |[everywhere, mary, went]|3     |
|The lamb was sure to go      |[lamb, sure, go]        |3     |
+-----------------------------+------------------------+------+



In [11]:
# Stop the SparkSession now that the notebook is finished with it.
spark.stop()