In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [3]:
# Obtain the SparkSession used by every later cell, registered under the
# application name 'stopwords'.
spark = (
    SparkSession.builder
    .appName('stopwords')
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook displays the SparkSession object as this
# cell's output, which is a quick way to confirm the session exists.
spark

In [5]:
# Read reviews.csv into a DataFrame with the "header" option enabled, so the
# first row supplies the column name ("Reviews") instead of being loaded as
# data, then preview the rows.
dataframe = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("reviews.csv")
)
dataframe.show()

+--------------------+
|             Reviews|
+--------------------+
|The pasta was a d...|
|We ate the fish i...|
|My family did not...|
|The girl even tri...|
|this is his job a...|
|I'm always greete...|
+--------------------+



In [9]:
# Configure a Tokenizer that reads the "Reviews" column and writes its tokens
# to a new "Words" column. Nothing is transformed in this cell.
# NOTE: despite its name, `review_data` holds the Tokenizer itself, not the
# review rows.
review_data = Tokenizer(
    inputCol="Reviews",
    outputCol="Words",
)

# Echo the tokenizer instance together with its class.
review_data, type(review_data)

(Tokenizer_415d9ab564615d8886a8, pyspark.ml.feature.Tokenizer)

In [10]:
# Apply the tokenizer to the loaded reviews. The result keeps the original
# "Reviews" column and adds a "Words" column holding each review's list of
# lower-cased tokens.
reviewed = review_data.transform(dataframe)
# Preview the tokenized frame; long cell values are abbreviated with "...".
reviewed.show()

+--------------------+--------------------+
|             Reviews|               Words|
+--------------------+--------------------+
|The pasta was a d...|[the, pasta, was,...|
|We ate the fish i...|[we, ate, the, fi...|
|My family did not...|[my, family, did,...|
|The girl even tri...|[the, girl, even,...|
|this is his job a...|[this, is, his, j...|
|I'm always greete...|[i'm, always, gre...|
+--------------------+--------------------+



In [11]:
# Configure a StopWordsRemover that filters the "Words" token lists into a new
# "filteredWords" column. No stopWords list is passed, so the remover falls
# back to its built-in default list. Nothing is transformed in this cell.
remover = StopWordsRemover(
    inputCol="Words",
    outputCol="filteredWords",
)

In [12]:
# Run the stop-word remover over the tokenized frame. The result keeps
# "Reviews" and "Words" and adds "filteredWords", the token lists with the
# stop words dropped.
newFrame = remover.transform(reviewed)
# Preview all three columns side by side.
newFrame.show()

+--------------------+--------------------+--------------------+
|             Reviews|               Words|       filteredWords|
+--------------------+--------------------+--------------------+
|The pasta was a d...|[the, pasta, was,...|       [pasta, dish]|
|We ate the fish i...|[we, ate, the, fi...|  [ate, fish, tasty]|
|My family did not...|[my, family, did,...|[family, like, food]|
|The girl even tri...|[the, girl, even,...|[girl, even, trie...|
|this is his job a...|[this, is, his, j...|[job, since, prob...|
|I'm always greete...|[i'm, always, gre...|[always, greeted,...|
+--------------------+--------------------+--------------------+



In [13]:
# Print just the filtered tokens, with truncation switched off so each list
# is shown in full rather than abbreviated.
(
    newFrame
    .select("filteredWords")
    .show(truncate=False)
)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|filteredWords                                                                                                                                                                |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[pasta, dish]                                                                                                                                                                |
|[ate, fish, tasty]                                                                                                                                                           |
|[family, like, food]                                                                                                   

In [14]:
# Stop the Spark session at the end of the walkthrough; no cell shown after
# this one uses `spark`.
spark.stop()